Class: ToyLMMetal

Inherits:

Object

Object
ToyLMMetal

show all

Defined in: lib/toy/models/transformer_lm_metal.rb

Instance Attribute Summary collapse

#arch ⇒ Object readonly

Returns the value of attribute arch.
#max_T ⇒ Object

Returns the value of attribute max_T.
#tokenizer ⇒ Object readonly

Returns the value of attribute tokenizer.

Instance Method Summary collapse

#decode_step(token_id, pos) ⇒ Object
#generate(prompt_ids, n_new, sampler_config = nil) ⇒ Object
#initialize(arch) ⇒ ToyLMMetal constructor

A new instance of ToyLMMetal.
#load(path) ⇒ Object

Constructor Details

#initialize(arch) ⇒ `ToyLMMetal`

Returns a new instance of ToyLMMetal.

# File 'lib/toy/models/transformer_lm_metal.rb', line 34

# Builds an unloaded model wrapper. No weights are read here; #load must be
# called before #generate will decode anything (it checks @loaded).
#
# @param arch [Object] architecture descriptor, stored as-is; #generate
#   reads +arch.eos_id+ from it.
def initialize(arch)
  @arch        = arch
  @max_T       = 256
  # Exposed through the public tokenizer reader, so give it a defined value
  # like every other piece of state instead of relying on an unset ivar.
  @tokenizer   = nil
  @kv          = nil
  @gguf_handle = nil
  @loaded      = false
end

Instance Attribute Details

#arch ⇒ `Object` (readonly)

Returns the value of attribute arch.



32
33
34

# File 'lib/toy/models/transformer_lm_metal.rb', line 32

# Reader for the architecture descriptor that was handed to the constructor.
#
# @return [Object] the value currently held in @arch
def arch
  return @arch
end

#max_T ⇒ `Object`

Returns the value of attribute max_T.



32
33
34

# File 'lib/toy/models/transformer_lm_metal.rb', line 32

# Reader for @max_T. The constructor sets it to 256, and #load passes it as
# the first argument to the KV cache's realize_for.
#
# @return [Object] the value currently held in @max_T
def max_T
  return @max_T
end

#tokenizer ⇒ `Object` (readonly)

Returns the value of attribute tokenizer.



32
33
34

# File 'lib/toy/models/transformer_lm_metal.rb', line 32

# Reader for @tokenizer. None of the methods shown for this class assign it,
# so it is nil unless something outside this view sets it.
#
# @return [Object, nil] the value currently held in @tokenizer
def tokenizer
  return @tokenizer
end

Instance Method Details

#decode_step(token_id, pos) ⇒ `Object`



91
92
93

# File 'lib/toy/models/transformer_lm_metal.rb', line 91

# Runs one decode step by delegating to SmolLM2KVMetal.decode_step with this
# instance's KV cache (@kv). No loaded-state check happens here; @kv is nil
# until #load has completed.
#
# @param token_id [Object] token forwarded unchanged to the backend
# @param pos [Object] position forwarded unchanged to the backend
# @return [Object] whatever the backend returns (#generate treats it as logits)
def decode_step(token_id, pos)
  cache = @kv
  SmolLM2KVMetal.decode_step(cache, token_id, pos)
end

#generate(prompt_ids, n_new, sampler_config = nil) ⇒ `Object`

# File 'lib/toy/models/transformer_lm_metal.rb', line 95

# Generates up to +n_new+ tokens continuing +prompt_ids+.
#
# The prompt is fed through #decode_step one token at a time, each at its own
# index, and the logits of the final prompt token seed the first pick. Every
# sampled token is then fed back at the index it occupies in the output.
# (Previously the prefill logits were thrown away and the last prompt token
# was decoded a second time at the next position, which duplicated it in the
# KV cache and shifted every later token by one slot.)
#
# Without a sampler config the pick is Sampler.argmax; with one, the logits
# go through repetition penalty, temperature, top-k and top-p before
# Sampler.pick. Generation stops early once the pick equals @arch.eos_id
# (the EOS token is included in the result).
#
# NOTE(review): positions are not checked against @max_T here — presumably
# the KV cache is sized by it in #load; confirm whether a bound is needed.
#
# @param prompt_ids [Array] prompt token ids; not mutated
# @param n_new [Integer] maximum number of tokens to append
# @param sampler_config [Object, nil] sampling settings (seed, rep_penalty,
#   temperature, top_k, top_p), or nil for greedy argmax
# @return [Array] a new array holding the prompt followed by the generated
#   ids; when the model is not loaded, +prompt_ids+ itself is returned
def generate(prompt_ids, n_new, sampler_config = nil)
  if !@loaded
    puts "ToyLMMetal.generate: model not loaded; call .load(path) first"
    return prompt_ids
  end

  # Prefill: copy the prompt and decode each token at its own index, keeping
  # the logits produced by the last prompt token.
  ids = []
  logits = nil
  i = 0
  while i < prompt_ids.length
    ids.push(prompt_ids[i])
    logits = decode_step(prompt_ids[i], i)
    i = i + 1
  end

  # An empty prompt gives the model nothing to condition on; the old code
  # would have called decode_step(nil, 0) here.
  if logits == nil
    puts "ToyLMMetal.generate: empty prompt; nothing to decode from"
    return ids
  end

  ctx = nil
  if sampler_config != nil
    ctx = SamplerContext.new(ids, sampler_config.seed)
  end

  n = 0
  while n < n_new
    pick = -1
    if sampler_config == nil
      pick = Sampler.argmax(logits)
    else
      logits = Sampler.repetition_penalty(logits, ctx, sampler_config.rep_penalty)
      logits = Sampler.temperature(logits, sampler_config.temperature)
      logits = Sampler.top_k(logits, sampler_config.top_k)
      logits = Sampler.top_p(logits, sampler_config.top_p)
      pick   = Sampler.pick(logits, sampler_config, ctx)
      ctx.generated_ids.push(pick)
    end
    ids.push(pick)
    if pick == @arch.eos_id
      break
    end
    n = n + 1
    # Only decode the new token when another pick is still needed; it sits
    # at index ids.length - 1 in the output sequence.
    if n < n_new
      logits = decode_step(pick, ids.length - 1)
    end
  end
  ids
end

#load(path) ⇒ `Object`

# File 'lib/toy/models/transformer_lm_metal.rb', line 46

# Loads the GGUF model at +path+ into a fresh Metal KV cache and marks this
# instance as loaded.
#
# Steps, in order: detect model flags and read the config from the file,
# build the cache, apply the optional KV_Q8 / FLASH_ATTN environment
# switches, refuse QK-norm models, size the cache via realize_for, then copy
# the weights in with GGUFLoad.load_kv_cache_auto.
#
# @param path [String] path to the GGUF file
# @return [true] after a successful load; the process exits with status 1
#   (after printing a message) when the model requires QK-norm
def load(path)
  feature_flags = GGUFLoad.detect_smollm2_flags(path)
  model_cfg     = SmolLM2ConfigLoader.read(path)

  # The mmap fast path is deliberately not used on Metal.
  # tnn_session_attach_weight_mmap falls through to
  # ggml_backend_cpu_buffer_from_ptr there (Metal has no public
  # buffer_from_ptr API), and the ggml-metal scheduler crashes during compute
  # when CPU-resident weight tensors are fed to it as kernel inputs. The
  # copy-load path (realize_for + load_kv_cache_auto) is bytes-bounded by the
  # weight tensor size: fine for the 135M smoke and OK up to a few GB. A real
  # Metal mmap path needs ggml-metal to expose buffer-from-bytes-no-copy;
  # that is tracked as a follow-up on issue #2.
  cache = SmolLM2KVFFICacheMetal.new

  # P5.1: KV_Q8=1 opts into Q8_0 storage for the K cache. The full rationale
  # lives with the CPU mirror in lib/transformer_lm.rb.
  cache.enable_kv_q8! if ENV.fetch("KV_Q8", "") == "1"

  # P4.1: FLASH_ATTN=1 switches attention to ggml_flash_attn_ext.
  cache.enable_flash_attn! if ENV.fetch("FLASH_ATTN", "") == "1"

  if feature_flags.qk_norm
    # #76, fail loud (never mask): the only load path on Metal is copy-load
    # (realize_for), and it has no QK-norm support because the gamma tensors
    # are only allocated on the mmap path. Decoding a QK-norm model
    # (Qwen3 / OLMoE) here would be silently degenerate, which is exactly the
    # CUDA #76 failure mode. Abort until a Metal mmap/QK-norm path exists.
    message = "ToyLMMetal.load: " + path +
              " needs QK-norm, which the Metal copy-load path cannot apply " +
              "(no mmap path on Metal yet, issue #2). " +
              "Aborting rather than decode degenerate."
    puts message
    exit 1
  end

  cache.realize_for(
    @max_T,
    model_cfg.d_model,
    model_cfg.d_ff,
    model_cfg.n_heads,
    model_cfg.n_kv,
    model_cfg.n_layers,
    model_cfg.vocab,
    model_cfg.rope_base,
    model_cfg.rms_eps,
    feature_flags.untied,
    feature_flags.qkv_bias
  )
  GGUFLoad.load_kv_cache_auto(cache, path)

  @kv = cache
  @loaded = true
end

Class: ToyLMMetal

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(arch) ⇒ ToyLMMetal

Instance Attribute Details

#arch ⇒ Object (readonly)

#max_T ⇒ Object

#tokenizer ⇒ Object (readonly)

Instance Method Details

#decode_step(token_id, pos) ⇒ Object

#generate(prompt_ids, n_new, sampler_config = nil) ⇒ Object

#load(path) ⇒ Object

#initialize(arch) ⇒ `ToyLMMetal`

#arch ⇒ `Object` (readonly)

#max_T ⇒ `Object`

#tokenizer ⇒ `Object` (readonly)

#decode_step(token_id, pos) ⇒ `Object`

#generate(prompt_ids, n_new, sampler_config = nil) ⇒ `Object`

#load(path) ⇒ `Object`