Class: ToyLMMetal
- Inherits:
-
Object
- Object
- ToyLMMetal
- Defined in:
- lib/toy/models/transformer_lm_metal.rb
Instance Attribute Summary collapse
-
#arch ⇒ Object
readonly
Returns the value of attribute arch.
-
#max_T ⇒ Object
Returns the value of attribute max_T.
-
#tokenizer ⇒ Object
readonly
Returns the value of attribute tokenizer.
Instance Method Summary collapse
- #decode_step(token_id, pos) ⇒ Object
- #generate(prompt_ids, n_new, sampler_config = nil) ⇒ Object
-
#initialize(arch) ⇒ ToyLMMetal
constructor
A new instance of ToyLMMetal.
- #load(path) ⇒ Object
Constructor Details
#initialize(arch) ⇒ ToyLMMetal
Returns a new instance of ToyLMMetal.
34 35 36 37 38 39 40 |
# File 'lib/toy/models/transformer_lm_metal.rb', line 34
#
# Builds an unloaded model wrapper around +arch+.
#
# arch - architecture descriptor; stored as-is and later consulted by
#        #generate for its +eos_id+.
#
# No weights are touched here: the KV cache handle stays nil and the
# instance is flagged as not loaded until #load succeeds.
def initialize(arch)
  @arch = arch
  # Sequence capacity later handed to the KV cache's realize_for in #load.
  @max_T = 256
  # Populated by #load.
  @kv = nil
  @gguf_handle = nil
  # Checked by #generate before decoding.
  @loaded = false
end
Instance Attribute Details
#arch ⇒ Object (readonly)
Returns the value of attribute arch.
32 33 34 |
# File 'lib/toy/models/transformer_lm_metal.rb', line 32
#
# Reader for the architecture descriptor that was passed to the
# constructor.
def arch
  @arch
end
#max_T ⇒ Object
Returns the value of attribute max_T.
32 33 34 |
# File 'lib/toy/models/transformer_lm_metal.rb', line 32
#
# Reader for the sequence capacity. The constructor sets it to 256 and
# #load forwards it to the KV cache's realize_for.
def max_T
  @max_T
end
#tokenizer ⇒ Object (readonly)
Returns the value of attribute tokenizer.
32 33 34 |
# File 'lib/toy/models/transformer_lm_metal.rb', line 32
#
# Reader for the tokenizer attribute.
# NOTE(review): none of the methods shown in this listing assign
# @tokenizer, so this presumably returns nil unless it is set elsewhere
# -- confirm.
def tokenizer
  @tokenizer
end
Instance Method Details
#decode_step(token_id, pos) ⇒ Object
91 92 93 |
# File 'lib/toy/models/transformer_lm_metal.rb', line 91
#
# Runs one decode step against this model's KV cache.
#
# token_id - id of the token to feed.
# pos      - position index handed to the backend for this token.
#
# Returns whatever SmolLM2KVMetal.decode_step returns; #generate treats
# that value as the logits used to pick the next token.
def decode_step(token_id, pos)
  cache = @kv
  SmolLM2KVMetal.decode_step(cache, token_id, pos)
end
#generate(prompt_ids, n_new, sampler_config = nil) ⇒ Object
95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 |
# File 'lib/toy/models/transformer_lm_metal.rb', line 95
#
# Extends +prompt_ids+ with up to +n_new+ generated token ids.
#
# prompt_ids     - Array of token ids to condition on.
# n_new          - maximum number of tokens to append.
# sampler_config - nil for greedy decoding (Sampler.argmax); otherwise an
#                  object whose seed, rep_penalty, temperature, top_k and
#                  top_p are read to drive the Sampler pipeline.
#
# Returns a new Array holding the prompt followed by the generated ids.
# Generation stops early once the picked id equals @arch.eos_id (the EOS
# id itself is included). If the model is not loaded, a message is
# printed and +prompt_ids+ is returned unchanged.
#
# Position bookkeeping: every token is fed to decode_step exactly once,
# at the index it occupies in the returned array. The logits produced by
# the last prompt token seed the first pick. Previously those logits were
# discarded and the last prompt token was fed a second time at position
# prompt_ids.length, which shifted every generated token one position
# late.
#
# NOTE(review): this file sticks to while loops and explicit nil
# comparisons; that style is kept here on the assumption it is
# deliberate -- confirm before modernising.
def generate(prompt_ids, n_new, sampler_config = nil)
  if !@loaded
    puts "ToyLMMetal.generate: model not loaded; call .load(path) first"
    return prompt_ids
  end

  # Copy the prompt and prefill the cache in one pass, keeping the logits
  # of the most recent step so the first pick can use them directly.
  ids = []
  logits = nil
  i = 0
  while i < prompt_ids.length
    ids.push(prompt_ids[i])
    logits = decode_step(prompt_ids[i], i)
    i = i + 1
  end

  # An empty prompt leaves nothing to decode from; return instead of
  # handing a nil token id to the backend.
  if logits == nil
    if n_new > 0
      puts "ToyLMMetal.generate: empty prompt; nothing to decode from"
    end
    return ids
  end

  ctx = nil
  if sampler_config != nil
    ctx = SamplerContext.new(ids, sampler_config.seed)
  end

  n = 0
  while n < n_new
    pick = -1
    if sampler_config == nil
      pick = Sampler.argmax(logits)
    else
      logits = Sampler.repetition_penalty(logits, ctx, sampler_config.rep_penalty)
      logits = Sampler.temperature(logits, sampler_config.temperature)
      logits = Sampler.top_k(logits, sampler_config.top_k)
      logits = Sampler.top_p(logits, sampler_config.top_p)
      pick = Sampler.pick(logits, sampler_config, ctx)
      ctx.generated_ids.push(pick)
    end
    ids.push(pick)
    if pick == @arch.eos_id
      break
    end
    n = n + 1
    # Feed the new token at its own index, and only when another pick
    # will actually consume the resulting logits.
    if n < n_new
      logits = decode_step(pick, ids.length - 1)
    end
  end
  ids
end
#load(path) ⇒ Object
46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 |
# File 'lib/toy/models/transformer_lm_metal.rb', line 46
#
# Loads the GGUF model at +path+ into a fresh Metal KV cache and marks
# this instance as loaded.
#
# path - String path to the model file (it is concatenated into the
#        abort message below).
#
# Environment switches, each active only when set to exactly "1":
#   KV_Q8      - calls enable_kv_q8! on the cache
#   FLASH_ATTN - calls enable_flash_attn! on the cache
#
# Prints a message and exits the process with status 1 when the model
# flags report qk_norm. Otherwise returns true (the value assigned to
# @loaded).
def load(path)
  flags = GGUFLoad.detect_smollm2_flags(path)
  cfg = SmolLM2ConfigLoader.read(path)

  # Metal: skip the mmap fast path. tnn_session_attach_weight_mmap
  # falls through to ggml_backend_cpu_buffer_from_ptr on Metal (no
  # public Metal buffer_from_ptr API), and the ggml-metal scheduler
  # crashes during compute when fed CPU-resident weight tensors as
  # kernel inputs. The copy-load path (realize_for + load_kv_cache_auto)
  # is bytes-bounded by the weight tensor size; fine for the 135M
  # smoke and OK up to a few GB. A real Metal mmap path would need
  # ggml-metal to expose buffer-from-bytes-no-copy, tracked as a
  # follow-up on issue #2.
  cache = SmolLM2KVFFICacheMetal.new

  # P5.1: KV_Q8=1 opts into Q8_0 storage for the K cache. See the CPU
  # mirror in lib/transformer_lm.rb for the full rationale.
  want_kv_q8 = (ENV["KV_Q8"] || "") == "1"
  if want_kv_q8
    cache.enable_kv_q8!
  end

  # P4.1: FLASH_ATTN=1 → ggml_flash_attn_ext for attention.
  want_flash_attn = (ENV["FLASH_ATTN"] || "") == "1"
  if want_flash_attn
    cache.enable_flash_attn!
  end

  if flags.qk_norm
    # #76, fail loud (never mask): Metal only has the copy-load path
    # (realize_for), which has no QK-norm support — the gamma tensors
    # are only allocated on the mmap path. Decoding a QK-norm model
    # (Qwen3 / OLMoE) here would be silently degenerate, exactly the
    # CUDA #76 failure mode. Abort until a Metal mmap/QK-norm path
    # exists.
    abort_msg = "ToyLMMetal.load: " + path + " needs QK-norm, which the " +
                "Metal copy-load path cannot apply (no mmap path on Metal " +
                "yet, issue #2). Aborting rather than decode degenerate."
    puts abort_msg
    exit 1
  end

  # Size the cache from the model config, then copy the weights in.
  cache.realize_for(@max_T, cfg.d_model, cfg.d_ff, cfg.n_heads, cfg.n_kv,
                    cfg.n_layers, cfg.vocab, cfg.rope_base, cfg.rms_eps,
                    flags.untied, flags.qkv_bias)
  GGUFLoad.load_kv_cache_auto(cache, path)

  # Publish the cache only after the weights are in place.
  @kv = cache
  @loaded = true
end