Class: ToyLM

Inherits:
Object
  • Object
show all
Defined in:
lib/toy/models/transformer_lm.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(arch, backend) ⇒ ToyLM

Returns a new instance of ToyLM.



35
36
37
38
39
40
41
42
43
# File 'lib/toy/models/transformer_lm.rb', line 35

# Build an unloaded model wrapper. No weights are read here; #load must
# be called before decoding.
#
# arch    - architecture descriptor; stored as-is.
# backend - backend selector; stored as-is (other methods compare it
#           against :cuda).
def initialize(arch, backend)
  @arch, @backend = arch, backend
  # Context length later handed to the KV cache when it is realized.
  @max_T = 256
  # Nothing has been loaded yet: no caches and no GGUF handle.
  @kv_cpu = @kv_cuda = @gguf_handle = nil
  @loaded = false
end

Instance Attribute Details

#arch ⇒ Object (readonly)

Returns the value of attribute arch.



30
31
32
# File 'lib/toy/models/transformer_lm.rb', line 30

# Reader for the architecture descriptor captured by the constructor.
def arch; @arch; end

#backend ⇒ Object (readonly)

Returns the value of attribute backend.



30
31
32
# File 'lib/toy/models/transformer_lm.rb', line 30

# Reader for the backend selector captured by the constructor.
def backend; @backend; end

#kv_cpu ⇒ Object (readonly)

ggml#1506 trace localization: expose the CPU cache so a trace runner can call enable_trace! before decoding. Read-only; harmless when unused.



33
34
35
# File 'lib/toy/models/transformer_lm.rb', line 33

# Reader for the CPU KV cache. It is nil until load_cpu has stored the
# realized cache.
def kv_cpu; @kv_cpu; end

#max_T ⇒ Object

Returns the value of attribute max_T.



30
31
32
# File 'lib/toy/models/transformer_lm.rb', line 30

# Reader for the context length (the constructor sets it to 256).
def max_T; @max_T; end

#tokenizer ⇒ Object (readonly)

Returns the value of attribute tokenizer.



30
31
32
# File 'lib/toy/models/transformer_lm.rb', line 30

# Reader for @tokenizer.
# NOTE(review): nothing in the visible code assigns @tokenizer, so this
# may return nil — confirm where it is populated.
def tokenizer; @tokenizer; end

Instance Method Details

#decode_step(token_id, pos) ⇒ Object

Single-step decode → logits Mat (1 × vocab).



182
183
184
185
186
187
188
# File 'lib/toy/models/transformer_lm.rb', line 182

# Run one decode step against the CPU KV cache.
#
# token_id - token to feed.
# pos      - position forwarded to SmolLM2KV.decode_step.
#
# Returns the result of SmolLM2KV.decode_step (described upstream as a
# 1 x vocab logits Mat). On the :cuda backend it prints a notice and
# returns nil instead, because that path is not wired in this build.
def decode_step(token_id, pos)
  return SmolLM2KV.decode_step(@kv_cpu, token_id, pos) unless @backend == :cuda

  puts "decode_step: CUDA backend not wired in this build"
  nil
end

#decode_step_with_logprobs(token_id, pos, top_k) ⇒ Object

toy#decode-logprobs (#151) — single-step decode that also returns log_softmax(logits) plus the top-K (id, logprob) pairs. Building block for Tep’s future /v1/chat/completions with `logprobs=true`.

Returns [logits_mat, logprobs_mat, top_ids, top_vals] where:

logits_mat   — Mat[1, vocab] raw logits (same as decode_step)
logprobs_mat — Mat[1, vocab] numerically stable log-softmax
top_ids      — Array<Int>   length top_k, sorted by logprob desc
top_vals     — Array<Float> length top_k, parallel to top_ids


199
200
201
202
203
204
205
206
207
# File 'lib/toy/models/transformer_lm.rb', line 199

# Single decode step that also reports log-probabilities.
#
# token_id - token to feed.
# pos      - position forwarded to decode_step.
# top_k    - how many entries to request from ToyLogProbs.top_k.
#
# Returns [logits, logprobs, top_ids, top_vals]. When decode_step yields
# nil (the unwired CUDA path, which has already printed its notice) the
# placeholder [nil, nil, [0], [0.0]] is returned instead.
def decode_step_with_logprobs(token_id, pos, top_k)
  raw = decode_step(token_id, pos)
  # never reached on CPU; CUDA prints+returns
  return [nil, nil, [0], [0.0]] if raw.nil?

  log_p = ToyLogProbs.log_softmax(raw)
  best  = ToyLogProbs.top_k(log_p, top_k)
  [raw, log_p, best[0], best[1]]
end

#embed_lookup(token_ids) ⇒ Object

toy#embed-api (#145) — return the token-embedding row for each input ID as a flat Array<Float> of length n_tokens * d_model. Callers (Tep /v1/embeddings) can reshape / pool client-side. Works regardless of backend because the GGUF mmap region is CPU-readable. Single-row lookup is dequantize-aware (Q4/Q5/Q6/Q8/F16/F32).



214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
# File 'lib/toy/models/transformer_lm.rb', line 214

# Look up the token-embedding row for every ID in token_ids.
#
# token_ids - indexable list of token IDs (uses #length and #[]).
#
# Returns a flat Array<Float> of token_ids.length * d_model values, row
# after row. Special cases, each preceded by a printed notice:
#   * model not loaded      -> [0.0]
#   * backend is :cuda      -> correctly sized array of zeros
#   * native lookup rc != 0 -> the partially filled array (rows at and
#                              after the failing token stay 0.0)
def embed_lookup(token_ids)
  unless @loaded
    puts "ToyLM.embed_lookup: model not loaded; call .load(path) first"
    return [0.0]
  end

  if @backend == :cuda
    puts "ToyLM.embed_lookup: CUDA backend not wired in this build " \
         "(use lib/transformer_lm_cuda.rb mirror once it lands; #145)"
    return Array.new(token_ids.length * @arch.d_model, 0.0)
  end

  width   = @arch.d_model
  flat    = Array.new(token_ids.length * width, 0.0)
  # Scratch row that the native lookup fills; reused for every token.
  scratch = Array.new(width, 0.0)
  sess    = @kv_cpu.sess
  table   = @kv_cpu.t_token_embed

  token_ids.length.times do |t|
    tok    = token_ids[t]
    status = TinyNN.tnn_embed_lookup_to_doubles(sess, table, tok, scratch, width)
    unless status == 0
      puts "embed_lookup: rc=#{status} token=#{tok}"
      return flat
    end
    # Copy the scratch row into its slot of the flat output.
    base = t * width
    width.times { |k| flat[base + k] = scratch[k] }
  end
  flat
end

#generate(prompt_ids, n_new, sampler_config = nil) ⇒ Object

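Run prefill + generate. Returns the full ID array: the prompt followed by up to n_new generated IDs. Generation stops early once the EOS ID is produced (the EOS ID is included in the result). Uses greedy argmax if sampler_config is nil; otherwise applies the configured sampler pipeline.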



249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
# File 'lib/toy/models/transformer_lm.rb', line 249

# Run prefill over the prompt, then generate up to n_new tokens.
#
# prompt_ids     - token IDs of the prompt; never mutated.
# n_new          - maximum number of tokens to generate.
# sampler_config - nil for greedy argmax; otherwise an object providing
#                  seed, rep_penalty, temperature, top_k and top_p, which
#                  drive the Sampler pipeline.
#
# Returns a new array holding the prompt followed by the generated IDs.
# Generation stops early when the EOS ID is produced (it is included in
# the result) or when decode_step yields no logits (e.g. the unwired
# CUDA backend). If the model is not loaded, prints a notice and returns
# prompt_ids itself.
#
# Each token is fed to decode_step exactly once, at its own index in the
# sequence: the logits from the final prefill step pick the first new
# token, and every picked token is then decoded at the position it
# occupies in ids.
#
# NOTE(review): positions are not checked against @max_T here — confirm
# whether decode_step guards against running past the KV cache.
def generate(prompt_ids, n_new, sampler_config = nil)
  if !@loaded
    puts "TransformerLM.generate: model not loaded; call .load(path) first"
    return prompt_ids
  end
  # Work on a copy so the caller's array is left untouched.
  ids = []
  j = 0
  while j < prompt_ids.length
    ids.push(prompt_ids[j])
    j = j + 1
  end

  # Without a prompt there are no logits to sample the first token from.
  if ids.length == 0
    puts "ToyLM.generate: empty prompt; nothing to generate from"
    return ids
  end

  # Prefill: feed every prompt token through decode_step. Final
  # logits from the last prefill step ARE the first sampling target,
  # so keep them instead of decoding the last prompt token a second time.
  logits = nil
  i = 0
  while i < prompt_ids.length
    logits = decode_step(prompt_ids[i], i)
    i = i + 1
  end

  ctx = nil
  if sampler_config != nil
    ctx = SamplerContext.new(ids, sampler_config.seed)
  end

  n = 0
  while n < n_new
    # decode_step returns nil when the backend cannot decode (it has
    # already printed why); stop rather than hand nil to the sampler.
    if logits == nil
      break
    end
    pick = -1
    if sampler_config == nil
      pick = Sampler.argmax(logits)
    else
      logits = Sampler.repetition_penalty(logits, ctx, sampler_config.rep_penalty)
      logits = Sampler.temperature(logits, sampler_config.temperature)
      logits = Sampler.top_k(logits, sampler_config.top_k)
      logits = Sampler.top_p(logits, sampler_config.top_p)
      pick   = Sampler.pick(logits, sampler_config, ctx)
      ctx.generated_ids.push(pick)
    end
    ids.push(pick)
    if pick == @arch.eos_id
      break
    end
    n = n + 1
    # Decode the token just appended, at its own index, but only when
    # another token will actually be sampled from the result.
    if n < n_new
      logits = decode_step(pick, ids.length - 1)
    end
  end
  ids
end

#load(path) ⇒ Object

Load weights from the GGUF (mmap path). Path must match the path used to construct the Arch (we re-open the GGUF for the mmap’d weight pages).



52
53
54
55
56
57
58
59
# File 'lib/toy/models/transformer_lm.rb', line 52

# Load model weights from the GGUF file at path using the loader that
# matches the configured backend, then mark the model as loaded.
#
# path - GGUF file path, forwarded unchanged to the backend loader.
#
# Returns true. The loaded flag is set regardless of what the backend
# loader returned.
def load(path)
  @backend == :cuda ? load_cuda(path) : load_cpu(path)
  @loaded = true
end

#load_cpu(path) ⇒ Object



61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
# File 'lib/toy/models/transformer_lm.rb', line 61

# Build and populate the CPU KV cache from the GGUF file at path.
#
# Steps, in order (the order matters: every option must be set on the
# cache before it is realized):
#   1. Detect model flags, weight type and config from the file.
#   2. Decide between the native/mmap layout and the legacy copy-load
#      layout.
#   3. Apply env-var opt-ins (KV_Q8, FLASH_ATTN, NO_QK_NORM) and the
#      detected MoE / QK-norm / Gemma-2 settings to a fresh cache.
#   4. Realize the cache (mmap or legacy copy) and store it in @kv_cpu.
#
# path - GGUF file path.
#
# Returns the realized cache (also stored in @kv_cpu). Exits the process
# with status 1 when the file needs QK-norm but is in the legacy layout,
# or when the GGUF cannot be opened for the mmap path.
def load_cpu(path)
  flags = GGUFLoad.detect_smollm2_flags(path)
  wtype = GGUFLoad.detect_weight_type(path)
  cfg   = SmolLM2ConfigLoader.read(path)

  # Format dispatch. Phase 2 mmap requires the GGUF to be in native
  # layout (toy.ggml_native flag set by --ggml-native at convert
  # time). Legacy GGUFs have transposed bytes and would produce
  # garbage if read directly; fall back to the Mat-mediated direct
  # loader for those.
  probe = TinyNN.tnn_gguf_load(path)
  is_native = false
  if probe != nil
    is_native = (TinyNN.tnn_gguf_get_bool(probe, "toy.ggml_native") == 1)
    TinyNN.tnn_gguf_free(probe)
  end
  # M2.3: MoE GGUFs (Mixtral / Qwen-MoE etc.) are always in standard
  # ggml-native layout; the legacy "transposed-load" path doesn't
  # know how to dequantize the 3D expert stacks anyway. Force mmap
  # whenever MoE is detected, regardless of the toy.ggml_native flag.
  if flags.is_moe
    is_native = true
  end
  # #113: same reasoning for Gemma 2 — third-party GGUFs (bartowski /
  # ggml-org / etc.) don't carry toy.ggml_native, but their layout is
  # standard. Force mmap when we see Gemma 2 sentinels (post-norm
  # tensors) so we get the post-norm and softcap paths.
  if flags.has_post_norms
    is_native = true
  end

  kv = SmolLM2KVFFICache.new
  # P5.1: KV_Q8=1 opts into Q8_0 storage for the K cache. Must be set
  # BEFORE realize_for(_mmap). Saves ~half the K-cache bytes &
  # bandwidth; V stays F32 until P5.2 (layout flip needed for V Q8).
  if (ENV["KV_Q8"] || "") == "1"
    kv.enable_kv_q8!
  end
  # P4.1: FLASH_ATTN=1 opts into ggml_flash_attn_ext for the per-Q-head
  # attention step. Inference only — vendored ggml's flash backward
  # aborts.
  if (ENV["FLASH_ATTN"] || "") == "1"
    kv.enable_flash_attn!
  end
  # M2.3: MoE — detected by GGUF tensor presence; enables the routed
  # FFN graph (router → softmax → top_k → 3× mul_mat_id → silu·up
  # → weighted sum). Must come BEFORE realize_for_mmap.
  if flags.is_moe
    kv.enable_moe!(flags.n_experts, flags.n_experts_used)
    puts "MoE detected: n_experts=" + flags.n_experts.to_s +
         " top_k=" + flags.n_experts_used.to_s
  end
  # #110: pass through the detected qk_norm flavor BEFORE realize.
  # 1 = Qwen3-style ([d_head] shared), 2 = OLMoE/Granite-style
  # ([d_model] per-head packed; per-head sliced gamma).
  kv.qk_norm_kind = flags.qk_norm_kind
  # NO_QK_NORM=1 turns the norm off entirely as a diagnostic.
  # #76: the old form (kv.has_qk_norm = false) was ineffective on
  # the mmap path — realize_for_mmap overwrites @has_qk_norm from
  # its qk_norm parameter. Carry the override in a local instead
  # and pass it to realize_for_mmap below.
  qk_norm_on = flags.qk_norm
  if (ENV["NO_QK_NORM"] || "") == "1"
    qk_norm_on = false
    kv.qk_norm_kind = 0
  end
  # #113: Gemma 2 extras. All inert by default — non-Gemma callers
  # pass embed_scale=1.0, softcaps=0.0, has_post_norms=false,
  # swa_alternates=false, and the graph paths skip the extras.
  kv.has_post_norms = flags.has_post_norms
  kv.embed_scale    = flags.embed_scale
  kv.attn_softcap   = flags.attn_softcap
  kv.final_softcap  = flags.final_softcap
  kv.swa_alternates = flags.swa_alternates
  if flags.has_post_norms || flags.attn_softcap > 0.0 || flags.swa_alternates
    puts "Gemma-2 features: post_norms=" + flags.has_post_norms.to_s +
         " embed_scale=" + flags.embed_scale.to_s +
         " attn_softcap=" + flags.attn_softcap.to_s +
         " final_softcap=" + flags.final_softcap.to_s +
         " swa_alt=" + flags.swa_alternates.to_s
  end

  if is_native
    # Native layout: mmap weights at their stored ggml type. Q8_0
    # tensors stay quantized; matmul kernels read them in place.
    kv.set_weight_type(wtype)
    @gguf_handle = TinyNN.tnn_gguf_load(path)
    if @gguf_handle == nil
      # Fail loud (never mask): tnn_gguf_load can return nil (the probe
      # above is nil-checked for that reason), and the mmap path can be
      # forced on even when the probe failed. Realizing against a nil
      # handle has nothing valid to map weights from.
      puts "ToyLM.load_cpu: could not open " + path + " for the mmap " +
           "weight path (tnn_gguf_load returned nil). Aborting."
      exit 1
    end
    kv.realize_for_mmap(@gguf_handle, cfg, @max_T, flags.untied, flags.qkv_bias, qk_norm_on)
    kv.swa_window = flags.swa_window
  else
    if qk_norm_on
      # #76, fail loud (never mask): realize_for has no QK-norm
      # support — the gamma tensors are only allocated on the mmap
      # path, so decode here would be silently degenerate.
      puts "ToyLM.load_cpu: " + path + " needs QK-norm but is not in " +
           "toy.ggml_native layout; the legacy copy-load path cannot " +
           "apply QK-norm. Re-convert with --ggml-native. Aborting."
      exit 1
    end
    # Legacy layout: dequantize-to-F32 on copy. The
    # tnn_gguf_copy_head_slice_to_persistent helper writes F32 bytes
    # into the dst, so dst tensors must be F32-typed — there's no
    # quantize-on-write code path yet. We deliberately do NOT call
    # set_weight_type here; the default (F32) is what holds.
    kv.realize_for(@max_T, cfg.d_model, cfg.d_ff, cfg.n_heads, cfg.n_kv,
                   cfg.n_layers, cfg.vocab, cfg.rope_base, cfg.rms_eps,
                   flags.untied, flags.qkv_bias)
    GGUFLoad.load_kv_cache_auto(kv, path)
  end
  @kv_cpu = kv
end

#load_cuda(path) ⇒ Object

The CUDA branch lives in load_cuda; we keep it conditional so the CPU-only builds don’t pull TinyNNCuda in.



175
176
177
178
179
# File 'lib/toy/models/transformer_lm.rb', line 175

# Placeholder CUDA loader for builds without CUDA linked in: prints a
# notice pointing at the CUDA mirror file and loads nothing.
#
# path - ignored by this stub.
#
# Returns nil.
def load_cuda(path)
  notice = "TransformerLM: CUDA path requires a CUDA-linked build. " \
           "Use lib/transformer_lm_cuda.rb (mirror); not implemented inline."
  puts notice
  nil
end