Class: Toy::LLM::Engine::ViTTinyEngine

Inherits:
Object
  • Object
show all
Defined in:
lib/toy/llm/engine/vit_tiny_engine.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ ViTTinyEngine

Returns a new instance of ViTTinyEngine.



69
70
71
72
73
74
75
76
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 69

# Creates an engine shell with no native session attached.
#
# @sess starts as the TinyNN null pointer, the bookkeeping arrays start
# empty and @realized is false. #realize_for_random_init is the method in
# this file that creates the session, fills the arrays and sets @realized.
def initialize
  @sess = TinyNN.tnn_null_ptr
  # Each array below is built with one placeholder element that is popped
  # straight away, leaving it empty.
  # NOTE(review): presumably this seeds the element type for a
  # type-inferring toolchain — confirm before simplifying to a bare [].
  @ft_globals_weights = [TinyNN.tnn_null_ptr]; @ft_globals_weights.pop
  @ft_globals_m       = [TinyNN.tnn_null_ptr]; @ft_globals_m.pop
  @ft_globals_v       = [TinyNN.tnn_null_ptr]; @ft_globals_v.pop
  @blocks   = [ViTTinyBlockFFI.new]; @blocks.pop
  @realized = false
end

Instance Attribute Details

#blocks ⇒ Object

Returns the value of attribute blocks.



61
62
63
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 61

# Reader for @blocks, the array of per-layer ViTTinyBlockFFI records.
# Empty after #initialize; #realize_for_random_init appends one entry per
# cfg.n_layers.
def blocks
  @blocks
end

#cfg ⇒ Object

Returns the value of attribute cfg.



61
62
63
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 61

# Reader for @cfg, the configuration object stored by
# #realize_for_random_init. Not assigned by #initialize.
def cfg
  @cfg
end

#ft_globals_m ⇒ Object

Returns the value of attribute ft_globals_m.



61
62
63
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 61

# Reader for @ft_globals_m, the array of tensors passed as the "m"
# argument of tnn_opt_step_adamw for the global weights. Entries are
# index-aligned with @ft_globals_weights (see #ft_add_global_1d/_2d).
def ft_globals_m
  @ft_globals_m
end

#ft_globals_v ⇒ Object

Returns the value of attribute ft_globals_v.



61
62
63
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 61

# Reader for @ft_globals_v, the array of tensors passed as the "v"
# argument of tnn_opt_step_adamw for the global weights. Entries are
# index-aligned with @ft_globals_weights (see #ft_add_global_1d/_2d).
def ft_globals_v
  @ft_globals_v
end

#ft_globals_weights ⇒ Object

Returns the value of attribute ft_globals_weights.



61
62
63
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 61

# Reader for @ft_globals_weights, the array of non-block ("global")
# weight tensors registered through #ft_add_global_1d/_2d. Empty after
# #initialize.
def ft_globals_weights
  @ft_globals_weights
end

#n_patches ⇒ Object

Returns the value of attribute n_patches.



61
62
63
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 61

# Reader for @n_patches, computed in #realize_for_random_init as
# (image_size / patch_size) squared. Not assigned by #initialize.
def n_patches
  @n_patches
end

#realized ⇒ Object

Returns the value of attribute realized.



61
62
63
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 61

# Reader for @realized: false after #initialize, set to true at the end
# of #realize_for_random_init.
def realized
  @realized
end

#seq_t ⇒ Object

Returns the value of attribute seq_t.



61
62
63
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 61

# Reader for @seq_t, the token count entering the transformer:
# @n_patches + 1 (the extra one is the cls token). Set in
# #realize_for_random_init.
def seq_t
  @seq_t
end

#sess ⇒ Object

Returns the value of attribute sess.



61
62
63
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 61

# Reader for @sess, the TinyNN session handle. It is the value of
# TinyNN.tnn_null_ptr after #initialize and is replaced with the result of
# TinyNN.tnn_session_new(0) in #realize_for_random_init.
def sess
  @sess
end

#t_cls_idx ⇒ Object

Returns the value of attribute t_cls_idx.



244
245
246
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 244

# Reader for @t_cls_idx, the 1-element i32 input tensor created in
# #build_forward_in_current_ctx and used as the row index for picking the
# cls token out of the final activations.
def t_cls_idx
  @t_cls_idx
end

#t_cls_token ⇒ Object

Returns the value of attribute t_cls_token.



61
62
63
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 61

# Reader for @t_cls_token, the learnable cls-token tensor allocated in
# #realize_for_random_init with (rows, cols) = (1, cfg.d_model).
def t_cls_token
  @t_cls_token
end

#t_final_ln_gamma ⇒ Object

Returns the value of attribute t_final_ln_gamma.



61
62
63
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 61

# Reader for @t_final_ln_gamma, the 1-D gain tensor (cfg.d_model
# elements) used by the final tnn_rms_norm in
# #build_forward_in_current_ctx.
def t_final_ln_gamma
  @t_final_ln_gamma
end

#t_hp ⇒ Object

Returns the value of attribute t_hp.



244
245
246
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 244

# Reader for @t_hp, the 7-element 1-D f32 input created in
# #build_training_step and passed to every tnn_opt_step_adamw call.
# NOTE(review): presumably the AdamW hyperparameter vector — confirm the
# element layout against the TinyNN binding.
def t_hp
  @t_hp
end

#t_image ⇒ Object

Returns the value of attribute t_image.



61
62
63
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 61

# Reader for @t_image, the image input tensor allocated in
# #realize_for_random_init with (rows, cols) = (@n_patches,
# @patch_flat_dim). It is not registered as a trainable weight there.
def t_image
  @t_image
end

#t_labels ⇒ Object

Returns the value of attribute t_labels.



244
245
246
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 244

# Reader for @t_labels, the label input created in #build_training_step
# with (rows, cols) = (1, cfg.num_classes) and fed to
# tnn_cross_entropy_loss.
def t_labels
  @t_labels
end

#t_logits ⇒ Object

Returns the value of attribute t_logits.



61
62
63
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 61

# Reader for @t_logits, the classification-head output tensor produced
# at the end of #build_forward_in_current_ctx.
def t_logits
  @t_logits
end

#t_patch_kernel ⇒ Object

Returns the value of attribute t_patch_kernel.



61
62
63
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 61

# Reader for @t_patch_kernel, the patch-embedding projection weight
# allocated in #realize_for_random_init with (rows, cols) =
# (cfg.d_model, @patch_flat_dim).
def t_patch_kernel
  @t_patch_kernel
end

#t_pos_embed ⇒ Object

Returns the value of attribute t_pos_embed.



61
62
63
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 61

# Reader for @t_pos_embed, the learned positional-embedding tensor
# allocated in #realize_for_random_init with (rows, cols) =
# (@seq_t, cfg.d_model).
def t_pos_embed
  @t_pos_embed
end

#t_w_head ⇒ Object

Returns the value of attribute t_w_head.



61
62
63
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 61

# Reader for @t_w_head, the classification-head weight allocated in
# #realize_for_random_init with (rows, cols) =
# (cfg.num_classes, cfg.d_model).
def t_w_head
  @t_w_head
end

Instance Method Details

#build_forward_in_current_ctx ⇒ Object



206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 206

# Builds the forward graph into the session's current context:
# patch embedding → cls-token prepend → positional embedding →
# cfg.n_layers transformer blocks → final RMSNorm → cls-token pick →
# classification head.
#
# Reads @cfg, @sess, @blocks and the global tensors created by
# #realize_for_random_init. Writes @t_cls_idx (the 1-element i32 index
# input) and @t_logits (the head output).
#
# Returns whatever the final TinyNN.tnn_set_output call returns.
def build_forward_in_current_ctx
  cfg = @cfg
  # patch_embed as a flat linear: t_image ne=[IC*P*P, N_patches],
  # W_patch ne=[IC*P*P, d_model] → matmul gives ne=[d_model, N_patches].
  t_patches = TinyNN.tnn_matmul(@sess, @t_patch_kernel, @t_image)
  # Prepend the cls token by concatenating along the sequence (ne[1]) axis:
  # cls token ne=[d_model, 1] ++ t_patches ne=[d_model, N_patches]
  # → ne=[d_model, 1 + N_patches].
  t_seq = TinyNN.tnn_concat(@sess, @t_cls_token, t_patches, 1)
  # Add the learned positional embedding. pos_embed and t_seq are both
  # [d_model, T], so the add is elementwise and needs no broadcasting.
  t_x = TinyNN.tnn_add(@sess, t_seq, @t_pos_embed)
  TinyNN.tnn_set_output(t_x)

  # Attention scaling factor 1/sqrt(d_head), shared by every block.
  scale = 1.0 / Math.sqrt(cfg.d_head.to_f)
  li = 0
  while li < cfg.n_layers
    t_x = build_vit_block(t_x, @blocks[li], scale)
    li = li + 1
  end

  # Final norm — RMSNorm (see ViTTinyBlockFFI comment).
  t_final = TinyNN.tnn_rms_norm(@sess, t_x, @t_final_ln_gamma, cfg.ln_eps)
  TinyNN.tnn_set_output(t_final)

  # Pick the cls token (sequence position 0). Rather than a 2-D view at
  # offset 0, this uses get_rows with a 1-element index tensor. The index
  # tensor is a graph input kept in @t_cls_idx; per the original note the
  # caller uploads [0] into it for each step.
  t_idx = TinyNN.tnn_input_1d_i32(@sess, 1)
  @t_cls_idx = t_idx
  t_cls_vec = TinyNN.tnn_get_rows(@sess, t_final, t_idx)
  # Head matmul: w_head [num_classes, d_model] · cls [d_model, 1] → [num_classes, 1]
  @t_logits = TinyNN.tnn_matmul(@sess, @t_w_head, t_cls_vec)
  TinyNN.tnn_set_output(@t_logits)
end

#build_training_step ⇒ Object



246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 246

# Rebuilds the full training graph: forward pass, cross-entropy loss,
# backward pass and one AdamW update node per trainable tensor.
#
# Writes @t_labels (one-hot label input, (rows, cols) = (1, num_classes))
# and @t_hp (7-element f32 input handed to every AdamW node).
#
# Returns the loss tensor.
def build_training_step
  TinyNN.tnn_reset_for_rebuild(@sess)
  build_forward_in_current_ctx

  # Graph inputs for the single training image.
  @t_labels = TinyNN.tnn_input_2d_f32(@sess, 1, @cfg.num_classes)
  @t_hp     = TinyNN.tnn_input_1d_f32(@sess, 7)

  loss = TinyNN.tnn_cross_entropy_loss(@sess, @t_logits, @t_labels)
  TinyNN.tnn_set_output(loss)
  TinyNN.tnn_set_loss(loss)

  TinyNN.tnn_build_forward_only(@sess, loss)
  TinyNN.tnn_build_backward(@sess)

  # Optimizer nodes: block parameters first (in layer order), then globals.
  layer = 0
  while layer < @cfg.n_layers
    block = @blocks[layer]
    append_adamw_steps(block.ft_weights, block.ft_m, block.ft_v)
    layer = layer + 1
  end
  append_adamw_steps(@ft_globals_weights, @ft_globals_m, @ft_globals_v)

  TinyNN.tnn_pin_all_graph_b_nodes(@sess)
  TinyNN.tnn_realize_backward(@sess)
  loss
end

# Appends one AdamW opt-step node to the backward graph for every entry of
# `weights`, pairing it with the same-index entries of the two moment
# arrays.
def append_adamw_steps(weights, moments_m, moments_v)
  slot = 0
  while slot < weights.length
    param = weights[slot]
    grad  = TinyNN.tnn_tensor_grad(@sess, param)
    step  = TinyNN.tnn_opt_step_adamw(@sess, param, grad,
                                      moments_m[slot], moments_v[slot], @t_hp)
    TinyNN.tnn_extend_backward_graph(@sess, step)
    slot = slot + 1
  end
end

#build_vit_block(t_x, blk, scale) ⇒ Object



289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 289

# Appends one transformer block to the graph and returns its output tensor.
#
# t_x   — block input activation; the comments below treat it as
#         ne=[d_model, T].
# blk   — per-layer weight record; this method reads t_ln1_gamma, t_w_q,
#         t_w_k, t_w_v (indexed per head), t_w_o, t_ln2_gamma, t_w_up and
#         t_w_down from it.
# scale — multiplier applied to the attention scores before softmax
#         (#build_forward_in_current_ctx passes 1/sqrt(d_head)).
#
# Returns the result of the final residual add.
def build_vit_block(t_x, blk, scale)
  cfg = @cfg
  # Block-1: RMSNorm → MSA → residual (LayerNorm has no ggml backward).
  t_h = TinyNN.tnn_rms_norm(@sess, t_x, blk.t_ln1_gamma, cfg.ln_eps)

  # Per-head Q, K, V over the [d_model, T] activation.
  # For ViT-Tiny: no RoPE, no causal mask — pure scaled dot-product.
  # The array is seeded with one placeholder and popped so it starts empty.
  t_head_outs = [TinyNN.tnn_null_ptr]; t_head_outs.pop
  h = 0
  while h < cfg.n_heads
    t_q = TinyNN.tnn_matmul(@sess, blk.t_w_q[h], t_h)   # [d_head, T]
    t_k = TinyNN.tnn_matmul(@sess, blk.t_w_k[h], t_h)   # [d_head, T]
    t_v = TinyNN.tnn_matmul(@sess, blk.t_w_v[h], t_h)   # [d_head, T]
    # Attention scores: ggml matmul(K, Q) contracts K.ne[0]=d_head with
    # Q.ne[0]=d_head and yields ne=[T, T]. The scores are then scaled and
    # passed through softmax.
    t_scores = TinyNN.tnn_matmul(@sess, t_k, t_q)
    t_scaled = TinyNN.tnn_scale(@sess, t_scores, scale)
    t_attn   = TinyNN.tnn_softmax(@sess, t_scaled)
    # Weighted sum of values. V has ne=[d_head, T] and attn has ne=[T, T];
    # a direct matmul(V, attn) would try to contract d_head against T, so V
    # is transposed to ne=[T, d_head] first. matmul(V_t, attn) then
    # contracts over T and is expected to give ne=[d_head, T].
    # This follows the pattern the original note attributes to
    # build_seq_qhead in the LLM cache path — TODO confirm against it.
    t_v_t = TinyNN.tnn_transpose(@sess, t_v)
    t_head_out = TinyNN.tnn_matmul(@sess, t_v_t, t_attn)
    t_head_outs.push(t_head_out)
    h = h + 1
  end

  # Concat heads along dim 0 (d_head axis): n_heads × [d_head, T] → [d_model, T].
  # Head 0 is the starting value; the loop folds in heads 1..n_heads-1.
  t_concat = t_head_outs[0]
  hc = 1
  while hc < cfg.n_heads
    t_concat = TinyNN.tnn_concat(@sess, t_concat, t_head_outs[hc], 0)
    hc = hc + 1
  end
  # Output projection followed by the first residual connection.
  t_out_proj = TinyNN.tnn_matmul(@sess, blk.t_w_o, t_concat)
  t_x_attn   = TinyNN.tnn_add(@sess, t_x, t_out_proj)

  # Block-2: RMSNorm → MLP(SiLU) → residual.
  # ggml has no GELU backward (only SILU among activations), so we
  # use SiLU here for the smoke; algorithmically this matches a
  # documented ViT variant. timm-loader compat (which assumes GELU)
  # is its own piece — see vendored-backward follow-up issue.
  t_h2  = TinyNN.tnn_rms_norm(@sess, t_x_attn, blk.t_ln2_gamma, cfg.ln_eps)
  t_up  = TinyNN.tnn_matmul(@sess, blk.t_w_up, t_h2)
  t_act = TinyNN.tnn_silu(@sess, t_up)
  t_dn  = TinyNN.tnn_matmul(@sess, blk.t_w_down, t_act)
  # Second residual connection; this add is the block's return value.
  TinyNN.tnn_add(@sess, t_x_attn, t_dn)
end

#ft_add_1d(blk, weight) ⇒ Object



366
367
368
369
370
371
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 366

# Registers a 1-D block weight for training: allocates two persistent
# 1-D f32 tensors of the weight's ne0 length (the Adam m and v slots) and
# appends weight / m / v to the block's parallel ft_* arrays.
#
# Returns blk.ft_v (the result of the last push).
def ft_add_1d(blk, weight)
  length   = TinyNN.tnn_tensor_ne0(weight)
  moment_m = TinyNN.tnn_input_1d_f32_persistent(@sess, length)
  moment_v = TinyNN.tnn_input_1d_f32_persistent(@sess, length)

  blk.ft_weights.push(weight)
  blk.ft_m.push(moment_m)
  blk.ft_v.push(moment_v)
end

#ft_add_2d(blk, weight, rows, cols) ⇒ Object



360
361
362
363
364
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 360

# Registers a 2-D block weight for training: allocates two persistent
# rows×cols f32 tensors (the Adam m and v slots) and appends
# weight / m / v to the block's parallel ft_* arrays.
#
# Returns blk.ft_v (the result of the last push).
def ft_add_2d(blk, weight, rows, cols)
  moment_m = TinyNN.tnn_input_2d_f32_persistent(@sess, rows, cols)
  moment_v = TinyNN.tnn_input_2d_f32_persistent(@sess, rows, cols)

  blk.ft_weights.push(weight)
  blk.ft_m.push(moment_m)
  blk.ft_v.push(moment_v)
end

#ft_add_global_1d(weight) ⇒ Object



351
352
353
354
355
356
357
358
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 351

# Registers a 1-D global (non-block) weight for training: allocates two
# persistent 1-D f32 tensors of the weight's ne0 length (the Adam m and v
# slots) and appends weight / m / v to the @ft_globals_* arrays.
#
# Returns @ft_globals_v (the result of the last push).
def ft_add_global_1d(weight)
  length   = TinyNN.tnn_tensor_ne0(weight)
  moment_m = TinyNN.tnn_input_1d_f32_persistent(@sess, length)
  moment_v = TinyNN.tnn_input_1d_f32_persistent(@sess, length)

  @ft_globals_weights.push(weight)
  @ft_globals_m.push(moment_m)
  @ft_globals_v.push(moment_v)
end

#ft_add_global_2d(weight, rows, cols) ⇒ Object

— bookkeeping helpers (parallel to Toy::LLM::Engine::LlamaSeqEngine) —



343
344
345
346
347
348
349
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 343

# Registers a 2-D global (non-block) weight for training: allocates two
# persistent rows×cols f32 tensors (the Adam m and v slots) and appends
# weight / m / v to the @ft_globals_* arrays.
#
# Returns @ft_globals_v (the result of the last push).
def ft_add_global_2d(weight, rows, cols)
  moment_m = TinyNN.tnn_input_2d_f32_persistent(@sess, rows, cols)
  moment_v = TinyNN.tnn_input_2d_f32_persistent(@sess, rows, cols)

  @ft_globals_weights.push(weight)
  @ft_globals_m.push(moment_m)
  @ft_globals_v.push(moment_v)
end

#ft_name_last(blk, name) ⇒ Object



373
374
375
376
377
378
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 373

# Names the most recently registered block weight and its two Adam
# moment tensors: the weight gets `name`, the moments get name + ".m"
# and name + ".v".
def ft_name_last(blk, name)
  idx      = blk.ft_weights.length - 1
  weight_t = blk.ft_weights[idx]
  m_t      = blk.ft_m[idx]
  v_t      = blk.ft_v[idx]

  TinyNN.tnn_tensor_set_name(weight_t, name)
  TinyNN.tnn_tensor_set_name(m_t, name + ".m")
  TinyNN.tnn_tensor_set_name(v_t, name + ".v")
end

#ft_name_last_global(name) ⇒ Object



380
381
382
383
384
385
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 380

# Names the most recently registered global weight and its two Adam
# moment tensors: the weight gets `name`, the moments get name + ".m"
# and name + ".v".
def ft_name_last_global(name)
  idx      = @ft_globals_weights.length - 1
  weight_t = @ft_globals_weights[idx]
  m_t      = @ft_globals_m[idx]
  v_t      = @ft_globals_v[idx]

  TinyNN.tnn_tensor_set_name(weight_t, name)
  TinyNN.tnn_tensor_set_name(m_t, name + ".m")
  TinyNN.tnn_tensor_set_name(v_t, name + ".v")
end

#realize_for_random_init(cfg, seed, init_scale) ⇒ Object

Allocate every PARAM + Adam moments. Random-init at the end via Box-Muller; the caller can overwrite specific tensors (e.g. patch_embed.proj.weight from a timm donor) before the first step.



81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 81

# Creates the native session, allocates every trainable tensor together
# with its two Adam moment tensors, allocates the image input, finalizes
# the weights and random-initializes everything.
#
# cfg        — configuration object; this method reads image_size,
#              patch_size, num_channels, d_model, d_head, d_ff, n_layers,
#              n_heads and num_classes from it.
# seed       — forwarded to #upload_random_init!.
# init_scale — forwarded to #upload_random_init!.
#
# Side effects: sets @cfg, @n_patches, @seq_t, @sess, @patch_flat_dim,
# every @t_* weight/input tensor, appends to @blocks and @ft_globals_*,
# and finally sets @realized = true.
#
# NOTE(review): nothing here clears @blocks or @ft_globals_* first, so a
# second call on the same instance would append to the existing arrays and
# replace @sess without releasing the old handle — confirm callers only
# realize once.
def realize_for_random_init(cfg, seed, init_scale)
  @cfg = cfg
  # T = number of "tokens" entering the transformer = N_patches + 1 (cls).
  # NOTE(review): assuming image_size and patch_size are Integers, this is
  # integer division, so an image_size that is not a multiple of
  # patch_size is truncated without warning.
  @n_patches = (cfg.image_size / cfg.patch_size) * (cfg.image_size / cfg.patch_size)
  @seq_t     = @n_patches + 1

  @sess = TinyNN.tnn_session_new(0)
  # Per-head decomposition pushes node count up; budget from cfg.
  cap = cfg.n_layers * cfg.n_heads * 1000 + 65536
  TinyNN.tnn_session_set_graph_capacity(@sess, cap)

  # Globals: patch_embed weight, cls_token, pos_embed, final norm, head.
  #
  # patch_embed is implemented as a linear projection over the
  # flattened patch axis (mathematically equivalent to a Conv2d with
  # stride=patch, no overlap, no padding). We don't use ggml_conv_2d
  # in the training path because its internal implementation ends in
  # a `ggml_cont(ggml_permute(...))` and ggml's auto-backward
  # requires gradients into `cont` to be contiguous — the assertion
  # fires at `tnn_build_backward` time. The flat-linear form has a
  # clean matmul backward.
  #
  # Caller responsibility: pre-flatten each patch into
  # [IC*P*P, N_patches] before uploading to t_image. Done host-side
  # (cheap; see prep/smokes/smoke_vit_tiny.rb).
  @patch_flat_dim = cfg.num_channels * cfg.patch_size * cfg.patch_size
  # Each global follows the same three steps: allocate the weight,
  # register it (which also allocates its m/v tensors), then name all three.
  @t_patch_kernel = TinyNN.tnn_input_2d_f32_persistent(@sess,
                      cfg.d_model, @patch_flat_dim)
  ft_add_global_2d(@t_patch_kernel, cfg.d_model, @patch_flat_dim)
  ft_name_last_global("patch_embed.proj.weight")

  @t_cls_token = TinyNN.tnn_input_2d_f32_persistent(@sess, 1, cfg.d_model)
  ft_add_global_2d(@t_cls_token, 1, cfg.d_model)
  ft_name_last_global("cls_token")

  @t_pos_embed = TinyNN.tnn_input_2d_f32_persistent(@sess, @seq_t, cfg.d_model)
  ft_add_global_2d(@t_pos_embed, @seq_t, cfg.d_model)
  ft_name_last_global("pos_embed")

  @t_final_ln_gamma = TinyNN.tnn_input_1d_f32_persistent(@sess, cfg.d_model)
  ft_add_global_1d(@t_final_ln_gamma)
  ft_name_last_global("norm.weight")

  @t_w_head = TinyNN.tnn_input_2d_f32_persistent(@sess, cfg.num_classes, cfg.d_model)
  ft_add_global_2d(@t_w_head, cfg.num_classes, cfg.d_model)
  ft_name_last_global("head.weight")

  # Per-block weights.
  # First create one empty ViTTinyBlockFFI record per layer...
  li_init = 0
  while li_init < cfg.n_layers
    @blocks.push(ViTTinyBlockFFI.new)
    li_init = li_init + 1
  end

  # ...then fill each record with its tensors.
  li = 0
  while li < cfg.n_layers
    blk = @blocks[li]
    # Tensor-name prefix for this layer, e.g. "blk.0."
    prefix = "blk." + li.to_s + "."

    blk.t_ln1_gamma = TinyNN.tnn_input_1d_f32_persistent(@sess, cfg.d_model)
    blk.t_ln2_gamma = TinyNN.tnn_input_1d_f32_persistent(@sess, cfg.d_model)
    ft_add_1d(blk, blk.t_ln1_gamma); ft_name_last(blk, prefix + "ln1.weight")
    ft_add_1d(blk, blk.t_ln2_gamma); ft_name_last(blk, prefix + "ln2.weight")

    # Per-head Q/K/V. n_kv = n_heads (no GQA in ViT-Tiny).
    # Head 0 is allocated inside the array literal and the loop below
    # starts at 1, so each array always holds at least one tensor, even
    # when cfg.n_heads is 0.
    blk.t_w_q = [TinyNN.tnn_input_2d_f32_persistent(@sess, cfg.d_head, cfg.d_model)]
    blk.t_w_k = [TinyNN.tnn_input_2d_f32_persistent(@sess, cfg.d_head, cfg.d_model)]
    blk.t_w_v = [TinyNN.tnn_input_2d_f32_persistent(@sess, cfg.d_head, cfg.d_model)]
    h = 1
    while h < cfg.n_heads
      blk.t_w_q.push(TinyNN.tnn_input_2d_f32_persistent(@sess, cfg.d_head, cfg.d_model))
      blk.t_w_k.push(TinyNN.tnn_input_2d_f32_persistent(@sess, cfg.d_head, cfg.d_model))
      blk.t_w_v.push(TinyNN.tnn_input_2d_f32_persistent(@sess, cfg.d_head, cfg.d_model))
      h = h + 1
    end
    # Register and name the per-head projections in q, k, v order per head.
    h2 = 0
    while h2 < cfg.n_heads
      ft_add_2d(blk, blk.t_w_q[h2], cfg.d_head, cfg.d_model)
      ft_name_last(blk, prefix + "attn_q.head_" + h2.to_s + ".weight")
      ft_add_2d(blk, blk.t_w_k[h2], cfg.d_head, cfg.d_model)
      ft_name_last(blk, prefix + "attn_k.head_" + h2.to_s + ".weight")
      ft_add_2d(blk, blk.t_w_v[h2], cfg.d_head, cfg.d_model)
      ft_name_last(blk, prefix + "attn_v.head_" + h2.to_s + ".weight")
      h2 = h2 + 1
    end

    # Attention output projection and the two MLP matrices.
    blk.t_w_o    = TinyNN.tnn_input_2d_f32_persistent(@sess, cfg.d_model, cfg.d_model)
    blk.t_w_up   = TinyNN.tnn_input_2d_f32_persistent(@sess, cfg.d_ff,    cfg.d_model)
    blk.t_w_down = TinyNN.tnn_input_2d_f32_persistent(@sess, cfg.d_model, cfg.d_ff)
    ft_add_2d(blk, blk.t_w_o,    cfg.d_model, cfg.d_model)
    ft_name_last(blk, prefix + "attn_output.weight")
    ft_add_2d(blk, blk.t_w_up,   cfg.d_ff,    cfg.d_model)
    ft_name_last(blk, prefix + "mlp_up.weight")
    ft_add_2d(blk, blk.t_w_down, cfg.d_model, cfg.d_ff)
    ft_name_last(blk, prefix + "mlp_down.weight")

    # Mark every block weight as PARAM.
    wi = 0
    while wi < blk.ft_weights.length
      TinyNN.tnn_set_param(blk.ft_weights[wi])
      wi = wi + 1
    end
    li = li + 1
  end

  # Mark globals as PARAM.
  gi = 0
  while gi < @ft_globals_weights.length
    TinyNN.tnn_set_param(@ft_globals_weights[gi])
    gi = gi + 1
  end

  # Image input (graph input, NOT a PARAM): pre-flattened patch
  # matrix shape [IC*P*P, N_patches]. Caller does the host-side
  # patch extraction; see prep/smokes/smoke_vit_tiny.rb.
  # It is allocated after the PARAM-marking loops and never passed to
  # ft_add_*, so it gets no Adam moments and no tnn_set_param call.
  @t_image = TinyNN.tnn_input_2d_f32_persistent(@sess,
                @n_patches, @patch_flat_dim)

  TinyNN.tnn_finalize_weights(@sess)

  # Random-init every PARAM tensor.
  upload_random_init!(seed, init_scale)
  @realized = true
end

#upload_constant(tensor, n, val) ⇒ Object



456
457
458
459
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 456

# Fills `tensor` with `n` copies of `val` by uploading a host-side array
# through TinyNN.tnn_upload_from_float_array. Returns that call's result.
def upload_constant(tensor, n, val)
  TinyNN.tnn_upload_from_float_array(@sess, tensor, Array.new(n, val), n)
end

#upload_gaussian(tensor, n, std, state) ⇒ Object



431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 431

# Fills `tensor` with `n` samples drawn from N(0, std^2) and uploads them
# through TinyNN.tnn_upload_from_float_array.
#
# tensor — destination tensor handle.
# n      — number of floats to generate and upload.
# std    — standard deviation applied to each unit-normal sample.
# state  — one-element array holding the xorshift64 state; state[0] is
#          advanced in place so consecutive calls continue the same stream.
#
# Samples come from the cosine branch of Box-Muller over two uniforms per
# sample. xorshift64 maps state 0 to 0 forever, which would make every
# sample identical, so a zero state is replaced by a fixed non-zero
# constant before drawing. Non-zero seeds produce the same stream as before.
#
# Returns the result of the upload call.
def upload_gaussian(tensor, n, std, state)
  # Normalize to 64 bits (a no-op for seeds already in range) and escape
  # the all-zero fixed point.
  seed = state[0] & 0xFFFFFFFFFFFFFFFF
  if seed == 0
    seed = 88172645463325252
  end
  state[0] = seed

  buf = Array.new(n, 0.0)
  i = 0
  while i < n
    u1 = xorshift64_uniform(state)
    u2 = xorshift64_uniform(state)
    # Keep log(u1) finite.
    if u1 < 1.0e-12
      u1 = 1.0e-12
    end
    z = Math.sqrt(-2.0 * Math.log(u1)) * Math.cos(2.0 * Math::PI * u2)
    buf[i] = z * std
    i = i + 1
  end
  TinyNN.tnn_upload_from_float_array(@sess, tensor, buf, n)
end

# Advances the xorshift64 state in state[0] by one step (shifts 13/7/17,
# masked to 64 bits) and returns the low 32 bits as a float in [0, 1).
def xorshift64_uniform(state)
  s = state[0]
  s = (s ^ (s << 13)) & 0xFFFFFFFFFFFFFFFF
  s = (s ^ (s >> 7)) & 0xFFFFFFFFFFFFFFFF
  s = (s ^ (s << 17)) & 0xFFFFFFFFFFFFFFFF
  state[0] = s
  (s & 0xFFFFFFFF).to_f / 4294967296.0
end

#upload_random_init!(seed, init_scale) ⇒ Object



387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 387

# Writes initial values into every tensor allocated by
# #realize_for_random_init.
#
# seed       — initial value of the shared RNG state handed to every
#              #upload_gaussian call.
# init_scale — numerator of the per-matrix std: attention and MLP-up
#              matrices use init_scale / sqrt(d_model), MLP-down uses
#              init_scale / sqrt(d_ff).
#
# Patch kernel, positional embedding and head use a fixed std of 0.02;
# the cls token is zeroed; all norm gains are set to 1.0; every Adam
# moment tensor (block and global) is zeroed.
def upload_random_init!(seed, init_scale)
  cfg       = @cfg
  rng_state = [seed]
  d_model   = cfg.d_model
  attn_std  = init_scale / Math.sqrt(d_model.to_f)

  # Global tensors.
  upload_gaussian(@t_patch_kernel, @patch_flat_dim * d_model, 0.02, rng_state)
  upload_constant(@t_cls_token, d_model, 0.0)
  upload_gaussian(@t_pos_embed, @seq_t * d_model, 0.02, rng_state)
  upload_constant(@t_final_ln_gamma, d_model, 1.0)
  upload_gaussian(@t_w_head, cfg.num_classes * d_model, 0.02, rng_state)

  # Per-layer tensors, in layer order.
  layer = 0
  while layer < cfg.n_layers
    block = @blocks[layer]
    upload_constant(block.t_ln1_gamma, d_model, 1.0)
    upload_constant(block.t_ln2_gamma, d_model, 1.0)

    head = 0
    while head < cfg.n_heads
      head_elems = cfg.d_head * d_model
      upload_gaussian(block.t_w_q[head], head_elems, attn_std, rng_state)
      upload_gaussian(block.t_w_k[head], head_elems, attn_std, rng_state)
      upload_gaussian(block.t_w_v[head], head_elems, attn_std, rng_state)
      head = head + 1
    end

    upload_gaussian(block.t_w_o, d_model * d_model, attn_std, rng_state)
    ff_elems = cfg.d_ff * d_model
    upload_gaussian(block.t_w_up, ff_elems, attn_std, rng_state)
    down_std = init_scale / Math.sqrt(cfg.d_ff.to_f)
    upload_gaussian(block.t_w_down, ff_elems, down_std, rng_state)

    # Adam moments for this block start at zero.
    slot = 0
    while slot < block.ft_m.length
      zero_tensor(block.ft_m[slot])
      zero_tensor(block.ft_v[slot])
      slot = slot + 1
    end
    layer = layer + 1
  end

  # Adam moments for the global tensors start at zero.
  slot = 0
  while slot < @ft_globals_m.length
    zero_tensor(@ft_globals_m[slot])
    zero_tensor(@ft_globals_v[slot])
    slot = slot + 1
  end
end

#zero_tensor(tensor) ⇒ Object



461
462
463
464
465
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 461

# Overwrites every element of `tensor` with 0.0. The element count comes
# from TinyNN.tnn_tensor_nelements; a zero-filled host array of that size
# is then uploaded. Returns the upload call's result.
def zero_tensor(tensor)
  count = TinyNN.tnn_tensor_nelements(tensor)
  TinyNN.tnn_upload_from_float_array(@sess, tensor, Array.new(count, 0.0), count)
end