Class: Toy::LLM::Engine::ViTTinyEngine
- Inherits:
-
Object
- Object
- Toy::LLM::Engine::ViTTinyEngine
- Defined in:
- lib/toy/llm/engine/vit_tiny_engine.rb
Instance Attribute Summary collapse
-
#blocks ⇒ Object
Returns the value of attribute blocks.
-
#cfg ⇒ Object
Returns the value of attribute cfg.
-
#ft_globals_m ⇒ Object
Returns the value of attribute ft_globals_m.
-
#ft_globals_v ⇒ Object
Returns the value of attribute ft_globals_v.
-
#ft_globals_weights ⇒ Object
Returns the value of attribute ft_globals_weights.
-
#n_patches ⇒ Object
Returns the value of attribute n_patches.
-
#realized ⇒ Object
Returns the value of attribute realized.
-
#seq_t ⇒ Object
Returns the value of attribute seq_t.
-
#sess ⇒ Object
Returns the value of attribute sess.
-
#t_cls_idx ⇒ Object
Returns the value of attribute t_cls_idx.
-
#t_cls_token ⇒ Object
Returns the value of attribute t_cls_token.
-
#t_final_ln_gamma ⇒ Object
Returns the value of attribute t_final_ln_gamma.
-
#t_hp ⇒ Object
Returns the value of attribute t_hp.
-
#t_image ⇒ Object
Returns the value of attribute t_image.
-
#t_labels ⇒ Object
Returns the value of attribute t_labels.
-
#t_logits ⇒ Object
Returns the value of attribute t_logits.
-
#t_patch_kernel ⇒ Object
Returns the value of attribute t_patch_kernel.
-
#t_pos_embed ⇒ Object
Returns the value of attribute t_pos_embed.
-
#t_w_head ⇒ Object
Returns the value of attribute t_w_head.
Instance Method Summary collapse
- #build_forward_in_current_ctx ⇒ Object
- #build_training_step ⇒ Object
- #build_vit_block(t_x, blk, scale) ⇒ Object
- #ft_add_1d(blk, weight) ⇒ Object
- #ft_add_2d(blk, weight, rows, cols) ⇒ Object
- #ft_add_global_1d(weight) ⇒ Object
-
#ft_add_global_2d(weight, rows, cols) ⇒ Object
— bookkeeping helpers (parallel to Toy::LLM::Engine::LlamaSeqEngine) —.
- #ft_name_last(blk, name) ⇒ Object
- #ft_name_last_global(name) ⇒ Object
-
#initialize ⇒ ViTTinyEngine
constructor
A new instance of ViTTinyEngine.
-
#realize_for_random_init(cfg, seed, init_scale) ⇒ Object
Allocate every PARAM + Adam moments.
- #upload_constant(tensor, n, val) ⇒ Object
- #upload_gaussian(tensor, n, std, state) ⇒ Object
- #upload_random_init!(seed, init_scale) ⇒ Object
- #zero_tensor(tensor) ⇒ Object
Constructor Details
#initialize ⇒ ViTTinyEngine
Returns a new instance of ViTTinyEngine.
69 70 71 72 73 74 75 76 |
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 69

# Creates an unrealized engine: a null session handle, empty bookkeeping
# lists for the global weights and their two Adam-moment tensors, an empty
# block list, and the realized flag cleared.
#
# Each list is created with one sample element that is popped straight away,
# leaving it empty.
# NOTE(review): presumably the sample element exists to fix the array's
# element type for the toolchain — confirm before simplifying to `[]`.
def initialize
  @sess = TinyNN.tnn_null_ptr

  @ft_globals_weights = [TinyNN.tnn_null_ptr]
  @ft_globals_weights.pop

  @ft_globals_m = [TinyNN.tnn_null_ptr]
  @ft_globals_m.pop

  @ft_globals_v = [TinyNN.tnn_null_ptr]
  @ft_globals_v.pop

  @blocks = [ViTTinyBlockFFI.new]
  @blocks.pop

  @realized = false
end
Instance Attribute Details
#blocks ⇒ Object
Returns the value of attribute blocks.
61 62 63 |
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 61 def blocks @blocks end |
#cfg ⇒ Object
Returns the value of attribute cfg.
61 62 63 |
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 61 def cfg @cfg end |
#ft_globals_m ⇒ Object
Returns the value of attribute ft_globals_m.
61 62 63 |
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 61 def ft_globals_m @ft_globals_m end |
#ft_globals_v ⇒ Object
Returns the value of attribute ft_globals_v.
61 62 63 |
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 61 def ft_globals_v @ft_globals_v end |
#ft_globals_weights ⇒ Object
Returns the value of attribute ft_globals_weights.
61 62 63 |
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 61 def ft_globals_weights @ft_globals_weights end |
#n_patches ⇒ Object
Returns the value of attribute n_patches.
61 62 63 |
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 61 def n_patches @n_patches end |
#realized ⇒ Object
Returns the value of attribute realized.
61 62 63 |
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 61 def realized @realized end |
#seq_t ⇒ Object
Returns the value of attribute seq_t.
61 62 63 |
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 61 def seq_t @seq_t end |
#sess ⇒ Object
Returns the value of attribute sess.
61 62 63 |
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 61 def sess @sess end |
#t_cls_idx ⇒ Object
Returns the value of attribute t_cls_idx.
244 245 246 |
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 244 def t_cls_idx @t_cls_idx end |
#t_cls_token ⇒ Object
Returns the value of attribute t_cls_token.
61 62 63 |
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 61 def t_cls_token @t_cls_token end |
#t_final_ln_gamma ⇒ Object
Returns the value of attribute t_final_ln_gamma.
61 62 63 |
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 61 def t_final_ln_gamma @t_final_ln_gamma end |
#t_hp ⇒ Object
Returns the value of attribute t_hp.
244 245 246 |
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 244 def t_hp @t_hp end |
#t_image ⇒ Object
Returns the value of attribute t_image.
61 62 63 |
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 61 def t_image @t_image end |
#t_labels ⇒ Object
Returns the value of attribute t_labels.
244 245 246 |
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 244 def t_labels @t_labels end |
#t_logits ⇒ Object
Returns the value of attribute t_logits.
61 62 63 |
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 61 def t_logits @t_logits end |
#t_patch_kernel ⇒ Object
Returns the value of attribute t_patch_kernel.
61 62 63 |
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 61 def t_patch_kernel @t_patch_kernel end |
#t_pos_embed ⇒ Object
Returns the value of attribute t_pos_embed.
61 62 63 |
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 61 def t_pos_embed @t_pos_embed end |
#t_w_head ⇒ Object
Returns the value of attribute t_w_head.
61 62 63 |
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 61 def t_w_head @t_w_head end |
Instance Method Details
#build_forward_in_current_ctx ⇒ Object
206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 |
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 206

# Builds the forward graph in the session's current context and stores the
# logits tensor in @t_logits.
#
# Pipeline: flat-linear patch embedding -> prepend cls token -> add learned
# position embedding -> cfg.n_layers transformer blocks -> final RMSNorm ->
# pick the cls column -> classification head.
#
# Side effects: sets @t_cls_idx (a 1-element i32 input that selects the cls
# column; per the original note it is uploaded as [0] on each step) and
# @t_logits. Returns whatever the last tnn_set_output call returns.
def build_forward_in_current_ctx
  cfg = @cfg

  # patch_embed as a flat linear: t_image ne=[IC*P*P, N_patches],
  # W_patch ne=[IC*P*P, d_model] -> matmul gives ne=[d_model, N_patches].
  t_patches = TinyNN.tnn_matmul(@sess, @t_patch_kernel, @t_image)

  # Prepend the cls token by concatenating along the sequence (ne[1]) axis:
  # cls ne=[d_model, 1] ++ patches ne=[d_model, N_patches]
  #   -> ne=[d_model, 1 + N_patches].
  t_seq = TinyNN.tnn_concat(@sess, @t_cls_token, t_patches, 1)

  # Add the learned position embedding. No broadcasting is involved:
  # pos_embed and t_seq are both [d_model, T].
  t_x = TinyNN.tnn_add(@sess, t_seq, @t_pos_embed)
  TinyNN.tnn_set_output(t_x)

  # Attention scaling factor 1/sqrt(d_head), shared by every block.
  scale = 1.0 / Math.sqrt(cfg.d_head.to_f)
  li = 0
  while li < cfg.n_layers
    t_x = build_vit_block(t_x, @blocks[li], scale)
    li = li + 1
  end

  # Final norm — RMSNorm (see ViTTinyBlockFFI comment).
  t_final = TinyNN.tnn_rms_norm(@sess, t_x, @t_final_ln_gamma, cfg.ln_eps)
  TinyNN.tnn_set_output(t_final)

  # Take the cls token. Instead of a 2-D view/slice, use get_rows with a
  # 1-element index input to pick column 0. The index value itself is not
  # baked into the graph; it is supplied through @t_cls_idx at step time.
  t_idx = TinyNN.tnn_input_1d_i32(@sess, 1)
  @t_cls_idx = t_idx
  t_cls_vec = TinyNN.tnn_get_rows(@sess, t_final, t_idx)

  # Head matmul: w_head [num_classes, d_model] · cls [d_model, 1]
  #   -> [num_classes, 1].
  @t_logits = TinyNN.tnn_matmul(@sess, @t_w_head, t_cls_vec)
  TinyNN.tnn_set_output(@t_logits)
end
#build_training_step ⇒ Object
246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 |
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 246

# Rebuilds the full training graph and returns the loss tensor.
#
# Steps, in order: reset the session for a rebuild, build the forward graph,
# create the label and hyper-parameter inputs, attach the cross-entropy loss,
# build the forward-only and backward graphs, append one AdamW opt-step per
# tracked weight (block weights first, then globals), pin the backward-graph
# nodes and realize the backward graph.
#
# Side effects: sets @t_labels and @t_hp (plus everything
# build_forward_in_current_ctx sets).
def build_training_step
  TinyNN.tnn_reset_for_rebuild(@sess)
  build_forward_in_current_ctx

  # Labels [num_classes, 1] one-hot for the single image.
  @t_labels = TinyNN.tnn_input_2d_f32(@sess, 1, @cfg.num_classes)
  # 7-element f32 hyper-parameter input handed to every AdamW opt-step.
  # NOTE(review): the meaning of the 7 slots is not visible here — confirm
  # against the caller that uploads it.
  @t_hp = TinyNN.tnn_input_1d_f32(@sess, 7)

  t_loss = TinyNN.tnn_cross_entropy_loss(@sess, @t_logits, @t_labels)
  TinyNN.tnn_set_output(t_loss)
  TinyNN.tnn_set_loss(t_loss)

  TinyNN.tnn_build_forward_only(@sess, t_loss)
  TinyNN.tnn_build_backward(@sess)

  # AdamW opt_step on every PARAM: per-block weights first...
  layer = 0
  while layer < @cfg.n_layers
    block = @blocks[layer]
    slot = 0
    while slot < block.ft_weights.length
      t_weight = block.ft_weights[slot]
      t_grad = TinyNN.tnn_tensor_grad(@sess, t_weight)
      t_step = TinyNN.tnn_opt_step_adamw(@sess, t_weight, t_grad,
                                         block.ft_m[slot], block.ft_v[slot], @t_hp)
      TinyNN.tnn_extend_backward_graph(@sess, t_step)
      slot = slot + 1
    end
    layer = layer + 1
  end

  # ...then the global weights.
  g = 0
  while g < @ft_globals_weights.length
    t_weight = @ft_globals_weights[g]
    t_grad = TinyNN.tnn_tensor_grad(@sess, t_weight)
    t_step = TinyNN.tnn_opt_step_adamw(@sess, t_weight, t_grad,
                                       @ft_globals_m[g], @ft_globals_v[g], @t_hp)
    TinyNN.tnn_extend_backward_graph(@sess, t_step)
    g = g + 1
  end

  TinyNN.tnn_pin_all_graph_b_nodes(@sess)
  TinyNN.tnn_realize_backward(@sess)
  t_loss
end
#build_vit_block(t_x, blk, scale) ⇒ Object
289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 |
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 289

# Appends one transformer block to the graph and returns its output tensor.
#
# t_x   - block input activation, [d_model, T] per the surrounding comments.
# blk   - the block's weight holder (reads t_ln1_gamma, t_w_q/k/v per head,
#         t_w_o, t_ln2_gamma, t_w_up, t_w_down).
# scale - factor applied to the attention scores before softmax.
#
# Structure: RMSNorm -> per-head attention -> output projection -> residual,
# then RMSNorm -> MLP with SiLU -> residual.
def build_vit_block(t_x, blk, scale)
  cfg = @cfg
  n_heads = cfg.n_heads

  # Sub-block 1: RMSNorm -> MSA -> residual (LayerNorm has no ggml backward).
  t_norm1 = TinyNN.tnn_rms_norm(@sess, t_x, blk.t_ln1_gamma, cfg.ln_eps)

  # Per-head Q, K, V over the [d_model, T] activation.
  # For ViT-Tiny: no RoPE, no causal mask — pure scaled dot-product.
  head_outputs = [TinyNN.tnn_null_ptr]
  head_outputs.pop
  head = 0
  while head < n_heads
    t_q = TinyNN.tnn_matmul(@sess, blk.t_w_q[head], t_norm1)  # [d_head, T]
    t_k = TinyNN.tnn_matmul(@sess, blk.t_w_k[head], t_norm1)  # [d_head, T]
    t_v = TinyNN.tnn_matmul(@sess, blk.t_w_v[head], t_norm1)  # [d_head, T]

    # Scores: matmul(K, Q) contracts the d_head axis (ne[0] of both)
    # and gives ne=[T, T]; scale, then softmax.
    t_scores = TinyNN.tnn_matmul(@sess, t_k, t_q)
    t_scaled = TinyNN.tnn_scale(@sess, t_scores, scale)
    t_attn = TinyNN.tnn_softmax(@sess, t_scaled)

    # Apply attention to V. matmul contracts ne[0], and V.ne[0]=d_head does
    # not match attn.ne[0]=T, so V is transposed first. Per the original
    # note this is the same pattern as the LLM path (build_seq_qhead):
    # matmul(transpose(V), attn) gives ne=[d_head, T].
    t_v_t = TinyNN.tnn_transpose(@sess, t_v)
    head_outputs.push(TinyNN.tnn_matmul(@sess, t_v_t, t_attn))
    head = head + 1
  end

  # Concat heads along dim 0 (d_head axis): n_heads x [d_head, T] -> [d_model, T].
  t_heads = head_outputs[0]
  nxt = 1
  while nxt < n_heads
    t_heads = TinyNN.tnn_concat(@sess, t_heads, head_outputs[nxt], 0)
    nxt = nxt + 1
  end
  t_attn_proj = TinyNN.tnn_matmul(@sess, blk.t_w_o, t_heads)
  t_resid1 = TinyNN.tnn_add(@sess, t_x, t_attn_proj)

  # Sub-block 2: RMSNorm -> MLP(SiLU) -> residual.
  # ggml has no GELU backward (only SILU among activations), so SiLU is used
  # here for the smoke; algorithmically this matches a documented ViT
  # variant. timm-loader compat (which assumes GELU) is its own piece — see
  # the vendored-backward follow-up issue.
  t_norm2 = TinyNN.tnn_rms_norm(@sess, t_resid1, blk.t_ln2_gamma, cfg.ln_eps)
  t_up = TinyNN.tnn_matmul(@sess, blk.t_w_up, t_norm2)
  t_act = TinyNN.tnn_silu(@sess, t_up)
  t_down = TinyNN.tnn_matmul(@sess, blk.t_w_down, t_act)
  TinyNN.tnn_add(@sess, t_resid1, t_down)
end
#ft_add_1d(blk, weight) ⇒ Object
366 367 368 369 370 371 |
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 366

# Registers a 1-D block weight for training: allocates two persistent 1-D
# f32 tensors (the Adam m and v moments) with the weight's ne0 length and
# appends weight, m and v to the block's parallel ft_weights / ft_m / ft_v
# lists.
def ft_add_1d(blk, weight)
  length = TinyNN.tnn_tensor_ne0(weight)
  moment_m = TinyNN.tnn_input_1d_f32_persistent(@sess, length)
  moment_v = TinyNN.tnn_input_1d_f32_persistent(@sess, length)
  blk.ft_weights.push(weight)
  blk.ft_m.push(moment_m)
  blk.ft_v.push(moment_v)
end
#ft_add_2d(blk, weight, rows, cols) ⇒ Object
360 361 362 363 364 |
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 360

# Registers a 2-D block weight for training: allocates two persistent 2-D
# f32 tensors (the Adam m and v moments) using the given rows/cols and
# appends weight, m and v to the block's parallel ft_weights / ft_m / ft_v
# lists. rows and cols are passed through unchanged and are expected to
# match the weight's own allocation.
def ft_add_2d(blk, weight, rows, cols)
  moment_m = TinyNN.tnn_input_2d_f32_persistent(@sess, rows, cols)
  moment_v = TinyNN.tnn_input_2d_f32_persistent(@sess, rows, cols)
  blk.ft_weights.push(weight)
  blk.ft_m.push(moment_m)
  blk.ft_v.push(moment_v)
end
#ft_add_global_1d(weight) ⇒ Object
351 352 353 354 355 356 357 358 |
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 351

# Registers a 1-D global (non-block) weight for training: allocates two
# persistent 1-D f32 moment tensors with the weight's ne0 length and appends
# weight, m and v to @ft_globals_weights / @ft_globals_m / @ft_globals_v.
def ft_add_global_1d(weight)
  length = TinyNN.tnn_tensor_ne0(weight)
  moment_m = TinyNN.tnn_input_1d_f32_persistent(@sess, length)
  moment_v = TinyNN.tnn_input_1d_f32_persistent(@sess, length)

  @ft_globals_weights.push(weight)
  @ft_globals_m.push(moment_m)
  @ft_globals_v.push(moment_v)
end
#ft_add_global_2d(weight, rows, cols) ⇒ Object
— bookkeeping helpers (parallel to Toy::LLM::Engine::LlamaSeqEngine) —
343 344 345 346 347 348 349 |
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 343

# Registers a 2-D global (non-block) weight for training: allocates two
# persistent 2-D f32 moment tensors using the given rows/cols and appends
# weight, m and v to @ft_globals_weights / @ft_globals_m / @ft_globals_v.
# rows and cols are passed through unchanged and are expected to match the
# weight's own allocation.
def ft_add_global_2d(weight, rows, cols)
  moment_m = TinyNN.tnn_input_2d_f32_persistent(@sess, rows, cols)
  moment_v = TinyNN.tnn_input_2d_f32_persistent(@sess, rows, cols)

  @ft_globals_weights.push(weight)
  @ft_globals_m.push(moment_m)
  @ft_globals_v.push(moment_v)
end
#ft_name_last(blk, name) ⇒ Object
373 374 375 376 377 378 |
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 373

# Names the most recently registered block weight and its moment tensors:
# the weight gets `name`, its m moment `name + ".m"`, its v moment
# `name + ".v"`. Intended to be called right after ft_add_1d / ft_add_2d.
def ft_name_last(blk, name)
  newest = blk.ft_weights.length - 1
  t_weight = blk.ft_weights[newest]
  TinyNN.tnn_tensor_set_name(t_weight, name)
  t_m = blk.ft_m[newest]
  TinyNN.tnn_tensor_set_name(t_m, name + ".m")
  t_v = blk.ft_v[newest]
  TinyNN.tnn_tensor_set_name(t_v, name + ".v")
end
#ft_name_last_global(name) ⇒ Object
380 381 382 383 384 385 |
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 380

# Names the most recently registered global weight and its moment tensors:
# the weight gets `name`, its m moment `name + ".m"`, its v moment
# `name + ".v"`. Intended to be called right after ft_add_global_1d /
# ft_add_global_2d.
def ft_name_last_global(name)
  newest = @ft_globals_weights.length - 1
  t_weight = @ft_globals_weights[newest]
  TinyNN.tnn_tensor_set_name(t_weight, name)
  t_m = @ft_globals_m[newest]
  TinyNN.tnn_tensor_set_name(t_m, name + ".m")
  t_v = @ft_globals_v[newest]
  TinyNN.tnn_tensor_set_name(t_v, name + ".v")
end
#realize_for_random_init(cfg, seed, init_scale) ⇒ Object
Allocate every PARAM + Adam moments. Random-init at the end via Box-Muller; the caller can overwrite specific tensors (e.g. patch_embed.proj.weight from a timm donor) before the first step.
81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 |
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 81

# Creates the session, allocates every PARAM tensor together with its Adam
# moments, allocates the image input, finalizes the weights and random-inits
# everything via upload_random_init!.
#
# cfg        - model configuration; reads image_size, patch_size,
#              num_channels, d_model, d_head, d_ff, n_heads, n_layers and
#              num_classes.
# seed       - RNG seed forwarded to upload_random_init!.
# init_scale - init scale forwarded to upload_random_init!.
#
# The caller can overwrite specific tensors (e.g. patch_embed.proj.weight
# from a timm donor) before the first step. Sets @realized to true.
def realize_for_random_init(cfg, seed, init_scale)
  @cfg = cfg
  d_model = cfg.d_model
  d_head = cfg.d_head
  d_ff = cfg.d_ff
  n_heads = cfg.n_heads
  n_layers = cfg.n_layers

  # T = number of "tokens" entering the transformer = N_patches + 1 (cls).
  patches_per_side = cfg.image_size / cfg.patch_size
  @n_patches = patches_per_side * patches_per_side
  @seq_t = @n_patches + 1

  @sess = TinyNN.tnn_session_new(0)
  # Per-head decomposition pushes node count up; budget from cfg.
  graph_capacity = n_layers * n_heads * 1000 + 65536
  TinyNN.tnn_session_set_graph_capacity(@sess, graph_capacity)

  # Globals: patch_embed weight, cls_token, pos_embed, final norm, head.
  #
  # patch_embed is a linear projection over the flattened patch axis
  # (mathematically equivalent to a Conv2d with stride=patch, no overlap,
  # no padding). ggml_conv_2d is avoided in the training path because its
  # internal implementation ends in `ggml_cont(ggml_permute(...))` and
  # ggml's auto-backward requires gradients into `cont` to be contiguous —
  # the assertion fires at `tnn_build_backward` time. The flat-linear form
  # has a clean matmul backward.
  #
  # Caller responsibility: pre-flatten each patch into
  # [IC*P*P, N_patches] before uploading to t_image. Done host-side
  # (cheap; see prep/smokes/smoke_vit_tiny.rb).
  @patch_flat_dim = cfg.num_channels * cfg.patch_size * cfg.patch_size
  @t_patch_kernel = TinyNN.tnn_input_2d_f32_persistent(@sess, d_model, @patch_flat_dim)
  ft_add_global_2d(@t_patch_kernel, d_model, @patch_flat_dim)
  ft_name_last_global("patch_embed.proj.weight")

  @t_cls_token = TinyNN.tnn_input_2d_f32_persistent(@sess, 1, d_model)
  ft_add_global_2d(@t_cls_token, 1, d_model)
  ft_name_last_global("cls_token")

  @t_pos_embed = TinyNN.tnn_input_2d_f32_persistent(@sess, @seq_t, d_model)
  ft_add_global_2d(@t_pos_embed, @seq_t, d_model)
  ft_name_last_global("pos_embed")

  @t_final_ln_gamma = TinyNN.tnn_input_1d_f32_persistent(@sess, d_model)
  ft_add_global_1d(@t_final_ln_gamma)
  ft_name_last_global("norm.weight")

  @t_w_head = TinyNN.tnn_input_2d_f32_persistent(@sess, cfg.num_classes, d_model)
  ft_add_global_2d(@t_w_head, cfg.num_classes, d_model)
  ft_name_last_global("head.weight")

  # Per-block weights: create the holders first, then fill each one.
  made = 0
  while made < n_layers
    @blocks.push(ViTTinyBlockFFI.new)
    made = made + 1
  end

  layer = 0
  while layer < n_layers
    blk = @blocks[layer]
    prefix = "blk." + layer.to_s + "."

    blk.t_ln1_gamma = TinyNN.tnn_input_1d_f32_persistent(@sess, d_model)
    blk.t_ln2_gamma = TinyNN.tnn_input_1d_f32_persistent(@sess, d_model)
    ft_add_1d(blk, blk.t_ln1_gamma)
    ft_name_last(blk, prefix + "ln1.weight")
    ft_add_1d(blk, blk.t_ln2_gamma)
    ft_name_last(blk, prefix + "ln2.weight")

    # Per-head Q/K/V. n_kv = n_heads (no GQA in ViT-Tiny).
    # Head 0 seeds each list; the remaining heads are pushed after it.
    blk.t_w_q = [TinyNN.tnn_input_2d_f32_persistent(@sess, d_head, d_model)]
    blk.t_w_k = [TinyNN.tnn_input_2d_f32_persistent(@sess, d_head, d_model)]
    blk.t_w_v = [TinyNN.tnn_input_2d_f32_persistent(@sess, d_head, d_model)]
    head = 1
    while head < n_heads
      blk.t_w_q.push(TinyNN.tnn_input_2d_f32_persistent(@sess, d_head, d_model))
      blk.t_w_k.push(TinyNN.tnn_input_2d_f32_persistent(@sess, d_head, d_model))
      blk.t_w_v.push(TinyNN.tnn_input_2d_f32_persistent(@sess, d_head, d_model))
      head = head + 1
    end

    reg = 0
    while reg < n_heads
      head_tag = reg.to_s
      ft_add_2d(blk, blk.t_w_q[reg], d_head, d_model)
      ft_name_last(blk, prefix + "attn_q.head_" + head_tag + ".weight")
      ft_add_2d(blk, blk.t_w_k[reg], d_head, d_model)
      ft_name_last(blk, prefix + "attn_k.head_" + head_tag + ".weight")
      ft_add_2d(blk, blk.t_w_v[reg], d_head, d_model)
      ft_name_last(blk, prefix + "attn_v.head_" + head_tag + ".weight")
      reg = reg + 1
    end

    blk.t_w_o = TinyNN.tnn_input_2d_f32_persistent(@sess, d_model, d_model)
    blk.t_w_up = TinyNN.tnn_input_2d_f32_persistent(@sess, d_ff, d_model)
    blk.t_w_down = TinyNN.tnn_input_2d_f32_persistent(@sess, d_model, d_ff)
    ft_add_2d(blk, blk.t_w_o, d_model, d_model)
    ft_name_last(blk, prefix + "attn_output.weight")
    ft_add_2d(blk, blk.t_w_up, d_ff, d_model)
    ft_name_last(blk, prefix + "mlp_up.weight")
    ft_add_2d(blk, blk.t_w_down, d_model, d_ff)
    ft_name_last(blk, prefix + "mlp_down.weight")

    # Mark every block weight as PARAM.
    slot = 0
    while slot < blk.ft_weights.length
      TinyNN.tnn_set_param(blk.ft_weights[slot])
      slot = slot + 1
    end

    layer = layer + 1
  end

  # Mark globals as PARAM.
  g = 0
  while g < @ft_globals_weights.length
    TinyNN.tnn_set_param(@ft_globals_weights[g])
    g = g + 1
  end

  # Image input (graph input, NOT a PARAM): pre-flattened patch
  # matrix shape [IC*P*P, N_patches]. Caller does the host-side
  # patch extraction; see prep/smokes/smoke_vit_tiny.rb.
  @t_image = TinyNN.tnn_input_2d_f32_persistent(@sess, @n_patches, @patch_flat_dim)

  TinyNN.tnn_finalize_weights(@sess)

  # Random-init every PARAM tensor.
  upload_random_init!(seed, init_scale)
  @realized = true
end
#upload_constant(tensor, n, val) ⇒ Object
456 457 458 459 |
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 456

# Uploads n copies of `val` into `tensor`.
#
# tensor - destination tensor handle.
# n      - number of elements to upload.
# val    - the fill value.
def upload_constant(tensor, n, val)
  filled = Array.new(n, val)
  TinyNN.tnn_upload_from_float_array(@sess, tensor, filled, n)
end
#upload_gaussian(tensor, n, std, state) ⇒ Object
431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 |
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 431

# Fills `tensor` with n zero-mean Gaussian samples of standard deviation
# `std` and uploads them.
#
# tensor - destination tensor handle.
# n      - number of samples to generate and upload.
# std    - standard deviation applied to each unit-normal sample.
# state  - one-element array holding the xorshift64 state; state[0] is
#          advanced in place (two steps per sample) so consecutive calls
#          continue one random stream.
#
# Samples are produced with Box-Muller over a xorshift64 stream (shifts
# 13 / 7 / 17, masked to 64 bits).
#
# A state whose low 64 bits are all zero is a fixed point of xorshift: every
# draw would be 0 and every sample the same constant. Such a state is
# replaced by a fixed non-zero constant before sampling. Every other seed
# yields exactly the stream it produced before this guard existed.
def upload_gaussian(tensor, n, std, state)
  # Guard against the degenerate all-zero xorshift state (e.g. seed 0).
  if (state[0] & 0xFFFFFFFFFFFFFFFF) == 0
    state[0] = 0x9E3779B97F4A7C15
  end

  buf = Array.new(n, 0.0)
  i = 0
  while i < n
    # First xorshift64 step -> u1 from the low 32 bits, in [0, 1).
    s = state[0]
    s = s ^ (s << 13); s = s & 0xFFFFFFFFFFFFFFFF
    s = s ^ (s >> 7);  s = s & 0xFFFFFFFFFFFFFFFF
    s = s ^ (s << 17); s = s & 0xFFFFFFFFFFFFFFFF
    state[0] = s
    u1 = (s & 0xFFFFFFFF).to_f / 4294967296.0

    # Second xorshift64 step -> u2.
    s = state[0]
    s = s ^ (s << 13); s = s & 0xFFFFFFFFFFFFFFFF
    s = s ^ (s >> 7);  s = s & 0xFFFFFFFFFFFFFFFF
    s = s ^ (s << 17); s = s & 0xFFFFFFFFFFFFFFFF
    state[0] = s
    u2 = (s & 0xFFFFFFFF).to_f / 4294967296.0

    # Clamp u1 away from 0 so Math.log stays finite.
    if u1 < 1.0e-12; u1 = 1.0e-12; end
    z = Math.sqrt(-2.0 * Math.log(u1)) * Math.cos(2.0 * Math::PI * u2)
    buf[i] = z * std
    i = i + 1
  end
  TinyNN.tnn_upload_from_float_array(@sess, tensor, buf, n)
end
#upload_random_init!(seed, init_scale) ⇒ Object
387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 |
# File 'lib/toy/llm/engine/vit_tiny_engine.rb', line 387

# Uploads initial values for every PARAM tensor and zeroes all Adam moments.
#
# seed       - initial value of the shared RNG state; one stream is threaded
#              through every upload_gaussian call, so the upload order below
#              determines which values each tensor receives.
# init_scale - numerator of the per-layer std: init_scale / sqrt(d_model)
#              for Q/K/V/O/up, init_scale / sqrt(d_ff) for the down
#              projection.
#
# Fixed choices visible here: patch kernel, pos_embed and head use
# std 0.02; cls_token is all zeros; every norm gamma is all ones.
def upload_random_init!(seed, init_scale)
  state = [seed]
  cfg = @cfg
  d_model = cfg.d_model
  std_model = init_scale / Math.sqrt(d_model.to_f)
  std_ff = init_scale / Math.sqrt(cfg.d_ff.to_f)
  qkv_count = cfg.d_head * d_model

  # patch_embed.proj.weight (linear over flat patches): Gaussian.
  upload_gaussian(@t_patch_kernel, @patch_flat_dim * d_model, 0.02, state)
  upload_constant(@t_cls_token, d_model, 0.0)
  upload_gaussian(@t_pos_embed, @seq_t * d_model, 0.02, state)
  upload_constant(@t_final_ln_gamma, d_model, 1.0)
  upload_gaussian(@t_w_head, cfg.num_classes * d_model, 0.02, state)

  layer = 0
  while layer < cfg.n_layers
    blk = @blocks[layer]
    upload_constant(blk.t_ln1_gamma, d_model, 1.0)
    upload_constant(blk.t_ln2_gamma, d_model, 1.0)

    head = 0
    while head < cfg.n_heads
      upload_gaussian(blk.t_w_q[head], qkv_count, std_model, state)
      upload_gaussian(blk.t_w_k[head], qkv_count, std_model, state)
      upload_gaussian(blk.t_w_v[head], qkv_count, std_model, state)
      head = head + 1
    end

    upload_gaussian(blk.t_w_o, d_model * d_model, std_model, state)
    upload_gaussian(blk.t_w_up, cfg.d_ff * d_model, std_model, state)
    upload_gaussian(blk.t_w_down, d_model * cfg.d_ff, std_ff, state)

    # Zero-init Adam moments (m, v) for this block.
    slot = 0
    while slot < blk.ft_m.length
      zero_tensor(blk.ft_m[slot])
      zero_tensor(blk.ft_v[slot])
      slot = slot + 1
    end

    layer = layer + 1
  end

  # Zero-init Adam moments of the global weights.
  g = 0
  while g < @ft_globals_m.length
    zero_tensor(@ft_globals_m[g])
    zero_tensor(@ft_globals_v[g])
    g = g + 1
  end
end