Class: Toy::LLM::Blocks::TransformerBlock

Inherits:
Object
  • Object
show all
Defined in:
lib/toy/llm/blocks/transformer_block.rb,
lib/toy/llm/blocks/transformer_block_cuda.rb,
lib/toy/llm/blocks/transformer_block_metal.rb

Overview

Per-block tensor handles. Field names are UNCHANGED from the former LlamaSeqBlockFFI so the cache-side realize / train / tap walkers keep working by accessor name.

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ TransformerBlock

Returns a new instance of TransformerBlock.



107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# File 'lib/toy/llm/blocks/transformer_block.rb', line 107

# Creates a block with no real tensors attached: every handle slot is seeded
# with TinyNN.tnn_null_ptr. The alloc_* methods later overwrite the t_seq_*
# slots through the setters, and build_forward assigns the tap_* slots.
def initialize
  # RMSNorm gammas — single handles.
  @t_seq_rn1_gamma = TinyNN.tnn_null_ptr
  @t_seq_rn2_gamma = TinyNN.tnn_null_ptr
  # Per-head attention weights and biases — one-element arrays holding a null
  # ptr; the alloc_* methods replace the whole array rather than filling it.
  @t_seq_w_q = [TinyNN.tnn_null_ptr]
  @t_seq_w_k = [TinyNN.tnn_null_ptr]
  @t_seq_w_v = [TinyNN.tnn_null_ptr]
  @t_seq_b_q = [TinyNN.tnn_null_ptr]
  @t_seq_b_k = [TinyNN.tnn_null_ptr]
  @t_seq_b_v = [TinyNN.tnn_null_ptr]
  # Output projection and FFN weights — single handles.
  @t_seq_w_o    = TinyNN.tnn_null_ptr
  @t_seq_w_gate = TinyNN.tnn_null_ptr
  @t_seq_w_up   = TinyNN.tnn_null_ptr
  @t_seq_w_down = TinyNN.tnn_null_ptr
  # Per-Q-head LoRA A/B matrices plus their *_m / *_v companions (in the code
  # visible here, only the LoRA branches of alloc_q8_typed_from_gguf! fill them).
  @t_seq_w_lora_a_q   = [TinyNN.tnn_null_ptr]
  @t_seq_w_lora_b_q   = [TinyNN.tnn_null_ptr]
  @t_seq_w_lora_a_q_m = [TinyNN.tnn_null_ptr]
  @t_seq_w_lora_a_q_v = [TinyNN.tnn_null_ptr]
  @t_seq_w_lora_b_q_m = [TinyNN.tnn_null_ptr]
  @t_seq_w_lora_b_q_v = [TinyNN.tnn_null_ptr]
  # Fine-tune bookkeeping arrays. Seed-then-pop leaves each array EMPTY.
  # NOTE(review): presumably this avoids a bare `[]` for the same Spinel
  # typing reason build_forward notes for its local arrays — confirm before
  # simplifying.
  @ft_weights = [TinyNN.tnn_null_ptr]; @ft_weights.pop
  @ft_m       = [TinyNN.tnn_null_ptr]; @ft_m.pop
  @ft_v       = [TinyNN.tnn_null_ptr]; @ft_v.pop
  # Activation taps — reassigned on every build_forward call.
  @tap_attn_norm  = TinyNN.tnn_null_ptr
  @tap_ffn_out    = TinyNN.tnn_null_ptr
  @tap_resid_post = TinyNN.tnn_null_ptr
end

Instance Attribute Details

#ft_m ⇒ Object

Returns the value of attribute ft_m.



92
93
94
# File 'lib/toy/llm/blocks/transformer_block.rb', line 92

# Returns the @ft_m array itself (no copy). initialize leaves it empty.
def ft_m
  @ft_m
end

#ft_v ⇒ Object

Returns the value of attribute ft_v.



92
93
94
# File 'lib/toy/llm/blocks/transformer_block.rb', line 92

# Returns the @ft_v array itself (no copy). initialize leaves it empty.
def ft_v
  @ft_v
end

#ft_weights ⇒ Object

Returns the value of attribute ft_weights.



92
93
94
# File 'lib/toy/llm/blocks/transformer_block.rb', line 92

# Returns the @ft_weights array itself (no copy). initialize leaves it empty;
# the alloc_*_f32_weights! methods walk it to call TinyNN.tnn_set_param.
def ft_weights
  @ft_weights
end

#t_seq_b_k ⇒ Object

Returns the value of attribute t_seq_b_k.



92
93
94
# File 'lib/toy/llm/blocks/transformer_block.rb', line 92

# Returns the K-bias handle array; build_forward indexes it per KV head when
# ctx.seq_has_qkv_bias is set.
def t_seq_b_k
  @t_seq_b_k
end

#t_seq_b_q ⇒ Object

Returns the value of attribute t_seq_b_q.



92
93
94
# File 'lib/toy/llm/blocks/transformer_block.rb', line 92

# Returns the Q-bias handle array; the qkv_bias branches of the alloc methods
# fill it with seq_n_heads entries.
def t_seq_b_q
  @t_seq_b_q
end

#t_seq_b_v ⇒ Object

Returns the value of attribute t_seq_b_v.



92
93
94
# File 'lib/toy/llm/blocks/transformer_block.rb', line 92

# Returns the V-bias handle array; build_forward indexes it per KV head when
# ctx.seq_has_qkv_bias is set.
def t_seq_b_v
  @t_seq_b_v
end

#t_seq_rn1_gamma ⇒ Object

Returns the value of attribute t_seq_rn1_gamma.



92
93
94
# File 'lib/toy/llm/blocks/transformer_block.rb', line 92

# Returns the gamma handle build_forward passes to the first RMSNorm.build
# (the one applied to the block input).
def t_seq_rn1_gamma
  @t_seq_rn1_gamma
end

#t_seq_rn2_gamma ⇒ Object

Returns the value of attribute t_seq_rn2_gamma.



92
93
94
# File 'lib/toy/llm/blocks/transformer_block.rb', line 92

# Returns the gamma handle build_forward passes to the second RMSNorm.build
# (the one applied after the attention residual add).
def t_seq_rn2_gamma
  @t_seq_rn2_gamma
end

#t_seq_w_down ⇒ Object

Returns the value of attribute t_seq_w_down.



92
93
94
# File 'lib/toy/llm/blocks/transformer_block.rb', line 92

# Returns the FFN down-projection weight handle (matmul'd against the SwiGLU
# output in build_forward).
def t_seq_w_down
  @t_seq_w_down
end

#t_seq_w_gate ⇒ Object

Returns the value of attribute t_seq_w_gate.



92
93
94
# File 'lib/toy/llm/blocks/transformer_block.rb', line 92

# Returns the FFN gate weight handle (first SwiGLU.gate operand in
# build_forward).
def t_seq_w_gate
  @t_seq_w_gate
end

#t_seq_w_k ⇒ Object

Returns the value of attribute t_seq_w_k.



92
93
94
# File 'lib/toy/llm/blocks/transformer_block.rb', line 92

# Returns the K-projection handle array; build_forward indexes it once per KV
# head.
def t_seq_w_k
  @t_seq_w_k
end

#t_seq_w_lora_a_q ⇒ Object

Returns the value of attribute t_seq_w_lora_a_q.



92
93
94
# File 'lib/toy/llm/blocks/transformer_block.rb', line 92

# Returns the per-Q-head LoRA "A" handle array (filled by
# alloc_q8_typed_from_gguf! when seq_lora_q_enabled).
def t_seq_w_lora_a_q
  @t_seq_w_lora_a_q
end

#t_seq_w_lora_a_q_m ⇒ Object

Returns the value of attribute t_seq_w_lora_a_q_m.



92
93
94
# File 'lib/toy/llm/blocks/transformer_block.rb', line 92

# Returns the per-Q-head "m" companion array for LoRA A (filled by
# alloc_q8_typed_from_gguf! only when seq_lora_q_adamw_enabled).
def t_seq_w_lora_a_q_m
  @t_seq_w_lora_a_q_m
end

#t_seq_w_lora_a_q_v ⇒ Object

Returns the value of attribute t_seq_w_lora_a_q_v.



92
93
94
# File 'lib/toy/llm/blocks/transformer_block.rb', line 92

# Returns the per-Q-head "v" companion array for LoRA A (filled by
# alloc_q8_typed_from_gguf! only when seq_lora_q_adamw_enabled).
def t_seq_w_lora_a_q_v
  @t_seq_w_lora_a_q_v
end

#t_seq_w_lora_b_q ⇒ Object

Returns the value of attribute t_seq_w_lora_b_q.



92
93
94
# File 'lib/toy/llm/blocks/transformer_block.rb', line 92

# Returns the per-Q-head LoRA "B" handle array (filled by
# alloc_q8_typed_from_gguf! when seq_lora_q_enabled).
def t_seq_w_lora_b_q
  @t_seq_w_lora_b_q
end

#t_seq_w_lora_b_q_m ⇒ Object

Returns the value of attribute t_seq_w_lora_b_q_m.



92
93
94
# File 'lib/toy/llm/blocks/transformer_block.rb', line 92

# Returns the per-Q-head "m" companion array for LoRA B (filled by
# alloc_q8_typed_from_gguf! only when seq_lora_q_adamw_enabled).
def t_seq_w_lora_b_q_m
  @t_seq_w_lora_b_q_m
end

#t_seq_w_lora_b_q_v ⇒ Object

Returns the value of attribute t_seq_w_lora_b_q_v.



92
93
94
# File 'lib/toy/llm/blocks/transformer_block.rb', line 92

# Returns the per-Q-head "v" companion array for LoRA B (filled by
# alloc_q8_typed_from_gguf! only when seq_lora_q_adamw_enabled).
def t_seq_w_lora_b_q_v
  @t_seq_w_lora_b_q_v
end

#t_seq_w_o ⇒ Object

Returns the value of attribute t_seq_w_o.



92
93
94
# File 'lib/toy/llm/blocks/transformer_block.rb', line 92

# Returns the attention output-projection weight handle (matmul'd against the
# concatenated head outputs in build_forward).
def t_seq_w_o
  @t_seq_w_o
end

#t_seq_w_q ⇒ Object

Returns the value of attribute t_seq_w_q.



92
93
94
# File 'lib/toy/llm/blocks/transformer_block.rb', line 92

# Returns the Q-projection handle array; the alloc methods fill it with
# seq_n_heads entries.
def t_seq_w_q
  @t_seq_w_q
end

#t_seq_w_up ⇒ Object

Returns the value of attribute t_seq_w_up.



92
93
94
# File 'lib/toy/llm/blocks/transformer_block.rb', line 92

# Returns the FFN up-projection weight handle (second SwiGLU.gate operand in
# build_forward).
def t_seq_w_up
  @t_seq_w_up
end

#t_seq_w_v ⇒ Object

Returns the value of attribute t_seq_w_v.



92
93
94
# File 'lib/toy/llm/blocks/transformer_block.rb', line 92

# Returns the V-projection handle array; build_forward indexes it once per KV
# head.
def t_seq_w_v
  @t_seq_w_v
end

#tap_attn_norm ⇒ Object

Returns the value of attribute tap_attn_norm.



92
93
94
# File 'lib/toy/llm/blocks/transformer_block.rb', line 92

# Returns the tensor build_forward last tapped right after the first RMSNorm
# (a null ptr until build_forward has run).
def tap_attn_norm
  @tap_attn_norm
end

#tap_ffn_out ⇒ Object

Returns the value of attribute tap_ffn_out.



92
93
94
# File 'lib/toy/llm/blocks/transformer_block.rb', line 92

# Returns the tensor build_forward last tapped as the FFN output, before the
# residual add (a null ptr until build_forward has run).
def tap_ffn_out
  @tap_ffn_out
end

#tap_resid_post ⇒ Object

Returns the value of attribute tap_resid_post.



92
93
94
# File 'lib/toy/llm/blocks/transformer_block.rb', line 92

# Returns the tensor build_forward last tapped as the block's final residual
# output (a null ptr until build_forward has run).
def tap_resid_post
  @tap_resid_post
end

Instance Method Details

#alloc_full_finetune_f32_weights!(sess, cache, prefix, seq_d_model, seq_d_ff, seq_d_head, seq_n_heads, seq_n_kv, qkv_bias) ⇒ Object

P2-finish — full fine-tune per-block alloc. Lifted VERBATIM from Toy::LLM::Engine::LlamaSeqEngine#realize_for_full_finetune’s per-block loop (op order unchanged → bit-identical graph, gated by prep/full_finetune_gate.rb). The block OWNS the alloc + assignment of its self.t_seq_* handles, exactly as alloc_trainable_f32_weights! does. NO ivar reads off the cache — sess, the seq dims and qkv_bias arrive as ARGS; cache.ft_add_* / cache.ft_name_last are back-called (the :str naming stays on the cache realize runtime path — step_bind / :str landmine).

TWO deliberate divergences from alloc_trainable_f32_weights! (why this is a SEPARATE method, NOT a reuse):

- w_o is HARD-SQUARE ne=[d_model, d_model] (full_finetune loads a real
  GGUF whose attn_output.weight is square) — NOT random_init's divergent
  [d_model, n_heads*d_head].
- qkv biases ARE allocated when qkv_bias (alloc_trainable has none).


346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
# File 'lib/toy/llm/blocks/transformer_block.rb', line 346

# Allocates this block's persistent F32 tensors for the full fine-tune path,
# records each one through the cache's ft_add_* / ft_name_last back-calls, and
# finally marks every entry of ft_weights as a graph param.
#
# Statement order is intentionally fixed (allocation order determines the
# graph); do not reorder when editing.
#
# @param sess session handle passed to every TinyNN allocation
# @param cache object providing ft_add_1d, ft_add_2d and ft_name_last
# @param prefix [String] prepended verbatim to each tensor name; no separator
#   is inserted here, so the caller presumably supplies the trailing "."
# @param seq_d_model size used for the norm gammas and the model-side dims
# @param seq_d_ff size used for the FFN gate/up/down dims
# @param seq_d_head per-head size used for q/k/v weights and biases
# @param seq_n_heads number of Q heads (w_q / b_q entries)
# @param seq_n_kv number of KV heads (w_k, w_v / b_k, b_v entries)
# @param qkv_bias truthy to also allocate and record per-head q/k/v biases
# @return not meaningful (the method ends on a while loop)
def alloc_full_finetune_f32_weights!(sess, cache, prefix,
                                     seq_d_model, seq_d_ff, seq_d_head,
                                     seq_n_heads, seq_n_kv, qkv_bias)
  # Both norm gammas are allocated first, then recorded + named in order.
  self.t_seq_rn1_gamma = TinyNN.tnn_input_1d_f32_persistent(sess, seq_d_model)
  self.t_seq_rn2_gamma = TinyNN.tnn_input_1d_f32_persistent(sess, seq_d_model)
  cache.ft_add_1d(self, self.t_seq_rn1_gamma)
  cache.ft_name_last(self, prefix + "attn_norm.weight")
  cache.ft_add_1d(self, self.t_seq_rn2_gamma)
  cache.ft_name_last(self, prefix + "ffn_norm.weight")

  # Q weights: head 0 seeds the array, heads 1..n_heads-1 are pushed.
  self.t_seq_w_q = [TinyNN.tnn_input_2d_f32_persistent(sess, seq_d_head, seq_d_model)]
  hq = 1
  while hq < seq_n_heads
    self.t_seq_w_q.push(TinyNN.tnn_input_2d_f32_persistent(sess, seq_d_head, seq_d_model))
    hq = hq + 1
  end
  # Second pass: record + name every Q head only after all are allocated.
  hq2 = 0
  while hq2 < seq_n_heads
    cache.ft_add_2d(self, self.t_seq_w_q[hq2], seq_d_head, seq_d_model)
    cache.ft_name_last(self, prefix + "attn_q.head_" + hq2.to_s + ".weight")
    hq2 = hq2 + 1
  end

  # K and V weights are allocated interleaved (k then v) per KV head.
  self.t_seq_w_k = [TinyNN.tnn_input_2d_f32_persistent(sess, seq_d_head, seq_d_model)]
  self.t_seq_w_v = [TinyNN.tnn_input_2d_f32_persistent(sess, seq_d_head, seq_d_model)]
  hkv = 1
  while hkv < seq_n_kv
    self.t_seq_w_k.push(TinyNN.tnn_input_2d_f32_persistent(sess, seq_d_head, seq_d_model))
    self.t_seq_w_v.push(TinyNN.tnn_input_2d_f32_persistent(sess, seq_d_head, seq_d_model))
    hkv = hkv + 1
  end
  hkv2 = 0
  while hkv2 < seq_n_kv
    cache.ft_add_2d(self, self.t_seq_w_k[hkv2], seq_d_head, seq_d_model)
    cache.ft_name_last(self, prefix + "attn_k.head_" + hkv2.to_s + ".weight")
    cache.ft_add_2d(self, self.t_seq_w_v[hkv2], seq_d_head, seq_d_model)
    cache.ft_name_last(self, prefix + "attn_v.head_" + hkv2.to_s + ".weight")
    hkv2 = hkv2 + 1
  end

  # Optional biases: all bias tensors (q, then k/v) are allocated before any
  # of them is recorded.
  if qkv_bias
    self.t_seq_b_q = [TinyNN.tnn_input_1d_f32_persistent(sess, seq_d_head)]
    hbq = 1
    while hbq < seq_n_heads
      self.t_seq_b_q.push(TinyNN.tnn_input_1d_f32_persistent(sess, seq_d_head))
      hbq = hbq + 1
    end
    self.t_seq_b_k = [TinyNN.tnn_input_1d_f32_persistent(sess, seq_d_head)]
    self.t_seq_b_v = [TinyNN.tnn_input_1d_f32_persistent(sess, seq_d_head)]
    hbkv = 1
    while hbkv < seq_n_kv
      self.t_seq_b_k.push(TinyNN.tnn_input_1d_f32_persistent(sess, seq_d_head))
      self.t_seq_b_v.push(TinyNN.tnn_input_1d_f32_persistent(sess, seq_d_head))
      hbkv = hbkv + 1
    end
    hbq2 = 0
    while hbq2 < seq_n_heads
      cache.ft_add_1d(self, self.t_seq_b_q[hbq2])
      cache.ft_name_last(self, prefix + "attn_q.head_" + hbq2.to_s + ".bias")
      hbq2 = hbq2 + 1
    end
    hbkv2 = 0
    while hbkv2 < seq_n_kv
      cache.ft_add_1d(self, self.t_seq_b_k[hbkv2])
      cache.ft_name_last(self, prefix + "attn_k.head_" + hbkv2.to_s + ".bias")
      cache.ft_add_1d(self, self.t_seq_b_v[hbkv2])
      cache.ft_name_last(self, prefix + "attn_v.head_" + hbkv2.to_s + ".bias")
      hbkv2 = hbkv2 + 1
    end
  end

  # w_o is allocated with both dims = seq_d_model on this path (it differs
  # from alloc_trainable_f32_weights!, which uses seq_n_heads * seq_d_head).
  self.t_seq_w_o    = TinyNN.tnn_input_2d_f32_persistent(sess, seq_d_model, seq_d_model)
  self.t_seq_w_gate = TinyNN.tnn_input_2d_f32_persistent(sess, seq_d_ff,    seq_d_model)
  self.t_seq_w_up   = TinyNN.tnn_input_2d_f32_persistent(sess, seq_d_ff,    seq_d_model)
  self.t_seq_w_down = TinyNN.tnn_input_2d_f32_persistent(sess, seq_d_model, seq_d_ff)
  cache.ft_add_2d(self, self.t_seq_w_o,    seq_d_model, seq_d_model)
  cache.ft_name_last(self, prefix + "attn_output.weight")
  cache.ft_add_2d(self, self.t_seq_w_gate, seq_d_ff,    seq_d_model)
  cache.ft_name_last(self, prefix + "ffn_gate.weight")
  cache.ft_add_2d(self, self.t_seq_w_up,   seq_d_ff,    seq_d_model)
  cache.ft_name_last(self, prefix + "ffn_up.weight")
  cache.ft_add_2d(self, self.t_seq_w_down, seq_d_model, seq_d_ff)
  cache.ft_name_last(self, prefix + "ffn_down.weight")

  # Mark everything currently held in ft_weights as a graph param.
  wi = 0
  while wi < self.ft_weights.length
    TinyNN.tnn_set_param(self.ft_weights[wi])
    wi = wi + 1
  end
end

#alloc_q8_typed_from_gguf!(sess, gguf_handle, li, seq_n_heads, seq_n_kv, seq_d_head, seq_d_model, seq_d_ff, seq_vocab_size, seq_lora_q_enabled, seq_lora_q_rank, seq_lora_q_adamw_enabled, qkv_bias) ⇒ Object

P2.7 pass-3 — allocate this block’s PERSISTENT weight handles for the realize_for_q8_copy path (Q8-stays-Q8 verbatim copy on CUDA). Moved VERBATIM from the per-block ALLOC-typed loop body in Toy::LLM::Engine::LlamaSeqEngine#realize_for_q8_copy (op order unchanged → bit-identical graph): the block now OWNS the alloc + assignment of its self.t_seq_* handles, exactly as in load_from_gguf_mmap! / alloc_trainable_f32_weights!. NO ivar reads off the cache — every value (sess, the seq dims, the lora flags, qkv_bias, the gguf handle, and the layer index `li`) arrives as an ARG. Mirrors load_from_gguf_mmap!’s arg-passing exactly.

CRITICAL constraints:

- Allocates ctx_w tensors of the on-disk gguf type via
  tnn_input_2d_persistent_typed (verbatim copy requires source/target
  types match). rn1/rn2 gammas + qkv_bias + LoRA/Adam are F32.
- This path never names LoRA tensors (the q8 loop body issues NO
  tnn_tensor_set_name), so the moved body is :str-free and
  Spinel-#16-clean — no cache.lora_name_q! back-calls here.
- w_o is allocated hard-square ne=[d_model, d_model] — VERBATIM from
  the former cache line 318. Do NOT unify with the divergent shape;
  the gguf round-trip PINS n_heads*d_head == d_model.
- seq_vocab_size is accepted (positional parity with the cache's
  intent / the mmap precedent) but UNUSED here — the block allocates
  no global tensors.
- The set_param marking loop, finalize, verbatim-copy phase, Adam
  zero-init and build_and_realize! STAY on the cache realize method.


650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
# File 'lib/toy/llm/blocks/transformer_block.rb', line 650

# Allocates this block's persistent tensors for the q8-copy path: the large
# weights take the tensor type reported by the GGUF for the matching entry,
# while gammas, biases and LoRA tensors are F32. Nothing is copied or named
# here — only allocated and assigned to self.t_seq_*.
#
# Statement order is intentionally fixed (allocation order determines the
# graph); do not reorder when editing.
#
# @param sess session handle passed to every TinyNN allocation
# @param gguf_handle handle used for tnn_gguf_find_index / tnn_gguf_tensor_type
# @param li layer index; tensor names are looked up under "blk.<li>"
# @param seq_n_heads number of Q heads
# @param seq_n_kv number of KV heads
# @param seq_d_head per-head size
# @param seq_d_model model size
# @param seq_d_ff FFN size
# @param seq_vocab_size accepted but never read in this method
# @param seq_lora_q_enabled truthy to allocate per-Q-head LoRA A/B tensors
# @param seq_lora_q_rank LoRA rank used for the A/B dims
# @param seq_lora_q_adamw_enabled truthy (and LoRA enabled) to also allocate
#   the *_m / *_v companion tensors
# @param qkv_bias truthy to allocate per-head q/k/v biases
# @return not meaningful
def alloc_q8_typed_from_gguf!(sess, gguf_handle, li,
                              seq_n_heads, seq_n_kv, seq_d_head, seq_d_model,
                              seq_d_ff, seq_vocab_size, seq_lora_q_enabled,
                              seq_lora_q_rank, seq_lora_q_adamw_enabled, qkv_bias)
  # No trailing dot here: every lookup below adds its own leading ".".
  prefix = "blk." + li.to_s

  self.t_seq_rn1_gamma = TinyNN.tnn_input_1d_f32_persistent(sess, seq_d_model)
  self.t_seq_rn2_gamma = TinyNN.tnn_input_1d_f32_persistent(sess, seq_d_model)

  # One GGUF attn_q entry supplies the type for every per-head Q tensor.
  # NOTE(review): the find_index result is used unchecked — confirm what
  # tnn_gguf_find_index returns for a missing name.
  q_idx  = TinyNN.tnn_gguf_find_index(gguf_handle, prefix + ".attn_q.weight")
  q_type = TinyNN.tnn_gguf_tensor_type(gguf_handle, q_idx)
  self.t_seq_w_q = [TinyNN.tnn_input_2d_persistent_typed(sess, seq_d_head, seq_d_model, q_type)]
  hq = 1
  while hq < seq_n_heads
    self.t_seq_w_q.push(TinyNN.tnn_input_2d_persistent_typed(sess, seq_d_head, seq_d_model, q_type))
    hq = hq + 1
  end

  # K and V may have different on-disk types, so each keeps its own type.
  k_idx = TinyNN.tnn_gguf_find_index(gguf_handle, prefix + ".attn_k.weight")
  v_idx = TinyNN.tnn_gguf_find_index(gguf_handle, prefix + ".attn_v.weight")
  k_type = TinyNN.tnn_gguf_tensor_type(gguf_handle, k_idx)
  v_type = TinyNN.tnn_gguf_tensor_type(gguf_handle, v_idx)
  self.t_seq_w_k = [TinyNN.tnn_input_2d_persistent_typed(sess, seq_d_head, seq_d_model, k_type)]
  self.t_seq_w_v = [TinyNN.tnn_input_2d_persistent_typed(sess, seq_d_head, seq_d_model, v_type)]
  hkv = 1
  while hkv < seq_n_kv
    self.t_seq_w_k.push(TinyNN.tnn_input_2d_persistent_typed(sess, seq_d_head, seq_d_model, k_type))
    self.t_seq_w_v.push(TinyNN.tnn_input_2d_persistent_typed(sess, seq_d_head, seq_d_model, v_type))
    hkv = hkv + 1
  end

  # Biases are always F32, one per head, regardless of the weight type.
  if qkv_bias
    self.t_seq_b_q = [TinyNN.tnn_input_1d_f32_persistent(sess, seq_d_head)]
    hbq = 1
    while hbq < seq_n_heads
      self.t_seq_b_q.push(TinyNN.tnn_input_1d_f32_persistent(sess, seq_d_head))
      hbq = hbq + 1
    end
    self.t_seq_b_k = [TinyNN.tnn_input_1d_f32_persistent(sess, seq_d_head)]
    self.t_seq_b_v = [TinyNN.tnn_input_1d_f32_persistent(sess, seq_d_head)]
    hbkv = 1
    while hbkv < seq_n_kv
      self.t_seq_b_k.push(TinyNN.tnn_input_1d_f32_persistent(sess, seq_d_head))
      self.t_seq_b_v.push(TinyNN.tnn_input_1d_f32_persistent(sess, seq_d_head))
      hbkv = hbkv + 1
    end
  end

  # All four indices are resolved first; each tensor then takes the type of
  # its own GGUF entry. w_o uses seq_d_model for both dims on this path.
  o_idx    = TinyNN.tnn_gguf_find_index(gguf_handle, prefix + ".attn_output.weight")
  gate_idx = TinyNN.tnn_gguf_find_index(gguf_handle, prefix + ".ffn_gate.weight")
  up_idx   = TinyNN.tnn_gguf_find_index(gguf_handle, prefix + ".ffn_up.weight")
  down_idx = TinyNN.tnn_gguf_find_index(gguf_handle, prefix + ".ffn_down.weight")
  self.t_seq_w_o    = TinyNN.tnn_input_2d_persistent_typed(sess, seq_d_model, seq_d_model,
                       TinyNN.tnn_gguf_tensor_type(gguf_handle, o_idx))
  self.t_seq_w_gate = TinyNN.tnn_input_2d_persistent_typed(sess, seq_d_ff, seq_d_model,
                       TinyNN.tnn_gguf_tensor_type(gguf_handle, gate_idx))
  self.t_seq_w_up   = TinyNN.tnn_input_2d_persistent_typed(sess, seq_d_ff, seq_d_model,
                       TinyNN.tnn_gguf_tensor_type(gguf_handle, up_idx))
  self.t_seq_w_down = TinyNN.tnn_input_2d_persistent_typed(sess, seq_d_model, seq_d_ff,
                       TinyNN.tnn_gguf_tensor_type(gguf_handle, down_idx))

  # LoRA + Adam allocations (same as realize_for_mmap path).
  # A is (rank, d_model) and B is (d_head, rank) in argument order, one pair
  # per Q head; the m/v companions mirror those dims.
  if seq_lora_q_enabled
    self.t_seq_w_lora_a_q = [TinyNN.tnn_input_2d_f32_persistent(sess,
                              seq_lora_q_rank, seq_d_model)]
    self.t_seq_w_lora_b_q = [TinyNN.tnn_input_2d_f32_persistent(sess,
                              seq_d_head, seq_lora_q_rank)]
    hql = 1
    while hql < seq_n_heads
      self.t_seq_w_lora_a_q.push(TinyNN.tnn_input_2d_f32_persistent(sess,
                                  seq_lora_q_rank, seq_d_model))
      self.t_seq_w_lora_b_q.push(TinyNN.tnn_input_2d_f32_persistent(sess,
                                  seq_d_head, seq_lora_q_rank))
      hql = hql + 1
    end
    if seq_lora_q_adamw_enabled
      self.t_seq_w_lora_a_q_m = [TinyNN.tnn_input_2d_f32_persistent(sess,
                                  seq_lora_q_rank, seq_d_model)]
      self.t_seq_w_lora_a_q_v = [TinyNN.tnn_input_2d_f32_persistent(sess,
                                  seq_lora_q_rank, seq_d_model)]
      self.t_seq_w_lora_b_q_m = [TinyNN.tnn_input_2d_f32_persistent(sess,
                                  seq_d_head, seq_lora_q_rank)]
      self.t_seq_w_lora_b_q_v = [TinyNN.tnn_input_2d_f32_persistent(sess,
                                  seq_d_head, seq_lora_q_rank)]
      hqm = 1
      while hqm < seq_n_heads
        self.t_seq_w_lora_a_q_m.push(TinyNN.tnn_input_2d_f32_persistent(sess,
                                      seq_lora_q_rank, seq_d_model))
        self.t_seq_w_lora_a_q_v.push(TinyNN.tnn_input_2d_f32_persistent(sess,
                                      seq_lora_q_rank, seq_d_model))
        self.t_seq_w_lora_b_q_m.push(TinyNN.tnn_input_2d_f32_persistent(sess,
                                      seq_d_head, seq_lora_q_rank))
        self.t_seq_w_lora_b_q_v.push(TinyNN.tnn_input_2d_f32_persistent(sess,
                                      seq_d_head, seq_lora_q_rank))
        hqm = hqm + 1
      end
    end
  end
end

#alloc_trainable_f32_weights!(sess, cache, prefix, seq_d_model, seq_d_ff, seq_d_head, seq_n_heads, seq_n_kv) ⇒ Object

P2.6 Step 4 — allocate this block’s trainable persistent-F32 weight tensors for the random_init realize path. Moved VERBATIM from the per-block ALLOC loop body in Toy::LLM::Engine::LlamaSeqEngine#realize_for_random_init (op order unchanged → bit-identical graph): the block now OWNS the alloc + assignment of its self.t_seq_* handles, exactly as it already owns them at forward time. NO ivar reads off the cache — every value (sess, the seq dims, the name prefix) arrives as an ARG.

The ft_add_1d / ft_add_2d / ft_name_last RECORDING primitives STAY on the cache and are called BACK through the passed `cache` reference: they read the cache’s @sess to allocate the Adam m/v moments and (ft_name_last) issue tnn_tensor_set_name with a :str name at RUNTIME. That :str call MUST remain on the cache’s realize runtime path — never migrated into block class-load scope (step_bind :str landmine 2026-05-28). They push to THIS block’s ft_weights/ft_m/ft_v arrays (passed in as `self`/`blk`).

CRITICAL: w_o is allocated ne=[d_model, n_heads*d_head] — VERBATIM from random_init (NOT [d_model, d_model]; the two differ under latent GQA where n_heads*d_head != d_model, a divergence the smoke gate cannot catch). Do NOT unify with realize_for_full_finetune’s w_o alloc. random_init allocates NO qkv biases (the qkv_bias arg is honoured only by the uploader / Adam-zero paths), so there is no bias branch here.

Closes with the per-block set_param loop (former L1082-1086) so the freshly-recorded ft_weights become graph params, same scope as the alloc.



271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
# File 'lib/toy/llm/blocks/transformer_block.rb', line 271

# Allocates this block's persistent F32 tensors for the random-init path,
# records each one through the cache's ft_add_* / ft_name_last back-calls, and
# finally marks every entry of ft_weights as a graph param. No q/k/v biases
# are allocated on this path.
#
# Statement order is intentionally fixed (allocation order determines the
# graph); do not reorder when editing.
#
# @param sess session handle passed to every TinyNN allocation
# @param cache object providing ft_add_1d, ft_add_2d and ft_name_last
# @param prefix [String] prepended verbatim to each tensor name; no separator
#   is inserted here, so the caller presumably supplies the trailing "."
# @param seq_d_model size used for the norm gammas and the model-side dims
# @param seq_d_ff size used for the FFN gate/up/down dims
# @param seq_d_head per-head size used for q/k/v weights
# @param seq_n_heads number of Q heads (w_q entries)
# @param seq_n_kv number of KV heads (w_k / w_v entries)
# @return not meaningful (the method ends on a while loop)
def alloc_trainable_f32_weights!(sess, cache, prefix,
                                 seq_d_model, seq_d_ff, seq_d_head,
                                 seq_n_heads, seq_n_kv)
  # Both norm gammas are allocated first, then recorded + named in order.
  self.t_seq_rn1_gamma = TinyNN.tnn_input_1d_f32_persistent(sess, seq_d_model)
  self.t_seq_rn2_gamma = TinyNN.tnn_input_1d_f32_persistent(sess, seq_d_model)
  cache.ft_add_1d(self, self.t_seq_rn1_gamma)
  cache.ft_name_last(self, prefix + "attn_norm.weight")
  cache.ft_add_1d(self, self.t_seq_rn2_gamma)
  cache.ft_name_last(self, prefix + "ffn_norm.weight")

  # Q weights: head 0 seeds the array, heads 1..n_heads-1 are pushed.
  self.t_seq_w_q = [TinyNN.tnn_input_2d_f32_persistent(sess, seq_d_head, seq_d_model)]
  hq = 1
  while hq < seq_n_heads
    self.t_seq_w_q.push(TinyNN.tnn_input_2d_f32_persistent(sess, seq_d_head, seq_d_model))
    hq = hq + 1
  end
  # Second pass: record + name every Q head only after all are allocated.
  hq2 = 0
  while hq2 < seq_n_heads
    cache.ft_add_2d(self, self.t_seq_w_q[hq2], seq_d_head, seq_d_model)
    cache.ft_name_last(self, prefix + "attn_q.head_" + hq2.to_s + ".weight")
    hq2 = hq2 + 1
  end

  # K and V weights are allocated interleaved (k then v) per KV head.
  self.t_seq_w_k = [TinyNN.tnn_input_2d_f32_persistent(sess, seq_d_head, seq_d_model)]
  self.t_seq_w_v = [TinyNN.tnn_input_2d_f32_persistent(sess, seq_d_head, seq_d_model)]
  hkv = 1
  while hkv < seq_n_kv
    self.t_seq_w_k.push(TinyNN.tnn_input_2d_f32_persistent(sess, seq_d_head, seq_d_model))
    self.t_seq_w_v.push(TinyNN.tnn_input_2d_f32_persistent(sess, seq_d_head, seq_d_model))
    hkv = hkv + 1
  end
  hkv2 = 0
  while hkv2 < seq_n_kv
    cache.ft_add_2d(self, self.t_seq_w_k[hkv2], seq_d_head, seq_d_model)
    cache.ft_name_last(self, prefix + "attn_k.head_" + hkv2.to_s + ".weight")
    cache.ft_add_2d(self, self.t_seq_w_v[hkv2], seq_d_head, seq_d_model)
    cache.ft_name_last(self, prefix + "attn_v.head_" + hkv2.to_s + ".weight")
    hkv2 = hkv2 + 1
  end

  # w_o's second dim is seq_n_heads * seq_d_head here, NOT seq_d_model; the
  # same product is passed to ft_add_2d so the recorded dims match.
  self.t_seq_w_o    = TinyNN.tnn_input_2d_f32_persistent(sess, seq_d_model, seq_n_heads * seq_d_head)
  self.t_seq_w_gate = TinyNN.tnn_input_2d_f32_persistent(sess, seq_d_ff,    seq_d_model)
  self.t_seq_w_up   = TinyNN.tnn_input_2d_f32_persistent(sess, seq_d_ff,    seq_d_model)
  self.t_seq_w_down = TinyNN.tnn_input_2d_f32_persistent(sess, seq_d_model, seq_d_ff)
  cache.ft_add_2d(self, self.t_seq_w_o,    seq_d_model, seq_n_heads * seq_d_head)
  cache.ft_name_last(self, prefix + "attn_output.weight")
  cache.ft_add_2d(self, self.t_seq_w_gate, seq_d_ff,    seq_d_model)
  cache.ft_name_last(self, prefix + "ffn_gate.weight")
  cache.ft_add_2d(self, self.t_seq_w_up,   seq_d_ff,    seq_d_model)
  cache.ft_name_last(self, prefix + "ffn_up.weight")
  cache.ft_add_2d(self, self.t_seq_w_down, seq_d_model, seq_d_ff)
  cache.ft_name_last(self, prefix + "ffn_down.weight")

  # Mark everything currently held in ft_weights as a graph param.
  wi = 0
  while wi < self.ft_weights.length
    TinyNN.tnn_set_param(self.ft_weights[wi])
    wi = wi + 1
  end
end

#build_forward(sess, t_x, ctx) ⇒ Object

One transformer block. SEQ-MODE forward: no `state` input, no `state_out` return (KV decode is the separate toy_smollm2_ffi_kv.rb path). The per-forward context (scale, eps, dims, positions, rope cfg, mask, …) arrives in `ctx`; the block owns its weight handles as self.t_seq_*. Single tensor return (t_resid).

h1   = RMSNorm(x)
per KV head kv_h:
  k_pre = w_k[kv_h] @ h1  (+ b_k[kv_h])         ne=[d_head, T]
  k     = RoPE(k_pre, positions)
  v     = w_v[kv_h] @ h1  (+ b_v[kv_h])         ne=[d_head, T]
  v_t   = transpose(v)                          ne=[T, d_head]
per Q head q_h (kv_h = q_h / group_size):
  q_pre = w_q[q_h] @ h1  (+ b_q[q_h])           ne=[d_head, T]
  q     = RoPE(q_pre, positions)
  scores = k[kv_h] @ q                          ne=[T_keys, T_queries]
  scaled = scores / sqrt(d_head)
  masked = diag_mask_inf(scaled, 0)              causal triangle
  attn   = softmax(masked)                       ne=[T, T]
  head_h = v_t[kv_h] @ attn                     ne=[d_head, T]
concat heads along ne0 → ne=[d_model, T]
x_attn = x + (w_o @ concat)
h2     = RMSNorm(x_attn)
ff     = w_down @ (silu(w_gate @ h2) * (w_up @ h2))
x_out  = x_attn + ff


159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
# File 'lib/toy/llm/blocks/transformer_block.rb', line 159

# Builds the graph for one transformer block in seq mode and returns the
# post-block residual tensor.
#
# Side effects: assigns tap_attn_norm, tap_ffn_out and tap_resid_post and
# marks each of those tensors with TinyNN.tnn_set_output.
#
# @param sess session handle forwarded to every primitive / TinyNN op
# @param t_x block input tensor (also the first residual operand)
# @param ctx per-forward context; this method reads seq_eps, seq_n_kv,
#   seq_n_heads, seq_weight_dtype, seq_has_qkv_bias, t_seq_positions,
#   t_seq_rope_freq_factors, seq_rope_cfg, seq_t and seq_b from it
# @return t_resid, the tensor produced by the final tnn_add
def build_forward(sess, t_x, ctx)
  t_h = Toy::LLM::Primitives::RMSNorm.build(sess, t_x, self.t_seq_rn1_gamma, ctx.seq_eps)
  # GH#15 — tap the post-attn-norm activation. set_output keeps it
  # alive across graph computation so the host can download it.
  self.tap_attn_norm = t_h
  TinyNN.tnn_set_output(t_h)

  # K, V over all KV heads. Pre-compute v_t per head so the per-Q-head
  # attention loop can index it (avoids n_heads × transpose).
  # See the mirror for the Spinel landmine (issue #688 partial fix;
  # the function-parameter type for build_qhead was already locked in
  # as IntArray before the local-var ptr-push promotion runs).
  # Re-verified 2026-05-26: bare `[]` still fires the warning.
  t_k_per_kv  = [TinyNN.tnn_null_ptr]; t_k_per_kv.pop
  t_vt_per_kv = [TinyNN.tnn_null_ptr]; t_vt_per_kv.pop
  hkv = 0
  while hkv < ctx.seq_n_kv
    t_k_raw = mp_matmul(sess, ctx.seq_weight_dtype, self.t_seq_w_k[hkv], t_h)
    # Bias is added before RoPE, and only when the context says biases exist.
    if ctx.seq_has_qkv_bias
      t_k_pre = TinyNN.tnn_add(sess, t_k_raw, self.t_seq_b_k[hkv])
    else
      t_k_pre = t_k_raw
    end
    # ggml_rope_ext requires a->ne[2] == positions->ne[0]. Our K is
    # ne=[d_head, T*B] (ne[2]=1); reshape to ne=[d_head, 1, T*B] so
    # ne[2]==T*B, then reshape back after rope. Reshape is metadata-
    # only (no copy) on contiguous tensors. At T=1, B=1 this is a
    # no-op (1 == 1).
    t_k = Toy::LLM::Primitives::RoPE.apply_2d(
            sess, t_k_pre, ctx.t_seq_positions,
            ctx.t_seq_rope_freq_factors, ctx.seq_rope_cfg, ctx.seq_t, ctx.seq_b)
    t_k_per_kv.push(t_k)

    # V gets the optional bias but no RoPE.
    t_v_raw = mp_matmul(sess, ctx.seq_weight_dtype, self.t_seq_w_v[hkv], t_h)
    if ctx.seq_has_qkv_bias
      t_v = TinyNN.tnn_add(sess, t_v_raw, self.t_seq_b_v[hkv])
    else
      t_v = t_v_raw
    end
    # head_out = v_t @ attn. v has ne=[d_head, T]; transpose to
    # ne=[T, d_head] so the second matmul's contraction lines up.
    t_v_t = TinyNN.tnn_transpose(sess, t_v)
    t_vt_per_kv.push(t_v_t)
    hkv = hkv + 1
  end

  # Per-Q-head attention. GQA: each Q head reads from kv_h = q_h / group_size.
  # Head 0 is built unconditionally to seed the array, so this path always
  # builds at least one Q head even if ctx.seq_n_heads were 0.
  t_head_out0 = build_qhead(sess, ctx, t_h, 0, t_k_per_kv, t_vt_per_kv)
  t_head_outs = [t_head_out0]
  hq = 1
  while hq < ctx.seq_n_heads
    t_head_outs.push(build_qhead(sess, ctx, t_h, hq, t_k_per_kv, t_vt_per_kv))
    hq = hq + 1
  end

  # Left-fold the head outputs into one tensor by concatenating along dim 0.
  t_concat = t_head_outs[0]
  hq2 = 1
  while hq2 < ctx.seq_n_heads
    t_concat = TinyNN.tnn_concat(sess, t_concat, t_head_outs[hq2], 0)
    hq2 = hq2 + 1
  end

  # Output projection followed by the first residual add.
  t_out_proj = mp_matmul(sess, ctx.seq_weight_dtype, self.t_seq_w_o, t_concat)
  t_x_attn   = TinyNN.tnn_add(sess, t_x, t_out_proj)

  # SwiGLU FFN.
  t_h2    = Toy::LLM::Primitives::RMSNorm.build(sess, t_x_attn, self.t_seq_rn2_gamma, ctx.seq_eps)
  t_gate  = mp_matmul(sess, ctx.seq_weight_dtype, self.t_seq_w_gate, t_h2)
  t_up    = mp_matmul(sess, ctx.seq_weight_dtype, self.t_seq_w_up,   t_h2)
  t_gated = Toy::LLM::Primitives::SwiGLU.gate(sess, t_gate, t_up)
  t_dn    = mp_matmul(sess, ctx.seq_weight_dtype, self.t_seq_w_down, t_gated)
  # GH#15 — tap the FFN output (pre-residual). set_output to pin.
  self.tap_ffn_out = t_dn
  TinyNN.tnn_set_output(t_dn)

  t_resid = TinyNN.tnn_add(sess, t_x_attn, t_dn)
  # GH#15 — tap the residual-stream value AFTER this block. Stable,
  # matched-across-runs region name: resid_post_block.
  self.tap_resid_post = t_resid
  TinyNN.tnn_set_output(t_resid)
  t_resid
end

#copy_q8_bytes_from_gguf!(sess, gguf_handle, li, seq_n_heads, seq_n_kv, seq_d_head, qkv_bias) ⇒ Object

P2.7 pass-3 Step 2 — fill this block’s PERSISTENT backend buffers from the GGUF for the realize_for_q8_copy path (Q8-stays-Q8 verbatim copy). Moved VERBATIM from the per-block VERBATIM-COPY loop body in Toy::LLM::Engine::LlamaSeqEngine#realize_for_q8_copy (op order unchanged → bit-identical weights): this is the COPY phase that follows the alloc_q8_typed_from_gguf! ALLOC phase. The block READS its own self.t_seq_* handles (allocated by alloc_q8_typed_from_gguf!) and writes NOTHING on itself — the FFI copy primitives fill the backend buffers by handle. NO ivar reads off the cache — every value (sess, the seq dims, qkv_bias, the gguf handle, the layer index `li`) arrives as an ARG, mirroring alloc_q8_typed_from_gguf! exactly.

CRITICAL constraints:

- Per-head slice args are byte-VERBATIM: w_q[hq] takes (hq, n_heads),
  w_k/w_v[hkv] take (hkv, n_kv), the qkv biases take (h, d_head). A
  swapped index produces deterministic-but-WRONG logits that the
  2x-forward byte-identity gate cannot catch — so arg fidelity is the
  load-bearing constraint, not behavior the gate observes.
- All primitives are tnn_gguf_copy_* / tnn_gguf_find_index. The
  find_index :str arg is issued at RUNTIME (same as
  alloc_q8_typed_from_gguf!), never block class-load scope (#16); this
  path names NO LoRA tensors, so there are no cache back-calls.
- The GLOBALS verbatim-copy (token embed / final norm / untied output)
  STAYS on the cache realize method — those touch cache-level handles.


774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
# File 'lib/toy/llm/blocks/transformer_block.rb', line 774

# Copies this block's weights from the GGUF into the persistent tensors that
# alloc_q8_typed_from_gguf! assigned. Only reads self.t_seq_*; assigns
# nothing on the block.
#
# The trailing (index, count-or-size) arguments of each slice call are
# load-bearing: do not swap or "normalize" them.
#
# @param sess session handle passed to every copy primitive
# @param gguf_handle handle used for tnn_gguf_find_index and the copy calls
# @param li layer index; tensor names are looked up under "blk.<li>"
# @param seq_n_heads number of Q heads (also the slice count for attn_q)
# @param seq_n_kv number of KV heads (also the slice count for attn_k/v)
# @param seq_d_head per-head size, passed to the bias slice copies
# @param qkv_bias truthy to also copy the q/k/v biases
# @return not meaningful
def copy_q8_bytes_from_gguf!(sess, gguf_handle, li,
                             seq_n_heads, seq_n_kv, seq_d_head, qkv_bias)
  # No trailing dot here: every lookup below adds its own leading ".".
  prefix = "blk." + li.to_s
  rn1_idx = TinyNN.tnn_gguf_find_index(gguf_handle, prefix + ".attn_norm.weight")
  rn2_idx = TinyNN.tnn_gguf_find_index(gguf_handle, prefix + ".ffn_norm.weight")
  TinyNN.tnn_gguf_copy_1d_to_persistent(gguf_handle, rn1_idx, sess, self.t_seq_rn1_gamma)
  TinyNN.tnn_gguf_copy_1d_to_persistent(gguf_handle, rn2_idx, sess, self.t_seq_rn2_gamma)

  # Q: one fused GGUF tensor is sliced per head — args are (hq, seq_n_heads).
  q_idx = TinyNN.tnn_gguf_find_index(gguf_handle, prefix + ".attn_q.weight")
  hq = 0
  while hq < seq_n_heads
    TinyNN.tnn_gguf_copy_verbatim_head_slice_to_persistent(gguf_handle, q_idx, sess,
      self.t_seq_w_q[hq], hq, seq_n_heads)
    hq = hq + 1
  end
  # K/V: same slicing, but over seq_n_kv — args are (hkv, seq_n_kv).
  k_idx = TinyNN.tnn_gguf_find_index(gguf_handle, prefix + ".attn_k.weight")
  v_idx = TinyNN.tnn_gguf_find_index(gguf_handle, prefix + ".attn_v.weight")
  hkv = 0
  while hkv < seq_n_kv
    TinyNN.tnn_gguf_copy_verbatim_head_slice_to_persistent(gguf_handle, k_idx, sess,
      self.t_seq_w_k[hkv], hkv, seq_n_kv)
    TinyNN.tnn_gguf_copy_verbatim_head_slice_to_persistent(gguf_handle, v_idx, sess,
      self.t_seq_w_v[hkv], hkv, seq_n_kv)
    hkv = hkv + 1
  end

  # Biases use a different primitive whose last arg is seq_d_head (not a
  # head count) — args are (head index, seq_d_head).
  if qkv_bias
    qb_idx = TinyNN.tnn_gguf_find_index(gguf_handle, prefix + ".attn_q.bias")
    kb_idx = TinyNN.tnn_gguf_find_index(gguf_handle, prefix + ".attn_k.bias")
    vb_idx = TinyNN.tnn_gguf_find_index(gguf_handle, prefix + ".attn_v.bias")
    hbq = 0
    while hbq < seq_n_heads
      TinyNN.tnn_gguf_copy_head_bias_slice_to_persistent(gguf_handle, qb_idx, sess,
        self.t_seq_b_q[hbq], hbq, seq_d_head)
      hbq = hbq + 1
    end
    hbkv = 0
    while hbkv < seq_n_kv
      TinyNN.tnn_gguf_copy_head_bias_slice_to_persistent(gguf_handle, kb_idx, sess,
        self.t_seq_b_k[hbkv], hbkv, seq_d_head)
      TinyNN.tnn_gguf_copy_head_bias_slice_to_persistent(gguf_handle, vb_idx, sess,
        self.t_seq_b_v[hbkv], hbkv, seq_d_head)
      hbkv = hbkv + 1
    end
  end

  # Unsliced weights: each is copied whole into its single handle.
  o_idx    = TinyNN.tnn_gguf_find_index(gguf_handle, prefix + ".attn_output.weight")
  gate_idx = TinyNN.tnn_gguf_find_index(gguf_handle, prefix + ".ffn_gate.weight")
  up_idx   = TinyNN.tnn_gguf_find_index(gguf_handle, prefix + ".ffn_up.weight")
  down_idx = TinyNN.tnn_gguf_find_index(gguf_handle, prefix + ".ffn_down.weight")
  TinyNN.tnn_gguf_copy_verbatim_to_persistent(gguf_handle, o_idx,    sess, self.t_seq_w_o)
  TinyNN.tnn_gguf_copy_verbatim_to_persistent(gguf_handle, gate_idx, sess, self.t_seq_w_gate)
  TinyNN.tnn_gguf_copy_verbatim_to_persistent(gguf_handle, up_idx,   sess, self.t_seq_w_up)
  TinyNN.tnn_gguf_copy_verbatim_to_persistent(gguf_handle, down_idx, sess, self.t_seq_w_down)
end

#load_from_gguf_mmap!(sess, cache, gguf_handle, li, seq_n_heads, seq_n_kv, seq_d_head, seq_d_model, seq_d_ff, seq_lora_q_enabled, seq_lora_q_rank, seq_lora_q_adamw_enabled, qkv_bias) ⇒ Object

P2.7 — load this block’s PERSISTENT weight handles from the mmap’d GGUF for the realize_for_mmap path. Moved VERBATIM from the per-block ALLOC-from-offsets loop body in Toy::LLM::Engine::LlamaSeqEngine#realize_for_mmap (op order unchanged → bit-identical graph): the block now OWNS the alloc + assignment of its self.t_seq_* handles, exactly as it already owns them at forward time and in alloc_trainable_f32_weights!. NO ivar reads off the cache — every value (sess, the seq dims, the lora flags, qkv_bias, the gguf handle, and the layer index `li`) arrives as an ARG.

CRITICAL constraints (all per the alloc_trainable_f32_weights! / load_globals_from_gguf_mmap! precedents):

- head_nbytes STAYS on the cache and is back-called through the
  passed `cache` ref (same pattern alloc_trainable_f32_weights! uses
  for cache.ft_add_* / cache.ft_name_last).
- The LoRA tnn_tensor_set_name(:str) naming is NOT issued here in
  block class-load scope (step_bind / :str landmine #16). The block
  assembles the runtime name string and hands it to the cache via
  cache.lora_name_q! / cache.lora_name_q_adam! — the actual :str FFI
  call lives on the cache realize runtime path, exactly as
  ft_name_last stays on the cache.
- w_o is allocated hard-square ne=[d_model, d_model] — VERBATIM from
  the former line 668. Do NOT unify with alloc_trainable_f32_weights!'s
  divergent [d_model, n_heads*d_head]; the gguf round-trip PINS
  n_heads*d_head == d_model so this branch is divergence-blind.
- The set_param marking loop, finalize, Adam zero-init and
  build_and_realize! STAY on the cache realize method as head/tail.


463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
# File 'lib/toy/llm/blocks/transformer_block.rb', line 463

# Allocates this block's persistent tensor handles for layer `li` and stores
# them on the block's t_seq_* accessors. GGUF-backed weights are created as
# mmap inputs at file offsets looked up by tensor name ("blk.<li>.<suffix>");
# LoRA-Q adapters and their Adam moments are created as F32 persistent inputs
# that are not tied to a file offset.
#
# Allocation order against `sess` is: norm gammas, Q heads, LoRA-Q (optional),
# LoRA-Q Adam m/v (optional), K/V heads, Q/K/V biases (optional), then
# O / gate / up / down. Keep this order stable — the statements are a
# sequence of allocations on the same session.
#
# Every per-head array is seeded with head 0 and then extended by a loop that
# starts at 1, so each array always holds at least one handle even when the
# corresponding head count is 0.
#
# @param sess session handle passed as first argument to every TinyNN allocator
# @param cache used only for back-calls: head_nbytes, lora_name_q!,
#   lora_name_q_adam!
# @param gguf_handle handle used for tensor index / offset / type lookups
# @param li layer index; stringified into the "blk.<li>" name prefix
# @param seq_n_heads number of Q heads (also the number of LoRA-Q adapters)
# @param seq_n_kv number of K/V heads
# @param seq_d_head per-head dimension
# @param seq_d_model model dimension
# @param seq_d_ff FFN hidden dimension
# @param seq_lora_q_enabled when truthy, allocate and name LoRA-Q a/b per head
# @param seq_lora_q_rank LoRA rank used for the adapter shapes
# @param seq_lora_q_adamw_enabled when truthy (and LoRA-Q is enabled),
#   allocate and name Adam m/v tensors for every adapter
# @param qkv_bias when truthy, allocate per-head Q/K/V bias handles
# @return the value of the final assignment (the ffn_down handle); callers
#   presumably ignore it — TODO confirm
#
# NOTE(review): the indices returned by tnn_gguf_find_index are used without
# any check; what happens for a tensor name that is absent from the file
# depends on TinyNN — confirm.
def load_from_gguf_mmap!(sess, cache, gguf_handle, li,
                         seq_n_heads, seq_n_kv, seq_d_head, seq_d_model,
                         seq_d_ff, seq_lora_q_enabled, seq_lora_q_rank,
                         seq_lora_q_adamw_enabled, qkv_bias)
  # Common name prefix for every tensor of this layer.
  prefix = "blk." + li.to_s

  # RMS-norm gammas: one 1D tensor of seq_d_model elements each.
  # NOTE(review): the literal 0 is presumably the element type (F32) for the
  # 1D allocator, mirroring the type argument of the 2D variant — confirm.
  rn1_idx = TinyNN.tnn_gguf_find_index(gguf_handle, prefix + ".attn_norm.weight")
  rn2_idx = TinyNN.tnn_gguf_find_index(gguf_handle, prefix + ".ffn_norm.weight")
  self.t_seq_rn1_gamma = TinyNN.tnn_input_1d_persistent_mmap(sess,
                          seq_d_model, 0,
                          TinyNN.tnn_gguf_tensor_file_offset(gguf_handle, rn1_idx))
  self.t_seq_rn2_gamma = TinyNN.tnn_input_1d_persistent_mmap(sess,
                          seq_d_model, 0,
                          TinyNN.tnn_gguf_tensor_file_offset(gguf_handle, rn2_idx))

  # Q heads — per-head [d_head, d_model] tensor, n_heads of them.
  # Head h is addressed at q_off_base + h * q_stride, where q_stride is the
  # per-head byte size the cache reports for this tensor type.
  q_idx      = TinyNN.tnn_gguf_find_index(gguf_handle, prefix + ".attn_q.weight")
  q_off_base = TinyNN.tnn_gguf_tensor_file_offset(gguf_handle, q_idx)
  q_type     = TinyNN.tnn_gguf_tensor_type(gguf_handle, q_idx)
  q_stride   = cache.head_nbytes(q_type, seq_d_head, seq_d_model)
  self.t_seq_w_q = [TinyNN.tnn_input_2d_persistent_mmap(sess,
                     seq_d_head, seq_d_model, q_type, q_off_base)]
  hq = 1
  while hq < seq_n_heads
    self.t_seq_w_q.push(TinyNN.tnn_input_2d_persistent_mmap(sess,
                         seq_d_head, seq_d_model, q_type,
                         q_off_base + hq * q_stride))
    hq = hq + 1
  end

  # M3 step 3 — LoRA-Q adapter pair per Q head. Trainable F32 in
  # ctx_w (mirrors SmolLM2KVFFICache). Optional persistent Adam m/v.
  # Names ride the llama.cpp convention extended for the per-head /
  # adapter axes: blk.N.attn_q.head_H.lora_{a,b}.weight (+ .m / .v).
  # Only the "blk.N.attn_q.head_H" stem is built here; the cache is handed
  # that stem and performs the actual naming.
  lora_prefix = "blk." + li.to_s + ".attn_q.head_"
  if seq_lora_q_enabled
    # Adapter shapes as passed to the allocator: A is (rank, d_model),
    # B is (d_head, rank).
    self.t_seq_w_lora_a_q = [TinyNN.tnn_input_2d_f32_persistent(sess,
                              seq_lora_q_rank, seq_d_model)]
    self.t_seq_w_lora_b_q = [TinyNN.tnn_input_2d_f32_persistent(sess,
                              seq_d_head, seq_lora_q_rank)]
    hql = 1
    while hql < seq_n_heads
      self.t_seq_w_lora_a_q.push(TinyNN.tnn_input_2d_f32_persistent(sess,
                                  seq_lora_q_rank, seq_d_model))
      self.t_seq_w_lora_b_q.push(TinyNN.tnn_input_2d_f32_persistent(sess,
                                  seq_d_head, seq_lora_q_rank))
      hql = hql + 1
    end
    # Second pass: naming happens only after every a/b pair is allocated,
    # and goes through the cache rather than a direct FFI call from here.
    hqn = 0
    while hqn < seq_n_heads
      cache.lora_name_q!(self.t_seq_w_lora_a_q[hqn],
                         self.t_seq_w_lora_b_q[hqn],
                         lora_prefix + hqn.to_s)
      hqn = hqn + 1
    end

    if seq_lora_q_adamw_enabled
      # Adam first/second moments (m, v) for each adapter, shaped like the
      # adapter they track: a_q_m / a_q_v match A, b_q_m / b_q_v match B.
      self.t_seq_w_lora_a_q_m = [TinyNN.tnn_input_2d_f32_persistent(sess,
                                  seq_lora_q_rank, seq_d_model)]
      self.t_seq_w_lora_a_q_v = [TinyNN.tnn_input_2d_f32_persistent(sess,
                                  seq_lora_q_rank, seq_d_model)]
      self.t_seq_w_lora_b_q_m = [TinyNN.tnn_input_2d_f32_persistent(sess,
                                  seq_d_head, seq_lora_q_rank)]
      self.t_seq_w_lora_b_q_v = [TinyNN.tnn_input_2d_f32_persistent(sess,
                                  seq_d_head, seq_lora_q_rank)]
      hqm = 1
      while hqm < seq_n_heads
        self.t_seq_w_lora_a_q_m.push(TinyNN.tnn_input_2d_f32_persistent(sess,
                                      seq_lora_q_rank, seq_d_model))
        self.t_seq_w_lora_a_q_v.push(TinyNN.tnn_input_2d_f32_persistent(sess,
                                      seq_lora_q_rank, seq_d_model))
        self.t_seq_w_lora_b_q_m.push(TinyNN.tnn_input_2d_f32_persistent(sess,
                                      seq_d_head, seq_lora_q_rank))
        self.t_seq_w_lora_b_q_v.push(TinyNN.tnn_input_2d_f32_persistent(sess,
                                      seq_d_head, seq_lora_q_rank))
        hqm = hqm + 1
      end
      # Same two-pass shape as above: allocate all moments, then name them
      # through the cache using the same per-head stem.
      hmn = 0
      while hmn < seq_n_heads
        cache.lora_name_q_adam!(self.t_seq_w_lora_a_q_m[hmn],
                                self.t_seq_w_lora_a_q_v[hmn],
                                self.t_seq_w_lora_b_q_m[hmn],
                                self.t_seq_w_lora_b_q_v[hmn],
                                lora_prefix + hmn.to_s)
        hmn = hmn + 1
      end
    end
  end

  # K, V heads — per-KV-head [d_head, d_model].
  # K and V each get their own type and stride lookup, so the two tensors
  # are not required to share an element type.
  k_idx      = TinyNN.tnn_gguf_find_index(gguf_handle, prefix + ".attn_k.weight")
  v_idx      = TinyNN.tnn_gguf_find_index(gguf_handle, prefix + ".attn_v.weight")
  k_off_base = TinyNN.tnn_gguf_tensor_file_offset(gguf_handle, k_idx)
  v_off_base = TinyNN.tnn_gguf_tensor_file_offset(gguf_handle, v_idx)
  k_type     = TinyNN.tnn_gguf_tensor_type(gguf_handle, k_idx)
  v_type     = TinyNN.tnn_gguf_tensor_type(gguf_handle, v_idx)
  k_stride   = cache.head_nbytes(k_type, seq_d_head, seq_d_model)
  v_stride   = cache.head_nbytes(v_type, seq_d_head, seq_d_model)
  self.t_seq_w_k = [TinyNN.tnn_input_2d_persistent_mmap(sess,
                     seq_d_head, seq_d_model, k_type, k_off_base)]
  self.t_seq_w_v = [TinyNN.tnn_input_2d_persistent_mmap(sess,
                     seq_d_head, seq_d_model, v_type, v_off_base)]
  hkv = 1
  while hkv < seq_n_kv
    # K and V handles are allocated interleaved, head by head.
    self.t_seq_w_k.push(TinyNN.tnn_input_2d_persistent_mmap(sess,
                         seq_d_head, seq_d_model, k_type,
                         k_off_base + hkv * k_stride))
    self.t_seq_w_v.push(TinyNN.tnn_input_2d_persistent_mmap(sess,
                         seq_d_head, seq_d_model, v_type,
                         v_off_base + hkv * v_stride))
    hkv = hkv + 1
  end

  # Optional Q/K/V biases (Qwen2.x). 1D [d_head] per head, contiguous.
  # When qkv_bias is falsy the t_seq_b_* accessors are left untouched.
  if qkv_bias
    qb_idx = TinyNN.tnn_gguf_find_index(gguf_handle, prefix + ".attn_q.bias")
    kb_idx = TinyNN.tnn_gguf_find_index(gguf_handle, prefix + ".attn_k.bias")
    vb_idx = TinyNN.tnn_gguf_find_index(gguf_handle, prefix + ".attn_v.bias")
    qb_off = TinyNN.tnn_gguf_tensor_file_offset(gguf_handle, qb_idx)
    kb_off = TinyNN.tnn_gguf_tensor_file_offset(gguf_handle, kb_idx)
    vb_off = TinyNN.tnn_gguf_tensor_file_offset(gguf_handle, vb_idx)
    # NOTE(review): stride is hard-coded to 4 bytes per element and the bias
    # tensor type is never queried — assumes biases are stored as F32;
    # confirm against the GGUF files this path is expected to load.
    bias_stride = seq_d_head * 4

    self.t_seq_b_q = [TinyNN.tnn_input_1d_persistent_mmap(sess, seq_d_head, 0, qb_off)]
    # hq / hkv are the same locals used by the weight loops above, reset here.
    hq = 1
    while hq < seq_n_heads
      self.t_seq_b_q.push(TinyNN.tnn_input_1d_persistent_mmap(sess, seq_d_head, 0,
                           qb_off + hq * bias_stride))
      hq = hq + 1
    end
    self.t_seq_b_k = [TinyNN.tnn_input_1d_persistent_mmap(sess, seq_d_head, 0, kb_off)]
    self.t_seq_b_v = [TinyNN.tnn_input_1d_persistent_mmap(sess, seq_d_head, 0, vb_off)]
    hkv = 1
    while hkv < seq_n_kv
      self.t_seq_b_k.push(TinyNN.tnn_input_1d_persistent_mmap(sess, seq_d_head, 0,
                           kb_off + hkv * bias_stride))
      self.t_seq_b_v.push(TinyNN.tnn_input_1d_persistent_mmap(sess, seq_d_head, 0,
                           vb_off + hkv * bias_stride))
      hkv = hkv + 1
    end
  end

  # O, FFN — full 2D weights, no per-head split.
  # Dimensions as passed: w_o (d_model, d_model); gate and up (d_ff, d_model);
  # down (d_model, d_ff). w_o is deliberately square here — it relies on
  # n_heads * d_head == d_model for GGUF-loaded models, so do not "fix" it to
  # n_heads * d_head without checking the trainable-alloc path it diverges from.
  o_idx    = TinyNN.tnn_gguf_find_index(gguf_handle, prefix + ".attn_output.weight")
  gate_idx = TinyNN.tnn_gguf_find_index(gguf_handle, prefix + ".ffn_gate.weight")
  up_idx   = TinyNN.tnn_gguf_find_index(gguf_handle, prefix + ".ffn_up.weight")
  down_idx = TinyNN.tnn_gguf_find_index(gguf_handle, prefix + ".ffn_down.weight")
  self.t_seq_w_o    = TinyNN.tnn_input_2d_persistent_mmap(sess, seq_d_model, seq_d_model,
                       TinyNN.tnn_gguf_tensor_type(gguf_handle, o_idx),
                       TinyNN.tnn_gguf_tensor_file_offset(gguf_handle, o_idx))
  self.t_seq_w_gate = TinyNN.tnn_input_2d_persistent_mmap(sess, seq_d_ff, seq_d_model,
                       TinyNN.tnn_gguf_tensor_type(gguf_handle, gate_idx),
                       TinyNN.tnn_gguf_tensor_file_offset(gguf_handle, gate_idx))
  self.t_seq_w_up   = TinyNN.tnn_input_2d_persistent_mmap(sess, seq_d_ff, seq_d_model,
                       TinyNN.tnn_gguf_tensor_type(gguf_handle, up_idx),
                       TinyNN.tnn_gguf_tensor_file_offset(gguf_handle, up_idx))
  self.t_seq_w_down = TinyNN.tnn_input_2d_persistent_mmap(sess, seq_d_model, seq_d_ff,
                       TinyNN.tnn_gguf_tensor_type(gguf_handle, down_idx),
                       TinyNN.tnn_gguf_tensor_file_offset(gguf_handle, down_idx))
end