Class: Toy::LLM::Engine::LlamaSeqEngineCuda

Inherits:
Object
  • Object
show all
Defined in:
lib/toy/llm/engine/llama_seq_engine_cuda.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ LlamaSeqEngineCuda

Returns a new instance of LlamaSeqEngineCuda.



102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 102

# Builds an engine in its un-realized state: the arch object is created
# first, every shape ivar is zeroed, RoPE settings take their defaults
# (base 10000.0, no scaling), every tensor/session handle is a null
# pointer, and all LoRA / full-fine-tune switches are off.
def initialize
  # P2.5 — the arch owns the arch-level persistent handles + the
  # blocks array (seeded with one block in the arch ctor, matching the
  # former cache seed). Constructed first so the delegators are live.
  @seq_arch       = Toy::LLM::Archs::LlamaArch.new
  @seq_realized   = false
  @seq_t          = 0
  @seq_b          = 1
  # GH#9 — mixed-precision compute. 0 = F32 (current behaviour;
  # bit-identical to pre-GH#9). 1 = F16, 30 = BF16. When != 0,
  # weight matmuls inside build_seq_block / build_seq_qhead route
  # through mp_matmul which casts the F32 master to the chosen
  # dtype inline in the forward graph. F32 master is kept (required
  # by opt_step_adamw); the cast result lives in transient scratch.
  @seq_weight_dtype = 0
  @seq_d_model    = 0
  @seq_d_ff       = 0
  @seq_n_heads    = 0
  @seq_n_kv       = 0
  @seq_d_head     = 0
  @seq_group_size = 0
  @seq_n_layers   = 0
  @seq_vocab_size = 0
  @seq_rope_base            = 10000.0
  @seq_rope_scaling         = Toy::RopeScaling.none
  # Seed a concrete Cfg from the same defaults so the ivar always
  # holds a real Toy::LLM::Primitives::RoPE::Cfg (never nil/RbVal).
  # Rebuilt per realize path once the true dims are known.
  @seq_rope_cfg             = Toy::LLM::Primitives::RoPE::Cfg.new(
                                @seq_d_head, @seq_rope_base,
                                @seq_rope_scaling.freq_scale,
                                @seq_rope_scaling.ext_factor,
                                @seq_rope_scaling.attn_factor,
                                @seq_rope_scaling.beta_fast,
                                @seq_rope_scaling.beta_slow)
  @t_seq_rope_freq_factors  = TinyNNCuda.tnn_null_ptr
  @seq_rms_eps    = 1.0e-5
  @sess                  = TinyNNCuda.tnn_null_ptr
  # P2.5 — token_embed / final_norm_gamma / output / w_proj and the
  # blocks array are seeded on @seq_arch (see arch ctor); the cache
  # reaches them via the delegators above.
  @seq_has_untied_output = false
  @seq_has_qkv_bias      = false
  @seq_gguf_handle_keepalive = TinyNNCuda.tnn_null_ptr
  @t_seq_token_ids = TinyNNCuda.tnn_null_ptr
  @t_seq_positions = TinyNNCuda.tnn_null_ptr
  # GH#7 — batched-training block-causal attention mask. Allocated
  # only when @seq_b > 1 (realize_for_random_init with t_batch > 1);
  # otherwise stays NULL and build_seq_qhead falls back to the
  # diag_mask_inf + softmax path (bit-identical to today at B=1).
  @t_seq_attn_mask = TinyNNCuda.tnn_null_ptr
  @t_seq_x_embed   = TinyNNCuda.tnn_null_ptr
  @t_seq_x_final   = TinyNNCuda.tnn_null_ptr
  @t_seq_logits    = TinyNNCuda.tnn_null_ptr
  @seq_lora_q_enabled       = false
  @seq_lora_q_rank          = 0
  @seq_lora_q_adamw_enabled = false
  @seq_full_finetune_enabled = false
  # Each globals array is created with one null-pointer element that is
  # popped immediately, so all three start empty.
  # NOTE(review): presumably the seed element exists only to fix the
  # array's element type for the compiler — confirm.
  @ft_globals_weights = [TinyNNCuda.tnn_null_ptr]; @ft_globals_weights.pop
  @ft_globals_m       = [TinyNNCuda.tnn_null_ptr]; @ft_globals_m.pop
  @ft_globals_v       = [TinyNNCuda.tnn_null_ptr]; @ft_globals_v.pop
  @ft_train_embeddings_enabled = false
  # E2.3 (towards GH#14) — projection-lens path. donor_d_in is read
  # from cfg in realize_for_random_init; t_seq_w_proj is the
  # trainable [donor_d_in, d_model] linear inserted after the embed
  # get_rows when donor_d_in > 0.
  @seq_donor_d_in   = 0
  # P2.5 — t_seq_w_proj is seeded on @seq_arch (arch ctor); cache
  # reaches it via the t_seq_w_proj delegator.
end

Instance Attribute Details

#ft_globals_m ⇒ Object

Returns the value of attribute ft_globals_m.



37
38
39
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 37

# Adam "m" handles for the global weights, index-parallel to
# ft_globals_weights (appended by ft_add_global_1d / ft_add_global_2d).
def ft_globals_m
  @ft_globals_m
end

#ft_globals_v ⇒ Object

Returns the value of attribute ft_globals_v.



37
38
39
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 37

# Adam "v" handles for the global weights, index-parallel to
# ft_globals_weights (appended by ft_add_global_1d / ft_add_global_2d).
def ft_globals_v
  @ft_globals_v
end

#ft_globals_weights ⇒ Object

Returns the value of attribute ft_globals_weights.



37
38
39
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 37

# Global trainable weight handles; build_training_step emits one AdamW
# step per entry when full fine-tune is enabled. Empty after initialize.
def ft_globals_weights
  @ft_globals_weights
end

#ft_train_embeddings_enabled ⇒ Object

Returns the value of attribute ft_train_embeddings_enabled.



37
38
39
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 37

# Flag: false after initialize, set to true by
# enable_full_finetune_embeddings!.
def ft_train_embeddings_enabled
  @ft_train_embeddings_enabled
end

#seq_arch ⇒ Object

Returns the value of attribute seq_arch.



37
38
39
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 37

# The Toy::LLM::Archs::LlamaArch instance created in initialize;
# build_forward_in_current_ctx delegates graph construction to it.
def seq_arch
  @seq_arch
end

#seq_b ⇒ Object

Returns the value of attribute seq_b.



37
38
39
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 37

# Batch factor B (1 after initialize). Multiplied with @seq_t to size
# the token/position/label tensors; values above 1 trigger allocation of
# the block-causal mask in finalize_weights_and_upload_constants!.
def seq_b
  @seq_b
end

#seq_d_ff ⇒ Object

Returns the value of attribute seq_d_ff.



37
38
39
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 37

# Copy of cfg.d_ff written by apply_seq_cfg! (0 after initialize).
def seq_d_ff
  @seq_d_ff
end

#seq_d_head ⇒ Object

Returns the value of attribute seq_d_head.



37
38
39
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 37

# Copy of cfg.head_dim written by apply_seq_cfg! (0 after initialize).
def seq_d_head
  @seq_d_head
end

#seq_d_model ⇒ Object

Returns the value of attribute seq_d_model.



37
38
39
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 37

# Copy of cfg.d_model written by apply_seq_cfg! (0 after initialize).
def seq_d_model
  @seq_d_model
end

#seq_full_finetune_enabled ⇒ Object

Returns the value of attribute seq_full_finetune_enabled.



37
38
39
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 37

# Flag: false after initialize, set to true by enable_full_finetune!.
# build_training_step branches on it to choose the optimizer wiring.
def seq_full_finetune_enabled
  @seq_full_finetune_enabled
end

#seq_gguf_handle_keepalive ⇒ Object

Returns the value of attribute seq_gguf_handle_keepalive.



37
38
39
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 37

# Null pointer after initialize.
# NOTE(review): presumably retains the GGUF handle so it outlives the
# weights loaded from it — confirm against the realize_for_* paths.
def seq_gguf_handle_keepalive
  @seq_gguf_handle_keepalive
end

#seq_group_size ⇒ Object

Returns the value of attribute seq_group_size.



37
38
39
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 37

# cfg.n_heads / cfg.n_kv as computed by apply_seq_cfg! (0 after initialize).
def seq_group_size
  @seq_group_size
end

#seq_has_qkv_bias ⇒ Object

Returns the value of attribute seq_has_qkv_bias.



37
38
39
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 37

# Flag (false after initialize) forwarded to LlamaArch#build_forward by
# build_forward_in_current_ctx.
def seq_has_qkv_bias
  @seq_has_qkv_bias
end

#seq_has_untied_output ⇒ Object

Returns the value of attribute seq_has_untied_output.



37
38
39
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 37

# Flag (false after initialize) forwarded to LlamaArch#build_forward by
# build_forward_in_current_ctx.
def seq_has_untied_output
  @seq_has_untied_output
end

#seq_lora_q_adamw_enabled ⇒ Object

Returns the value of attribute seq_lora_q_adamw_enabled.



37
38
39
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 37

# Flag: false after initialize, set to true by enable_lora_q_adamw!.
# Required (together with seq_lora_q_enabled) by the LoRA branch of
# build_training_step.
def seq_lora_q_adamw_enabled
  @seq_lora_q_adamw_enabled
end

#seq_lora_q_enabled ⇒ Object

Returns the value of attribute seq_lora_q_enabled.



37
38
39
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 37

# Flag: false after initialize, set to true by enable_lora_q!.
def seq_lora_q_enabled
  @seq_lora_q_enabled
end

#seq_lora_q_rank ⇒ Object

Returns the value of attribute seq_lora_q_rank.



37
38
39
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 37

# LoRA rank passed to enable_lora_q!(r); 0 after initialize.
def seq_lora_q_rank
  @seq_lora_q_rank
end

#seq_n_heads ⇒ Object

Returns the value of attribute seq_n_heads.



37
38
39
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 37

# Copy of cfg.n_heads written by apply_seq_cfg! (0 after initialize).
def seq_n_heads
  @seq_n_heads
end

#seq_n_kv ⇒ Object

Returns the value of attribute seq_n_kv.



37
38
39
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 37

# Copy of cfg.n_kv written by apply_seq_cfg! (0 after initialize).
def seq_n_kv
  @seq_n_kv
end

#seq_n_layers ⇒ Object

Returns the value of attribute seq_n_layers.



37
38
39
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 37

# Copy of cfg.n_layers written by apply_seq_cfg! (0 after initialize);
# bounds the per-block loops in this class.
def seq_n_layers
  @seq_n_layers
end

#seq_realized ⇒ Object

Returns the value of attribute seq_realized.



37
38
39
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 37

# Flag: false after initialize, set to true at the end of
# build_and_realize!.
def seq_realized
  @seq_realized
end

#seq_rms_eps ⇒ Object

Returns the value of attribute seq_rms_eps.



37
38
39
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 37

# Copy of cfg.rms_eps written by apply_seq_cfg!; 1.0e-5 after initialize.
def seq_rms_eps
  @seq_rms_eps
end

#seq_rope_base ⇒ Object

Returns the value of attribute seq_rope_base.



37
38
39
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 37

# Copy of cfg.rope_base written by apply_seq_cfg!; 10000.0 after
# initialize.
def seq_rope_base
  @seq_rope_base
end

#seq_rope_scaling ⇒ Object

Returns the value of attribute seq_rope_scaling.



37
38
39
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 37

# Copy of cfg.rope_scaling written by apply_seq_cfg!;
# Toy::RopeScaling.none after initialize.
def seq_rope_scaling
  @seq_rope_scaling
end

#seq_t ⇒ Object

Returns the value of attribute seq_t.



37
38
39
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 37

# Sequence length T (0 after initialize). Multiplied with @seq_b to size
# the token/position/label tensors.
def seq_t
  @seq_t
end

#seq_vocab_size ⇒ Object

Returns the value of attribute seq_vocab_size.



37
38
39
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 37

# Copy of cfg.vocab written by apply_seq_cfg! (0 after initialize).
def seq_vocab_size
  @seq_vocab_size
end

#seq_weight_dtype ⇒ Object

Returns the value of attribute seq_weight_dtype.



37
38
39
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 37

# Mixed-precision selector forwarded to LlamaArch#build_forward. Per the
# initialize comments: 0 = F32 (default), 1 = F16, 30 = BF16.
def seq_weight_dtype
  @seq_weight_dtype
end

#sess ⇒ Object

Returns the value of attribute sess.



37
38
39
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 37

# Session handle passed as the first argument to the TinyNNCuda calls in
# this class; a null pointer after initialize.
def sess
  @sess
end

#t_seq_attn_mask ⇒ Object

Returns the value of attribute t_seq_attn_mask.



37
38
39
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 37

# Attention-mask handle: null after initialize; allocated as a
# (T*B) x (T*B) persistent tensor by
# finalize_weights_and_upload_constants! only when @seq_b > 1.
def t_seq_attn_mask
  @t_seq_attn_mask
end

#t_seq_logits ⇒ Object

Returns the value of attribute t_seq_logits.



37
38
39
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 37

# Logits handle stored by build_forward_in_current_ctx and returned by
# forward; null after initialize.
def t_seq_logits
  @t_seq_logits
end

#t_seq_positions ⇒ Object

Returns the value of attribute t_seq_positions.



37
38
39
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 37

# Positions input handle allocated by build_forward_in_current_ctx and
# filled by forward; null after initialize.
def t_seq_positions
  @t_seq_positions
end

#t_seq_rope_freq_factors ⇒ Object

Returns the value of attribute t_seq_rope_freq_factors.



37
38
39
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 37

# RoPE freq-factors handle: null after initialize; uploaded by
# finalize_weights_and_upload_constants! when the scaling kind is :llama3.
def t_seq_rope_freq_factors
  @t_seq_rope_freq_factors
end

#t_seq_token_ids ⇒ Object

Returns the value of attribute t_seq_token_ids.



37
38
39
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 37

# Token-id input handle allocated by build_forward_in_current_ctx and
# filled by forward; null after initialize.
def t_seq_token_ids
  @t_seq_token_ids
end

#t_seq_x_embed ⇒ Object

Returns the value of attribute t_seq_x_embed.



37
38
39
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 37

# Handle copied from the arch's build_forward result (out.t_seq_x_embed)
# by build_forward_in_current_ctx; null after initialize.
def t_seq_x_embed
  @t_seq_x_embed
end

#t_seq_x_final ⇒ Object

Returns the value of attribute t_seq_x_final.



37
38
39
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 37

# Handle copied from the arch's build_forward result (out.t_seq_x_final)
# by build_forward_in_current_ctx; null after initialize.
def t_seq_x_final
  @t_seq_x_final
end

Instance Method Details

#apply_seq_cfg!(cfg) ⇒ Object

P2.6 — shared config-prologue helper. Writes the @seq_* shape/RoPE ivars that every realize_for_* path needs before allocating tensors. Pure ivar writes reading only cfg.*; no FFI, no graph state. Each realize path keeps its own `@seq_t = t_seq` (and any path-local extras) at the call site and then calls this. Byte-identical to the block that previously lived inline in all four realize_for_* methods.



212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 212

# Copies the model shape and RoPE settings from +cfg+ onto the @seq_*
# ivars and rebuilds @seq_rope_cfg from the freshly written values.
# Only reads +cfg+ and writes ivars.
#
# @param cfg [Object] responds to d_model, d_ff, n_heads, n_kv, head_dim,
#   n_layers, vocab, rope_base, rope_scaling and rms_eps
# @return the value of cfg.rms_eps (the final assignment)
def apply_seq_cfg!(cfg)
  # Shape dimensions, in the same read order as the config fields.
  @seq_d_model = cfg.d_model
  @seq_d_ff    = cfg.d_ff
  @seq_n_heads = heads    = cfg.n_heads
  @seq_n_kv    = kv_heads = cfg.n_kv
  @seq_d_head  = head_dim = cfg.head_dim

  # Number of query heads per KV head (integer division).
  @seq_group_size = heads / kv_heads
  @seq_n_layers   = cfg.n_layers
  @seq_vocab_size = cfg.vocab

  # RoPE parameters and the derived primitive config.
  @seq_rope_base    = base    = cfg.rope_base
  @seq_rope_scaling = scaling = cfg.rope_scaling
  @seq_rope_cfg = Toy::LLM::Primitives::RoPE::Cfg.new(
    head_dim,
    base,
    scaling.freq_scale,
    scaling.ext_factor,
    scaling.attn_factor,
    scaling.beta_fast,
    scaling.beta_slow
  )

  @seq_rms_eps = cfg.rms_eps
end

#build_and_realize! ⇒ Object

P2.6 — the identical tail-of-tail shared by all four realize_for_* paths: build the forward graph in the current ctx, realize it, and flip @seq_realized. Stays a CACHE method (build_forward_in_current_ctx is the cache->arch wrapper; tnn_realize is session-scoped). Gate-covered by smoke_projection_lens via realize_for_random_init.



1037
1038
1039
1040
1041
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 1037

# Builds the forward graph in the current context, hands the session and
# the logits handle to tnn_realize, then marks the engine as realized.
#
# @return [true] the value assigned to @seq_realized
def build_and_realize!
  build_forward_in_current_ctx
  TinyNNCuda.tnn_realize(@sess, @t_seq_logits)
  @seq_realized = true
end

#build_forward_in_current_ctx ⇒ Object

Build the forward graph in the CURRENT compute context. Used both from realize_for_mmap (first realize) and after tnn_reset_for_rebuild (e.g. when switching from inference to training, which needs the forward + loss + backward + opt_step all in one rebuilt ctx). Stores the per-graph tensor handles back on `self`. P2.5 — thin wrapper around Toy::LLM::Archs::LlamaArch#build_forward. Allocates the per-graph INPUT handles (token_ids, positions) — which stay CACHE-owned graph I/O, read by forward() and the uploaders — then hands the realize-set rope_cfg / donor_d_in onto the arch and calls the lifted orchestration. The three per-graph OUTPUT handles come back in a LlamaArchForwardOut and are spread onto the cache’s own ivars so every downstream reader (@t_seq_logits accessor, build_training_step CE-loss consumer, examples/06 fcache.t_seq_logits) is untouched.



1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 1057

# Allocates the token-id and position inputs (T*B elements each), copies
# the current rope_cfg / donor_d_in onto the arch, calls
# LlamaArch#build_forward, and stores the three returned handles
# (x_embed, x_final, logits) on self.
#
# @return the logits handle (value of the final assignment)
def build_forward_in_current_ctx
  # GH#7 — at B=1, @seq_t * @seq_b == @seq_t (legacy behaviour).
  # At B>1, the layout is flat [T*B]: per-batch positions cycle
  # 0..T-1 (the caller-built positions array is responsible for
  # that ordering); RoPE applies per-batch positional encoding
  # because rope_ext reads positions[k] for each ne[2] slot.
  tb = @seq_t * @seq_b
  @t_seq_token_ids = TinyNNCuda.tnn_input_1d_i32(@sess, tb)
  # NOTE(review): positions use the *_ctx allocator while token ids use
  # the plain one; the reason is not visible here — confirm the
  # asymmetry is intentional.
  @t_seq_positions = TinyNNCuda.tnn_input_1d_i32_ctx(@sess, tb)

  # The arch reads seq_rope_cfg / seq_donor_d_in off itself; the cache
  # rebuilds rope_cfg and sets donor_d_in in each realize prologue, so
  # mirror the realize-set values onto the arch right before the call.
  @seq_arch.seq_rope_cfg   = @seq_rope_cfg
  @seq_arch.seq_donor_d_in = @seq_donor_d_in

  out = @seq_arch.build_forward(
    @sess, @t_seq_token_ids, @t_seq_positions, @t_seq_rope_freq_factors,
    @t_seq_attn_mask, @seq_rms_eps, @seq_d_head, @seq_n_kv, @seq_n_heads,
    @seq_group_size, @seq_has_qkv_bias, @seq_weight_dtype,
    @seq_lora_q_enabled, @seq_t, @seq_b, @seq_n_layers,
    @seq_has_untied_output)

  # Spread the arch's outputs onto the engine's own ivars.
  @t_seq_x_embed = out.t_seq_x_embed
  @t_seq_x_final = out.t_seq_x_final
  @t_seq_logits  = out.t_seq_logits
end

#build_training_step ⇒ Object

M3 step 3 — rebuild the session graph as forward + CE loss + backward + AdamW opt_step over every LoRA pair. After this, callers upload token IDs + positions + labels (one-hot vocab×T) + hp vector and call tnn_compute_backward to get one training step over the whole T-position sequence.

Returns the (loss_tensor, labels_tensor, hp_tensor) triple.



1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 1092

# Resets the session graph and rebuilds it as forward + cross-entropy
# loss + backward + one AdamW step per trainable tensor. With full
# fine-tune enabled the trainable set is every block's ft_weights plus
# the @ft_globals_* arrays; otherwise it is the per-head LoRA A/B pairs.
#
# @return [Array, nil] [t_loss, t_labels, t_hp]; nil (after printing a
#   message) when neither full fine-tune nor LoRA-Q with AdamW moments
#   has been enabled
def build_training_step
  if !@seq_full_finetune_enabled && (!@seq_lora_q_enabled || !@seq_lora_q_adamw_enabled)
    puts "build_training_step: requires enable_lora_q! AND enable_lora_q_adamw!  (or enable_full_finetune!)"
    return nil
  end
  TinyNNCuda.tnn_reset_for_rebuild(@sess)
  build_forward_in_current_ctx

  # Label tensor: same shape as logits, ggml ne=[vocab, T*B]. Our
  # wrapper takes (rows, cols) and emits ggml(cols, rows), so pass
  # (T*B, vocab) here to get ne=[vocab, T*B]. One-hot per ne1-column
  # (i.e. per (batch, position) slot). At B=1, identical to legacy.
  t_labels = TinyNNCuda.tnn_input_2d_f32(@sess, @seq_t * @seq_b, @seq_vocab_size)
  # Hyper-params vector for AdamW: alpha, beta1, beta2, eps, wd, beta1h, beta2h.
  t_hp = TinyNNCuda.tnn_input_1d_f32(@sess, 7)

  # CE loss over all T columns. ggml_cross_entropy_loss returns the
  # mean over columns — masking is a follow-up (would zero specific
  # columns in labels before this op).
  t_loss = TinyNNCuda.tnn_cross_entropy_loss(@sess, @t_seq_logits, t_labels)
  TinyNNCuda.tnn_set_output(t_loss)
  TinyNNCuda.tnn_set_loss(t_loss)

  TinyNNCuda.tnn_build_forward_only(@sess, t_loss)
  TinyNNCuda.tnn_build_backward(@sess)

  if @seq_full_finetune_enabled
    # F3 — emit opt_step_adamw for every recorded (weight, m, v)
    # triple. The arrays are populated in realize_for_full_finetune.
    li = 0
    while li < @seq_n_layers
      blk = self.seq_blocks_ffi[li]
      wi = 0
      while wi < blk.ft_weights.length
        tw = blk.ft_weights[wi]
        tg = TinyNNCuda.tnn_tensor_grad(@sess, tw)
        to = TinyNNCuda.tnn_opt_step_adamw(@sess, tw, tg,
                                        blk.ft_m[wi], blk.ft_v[wi], t_hp)
        TinyNNCuda.tnn_extend_backward_graph(@sess, to)
        wi = wi + 1
      end
      li = li + 1
    end
    # Globals (token_embed, final-norm, optional untied output).
    gi = 0
    while gi < @ft_globals_weights.length
      tw = @ft_globals_weights[gi]
      tg = TinyNNCuda.tnn_tensor_grad(@sess, tw)
      to = TinyNNCuda.tnn_opt_step_adamw(@sess, tw, tg,
                                      @ft_globals_m[gi], @ft_globals_v[gi], t_hp)
      TinyNNCuda.tnn_extend_backward_graph(@sess, to)
      gi = gi + 1
    end
  else
    # LoRA-only training (M3 step 3). One opt_step_adamw per LoRA-A
    # and per LoRA-B tensor; thread each through extend_backward_graph
    # so sched sees the writes.
    li = 0
    while li < @seq_n_layers
      blk = self.seq_blocks_ffi[li]
      hq = 0
      while hq < @seq_n_heads
        t_a       = blk.t_seq_w_lora_a_q[hq]
        t_b       = blk.t_seq_w_lora_b_q[hq]
        t_grad_a  = TinyNNCuda.tnn_tensor_grad(@sess, t_a)
        t_grad_b  = TinyNNCuda.tnn_tensor_grad(@sess, t_b)
        t_opt_a   = TinyNNCuda.tnn_opt_step_adamw(@sess, t_a, t_grad_a,
                                                blk.t_seq_w_lora_a_q_m[hq],
                                                blk.t_seq_w_lora_a_q_v[hq], t_hp)
        t_opt_b   = TinyNNCuda.tnn_opt_step_adamw(@sess, t_b, t_grad_b,
                                                blk.t_seq_w_lora_b_q_m[hq],
                                                blk.t_seq_w_lora_b_q_v[hq], t_hp)
        TinyNNCuda.tnn_extend_backward_graph(@sess, t_opt_a)
        TinyNNCuda.tnn_extend_backward_graph(@sess, t_opt_b)
        hq = hq + 1
      end
      li = li + 1
    end
  end

  # Pin every node in graph_b before sched-alloc — workaround for the
  # ggml-cpu sched-aliasing bug on long backward chains (documented in
  # project_cpu_cuda_lora_train_divergence_2026_05_21). Memory cost
  # grows roughly with node count; fine for SmolLM2-135M at T<=64.
  TinyNNCuda.tnn_pin_all_graph_b_nodes(@sess)
  TinyNNCuda.tnn_realize_backward(@sess)
  [t_loss, t_labels, t_hp]
end

#enable_full_finetune! ⇒ Object

F3 — turn on full fine-tune. Every per-block weight tensor will be allocated as writable F32 in ctx_w (instead of mmap’d from the GGUF), paired with persistent Adam m/v, and marked trainable. Mutually exclusive with enable_lora_q!. Call BEFORE realize_for_full_finetune.



186
187
188
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 186

# Switches the engine to full fine-tune mode by setting
# @seq_full_finetune_enabled; build_training_step then takes its
# full-fine-tune branch.
#
# @return [true]
def enable_full_finetune!
  @seq_full_finetune_enabled = true
end

#enable_full_finetune_embeddings! ⇒ Object

F3 — additionally train the embedding / final-norm gamma / untied output. Opt-in: the embed tensor on Qwen-class vocab is large and makes the memory budget noticeably tighter, but the math itself works correctly post vendor-patches/0006 (chunked get_rows_back).



177
178
179
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 177

# Opts in to training the embedding-side globals by setting
# @ft_train_embeddings_enabled. Only the flag is written here.
#
# @return [true]
def enable_full_finetune_embeddings!
  @ft_train_embeddings_enabled = true
end

#enable_lora_q!(r) ⇒ Object

M3 step 3 — turn on LoRA on the Q projection. Adapter A is (r, d_model), B is (d_head, r). Standard init: A small Gaussian, B zero → adapter is a no-op at step 0. Call BEFORE realize_for_mmap. Mirrors SmolLM2KVFFICache#enable_lora_q!.



194
195
196
197
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 194

# Turns on LoRA for the Q projection and records the adapter rank.
# Only the two ivars are written here; no tensors are allocated.
#
# @param r [Integer] adapter rank stored in @seq_lora_q_rank
# @return the rank +r+ (value of the final assignment)
def enable_lora_q!(r)
  @seq_lora_q_enabled = true
  @seq_lora_q_rank    = r
end

#enable_lora_q_adamw! ⇒ Object

M3 step 3 — allocate persistent AdamW moments next to each LoRA pair (parallel to F1.2 step 6b on SmolLM2KVFFICache). Required to keep optimizer state alive across reset_for_rebuild / multi-step training.



202
203
204
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 202

# Sets @seq_lora_q_adamw_enabled, which build_training_step requires
# (together with LoRA-Q itself) before it will build the LoRA training
# graph. Only the flag is written here.
#
# @return [true]
def enable_lora_q_adamw!
  @seq_lora_q_adamw_enabled = true
end

#finalize_weights_and_upload_constants! ⇒ Object

P2.6 — finalize the backend weight buffers and upload the per-model constants that depend on the buffers existing. This is the identical head-of-tail shared by all four realize_for_* paths:

1. allocate the B>1 block-causal mask in ctx_w (NULL at B=1),
2. tnn_finalize_weights,
3. upload the llama3 RoPE freq_factors (no-op unless :llama3),
4. upload the B>1 block-causal mask values.

Stays a CACHE method: the finalize FFI sequencing is session-scoped. Gate-covered end-to-end by smoke_projection_lens (B=1, non-llama3): the two inner branches are dead under the gate but relocate verbatim.



1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 1006

# Finalizes the session's weight buffers and uploads the constants that
# need those buffers to exist. Order matters: the optional mask tensor is
# allocated BEFORE tnn_finalize_weights, and both uploads happen AFTER it.
def finalize_weights_and_upload_constants!
  # GH#7 — block-causal attention mask for B>1. At B=1 the mask stays
  # NULL and build_seq_qhead uses diag_mask_inf + softmax. Allocated
  # in ctx_w as f32 persistent so it survives reset_for_rebuild.
  if @seq_b > 1
    tb_alloc = @seq_t * @seq_b
    @t_seq_attn_mask = TinyNNCuda.tnn_input_2d_f32_persistent(@sess, tb_alloc, tb_alloc)
  end

  TinyNNCuda.tnn_finalize_weights(@sess)

  # Upload llama3-style RoPE freq_factors once the backend buffer
  # exists. Per-model constant; never re-uploaded.
  if @seq_rope_scaling.kind == :llama3
    ff = Toy::RopeScaling.compute_llama3_freq_factors(
      @seq_d_head, @seq_rope_base,
      @seq_rope_scaling.orig_max_pos, @seq_rope_scaling.factor,
      @seq_rope_scaling.low_freq_factor, @seq_rope_scaling.high_freq_factor)
    TinyNNCuda.tnn_upload_from_float_array(@sess, @t_seq_rope_freq_factors, ff, ff.length)
  end

  # Fill the mask allocated above (same B>1 condition).
  if @seq_b > 1
    upload_block_causal_mask!
  end
end

#forward(ids, positions) ⇒ Object

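Run one forward pass. `ids` and `positions` are length-T Int arrays (T*B entries when @seq_b > 1, matching the input tensors allocated in build_forward_in_current_ctx). Returns the t_seq_logits handle; caller downloads via download_row_major against (vocab, T) shape.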



1196
1197
1198
1199
1200
1201
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 1196

# Uploads the token ids and positions into the graph inputs, runs
# tnn_compute on the session, and returns the logits handle.
#
# @param ids [Array<Integer>] values uploaded into @t_seq_token_ids
# @param positions [Array<Integer>] values uploaded into @t_seq_positions
# @return the @t_seq_logits handle (not the downloaded values)
def forward(ids, positions)
  TinyNNCuda.upload_int_array(@sess, @t_seq_token_ids, ids)
  TinyNNCuda.upload_int_array(@sess, @t_seq_positions, positions)
  TinyNNCuda.tnn_compute(@sess)
  @t_seq_logits
end

#ft_add_1d(blk, weight) ⇒ Object



819
820
821
822
823
824
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 819

# 1-D counterpart of ft_add_2d: appends +weight+ to blk.ft_weights and
# allocates two persistent 1-D tensors for blk.ft_m / blk.ft_v, sized by
# the weight's element count.
#
# @param blk block object exposing ft_weights, ft_m and ft_v arrays
# @param weight tensor handle to register as trainable
# @return the result of the final push onto blk.ft_v
def ft_add_1d(blk, weight)
  n = TinyNNCuda.tnn_tensor_nelements(weight)
  blk.ft_weights.push(weight)
  blk.ft_m.push(TinyNNCuda.tnn_input_1d_f32_persistent(@sess, n))
  blk.ft_v.push(TinyNNCuda.tnn_input_1d_f32_persistent(@sess, n))
end

#ft_add_2d(blk, weight, rows, cols) ⇒ Object

Append (weight, m, v) to the block’s parallel arrays. Allocates Adam m and v of the same shape as `weight` as a side effect.



813
814
815
816
817
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 813

# Appends +weight+ to blk.ft_weights and allocates two persistent
# (rows, cols) tensors for blk.ft_m / blk.ft_v, keeping the three arrays
# index-parallel.
#
# @param blk block object exposing ft_weights, ft_m and ft_v arrays
# @param weight tensor handle to register as trainable
# @param rows [Integer] first dimension passed to the allocator
# @param cols [Integer] second dimension passed to the allocator
# @return the result of the final push onto blk.ft_v
def ft_add_2d(blk, weight, rows, cols)
  blk.ft_weights.push(weight)
  blk.ft_m.push(TinyNNCuda.tnn_input_2d_f32_persistent(@sess, rows, cols))
  blk.ft_v.push(TinyNNCuda.tnn_input_2d_f32_persistent(@sess, rows, cols))
end

#ft_add_global_1d(weight) ⇒ Object



880
881
882
883
884
885
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 880

# Engine-level counterpart of ft_add_1d: appends +weight+ to
# @ft_globals_weights and allocates two persistent 1-D tensors for
# @ft_globals_m / @ft_globals_v, sized by the weight's element count.
#
# @param weight tensor handle to register as a trainable global
# @return the result of the final push onto @ft_globals_v
def ft_add_global_1d(weight)
  n = TinyNNCuda.tnn_tensor_nelements(weight)
  @ft_globals_weights.push(weight)
  @ft_globals_m.push(TinyNNCuda.tnn_input_1d_f32_persistent(@sess, n))
  @ft_globals_v.push(TinyNNCuda.tnn_input_1d_f32_persistent(@sess, n))
end

#ft_add_global_2d(weight, rows, cols) ⇒ Object

Same shape as ft_add_2d / ft_add_1d but writes to the cache-level globals arrays (token_embed, final-norm, untied output).



874
875
876
877
878
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 874

# Engine-level counterpart of ft_add_2d: appends +weight+ to
# @ft_globals_weights and allocates two persistent (rows, cols) tensors
# for @ft_globals_m / @ft_globals_v.
#
# @param weight tensor handle to register as a trainable global
# @param rows [Integer] first dimension passed to the allocator
# @param cols [Integer] second dimension passed to the allocator
# @return the result of the final push onto @ft_globals_v
def ft_add_global_2d(weight, rows, cols)
  @ft_globals_weights.push(weight)
  @ft_globals_m.push(TinyNNCuda.tnn_input_2d_f32_persistent(@sess, rows, cols))
  @ft_globals_v.push(TinyNNCuda.tnn_input_2d_f32_persistent(@sess, rows, cols))
end

#ft_load_from_gguf(gguf, qkv_bias) ⇒ Object

Pull bytes from the GGUF into each writable weight. Uses the existing C-side dequantize-and-copy primitives so a Q8 source transparently becomes F32 in the target tensor.



912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 912

# Copies every per-block weight from the GGUF into the block's tensors,
# one layer at a time. GGUF entries are looked up by name under the
# "blk.<layer>" prefix; Q/K/V weights (and, optionally, biases) are
# copied one head slice at a time.
#
# @param gguf GGUF handle passed through to the TinyNNCuda copy calls
# @param qkv_bias [Boolean] when truthy, also loads attn_q/k/v biases
# @return [nil] (the value of the finished while loop)
def ft_load_from_gguf(gguf, qkv_bias)
  li = 0
  while li < @seq_n_layers
    blk = self.seq_blocks_ffi[li]
    prefix = "blk." + li.to_s

    # The two per-block norm gammas (1-D).
    rn1_idx = TinyNNCuda.tnn_gguf_find_index(gguf, prefix + ".attn_norm.weight")
    rn2_idx = TinyNNCuda.tnn_gguf_find_index(gguf, prefix + ".ffn_norm.weight")
    TinyNNCuda.tnn_gguf_copy_1d_to_persistent(gguf, rn1_idx, @sess, blk.t_seq_rn1_gamma)
    TinyNNCuda.tnn_gguf_copy_1d_to_persistent(gguf, rn2_idx, @sess, blk.t_seq_rn2_gamma)

    # Q weights: one slice per query head.
    q_idx = TinyNNCuda.tnn_gguf_find_index(gguf, prefix + ".attn_q.weight")
    hq = 0
    while hq < @seq_n_heads
      TinyNNCuda.tnn_gguf_copy_head_slice_to_persistent_native(gguf, q_idx, @sess,
        blk.t_seq_w_q[hq], hq, @seq_n_heads, @seq_d_model, @seq_d_head)
      hq = hq + 1
    end

    # K and V weights: one slice per KV head.
    k_idx = TinyNNCuda.tnn_gguf_find_index(gguf, prefix + ".attn_k.weight")
    v_idx = TinyNNCuda.tnn_gguf_find_index(gguf, prefix + ".attn_v.weight")
    hkv = 0
    while hkv < @seq_n_kv
      TinyNNCuda.tnn_gguf_copy_head_slice_to_persistent_native(gguf, k_idx, @sess,
        blk.t_seq_w_k[hkv], hkv, @seq_n_kv, @seq_d_model, @seq_d_head)
      TinyNNCuda.tnn_gguf_copy_head_slice_to_persistent_native(gguf, v_idx, @sess,
        blk.t_seq_w_v[hkv], hkv, @seq_n_kv, @seq_d_model, @seq_d_head)
      hkv = hkv + 1
    end

    if qkv_bias
      # qbias / kbias / vbias are 1-D head-sliced. We don't have a
      # dedicated head-slice loader for them; fall through and use
      # tnn_gguf_copy_head_bias_slice_to_persistent.
      qb_idx = TinyNNCuda.tnn_gguf_find_index(gguf, prefix + ".attn_q.bias")
      kb_idx = TinyNNCuda.tnn_gguf_find_index(gguf, prefix + ".attn_k.bias")
      vb_idx = TinyNNCuda.tnn_gguf_find_index(gguf, prefix + ".attn_v.bias")
      hbq = 0
      while hbq < @seq_n_heads
        TinyNNCuda.tnn_gguf_copy_head_bias_slice_to_persistent(gguf, qb_idx, @sess,
          blk.t_seq_b_q[hbq], hbq, @seq_d_head)
        hbq = hbq + 1
      end
      hbkv = 0
      while hbkv < @seq_n_kv
        TinyNNCuda.tnn_gguf_copy_head_bias_slice_to_persistent(gguf, kb_idx, @sess,
          blk.t_seq_b_k[hbkv], hbkv, @seq_d_head)
        TinyNNCuda.tnn_gguf_copy_head_bias_slice_to_persistent(gguf, vb_idx, @sess,
          blk.t_seq_b_v[hbkv], hbkv, @seq_d_head)
        hbkv = hbkv + 1
      end
    end

    # Whole-tensor copies: attention output projection and the FFN
    # gate / up / down matrices.
    o_idx    = TinyNNCuda.tnn_gguf_find_index(gguf, prefix + ".attn_output.weight")
    gate_idx = TinyNNCuda.tnn_gguf_find_index(gguf, prefix + ".ffn_gate.weight")
    up_idx   = TinyNNCuda.tnn_gguf_find_index(gguf, prefix + ".ffn_up.weight")
    down_idx = TinyNNCuda.tnn_gguf_find_index(gguf, prefix + ".ffn_down.weight")
    TinyNNCuda.tnn_gguf_copy_to_persistent(gguf, o_idx,    @sess, blk.t_seq_w_o)
    TinyNNCuda.tnn_gguf_copy_to_persistent(gguf, gate_idx, @sess, blk.t_seq_w_gate)
    TinyNNCuda.tnn_gguf_copy_to_persistent(gguf, up_idx,   @sess, blk.t_seq_w_up)
    TinyNNCuda.tnn_gguf_copy_to_persistent(gguf, down_idx, @sess, blk.t_seq_w_down)

    li = li + 1
  end
end

#ft_load_globals(gguf, untied) ⇒ Object

Load token_embed + final-norm + (untied) output from the GGUF into their now-allocated backend buffers.



889
890
891
892
893
894
895
896
897
898
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 889

# Copies the global tensors from the GGUF: "token_embd.weight" into the
# token embedding, "output_norm.weight" into the final-norm gamma, and —
# only when +untied+ is truthy — "output.weight" into the output tensor.
#
# @param gguf GGUF handle passed through to the TinyNNCuda copy calls
# @param untied [Boolean] whether a separate output matrix is loaded
def ft_load_globals(gguf, untied)
  eidx = TinyNNCuda.tnn_gguf_find_index(gguf, "token_embd.weight")
  TinyNNCuda.tnn_gguf_copy_to_persistent(gguf, eidx, @sess, self.t_seq_token_embed)
  fnidx = TinyNNCuda.tnn_gguf_find_index(gguf, "output_norm.weight")
  TinyNNCuda.tnn_gguf_copy_1d_to_persistent(gguf, fnidx, @sess, self.t_seq_final_norm_gamma)
  if untied
    oidx = TinyNNCuda.tnn_gguf_find_index(gguf, "output.weight")
    TinyNNCuda.tnn_gguf_copy_to_persistent(gguf, oidx, @sess, self.t_seq_output)
  end
end

#ft_name_last(blk, name) ⇒ Object

Name the most-recently-pushed (weight, m, v) triple in a block. Used right after ft_add_2d / ft_add_1d so drift/grad event consumers see llama.cpp-convention names like “blk.0.attn_norm.weight” instead of ggml’s auto-generated “node_N”. toy#semantic-tensor-names.



830
831
832
833
834
835
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 830

# Names the last (weight, m, v) triple in the block's parallel arrays:
# the weight gets +name+, the moments get +name+ + ".m" / ".v".
#
# NOTE(review): with empty arrays +last+ is -1; callers are expected to
# have pushed a triple first — confirm no call site can reach this empty.
#
# @param blk block object exposing ft_weights, ft_m and ft_v arrays
# @param name [String] base tensor name
def ft_name_last(blk, name)
  last = blk.ft_weights.length - 1
  TinyNNCuda.tnn_tensor_set_name(blk.ft_weights[last], name)
  TinyNNCuda.tnn_tensor_set_name(blk.ft_m[last],       name + ".m")
  TinyNNCuda.tnn_tensor_set_name(blk.ft_v[last],       name + ".v")
end

#ft_name_last_global(name) ⇒ Object



837
838
839
840
841
842
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 837

# Engine-level counterpart of ft_name_last: names the last entry of
# @ft_globals_weights with +name+ and the matching moments with
# +name+ + ".m" / ".v".
#
# NOTE(review): with empty arrays +last+ is -1; callers are expected to
# have pushed a triple first — confirm no call site can reach this empty.
#
# @param name [String] base tensor name
def ft_name_last_global(name)
  last = @ft_globals_weights.length - 1
  TinyNNCuda.tnn_tensor_set_name(@ft_globals_weights[last], name)
  TinyNNCuda.tnn_tensor_set_name(@ft_globals_m[last],       name + ".m")
  TinyNNCuda.tnn_tensor_set_name(@ft_globals_v[last],       name + ".v")
end

#ft_zero_init_adam(qkv_bias) ⇒ Object

Zero-init the Adam moments. m and v both start at 0 per the AdamW step-0 contract. Uses the backend-side memset primitive (tnn_zero_tensor) so a 1 GB Adam state doesn’t materialize a Mat-of-zeros in Ruby first.



982
983
984
985
986
987
988
989
990
991
992
993
994
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 982

# Zeroes every Adam m / v tensor registered on every block, via
# tnn_zero_tensor.
#
# @param qkv_bias not read by this method; every registered triple is
#   zeroed regardless
# @return [nil] (the value of the finished while loop)
def ft_zero_init_adam(qkv_bias)
  li = 0
  while li < @seq_n_layers
    blk = self.seq_blocks_ffi[li]
    i = 0
    # ft_m / ft_v are index-parallel to ft_weights, so its length bounds both.
    while i < blk.ft_weights.length
      TinyNNCuda.tnn_zero_tensor(@sess, blk.ft_m[i])
      TinyNNCuda.tnn_zero_tensor(@sess, blk.ft_v[i])
      i = i + 1
    end
    li = li + 1
  end
end

#ft_zero_init_adam_globals ⇒ Object



900
901
902
903
904
905
906
907
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 900

# Zeroes the Adam m / v tensors of every registered global weight, via
# tnn_zero_tensor. Does nothing when no globals are registered.
#
# @return [nil] (the value of the finished while loop)
def ft_zero_init_adam_globals
  gi = 0
  while gi < @ft_globals_weights.length
    TinyNNCuda.tnn_zero_tensor(@sess, @ft_globals_m[gi])
    TinyNNCuda.tnn_zero_tensor(@sess, @ft_globals_v[gi])
    gi = gi + 1
  end
end

#head_nbytes(ggml_type, d_head, d_model) ⇒ Object

GGUF type → bytes-per-row stride for per-head slicing. Mirrors the SmolLM2KVFFICache helper of the same name. F32=0, Q8_0=8.



1183
1184
1185
1186
1187
1188
1189
1190
1191
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 1183

# Byte size of one head's slice: d_head rows of d_model elements each.
#
# ggml_type - GGUF tensor type code; 0 (F32) and 8 (Q8_0) are handled.
# d_head    - number of rows in one head's slice.
# d_model   - number of elements per row.
#
# Returns the byte count as an Integer, or 0 for any other type code.
def head_nbytes(ggml_type, d_head, d_model)
  case ggml_type
  when 0
    # F32: four bytes per element.
    d_head * d_model * 4
  when 8
    # Q8_0: 34 bytes per 32-element group; d_model / 32 is integer division.
    d_head * (d_model / 32) * 34
  else
    0
  end
end

#lora_name_q!(t_a, t_b, head_prefix) ⇒ Object

P2.7 — LoRA-Q tensor naming callbacks for the extracted block-side mmap loader (TransformerBlock#load_from_gguf_mmap!). The :str tnn_tensor_set_name FFI calls MUST stay on the cache realize RUNTIME path — never migrate into block class-load scope (step_bind / :str landmine #16). The block assembles the runtime name string and hands it here, exactly as it hands ft_name_last its assembled name. Verbatim lift of the former realize_for_mmap loop lines 567-570 / 597-604.



860
861
862
863
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 860

# Assigns the runtime names of one head's LoRA-Q pair.
#
# t_a         - tensor named "<head_prefix>.lora_a.weight".
# t_b         - tensor named "<head_prefix>.lora_b.weight".
# head_prefix - name prefix assembled by the caller.
#
# Returns whatever the final tnn_tensor_set_name call returns.
def lora_name_q!(t_a, t_b, head_prefix)
  name_a = head_prefix + ".lora_a.weight"
  name_b = head_prefix + ".lora_b.weight"
  TinyNNCuda.tnn_tensor_set_name(t_a, name_a)
  TinyNNCuda.tnn_tensor_set_name(t_b, name_b)
end

#lora_name_q_adam!(t_a_m, t_a_v, t_b_m, t_b_v, head_prefix) ⇒ Object



865
866
867
868
869
870
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 865

# Assigns the runtime names of the four moment tensors that accompany one
# head's LoRA-Q pair: "<head_prefix>.lora_a.m" / ".lora_a.v" for the A
# tensor and "<head_prefix>.lora_b.m" / ".lora_b.v" for the B tensor.
#
# Returns whatever the final tnn_tensor_set_name call returns.
def lora_name_q_adam!(t_a_m, t_a_v, t_b_m, t_b_v, head_prefix)
  a_stem = head_prefix + ".lora_a"
  b_stem = head_prefix + ".lora_b"
  TinyNNCuda.tnn_tensor_set_name(t_a_m, a_stem + ".m")
  TinyNNCuda.tnn_tensor_set_name(t_a_v, a_stem + ".v")
  TinyNNCuda.tnn_tensor_set_name(t_b_m, b_stem + ".m")
  TinyNNCuda.tnn_tensor_set_name(t_b_v, b_stem + ".v")
end

#name_global!(t, name) ⇒ Object

Name a single FROZEN global (e.g. the projection-lens donor embed, which is NOT pushed to @ft_globals so ft_name_last_global cannot reach it). Kept on the engine so this tnn_tensor_set_name(:str) FFI stays on the cache realize runtime path — same discipline as ft_name_last / lora_name_q!. Back-called by LlamaArch#alloc_globals_trainable_f32!.



849
850
851
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 849

# Assigns `name` to the single tensor `t` by forwarding straight to
# TinyNNCuda.tnn_tensor_set_name. Unlike ft_name_last_global, it does not
# look anything up in the @ft_globals_* arrays.
#
# t    - tensor handle to name.
# name - name string to assign.
def name_global!(t, name)
  TinyNNCuda.tnn_tensor_set_name(t, name)
end

#realize_for_full_finetune(gguf_handle, cfg, t_seq, untied, qkv_bias) ⇒ Object

F3 — full fine-tune realize path. Parallel to realize_for_mmap but every per-block weight is allocated writable F32 in ctx_w (no mmap), set_param-marked, paired with Adam m/v, and loaded from the GGUF post-finalize via the dequantize-friendly tnn_gguf_copy_* primitives. The embedding tensor + final_norm gamma stay mmap’d (read-only) — the MVP doesn’t train them.



472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 472

# Full fine-tune realize path: creates a session, allocates the globals and
# the per-block weights, finalizes, loads the writable weights from the GGUF,
# zeroes the Adam moments, then builds and realizes the graph.
#
# gguf_handle - handle passed to the TinyNNCuda.tnn_gguf_* calls; also stored
#               in @seq_gguf_handle_keepalive.
# cfg         - config handed to apply_seq_cfg!; cfg.head_dim is read for the
#               llama3 rope freq_factors allocation.
# t_seq       - sequence length, stored in @seq_t.
# untied      - when true, a separate "output.weight" tensor is allocated.
# qkv_bias    - stored in @seq_has_qkv_bias and forwarded to the per-block
#               alloc and to the post-finalize load / zero-init helpers.
#
# The step order matters: every allocation and set_param call happens before
# finalize_weights_and_upload_constants!, every load / zero-init after it.
def realize_for_full_finetune(gguf_handle, cfg, t_seq, untied, qkv_bias)
  @seq_t          = t_seq
  apply_seq_cfg!(cfg)

  @seq_gguf_handle_keepalive = gguf_handle
  @sess                  = TinyNNCuda.tnn_session_new(1)

  # llama3 / LongRoPE: allocate the freq_factors tensor in ctx_w
  # before finalize_weights. Values uploaded post-finalize.
  if @seq_rope_scaling.kind == :llama3
    @t_seq_rope_freq_factors = TinyNNCuda.tnn_rope_freq_factors_alloc(@sess, cfg.head_dim)
  else
    @t_seq_rope_freq_factors = TinyNNCuda.tnn_null_ptr
  end
  @seq_has_untied_output = untied
  @seq_has_qkv_bias      = qkv_bias

  # Token embed + final-norm gamma + (untied) output: trainable
  # only when opt-in (ft_train_embeddings_enabled). Otherwise
  # they stay mmap'd / read-only (still need a mmap attach for
  # this branch).
  if @ft_train_embeddings_enabled
    # Trainable branch: each global is allocated persistent F32, recorded
    # through ft_add_global_*, then named through ft_name_last_global.
    self.t_seq_token_embed = TinyNNCuda.tnn_input_2d_f32_persistent(@sess,
                           @seq_vocab_size, @seq_d_model)
    ft_add_global_2d(self.t_seq_token_embed, @seq_vocab_size, @seq_d_model)
    ft_name_last_global("token_embd.weight")

    self.t_seq_final_norm_gamma = TinyNNCuda.tnn_input_1d_f32_persistent(@sess, @seq_d_model)
    ft_add_global_1d(self.t_seq_final_norm_gamma)
    ft_name_last_global("output_norm.weight")

    if untied
      self.t_seq_output = TinyNNCuda.tnn_input_2d_f32_persistent(@sess,
                        @seq_vocab_size, @seq_d_model)
      ft_add_global_2d(self.t_seq_output, @seq_vocab_size, @seq_d_model)
      ft_name_last_global("output.weight")
    end
  else
    # Read-only branch: attach the GGUF mapping to the session and allocate
    # each global at its file offset inside that mapping.
    map_base = TinyNNCuda.tnn_gguf_mmap_base(gguf_handle)
    map_size = TinyNNCuda.tnn_gguf_mmap_size(gguf_handle)
    TinyNNCuda.tnn_session_attach_weight_mmap(@sess, map_base, map_size)

    eidx = TinyNNCuda.tnn_gguf_find_index(gguf_handle, "token_embd.weight")
    eoff = TinyNNCuda.tnn_gguf_tensor_file_offset(gguf_handle, eidx)
    etyp = TinyNNCuda.tnn_gguf_tensor_type(gguf_handle, eidx)
    self.t_seq_token_embed = TinyNNCuda.tnn_input_2d_persistent_mmap(@sess,
                           @seq_vocab_size, @seq_d_model, etyp, eoff)

    # The norm gamma passes a literal type code 0 instead of reading the
    # tensor type from the GGUF (0 is the F32 code per head_nbytes).
    fnidx = TinyNNCuda.tnn_gguf_find_index(gguf_handle, "output_norm.weight")
    fnoff = TinyNNCuda.tnn_gguf_tensor_file_offset(gguf_handle, fnidx)
    self.t_seq_final_norm_gamma = TinyNNCuda.tnn_input_1d_persistent_mmap(@sess,
                                @seq_d_model, 0, fnoff)

    if untied
      oidx = TinyNNCuda.tnn_gguf_find_index(gguf_handle, "output.weight")
      ooff = TinyNNCuda.tnn_gguf_tensor_file_offset(gguf_handle, oidx)
      otyp = TinyNNCuda.tnn_gguf_tensor_type(gguf_handle, oidx)
      self.t_seq_output = TinyNNCuda.tnn_input_2d_persistent_mmap(@sess,
                        @seq_vocab_size, @seq_d_model, otyp, ooff)
    end
  end

  # P2.6 Step 2 — seeding loop moved onto the arch (LlamaArch#seed_blocks!).
  @seq_arch.seed_blocks!(@seq_n_layers)

  # P2-finish — per-block FT alloc lifted onto the block (verbatim:
  # TransformerBlock#alloc_full_finetune_f32_weights!), mirroring how
  # realize_for_random_init drives alloc_trainable_f32_weights!. The block
  # owns its self.t_seq_* handles + the per-block set_param loop; the cache
  # passes @sess + dims + qkv_bias and the ft_add_*/ft_name_last recorders
  # are back-called. Gated byte-exact by prep/full_finetune_gate.rb.
  li = 0
  while li < @seq_n_layers
    blk = self.seq_blocks_ffi[li]
    prefix = "blk." + li.to_s + "."
    blk.alloc_full_finetune_f32_weights!(@sess, self, prefix,
                                         @seq_d_model, @seq_d_ff, @seq_d_head,
                                         @seq_n_heads, @seq_n_kv, qkv_bias)
    li = li + 1
  end

  # Globals are trainable too only when embeddings are opt-in.
  if @ft_train_embeddings_enabled
    gi = 0
    while gi < @ft_globals_weights.length
      TinyNNCuda.tnn_set_param(@ft_globals_weights[gi])
      gi = gi + 1
    end
  end

  finalize_weights_and_upload_constants!

  # Post-finalize: load every writable weight from the GGUF.
  if @ft_train_embeddings_enabled
    ft_load_globals(gguf_handle, untied)
  end
  ft_load_from_gguf(gguf_handle, qkv_bias)
  ft_zero_init_adam(qkv_bias)
  if @ft_train_embeddings_enabled
    ft_zero_init_adam_globals
  end

  build_and_realize!
end

#realize_for_mmap(gguf_handle, cfg, t_seq, untied, qkv_bias) ⇒ Object

Allocate persistent weights mmap’d from `gguf_handle` (caller is responsible for keeping the handle alive — we keepalive it via @seq_gguf_handle_keepalive), compute inputs, and the full forward graph for T = `t_seq` positions. Fixed T; rebuild for a different T.

Weight layout matches SmolLM2KVFFICache#realize_for_mmap exactly (same byte offsets + per-head split), so a sharded GGUF can be loaded by either class.



371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 371

# Realize path that maps the weights straight out of the GGUF mapping:
# creates a session, attaches the mapping, lets the arch and the blocks
# allocate their tensors from it, finalizes, then builds and realizes.
#
# gguf_handle - handle passed to the TinyNNCuda.tnn_gguf_* calls; also stored
#               in @seq_gguf_handle_keepalive.
# cfg         - config handed to apply_seq_cfg!; cfg.head_dim is read for the
#               llama3 rope freq_factors allocation.
# t_seq       - sequence length, stored in @seq_t.
# untied      - forwarded to LlamaArch#load_globals_from_gguf_mmap!.
# qkv_bias    - stored in @seq_has_qkv_bias and forwarded to the per-block load.
#
# The LoRA-Q steps run only when @seq_lora_q_enabled /
# @seq_lora_q_adamw_enabled are set.
def realize_for_mmap(gguf_handle, cfg, t_seq, untied, qkv_bias)
  @seq_t          = t_seq
  apply_seq_cfg!(cfg)

  @seq_gguf_handle_keepalive = gguf_handle
  @sess                  = TinyNNCuda.tnn_session_new(1)

  # llama3 / LongRoPE: allocate the freq_factors tensor in ctx_w
  # before finalize_weights. Values uploaded post-finalize.
  if @seq_rope_scaling.kind == :llama3
    @t_seq_rope_freq_factors = TinyNNCuda.tnn_rope_freq_factors_alloc(@sess, cfg.head_dim)
  else
    @t_seq_rope_freq_factors = TinyNNCuda.tnn_null_ptr
  end
  @seq_has_untied_output = untied
  @seq_has_qkv_bias      = qkv_bias

  # Attach the GGUF mapping (base address + size) to the session.
  map_base = TinyNNCuda.tnn_gguf_mmap_base(gguf_handle)
  map_size = TinyNNCuda.tnn_gguf_mmap_size(gguf_handle)
  TinyNNCuda.tnn_session_attach_weight_mmap(@sess, map_base, map_size)

  # Embeddings + final norm + optional untied LM head.
  # P2.6 pass-2 Step 1 — the three arch-owned global mmap allocs moved
  # onto LlamaArch#load_globals_from_gguf_mmap! (verbatim; called ONLY
  # from here). Mirrors the seed_blocks! / alloc_trainable_f32_weights!
  # extraction precedents.
  @seq_arch.load_globals_from_gguf_mmap!(@sess, gguf_handle,
                                         @seq_vocab_size, @seq_d_model, untied)

  # P2.6 Step 2 — seeding loop moved onto the arch (LlamaArch#seed_blocks!).
  @seq_arch.seed_blocks!(@seq_n_layers)

  # P2.7 — the per-block alloc-from-mmap-offsets loop body moved onto
  # TransformerBlock#load_from_gguf_mmap! (verbatim; called ONLY from
  # here). Mirrors the alloc_trainable_f32_weights! / seed_blocks! /
  # load_globals_from_gguf_mmap! extraction precedents. head_nbytes and
  # the LoRA :str tnn_tensor_set_name naming stay on THIS cache and are
  # back-called through the passed `self` ref (lora_name_q! /
  # lora_name_q_adam! issue the :str FFI at this runtime scope, never in
  # block class-load scope — landmine #16).
  li = 0
  while li < @seq_n_layers
    blk = self.seq_blocks_ffi[li]
    blk.load_from_gguf_mmap!(@sess, self, gguf_handle, li,
                             @seq_n_heads, @seq_n_kv, @seq_d_head, @seq_d_model,
                             @seq_d_ff, @seq_lora_q_enabled, @seq_lora_q_rank,
                             @seq_lora_q_adamw_enabled, qkv_bias)
    li = li + 1
  end

  # Mark LoRA tensors as trainable BEFORE finalize. set_param flips
  # the PARAM flag so build_backward walks them when emitting grad nodes.
  if @seq_lora_q_enabled
    li2 = 0
    while li2 < @seq_n_layers
      blk2 = self.seq_blocks_ffi[li2]
      hq_p = 0
      while hq_p < @seq_n_heads
        TinyNNCuda.tnn_set_param(blk2.t_seq_w_lora_a_q[hq_p])
        TinyNNCuda.tnn_set_param(blk2.t_seq_w_lora_b_q[hq_p])
        hq_p = hq_p + 1
      end
      li2 = li2 + 1
    end
  end

  finalize_weights_and_upload_constants!

  # Zero-init persistent AdamW moments. Same contract as F1.2 step 6b
  # on SmolLM2KVFFICache — m and v start at 0 per the AdamW update rule.
  if @seq_lora_q_adamw_enabled
    # za / zb are explicitly zero-filled host matrices, built once and
    # uploaded into every head's A-moment / B-moment tensors.
    # NOTE(review): ft_zero_init_adam does the equivalent job with the
    # backend-side tnn_zero_tensor; confirm whether it could be used here.
    za = Mat.new(@seq_lora_q_rank, @seq_d_model)
    zb = Mat.new(@seq_d_head,      @seq_lora_q_rank)
    i = 0
    while i < @seq_lora_q_rank * @seq_d_model; za.flat[i] = 0.0; i = i + 1; end
    j = 0
    while j < @seq_d_head * @seq_lora_q_rank; zb.flat[j] = 0.0; j = j + 1; end
    li_z = 0
    while li_z < @seq_n_layers
      blk_z = self.seq_blocks_ffi[li_z]
      hqz = 0
      while hqz < @seq_n_heads
        TinyNNCuda.upload_row_major(@sess, blk_z.t_seq_w_lora_a_q_m[hqz], za)
        TinyNNCuda.upload_row_major(@sess, blk_z.t_seq_w_lora_a_q_v[hqz], za)
        TinyNNCuda.upload_row_major(@sess, blk_z.t_seq_w_lora_b_q_m[hqz], zb)
        TinyNNCuda.upload_row_major(@sess, blk_z.t_seq_w_lora_b_q_v[hqz], zb)
        hqz = hqz + 1
      end
      li_z = li_z + 1
    end
  end

  build_and_realize!
end

#realize_for_q8_copy(gguf_handle, cfg, t_seq, untied, qkv_bias) ⇒ Object

F4 alternative realize for CUDA + Q8 base. Allocates every weight tensor in the standard ggml ctx_w (NOT the BYO mmap region), then verbatim-copies the GGUF bytes in. Buys correctness on CUDA at the cost of holding the weights twice transiently (mmap + ctx_w during load; ctx_w only after). Required because the BYO-pointer cuda buffer’s quantized padding zeroing (cudaMemset past tensor data) would otherwise crash on Q8 tensors with `ne0 % 512 != 0`.

Use this realize when (a) the GGUF is Q8 AND (b) the backend is CUDA. CPU + Q8 stays on realize_for_mmap (no padding issue).



243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 243

# Realize path that allocates typed persistent tensors and copies the GGUF
# bytes into them after finalize (no mmap attach in this method).
#
# gguf_handle - handle passed to the TinyNNCuda.tnn_gguf_* calls; also stored
#               in @seq_gguf_handle_keepalive.
# cfg         - config handed to apply_seq_cfg!; cfg.head_dim is read for the
#               llama3 rope freq_factors allocation.
# t_seq       - sequence length, stored in @seq_t.
# untied      - when true, a separate "output.weight" tensor is allocated
#               and copied.
# qkv_bias    - stored in @seq_has_qkv_bias and forwarded to the per-block
#               alloc and copy passes.
#
# Two passes over the blocks: alloc_q8_typed_from_gguf! before finalize,
# copy_q8_bytes_from_gguf! after it.
def realize_for_q8_copy(gguf_handle, cfg, t_seq, untied, qkv_bias)
  @seq_t          = t_seq
  apply_seq_cfg!(cfg)

  @seq_gguf_handle_keepalive = gguf_handle
  @sess                  = TinyNNCuda.tnn_session_new(1)

  # llama3 / LongRoPE: allocate the freq_factors tensor in ctx_w
  # before finalize_weights. Values uploaded post-finalize.
  if @seq_rope_scaling.kind == :llama3
    @t_seq_rope_freq_factors = TinyNNCuda.tnn_rope_freq_factors_alloc(@sess, cfg.head_dim)
  else
    @t_seq_rope_freq_factors = TinyNNCuda.tnn_null_ptr
  end
  @seq_has_untied_output = untied
  @seq_has_qkv_bias      = qkv_bias

  # Read source tensor types so we can allocate ctx_w tensors of the
  # MATCHING type (verbatim copy requires source/target types match).
  # The final-norm gamma is the exception: it is always allocated F32 and
  # later filled through tnn_gguf_copy_1d_to_persistent.
  eidx = TinyNNCuda.tnn_gguf_find_index(gguf_handle, "token_embd.weight")
  etyp = TinyNNCuda.tnn_gguf_tensor_type(gguf_handle, eidx)
  self.t_seq_token_embed = TinyNNCuda.tnn_input_2d_persistent_typed(@sess,
                         @seq_vocab_size, @seq_d_model, etyp)
  self.t_seq_final_norm_gamma = TinyNNCuda.tnn_input_1d_f32_persistent(@sess, @seq_d_model)
  if untied
    oidx = TinyNNCuda.tnn_gguf_find_index(gguf_handle, "output.weight")
    otyp = TinyNNCuda.tnn_gguf_tensor_type(gguf_handle, oidx)
    self.t_seq_output = TinyNNCuda.tnn_input_2d_persistent_typed(@sess,
                      @seq_vocab_size, @seq_d_model, otyp)
  end

  # P2.6 Step 2 — seeding loop moved onto the arch (LlamaArch#seed_blocks!).
  @seq_arch.seed_blocks!(@seq_n_layers)

  # P2.7 pass-3 — the per-block ALLOC-typed loop body moved onto
  # TransformerBlock#alloc_q8_typed_from_gguf! (verbatim; called ONLY
  # from here). Mirrors load_from_gguf_mmap!'s arg-passing exactly: every
  # dim/flag arrives as an arg, NO ivar reads off the block. The q8 path
  # never names LoRA tensors, so the moved body is :str-free (#16-clean)
  # — no lora_name_q! back-calls (unlike load_from_gguf_mmap!).
  li = 0
  while li < @seq_n_layers
    blk = self.seq_blocks_ffi[li]
    blk.alloc_q8_typed_from_gguf!(@sess, gguf_handle, li,
                                  @seq_n_heads, @seq_n_kv, @seq_d_head, @seq_d_model,
                                  @seq_d_ff, @seq_vocab_size, @seq_lora_q_enabled,
                                  @seq_lora_q_rank, @seq_lora_q_adamw_enabled, qkv_bias)
    li = li + 1
  end

  # Flag every head's LoRA-Q A/B tensors as params before finalize.
  if @seq_lora_q_enabled
    li2 = 0
    while li2 < @seq_n_layers
      blk2 = self.seq_blocks_ffi[li2]
      hq_p = 0
      while hq_p < @seq_n_heads
        TinyNNCuda.tnn_set_param(blk2.t_seq_w_lora_a_q[hq_p])
        TinyNNCuda.tnn_set_param(blk2.t_seq_w_lora_b_q[hq_p])
        hq_p = hq_p + 1
      end
      li2 = li2 + 1
    end
  end

  finalize_weights_and_upload_constants!

  # Load all weight bytes from the GGUF into the now-allocated
  # backend buffers. Verbatim copy keeps Q8 as Q8.
  TinyNNCuda.tnn_gguf_copy_verbatim_to_persistent(gguf_handle, eidx, @sess, self.t_seq_token_embed)
  fnidx = TinyNNCuda.tnn_gguf_find_index(gguf_handle, "output_norm.weight")
  TinyNNCuda.tnn_gguf_copy_1d_to_persistent(gguf_handle, fnidx, @sess, self.t_seq_final_norm_gamma)
  if untied
    # Looks "output.weight" up a second time instead of reusing the index
    # found before finalize.
    oidx2 = TinyNNCuda.tnn_gguf_find_index(gguf_handle, "output.weight")
    TinyNNCuda.tnn_gguf_copy_verbatim_to_persistent(gguf_handle, oidx2, @sess, self.t_seq_output)
  end

  # P2.7 pass-3 Step 2 — the per-block VERBATIM-COPY loop body moved onto
  # TransformerBlock#copy_q8_bytes_from_gguf! (verbatim; called ONLY from
  # here). The copy phase fills the backend buffers allocated by the
  # alloc_q8_typed_from_gguf! pass; the block reads its OWN t_seq_* handles
  # and writes nothing on itself. NO ivar reads off the cache — every dim
  # (n_heads, n_kv, d_head) and the qkv_bias flag arrive as ARGS. All the
  # moved primitives are tnn_gguf_copy_* / tnn_gguf_find_index — the same
  # :str-at-runtime pattern alloc_q8_typed_from_gguf! already uses, never
  # block class-load scope (#16). The GLOBALS verbatim-copy above (token
  # embed / final norm / untied output) STAYS on the cache — those touch
  # cache-level t_seq_* handles, not the block.
  li_l = 0
  while li_l < @seq_n_layers
    blk = self.seq_blocks_ffi[li_l]
    blk.copy_q8_bytes_from_gguf!(@sess, gguf_handle, li_l,
                                 @seq_n_heads, @seq_n_kv, @seq_d_head, qkv_bias)
    li_l = li_l + 1
  end

  # Zero the LoRA-Q moment tensors by uploading explicitly zero-filled host
  # matrices (za for the A moments, zb for the B moments) into every head.
  if @seq_lora_q_adamw_enabled
    za = Mat.new(@seq_lora_q_rank, @seq_d_model)
    zb = Mat.new(@seq_d_head,      @seq_lora_q_rank)
    i = 0
    while i < @seq_lora_q_rank * @seq_d_model; za.flat[i] = 0.0; i = i + 1; end
    j = 0
    while j < @seq_d_head * @seq_lora_q_rank; zb.flat[j] = 0.0; j = j + 1; end
    li_z = 0
    while li_z < @seq_n_layers
      blk_z = self.seq_blocks_ffi[li_z]
      hqz = 0
      while hqz < @seq_n_heads
        TinyNNCuda.upload_row_major(@sess, blk_z.t_seq_w_lora_a_q_m[hqz], za)
        TinyNNCuda.upload_row_major(@sess, blk_z.t_seq_w_lora_a_q_v[hqz], za)
        TinyNNCuda.upload_row_major(@sess, blk_z.t_seq_w_lora_b_q_m[hqz], zb)
        TinyNNCuda.upload_row_major(@sess, blk_z.t_seq_w_lora_b_q_v[hqz], zb)
        hqz = hqz + 1
      end
      li_z = li_z + 1
    end
  end

  build_and_realize!
end

#realize_for_random_init(cfg, t_seq, t_batch, weight_dtype, untied, qkv_bias, seed, init_scale) ⇒ Object

P2-α: from-scratch training entry. Allocates the same persistent tensor layout as realize_for_full_finetune (embeddings + per-block weights, all trainable F32 in ctx_w), then random-initialises every weight via Ruby-side Gaussian upload — no GGUF needed.

Force-enables `@ft_train_embeddings_enabled` so the existing full-FT machinery allocates persistent F32 embeddings instead of the mmap branch. Caller doesn’t need to call enable_full_finetune_embeddings! first.

Currently Llama-arch only (RMSNorm + GQA + RoPE + SwiGLU). Other architectures (GPT-2 LN, MHA + biases) need a separate trainer cache class; deferred until we actually need GPT-2 from-scratch.



590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 590

# From-scratch training realize path: creates a session, allocates trainable
# globals and per-block weights, finalizes, fills the weights with a seeded
# random init, zeroes the Adam moments, then builds and realizes the graph.
# No GGUF handle is involved.
#
# cfg          - config handed to apply_seq_cfg!; n_layers, n_heads, head_dim
#                and donor_d_in are also read directly.
# t_seq        - sequence length, stored in @seq_t.
# t_batch      - micro-batch size, stored in @seq_b.
# weight_dtype - compute dtype code, stored in @seq_weight_dtype.
# untied       - forwarded to the globals alloc and to upload_random_init!.
# qkv_bias     - stored in @seq_has_qkv_bias; forwarded to
#                upload_random_init! and ft_zero_init_adam.
# seed         - PRNG seed forwarded to upload_random_init!.
# init_scale   - init scale forwarded to upload_random_init!.
#
# Side effect: sets @ft_train_embeddings_enabled and
# @seq_full_finetune_enabled to true.
def realize_for_random_init(cfg, t_seq, t_batch, weight_dtype, untied, qkv_bias, seed, init_scale)
  @ft_train_embeddings_enabled = true   # forces persistent-F32 alloc of embeddings
  @seq_full_finetune_enabled   = true   # build_training_step gates on this

  @seq_t          = t_seq
  # GH#7 — micro-batching. B=1 keeps the codepath bit-identical to
  # the pre-GH#7 single-sequence training. B>1 lays out tokens as a
  # flat [T*B] vector with a block-causal attention mask uploaded
  # post-finalize and applied via soft_max_ext.
  @seq_b          = t_batch
  # GH#9 — mixed-precision compute. 0 = F32 (bit-identical to
  # pre-GH#9). 1 = F16, 30 = BF16. See mp_matmul + ivar comment in
  # initialize for the master-copy details.
  @seq_weight_dtype = weight_dtype
  apply_seq_cfg!(cfg)

  @sess                  = TinyNNCuda.tnn_session_new(1)
  # GH#17 — per-head decomposition makes node count scale as
  # O(n_layers × n_heads). The default 65536 cap overflows on
  # 24L × 16-head Qwen-shape at backward-expand. Empirically a
  # 24L × 16-head model needs ~450k nodes for forward + backward +
  # AdamW, so we budget ~1000 nodes per (layer × head) cell + floor.
  cap = cfg.n_layers * cfg.n_heads * 1000 + 65536
  TinyNNCuda.tnn_session_set_graph_capacity(@sess, cap)
  @seq_has_untied_output = untied
  @seq_has_qkv_bias      = qkv_bias
  @seq_donor_d_in        = cfg.donor_d_in   # E2.3 — 0 disables projection lens

  # Same rope freq_factors handling as the GGUF realize paths: allocate for
  # llama3 scaling, otherwise store a null pointer.
  if @seq_rope_scaling.kind == :llama3
    @t_seq_rope_freq_factors = TinyNNCuda.tnn_rope_freq_factors_alloc(@sess, cfg.head_dim)
  else
    @t_seq_rope_freq_factors = TinyNNCuda.tnn_null_ptr
  end

  # Globals — trainable persistent F32 (+ E2.3 projection-lens branch when
  # @seq_donor_d_in > 0). P2-finish: the alloc lifted onto the arch
  # (LlamaArch#alloc_globals_trainable_f32!), which already owns these handles
  # — verbatim, same order, byte-identical. @ft_globals_* recorders + the
  # frozen-embed namer are back-called through `self`.
  @seq_arch.alloc_globals_trainable_f32!(@sess, self, @seq_vocab_size,
                                         @seq_d_model, @seq_donor_d_in, untied)

  # Per-block weights — identical structure to realize_for_full_finetune.
  # P2.6 Step 2 — the block-array seeding loop now lives on the arch
  # (LlamaArch#seed_blocks!), which already owns @seq_blocks_ffi.
  @seq_arch.seed_blocks!(@seq_n_layers)

  # P2.6 Step 4 — the per-block F32 ALLOC loop body now lives on the
  # block (TransformerBlock#alloc_trainable_f32_weights!), which already
  # OWNS these self.t_seq_* handles at forward time. The block takes
  # @sess + the seq dims + the name prefix as ARGS (no ivar reads on the
  # block) and calls the cache's ft_add_1d / ft_add_2d / ft_name_last
  # recorders BACK through the passed `self` reference — those stay on
  # the cache (they read @sess and issue tnn_tensor_set_name :str at
  # runtime; never migrate into block class-load scope). w_o keeps its
  # random_init shape ne=[d_model, n_heads*d_head] inside the block
  # method (not unified with full_finetune's [d_model,d_model]).
  # Note that qkv_bias is NOT passed to this alloc call.
  li = 0
  while li < @seq_n_layers
    blk = self.seq_blocks_ffi[li]
    prefix = "blk." + li.to_s + "."
    blk.alloc_trainable_f32_weights!(@sess, self, prefix,
                                     @seq_d_model, @seq_d_ff, @seq_d_head,
                                     @seq_n_heads, @seq_n_kv)
    li = li + 1
  end

  # Mark every recorded global as a param. There is no explicit gate here:
  # @ft_train_embeddings_enabled was forced to true at the top of this method.
  gi = 0
  while gi < @ft_globals_weights.length
    TinyNNCuda.tnn_set_param(@ft_globals_weights[gi])
    gi = gi + 1
  end

  finalize_weights_and_upload_constants!

  # Random-init every weight + zero biases + ones gammas.
  upload_random_init!(seed, init_scale, qkv_bias, untied)
  ft_zero_init_adam(qkv_bias)
  ft_zero_init_adam_globals

  build_and_realize!
end

#seq_blocks_ffiObject



99
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 99

# Reader delegator: returns the blocks array held by @seq_arch.
def seq_blocks_ffi;           @seq_arch.seq_blocks_ffi;           end

#seq_blocks_ffi=(v) ⇒ Object



100
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 100

# Writer delegator: stores `v` as the blocks array on @seq_arch.
def seq_blocks_ffi=(v);       @seq_arch.seq_blocks_ffi = v;       end

#seq_donor_d_inObject

E2.3 — projection-lens donor width (0 disables the lens). Plain ivar (NOT in the attr_accessor list); the GGUF-fold writer reads it to know the donor->d_model contraction dimension.



98
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 98

# Plain reader for the @seq_donor_d_in ivar (held on this object, not on @seq_arch).
def seq_donor_d_in;           @seq_donor_d_in;                    end

#t_seq_final_norm_gammaObject



89
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 89

# Reader delegator: returns the final-norm gamma handle held by @seq_arch.
def t_seq_final_norm_gamma;     @seq_arch.t_seq_final_norm_gamma;     end

#t_seq_final_norm_gamma=(v) ⇒ Object



90
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 90

# Writer delegator: stores `v` as the final-norm gamma handle on @seq_arch.
def t_seq_final_norm_gamma=(v); @seq_arch.t_seq_final_norm_gamma = v; end

#t_seq_outputObject



91
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 91

# Reader delegator: returns the output tensor handle held by @seq_arch.
def t_seq_output;             @seq_arch.t_seq_output;             end

#t_seq_output=(v) ⇒ Object



92
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 92

# Writer delegator: stores `v` as the output tensor handle on @seq_arch.
def t_seq_output=(v);         @seq_arch.t_seq_output = v;         end

#t_seq_token_embedObject

P2.5 — delegators forwarding the arch-owned handle accessors to former public attr_accessor surface (the realize paths assign via self.t_seq_token_embed=, external PCA-init writes fcache.t_seq_w_proj=, examples read fcache.t_seq_*). Single source of truth: the arch.



87
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 87

# Reader delegator: returns the token-embedding handle held by @seq_arch.
def t_seq_token_embed;        @seq_arch.t_seq_token_embed;        end

#t_seq_token_embed=(v) ⇒ Object



88
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 88

# Writer delegator: stores `v` as the token-embedding handle on @seq_arch.
def t_seq_token_embed=(v);    @seq_arch.t_seq_token_embed = v;    end

#t_seq_w_projObject



93
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 93

# Reader delegator: returns the projection weight handle held by @seq_arch.
def t_seq_w_proj;             @seq_arch.t_seq_w_proj;             end

#t_seq_w_proj=(v) ⇒ Object



94
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 94

# Writer delegator: stores `v` as the projection weight handle on @seq_arch.
def t_seq_w_proj=(v);         @seq_arch.t_seq_w_proj = v;         end

#upload_block_causal_mask!Object

GH#7 — build + upload the block-causal attention mask for B>1. Layout: scores from matmul(K[d_head, T*B], Q[d_head, T*B]) have ne=[T*B, T*B] where ne0 indexes keys and ne1 indexes queries (ggml column-major: flat[ne0_idx + ne1_idx * T*B]). For query position i1 = b_q*T + p_q and key position i0 = b_k*T + p_k:

mask = 0.0  iff b_k == b_q AND p_k <= p_q   (intra-batch causal)
mask = NEG  otherwise                      (cross-batch + future)

NEG = -1.0e30 so exp(NEG) == 0.0 in f32 (avoids Float::INFINITY, which would also work but is one less Spinel codegen variable).



683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 683

# Builds the (T*B) x (T*B) block-causal mask and uploads it to
# @t_seq_attn_mask, where T = @seq_t and B = @seq_b.
#
# The flat buffer is written query-major: the outer loop walks queries, the
# inner loop walks keys. A key is visible (0.0) to a query when it lies in
# the same batch segment and is not later than the query; in flat indices
# that is the range from the start of the query's segment up to the query
# itself. Every other cell gets -1.0e30.
#
# Returns whatever tnn_upload_from_float_array returns.
def upload_block_causal_mask!
  total = @seq_t * @seq_b
  blocked = -1.0e30
  # Seeded with a Float and emptied again; presumably keeps the element type
  # inferable as Float for an empty array — confirm before simplifying.
  cells = [0.0]; cells.pop
  query = 0
  while query < total
    # First flat index of the batch segment that contains this query.
    segment_start = (query / @seq_t) * @seq_t
    key = 0
    while key < total
      if key < segment_start || key > query
        cells.push(blocked)
      else
        cells.push(0.0)
      end
      key = key + 1
    end
    query = query + 1
  end
  TinyNNCuda.tnn_upload_from_float_array(@sess, @t_seq_attn_mask, cells, cells.length)
end

#upload_constant(tensor, n, v) ⇒ Object



789
790
791
792
793
794
795
796
797
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 789

# Uploads `n` copies of the value `v` into `tensor`.
#
# tensor - destination tensor handle.
# n      - number of values to build and upload.
# v      - the value repeated in every slot.
#
# Returns whatever tnn_upload_from_float_array returns.
def upload_constant(tensor, n, v)
  # Seeded with a Float and emptied again; presumably keeps the element type
  # inferable as Float for an empty array — confirm before simplifying.
  values = [0.0]; values.pop
  remaining = n
  while remaining > 0
    values.push(v)
    remaining = remaining - 1
  end
  TinyNNCuda.tnn_upload_from_float_array(@sess, tensor, values, n)
end

#upload_gaussian(tensor, n, std, state) ⇒ Object

Box-Muller from a xorshift64-driven uniform stream. state is a one-element Array<Integer> so the mutable PRNG state survives across calls without using class variables. Always emits exactly `n` Gaussian-distributed F32 values via tnn_upload_from_float_array.



763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 763

# Fills `tensor` with `n` samples from N(0, std) generated by the Box-Muller
# transform on uniforms drawn from xorshift_uniform!(state).
#
# tensor - destination tensor handle.
# n      - number of samples to generate and upload.
# std    - standard deviation applied to every sample.
# state  - one-element Array carrying the PRNG state; advanced in place.
#
# Two uniforms are consumed per PAIR of samples, so an odd `n` still draws
# a full pair and discards the sine half of the last one.
#
# Returns whatever tnn_upload_from_float_array returns.
def upload_gaussian(tensor, n, std, state)
  # Seeded with a Float and emptied again; presumably keeps the element type
  # inferable as Float for an empty array — confirm before simplifying.
  samples = [0.0]; samples.pop
  two_pi = 2.0 * Math::PI
  while samples.length < n
    u_radius = xorshift_uniform!(state)
    u_angle  = xorshift_uniform!(state)
    # Clamp away from zero so Math.log stays finite.
    if u_radius < 1.0e-300; u_radius = 1.0e-300; end
    radius = Math.sqrt(-2.0 * Math.log(u_radius))
    angle  = two_pi * u_angle
    samples.push(radius * Math.cos(angle) * std)
    if samples.length < n
      samples.push(radius * Math.sin(angle) * std)
    end
  end
  TinyNNCuda.tnn_upload_from_float_array(@sess, tensor, samples, n)
end

#upload_lora_q_init!(seed, init_scale) ⇒ Object

Seed LoRA-A with a small Gaussian and LoRA-B with zero — the standard init makes the adapter a no-op at step 0 (forward output equals the base model). Mirror of SmolLM2KVFFICache#upload_lora_q_init!.



1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 1206

# Initialises every head's LoRA-Q pair: the A tensor gets Gaussian values
# scaled by `init_scale`, the B tensor gets zeros. Does nothing (returns nil)
# unless @seq_lora_q_enabled is set.
#
# seed       - Integer start value of the generator.
# init_scale - multiplier applied to every Gaussian sample.
#
# Randomness comes from a 31-bit linear congruential generator
# (s * 1103515245 + 12345, masked to 0x7FFFFFFF). Each sample consumes two
# draws and keeps only the cosine branch of Box-Muller. One generator stream
# runs across all layers and heads, so the upload order fixes the values.
#
# Returns nil.
def upload_lora_q_init!(seed, init_scale)
  if !@seq_lora_q_enabled; return; end
  a_elems = @seq_lora_q_rank * @seq_d_model
  b_elems = @seq_d_head * @seq_lora_q_rank
  two_pi = 2.0 * Math::PI
  lcg = seed

  # The A matrix is refilled for every head; the B matrix is zero-filled once
  # and reused for every upload.
  mat_a = Mat.new(@seq_lora_q_rank, @seq_d_model)
  mat_b = Mat.new(@seq_d_head, @seq_lora_q_rank)
  cell = 0
  while cell < b_elems
    mat_b.flat[cell] = 0.0
    cell = cell + 1
  end

  layer = 0
  while layer < @seq_n_layers
    block = self.seq_blocks_ffi[layer]
    head = 0
    while head < @seq_n_heads
      k = 0
      while k < a_elems
        lcg = (lcg * 1103515245 + 12345) & 0x7FFFFFFF
        u1 = (lcg.to_f + 1.0) / 2147483648.0
        lcg = (lcg * 1103515245 + 12345) & 0x7FFFFFFF
        u2 = (lcg.to_f + 1.0) / 2147483648.0
        mat_a.flat[k] = init_scale * Math.sqrt(-2.0 * Math.log(u1)) * Math.cos(two_pi * u2)
        k = k + 1
      end
      TinyNNCuda.upload_row_major(@sess, block.t_seq_w_lora_a_q[head], mat_a)
      TinyNNCuda.upload_row_major(@sess, block.t_seq_w_lora_b_q[head], mat_b)
      head = head + 1
    end
    layer = layer + 1
  end
end

#upload_random_init!(seed, init_scale, qkv_bias, untied) ⇒ Object

Fill every persistent weight tensor with N(0, std) values. Norm gammas → 1.0, biases (if present) → 0.0, matmul weights → N(0, init_scale/sqrt(fan_in)). Token embedding uses GPT-2-style N(0, 0.02). All values computed in Ruby, uploaded in bulk via tnn_upload_from_float_array.



712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 712

# Random-initialises the global tensors and every block's weights through
# upload_gaussian / upload_constant, sharing one PRNG state seeded with `seed`.
#
# seed       - initial PRNG state value.
# init_scale - numerator of the per-block standard deviations.
# qkv_bias   - accepted but not read by this method.
#              NOTE(review): no bias tensor is uploaded here — confirm that
#              the random-init path never allocates QKV biases.
# untied     - when true, the separate output tensor is initialised too.
#
# Standard deviations used:
#   token embed / untied output : 0.02
#   w_proj (lens only)          : 1 / sqrt(@seq_donor_d_in)
#   w_q, w_k, w_v, w_o, w_gate, w_up : init_scale / sqrt(@seq_d_model)
#   w_down                      : init_scale / sqrt(@seq_d_ff)
# NOTE(review): w_o has n_heads * d_head input columns but is scaled by
# sqrt(d_model); the two only agree when n_heads * d_head == d_model.
#
# All uploads share one PRNG stream, so their order determines every value
# and must not be changed.
#
# Returns nil.
def upload_random_init!(seed, init_scale, qkv_bias, untied)
  rng = [seed]
  lens_on = @seq_donor_d_in > 0

  # With the projection lens on, the embedding is donor_d_in wide and the
  # lens projection w_proj gets its own Gaussian init right after it.
  embed_width = @seq_d_model
  if lens_on; embed_width = @seq_donor_d_in; end
  upload_gaussian(self.t_seq_token_embed, @seq_vocab_size * embed_width, 0.02, rng)
  if lens_on
    proj_std = 1.0 / Math.sqrt(@seq_donor_d_in.to_f)
    upload_gaussian(self.t_seq_w_proj, @seq_donor_d_in * @seq_d_model, proj_std, rng)
  end
  upload_constant(self.t_seq_final_norm_gamma, @seq_d_model, 1.0)
  if untied
    upload_gaussian(self.t_seq_output, @seq_vocab_size * @seq_d_model, 0.02, rng)
  end

  std_model = init_scale / Math.sqrt(@seq_d_model.to_f)
  std_ff    = init_scale / Math.sqrt(@seq_d_ff.to_f)
  head_elems = @seq_d_head * @seq_d_model
  o_elems    = @seq_d_model * @seq_n_heads * @seq_d_head
  ff_elems   = @seq_d_ff * @seq_d_model
  down_elems = @seq_d_model * @seq_d_ff

  layer = 0
  while layer < @seq_n_layers
    block = self.seq_blocks_ffi[layer]
    upload_constant(block.t_seq_rn1_gamma, @seq_d_model, 1.0)
    upload_constant(block.t_seq_rn2_gamma, @seq_d_model, 1.0)

    q_head = 0
    while q_head < @seq_n_heads
      upload_gaussian(block.t_seq_w_q[q_head], head_elems, std_model, rng)
      q_head = q_head + 1
    end
    kv_head = 0
    while kv_head < @seq_n_kv
      upload_gaussian(block.t_seq_w_k[kv_head], head_elems, std_model, rng)
      upload_gaussian(block.t_seq_w_v[kv_head], head_elems, std_model, rng)
      kv_head = kv_head + 1
    end

    upload_gaussian(block.t_seq_w_o,    o_elems,    std_model, rng)
    upload_gaussian(block.t_seq_w_gate, ff_elems,   std_model, rng)
    upload_gaussian(block.t_seq_w_up,   ff_elems,   std_model, rng)
    upload_gaussian(block.t_seq_w_down, down_elems, std_ff,    rng)
    layer = layer + 1
  end
end

#xorshift_uniform!(state) ⇒ Object

xorshift64 → uniform in (0, 1). Mutates state.



800
801
802
803
804
805
806
807
808
809
# File 'lib/toy/llm/engine/llama_seq_engine_cuda.rb', line 800

# Advances a xorshift64 generator (shifts 13 / 7 / 17, masked to 64 bits)
# and maps the new state to a Float.
#
# state - one-element Array holding the Integer generator state; state[0]
#         is overwritten with the advanced state.
#
# Returns a Float u with 0 < u <= 1 (the Integer-to-Float conversion can
# round the very largest states up to exactly 1.0).
#
# An all-zero state is a fixed point of the xorshift step: left alone, a
# zero seed would return the same value forever and every Gaussian built on
# top of it would be degenerate. When the scrambled state comes out as 0 it
# is replaced by a fixed non-zero constant. A non-zero 64-bit state never
# scrambles to 0, so all other streams are bit-identical to the plain
# xorshift64 sequence.
def xorshift_uniform!(state)
  x = state[0]
  x = x ^ (x << 13)
  x = x & 0xFFFFFFFFFFFFFFFF
  x = x ^ (x >> 7)
  x = x ^ (x << 17)
  x = x & 0xFFFFFFFFFFFFFFFF
  # Escape the zero fixed point; the constant is arbitrary, just non-zero.
  if x == 0; x = 88172645463325252; end
  state[0] = x
  # The tiny addend is a historical guard against returning exactly 0.0; it
  # does not change the result for any non-zero x.
  (x.to_f / 18446744073709551616.0) + 1.0e-300
end