Class: GPT2FullForwardFFICacheCuda

Inherits:
Object
  • Object
show all
Defined in:
lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ GPT2FullForwardFFICacheCuda

Returns a new instance of GPT2FullForwardFFICacheCuda.



69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# File 'lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb', line 69

# Creates a cache in its not-yet-realized state. No TinyNNCuda session is
# created here; realize_for assigns every field below.
def initialize
  # Set to true as the last step of realize_for.
  @realized   = false
  # Model dimensions; all 0 until realize_for records the real values.
  @t_seq      = 0
  @d_model    = 0
  @d_ff       = 0
  @n_heads    = 0
  @d_head     = 0
  @n_layers   = 0
  @vocab_size = 0
  # Session handle and tensor/graph-node handles all start out as whatever
  # TinyNNCuda.tnn_null_ptr returns.
  @sess          = TinyNNCuda.tnn_null_ptr
  @t_token_embed = TinyNNCuda.tnn_null_ptr
  @t_pos_slice   = TinyNNCuda.tnn_null_ptr
  @t_token_ids   = TinyNNCuda.tnn_null_ptr
  @t_ln_f_gamma  = TinyNNCuda.tnn_null_ptr
  @t_ln_f_beta   = TinyNNCuda.tnn_null_ptr
  @t_x_embed     = TinyNNCuda.tnn_null_ptr
  @t_x_final     = TinyNNCuda.tnn_null_ptr
  @t_logits      = TinyNNCuda.tnn_null_ptr
  # One placeholder block; realize_for replaces this list. Presumably seeded
  # with an element (rather than []) so Spinel infers Array<GPT2BlockFFICuda>,
  # as the seed-then-push note in realize_for describes — confirm.
  @gpt2_blocks_ffi = [GPT2BlockFFICuda.new]
end

Instance Attribute Details

#d_ff ⇒ Object

Returns the value of attribute d_ff.



62
63
64
# File 'lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb', line 62

# d_ff value recorded by the last realize_for call (0 on a fresh instance).
def d_ff = @d_ff

#d_head ⇒ Object

Returns the value of attribute d_head.



62
63
64
# File 'lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb', line 62

# Per-head width, computed by realize_for as d_model / n_heads (0 before that).
def d_head = @d_head

#d_model ⇒ Object

Returns the value of attribute d_model.



62
63
64
# File 'lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb', line 62

# d_model value recorded by the last realize_for call (0 on a fresh instance).
def d_model = @d_model

#gpt2_blocks_ffi ⇒ Object

Returns the value of attribute gpt2_blocks_ffi.



62
63
64
# File 'lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb', line 62

# Array of GPT2BlockFFICuda handle holders; a single placeholder entry until
# realize_for rebuilds the list.
def gpt2_blocks_ffi = @gpt2_blocks_ffi

#n_heads ⇒ Object

Returns the value of attribute n_heads.



62
63
64
# File 'lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb', line 62

# Number of attention heads recorded by realize_for (0 on a fresh instance).
def n_heads = @n_heads

#n_layers ⇒ Object

Returns the value of attribute n_layers.



62
63
64
# File 'lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb', line 62

# Number of blocks recorded by realize_for (0 on a fresh instance).
def n_layers = @n_layers

#realized ⇒ Object

Returns the value of attribute realized.



62
63
64
# File 'lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb', line 62

# false on a fresh instance; realize_for sets it to true as its final step.
def realized = @realized

#sess ⇒ Object

Returns the value of attribute sess.



62
63
64
# File 'lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb', line 62

# TinyNNCuda session handle; the null pointer until realize_for opens a session.
def sess = @sess

#t_ln_f_beta ⇒ Object

Returns the value of attribute t_ln_f_beta.



62
63
64
# File 'lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb', line 62

# Handle of the final LayerNorm beta tensor declared by realize_for.
def t_ln_f_beta = @t_ln_f_beta

#t_ln_f_gamma ⇒ Object

Returns the value of attribute t_ln_f_gamma.



62
63
64
# File 'lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb', line 62

# Handle of the final LayerNorm gamma tensor declared by realize_for.
def t_ln_f_gamma = @t_ln_f_gamma

#t_logits ⇒ Object

Returns the value of attribute t_logits.



62
63
64
# File 'lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb', line 62

# Handle of the logits node (the tied-unembed matmul built in realize_for).
def t_logits = @t_logits

#t_pos_slice ⇒ Object

Returns the value of attribute t_pos_slice.



62
63
64
# File 'lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb', line 62

# Handle of the persistent pos_slice tensor that realize_for adds to the
# gathered token embeddings.
def t_pos_slice = @t_pos_slice

#t_seq ⇒ Object

Returns the value of attribute t_seq.



62
63
64
# File 'lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb', line 62

# Sequence length the graph was last built for (0 on a fresh instance).
def t_seq = @t_seq

#t_token_embed ⇒ Object

Returns the value of attribute t_token_embed.



62
63
64
# File 'lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb', line 62

# Handle of the persistent token-embedding tensor; realize_for uses it both
# for the row lookup and for the tied unembed matmul.
def t_token_embed = @t_token_embed

#t_token_ids ⇒ Object

Returns the value of attribute t_token_ids.



62
63
64
# File 'lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb', line 62

# Handle of the i32 token-id input tensor declared by realize_for.
def t_token_ids = @t_token_ids

#t_x_embed ⇒ Object

Returns the value of attribute t_x_embed.



62
63
64
# File 'lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb', line 62

# Handle of the embedding node (token rows + pos_slice) built by realize_for.
def t_x_embed = @t_x_embed

#t_x_final ⇒ Object

Returns the value of attribute t_x_final.



62
63
64
# File 'lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb', line 62

# Handle of the final LayerNorm output node built by realize_for.
def t_x_final = @t_x_final

#vocab_size ⇒ Object

Returns the value of attribute vocab_size.



62
63
64
# File 'lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb', line 62

# Vocabulary size recorded by realize_for (0 on a fresh instance).
def vocab_size = @vocab_size

Instance Method Details

#build_attention_head(t_x, t_w_q, t_w_k, t_w_v, t_b_q, t_b_k, t_b_v, scale) ⇒ Object



253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
# File 'lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb', line 253

# Adds the graph nodes for a single attention head to @sess.
#
# t_x                  - input node (build_block passes its first LayerNorm output)
# t_w_q, t_w_k, t_w_v  - this head's Q/K/V projection weights
# t_b_q, t_b_k, t_b_v  - this head's Q/K/V biases
# scale                - factor applied to the raw scores before masking
#
# Returns the head-output node, ne=[d_head, T_query].
def build_attention_head(t_x, t_w_q, t_w_k, t_w_v, t_b_q, t_b_k, t_b_v, scale)
  # Q and K take the weight as the left operand, giving ne=[d_head, T].
  query = TinyNNCuda.tnn_add(@sess, TinyNNCuda.tnn_matmul(@sess, t_w_q, t_x), t_b_q)
  key   = TinyNNCuda.tnn_add(@sess, TinyNNCuda.tnn_matmul(@sess, t_w_k, t_x), t_b_k)
  # V swaps the operands so it comes out transposed (ne=[T, d_head]); that
  # makes its k_dim line up in the final matmul.
  value = TinyNNCuda.tnn_add(@sess, TinyNNCuda.tnn_matmul(@sess, t_x, t_w_v), t_b_v)

  # scores -> scale -> diag mask -> softmax, created in that order.
  scores  = TinyNNCuda.tnn_scale(@sess, TinyNNCuda.tnn_matmul(@sess, key, query), scale)
  causal  = TinyNNCuda.tnn_diag_mask_inf(@sess, scores, 0)
  weights = TinyNNCuda.tnn_softmax(@sess, causal)

  TinyNNCuda.tnn_matmul(@sess, value, weights)
end

#build_block(t_x, blk, eps, scale) ⇒ Object

Build one GPT-2 block’s graph nodes.

h1 = LayerNorm(x, ln1_gamma, ln1_beta)
per head h:
  q_h = mul_mat(w_q_t_h, h1) + b_q_h  ne=[d_head, T]
  k_h = mul_mat(w_k_t_h, h1) + b_k_h
  v_h = mul_mat(h1, w_v_t_h) + b_v_h  ne=[T, d_head] (transposed)
  scores_h = mul_mat(k_h, q_h)
  attn_h   = softmax(causal_mask(scale(scores_h)))
  head_out = mul_mat(v_h, attn_h)
concat_h = concat along ne0 (d_head -> d_model)
x_attn = x + (mul_mat(w_o_t, concat) + b_o)
h2 = LayerNorm(x_attn, ln2_gamma, ln2_beta)
ff_up = mul_mat(w_ff1_t, h2) + b_ff1
ff_g  = gelu(ff_up)
ff_dn = mul_mat(w_ff2_t, ff_g) + b_ff2
x_out = x_attn + ff_dn


214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
# File 'lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb', line 214

# Adds one transformer block's nodes to @sess and returns the block output node.
#
# t_x   - block input node
# blk   - holder of this block's tensor handles (t_ln1_*, t_w_q[h], ..., t_b_ff2)
# eps   - epsilon forwarded to both tnn_layer_norm calls
# scale - score scale forwarded to every attention head
#
# Shape: x_attn = t_x + (w_o * concat(heads) + b_o), then
#        result = x_attn + (w_ff2 * gelu(w_ff1 * LN(x_attn) + b_ff1) + b_ff2).
def build_block(t_x, blk, eps, scale)
  normed = TinyNNCuda.tnn_layer_norm(@sess, t_x, blk.t_ln1_gamma, blk.t_ln1_beta, eps)

  # All heads are built first. The list is seeded with head 0 and the rest
  # are pushed, so head 0 is always created even when @n_heads < 1.
  heads = [build_attention_head(normed, blk.t_w_q[0], blk.t_w_k[0], blk.t_w_v[0],
                                blk.t_b_q[0], blk.t_b_k[0], blk.t_b_v[0], scale)]
  idx = 1
  while idx < @n_heads
    heads.push(build_attention_head(normed,
                                    blk.t_w_q[idx], blk.t_w_k[idx], blk.t_w_v[idx],
                                    blk.t_b_q[idx], blk.t_b_k[idx], blk.t_b_v[idx],
                                    scale))
    idx = idx + 1
  end

  # Fold the heads together along ne0 (d_head -> d_model), left to right.
  merged = heads[0]
  idx = 1
  while idx < @n_heads
    merged = TinyNNCuda.tnn_concat(@sess, merged, heads[idx], 0)
    idx = idx + 1
  end

  # Attention output projection with bias, then the first residual add.
  attn_proj = TinyNNCuda.tnn_add(@sess, TinyNNCuda.tnn_matmul(@sess, blk.t_w_o, merged), blk.t_b_o)
  residual  = TinyNNCuda.tnn_add(@sess, t_x, attn_proj)

  # Feed-forward: LayerNorm -> up-projection -> GELU -> down-projection.
  ffn_in   = TinyNNCuda.tnn_layer_norm(@sess, residual, blk.t_ln2_gamma, blk.t_ln2_beta, eps)
  ffn_up   = TinyNNCuda.tnn_add(@sess, TinyNNCuda.tnn_matmul(@sess, blk.t_w_ff1, ffn_in), blk.t_b_ff1)
  ffn_act  = TinyNNCuda.tnn_gelu(@sess, ffn_up)
  ffn_down = TinyNNCuda.tnn_add(@sess, TinyNNCuda.tnn_matmul(@sess, blk.t_w_ff2, ffn_act), blk.t_b_ff2)

  # Second residual add; this node is the block's result.
  TinyNNCuda.tnn_add(@sess, residual, ffn_down)
end

#realize_for(t_seq, d_model, d_ff, n_heads, n_layers, vocab_size) ⇒ Object

Allocate persistent ctx_w, declare all weights, build the compute graph. After this, only token_ids changes per call. Call once per T_SEQ choice; rebuild for a different T_SEQ.



93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
# File 'lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb', line 93

# Builds (or rebuilds) the full forward graph for one sequence length.
#
# Steps, in order:
#   1. validate the head configuration, then record the dimensions and open
#      a TinyNNCuda session;
#   2. declare every persistent weight tensor and call tnn_finalize_weights;
#   3. declare the token-id input and chain embedding -> blocks ->
#      final LayerNorm -> tied unembed, marking x_embed, x_final and logits
#      as outputs;
#   4. call tnn_realize on the logits node and set @realized.
#
# After this, only token_ids changes per call. Call once per t_seq choice;
# call again to rebuild for a different t_seq.
#
# NOTE(review): a session left in @sess by an earlier call is overwritten
# here without being released — confirm whether TinyNNCuda needs an explicit
# free before rebuilding.
#
# @param t_seq [Integer] sequence length (rows of pos_slice, length of token_ids)
# @param d_model [Integer] model width; must be a multiple of n_heads
# @param d_ff [Integer] feed-forward hidden width
# @param n_heads [Integer] attention heads per block; must be >= 1
# @param n_layers [Integer] number of blocks to build
# @param vocab_size [Integer] rows of the token-embedding table
# @raise [ArgumentError] if n_heads < 1 or d_model is not divisible by n_heads
# @return [true]
def realize_for(t_seq, d_model, d_ff, n_heads, n_layers, vocab_size)
  # Reject bad head configurations before touching any state. Without this,
  # integer division would silently truncate d_head and the concatenated
  # heads would no longer add up to d_model.
  raise ArgumentError, "n_heads must be >= 1 (got #{n_heads})" if n_heads < 1
  if d_model % n_heads != 0
    raise ArgumentError, "d_model (#{d_model}) must be divisible by n_heads (#{n_heads})"
  end

  # From here on any previously built graph is being replaced, so the cache
  # must not report itself as realized until the rebuild completes.
  @realized = false

  d_head = d_model / n_heads

  @t_seq      = t_seq
  @d_model    = d_model
  @d_ff       = d_ff
  @n_heads    = n_heads
  @d_head     = d_head
  @n_layers   = n_layers
  @vocab_size = vocab_size

  @sess = TinyNNCuda.tnn_session_new(1)

  # === Persistent weights (ctx_w) ===
  @t_token_embed = TinyNNCuda.tnn_input_2d_f32_persistent(@sess, vocab_size, d_model)
  @t_pos_slice   = TinyNNCuda.tnn_input_2d_f32_persistent(@sess, t_seq,      d_model)
  @t_ln_f_gamma  = TinyNNCuda.tnn_input_1d_f32_persistent(@sess, d_model)
  @t_ln_f_beta   = TinyNNCuda.tnn_input_1d_f32_persistent(@sess, d_model)

  # Per-block handles — seed-then-push so Spinel types as Array<GPT2BlockFFICuda>.
  # (The seed means the list holds one block even when n_layers is 0.)
  @gpt2_blocks_ffi = [GPT2BlockFFICuda.new]
  li = 1
  while li < n_layers
    @gpt2_blocks_ffi.push(GPT2BlockFFICuda.new)
    li = li + 1
  end

  li = 0
  while li < n_layers
    blk = @gpt2_blocks_ffi[li]
    blk.t_ln1_gamma = TinyNNCuda.tnn_input_1d_f32_persistent(@sess, d_model)
    blk.t_ln1_beta  = TinyNNCuda.tnn_input_1d_f32_persistent(@sess, d_model)
    blk.t_ln2_gamma = TinyNNCuda.tnn_input_1d_f32_persistent(@sess, d_model)
    blk.t_ln2_beta  = TinyNNCuda.tnn_input_1d_f32_persistent(@sess, d_model)

    # Per-head Q/K/V weights. Uploaded TRANSPOSED so ne=[d_model, d_head]
    # holds W.elem(r, c) = mat[r][c]. matmul(w_q_t, h) then yields
    # ne=[d_head, T] — same trick as FullForwardFFICache.
    #
    # Bias shapes:
    #   b_q / b_k  ne=[d_head, 1]    — broadcasts against (d_head, T)
    #                                  matmul result, the QK layout
    #   b_v        ne=[1, d_head]    — broadcasts against (T, d_head)
    #                                  matmul result, the transposed-V
    #                                  layout (needed for head_out = v @ attn)
    # Both declarations are 2D under the hood; data is still a flat
    # length-d_head Array<Float>.
    #
    # Head 0 seeds each array; the remaining heads are pushed below.
    blk.t_w_q = [TinyNNCuda.tnn_input_2d_f32_persistent(@sess, d_head, d_model)]
    blk.t_w_k = [TinyNNCuda.tnn_input_2d_f32_persistent(@sess, d_head, d_model)]
    blk.t_w_v = [TinyNNCuda.tnn_input_2d_f32_persistent(@sess, d_head, d_model)]
    blk.t_b_q = [TinyNNCuda.tnn_input_1d_f32_persistent(@sess, d_head)]
    blk.t_b_k = [TinyNNCuda.tnn_input_1d_f32_persistent(@sess, d_head)]
    blk.t_b_v = [TinyNNCuda.tnn_input_2d_f32_persistent(@sess, d_head, 1)]
    h = 1
    while h < n_heads
      blk.t_w_q.push(TinyNNCuda.tnn_input_2d_f32_persistent(@sess, d_head, d_model))
      blk.t_w_k.push(TinyNNCuda.tnn_input_2d_f32_persistent(@sess, d_head, d_model))
      blk.t_w_v.push(TinyNNCuda.tnn_input_2d_f32_persistent(@sess, d_head, d_model))
      blk.t_b_q.push(TinyNNCuda.tnn_input_1d_f32_persistent(@sess, d_head))
      blk.t_b_k.push(TinyNNCuda.tnn_input_1d_f32_persistent(@sess, d_head))
      blk.t_b_v.push(TinyNNCuda.tnn_input_2d_f32_persistent(@sess, d_head, 1))
      h = h + 1
    end

    blk.t_w_o   = TinyNNCuda.tnn_input_2d_f32_persistent(@sess, d_model, d_model)
    blk.t_b_o   = TinyNNCuda.tnn_input_1d_f32_persistent(@sess, d_model)
    blk.t_w_ff1 = TinyNNCuda.tnn_input_2d_f32_persistent(@sess, d_ff,    d_model)
    blk.t_b_ff1 = TinyNNCuda.tnn_input_1d_f32_persistent(@sess, d_ff)
    blk.t_w_ff2 = TinyNNCuda.tnn_input_2d_f32_persistent(@sess, d_model, d_ff)
    blk.t_b_ff2 = TinyNNCuda.tnn_input_1d_f32_persistent(@sess, d_model)
    li = li + 1
  end

  # All persistent tensors are declared above this call.
  TinyNNCuda.tnn_finalize_weights(@sess)

  # === Compute input ===
  @t_token_ids = TinyNNCuda.tnn_input_1d_i32(@sess, t_seq)

  # === Forward graph ===
  # x_embed = token_embed[ids] + pos_slice  (ne=[d_model, T])
  t_embedded = TinyNNCuda.tnn_get_rows(@sess, @t_token_embed, @t_token_ids)
  @t_x_embed = TinyNNCuda.tnn_add(@sess, t_embedded, @t_pos_slice)
  TinyNNCuda.tnn_set_output(@t_x_embed)

  eps = 1.0e-5
  # Attention score scale: 1 / sqrt(d_head).
  scale = 1.0 / Math.sqrt(@d_head.to_f)

  # Chain the blocks: each block's output node feeds the next block.
  t_cur = @t_x_embed
  li = 0
  while li < n_layers
    t_cur = build_block(t_cur, @gpt2_blocks_ffi[li], eps, scale)
    li = li + 1
  end

  # Final LayerNorm.
  @t_x_final = TinyNNCuda.tnn_layer_norm(@sess, t_cur, @t_ln_f_gamma, @t_ln_f_beta, eps)
  TinyNNCuda.tnn_set_output(@t_x_final)

  # Tied unembed: logits = mul_mat(token_embed, x_final)  ne=[vocab, T]
  @t_logits = TinyNNCuda.tnn_matmul(@sess, @t_token_embed, @t_x_final)
  TinyNNCuda.tnn_set_output(@t_logits)

  TinyNNCuda.tnn_realize(@sess, @t_logits)
  @realized = true
end