Class: GPT2KVFFICacheCuda

Inherits:
Object
  • Object
show all
Defined in:
lib/toy/llm/engine/gpt2_kv_engine_cuda.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ GPT2KVFFICacheCuda

Returns a new instance of GPT2KVFFICacheCuda.



65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# File 'lib/toy/llm/engine/gpt2_kv_engine_cuda.rb', line 65

# Creates an unrealized cache: the realized flag is false, every model
# dimension is zero, and the session plus the four top-level tensor handles
# hold whatever TinyNNCuda.tnn_null_ptr returns. realize_for fills in the
# real values later.
def initialize
  @realized = false

  # Model dimensions are all zero until realize_for receives real values.
  @max_T = @d_model = @d_ff = @n_heads = @d_head = 0
  @n_layers = @vocab_size = @context_length = 0

  # One tnn_null_ptr call per handle, in the order: session, token embedding,
  # positional embedding, final layer-norm gamma, final layer-norm beta.
  @sess, @t_token_embed, @t_pos_embed, @t_ln_f_gamma, @t_ln_f_beta =
    Array.new(5) { TinyNNCuda.tnn_null_ptr }

  # Starts with a single placeholder block; realize_for rebuilds this list.
  @kv_blocks_ffi = [GPT2KVBlockFFICuda.new]
end

Instance Attribute Details

#context_length ⇒ Object

Returns the value of attribute context_length.



59
60
61
# File 'lib/toy/llm/engine/gpt2_kv_engine_cuda.rb', line 59

# Reader for @context_length: 0 after initialize, then the context_length
# argument passed to realize_for.
def context_length
  @context_length
end

#d_ff ⇒ Object

Returns the value of attribute d_ff.



59
60
61
# File 'lib/toy/llm/engine/gpt2_kv_engine_cuda.rb', line 59

# Reader for @d_ff: 0 after initialize, then the d_ff argument passed to
# realize_for (used there to size the t_w_ff1 / t_b_ff1 / t_w_ff2 tensors).
def d_ff
  @d_ff
end

#d_head ⇒ Object

Returns the value of attribute d_head.



59
60
61
# File 'lib/toy/llm/engine/gpt2_kv_engine_cuda.rb', line 59

# Reader for @d_head: 0 after initialize; realize_for sets it to
# d_model / n_heads.
def d_head
  @d_head
end

#d_model ⇒ Object

Returns the value of attribute d_model.



59
60
61
# File 'lib/toy/llm/engine/gpt2_kv_engine_cuda.rb', line 59

# Reader for @d_model: 0 after initialize, then the d_model argument passed
# to realize_for.
def d_model
  @d_model
end

#kv_blocks_ffi ⇒ Object

Returns the value of attribute kv_blocks_ffi.



59
60
61
# File 'lib/toy/llm/engine/gpt2_kv_engine_cuda.rb', line 59

# Reader for @kv_blocks_ffi, an Array of GPT2KVBlockFFICuda. It holds one
# fresh block after initialize; realize_for replaces it with a new array
# that is indexed by layer in build_decode_step.
def kv_blocks_ffi
  @kv_blocks_ffi
end

#max_T ⇒ Object

Returns the value of attribute max_T.



59
60
61
# File 'lib/toy/llm/engine/gpt2_kv_engine_cuda.rb', line 59

# Reader for @max_T: 0 after initialize, then the max_T argument passed to
# realize_for (used there as one dimension of every per-head K and V buffer).
def max_T
  @max_T
end

#n_heads ⇒ Object

Returns the value of attribute n_heads.



59
60
61
# File 'lib/toy/llm/engine/gpt2_kv_engine_cuda.rb', line 59

# Reader for @n_heads: 0 after initialize, then the n_heads argument passed
# to realize_for.
def n_heads
  @n_heads
end

#n_layers ⇒ Object

Returns the value of attribute n_layers.



59
60
61
# File 'lib/toy/llm/engine/gpt2_kv_engine_cuda.rb', line 59

# Reader for @n_layers: 0 after initialize, then the n_layers argument
# passed to realize_for.
def n_layers
  @n_layers
end

#realized ⇒ Object

Returns the value of attribute realized.



59
60
61
# File 'lib/toy/llm/engine/gpt2_kv_engine_cuda.rb', line 59

# Reader for @realized: false after initialize; realize_for sets it to true
# as its last step, after TinyNNCuda.tnn_finalize_weights has been called.
def realized
  @realized
end

#sess ⇒ Object

Returns the value of attribute sess.



59
60
61
# File 'lib/toy/llm/engine/gpt2_kv_engine_cuda.rb', line 59

# Reader for @sess, the TinyNNCuda session handle: TinyNNCuda.tnn_null_ptr
# after initialize, then the result of TinyNNCuda.tnn_session_new(1) once
# realize_for has run.
def sess
  @sess
end

#t_ln_f_beta ⇒ Object

Returns the value of attribute t_ln_f_beta.



59
60
61
# File 'lib/toy/llm/engine/gpt2_kv_engine_cuda.rb', line 59

# Reader for @t_ln_f_beta: a null pointer after initialize, then a
# persistent 1-D f32 tensor handle of length d_model declared by
# realize_for. build_decode_step passes it to the final tnn_layer_norm.
def t_ln_f_beta
  @t_ln_f_beta
end

#t_ln_f_gamma ⇒ Object

Returns the value of attribute t_ln_f_gamma.



59
60
61
# File 'lib/toy/llm/engine/gpt2_kv_engine_cuda.rb', line 59

# Reader for @t_ln_f_gamma: a null pointer after initialize, then a
# persistent 1-D f32 tensor handle of length d_model declared by
# realize_for. build_decode_step passes it to the final tnn_layer_norm.
def t_ln_f_gamma
  @t_ln_f_gamma
end

#t_pos_embed ⇒ Object

Returns the value of attribute t_pos_embed.



59
60
61
# File 'lib/toy/llm/engine/gpt2_kv_engine_cuda.rb', line 59

# Reader for @t_pos_embed: a null pointer after initialize, then the
# persistent 2-D f32 tensor handle declared by realize_for with
# (context_length, d_model). build_decode_step views one row of it per
# decode position.
def t_pos_embed
  @t_pos_embed
end

#t_token_embed ⇒ Object

Returns the value of attribute t_token_embed.



59
60
61
# File 'lib/toy/llm/engine/gpt2_kv_engine_cuda.rb', line 59

# Reader for @t_token_embed: a null pointer after initialize, then the
# persistent 2-D f32 tensor handle declared by realize_for with
# (vocab_size, d_model). build_decode_step uses it both for the embedding
# lookup (tnn_get_rows) and for the final logits matmul.
def t_token_embed
  @t_token_embed
end

#vocab_size ⇒ Object

Returns the value of attribute vocab_size.



59
60
61
# File 'lib/toy/llm/engine/gpt2_kv_engine_cuda.rb', line 59

# Reader for @vocab_size: 0 after initialize, then the vocab_size argument
# passed to realize_for.
def vocab_size
  @vocab_size
end

Instance Method Details

#build_attention_head_step(t_h, blk, head_idx, pos, scale, bytes_d_head, bytes_max_T) ⇒ Object



224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
# File 'lib/toy/llm/engine/gpt2_kv_engine_cuda.rb', line 224

# Builds the single-position attention sub-graph for one head.
#
# The new position's q/k/v are projected from +t_h+, k and v are copied into
# this head's K/V buffers at +pos+, and attention is then computed over
# buffer entries 0..pos.
#
# @param t_h          tensor handle fed to the q/k/v projections
# @param blk          block exposing the per-head arrays t_w_q/t_w_k/t_w_v,
#                     t_b_q/t_b_k/t_b_v, t_K and t_V
# @param head_idx     index into those per-head arrays
# @param pos          position being decoded (zero-based)
# @param scale        factor applied to the raw scores before the softmax
# @param bytes_d_head byte count handed to tnn_view_2d for K-buffer views
# @param bytes_max_T  byte count handed to tnn_view_2d for V-buffer views
# @return the handle of the final matmul (V history x attention weights)
def build_attention_head_step(t_h, blk, head_idx, pos, scale,
                               bytes_d_head, bytes_max_T)
  nn      = TinyNNCuda
  k_cache = blk.t_K[head_idx]
  v_cache = blk.t_V[head_idx]
  n_seen  = pos + 1 # history length, including the position written below

  # Projections for the one new position. q and k put the weight on the left
  # of the matmul; v swaps the operands so the result matches the transposed
  # V layout.
  query = nn.tnn_add(@sess, nn.tnn_matmul(@sess, blk.t_w_q[head_idx], t_h),
                     blk.t_b_q[head_idx])
  key   = nn.tnn_add(@sess, nn.tnn_matmul(@sess, blk.t_w_k[head_idx], t_h),
                     blk.t_b_k[head_idx])
  value = nn.tnn_add(@sess, nn.tnn_matmul(@sess, t_h, blk.t_w_v[head_idx]),
                     blk.t_b_v[head_idx])

  # Store k into K[pos] and v into V[:, pos] by copying into a view.
  k_slot  = nn.tnn_view_2d(@sess, k_cache, @d_head, 1,
                           bytes_d_head, pos * bytes_d_head)
  k_store = nn.tnn_cpy(@sess, key, k_slot)
  # The V offset is pos * 4: presumably 4 bytes per f32 element, so the view
  # starts at column pos of the transposed buffer.
  v_slot  = nn.tnn_view_2d(@sess, v_cache, 1, @d_head,
                           bytes_max_T, pos * 4)
  v_store = nn.tnn_cpy(@sess, value, v_slot)

  # Nothing downstream references the copies, so they are registered with the
  # graph explicitly; otherwise they would not run before the history reads.
  [k_store, v_store].each { |store| nn.tnn_add_to_graph(@sess, store) }

  # History views covering positions 0..pos.
  k_hist = nn.tnn_view_2d(@sess, k_cache, @d_head, n_seen, bytes_d_head, 0)
  v_hist = nn.tnn_view_2d(@sess, v_cache, n_seen, @d_head, bytes_max_T, 0)

  # No causal mask is applied: the history views already end at pos.
  scores  = nn.tnn_matmul(@sess, k_hist, query)
  weights = nn.tnn_softmax(@sess, nn.tnn_scale(@sess, scores, scale))
  nn.tnn_matmul(@sess, v_hist, weights)
end

#build_block_step(t_x, blk, pos, scale, eps, bytes_d_head, bytes_max_T) ⇒ Object



190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
# File 'lib/toy/llm/engine/gpt2_kv_engine_cuda.rb', line 190

# Builds the graph for one transformer block at a single decode position:
# layer norm -> per-head attention -> concat -> output projection ->
# residual add, then layer norm -> feed-forward (up, GELU, down) ->
# residual add.
#
# @param t_x          tensor handle entering the block
# @param blk          block holding the layer-norm, attention and
#                     feed-forward tensor handles
# @param pos          position being decoded
# @param scale        attention score scale, forwarded to each head
# @param eps          epsilon forwarded to both tnn_layer_norm calls
# @param bytes_d_head forwarded unchanged to build_attention_head_step
# @param bytes_max_T  forwarded unchanged to build_attention_head_step
# @return the handle of the final residual add
def build_block_step(t_x, blk, pos, scale, eps, bytes_d_head, bytes_max_T)
  nn = TinyNNCuda
  normed = nn.tnn_layer_norm(@sess, t_x, blk.t_ln1_gamma, blk.t_ln1_beta, eps)

  # Head 0 is always built; the remaining heads follow in index order, and
  # every head is built before the first concat.
  head_ids  = [0] + (1...@n_heads).to_a
  head_outs = head_ids.map do |idx|
    build_attention_head_step(normed, blk, idx, pos, scale,
                              bytes_d_head, bytes_max_T)
  end

  # Left-fold the head outputs together with tnn_concat (last argument 0).
  joined = head_outs.drop(1).reduce(head_outs.first) do |acc, head_out|
    nn.tnn_concat(@sess, acc, head_out, 0)
  end

  # Attention output projection plus the first residual connection.
  attn_proj  = nn.tnn_add(@sess, nn.tnn_matmul(@sess, blk.t_w_o, joined), blk.t_b_o)
  after_attn = nn.tnn_add(@sess, t_x, attn_proj)

  # Feed-forward sub-layer.
  ff_in   = nn.tnn_layer_norm(@sess, after_attn, blk.t_ln2_gamma, blk.t_ln2_beta, eps)
  ff_up   = nn.tnn_add(@sess, nn.tnn_matmul(@sess, blk.t_w_ff1, ff_in), blk.t_b_ff1)
  ff_act  = nn.tnn_gelu(@sess, ff_up)
  ff_down = nn.tnn_add(@sess, nn.tnn_matmul(@sess, blk.t_w_ff2, ff_act), blk.t_b_ff2)

  # Second residual connection; this handle is the block's output.
  nn.tnn_add(@sess, after_attn, ff_down)
end

#build_decode_step(pos) ⇒ Object

Build the compute graph for one decode position. Returns the logits tensor handle. Caller calls tnn_compute then download_row_major.



157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
# File 'lib/toy/llm/engine/gpt2_kv_engine_cuda.rb', line 157

# Build the compute graph for one decode position.
#
# The graph embeds a single token id, adds the positional-embedding row for
# +pos+, runs every transformer block via build_block_step, applies the
# final layer norm and multiplies by the token-embedding tensor to produce
# logits.
#
# @param pos [Integer] zero-based position to decode; must be smaller than
#   both max_T (the K/V buffer length) and context_length (the number of
#   positional-embedding rows), because both are addressed directly by pos
# @return [GPT2KVStepResultCuda] built from the token-id input handle and
#   the logits handle
# @raise [RuntimeError] if realize_for has not completed yet
# @raise [ArgumentError] if pos is outside 0...[max_T, context_length].min
def build_decode_step(pos)
  # Without realize_for the session and all tensor handles are still null.
  raise "build_decode_step called before realize_for" unless @realized

  # An out-of-range pos would build views past the end of the K/V buffers or
  # the positional-embedding table, so reject it before creating any node.
  pos_limit = [@max_T, @context_length].min
  unless (0...pos_limit).cover?(pos)
    raise ArgumentError, "pos #{pos} is outside the valid range 0...#{pos_limit}"
  end

  eps   = 1.0e-5
  scale = 1.0 / Math.sqrt(@d_head.to_f)

  # Byte sizes of one row of each kind; 4 bytes per f32 element.
  bytes_d_head  = @d_head * 4
  bytes_d_model = @d_model * 4
  bytes_max_T   = @max_T * 4

  # Single-token input.
  t_token_id = TinyNNCuda.tnn_input_1d_i32(@sess, 1)

  # x = embed[token_id] + pos_embed[pos]
  t_embed_row = TinyNNCuda.tnn_get_rows(@sess, @t_token_embed, t_token_id)
  t_pos_row   = TinyNNCuda.tnn_view_2d(@sess, @t_pos_embed,
                                       @d_model, 1, bytes_d_model,
                                       pos * bytes_d_model)
  t_x = TinyNNCuda.tnn_add(@sess, t_embed_row, t_pos_row)

  # Thread the activation through each layer's block in order.
  @n_layers.times do |li|
    t_x = build_block_step(t_x, @kv_blocks_ffi[li], pos, scale, eps,
                           bytes_d_head, bytes_max_T)
  end

  t_x_final   = TinyNNCuda.tnn_layer_norm(@sess, t_x, @t_ln_f_gamma, @t_ln_f_beta, eps)
  # The token-embedding tensor is reused as the output projection.
  t_kv_logits = TinyNNCuda.tnn_matmul(@sess, @t_token_embed, t_x_final)
  # NOTE(review): unlike every other TinyNNCuda call here, tnn_set_output is
  # not given @sess — kept as in the original; confirm against the binding.
  TinyNNCuda.tnn_set_output(t_kv_logits)
  GPT2KVStepResultCuda.new(t_token_id, t_kv_logits)
end

#realize_for(max_T, d_model, d_ff, n_heads, n_layers, vocab_size, context_length) ⇒ Object

Declare all persistent tensors (weights + K/V buffers) and finalize the backend buffer. After this, weights can be uploaded; compute graphs are built per decode step via build_decode_step.



86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
# File 'lib/toy/llm/engine/gpt2_kv_engine_cuda.rb', line 86

# Declare all persistent tensors (weights + K/V buffers) and finalize the
# backend buffer. After this, weights can be uploaded; compute graphs are
# built per decode step via build_decode_step.
#
# Tensors are declared in a fixed order: token embedding, positional
# embedding, final layer-norm gamma/beta, then for each layer its four
# layer-norm vectors, each head's w_q, w_k, w_v, b_q, b_k, b_v, K, V, and
# finally the output-projection and feed-forward tensors.
#
# @param max_T          [Integer] number of positions each K/V buffer holds
# @param d_model        [Integer] model width; must be a multiple of n_heads
# @param d_ff           [Integer] feed-forward hidden width
# @param n_heads        [Integer] attention heads per layer; must be positive
# @param n_layers       [Integer] number of transformer blocks
# @param vocab_size     [Integer] rows of the token-embedding tensor
# @param context_length [Integer] rows of the positional-embedding tensor
# @return [true] the value assigned to @realized
# @raise [ArgumentError] if n_heads is not positive or does not divide
#   d_model evenly
def realize_for(max_T, d_model, d_ff, n_heads, n_layers,
                vocab_size, context_length)
  # Validate before touching any state or creating a session. With a
  # truncated d_head the concatenated heads would no longer add up to
  # d_model, which the d_model x d_model output projection requires.
  unless n_heads > 0 && (d_model % n_heads).zero?
    raise ArgumentError,
          "n_heads (#{n_heads}) must be positive and divide d_model (#{d_model}) evenly"
  end

  head_dim = d_model / n_heads

  @max_T          = max_T
  @d_model        = d_model
  @d_ff           = d_ff
  @n_heads        = n_heads
  @d_head         = head_dim
  @n_layers       = n_layers
  @vocab_size     = vocab_size
  @context_length = context_length

  # NOTE(review): a second realize_for call replaces @sess without releasing
  # the previous session — confirm whether TinyNNCuda needs an explicit free.
  @sess = TinyNNCuda.tnn_session_new(1)

  # Shorthands for the two persistent-input declarations. For the 2-D form
  # the existing annotations give the resulting ne as [second, first].
  vec = ->(len) { TinyNNCuda.tnn_input_1d_f32_persistent(@sess, len) }
  mat = ->(first, second) { TinyNNCuda.tnn_input_2d_f32_persistent(@sess, first, second) }

  @t_token_embed = mat.call(vocab_size, d_model)
  @t_pos_embed   = mat.call(context_length, d_model)
  @t_ln_f_gamma  = vec.call(d_model)
  @t_ln_f_beta   = vec.call(d_model)

  # At least one block object always exists, even when n_layers < 1.
  @kv_blocks_ffi = Array.new([n_layers, 1].max) { GPT2KVBlockFFICuda.new }

  n_layers.times do |li|
    blk = @kv_blocks_ffi[li]
    blk.t_ln1_gamma = vec.call(d_model)
    blk.t_ln1_beta  = vec.call(d_model)
    blk.t_ln2_gamma = vec.call(d_model)
    blk.t_ln2_beta  = vec.call(d_model)

    # Per-head: weights, biases, and KV buffers, declared head by head.
    blk.t_w_q = []
    blk.t_w_k = []
    blk.t_w_v = []
    blk.t_b_q = []
    blk.t_b_k = []
    blk.t_b_v = []
    blk.t_K   = []
    blk.t_V   = []
    n_heads.times do
      blk.t_w_q.push(mat.call(head_dim, d_model))
      blk.t_w_k.push(mat.call(head_dim, d_model))
      blk.t_w_v.push(mat.call(head_dim, d_model))
      blk.t_b_q.push(vec.call(head_dim))
      blk.t_b_k.push(vec.call(head_dim))
      blk.t_b_v.push(mat.call(head_dim, 1))      # ne=[1, d_head]
      # K: ne=[d_head, max_T]; V: ne=[max_T, d_head] (transposed layout).
      blk.t_K.push(mat.call(max_T, head_dim))
      blk.t_V.push(mat.call(head_dim, max_T))
    end

    blk.t_w_o   = mat.call(d_model, d_model)
    blk.t_b_o   = vec.call(d_model)
    blk.t_w_ff1 = mat.call(d_ff, d_model)
    blk.t_b_ff1 = vec.call(d_ff)
    blk.t_w_ff2 = mat.call(d_model, d_ff)
    blk.t_b_ff2 = vec.call(d_model)
  end

  TinyNNCuda.tnn_finalize_weights(@sess)
  @realized = true
end