Class: GPT2KVFFICache
- Inherits:
-
Object
- Object
- GPT2KVFFICache
- Defined in:
- lib/toy/llm/engine/gpt2_kv_engine.rb
Instance Attribute Summary collapse
-
#context_length ⇒ Object
Returns the value of attribute context_length.
-
#d_ff ⇒ Object
Returns the value of attribute d_ff.
-
#d_head ⇒ Object
Returns the value of attribute d_head.
-
#d_model ⇒ Object
Returns the value of attribute d_model.
-
#kv_blocks_ffi ⇒ Object
Returns the value of attribute kv_blocks_ffi.
-
#max_T ⇒ Object
Returns the value of attribute max_T.
-
#n_heads ⇒ Object
Returns the value of attribute n_heads.
-
#n_layers ⇒ Object
Returns the value of attribute n_layers.
-
#realized ⇒ Object
Returns the value of attribute realized.
-
#sess ⇒ Object
Returns the value of attribute sess.
-
#t_ln_f_beta ⇒ Object
Returns the value of attribute t_ln_f_beta.
-
#t_ln_f_gamma ⇒ Object
Returns the value of attribute t_ln_f_gamma.
-
#t_pos_embed ⇒ Object
Returns the value of attribute t_pos_embed.
-
#t_token_embed ⇒ Object
Returns the value of attribute t_token_embed.
-
#vocab_size ⇒ Object
Returns the value of attribute vocab_size.
Instance Method Summary collapse
- #build_attention_head_step(t_h, blk, head_idx, pos, scale, bytes_d_head, bytes_max_T) ⇒ Object
- #build_block_step(t_x, blk, pos, scale, eps, bytes_d_head, bytes_max_T) ⇒ Object
-
#build_decode_step(pos) ⇒ Object
Build the compute graph for one decode position.
-
#initialize ⇒ GPT2KVFFICache
constructor
A new instance of GPT2KVFFICache.
-
#realize_for(max_T, d_model, d_ff, n_heads, n_layers, vocab_size, context_length) ⇒ Object
Declare all persistent tensors (weights + K/V buffers) and finalize the backend buffer.
Constructor Details
#initialize ⇒ GPT2KVFFICache
Returns a new instance of GPT2KVFFICache.
63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 |
# File 'lib/toy/llm/engine/gpt2_kv_engine.rb', line 63
# Puts the cache into its "not yet realized" state: the realized flag is
# false, every dimension is 0, the session and the model-level tensor handles
# are the TinyNN null pointer, and the block list holds one placeholder
# GPT2KVBlockFFI. realize_for overwrites all of these fields.
def initialize
  @realized = false

  # Model dimensions (filled in by realize_for).
  @max_T = 0
  @d_model = 0
  @d_ff = 0
  @n_heads = 0
  @d_head = 0
  @n_layers = 0
  @vocab_size = 0
  @context_length = 0

  # Backend session and model-level tensor handles.
  @sess = TinyNN.tnn_null_ptr
  @t_token_embed = TinyNN.tnn_null_ptr
  @t_pos_embed = TinyNN.tnn_null_ptr
  @t_ln_f_gamma = TinyNN.tnn_null_ptr
  @t_ln_f_beta = TinyNN.tnn_null_ptr

  # NOTE(review): seeded with one block instead of [] -- presumably so the
  # array's element type is fixed up front; confirm before changing.
  @kv_blocks_ffi = [GPT2KVBlockFFI.new]
end
Instance Attribute Details
#context_length ⇒ Object
Returns the value of attribute context_length.
57 58 59 |
# File 'lib/toy/llm/engine/gpt2_kv_engine.rb', line 57
# Reader for @context_length: 0 after initialize, then the context_length
# argument given to realize_for.
def context_length
  @context_length
end
#d_ff ⇒ Object
Returns the value of attribute d_ff.
57 58 59 |
# File 'lib/toy/llm/engine/gpt2_kv_engine.rb', line 57
# Reader for @d_ff: 0 after initialize, then the d_ff argument given to
# realize_for.
def d_ff
  @d_ff
end
#d_head ⇒ Object
Returns the value of attribute d_head.
57 58 59 |
# File 'lib/toy/llm/engine/gpt2_kv_engine.rb', line 57
# Reader for @d_head: 0 after initialize; realize_for sets it to
# d_model / n_heads.
def d_head
  @d_head
end
#d_model ⇒ Object
Returns the value of attribute d_model.
57 58 59 |
# File 'lib/toy/llm/engine/gpt2_kv_engine.rb', line 57
# Reader for @d_model: 0 after initialize, then the d_model argument given to
# realize_for.
def d_model
  @d_model
end
#kv_blocks_ffi ⇒ Object
Returns the value of attribute kv_blocks_ffi.
57 58 59 |
# File 'lib/toy/llm/engine/gpt2_kv_engine.rb', line 57
# Reader for @kv_blocks_ffi, the array of per-layer GPT2KVBlockFFI objects.
# It holds a single placeholder block after initialize and is rebuilt by
# realize_for.
def kv_blocks_ffi
  @kv_blocks_ffi
end
#max_T ⇒ Object
Returns the value of attribute max_T.
57 58 59 |
# File 'lib/toy/llm/engine/gpt2_kv_engine.rb', line 57
# Reader for @max_T: 0 after initialize, then the max_T argument given to
# realize_for (used there as the position extent of the per-head K/V buffers).
def max_T
  @max_T
end
#n_heads ⇒ Object
Returns the value of attribute n_heads.
57 58 59 |
# File 'lib/toy/llm/engine/gpt2_kv_engine.rb', line 57
# Reader for @n_heads: 0 after initialize, then the n_heads argument given to
# realize_for.
def n_heads
  @n_heads
end
#n_layers ⇒ Object
Returns the value of attribute n_layers.
57 58 59 |
# File 'lib/toy/llm/engine/gpt2_kv_engine.rb', line 57
# Reader for @n_layers: 0 after initialize, then the n_layers argument given
# to realize_for.
def n_layers
  @n_layers
end
#realized ⇒ Object
Returns the value of attribute realized.
57 58 59 |
# File 'lib/toy/llm/engine/gpt2_kv_engine.rb', line 57
# Reader for @realized: false after initialize, set to true as the last step
# of realize_for.
def realized
  @realized
end
#sess ⇒ Object
Returns the value of attribute sess.
57 58 59 |
# File 'lib/toy/llm/engine/gpt2_kv_engine.rb', line 57
# Reader for @sess: the TinyNN null pointer after initialize, then the value
# returned by TinyNN.tnn_session_new in realize_for.
def sess
  @sess
end
#t_ln_f_beta ⇒ Object
Returns the value of attribute t_ln_f_beta.
57 58 59 |
# File 'lib/toy/llm/engine/gpt2_kv_engine.rb', line 57
# Reader for @t_ln_f_beta, the tensor handle passed as the beta argument of
# the final tnn_layer_norm in build_decode_step. Null pointer until
# realize_for declares it.
def t_ln_f_beta
  @t_ln_f_beta
end
#t_ln_f_gamma ⇒ Object
Returns the value of attribute t_ln_f_gamma.
57 58 59 |
# File 'lib/toy/llm/engine/gpt2_kv_engine.rb', line 57
# Reader for @t_ln_f_gamma, the tensor handle passed as the gamma argument of
# the final tnn_layer_norm in build_decode_step. Null pointer until
# realize_for declares it.
def t_ln_f_gamma
  @t_ln_f_gamma
end
#t_pos_embed ⇒ Object
Returns the value of attribute t_pos_embed.
57 58 59 |
# File 'lib/toy/llm/engine/gpt2_kv_engine.rb', line 57
# Reader for @t_pos_embed, the positional-embedding tensor handle that
# build_decode_step slices one row from per position. Null pointer until
# realize_for declares it.
def t_pos_embed
  @t_pos_embed
end
#t_token_embed ⇒ Object
Returns the value of attribute t_token_embed.
57 58 59 |
# File 'lib/toy/llm/engine/gpt2_kv_engine.rb', line 57
# Reader for @t_token_embed, the token-embedding tensor handle.
# build_decode_step uses it both for the tnn_get_rows lookup and as the
# matrix in the final logits matmul. Null pointer until realize_for declares
# it.
def t_token_embed
  @t_token_embed
end
#vocab_size ⇒ Object
Returns the value of attribute vocab_size.
57 58 59 |
# File 'lib/toy/llm/engine/gpt2_kv_engine.rb', line 57
# Reader for @vocab_size: 0 after initialize, then the vocab_size argument
# given to realize_for.
def vocab_size
  @vocab_size
end
Instance Method Details
#build_attention_head_step(t_h, blk, head_idx, pos, scale, bytes_d_head, bytes_max_T) ⇒ Object
222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 |
# File 'lib/toy/llm/engine/gpt2_kv_engine.rb', line 222
# Adds the graph nodes for one attention head at one decode position.
#
# Projects the new token into q/k/v, copies k and v into slot `pos` of this
# head's persistent K/V buffers, then attends over positions 0..pos.
#
# t_h          - tensor handle of the block input (build_block_step passes
#                the output of its first layer norm).
# blk          - per-layer block object exposing per-head arrays t_w_q,
#                t_b_q, t_w_k, t_b_k, t_w_v, t_b_v, t_K and t_V.
# head_idx     - index into those per-head arrays.
# pos          - zero-based decode position being written.
# scale        - factor applied to the raw attention scores.
# bytes_d_head - byte stride between consecutive positions in K.
# bytes_max_T  - byte stride between consecutive rows of V.
#
# Returns the handle of the final matmul (annotated ne=[d_head, 1]).
#
# NOTE(review): nothing here checks pos against the K/V buffer extent;
# presumably the caller keeps pos < max_T -- confirm.
def build_attention_head_step(t_h, blk, head_idx, pos, scale, bytes_d_head, bytes_max_T)
  # q_new, k_new, v_new for the single new position.
  t_q_raw = TinyNN.tnn_matmul(@sess, blk.t_w_q[head_idx], t_h) # ne=[d_head, 1]
  t_q = TinyNN.tnn_add(@sess, t_q_raw, blk.t_b_q[head_idx])
  t_k_raw = TinyNN.tnn_matmul(@sess, blk.t_w_k[head_idx], t_h)
  t_k_new = TinyNN.tnn_add(@sess, t_k_raw, blk.t_b_k[head_idx])
  # Operands are swapped relative to q/k so v comes out transposed,
  # matching the transposed V buffer layout.
  t_v_raw = TinyNN.tnn_matmul(@sess, t_h, blk.t_w_v[head_idx]) # ne=[1, d_head]
  t_v_new = TinyNN.tnn_add(@sess, t_v_raw, blk.t_b_v[head_idx])

  # Write k_new -> K[pos], v_new -> V[:, pos] via cpy-into-view.
  t_K_slot = TinyNN.tnn_view_2d(@sess, blk.t_K[head_idx], @d_head, 1, bytes_d_head, pos * bytes_d_head)
  t_cpy_k = TinyNN.tnn_cpy(@sess, t_k_new, t_K_slot)
  # pos * 4: byte offset of element `pos` within a V row (assumes 4-byte f32
  # elements, consistent with the `* 4` strides in build_decode_step).
  t_V_slot = TinyNN.tnn_view_2d(@sess, blk.t_V[head_idx], 1, @d_head, bytes_max_T, pos * 4)
  t_cpy_v = TinyNN.tnn_cpy(@sess, t_v_new, t_V_slot)

  # The cpy tensors aren't reachable from head_out; force them into
  # the graph so the scheduler runs them before the attn matmuls
  # read K/V history.
  TinyNN.tnn_add_to_graph(@sess, t_cpy_k)
  TinyNN.tnn_add_to_graph(@sess, t_cpy_v)

  # Attention over K[0:pos+1] / V[0:pos+1].
  t_K_hist = TinyNN.tnn_view_2d(@sess, blk.t_K[head_idx], @d_head, pos + 1, bytes_d_head, 0)
  t_V_hist = TinyNN.tnn_view_2d(@sess, blk.t_V[head_idx], pos + 1, @d_head, bytes_max_T, 0)
  t_scores = TinyNN.tnn_matmul(@sess, t_K_hist, t_q) # ne=[pos+1, 1]
  t_scaled = TinyNN.tnn_scale(@sess, t_scores, scale)
  # No causal mask: K_hist already covers only valid past positions.
  t_attn = TinyNN.tnn_softmax(@sess, t_scaled)
  TinyNN.tnn_matmul(@sess, t_V_hist, t_attn) # ne=[d_head, 1]
end
#build_block_step(t_x, blk, pos, scale, eps, bytes_d_head, bytes_max_T) ⇒ Object
188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 |
# File 'lib/toy/llm/engine/gpt2_kv_engine.rb', line 188
# Adds the graph nodes for one transformer block at one decode position:
# layer norm -> per-head attention -> concat -> output projection ->
# residual add, then layer norm -> feed-forward (matmul, gelu, matmul) ->
# residual add.
#
# t_x          - tensor handle of the block input.
# blk          - per-layer block object holding the layer-norm, attention,
#                output-projection and feed-forward tensor handles.
# pos          - zero-based decode position.
# scale        - attention score scale, forwarded to each head.
# eps          - epsilon passed to both tnn_layer_norm calls.
# bytes_d_head - forwarded to build_attention_head_step.
# bytes_max_T  - forwarded to build_attention_head_step.
#
# Returns the tensor handle of the block output (second residual add).
def build_block_step(t_x, blk, pos, scale, eps, bytes_d_head, bytes_max_T)
  t_h = TinyNN.tnn_layer_norm(@sess, t_x, blk.t_ln1_gamma, blk.t_ln1_beta, eps)

  # Head 0 is built unconditionally and seeds the array; the remaining heads
  # are appended in index order.
  t_head_out0 = build_attention_head_step(t_h, blk, 0, pos, scale, bytes_d_head, bytes_max_T)
  t_head_outs = [t_head_out0]
  h = 1
  while h < @n_heads
    t_head_outs.push(build_attention_head_step(t_h, blk, h, pos, scale, bytes_d_head, bytes_max_T))
    h = h + 1
  end

  # Left-fold the head outputs together along dimension 0.
  t_concat = t_head_outs[0]
  h = 1
  while h < @n_heads
    t_concat = TinyNN.tnn_concat(@sess, t_concat, t_head_outs[h], 0)
    h = h + 1
  end

  # Output projection plus first residual connection.
  t_out_proj_raw = TinyNN.tnn_matmul(@sess, blk.t_w_o, t_concat)
  t_out_proj = TinyNN.tnn_add(@sess, t_out_proj_raw, blk.t_b_o)
  t_x_attn = TinyNN.tnn_add(@sess, t_x, t_out_proj)

  # Feed-forward sub-layer plus second residual connection.
  t_h2 = TinyNN.tnn_layer_norm(@sess, t_x_attn, blk.t_ln2_gamma, blk.t_ln2_beta, eps)
  t_up_raw = TinyNN.tnn_matmul(@sess, blk.t_w_ff1, t_h2)
  t_up = TinyNN.tnn_add(@sess, t_up_raw, blk.t_b_ff1)
  t_g = TinyNN.tnn_gelu(@sess, t_up)
  t_dn_raw = TinyNN.tnn_matmul(@sess, blk.t_w_ff2, t_g)
  t_dn = TinyNN.tnn_add(@sess, t_dn_raw, blk.t_b_ff2)
  TinyNN.tnn_add(@sess, t_x_attn, t_dn)
end
#build_decode_step(pos) ⇒ Object
Build the compute graph for one decode position. Returns the logits tensor handle. Caller calls tnn_compute then download_row_major.
155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 |
# File 'lib/toy/llm/engine/gpt2_kv_engine.rb', line 155
# Builds the compute graph for a single decode position.
#
# The graph takes one token id, adds the token-embedding row and the
# positional-embedding row for `pos`, runs the result through every block in
# @kv_blocks_ffi, applies the final layer norm and multiplies by the token
# embedding matrix to produce logits.
#
# pos - zero-based position of the token being decoded; it selects the
#       positional-embedding row and is forwarded to every block.
#
# Returns a GPT2KVStepResult built from the token-id input handle and the
# logits handle.
def build_decode_step(pos)
  eps = 1.0e-5
  scale = 1.0 / Math.sqrt(@d_head.to_f)
  d_model = @d_model
  d_head = @d_head
  max_T = @max_T

  # Byte strides; the factor 4 assumes 4-byte f32 elements.
  bytes_d_head = d_head * 4
  bytes_d_model = d_model * 4
  bytes_max_T = max_T * 4

  # Single-token input.
  t_token_id = TinyNN.tnn_input_1d_i32(@sess, 1)

  # x = embed[token_id] + pos_embed[pos]
  t_tok_embed = TinyNN.tnn_get_rows(@sess, @t_token_embed, t_token_id) # ne=[d_model, 1]
  t_pos_row = TinyNN.tnn_view_2d(@sess, @t_pos_embed, d_model, 1, bytes_d_model, pos * bytes_d_model)
  t_x = TinyNN.tnn_add(@sess, t_tok_embed, t_pos_row)

  li = 0
  while li < @n_layers
    t_x = build_block_step(t_x, @kv_blocks_ffi[li], pos, scale, eps, bytes_d_head, bytes_max_T)
    li = li + 1
  end

  t_x_final = TinyNN.tnn_layer_norm(@sess, t_x, @t_ln_f_gamma, @t_ln_f_beta, eps)
  # The token embedding matrix is reused as the output projection.
  t_kv_logits = TinyNN.tnn_matmul(@sess, @t_token_embed, t_x_final) # ne=[vocab, 1]
  TinyNN.tnn_set_output(t_kv_logits)

  GPT2KVStepResult.new(t_token_id, t_kv_logits)
end
#realize_for(max_T, d_model, d_ff, n_heads, n_layers, vocab_size, context_length) ⇒ Object
Declare all persistent tensors (weights + K/V buffers) and finalize the backend buffer. After this, weights can be uploaded; compute graphs are built per decode step via build_decode_step.
84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 |
# File 'lib/toy/llm/engine/gpt2_kv_engine.rb', line 84
# Records the model dimensions, opens a TinyNN session, declares every
# persistent tensor (model-level weights, per-layer weights and per-head K/V
# buffers), finalizes the weight buffer and marks the cache as realized.
#
# max_T          - number of positions each K/V buffer can hold.
# d_model        - model width.
# d_ff           - feed-forward hidden width.
# n_heads        - attention heads per layer.
# n_layers       - number of transformer blocks.
# vocab_size     - rows of the token embedding.
# context_length - rows of the positional embedding.
#
# Returns true (the value assigned to @realized).
#
# NOTE(review): tensors are declared in a fixed order and the order is kept
# exactly as in the original -- presumably the backend lays out its buffer in
# declaration order; confirm before reordering.
# NOTE(review): calling this twice opens a second session without releasing
# the first; there is no guard on @realized.
def realize_for(max_T, d_model, d_ff, n_heads, n_layers, vocab_size, context_length)
  # With Integer arguments this division truncates, so a d_model that is not
  # a multiple of n_heads silently loses the remainder.
  d_head = d_model / n_heads

  @max_T = max_T
  @d_model = d_model
  @d_ff = d_ff
  @n_heads = n_heads
  @d_head = d_head
  @n_layers = n_layers
  @vocab_size = vocab_size
  @context_length = context_length

  @sess = TinyNN.tnn_session_new(0)

  @t_token_embed = TinyNN.tnn_input_2d_f32_persistent(@sess, vocab_size, d_model)
  @t_pos_embed = TinyNN.tnn_input_2d_f32_persistent(@sess, context_length, d_model)
  @t_ln_f_gamma = TinyNN.tnn_input_1d_f32_persistent(@sess, d_model)
  @t_ln_f_beta = TinyNN.tnn_input_1d_f32_persistent(@sess, d_model)

  # One block object per layer; the list always starts with one element, so
  # n_layers <= 1 still leaves a single block in place.
  @kv_blocks_ffi = [GPT2KVBlockFFI.new]
  li = 1
  while li < n_layers
    @kv_blocks_ffi.push(GPT2KVBlockFFI.new)
    li = li + 1
  end

  li = 0
  while li < n_layers
    blk = @kv_blocks_ffi[li]
    blk.t_ln1_gamma = TinyNN.tnn_input_1d_f32_persistent(@sess, d_model)
    blk.t_ln1_beta = TinyNN.tnn_input_1d_f32_persistent(@sess, d_model)
    blk.t_ln2_gamma = TinyNN.tnn_input_1d_f32_persistent(@sess, d_model)
    blk.t_ln2_beta = TinyNN.tnn_input_1d_f32_persistent(@sess, d_model)

    # Per-head: weights, biases, and KV buffers. Head 0 seeds each array.
    blk.t_w_q = [TinyNN.tnn_input_2d_f32_persistent(@sess, d_head, d_model)]
    blk.t_w_k = [TinyNN.tnn_input_2d_f32_persistent(@sess, d_head, d_model)]
    blk.t_w_v = [TinyNN.tnn_input_2d_f32_persistent(@sess, d_head, d_model)]
    blk.t_b_q = [TinyNN.tnn_input_1d_f32_persistent(@sess, d_head)]
    blk.t_b_k = [TinyNN.tnn_input_1d_f32_persistent(@sess, d_head)]
    blk.t_b_v = [TinyNN.tnn_input_2d_f32_persistent(@sess, d_head, 1)] # ne=[1, d_head]
    # K: ne=[d_head, max_T]; V: ne=[max_T, d_head] (transposed layout).
    blk.t_K = [TinyNN.tnn_input_2d_f32_persistent(@sess, max_T, d_head)]
    blk.t_V = [TinyNN.tnn_input_2d_f32_persistent(@sess, d_head, max_T)]

    # Remaining heads, declared in the same per-head order as head 0.
    h = 1
    while h < n_heads
      blk.t_w_q.push(TinyNN.tnn_input_2d_f32_persistent(@sess, d_head, d_model))
      blk.t_w_k.push(TinyNN.tnn_input_2d_f32_persistent(@sess, d_head, d_model))
      blk.t_w_v.push(TinyNN.tnn_input_2d_f32_persistent(@sess, d_head, d_model))
      blk.t_b_q.push(TinyNN.tnn_input_1d_f32_persistent(@sess, d_head))
      blk.t_b_k.push(TinyNN.tnn_input_1d_f32_persistent(@sess, d_head))
      blk.t_b_v.push(TinyNN.tnn_input_2d_f32_persistent(@sess, d_head, 1))
      blk.t_K.push(TinyNN.tnn_input_2d_f32_persistent(@sess, max_T, d_head))
      blk.t_V.push(TinyNN.tnn_input_2d_f32_persistent(@sess, d_head, max_T))
      h = h + 1
    end

    blk.t_w_o = TinyNN.tnn_input_2d_f32_persistent(@sess, d_model, d_model)
    blk.t_b_o = TinyNN.tnn_input_1d_f32_persistent(@sess, d_model)
    blk.t_w_ff1 = TinyNN.tnn_input_2d_f32_persistent(@sess, d_ff, d_model)
    blk.t_b_ff1 = TinyNN.tnn_input_1d_f32_persistent(@sess, d_ff)
    blk.t_w_ff2 = TinyNN.tnn_input_2d_f32_persistent(@sess, d_model, d_ff)
    blk.t_b_ff2 = TinyNN.tnn_input_1d_f32_persistent(@sess, d_model)

    li = li + 1
  end

  TinyNN.tnn_finalize_weights(@sess)
  @realized = true
end