Class: GPT2FullForwardFFICache
- Inherits:
- Object (inheritance chain: Object → GPT2FullForwardFFICache)
- Defined in:
- lib/toy/llm/engine/gpt2_fwd_engine.rb
Instance Attribute Summary collapse
-
#d_ff ⇒ Object
Returns the value of attribute d_ff.
-
#d_head ⇒ Object
Returns the value of attribute d_head.
-
#d_model ⇒ Object
Returns the value of attribute d_model.
-
#gpt2_blocks_ffi ⇒ Object
Returns the value of attribute gpt2_blocks_ffi.
-
#n_heads ⇒ Object
Returns the value of attribute n_heads.
-
#n_layers ⇒ Object
Returns the value of attribute n_layers.
-
#realized ⇒ Object
Returns the value of attribute realized.
-
#sess ⇒ Object
Returns the value of attribute sess.
-
#t_ln_f_beta ⇒ Object
Returns the value of attribute t_ln_f_beta.
-
#t_ln_f_gamma ⇒ Object
Returns the value of attribute t_ln_f_gamma.
-
#t_logits ⇒ Object
Returns the value of attribute t_logits.
-
#t_pos_slice ⇒ Object
Returns the value of attribute t_pos_slice.
-
#t_seq ⇒ Object
Returns the value of attribute t_seq.
-
#t_token_embed ⇒ Object
Returns the value of attribute t_token_embed.
-
#t_token_ids ⇒ Object
Returns the value of attribute t_token_ids.
-
#t_x_embed ⇒ Object
Returns the value of attribute t_x_embed.
-
#t_x_final ⇒ Object
Returns the value of attribute t_x_final.
-
#vocab_size ⇒ Object
Returns the value of attribute vocab_size.
Instance Method Summary collapse
- #build_attention_head(t_x, t_w_q, t_w_k, t_w_v, t_b_q, t_b_k, t_b_v, scale) ⇒ Object
Build the graph nodes for one causally masked attention head.
-
#build_block(t_x, blk, eps, scale) ⇒ Object
Build one GPT-2 block’s graph nodes.
-
#initialize ⇒ GPT2FullForwardFFICache (constructor)
Returns a new instance of GPT2FullForwardFFICache.
-
#realize_for(t_seq, d_model, d_ff, n_heads, n_layers, vocab_size) ⇒ Object
Allocate persistent ctx_w, declare all weights, build the compute graph.
Constructor Details
#initialize ⇒ GPT2FullForwardFFICache
Returns a new instance of GPT2FullForwardFFICache.
67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 |
# File 'lib/toy/llm/engine/gpt2_fwd_engine.rb', line 67

# Puts the cache into its "nothing built yet" state.
#
# Every size field starts at 0, the session and every tensor handle start as
# TinyNN.tnn_null_ptr, and @realized is false. #realize_for overwrites all of
# these.
def initialize
  @realized = false

  # Model dimensions.
  @t_seq = 0
  @d_model = 0
  @d_ff = 0
  @n_heads = 0
  @d_head = 0
  @n_layers = 0
  @vocab_size = 0

  # Session handle and graph/weight tensor handles.
  @sess = TinyNN.tnn_null_ptr
  @t_token_embed = TinyNN.tnn_null_ptr
  @t_pos_slice = TinyNN.tnn_null_ptr
  @t_token_ids = TinyNN.tnn_null_ptr
  @t_ln_f_gamma = TinyNN.tnn_null_ptr
  @t_ln_f_beta = TinyNN.tnn_null_ptr
  @t_x_embed = TinyNN.tnn_null_ptr
  @t_x_final = TinyNN.tnn_null_ptr
  @t_logits = TinyNN.tnn_null_ptr

  # Seeded with one element rather than left empty — presumably for the same
  # Array<GPT2BlockFFI> typing reason noted in #realize_for (confirm).
  @gpt2_blocks_ffi = [GPT2BlockFFI.new]
end
Instance Attribute Details
#d_ff ⇒ Object
Returns the value of attribute d_ff.
60 61 62 |
# File 'lib/toy/llm/engine/gpt2_fwd_engine.rb', line 60

# Returns the value of attribute d_ff.
# It is 0 after #initialize and holds the d_ff argument after #realize_for.
def d_ff
  @d_ff
end
#d_head ⇒ Object
Returns the value of attribute d_head.
60 61 62 |
# File 'lib/toy/llm/engine/gpt2_fwd_engine.rb', line 60

# Returns the value of attribute d_head.
# It is 0 after #initialize; #realize_for stores d_model / n_heads here.
def d_head
  @d_head
end
#d_model ⇒ Object
Returns the value of attribute d_model.
60 61 62 |
# File 'lib/toy/llm/engine/gpt2_fwd_engine.rb', line 60

# Returns the value of attribute d_model.
# It is 0 after #initialize and holds the d_model argument after #realize_for.
def d_model
  @d_model
end
#gpt2_blocks_ffi ⇒ Object
Returns the value of attribute gpt2_blocks_ffi.
60 61 62 |
# File 'lib/toy/llm/engine/gpt2_fwd_engine.rb', line 60

# Returns the value of attribute gpt2_blocks_ffi.
# This is the array of per-block GPT2BlockFFI handle objects: one seed
# element after #initialize, rebuilt by #realize_for.
def gpt2_blocks_ffi
  @gpt2_blocks_ffi
end
#n_heads ⇒ Object
Returns the value of attribute n_heads.
60 61 62 |
# File 'lib/toy/llm/engine/gpt2_fwd_engine.rb', line 60

# Returns the value of attribute n_heads.
# It is 0 after #initialize and holds the n_heads argument after #realize_for.
def n_heads
  @n_heads
end
#n_layers ⇒ Object
Returns the value of attribute n_layers.
60 61 62 |
# File 'lib/toy/llm/engine/gpt2_fwd_engine.rb', line 60

# Returns the value of attribute n_layers.
# It is 0 after #initialize and holds the n_layers argument after #realize_for.
def n_layers
  @n_layers
end
#realized ⇒ Object
Returns the value of attribute realized.
60 61 62 |
# File 'lib/toy/llm/engine/gpt2_fwd_engine.rb', line 60

# Returns the value of attribute realized.
# It is false after #initialize; #realize_for sets it to true as its last step.
def realized
  @realized
end
#sess ⇒ Object
Returns the value of attribute sess.
60 61 62 |
# File 'lib/toy/llm/engine/gpt2_fwd_engine.rb', line 60

# Returns the value of attribute sess.
# It is TinyNN.tnn_null_ptr after #initialize; #realize_for stores the result
# of TinyNN.tnn_session_new(0) here.
def sess
  @sess
end
#t_ln_f_beta ⇒ Object
Returns the value of attribute t_ln_f_beta.
60 61 62 |
# File 'lib/toy/llm/engine/gpt2_fwd_engine.rb', line 60

# Returns the value of attribute t_ln_f_beta.
# This is the handle passed as the beta argument of the final layer norm in
# #realize_for; it is TinyNN.tnn_null_ptr until then.
def t_ln_f_beta
  @t_ln_f_beta
end
#t_ln_f_gamma ⇒ Object
Returns the value of attribute t_ln_f_gamma.
60 61 62 |
# File 'lib/toy/llm/engine/gpt2_fwd_engine.rb', line 60

# Returns the value of attribute t_ln_f_gamma.
# This is the handle passed as the gamma argument of the final layer norm in
# #realize_for; it is TinyNN.tnn_null_ptr until then.
def t_ln_f_gamma
  @t_ln_f_gamma
end
#t_logits ⇒ Object
Returns the value of attribute t_logits.
60 61 62 |
# File 'lib/toy/llm/engine/gpt2_fwd_engine.rb', line 60

# Returns the value of attribute t_logits.
# This is the handle of the logits node that #realize_for builds as
# matmul(token_embed, x_final); it is TinyNN.tnn_null_ptr until then.
def t_logits
  @t_logits
end
#t_pos_slice ⇒ Object
Returns the value of attribute t_pos_slice.
60 61 62 |
# File 'lib/toy/llm/engine/gpt2_fwd_engine.rb', line 60

# Returns the value of attribute t_pos_slice.
# This is the handle of the persistent input declared in #realize_for with
# sizes (t_seq, d_model) and added to the gathered token rows; it is
# TinyNN.tnn_null_ptr until then.
def t_pos_slice
  @t_pos_slice
end
#t_seq ⇒ Object
Returns the value of attribute t_seq.
60 61 62 |
# File 'lib/toy/llm/engine/gpt2_fwd_engine.rb', line 60

# Returns the value of attribute t_seq.
# It is 0 after #initialize and holds the t_seq argument after #realize_for.
def t_seq
  @t_seq
end
#t_token_embed ⇒ Object
Returns the value of attribute t_token_embed.
60 61 62 |
# File 'lib/toy/llm/engine/gpt2_fwd_engine.rb', line 60

# Returns the value of attribute t_token_embed.
# This is the handle of the persistent input declared in #realize_for with
# sizes (vocab_size, d_model). It is used there both for the row lookup and
# for the tied unembed matmul; it is TinyNN.tnn_null_ptr until then.
def t_token_embed
  @t_token_embed
end
#t_token_ids ⇒ Object
Returns the value of attribute t_token_ids.
60 61 62 |
# File 'lib/toy/llm/engine/gpt2_fwd_engine.rb', line 60

# Returns the value of attribute t_token_ids.
# This is the handle of the i32 input of length t_seq declared in
# #realize_for; it is TinyNN.tnn_null_ptr until then.
def t_token_ids
  @t_token_ids
end
#t_x_embed ⇒ Object
Returns the value of attribute t_x_embed.
60 61 62 |
# File 'lib/toy/llm/engine/gpt2_fwd_engine.rb', line 60

# Returns the value of attribute t_x_embed.
# This is the handle of the embedding node that #realize_for builds as
# get_rows(token_embed, token_ids) + pos_slice; it is TinyNN.tnn_null_ptr
# until then.
def t_x_embed
  @t_x_embed
end
#t_x_final ⇒ Object
Returns the value of attribute t_x_final.
60 61 62 |
# File 'lib/toy/llm/engine/gpt2_fwd_engine.rb', line 60

# Returns the value of attribute t_x_final.
# This is the handle of the final layer-norm node built in #realize_for; it is
# TinyNN.tnn_null_ptr until then.
def t_x_final
  @t_x_final
end
#vocab_size ⇒ Object
Returns the value of attribute vocab_size.
60 61 62 |
# File 'lib/toy/llm/engine/gpt2_fwd_engine.rb', line 60

# Returns the value of attribute vocab_size.
# It is 0 after #initialize and holds the vocab_size argument after
# #realize_for.
def vocab_size
  @vocab_size
end
Instance Method Details
#build_attention_head(t_x, t_w_q, t_w_k, t_w_v, t_b_q, t_b_k, t_b_v, scale) ⇒ Object
251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 |
# File 'lib/toy/llm/engine/gpt2_fwd_engine.rb', line 251

# Adds the graph nodes for one attention head to @sess and returns the handle
# of the head's output node.
#
#   t_x                 - input node handle (the block's first layer-norm output)
#   t_w_q, t_w_k, t_w_v - this head's Q/K/V weight handles
#   t_b_q, t_b_k, t_b_v - this head's Q/K/V bias handles
#   scale               - factor applied to the raw scores before masking
#
# Node order: Q, K and V projections (matmul + bias), scores = matmul(k, q),
# scale, diag_mask_inf(…, 0), softmax, then matmul(v, attn).
def build_attention_head(t_x, t_w_q, t_w_k, t_w_v, t_b_q, t_b_k, t_b_v, scale)
  t_q_raw = TinyNN.tnn_matmul(@sess, t_w_q, t_x) # ne=[d_head, T]
  t_q = TinyNN.tnn_add(@sess, t_q_raw, t_b_q)
  t_k_raw = TinyNN.tnn_matmul(@sess, t_w_k, t_x)
  t_k = TinyNN.tnn_add(@sess, t_k_raw, t_b_k)

  # v in transposed pattern (ne=[T, d_head]) so head_out's k_dim matches.
  # Note the swapped operand order compared with Q and K.
  t_v_raw = TinyNN.tnn_matmul(@sess, t_x, t_w_v)
  t_v = TinyNN.tnn_add(@sess, t_v_raw, t_b_v)

  t_scores = TinyNN.tnn_matmul(@sess, t_k, t_q)
  t_scaled = TinyNN.tnn_scale(@sess, t_scores, scale)
  t_masked = TinyNN.tnn_diag_mask_inf(@sess, t_scaled, 0)
  t_attn = TinyNN.tnn_softmax(@sess, t_masked)

  TinyNN.tnn_matmul(@sess, t_v, t_attn) # ne=[d_head, T_query]
end
#build_block(t_x, blk, eps, scale) ⇒ Object
Build one GPT-2 block’s graph nodes.
h1 = LayerNorm(x, ln1_gamma, ln1_beta)
per head h:
q_h = mul_mat(w_q_t_h, h1) + b_q_h ne=[d_head, T]
k_h = mul_mat(w_k_t_h, h1) + b_k_h
v_h = mul_mat(h1, w_v_t_h) + b_v_h ne=[T, d_head] (transposed)
scores_h = mul_mat(k_h, q_h)
attn_h = softmax(causal_mask(scale(scores_h)))
head_out = mul_mat(v_h, attn_h)
concat_h = concat along ne0 (d_head -> d_model)
x_attn = x + (mul_mat(w_o_t, concat) + b_o)
h2 = LayerNorm(x_attn, ln2_gamma, ln2_beta)
ff_up = mul_mat(w_ff1_t, h2) + b_ff1
ff_g = gelu(ff_up)
ff_dn = mul_mat(w_ff2_t, ff_g) + b_ff2
x_out = x_attn + ff_dn
212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 |
# File 'lib/toy/llm/engine/gpt2_fwd_engine.rb', line 212

# Build one GPT-2 block's graph nodes.
#
#   h1 = LayerNorm(x, ln1_gamma, ln1_beta)
#   per head h:
#     q_h      = mul_mat(w_q_t_h, h1) + b_q_h    ne=[d_head, T]
#     k_h      = mul_mat(w_k_t_h, h1) + b_k_h
#     v_h      = mul_mat(h1, w_v_t_h) + b_v_h    ne=[T, d_head] (transposed)
#     scores_h = mul_mat(k_h, q_h)
#     attn_h   = softmax(causal_mask(scale(scores_h)))
#     head_out = mul_mat(v_h, attn_h)
#   concat_h = concat along ne0 (d_head -> d_model)
#   x_attn   = x + (mul_mat(w_o_t, concat) + b_o)
#   h2       = LayerNorm(x_attn, ln2_gamma, ln2_beta)
#   ff_up    = mul_mat(w_ff1_t, h2) + b_ff1
#   ff_g     = gelu(ff_up)
#   ff_dn    = mul_mat(w_ff2_t, ff_g) + b_ff2
#   x_out    = x_attn + ff_dn
#
#   t_x   - handle of the block's input node
#   blk   - object exposing this block's weight handles (t_ln1_gamma, t_w_q[h],
#           t_w_o, t_w_ff1, …) as read below
#   eps   - epsilon forwarded to both layer norms
#   scale - score scale forwarded to every attention head
#
# Returns the handle of the x_out node. Reads @sess and @n_heads.
def build_block(t_x, blk, eps, scale)
  t_h1 = TinyNN.tnn_layer_norm(@sess, t_x, blk.t_ln1_gamma, blk.t_ln1_beta, eps)

  # Head 0 seeds the array and the remaining heads are pushed (the same
  # seed-then-push shape the file uses for its typed arrays).
  t_head0 = build_attention_head(t_h1,
                                 blk.t_w_q[0], blk.t_w_k[0], blk.t_w_v[0],
                                 blk.t_b_q[0], blk.t_b_k[0], blk.t_b_v[0],
                                 scale)
  t_head_outs = [t_head0]
  h = 1
  while h < @n_heads
    t_head_outs.push(build_attention_head(t_h1,
                                          blk.t_w_q[h], blk.t_w_k[h], blk.t_w_v[h],
                                          blk.t_b_q[h], blk.t_b_k[h], blk.t_b_v[h],
                                          scale))
    h = h + 1
  end

  # Concat along ne0 (d_head -> d_model).
  t_concat = t_head_outs[0]
  h = 1
  while h < @n_heads
    t_concat = TinyNN.tnn_concat(@sess, t_concat, t_head_outs[h], 0)
    h = h + 1
  end

  # Output projection + bias + residual.
  t_out_proj_raw = TinyNN.tnn_matmul(@sess, blk.t_w_o, t_concat)
  t_out_proj = TinyNN.tnn_add(@sess, t_out_proj_raw, blk.t_b_o)
  t_x_attn = TinyNN.tnn_add(@sess, t_x, t_out_proj)

  # FFN.
  t_h2 = TinyNN.tnn_layer_norm(@sess, t_x_attn, blk.t_ln2_gamma, blk.t_ln2_beta, eps)
  t_pre_raw = TinyNN.tnn_matmul(@sess, blk.t_w_ff1, t_h2)
  t_pre = TinyNN.tnn_add(@sess, t_pre_raw, blk.t_b_ff1)
  t_hidden = TinyNN.tnn_gelu(@sess, t_pre)
  t_dn_raw = TinyNN.tnn_matmul(@sess, blk.t_w_ff2, t_hidden)
  t_dn = TinyNN.tnn_add(@sess, t_dn_raw, blk.t_b_ff2)

  # Second residual: this node is the block's output.
  TinyNN.tnn_add(@sess, t_x_attn, t_dn)
end
#realize_for(t_seq, d_model, d_ff, n_heads, n_layers, vocab_size) ⇒ Object
Allocate persistent ctx_w, declare all weights, build the compute graph. After this, only token_ids changes per call. Call once per T_SEQ choice; rebuild for a different T_SEQ.
91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 |
# File 'lib/toy/llm/engine/gpt2_fwd_engine.rb', line 91

# Allocate persistent ctx_w, declare all weights, build the compute graph.
# After this, only token_ids changes per call. Call once per T_SEQ choice;
# rebuild for a different T_SEQ.
#
#   t_seq      - sequence length; sizes the position slice and the token-id input
#   d_model    - model width
#   d_ff       - feed-forward hidden width
#   n_heads    - attention heads per block; d_head is d_model / n_heads
#   n_layers   - number of blocks chained in the forward graph
#   vocab_size - row count of the token embedding
#
# Returns true (the value assigned to @realized).
#
# NOTE(review): a second call overwrites @sess with a fresh session and
# nothing here releases the previous one — confirm whether callers free it.
# NOTE(review): the block array is seeded with one element before the push
# loop, so it holds one block even when n_layers is 0 — confirm that is fine.
def realize_for(t_seq, d_model, d_ff, n_heads, n_layers, vocab_size)
  # Kept in a local so the per-head declarations below do not depend on the
  # d_head reader.
  d_head = d_model / n_heads

  @t_seq = t_seq
  @d_model = d_model
  @d_ff = d_ff
  @n_heads = n_heads
  @d_head = d_head
  @n_layers = n_layers
  @vocab_size = vocab_size

  @sess = TinyNN.tnn_session_new(0)

  # === Persistent weights (ctx_w) ===
  @t_token_embed = TinyNN.tnn_input_2d_f32_persistent(@sess, vocab_size, d_model)
  @t_pos_slice = TinyNN.tnn_input_2d_f32_persistent(@sess, t_seq, d_model)
  @t_ln_f_gamma = TinyNN.tnn_input_1d_f32_persistent(@sess, d_model)
  @t_ln_f_beta = TinyNN.tnn_input_1d_f32_persistent(@sess, d_model)

  # Per-block handles — seed-then-push so Spinel types as Array<GPT2BlockFFI>.
  @gpt2_blocks_ffi = [GPT2BlockFFI.new]
  li = 1
  while li < n_layers
    @gpt2_blocks_ffi.push(GPT2BlockFFI.new)
    li = li + 1
  end

  li = 0
  while li < n_layers
    blk = @gpt2_blocks_ffi[li]
    blk.t_ln1_gamma = TinyNN.tnn_input_1d_f32_persistent(@sess, d_model)
    blk.t_ln1_beta = TinyNN.tnn_input_1d_f32_persistent(@sess, d_model)
    blk.t_ln2_gamma = TinyNN.tnn_input_1d_f32_persistent(@sess, d_model)
    blk.t_ln2_beta = TinyNN.tnn_input_1d_f32_persistent(@sess, d_model)

    # Per-head Q/K/V weights. Uploaded TRANSPOSED so ne=[d_model, d_head]
    # holds W.elem(r, c) = mat[r][c]. matmul(w_q_t, h) then yields
    # ne=[d_head, T] — same trick as FullForwardFFICache.
    #
    # Bias shapes:
    #   b_q / b_k  ne=[d_head, 1] — broadcasts against (d_head, T)
    #                               matmul result, the QK layout
    #   b_v        ne=[1, d_head] — broadcasts against (T, d_head)
    #                               matmul result, the transposed-V
    #                               layout (needed for head_out = v @ attn)
    # Both declarations are 2D under the hood; data is still a flat
    # length-d_head Array<Float>.
    blk.t_w_q = [TinyNN.tnn_input_2d_f32_persistent(@sess, d_head, d_model)]
    blk.t_w_k = [TinyNN.tnn_input_2d_f32_persistent(@sess, d_head, d_model)]
    blk.t_w_v = [TinyNN.tnn_input_2d_f32_persistent(@sess, d_head, d_model)]
    blk.t_b_q = [TinyNN.tnn_input_1d_f32_persistent(@sess, d_head)]
    blk.t_b_k = [TinyNN.tnn_input_1d_f32_persistent(@sess, d_head)]
    blk.t_b_v = [TinyNN.tnn_input_2d_f32_persistent(@sess, d_head, 1)]
    h = 1
    while h < n_heads
      blk.t_w_q.push(TinyNN.tnn_input_2d_f32_persistent(@sess, d_head, d_model))
      blk.t_w_k.push(TinyNN.tnn_input_2d_f32_persistent(@sess, d_head, d_model))
      blk.t_w_v.push(TinyNN.tnn_input_2d_f32_persistent(@sess, d_head, d_model))
      blk.t_b_q.push(TinyNN.tnn_input_1d_f32_persistent(@sess, d_head))
      blk.t_b_k.push(TinyNN.tnn_input_1d_f32_persistent(@sess, d_head))
      blk.t_b_v.push(TinyNN.tnn_input_2d_f32_persistent(@sess, d_head, 1))
      h = h + 1
    end

    blk.t_w_o = TinyNN.tnn_input_2d_f32_persistent(@sess, d_model, d_model)
    blk.t_b_o = TinyNN.tnn_input_1d_f32_persistent(@sess, d_model)
    blk.t_w_ff1 = TinyNN.tnn_input_2d_f32_persistent(@sess, d_ff, d_model)
    blk.t_b_ff1 = TinyNN.tnn_input_1d_f32_persistent(@sess, d_ff)
    blk.t_w_ff2 = TinyNN.tnn_input_2d_f32_persistent(@sess, d_model, d_ff)
    blk.t_b_ff2 = TinyNN.tnn_input_1d_f32_persistent(@sess, d_model)
    li = li + 1
  end

  TinyNN.tnn_finalize_weights(@sess)

  # === Compute input ===
  @t_token_ids = TinyNN.tnn_input_1d_i32(@sess, t_seq)

  # === Forward graph ===
  # x_embed = token_embed[ids] + pos_slice (ne=[d_model, T])
  t_tok_rows = TinyNN.tnn_get_rows(@sess, @t_token_embed, @t_token_ids)
  @t_x_embed = TinyNN.tnn_add(@sess, t_tok_rows, @t_pos_slice)
  TinyNN.tnn_set_output(@t_x_embed)

  eps = 1.0e-5
  scale = 1.0 / Math.sqrt(@d_head.to_f)

  # Chain the blocks: each block's output node is the next block's input.
  t_cur = @t_x_embed
  li = 0
  while li < n_layers
    t_cur = build_block(t_cur, @gpt2_blocks_ffi[li], eps, scale)
    li = li + 1
  end

  # Final LayerNorm.
  @t_x_final = TinyNN.tnn_layer_norm(@sess, t_cur, @t_ln_f_gamma, @t_ln_f_beta, eps)
  TinyNN.tnn_set_output(@t_x_final)

  # Tied unembed: logits = mul_mat(token_embed, x_final)  ne=[vocab, T]
  @t_logits = TinyNN.tnn_matmul(@sess, @t_token_embed, @t_x_final)
  TinyNN.tnn_set_output(@t_logits)

  TinyNN.tnn_realize(@sess, @t_logits)
  @realized = true
end