Class: GPT2FullForwardFFICacheCuda

Inherits: Object

Inheritance hierarchy: Object → GPT2FullForwardFFICacheCuda

Defined in: lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb

Instance Attribute Summary collapse

#d_ff ⇒ Object

Returns the value of attribute d_ff.
#d_head ⇒ Object

Returns the value of attribute d_head.
#d_model ⇒ Object

Returns the value of attribute d_model.
#gpt2_blocks_ffi ⇒ Object

Returns the value of attribute gpt2_blocks_ffi.
#n_heads ⇒ Object

Returns the value of attribute n_heads.
#n_layers ⇒ Object

Returns the value of attribute n_layers.
#realized ⇒ Object

Returns the value of attribute realized.
#sess ⇒ Object

Returns the value of attribute sess.
#t_ln_f_beta ⇒ Object

Returns the value of attribute t_ln_f_beta.
#t_ln_f_gamma ⇒ Object

Returns the value of attribute t_ln_f_gamma.
#t_logits ⇒ Object

Returns the value of attribute t_logits.
#t_pos_slice ⇒ Object

Returns the value of attribute t_pos_slice.
#t_seq ⇒ Object

Returns the value of attribute t_seq.
#t_token_embed ⇒ Object

Returns the value of attribute t_token_embed.
#t_token_ids ⇒ Object

Returns the value of attribute t_token_ids.
#t_x_embed ⇒ Object

Returns the value of attribute t_x_embed.
#t_x_final ⇒ Object

Returns the value of attribute t_x_final.
#vocab_size ⇒ Object

Returns the value of attribute vocab_size.

Instance Method Summary collapse

#build_attention_head(t_x, t_w_q, t_w_k, t_w_v, t_b_q, t_b_k, t_b_v, scale) ⇒ Object
#build_block(t_x, blk, eps, scale) ⇒ Object

Build one GPT-2 block’s graph nodes.
#initialize ⇒ GPT2FullForwardFFICacheCuda constructor

A new instance of GPT2FullForwardFFICacheCuda.
#realize_for(t_seq, d_model, d_ff, n_heads, n_layers, vocab_size) ⇒ Object

Allocate persistent ctx_w, declare all weights, build the compute graph.

Constructor Details

#initialize ⇒ `GPT2FullForwardFFICacheCuda`

Returns a new instance of GPT2FullForwardFFICacheCuda.

# File 'lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb', line 69

# Creates an empty, unrealized cache.
#
# Every tensor/session handle starts as TinyNNCuda.tnn_null_ptr, every model
# dimension starts at 0, and @realized is false. realize_for replaces all of
# these with real values.
def initialize
  # Session and graph handles: null until realize_for builds the graph.
  @sess          = TinyNNCuda.tnn_null_ptr
  @t_token_embed = TinyNNCuda.tnn_null_ptr
  @t_pos_slice   = TinyNNCuda.tnn_null_ptr
  @t_token_ids   = TinyNNCuda.tnn_null_ptr
  @t_ln_f_gamma  = TinyNNCuda.tnn_null_ptr
  @t_ln_f_beta   = TinyNNCuda.tnn_null_ptr
  @t_x_embed     = TinyNNCuda.tnn_null_ptr
  @t_x_final     = TinyNNCuda.tnn_null_ptr
  @t_logits      = TinyNNCuda.tnn_null_ptr

  # Model geometry: zeroed placeholders.
  @t_seq      = 0
  @d_model    = 0
  @d_ff       = 0
  @n_heads    = 0
  @d_head     = 0
  @n_layers   = 0
  @vocab_size = 0

  # One placeholder block so the array is never empty; realize_for replaces
  # the whole array.
  @gpt2_blocks_ffi = [GPT2BlockFFICuda.new]

  @realized = false
end

Instance Attribute Details

#d_ff ⇒ `Object`

Returns the value of attribute d_ff.



62
63
64

# File 'lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb', line 62

# Size used for the feed-forward weights (w_ff1 / b_ff1 / w_ff2).
# 0 after initialize; realize_for stores its d_ff argument here.
def d_ff
  @d_ff
end

#d_head ⇒ `Object`

Returns the value of attribute d_head.



62
63
64

# File 'lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb', line 62

# Per-head width. 0 after initialize; realize_for sets it to
# d_model / n_heads (no divisibility check is performed there).
def d_head
  @d_head
end

#d_model ⇒ `Object`

Returns the value of attribute d_model.



62
63
64

# File 'lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb', line 62

# Model width. 0 after initialize; realize_for stores its d_model argument.
def d_model
  @d_model
end

#gpt2_blocks_ffi ⇒ `Object`

Returns the value of attribute gpt2_blocks_ffi.



62
63
64

# File 'lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb', line 62

# Array of GPT2BlockFFICuda handle holders. initialize seeds it with one
# empty block; realize_for rebuilds it and fills the first n_layers entries
# with persistent weight handles.
def gpt2_blocks_ffi
  @gpt2_blocks_ffi
end

#n_heads ⇒ `Object`

Returns the value of attribute n_heads.



62
63
64

# File 'lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb', line 62

# Number of attention heads per block (loop bound in build_block).
# 0 after initialize; realize_for stores its n_heads argument.
def n_heads
  @n_heads
end

#n_layers ⇒ `Object`

Returns the value of attribute n_layers.



62
63
64

# File 'lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb', line 62

# Number of blocks chained by realize_for.
# 0 after initialize; realize_for stores its n_layers argument.
def n_layers
  @n_layers
end

#realized ⇒ `Object`

Returns the value of attribute realized.



62
63
64

# File 'lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb', line 62

# false after initialize; set to true as the last step of realize_for.
def realized
  @realized
end

#sess ⇒ `Object`

Returns the value of attribute sess.



62
63
64

# File 'lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb', line 62

# Session handle. TinyNNCuda.tnn_null_ptr after initialize; realize_for
# replaces it with the result of TinyNNCuda.tnn_session_new(1).
def sess
  @sess
end

#t_ln_f_beta ⇒ `Object`

Returns the value of attribute t_ln_f_beta.



62
63
64

# File 'lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb', line 62

# Handle of the final LayerNorm beta (persistent 1-D f32 of length d_model).
# Null pointer until realize_for runs.
def t_ln_f_beta
  @t_ln_f_beta
end

#t_ln_f_gamma ⇒ `Object`

Returns the value of attribute t_ln_f_gamma.



62
63
64

# File 'lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb', line 62

# Handle of the final LayerNorm gamma (persistent 1-D f32 of length d_model).
# Null pointer until realize_for runs.
def t_ln_f_gamma
  @t_ln_f_gamma
end

#t_logits ⇒ `Object`

Returns the value of attribute t_logits.



62
63
64

# File 'lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb', line 62

# Handle of the logits node: tnn_matmul(token_embed, x_final), marked as a
# graph output by realize_for. Null pointer until realize_for runs.
def t_logits
  @t_logits
end

#t_pos_slice ⇒ `Object`

Returns the value of attribute t_pos_slice.



62
63
64

# File 'lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb', line 62

# Handle of the positional-embedding tensor, declared by realize_for as a
# persistent 2-D f32 input with arguments (t_seq, d_model) and added to the
# gathered token embeddings. Null pointer until realize_for runs.
def t_pos_slice
  @t_pos_slice
end

#t_seq ⇒ `Object`

Returns the value of attribute t_seq.



62
63
64

# File 'lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb', line 62

# Sequence length the graph was built for.
# 0 after initialize; realize_for stores its t_seq argument.
def t_seq
  @t_seq
end

#t_token_embed ⇒ `Object`

Returns the value of attribute t_token_embed.



62
63
64

# File 'lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb', line 62

# Handle of the token-embedding table, declared by realize_for as a
# persistent 2-D f32 input with arguments (vocab_size, d_model). It is used
# both for the row lookup and for the tied unembedding matmul.
# Null pointer until realize_for runs.
def t_token_embed
  @t_token_embed
end

#t_token_ids ⇒ `Object`

Returns the value of attribute t_token_ids.



62
63
64

# File 'lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb', line 62

# Handle of the 1-D i32 token-id input of length t_seq, the graph's compute
# input. Null pointer until realize_for runs.
def t_token_ids
  @t_token_ids
end

#t_x_embed ⇒ `Object`

Returns the value of attribute t_x_embed.



62
63
64

# File 'lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb', line 62

# Handle of the embedding sum (gathered token rows + positional slice),
# marked as a graph output by realize_for. Null pointer until then.
def t_x_embed
  @t_x_embed
end

#t_x_final ⇒ `Object`

Returns the value of attribute t_x_final.



62
63
64

# File 'lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb', line 62

# Handle of the final LayerNorm output, marked as a graph output by
# realize_for. Null pointer until then.
def t_x_final
  @t_x_final
end

#vocab_size ⇒ `Object`

Returns the value of attribute vocab_size.



62
63
64

# File 'lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb', line 62

# Vocabulary size. 0 after initialize; realize_for stores its vocab_size
# argument.
def vocab_size
  @vocab_size
end

Instance Method Details

#build_attention_head(t_x, t_w_q, t_w_k, t_w_v, t_b_q, t_b_k, t_b_v, scale) ⇒ `Object`

# File 'lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb', line 253

# Adds the graph nodes for a single causal self-attention head to @sess and
# returns the handle of the head's output node.
#
# t_x                 - input activations shared by all heads
# t_w_q/t_w_k/t_w_v   - this head's projection weight handles
# t_b_q/t_b_k/t_b_v   - this head's projection bias handles
# scale               - factor applied to the raw attention scores
#
# Layout notes (ne = tensor extents as annotated by the graph builder):
#   q and k come out as ne=[d_head, T]; v is produced with the operands
#   swapped so it is ne=[T, d_head], which makes the final matmul's shared
#   dimension line up. The result is ne=[d_head, T_query].
def build_attention_head(t_x, t_w_q, t_w_k, t_w_v, t_b_q, t_b_k, t_b_v, scale)
  # Projections: weight matmul followed by the bias add.
  query = TinyNNCuda.tnn_add(@sess, TinyNNCuda.tnn_matmul(@sess, t_w_q, t_x), t_b_q)
  key   = TinyNNCuda.tnn_add(@sess, TinyNNCuda.tnn_matmul(@sess, t_w_k, t_x), t_b_k)
  # Note the swapped operand order for v (transposed layout).
  value = TinyNNCuda.tnn_add(@sess, TinyNNCuda.tnn_matmul(@sess, t_x, t_w_v), t_b_v)

  # Scaled scores -> causal mask -> softmax.
  scaled_scores = TinyNNCuda.tnn_scale(@sess, TinyNNCuda.tnn_matmul(@sess, key, query), scale)
  # NOTE(review): the trailing 0 is presumably the mask's past-offset
  # argument — confirm against the TinyNNCuda binding.
  causal_scores = TinyNNCuda.tnn_diag_mask_inf(@sess, scaled_scores, 0)
  attn_weights  = TinyNNCuda.tnn_softmax(@sess, causal_scores)

  # Weighted sum of values.
  TinyNNCuda.tnn_matmul(@sess, value, attn_weights)
end

#build_block(t_x, blk, eps, scale) ⇒ `Object`

Build one GPT-2 block’s graph nodes.

h1 = LayerNorm(x, ln1_gamma, ln1_beta)
per head h:
  q_h = mul_mat(w_q_t_h, h1) + b_q_h  ne=[d_head, T]
  k_h = mul_mat(w_k_t_h, h1) + b_k_h
  v_h = mul_mat(h1, w_v_t_h) + b_v_h  ne=[T, d_head] (transposed)
  scores_h = mul_mat(k_h, q_h)
  attn_h   = softmax(causal_mask(scale(scores_h)))
  head_out = mul_mat(v_h, attn_h)
concat_h = concat along ne0 (d_head -> d_model)
x_attn = x + (mul_mat(w_o_t, concat) + b_o)
h2 = LayerNorm(x_attn, ln2_gamma, ln2_beta)
ff_up = mul_mat(w_ff1_t, h2) + b_ff1
ff_g  = gelu(ff_up)
ff_dn = mul_mat(w_ff2_t, ff_g) + b_ff2
x_out = x_attn + ff_dn

# File 'lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb', line 214

# Adds the graph nodes for one GPT-2 block to @sess and returns the handle of
# the block's output node.
#
# t_x   - block input handle
# blk   - object exposing this block's weight handles (t_ln1_gamma, t_w_q[],
#         t_w_o, t_w_ff1, ...)
# eps   - LayerNorm epsilon
# scale - attention score scale, forwarded to build_attention_head
#
# Structure: LayerNorm -> @n_heads attention heads -> concat along dim 0 ->
# output projection + bias -> residual add -> LayerNorm -> FFN
# (matmul + bias, GELU, matmul + bias) -> residual add.
def build_block(t_x, blk, eps, scale)
  attn_in = TinyNNCuda.tnn_layer_norm(@sess, t_x, blk.t_ln1_gamma, blk.t_ln1_beta, eps)

  # Head 0 is always built and seeds the array; the remaining heads are pushed.
  first_head = build_attention_head(attn_in, blk.t_w_q[0], blk.t_w_k[0], blk.t_w_v[0],
                                    blk.t_b_q[0], blk.t_b_k[0], blk.t_b_v[0], scale)
  head_outputs = [first_head]
  idx = 1
  while idx < @n_heads
    head_outputs.push(build_attention_head(attn_in,
                                           blk.t_w_q[idx], blk.t_w_k[idx], blk.t_w_v[idx],
                                           blk.t_b_q[idx], blk.t_b_k[idx], blk.t_b_v[idx],
                                           scale))
    idx = idx + 1
  end

  # Left-fold the head outputs together along dim 0 (d_head -> d_model).
  merged = head_outputs[0]
  idx = 1
  while idx < @n_heads
    merged = TinyNNCuda.tnn_concat(@sess, merged, head_outputs[idx], 0)
    idx = idx + 1
  end

  # Attention output projection (+ bias), then the first residual connection.
  projected  = TinyNNCuda.tnn_add(@sess, TinyNNCuda.tnn_matmul(@sess, blk.t_w_o, merged), blk.t_b_o)
  after_attn = TinyNNCuda.tnn_add(@sess, t_x, projected)

  # Feed-forward sub-layer on the normalized residual stream.
  ffn_in   = TinyNNCuda.tnn_layer_norm(@sess, after_attn, blk.t_ln2_gamma, blk.t_ln2_beta, eps)
  ffn_up   = TinyNNCuda.tnn_add(@sess, TinyNNCuda.tnn_matmul(@sess, blk.t_w_ff1, ffn_in), blk.t_b_ff1)
  ffn_act  = TinyNNCuda.tnn_gelu(@sess, ffn_up)
  ffn_down = TinyNNCuda.tnn_add(@sess, TinyNNCuda.tnn_matmul(@sess, blk.t_w_ff2, ffn_act), blk.t_b_ff2)

  # Second residual connection is the block output.
  TinyNNCuda.tnn_add(@sess, after_attn, ffn_down)
end

#realize_for(t_seq, d_model, d_ff, n_heads, n_layers, vocab_size) ⇒ `Object`

Allocate persistent ctx_w, declare all weights, build the compute graph. After this, only token_ids changes per call. Call once per T_SEQ choice; rebuild for a different T_SEQ.

# File 'lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb', line 93

# Creates the session, declares every persistent weight tensor, and builds
# the whole forward graph for one fixed sequence length. Afterwards only the
# token-id input changes between runs; build again for a different t_seq.
#
# t_seq      - sequence length the graph is built for
# d_model    - model width
# d_ff       - feed-forward width
# n_heads    - attention heads per block (@d_head = d_model / n_heads)
# n_layers   - number of blocks to declare and chain
# vocab_size - first size argument of the token-embedding declaration
#
# Stores all handles in instance variables and returns true (the value of
# the final @realized assignment).
#
# NOTE(review): d_model / n_heads is not checked for divisibility, and with
# Integer arguments n_heads == 0 raises ZeroDivisionError — confirm callers
# validate.
# NOTE(review): a repeated call overwrites @sess without releasing the
# previous session in this method — confirm whether the caller frees it.
def realize_for(t_seq, d_model, d_ff, n_heads, n_layers, vocab_size)
  # Record the model geometry; the per-head width is derived.
  @t_seq      = t_seq
  @vocab_size = vocab_size
  @n_layers   = n_layers
  @n_heads    = n_heads
  @d_model    = d_model
  @d_ff       = d_ff
  @d_head     = d_model / n_heads

  @sess = TinyNNCuda.tnn_session_new(1)

  # --- Persistent weights (ctx_w): model-level tensors ---
  @t_token_embed = TinyNNCuda.tnn_input_2d_f32_persistent(@sess, vocab_size, d_model)
  @t_pos_slice   = TinyNNCuda.tnn_input_2d_f32_persistent(@sess, t_seq, d_model)
  @t_ln_f_gamma  = TinyNNCuda.tnn_input_1d_f32_persistent(@sess, d_model)
  @t_ln_f_beta   = TinyNNCuda.tnn_input_1d_f32_persistent(@sess, d_model)

  # Block handle holders. The array literal is seeded with one element and
  # the rest are pushed so Spinel infers Array<GPT2BlockFFICuda>.
  @gpt2_blocks_ffi = [GPT2BlockFFICuda.new]
  layer = 1
  while layer < n_layers
    @gpt2_blocks_ffi.push(GPT2BlockFFICuda.new)
    layer = layer + 1
  end

  # --- Persistent weights (ctx_w): per-block tensors ---
  layer = 0
  while layer < n_layers
    cur_blk = @gpt2_blocks_ffi[layer]
    cur_blk.t_ln1_gamma = TinyNNCuda.tnn_input_1d_f32_persistent(@sess, d_model)
    cur_blk.t_ln1_beta  = TinyNNCuda.tnn_input_1d_f32_persistent(@sess, d_model)
    cur_blk.t_ln2_gamma = TinyNNCuda.tnn_input_1d_f32_persistent(@sess, d_model)
    cur_blk.t_ln2_beta  = TinyNNCuda.tnn_input_1d_f32_persistent(@sess, d_model)

    # Per-head Q/K/V weights are uploaded TRANSPOSED, so ne=[d_model, d_head]
    # holds W.elem(r, c) = mat[r][c] and matmul(w_q_t, h) yields
    # ne=[d_head, T] (same trick as FullForwardFFICache).
    #
    # Bias layouts:
    #   b_q / b_k : ne=[d_head, 1], broadcast over the (d_head, T) Q/K result
    #   b_v       : ne=[1, d_head], broadcast over the (T, d_head)
    #               transposed-V result needed for head_out = v @ attn
    # In every case the uploaded data is a flat Array<Float> of length d_head.
    #
    # Head 0 seeds each array; the remaining heads are pushed.
    cur_blk.t_w_q = [TinyNNCuda.tnn_input_2d_f32_persistent(@sess, @d_head, d_model)]
    cur_blk.t_w_k = [TinyNNCuda.tnn_input_2d_f32_persistent(@sess, @d_head, d_model)]
    cur_blk.t_w_v = [TinyNNCuda.tnn_input_2d_f32_persistent(@sess, @d_head, d_model)]
    cur_blk.t_b_q = [TinyNNCuda.tnn_input_1d_f32_persistent(@sess, @d_head)]
    cur_blk.t_b_k = [TinyNNCuda.tnn_input_1d_f32_persistent(@sess, @d_head)]
    cur_blk.t_b_v = [TinyNNCuda.tnn_input_2d_f32_persistent(@sess, @d_head, 1)]
    head = 1
    while head < n_heads
      cur_blk.t_w_q.push(TinyNNCuda.tnn_input_2d_f32_persistent(@sess, @d_head, d_model))
      cur_blk.t_w_k.push(TinyNNCuda.tnn_input_2d_f32_persistent(@sess, @d_head, d_model))
      cur_blk.t_w_v.push(TinyNNCuda.tnn_input_2d_f32_persistent(@sess, @d_head, d_model))
      cur_blk.t_b_q.push(TinyNNCuda.tnn_input_1d_f32_persistent(@sess, @d_head))
      cur_blk.t_b_k.push(TinyNNCuda.tnn_input_1d_f32_persistent(@sess, @d_head))
      cur_blk.t_b_v.push(TinyNNCuda.tnn_input_2d_f32_persistent(@sess, @d_head, 1))
      head = head + 1
    end

    # Attention output projection and the two feed-forward layers.
    cur_blk.t_w_o   = TinyNNCuda.tnn_input_2d_f32_persistent(@sess, d_model, d_model)
    cur_blk.t_b_o   = TinyNNCuda.tnn_input_1d_f32_persistent(@sess, d_model)
    cur_blk.t_w_ff1 = TinyNNCuda.tnn_input_2d_f32_persistent(@sess, d_ff, d_model)
    cur_blk.t_b_ff1 = TinyNNCuda.tnn_input_1d_f32_persistent(@sess, d_ff)
    cur_blk.t_w_ff2 = TinyNNCuda.tnn_input_2d_f32_persistent(@sess, d_model, d_ff)
    cur_blk.t_b_ff2 = TinyNNCuda.tnn_input_1d_f32_persistent(@sess, d_model)
    layer = layer + 1
  end

  # All weight declarations are done.
  TinyNNCuda.tnn_finalize_weights(@sess)

  # --- Compute input: the token ids for one sequence ---
  @t_token_ids = TinyNNCuda.tnn_input_1d_i32(@sess, t_seq)

  # --- Forward graph ---
  # x_embed = token_embed[ids] + pos_slice  (ne=[d_model, T])
  @t_x_embed = TinyNNCuda.tnn_add(@sess,
                                  TinyNNCuda.tnn_get_rows(@sess, @t_token_embed, @t_token_ids),
                                  @t_pos_slice)
  TinyNNCuda.tnn_set_output(@t_x_embed)

  eps = 1.0e-5
  scale = 1.0 / Math.sqrt(@d_head.to_f)

  # Chain the blocks: each one consumes the previous block's output.
  t_stream = @t_x_embed
  layer = 0
  while layer < n_layers
    t_stream = build_block(t_stream, @gpt2_blocks_ffi[layer], eps, scale)
    layer = layer + 1
  end

  # Final LayerNorm over the last block's output.
  @t_x_final = TinyNNCuda.tnn_layer_norm(@sess, t_stream, @t_ln_f_gamma, @t_ln_f_beta, eps)
  TinyNNCuda.tnn_set_output(@t_x_final)

  # Tied unembedding: logits = mul_mat(token_embed, x_final)  ne=[vocab, T]
  @t_logits = TinyNNCuda.tnn_matmul(@sess, @t_token_embed, @t_x_final)
  TinyNNCuda.tnn_set_output(@t_logits)

  TinyNNCuda.tnn_realize(@sess, @t_logits)
  @realized = true
end

Class: GPT2FullForwardFFICacheCuda

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ GPT2FullForwardFFICacheCuda

Instance Attribute Details

#d_ff ⇒ Object

#d_head ⇒ Object

#d_model ⇒ Object

#gpt2_blocks_ffi ⇒ Object

#n_heads ⇒ Object

#n_layers ⇒ Object

#realized ⇒ Object

#sess ⇒ Object

#t_ln_f_beta ⇒ Object

#t_ln_f_gamma ⇒ Object

#t_logits ⇒ Object

#t_pos_slice ⇒ Object

#t_seq ⇒ Object

#t_token_embed ⇒ Object

#t_token_ids ⇒ Object

#t_x_embed ⇒ Object

#t_x_final ⇒ Object

#vocab_size ⇒ Object