Class: GPT2KVFFICacheCuda

Inherits:

Object

Object
GPT2KVFFICacheCuda

show all

Defined in: lib/toy/llm/engine/gpt2_kv_engine_cuda.rb

Instance Attribute Summary collapse

#context_length ⇒ Object

Returns the value of attribute context_length.
#d_ff ⇒ Object

Returns the value of attribute d_ff.
#d_head ⇒ Object

Returns the value of attribute d_head.
#d_model ⇒ Object

Returns the value of attribute d_model.
#kv_blocks_ffi ⇒ Object

Returns the value of attribute kv_blocks_ffi.
#max_T ⇒ Object

Returns the value of attribute max_T.
#n_heads ⇒ Object

Returns the value of attribute n_heads.
#n_layers ⇒ Object

Returns the value of attribute n_layers.
#realized ⇒ Object

Returns the value of attribute realized.
#sess ⇒ Object

Returns the value of attribute sess.
#t_ln_f_beta ⇒ Object

Returns the value of attribute t_ln_f_beta.
#t_ln_f_gamma ⇒ Object

Returns the value of attribute t_ln_f_gamma.
#t_pos_embed ⇒ Object

Returns the value of attribute t_pos_embed.
#t_token_embed ⇒ Object

Returns the value of attribute t_token_embed.
#vocab_size ⇒ Object

Returns the value of attribute vocab_size.

Instance Method Summary collapse

#build_attention_head_step(t_h, blk, head_idx, pos, scale, bytes_d_head, bytes_max_T) ⇒ Object
#build_block_step(t_x, blk, pos, scale, eps, bytes_d_head, bytes_max_T) ⇒ Object
#build_decode_step(pos) ⇒ Object

Build the compute graph for one decode position.
#initialize ⇒ GPT2KVFFICacheCuda constructor

A new instance of GPT2KVFFICacheCuda.
#realize_for(max_T, d_model, d_ff, n_heads, n_layers, vocab_size, context_length) ⇒ Object

Declare all persistent tensors (weights + K/V buffers) and finalize the backend buffer.

Constructor Details

#initialize ⇒ `GPT2KVFFICacheCuda`

Returns a new instance of GPT2KVFFICacheCuda.

# File 'lib/toy/llm/engine/gpt2_kv_engine_cuda.rb', line 65

# Puts a fresh cache into its unrealized state: every dimension is zero,
# every session/tensor handle is TinyNNCuda's null pointer, and the block
# list holds a single GPT2KVBlockFFICuda. realize_for replaces all of
# these with real values.
def initialize
  @realized = false

  # Model dimensions; all zero until realize_for assigns them.
  @max_T = @d_model = @d_ff = @n_heads = @d_head = 0
  @n_layers = @vocab_size = @context_length = 0

  # Session and top-level tensor handles. Each one gets its own
  # tnn_null_ptr call rather than sharing a single returned value.
  @sess          = TinyNNCuda.tnn_null_ptr
  @t_token_embed = TinyNNCuda.tnn_null_ptr
  @t_pos_embed   = TinyNNCuda.tnn_null_ptr
  @t_ln_f_gamma  = TinyNNCuda.tnn_null_ptr
  @t_ln_f_beta   = TinyNNCuda.tnn_null_ptr

  # NOTE(review): presumably seeded with one block so the array is never
  # empty before realize_for runs — confirm before changing.
  @kv_blocks_ffi = [GPT2KVBlockFFICuda.new]
end

Instance Attribute Details

#context_length ⇒ `Object`

Returns the value of attribute context_length.



59
60
61

# File 'lib/toy/llm/engine/gpt2_kv_engine_cuda.rb', line 59

# Reader for @context_length: 0 after initialize, then the context_length
# argument given to realize_for (the first dimension passed when declaring
# the position-embedding tensor).
#
# @return [Integer]
def context_length = @context_length

#d_ff ⇒ `Object`

Returns the value of attribute d_ff.



59
60
61

# File 'lib/toy/llm/engine/gpt2_kv_engine_cuda.rb', line 59

# Reader for @d_ff: 0 after initialize, then the d_ff argument given to
# realize_for (used there to size the t_w_ff1/t_b_ff1/t_w_ff2 tensors).
#
# @return [Integer]
def d_ff = @d_ff

#d_head ⇒ `Object`

Returns the value of attribute d_head.



59
60
61

# File 'lib/toy/llm/engine/gpt2_kv_engine_cuda.rb', line 59

# Reader for @d_head: 0 after initialize, then d_model / n_heads as
# computed by realize_for.
#
# @return [Integer]
def d_head = @d_head

#d_model ⇒ `Object`

Returns the value of attribute d_model.



59
60
61

# File 'lib/toy/llm/engine/gpt2_kv_engine_cuda.rb', line 59

# Reader for @d_model: 0 after initialize, then the d_model argument
# given to realize_for.
#
# @return [Integer]
def d_model = @d_model

#kv_blocks_ffi ⇒ `Object`

Returns the value of attribute kv_blocks_ffi.



59
60
61

# File 'lib/toy/llm/engine/gpt2_kv_engine_cuda.rb', line 59

# Reader for @kv_blocks_ffi: the array of GPT2KVBlockFFICuda objects, one
# of which is indexed per layer by build_decode_step. It holds a single
# block after initialize and is rebuilt by realize_for.
#
# @return [Array<GPT2KVBlockFFICuda>]
def kv_blocks_ffi = @kv_blocks_ffi

#max_T ⇒ `Object`

Returns the value of attribute max_T.



59
60
61

# File 'lib/toy/llm/engine/gpt2_kv_engine_cuda.rb', line 59

# Reader for @max_T: 0 after initialize, then the max_T argument given to
# realize_for (used there as one dimension of every per-head K/V buffer).
#
# @return [Integer]
def max_T = @max_T

#n_heads ⇒ `Object`

Returns the value of attribute n_heads.



59
60
61

# File 'lib/toy/llm/engine/gpt2_kv_engine_cuda.rb', line 59

# Reader for @n_heads: 0 after initialize, then the n_heads argument
# given to realize_for.
#
# @return [Integer]
def n_heads = @n_heads

#n_layers ⇒ `Object`

Returns the value of attribute n_layers.



59
60
61

# File 'lib/toy/llm/engine/gpt2_kv_engine_cuda.rb', line 59

# Reader for @n_layers: 0 after initialize, then the n_layers argument
# given to realize_for.
#
# @return [Integer]
def n_layers = @n_layers

#realized ⇒ `Object`

Returns the value of attribute realized.



59
60
61

# File 'lib/toy/llm/engine/gpt2_kv_engine_cuda.rb', line 59

# Reader for @realized: false after initialize, set to true as the last
# step of realize_for.
#
# @return [Boolean]
def realized = @realized

#sess ⇒ `Object`

Returns the value of attribute sess.



59
60
61

# File 'lib/toy/llm/engine/gpt2_kv_engine_cuda.rb', line 59

# Reader for @sess: TinyNNCuda's null pointer after initialize, then the
# handle returned by TinyNNCuda.tnn_session_new in realize_for.
#
# @return [Object] session handle
def sess = @sess

#t_ln_f_beta ⇒ `Object`

Returns the value of attribute t_ln_f_beta.



59
60
61

# File 'lib/toy/llm/engine/gpt2_kv_engine_cuda.rb', line 59

# Reader for @t_ln_f_beta: the beta tensor handle passed to the final
# tnn_layer_norm in build_decode_step. Null pointer until realize_for
# declares it as a persistent 1-D f32 tensor of length d_model.
#
# @return [Object] tensor handle
def t_ln_f_beta = @t_ln_f_beta

#t_ln_f_gamma ⇒ `Object`

Returns the value of attribute t_ln_f_gamma.



59
60
61

# File 'lib/toy/llm/engine/gpt2_kv_engine_cuda.rb', line 59

# Reader for @t_ln_f_gamma: the gamma tensor handle passed to the final
# tnn_layer_norm in build_decode_step. Null pointer until realize_for
# declares it as a persistent 1-D f32 tensor of length d_model.
#
# @return [Object] tensor handle
def t_ln_f_gamma = @t_ln_f_gamma

#t_pos_embed ⇒ `Object`

Returns the value of attribute t_pos_embed.



59
60
61

# File 'lib/toy/llm/engine/gpt2_kv_engine_cuda.rb', line 59

# Reader for @t_pos_embed: the position-embedding tensor handle. Null
# pointer until realize_for declares it as a persistent 2-D f32 tensor
# with arguments (context_length, d_model).
#
# @return [Object] tensor handle
def t_pos_embed = @t_pos_embed

#t_token_embed ⇒ `Object`

Returns the value of attribute t_token_embed.



59
60
61

# File 'lib/toy/llm/engine/gpt2_kv_engine_cuda.rb', line 59

# Reader for @t_token_embed: the token-embedding tensor handle, which
# build_decode_step uses both for the input row lookup and for the final
# logits matmul. Null pointer until realize_for declares it as a
# persistent 2-D f32 tensor with arguments (vocab_size, d_model).
#
# @return [Object] tensor handle
def t_token_embed = @t_token_embed

#vocab_size ⇒ `Object`

Returns the value of attribute vocab_size.



59
60
61

# File 'lib/toy/llm/engine/gpt2_kv_engine_cuda.rb', line 59

# Reader for @vocab_size: 0 after initialize, then the vocab_size
# argument given to realize_for.
#
# @return [Integer]
def vocab_size = @vocab_size

Instance Method Details

#build_attention_head_step(t_h, blk, head_idx, pos, scale, bytes_d_head, bytes_max_T) ⇒ `Object`

# File 'lib/toy/llm/engine/gpt2_kv_engine_cuda.rb', line 224

# Appends one attention head's nodes for a single decode position to the
# session graph and returns the handle of that head's output.
#
# The freshly projected key/value vectors are copied into the head's
# persistent K/V buffers at +pos+, then attention is computed over
# positions 0..pos of those buffers.
#
# NOTE(review): +pos+ is not range-checked here; presumably callers keep
# it below max_T — confirm.
#
# @param t_h [Object] tensor handle fed to the Q/K/V projections
# @param blk [Object] block exposing per-head t_w_q/t_w_k/t_w_v,
#   t_b_q/t_b_k/t_b_v, t_K and t_V arrays
# @param head_idx [Integer] index into those per-head arrays
# @param pos [Integer] decode position being written and attended up to
# @param scale [Float] factor applied to the raw attention scores
# @param bytes_d_head [Integer] byte stride for views into t_K
# @param bytes_max_T [Integer] byte stride for views into t_V
# @return [Object] handle returned by the final tnn_matmul
def build_attention_head_step(t_h, blk, head_idx, pos, scale,
                               bytes_d_head, bytes_max_T)
  # Projections for the single new position. Q and K put the weight first;
  # V puts the input first, giving the transposed result noted below.
  query = TinyNNCuda.tnn_add(
    @sess,
    TinyNNCuda.tnn_matmul(@sess, blk.t_w_q[head_idx], t_h),   # ne=[d_head, 1]
    blk.t_b_q[head_idx]
  )
  key_new = TinyNNCuda.tnn_add(
    @sess,
    TinyNNCuda.tnn_matmul(@sess, blk.t_w_k[head_idx], t_h),
    blk.t_b_k[head_idx]
  )
  value_new = TinyNNCuda.tnn_add(
    @sess,
    TinyNNCuda.tnn_matmul(@sess, t_h, blk.t_w_v[head_idx]),   # ne=[1, d_head]
    blk.t_b_v[head_idx]
  )

  # Store key_new in K[pos] by copying into a one-slot view of the buffer.
  key_cache = blk.t_K[head_idx]
  key_write = TinyNNCuda.tnn_cpy(
    @sess, key_new,
    TinyNNCuda.tnn_view_2d(@sess, key_cache,
                           @d_head, 1, bytes_d_head, pos * bytes_d_head)
  )

  # Store value_new in V[:, pos]. The offset is pos * 4 bytes, i.e. one
  # 4-byte element per position (the buffers are declared as f32).
  value_cache = blk.t_V[head_idx]
  value_write = TinyNNCuda.tnn_cpy(
    @sess, value_new,
    TinyNNCuda.tnn_view_2d(@sess, value_cache,
                           1, @d_head, bytes_max_T, pos * 4)
  )

  # Neither copy feeds the head output, so both are registered with the
  # graph explicitly; otherwise nothing would make them run before the
  # attention matmuls read the K/V history.
  [key_write, value_write].each do |copy_node|
    TinyNNCuda.tnn_add_to_graph(@sess, copy_node)
  end

  # Views covering positions 0..pos of the K and V buffers.
  keys_so_far = TinyNNCuda.tnn_view_2d(@sess, blk.t_K[head_idx],
                                       @d_head, pos + 1, bytes_d_head, 0)
  values_so_far = TinyNNCuda.tnn_view_2d(@sess, blk.t_V[head_idx],
                                         pos + 1, @d_head, bytes_max_T, 0)

  # No causal mask is applied: the history views stop at pos, so only
  # valid past positions are visible.
  raw_scores = TinyNNCuda.tnn_matmul(@sess, keys_so_far, query)   # ne=[pos+1, 1]
  weights = TinyNNCuda.tnn_softmax(
    @sess,
    TinyNNCuda.tnn_scale(@sess, raw_scores, scale)
  )
  TinyNNCuda.tnn_matmul(@sess, values_so_far, weights)            # ne=[d_head, 1]
end

#build_block_step(t_x, blk, pos, scale, eps, bytes_d_head, bytes_max_T) ⇒ `Object`

# File 'lib/toy/llm/engine/gpt2_kv_engine_cuda.rb', line 190

# Appends one block's nodes for a single decode position to the session
# graph: layer norm, every attention head, head concatenation, output
# projection with residual add, then the feed-forward path (layer norm,
# matmul + bias, GELU, matmul + bias) with a second residual add.
#
# @param t_x [Object] tensor handle entering the block
# @param blk [Object] block exposing the t_ln1_*/t_ln2_* tensors, the
#   per-head tensors, and t_w_o/t_b_o/t_w_ff1/t_b_ff1/t_w_ff2/t_b_ff2
# @param pos [Integer] decode position, forwarded to each head
# @param scale [Float] attention score scale, forwarded to each head
# @param eps [Float] epsilon passed to both tnn_layer_norm calls
# @param bytes_d_head [Integer] forwarded to each head
# @param bytes_max_T [Integer] forwarded to each head
# @return [Object] handle of the block output (second residual add)
def build_block_step(t_x, blk, pos, scale, eps, bytes_d_head, bytes_max_T)
  normed = TinyNNCuda.tnn_layer_norm(@sess, t_x, blk.t_ln1_gamma, blk.t_ln1_beta, eps)

  # Head 0 is always built; heads 1...@n_heads follow in index order.
  # All heads are built before any concat node is created.
  head_outputs = [0, *(1...@n_heads)].map do |head_idx|
    build_attention_head_step(normed, blk, head_idx, pos, scale,
                              bytes_d_head, bytes_max_T)
  end

  # Chain tnn_concat left to right (last argument 0); with a single head
  # the lone output passes through without any concat call.
  merged = head_outputs.reduce do |joined, head_out|
    TinyNNCuda.tnn_concat(@sess, joined, head_out, 0)
  end

  # Output projection plus bias, then the first residual connection.
  projected = TinyNNCuda.tnn_add(
    @sess,
    TinyNNCuda.tnn_matmul(@sess, blk.t_w_o, merged),
    blk.t_b_o
  )
  after_attn = TinyNNCuda.tnn_add(@sess, t_x, projected)

  # Feed-forward path on the normalized attention result.
  normed_ff = TinyNNCuda.tnn_layer_norm(@sess, after_attn, blk.t_ln2_gamma, blk.t_ln2_beta, eps)
  expanded = TinyNNCuda.tnn_add(
    @sess,
    TinyNNCuda.tnn_matmul(@sess, blk.t_w_ff1, normed_ff),
    blk.t_b_ff1
  )
  activated = TinyNNCuda.tnn_gelu(@sess, expanded)
  contracted = TinyNNCuda.tnn_add(
    @sess,
    TinyNNCuda.tnn_matmul(@sess, blk.t_w_ff2, activated),
    blk.t_b_ff2
  )

  # Second residual connection is the block's result.
  TinyNNCuda.tnn_add(@sess, after_attn, contracted)
end

#build_decode_step(pos) ⇒ `Object`

Build the compute graph for one decode position. Returns the logits tensor handle. Caller calls tnn_compute then download_row_major.

# File 'lib/toy/llm/engine/gpt2_kv_engine_cuda.rb', line 157

# Builds the compute graph for one decode position and returns a
# GPT2KVStepResultCuda holding the token-id input handle and the logits
# handle. Per the page description, the caller then runs tnn_compute and
# downloads the logits.
#
# NOTE(review): +pos+ is not range-checked here; presumably callers keep
# it below both max_T and context_length — confirm.
#
# @param pos [Integer] decode position; selects the position-embedding
#   row and is forwarded to every block
# @return [GPT2KVStepResultCuda] built from (token-id input, logits)
def build_decode_step(pos)
  eps   = 1.0e-5
  scale = 1.0 / Math.sqrt(@d_head.to_f)

  # Byte strides; 4 bytes per element (the tensors are declared as f32).
  bytes_d_head  = @d_head * 4
  bytes_d_model = @d_model * 4
  bytes_max_T   = @max_T * 4

  # One-element i32 input that will carry the token id.
  token_input = TinyNNCuda.tnn_input_1d_i32(@sess, 1)

  # hidden = token_embed[token_id] + pos_embed[pos]
  token_row = TinyNNCuda.tnn_get_rows(@sess, @t_token_embed, token_input)  # ne=[d_model, 1]
  position_row = TinyNNCuda.tnn_view_2d(@sess, @t_pos_embed,
                                        @d_model, 1, bytes_d_model,
                                        pos * bytes_d_model)
  hidden = TinyNNCuda.tnn_add(@sess, token_row, position_row)

  # Thread the hidden state through blocks 0...@n_layers in order.
  hidden = (0...@n_layers).reduce(hidden) do |state, layer_idx|
    build_block_step(state, @kv_blocks_ffi[layer_idx], pos, scale, eps,
                     bytes_d_head, bytes_max_T)
  end

  # Final layer norm, then logits via a matmul against the token
  # embedding tensor itself.
  logits = TinyNNCuda.tnn_matmul(
    @sess,
    @t_token_embed,
    TinyNNCuda.tnn_layer_norm(@sess, hidden, @t_ln_f_gamma, @t_ln_f_beta, eps)
  )  # ne=[vocab, 1]
  TinyNNCuda.tnn_set_output(logits)
  GPT2KVStepResultCuda.new(token_input, logits)
end

#realize_for(max_T, d_model, d_ff, n_heads, n_layers, vocab_size, context_length) ⇒ `Object`

Declare all persistent tensors (weights + K/V buffers) and finalize the backend buffer. After this, weights can be uploaded; compute graphs are built per decode step via build_decode_step.

# File 'lib/toy/llm/engine/gpt2_kv_engine_cuda.rb', line 86

# Records the model dimensions, opens a TinyNNCuda session, declares every
# persistent tensor (embeddings, final layer-norm parameters, and per
# layer: layer-norm parameters, per-head Q/K/V weights and biases, per-head
# K/V buffers, output projection and feed-forward weights), then finalizes
# the weights and marks the cache as realized.
#
# Tensors are declared in a fixed order, and at least one block and one
# head per layer are always created.
#
# @param max_T [Integer] dimension used for every per-head K/V buffer
# @param d_model [Integer] model width
# @param d_ff [Integer] feed-forward width
# @param n_heads [Integer] heads per layer; d_head becomes d_model / n_heads
# @param n_layers [Integer] number of blocks to populate
# @param vocab_size [Integer] first dimension passed for the token-embedding tensor
# @param context_length [Integer] first dimension passed for the position-embedding tensor
# @return [Boolean] true (the value assigned to @realized)
def realize_for(max_T, d_model, d_ff, n_heads, n_layers,
                vocab_size, context_length)
  @max_T, @d_model, @d_ff, @n_heads = max_T, d_model, d_ff, n_heads
  @d_head = d_model / n_heads
  @n_layers, @vocab_size, @context_length = n_layers, vocab_size, context_length
  head_dim = @d_head

  @sess = TinyNNCuda.tnn_session_new(1)

  # Shorthand for the two persistent-tensor declarations used throughout.
  vec = ->(len) { TinyNNCuda.tnn_input_1d_f32_persistent(@sess, len) }
  mat = ->(dim_a, dim_b) { TinyNNCuda.tnn_input_2d_f32_persistent(@sess, dim_a, dim_b) }

  @t_token_embed = mat.call(vocab_size, d_model)
  @t_pos_embed   = mat.call(context_length, d_model)
  @t_ln_f_gamma  = vec.call(d_model)
  @t_ln_f_beta   = vec.call(d_model)

  # One block per layer, and never fewer than one block.
  @kv_blocks_ffi = Array.new([n_layers, 1].max) { GPT2KVBlockFFICuda.new }

  (0...n_layers).each do |layer_idx|
    blk = @kv_blocks_ffi[layer_idx]
    blk.t_ln1_gamma = vec.call(d_model)
    blk.t_ln1_beta  = vec.call(d_model)
    blk.t_ln2_gamma = vec.call(d_model)
    blk.t_ln2_beta  = vec.call(d_model)

    # Per-head weights, biases and KV buffers, declared head by head so
    # the eight tensors of one head stay adjacent in declaration order.
    blk.t_w_q = []
    blk.t_w_k = []
    blk.t_w_v = []
    blk.t_b_q = []
    blk.t_b_k = []
    blk.t_b_v = []
    blk.t_K   = []
    blk.t_V   = []
    # Head 0 is always declared; heads 1...n_heads follow.
    [0, *(1...n_heads)].each do |_head_idx|
      blk.t_w_q.push(mat.call(head_dim, d_model))
      blk.t_w_k.push(mat.call(head_dim, d_model))
      blk.t_w_v.push(mat.call(head_dim, d_model))
      blk.t_b_q.push(vec.call(head_dim))
      blk.t_b_k.push(vec.call(head_dim))
      blk.t_b_v.push(mat.call(head_dim, 1))   # ne=[1, d_head]
      # K: ne=[d_head, max_T]; V: ne=[max_T, d_head] (transposed layout).
      blk.t_K.push(mat.call(max_T, head_dim))
      blk.t_V.push(mat.call(head_dim, max_T))
    end

    blk.t_w_o   = mat.call(d_model, d_model)
    blk.t_b_o   = vec.call(d_model)
    blk.t_w_ff1 = mat.call(d_ff, d_model)
    blk.t_b_ff1 = vec.call(d_ff)
    blk.t_w_ff2 = mat.call(d_model, d_ff)
    blk.t_b_ff2 = vec.call(d_model)
  end

  TinyNNCuda.tnn_finalize_weights(@sess)
  @realized = true
end

Class: GPT2KVFFICacheCuda

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ GPT2KVFFICacheCuda

Instance Attribute Details

#context_length ⇒ Object

#d_ff ⇒ Object

#d_head ⇒ Object

#d_model ⇒ Object

#kv_blocks_ffi ⇒ Object

#max_T ⇒ Object

#n_heads ⇒ Object

#n_layers ⇒ Object

#realized ⇒ Object

#sess ⇒ Object

#t_ln_f_beta ⇒ Object

#t_ln_f_gamma ⇒ Object

#t_pos_embed ⇒ Object

#t_token_embed ⇒ Object

#vocab_size ⇒ Object