Module: SmolLM2KV

Defined in:
lib/toy/llm/engine/llama_kv_engine.rb

Class Method Summary collapse

Class Method Details

.decode_step(kv_cache, token_id, pos) ⇒ Object

Decode one new token at position `pos`. Returns the (1, vocab) logits Mat for the new position. If `kv_cache.trace_on` is set, the rebuild path inserts taps and we dump stats before reading logits.



1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
# File 'lib/toy/llm/engine/llama_kv_engine.rb', line 1583

# Runs a single decode step for `token_id` at sequence position `pos`.
#
# The session is reset for a rebuild, a fresh decode-step graph is built for
# `pos`, its logits tensor is realized, the token id and position are uploaded
# as one-element integer arrays, and the session is computed.
#
# Returns whatever TinyNN.download_row_major yields for the logits tensor when
# asked for 1 row by `kv_cache.vocab_size` columns.
def self.decode_step(kv_cache, token_id, pos)
  # The reset must precede the build, so the session is read directly here.
  TinyNN.tnn_reset_for_rebuild(kv_cache.sess)
  graph = kv_cache.build_decode_step(pos)

  # NOTE(review): assumes the session handle stays the same for the rest of
  # the step once the graph has been built — confirm.
  session = kv_cache.sess
  logits  = graph.kv_step_logits

  TinyNN.tnn_realize(session, logits)
  TinyNN.upload_int_array(session, graph.t_token_id, [token_id])
  TinyNN.upload_int_array(session, graph.t_pos,      [pos])
  TinyNN.tnn_compute(session)

  # Called unconditionally; presumably dump_trace itself checks whether
  # tracing is enabled — verify against its definition.
  kv_cache.dump_trace

  TinyNN.download_row_major(session, logits, 1, kv_cache.vocab_size)
end

.upload_from(kv_cache, model) ⇒ Object

Upload all Toy::SmolLM2 weights into a realized cache (+ zero-init the K/V buffers).



1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
# File 'lib/toy/llm/engine/llama_kv_engine.rb', line 1516

# Copies the weights held by `model` into the tensors owned by `kv_cache`,
# and writes a freshly constructed Mat into each per-head K/V cache buffer
# whose storage type is not 8.
#
# Upload order: token embedding, final-norm gamma, optional untied output
# projection, then for each layer: both norm gammas, per-query-head Q weights
# (+ optional bias), per-KV-head K/V weights (+ optional biases and cache
# seeding), and finally the attention output and FFN gate/up/down matrices.
#
# Returns nil (the value of the closing `while` loop).
def self.upload_from(kv_cache, model)
  session     = kv_cache.sess
  layer_count = kv_cache.n_layers
  q_heads     = kv_cache.n_heads
  kv_heads    = kv_cache.n_kv
  model_dim   = kv_cache.d_model
  head_dim    = kv_cache.d_head
  cache_len   = kv_cache.max_T

  # These flags do not depend on the layer or head, so read them once.
  with_bias = kv_cache.has_qkv_bias
  # P5.1+P5.2: same Q8 skip rule as realize_for_mmap. Type 8 presumably
  # denotes a Q8 K/V buffer — confirm against the kv_type definitions.
  seed_k = kv_cache.kv_type_k != 8
  seed_v = kv_cache.kv_type_v != 8

  # --- model-wide tensors ---
  TinyNN.upload_row_major(session, kv_cache.t_token_embed, model.token_embed.weight)
  TinyNN.tnn_upload_from_float_array(session, kv_cache.t_final_norm_gamma,
                                      model.final_norm.gamma, model_dim)
  if kv_cache.has_untied_output
    TinyNN.upload_row_major(session, kv_cache.t_output, model.output_proj)
  end

  # P5.2: K and V share the layout ne=[d_head, max_T], so a single Mat seeds
  # both. NOTE(review): assumes Mat.new yields an all-zero matrix — confirm.
  blank_kv = Mat.new(cache_len, head_dim)

  layer = 0
  while layer < layer_count
    src  = model.stack[layer]
    dst  = kv_cache.kv_blocks_ffi[layer]
    attn = src.attn

    # The two per-layer norm gammas.
    TinyNN.tnn_upload_from_float_array(session, dst.t_rn1_gamma, src.rn1.gamma, model_dim)
    TinyNN.tnn_upload_from_float_array(session, dst.t_rn2_gamma, src.rn2.gamma, model_dim)

    # Query projections: one weight (and optional bias) per query head.
    q = 0
    while q < q_heads
      TinyNN.stage_transposed_and_upload(session, dst.t_w_q[q], attn.w_q[q])
      if with_bias
        TinyNN.tnn_upload_from_float_array(session, dst.t_b_q[q], attn.b_q[q], head_dim)
      end
      q = q + 1
    end

    # Key/value projections and cache buffers: one set per KV head.
    kv = 0
    while kv < kv_heads
      TinyNN.stage_transposed_and_upload(session, dst.t_w_k[kv], attn.w_k[kv])
      TinyNN.stage_transposed_and_upload(session, dst.t_w_v[kv], attn.w_v[kv])
      if with_bias
        TinyNN.tnn_upload_from_float_array(session, dst.t_b_k[kv], attn.b_k[kv], head_dim)
        TinyNN.tnn_upload_from_float_array(session, dst.t_b_v[kv], attn.b_v[kv], head_dim)
      end
      if seed_k
        TinyNN.upload_row_major(session, dst.t_K[kv], blank_kv)
      end
      if seed_v
        TinyNN.upload_row_major(session, dst.t_V[kv], blank_kv)
      end
      kv = kv + 1
    end

    # Attention output projection, then the three FFN matrices.
    ffn = src.ffn
    TinyNN.stage_transposed_and_upload(session, dst.t_w_o,    attn.w_o)
    TinyNN.stage_transposed_and_upload(session, dst.t_w_gate, ffn.w_gate)
    TinyNN.stage_transposed_and_upload(session, dst.t_w_up,   ffn.w_up)
    TinyNN.stage_transposed_and_upload(session, dst.t_w_down, ffn.w_down)

    layer = layer + 1
  end
end