Module: SmolLM2KV
- Defined in:
- lib/toy/llm/engine/llama_kv_engine.rb
Class Method Summary collapse
-
.decode_step(kv_cache, token_id, pos) ⇒ Object
Decode one new token at position `pos`.
-
.upload_from(kv_cache, model) ⇒ Object
Upload all Toy::SmolLM2 weights into a realized cache (+ zero-init the K/V buffers).
Class Method Details
.decode_step(kv_cache, token_id, pos) ⇒ Object
Decode one new token at position `pos`. Returns the (1, vocab) logits Mat for the new position. If `kv_cache.trace_on` is set, the rebuild path inserts taps and we dump stats before reading logits.
1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 |
# File 'lib/toy/llm/engine/llama_kv_engine.rb', line 1583
#
# Run one decode step for `token_id` at sequence position `pos`.
#
# kv_cache - cache object supplying the session (`sess`), the per-position
#            step builder (`build_decode_step`), `dump_trace` and `vocab_size`.
# token_id - id of the token to feed; uploaded as a one-element int array.
# pos      - position of the new token; used to build the step and also
#            uploaded as a one-element int array.
#
# Returns the result of TinyNN.download_row_major for a 1 x vocab_size read
# of the step's logits tensor.
def self.decode_step(kv_cache, token_id, pos)
  sess = kv_cache.sess

  # Reset the session for a rebuild, build the step for this position and
  # realize its logits tensor before any input is uploaded.
  TinyNN.tnn_reset_for_rebuild(sess)
  step = kv_cache.build_decode_step(pos)
  logits_t = step.kv_step_logits
  TinyNN.tnn_realize(sess, logits_t)

  # The two step inputs: the token id and its position.
  TinyNN.upload_int_array(sess, step.t_token_id, [token_id])
  TinyNN.upload_int_array(sess, step.t_pos, [pos])

  TinyNN.tnn_compute(sess)

  # Called on every step, after compute and before the logits are read back.
  # NOTE(review): presumably a no-op unless kv_cache.trace_on is set -- confirm
  # in dump_trace.
  kv_cache.dump_trace

  TinyNN.download_row_major(sess, logits_t, 1, kv_cache.vocab_size)
end
.upload_from(kv_cache, model) ⇒ Object
Upload all Toy::SmolLM2 weights into a realized cache (+ zero-init the K/V buffers).
1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 |
# File 'lib/toy/llm/engine/llama_kv_engine.rb', line 1516
#
# Upload all Toy::SmolLM2 weights into a realized cache (+ zero-init the
# K/V buffers).
#
# kv_cache - destination cache. Read here for: sess, n_layers, n_heads, n_kv,
#            d_model, d_head, max_T, the has_untied_output / has_qkv_bias
#            flags, kv_type_k / kv_type_v, the top-level tensor handles and
#            the per-layer handles in kv_blocks_ffi.
# model    - weight source. Read here for: final_norm.gamma, output_proj and
#            the per-layer blocks in model.stack (rn1, rn2, attn, ffn).
#
# The method is used for its upload side effects; its last expression is the
# layer `while` loop, so no meaningful value is returned.
def self.upload_from(kv_cache, model)
  sess = kv_cache.sess
  n = kv_cache.n_layers
  n_heads = kv_cache.n_heads
  n_kv = kv_cache.n_kv
  d_model = kv_cache.d_model
  d_head = kv_cache.d_head
  max_T = kv_cache.max_T
  # NOTE(review): this line is damaged in this listing -- the attribute name
  # after `kv_cache.` and the one between `model.` and `.weight` are missing.
  # Restore both from the real source file before relying on this code.
  TinyNN.upload_row_major(sess, kv_cache., model..weight)
  # Final norm gamma: d_model floats.
  TinyNN.tnn_upload_from_float_array(sess, kv_cache.t_final_norm_gamma, model.final_norm.gamma, d_model)
  # The output projection is uploaded only when the cache reports an untied output.
  if kv_cache.has_untied_output
    TinyNN.upload_row_major(sess, kv_cache.t_output, model.output_proj)
  end
  # P5.2: K and V share the same layout ne=[d_head, max_T] now,
  # so they share the same zero-init Mat.
  # NOTE(review): assumes Mat.new(rows, cols) is zero-filled -- confirm.
  kv_zero = Mat.new(max_T, d_head)
  # Per-layer uploads: blk_n is the model-side block, blk_f the cache-side
  # tensor handles for the same layer index.
  li = 0
  while li < n
    blk_n = model.stack[li]
    blk_f = kv_cache.kv_blocks_ffi[li]
    # The two per-layer norm gammas (rn1, rn2), d_model floats each.
    TinyNN.tnn_upload_from_float_array(sess, blk_f.t_rn1_gamma, blk_n.rn1.gamma, d_model)
    TinyNN.tnn_upload_from_float_array(sess, blk_f.t_rn2_gamma, blk_n.rn2.gamma, d_model)
    # Query heads: one weight (and optional bias of d_head floats) per head.
    hq = 0
    while hq < n_heads
      TinyNN.stage_transposed_and_upload(sess, blk_f.t_w_q[hq], blk_n.attn.w_q[hq])
      if kv_cache.has_qkv_bias
        TinyNN.tnn_upload_from_float_array(sess, blk_f.t_b_q[hq], blk_n.attn.b_q[hq], d_head)
      end
      hq = hq + 1
    end
    # Key/value heads: iterated over n_kv, separately from the n_heads query heads.
    hkv = 0
    while hkv < n_kv
      TinyNN.stage_transposed_and_upload(sess, blk_f.t_w_k[hkv], blk_n.attn.w_k[hkv])
      TinyNN.stage_transposed_and_upload(sess, blk_f.t_w_v[hkv], blk_n.attn.w_v[hkv])
      if kv_cache.has_qkv_bias
        TinyNN.tnn_upload_from_float_array(sess, blk_f.t_b_k[hkv], blk_n.attn.b_k[hkv], d_head)
        TinyNN.tnn_upload_from_float_array(sess, blk_f.t_b_v[hkv], blk_n.attn.b_v[hkv], d_head)
      end
      # P5.1+P5.2: same Q8 skip rule as realize_for_mmap.
      # K and V buffers are zero-filled from kv_zero unless their type code is 8.
      # NOTE(review): 8 presumably denotes the Q8 cache type -- confirm.
      if kv_cache.kv_type_k != 8
        TinyNN.upload_row_major(sess, blk_f.t_K[hkv], kv_zero)
      end
      if kv_cache.kv_type_v != 8
        TinyNN.upload_row_major(sess, blk_f.t_V[hkv], kv_zero)
      end
      hkv = hkv + 1
    end
    # Attention output projection and the three FFN matrices for this layer.
    TinyNN.stage_transposed_and_upload(sess, blk_f.t_w_o, blk_n.attn.w_o)
    TinyNN.stage_transposed_and_upload(sess, blk_f.t_w_gate, blk_n.ffn.w_gate)
    TinyNN.stage_transposed_and_upload(sess, blk_f.t_w_up, blk_n.ffn.w_up)
    TinyNN.stage_transposed_and_upload(sess, blk_f.t_w_down, blk_n.ffn.w_down)
    li = li + 1
  end
end