Module: SmolLM2KVCuda
- Defined in:
- lib/toy/llm/engine/llama_kv_engine_cuda.rb
Class Method Summary collapse
-
.decode_step(kv_cache, token_id, pos) ⇒ Object
Decode one new token at position `pos`.
-
.upload_from(kv_cache, model) ⇒ Object
Upload all Toy::SmolLM2 weights into a realized cache (+ zero-init the K/V buffers).
Class Method Details
.decode_step(kv_cache, token_id, pos) ⇒ Object
Decode one new token at position `pos`. Returns the (1, vocab) logits Mat for the new position. If `kv_cache.trace_on` is set, the rebuild path inserts taps and stats are dumped before the logits are read.
1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 |
# File 'lib/toy/llm/engine/llama_kv_engine_cuda.rb', line 1516

# Runs one decode step for a single token on the session owned by +kv_cache+.
#
# The step graph is rebuilt on every call: the session is reset, a fresh
# step is built for +pos+, its logits tensor is realized, the two integer
# inputs are uploaded, and the graph is computed before the logits are
# downloaded.
#
# @param kv_cache cache object supplying +sess+, +build_decode_step+,
#   +dump_trace+ and +vocab_size+
# @param token_id id of the token fed in at this step
# @param pos position of that token
# @return whatever TinyNNCuda.download_row_major returns for the step's
#   logits tensor, requested as 1 row by +kv_cache.vocab_size+ columns
def self.decode_step(kv_cache, token_id, pos)
  # Reset first, then rebuild and realize the graph for this position.
  TinyNNCuda.tnn_reset_for_rebuild(kv_cache.sess)
  decode_graph = kv_cache.build_decode_step(pos)
  TinyNNCuda.tnn_realize(kv_cache.sess, decode_graph.kv_step_logits)

  # Both inputs go up as one-element integer arrays.
  TinyNNCuda.upload_int_array(kv_cache.sess, decode_graph.t_token_id, [token_id])
  TinyNNCuda.upload_int_array(kv_cache.sess, decode_graph.t_pos, [pos])

  TinyNNCuda.tnn_compute(kv_cache.sess)

  # Called on every step, before the logits are read back.
  # NOTE(review): presumably a no-op unless kv_cache.trace_on is set -- confirm.
  kv_cache.dump_trace

  logits = TinyNNCuda.download_row_major(
    kv_cache.sess, decode_graph.kv_step_logits, 1, kv_cache.vocab_size
  )
  logits
end
.upload_from(kv_cache, model) ⇒ Object
Upload all Toy::SmolLM2 weights into a realized cache (+ zero-init the K/V buffers).
1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 |
# File 'lib/toy/llm/engine/llama_kv_engine_cuda.rb', line 1449

# Uploads the weights of +model+ into the tensors held by +kv_cache+ and
# uploads a shared zero-init Mat into the K/V cache buffers whose type code
# is not 8.
#
# Order of work, as written below:
#   1. model-level tensors: the first upload (incomplete in this listing,
#      see the NOTE on that line), the final-norm gamma, and the output
#      projection when +kv_cache.has_untied_output+ is set;
#   2. for each of the +n_layers+ blocks: both norm gammas, per-query-head
#      w_q (plus b_q when +has_qkv_bias+), per-KV-head w_k / w_v (plus
#      b_k / b_v when +has_qkv_bias+) and the K/V zero fill, then w_o and
#      the FFN gate / up / down weights.
#
# @param kv_cache cache object supplying the session, the dimensions, the
#   feature flags and the destination tensors
# @param model source of the weights (+stack+, +final_norm+, +output_proj+)
# @return [nil] the body ends on a +while+ loop, which evaluates to nil
def self.upload_from(kv_cache, model)
  # Read the session handle and dimensions once, up front.
  sess = kv_cache.sess
  n = kv_cache.n_layers
  n_heads = kv_cache.n_heads
  n_kv = kv_cache.n_kv
  d_model = kv_cache.d_model
  d_head = kv_cache.d_head
  max_T = kv_cache.max_T

  # NOTE(review): the next call is incomplete in this listing -- the
  # identifier after `kv_cache.` and the one between `model.` and `.weight`
  # are missing, so the line does not parse as written. Restore both names
  # from lib/toy/llm/engine/llama_kv_engine_cuda.rb before using this code.
  TinyNNCuda.upload_row_major(sess, kv_cache., model..weight)
  TinyNNCuda.tnn_upload_from_float_array(sess, kv_cache.t_final_norm_gamma, model.final_norm.gamma, d_model)

  # The separate output projection is uploaded only when the cache has one.
  if kv_cache.has_untied_output
    TinyNNCuda.upload_row_major(sess, kv_cache.t_output, model.output_proj)
  end

  # P5.2: K and V share the same layout ne=[d_head, max_T] now,
  # so they share the same zero-init Mat.
  kv_zero = Mat.new(max_T, d_head)

  # One pass per layer; blk_n is the source block, blk_f the destination
  # tensor set at the same index.
  li = 0
  while li < n
    blk_n = model.stack[li]
    blk_f = kv_cache.kv_blocks_ffi[li]
    TinyNNCuda.tnn_upload_from_float_array(sess, blk_f.t_rn1_gamma, blk_n.rn1.gamma, d_model)
    TinyNNCuda.tnn_upload_from_float_array(sess, blk_f.t_rn2_gamma, blk_n.rn2.gamma, d_model)

    # Query projections: one weight (and optional bias) per query head.
    hq = 0
    while hq < n_heads
      TinyNNCuda.stage_transposed_and_upload(sess, blk_f.t_w_q[hq], blk_n.attn.w_q[hq])
      if kv_cache.has_qkv_bias
        TinyNNCuda.tnn_upload_from_float_array(sess, blk_f.t_b_q[hq], blk_n.attn.b_q[hq], d_head)
      end
      hq = hq + 1
    end

    # Key/value projections: iterated over n_kv heads, separately from the
    # n_heads query heads above.
    hkv = 0
    while hkv < n_kv
      TinyNNCuda.stage_transposed_and_upload(sess, blk_f.t_w_k[hkv], blk_n.attn.w_k[hkv])
      TinyNNCuda.stage_transposed_and_upload(sess, blk_f.t_w_v[hkv], blk_n.attn.w_v[hkv])
      if kv_cache.has_qkv_bias
        TinyNNCuda.tnn_upload_from_float_array(sess, blk_f.t_b_k[hkv], blk_n.attn.b_k[hkv], d_head)
        TinyNNCuda.tnn_upload_from_float_array(sess, blk_f.t_b_v[hkv], blk_n.attn.b_v[hkv], d_head)
      end
      # P5.1+P5.2: same Q8 skip rule as realize_for_mmap.
      # Buffers whose type code is 8 (presumably the Q8 type named above --
      # confirm) are not zero-filled by this method.
      if kv_cache.kv_type_k != 8
        TinyNNCuda.upload_row_major(sess, blk_f.t_K[hkv], kv_zero)
      end
      if kv_cache.kv_type_v != 8
        TinyNNCuda.upload_row_major(sess, blk_f.t_V[hkv], kv_zero)
      end
      hkv = hkv + 1
    end

    # Remaining per-layer weights: attention output and the three FFN matrices.
    TinyNNCuda.stage_transposed_and_upload(sess, blk_f.t_w_o, blk_n.attn.w_o)
    TinyNNCuda.stage_transposed_and_upload(sess, blk_f.t_w_gate, blk_n.ffn.w_gate)
    TinyNNCuda.stage_transposed_and_upload(sess, blk_f.t_w_up, blk_n.ffn.w_up)
    TinyNNCuda.stage_transposed_and_upload(sess, blk_f.t_w_down, blk_n.ffn.w_down)
    li = li + 1
  end
# The trailing `|` on the next line is table residue from the rendered page,
# kept only because it is present in the listing.
end |