Class: ToyLMCuda
- Inherits: Object
- Inheritance chain: Object → ToyLMCuda
- Defined in:
- lib/toy/models/transformer_lm_cuda.rb
Instance Attribute Summary
-
#arch ⇒ Object
readonly
Returns the value of attribute arch.
-
#max_T ⇒ Object
Returns the value of attribute max_T.
-
#tokenizer ⇒ Object
readonly
Returns the value of attribute tokenizer.
Instance Method Summary
- #decode_step(token_id, pos) ⇒ Object
- #generate(prompt_ids, n_new, sampler_config = nil) ⇒ Object
-
#initialize(arch) ⇒ ToyLMCuda
constructor
A new instance of ToyLMCuda.
- #load(path) ⇒ Object
Constructor Details
#initialize(arch) ⇒ ToyLMCuda
Returns a new instance of ToyLMCuda.
35 36 37 38 39 40 41 |
# File 'lib/toy/models/transformer_lm_cuda.rb', line 35
# Builds an unloaded model wrapper around +arch+.
#
# arch - architecture descriptor; stored as-is. #generate reads
#        +arch.eos_id+ from it to detect end-of-sequence.
#
# Every piece of instance state is initialised here so that all readers
# return a well-defined value before #load has been called.
def initialize(arch)
  @arch = arch
  # Default sequence capacity; #load forwards it to the KV-cache
  # realize_for / realize_for_mmap calls.
  @max_T = 256
  @kv = nil
  @gguf_handle = nil
  # Exposed through the public #tokenizer reader; start it at nil
  # explicitly, like @kv and @gguf_handle, instead of leaving the
  # instance variable undefined.
  @tokenizer = nil
  @loaded = false
end
Instance Attribute Details
#arch ⇒ Object (readonly)
Returns the value of attribute arch.
33 34 35 |
# File 'lib/toy/models/transformer_lm_cuda.rb', line 33
# Read-only accessor for the architecture object handed to #initialize.
def arch
  @arch
end
#max_T ⇒ Object
Returns the value of attribute max_T.
33 34 35 |
# File 'lib/toy/models/transformer_lm_cuda.rb', line 33
# Returns the current value of @max_T. #initialize sets it to 256 and
# #load passes it to the KV cache when the cache is realized.
def max_T
  @max_T
end
#tokenizer ⇒ Object (readonly)
Returns the value of attribute tokenizer.
33 34 35 |
# File 'lib/toy/models/transformer_lm_cuda.rb', line 33
# Read-only accessor for @tokenizer.
# NOTE(review): none of the methods shown on this page assigns a non-nil
# value to @tokenizer, so this may always return nil — confirm where it
# is meant to be set.
def tokenizer
  @tokenizer
end
Instance Method Details
#decode_step(token_id, pos) ⇒ Object
108 109 110 |
# File 'lib/toy/models/transformer_lm_cuda.rb', line 108
# Runs one decode step by delegating to SmolLM2KVCuda.decode_step with
# this model's KV cache.
#
# token_id - id of the token to feed.
# pos      - position index for that token (#generate feeds prompt token
#            i at position i).
#
# Returns whatever SmolLM2KVCuda.decode_step returns; #generate passes
# that value to Sampler as the logits.
# NOTE(review): there is no loaded-check here, so @kv is nil if this is
# called before #load — confirm callers always load first.
def decode_step(token_id, pos)
  SmolLM2KVCuda.decode_step(@kv, token_id, pos)
end
#generate(prompt_ids, n_new, sampler_config = nil) ⇒ Object
112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 |
# File 'lib/toy/models/transformer_lm_cuda.rb', line 112
# Autoregressively extends +prompt_ids+ by up to +n_new+ tokens.
#
# prompt_ids     - array of prompt token ids; it is not mutated.
# n_new          - maximum number of tokens to append.
# sampler_config - nil for greedy decoding (Sampler.argmax); otherwise an
#                  object providing seed, rep_penalty, temperature, top_k
#                  and top_p, which drives the Sampler pipeline below.
#
# Returns a new array holding the prompt followed by the generated
# tokens. Generation stops early once the picked token equals
# @arch.eos_id (that token is included in the result). When the model
# is not loaded, or the prompt is empty, a message is printed and
# +prompt_ids+ itself is returned.
#
# Position convention: every token is decoded exactly once, at its own
# index in +ids+. The logits produced by the last prefill step are used
# for the first pick; each later step decodes the most recently
# appended token. (Decoding ids[pos - 1] at index pos would feed the
# last prompt token a second time and shift every generated token one
# slot away from the position used during prefill.)
# Assumes decode_step(token, pos) returns the logits for the token that
# follows position pos — TODO confirm against SmolLM2KVCuda.
# NOTE(review): positions are not checked against @max_T here; confirm
# whether the KV cache tolerates prompt length + n_new beyond it.
def generate(prompt_ids, n_new, sampler_config = nil)
  if !@loaded
    puts "ToyLMCuda.generate: model not loaded; call .load(path) first"
    return prompt_ids
  end
  # With no prompt there is no token to decode and therefore no logits
  # to sample from.
  if prompt_ids.length == 0
    puts "ToyLMCuda.generate: empty prompt; nothing to decode"
    return prompt_ids
  end

  # Prefill: copy the prompt and feed token i at position i, keeping the
  # logits of the final step for the first pick.
  ids = []
  logits = nil
  i = 0
  while i < prompt_ids.length
    ids.push(prompt_ids[i])
    logits = decode_step(prompt_ids[i], i)
    i = i + 1
  end

  ctx = nil
  if sampler_config != nil
    ctx = SamplerContext.new(ids, sampler_config.seed)
  end

  n = 0
  while n < n_new
    # After the first pick, decode the token appended in the previous
    # iteration at its own index to obtain the next logits.
    if n > 0
      pos = ids.length - 1
      logits = decode_step(ids[pos], pos)
    end
    pick = -1
    if sampler_config == nil
      pick = Sampler.argmax(logits)
    else
      logits = Sampler.repetition_penalty(logits, ctx, sampler_config.rep_penalty)
      logits = Sampler.temperature(logits, sampler_config.temperature)
      logits = Sampler.top_k(logits, sampler_config.top_k)
      logits = Sampler.top_p(logits, sampler_config.top_p)
      pick = Sampler.pick(logits, sampler_config, ctx)
      ctx.generated_ids.push(pick)
    end
    ids.push(pick)
    if pick == @arch.eos_id
      break
    end
    n = n + 1
  end
  ids
end
#load(path) ⇒ Object
47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 |
# File 'lib/toy/models/transformer_lm_cuda.rb', line 47
# Loads the model file at +path+ into a fresh SmolLM2KVFFICacheCuda and
# marks this instance as loaded.
#
# path - String path of the model file; it is handed to the GGUF
#        helpers (GGUFLoad, SmolLM2ConfigLoader, TinyNNCuda.tnn_gguf_*).
#
# Steps, in order:
#   1. Read the feature flags and the config from the file.
#   2. Probe the file for the "toy.ggml_native" flag, then free the probe.
#   3. Apply the KV_Q8, FLASH_ATTN and NO_QK_NORM environment switches.
#   4. Native layout: reopen the file, keep the handle in @gguf_handle and
#      call realize_for_mmap. Otherwise: call realize_for followed by
#      GGUFLoad.load_kv_cache_auto.
#
# Returns true (the value of the final assignment to @loaded).
# Terminates the process with `exit 1` when QK-norm is required but the
# file is not in the native layout.
#
# NOTE(review): the native path opens the file a second time after the
# probe handle has been freed, and a handle left in @gguf_handle by an
# earlier #load call is overwritten without a visible free — confirm
# whether either matters.
def load(path)
  flags = GGUFLoad.detect_smollm2_flags(path)
  cfg = SmolLM2ConfigLoader.read(path)
  # Probe only to read the layout flag; the handle is released right away.
  probe = TinyNNCuda.tnn_gguf_load(path)
  is_native = false
  if probe != nil
    is_native = (TinyNNCuda.tnn_gguf_get_bool(probe, "toy.ggml_native") == 1)
    TinyNNCuda.tnn_gguf_free(probe)
  end
  kv = SmolLM2KVFFICacheCuda.new
  # P5.1: KV_Q8=1 opts into Q8_0 storage for the K cache. See the CPU
  # mirror in lib/transformer_lm.rb for the full rationale.
  if (ENV["KV_Q8"] || "") == "1"
    kv.enable_kv_q8!
  end
  # P4.1: FLASH_ATTN=1 → ggml_flash_attn_ext for attention.
  if (ENV["FLASH_ATTN"] || "") == "1"
    kv.enable_flash_attn!
  end
  # #76 fix: wire QK-norm through to the engine, parity with
  # load_cpu in lib/toy/models/transformer_lm.rb. This call site
  # previously passed only 5 of realize_for_mmap's 6 args; Spinel
  # zero-fills missing call args WITHOUT a diagnostic, so qk_norm
  # arrived as false and Qwen3's per-head Q/K RMS-norms were never
  # built on CUDA → degenerate decode (CPU was coherent).
  # 1 = Qwen3-style ([d_head] shared), 2 = OLMoE/Granite-style
  # ([d_model] packed, per-head sliced gamma). Must be set BEFORE
  # realize_for_mmap.
  kv.qk_norm_kind = flags.qk_norm_kind
  qk_norm_on = flags.qk_norm
  # NO_QK_NORM=1 turns the norm off entirely as a diagnostic
  # (same env knob as the CPU loader).
  if (ENV["NO_QK_NORM"] || "") == "1"
    qk_norm_on = false
    kv.qk_norm_kind = 0
  end
  if is_native
    @gguf_handle = TinyNNCuda.tnn_gguf_load(path)
    kv.realize_for_mmap(@gguf_handle, cfg, @max_T, flags.untied, flags.qkv_bias, qk_norm_on)
  else
    if qk_norm_on
      # Fail loud (never mask): the legacy copy-load path has no
      # QK-norm support — realize_for can't allocate the gamma
      # tensors, so decode would be silently degenerate.
      puts "ToyLMCuda.load: " + path + " needs QK-norm but is not in " + "toy.ggml_native layout; the legacy copy-load path cannot " + "apply QK-norm. Re-convert with --ggml-native. Aborting."
      exit 1
    end
    kv.realize_for(@max_T, cfg.d_model, cfg.d_ff, cfg.n_heads, cfg.n_kv, cfg.n_layers, cfg.vocab, cfg.rope_base, cfg.rms_eps, flags.untied, flags.qkv_bias)
    GGUFLoad.load_kv_cache_auto(kv, path)
  end
  # Publish the cache only after it has been fully realized.
  @kv = kv
  @loaded = true
end