Class: ToyLMCuda

Inherits:

Object

Object
ToyLMCuda

show all

Defined in: lib/toy/models/transformer_lm_cuda.rb

Instance Attribute Summary collapse

#arch ⇒ Object readonly

Returns the value of attribute arch.
#max_T ⇒ Object

Returns the value of attribute max_T.
#tokenizer ⇒ Object readonly

Returns the value of attribute tokenizer.

Instance Method Summary collapse

#decode_step(token_id, pos) ⇒ Object
#generate(prompt_ids, n_new, sampler_config = nil) ⇒ Object
#initialize(arch) ⇒ ToyLMCuda constructor

A new instance of ToyLMCuda.
#load(path) ⇒ Object

Constructor Details

#initialize(arch) ⇒ `ToyLMCuda`

Returns a new instance of ToyLMCuda.

# File 'lib/toy/models/transformer_lm_cuda.rb', line 35

# Builds an unloaded model wrapper around +arch+.
#
# No weights are read here: the KV cache and GGUF handle start out nil and
# the instance is flagged as not loaded until #load has run. The context
# length defaults to 256.
#
# @param arch [Object] architecture descriptor; #generate reads its eos_id
def initialize(arch)
  @gguf_handle = nil
  @kv = nil
  @max_T = 256
  @arch = arch
  @loaded = false
end

Instance Attribute Details

#arch ⇒ `Object` (readonly)

Returns the value of attribute arch.



33
34
35

# File 'lib/toy/models/transformer_lm_cuda.rb', line 33

# Reader for the architecture descriptor handed to the constructor.
#
# @return [Object] the value stored in @arch
def arch
  @arch
end

#max_T ⇒ `Object`

Returns the value of attribute max_T.



33
34
35

# File 'lib/toy/models/transformer_lm_cuda.rb', line 33

# Reader for the context length. The constructor sets it to 256 and #load
# passes it to the KV cache's realize_for / realize_for_mmap calls.
#
# @return [Object] the value stored in @max_T
def max_T
  @max_T
end

#tokenizer ⇒ `Object` (readonly)

Returns the value of attribute tokenizer.



33
34
35

# File 'lib/toy/models/transformer_lm_cuda.rb', line 33

# Reader for @tokenizer.
#
# NOTE(review): neither the constructor nor #load assigns @tokenizer in the
# code visible here, so this reads as nil unless it is set elsewhere —
# confirm whether the attribute is still needed.
#
# @return [Object] the value stored in @tokenizer
def tokenizer
  @tokenizer
end

Instance Method Details

#decode_step(token_id, pos) ⇒ `Object`



108
109
110

# File 'lib/toy/models/transformer_lm_cuda.rb', line 108

# Forwards a single decode step to SmolLM2KVCuda using this instance's KV
# cache. @kv is nil until #load has run; no loaded check is done here.
#
# @param token_id [Object] token id to feed
# @param pos [Object] position passed through to the engine
# @return [Object] whatever SmolLM2KVCuda.decode_step returns (#generate
#   treats it as the logits for the next token)
def decode_step(token_id, pos)
  cache = @kv
  SmolLM2KVCuda.decode_step(cache, token_id, pos)
end

#generate(prompt_ids, n_new, sampler_config = nil) ⇒ `Object`

# File 'lib/toy/models/transformer_lm_cuda.rb', line 112

# Generates up to +n_new+ token ids following +prompt_ids+.
#
# The prompt is copied, every prompt token except the last is fed through
# #decode_step to fill the KV cache, and then each loop iteration decodes the
# newest id at its own index and picks the next id from the returned logits.
# Decoding a token at its own index keeps the loop consistent with the
# prefill (token i at position i); feeding the newest id at +ids.length+
# instead would decode the last prompt token twice and shift every generated
# token one position too far.
#
# Because the last prompt token is decoded inside the loop, a call with
# n_new == 0 leaves that token out of the KV cache.
#
# @param prompt_ids [Array] prompt token ids; not mutated
# @param n_new [Integer] maximum number of ids to append
# @param sampler_config [Object, nil] nil selects greedy argmax; otherwise an
#   object providing seed, rep_penalty, temperature, top_k and top_p
# @return [Array] a new array holding the prompt followed by the generated
#   ids (the EOS id is included when it ends generation). When the model is
#   not loaded a message is printed and +prompt_ids+ itself is returned; for
#   an empty prompt a message is printed and an empty array is returned.
def generate(prompt_ids, n_new, sampler_config = nil)
  if !@loaded
    puts "ToyLMCuda.generate: model not loaded; call .load(path) first"
    return prompt_ids
  end
  ids = []
  j = 0
  while j < prompt_ids.length
    ids.push(prompt_ids[j])
    j = j + 1
  end

  # Without at least one token there is nothing to decode from; the loop
  # below would otherwise index ids[-1] and feed nil to the engine.
  if ids.length == 0
    puts "ToyLMCuda.generate: empty prompt; nothing to generate from"
    return ids
  end

  # Prefill every prompt token except the last. The last one is decoded by
  # the first loop iteration, whose logits pick the first new id.
  last_index = prompt_ids.length - 1
  i = 0
  while i < last_index
    decode_step(prompt_ids[i], i)
    i = i + 1
  end

  ctx = nil
  if sampler_config != nil
    ctx = SamplerContext.new(ids, sampler_config.seed)
  end

  n = 0
  while n < n_new
    # The newest id sits at index ids.length - 1 and is decoded at that
    # same position.
    pos = ids.length - 1
    last_id = ids[pos]
    logits = decode_step(last_id, pos)
    pick = -1
    if sampler_config == nil
      pick = Sampler.argmax(logits)
    else
      logits = Sampler.repetition_penalty(logits, ctx, sampler_config.rep_penalty)
      logits = Sampler.temperature(logits, sampler_config.temperature)
      logits = Sampler.top_k(logits, sampler_config.top_k)
      logits = Sampler.top_p(logits, sampler_config.top_p)
      pick   = Sampler.pick(logits, sampler_config, ctx)
      ctx.generated_ids.push(pick)
    end
    ids.push(pick)
    # Stop after appending EOS so the caller can see why generation ended.
    if pick == @arch.eos_id
      break
    end
    n = n + 1
  end
  ids
end

#load(path) ⇒ `Object`

# File 'lib/toy/models/transformer_lm_cuda.rb', line 47

# Loads the GGUF model at +path+ into a fresh SmolLM2KVFFICacheCuda and marks
# this instance as loaded.
#
# Steps, in order:
# 1. Read the model flags (GGUFLoad.detect_smollm2_flags) and config
#    (SmolLM2ConfigLoader.read).
# 2. Open the file once as a probe to read the "toy.ggml_native" bool, then
#    free the probe handle.
# 3. Apply the KV_Q8, FLASH_ATTN and NO_QK_NORM environment toggles and the
#    QK-norm kind to the cache before it is realized.
# 4. Native layout: reopen the file, keep the handle in @gguf_handle and
#    call realize_for_mmap. Otherwise: realize_for followed by
#    GGUFLoad.load_kv_cache_auto.
#
# Terminates the process with +exit 1+ when the model needs QK-norm but the
# file is not in the native layout.
#
# NOTE(review): tnn_gguf_free is only called on the probe handle here; a
# handle left in @gguf_handle by an earlier #load is overwritten without a
# visible free — confirm whether repeated loads are expected.
#
# @param path [String] GGUF file path (concatenated into the error message)
# @return [true] the value of the final @loaded assignment
def load(path)
  flags = GGUFLoad.detect_smollm2_flags(path)
  cfg   = SmolLM2ConfigLoader.read(path)

  # Short-lived probe handle, used only to read the layout flag; is_native
  # stays false when the probe load returns nil.
  probe = TinyNNCuda.tnn_gguf_load(path)
  is_native = false
  if probe != nil
    is_native = (TinyNNCuda.tnn_gguf_get_bool(probe, "toy.ggml_native") == 1)
    TinyNNCuda.tnn_gguf_free(probe)
  end

  kv = SmolLM2KVFFICacheCuda.new
  # P5.1: KV_Q8=1 opts into Q8_0 storage for the K cache. See the CPU
  # mirror in lib/transformer_lm.rb for the full rationale.
  if (ENV["KV_Q8"] || "") == "1"
    kv.enable_kv_q8!
  end
  # P4.1: FLASH_ATTN=1 → ggml_flash_attn_ext for attention.
  if (ENV["FLASH_ATTN"] || "") == "1"
    kv.enable_flash_attn!
  end
  # #76 fix: wire QK-norm through to the engine, parity with
  # load_cpu in lib/toy/models/transformer_lm.rb. This call site
  # previously passed only 5 of realize_for_mmap's 6 args; Spinel
  # zero-fills missing call args WITHOUT a diagnostic, so qk_norm
  # arrived as false and Qwen3's per-head Q/K RMS-norms were never
  # built on CUDA → degenerate decode (CPU was coherent).
  # 1 = Qwen3-style ([d_head] shared), 2 = OLMoE/Granite-style
  # ([d_model] packed, per-head sliced gamma). Must be set BEFORE
  # realize_for_mmap.
  kv.qk_norm_kind = flags.qk_norm_kind
  qk_norm_on = flags.qk_norm
  # NO_QK_NORM=1 turns the norm off entirely as a diagnostic
  # (same env knob as the CPU loader).
  if (ENV["NO_QK_NORM"] || "") == "1"
    qk_norm_on = false
    kv.qk_norm_kind = 0
  end

  if is_native
    # Second open of the same file: this handle is kept on the instance
    # (the probe above was already freed).
    @gguf_handle = TinyNNCuda.tnn_gguf_load(path)
    kv.realize_for_mmap(@gguf_handle, cfg, @max_T, flags.untied, flags.qkv_bias, qk_norm_on)
  else
    if qk_norm_on
      # Fail loud (never mask): the legacy copy-load path has no
      # QK-norm support — realize_for can't allocate the gamma
      # tensors, so decode would be silently degenerate.
      puts "ToyLMCuda.load: " + path + " needs QK-norm but is not in " +
           "toy.ggml_native layout; the legacy copy-load path cannot " +
           "apply QK-norm. Re-convert with --ggml-native. Aborting."
      exit 1
    end
    kv.realize_for(@max_T, cfg.d_model, cfg.d_ff, cfg.n_heads, cfg.n_kv,
                   cfg.n_layers, cfg.vocab, cfg.rope_base, cfg.rms_eps,
                   flags.untied, flags.qkv_bias)
    GGUFLoad.load_kv_cache_auto(kv, path)
  end
  # Publish the cache only after it has been fully realized.
  @kv = kv
  @loaded = true
end

Class: ToyLMCuda

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(arch) ⇒ ToyLMCuda

Instance Attribute Details

#arch ⇒ Object (readonly)

#max_T ⇒ Object

#tokenizer ⇒ Object (readonly)

Instance Method Details

#decode_step(token_id, pos) ⇒ Object

#generate(prompt_ids, n_new, sampler_config = nil) ⇒ Object

#load(path) ⇒ Object

#initialize(arch) ⇒ `ToyLMCuda`

#arch ⇒ `Object` (readonly)

#max_T ⇒ `Object`

#tokenizer ⇒ `Object` (readonly)

#decode_step(token_id, pos) ⇒ `Object`

#generate(prompt_ids, n_new, sampler_config = nil) ⇒ `Object`

#load(path) ⇒ `Object`