Class: Toy::Core::CLI::Train

Inherits:

Object

Object
Toy::Core::CLI::Train

show all

Defined in: lib/toy/core/cli/train.rb

Constant Summary collapse

FORMAT =

"toy/train-v1"

RUNNER_TARGET = NOTE: target name MUST equal the output path — ToyRoot.ensure_built runs `make <RUNNER_TARGET>` and File.join(root, RUNNER_TARGET) is the binary. `make libexec/toy-train` outputs libexec/toy-train. from-scratch + warm-start share libexec/toy-train (both random-init). lora dispatches to a SEPARATE binary, libexec/toy-train-lora: its realize_for_mmap path cannot share a Spinel compilation unit with the random-init path without a cfg type-merge miscompile (landmine #16; see lib/toy/run/train_lora.rb header). Same byte-gated stdout contract — only the binary differs.

"libexec/toy-train"

LORA_RUNNER_TARGET =

"libexec/toy-train-lora"

CUDA_RUNNER_TARGET = CUDA from-scratch runner — a SEPARATE per-device binary (single-type binary, landmine #16). Selected only for --device cuda + from-scratch.

"libexec/toy-train-cuda"

LORA_CUDA_RUNNER_TARGET = CUDA lora runner — a SEPARATE single-type binary (landmine #16): its realize_for_mmap cfg path is monomorphic and cannot share a Spinel compilation unit with the random-init path. Selected only for --device cuda + lora.

"libexec/toy-train-lora-cuda"

METAL_RUNNER_TARGET = Metal from-scratch runner — a SEPARATE per-device binary (single-type binary, landmine #16). Selected only for --device metal + from-scratch, and only on macOS (the build target is macOS-guarded).

"libexec/toy-train-metal"

VIT_RUNNER_TARGET = ViT-Tiny from-scratch CPU runner — a SEPARATE binary (landmine #16): ViTTinyConfig must NOT share a Spinel compilation unit with SmolLM2Config. CPU-only this slice. Binary path EQUALS the make target so ToyRoot.ensure_built builds + locates it.

"libexec/toy-train-vit"

GPT2_RUNNER_TARGET = GPT-2 from-scratch CPU runner — a SEPARATE binary (landmine #16): the GPT2SeqEngine realize path can't share a Spinel unit with the llama one. Selected by `--arch gpt2` (from-scratch, CPU only this slice). Backward of its LayerNorm + GELU rides the vendored kernels (vendor-patches/0007).

"libexec/toy-train-gpt2"

GPT2_CUDA_RUNNER_TARGET = GPT-2 GPU twins (--arch gpt2 --device cuda|metal). SEPARATE single-type binaries (landmine #16); link the generated CUDA/Metal engine mirrors. The GELU/LayerNorm backward ops fall back to the CPU backend on GPU.

"libexec/toy-train-gpt2-cuda"

GPT2_METAL_RUNNER_TARGET =

"libexec/toy-train-gpt2-metal"

DEFAULT_STEPS = the gate config (smoke_recipe_from_scratch)

DEFAULT_SEED =

ARCH = The from-scratch arch family — substituted for arch in the run_id_template. The runner hardcodes a llama-shape model.

"llama"

Instance Method Summary collapse

#initialize(argv) ⇒ Train constructor

A new instance of Train.
#run ⇒ Object

Constructor Details

#initialize(argv) ⇒ `Train`

Returns a new instance of Train.

# File 'lib/toy/core/cli/train.rb', line 88

# Builds a Train command with every option at its default value.
#
# @param argv [Object] stored untouched in @argv (presumably the CLI
#   argument list that option parsing consumes later — TODO confirm)
def initialize(argv)
  @argv = argv

  # Output mode and recipe selection; the recipe stays nil until chosen.
  @json = false
  @recipe = nil

  # Training-length and seed defaults come from the class constants.
  @steps = DEFAULT_STEPS
  @seed = DEFAULT_SEED

  # Optional run-directory override (nil = derive one from the run id).
  @out = nil

  # lora inputs: base GGUF path and adapter rank (Integer).
  @model = nil
  @rank = nil

  # warm-start inputs: corpus path and init mode.
  @corpus = nil
  @init = nil

  # Backend device (cpu | cuda | metal) and model family (llama | gpt2).
  @device = "cpu"
  @arch = ARCH
end

Instance Method Details

#run ⇒ `Object`

# File 'lib/toy/core/cli/train.rb', line 103

# Executes the `train` command end to end: validates the parsed options,
# builds and locates the per-recipe runner binary, creates the run
# directory, spawns the runner with a per-recipe environment, and reports
# the loss lines it printed.
#
# Ordering is deliberate: every cheap validation (platform gate, arch/recipe
# compatibility, input-file existence) happens before the install root is
# located or anything is built, and the run directory is created before the
# runner is spawned.
#
# @return [Object] the non-`true` value from parse_args when parsing did not
#   succeed; otherwise whatever fail_out / bad_arg (error paths) or emit
#   (success) return — those helpers are defined outside this method.
def run
  parsed = parse_args
  return parsed unless parsed == true

  # metal is accepted by the parser but only buildable in a macOS
  # build — gate it HERE, before any build/Open3 (mirrors infer.rb).
  if @device == "metal" && RUBY_PLATFORM !~ /darwin/
    return fail_out("metal is only available in a macOS build")
  end

  # --arch gpt2 is from-scratch only (CPU/CUDA/Metal). Metal is gated to
  # macOS by the @device check above; CUDA/Metal back the GELU/LayerNorm
  # backward on the CPU fallback backend (no GPU kernel yet).
  if @arch == "gpt2" && @recipe != "from-scratch"
    return fail_out("--arch gpt2 supports only the `from-scratch` recipe")
  end

  # Resolve the defaulted input paths ONCE so the existence checks below
  # and the runner env further down can never disagree.
  lora_gguf = @model  || "data/smollm2-135m-native.gguf"
  ws_corpus = @corpus || "data/ts_seqs.bin"

  # Existence checks on user-suppliable paths BEFORE any build/shell
  # (spinel-dev#17 class: the runner side also guards, but the CLI
  # names the file and the fix first — and exits 2 like infer's /
  # describe's named-but-missing model). Paths are cwd-relative to
  # the PROJECT (the runner runs in Dir.pwd).
  if @recipe == "lora"
    unless File.file?(lora_gguf)
      return bad_arg("no such file: #{lora_gguf} (lora needs a " \
                     "native-layout base GGUF; see `toy list`, or convert one " \
                     "with prep/convert_smollm2_to_gguf.py --ggml-native)")
    end
  elsif @recipe == "warm-start"
    unless File.file?(ws_corpus)
      return bad_arg("no such file: #{ws_corpus} (warm-start streams " \
                     "packed-i32 tokens; `toy new` seeds data/ts_seqs.bin, or pass --corpus)")
    end
  end

  # The TOY INSTALL root (for `make`) — may differ from Dir.pwd.
  root = ToyRoot.locate_root
  unless root
    return fail_out(
      "could not locate toy's install root. Set TOY_HOME to a toy " \
      "checkout (one with a Makefile + tinynn/tinynn_ggml.c), or run " \
      "from inside the toy source tree. Then `toy install` to build " \
      "the backend."
    )
  end

  # Per-device-AND-recipe binary (single-type, landmine #16). Checked in
  # this order:
  #   lora + cuda        -> toy-train-lora-cuda (monomorphic mmap cfg)
  #   lora (otherwise)   -> toy-train-lora
  #   vit-tiny           -> toy-train-vit
  #   gpt2 + cuda        -> toy-train-gpt2-cuda
  #   gpt2 + metal       -> toy-train-gpt2-metal
  #   gpt2 (otherwise)   -> toy-train-gpt2
  #   cuda               -> toy-train-cuda (the warm-start branch lives
  #                         in train_cuda.rb source)
  #   metal              -> toy-train-metal
  #   otherwise          -> toy-train
  target = if @recipe == "lora"
             @device == "cuda" ? LORA_CUDA_RUNNER_TARGET : LORA_RUNNER_TARGET
           elsif @recipe == "vit-tiny"
             VIT_RUNNER_TARGET
           elsif @arch == "gpt2"
             case @device
             when "cuda"  then GPT2_CUDA_RUNNER_TARGET
             when "metal" then GPT2_METAL_RUNNER_TARGET
             else              GPT2_RUNNER_TARGET
             end
           elsif @device == "cuda"
             CUDA_RUNNER_TARGET
           elsif @device == "metal"
             METAL_RUNNER_TARGET
           else
             RUNNER_TARGET
           end
  ok, err = ToyRoot.ensure_built(root, target, quiet: @json)
  return fail_out(err) unless ok

  runner = File.join(root, target)
  unless File.file?(runner) && File.executable?(runner)
    return fail_out(
      "runner missing after build: #{runner}. Run `toy install` to " \
      "build the backend, then retry."
    )
  end

  # Resolve a run id + create runs/<id>/ in the PROJECT cwd (the
  # train-specific net-new step, CRuby-side, BEFORE Open3 — the runner
  # assumes TAO_RUN_DIR pre-exists; tnn_events_open does no parent
  # mkdir).
  project  = Dir.pwd
  cfg      = Toy::Core::Config.load(project)
  run_id   = resolve_run_id(cfg.run_id_template, project,
                            @arch == "gpt2" ? "gpt2" : arch_for(@recipe))
  run_dir  = @out ? File.expand_path(@out) : File.join(project, "runs", run_id)
  begin
    FileUtils.mkdir_p(run_dir)
  rescue SystemCallError => e
    return fail_out("could not create run dir #{run_dir}: #{e.message}")
  end

  # CONTROLLED ENV (first positional): every key a recipe reads is set
  # explicitly here, so a stale value in the caller's env cannot override
  # it. Built per-recipe (parallel to the runner's landmine-#16 branch
  # discipline): each recipe's exact keys reproduce its gate.
  # NOTE(review): the hash is merged over the inherited environment, so
  # keys NOT listed for a recipe are still inherited — confirm the runner
  # ignores them.
  base = { "TAO_RUN_DIR" => run_dir, "TOY_RUN_ID" => run_id,
           "RECIPE" => @recipe }
  if @recipe == "lora"
    # NO SEED key: lora seed=42 is hardcoded in the runner branch.
    env = base.merge("STEPS" => @steps.to_s,
                     "GGUF"  => lora_gguf,
                     "RANK"  => (@rank || 8).to_s)
  elsif @recipe == "warm-start"
    env = base.merge("STEPS"  => @steps.to_s,
                     "SEED"   => @seed.to_s,
                     "CORPUS" => ws_corpus,
                     "INIT"   => (@init || "scratch"))
  elsif @recipe == "vit-tiny"
    # CPU-only ViT from-scratch on the COMMITTED data/vit_smoke corpus.
    # Runner hard-codes 224/16/196/10 + AdamW hp; only STEPS/SEED vary.
    # data/vit_smoke is committed → no --corpus needed. vit IS seeded.
    env = base.merge("STEPS" => @steps.to_s, "SEED" => @seed.to_s,
                     "IMG_DIR" => "data/vit_smoke")
  else
    # from-scratch — byte-identical to today plus the harmless RECIPE key.
    env = base.merge("STEPS" => @steps.to_s, "SEED" => @seed.to_s)
  end
  # Metal: disable ggml's residency-set optimization so the runner exits
  # cleanly. See lib/toy/core/cli/infer.rb for the full rationale — the
  # ggml-metal static-destructor teardown asserts the residency set is
  # empty and aborts at exit; disabling it keeps compute byte-identical.
  env["GGML_METAL_NO_RESIDENCY"] = "1" if @device == "metal"

  # Spawn with the [cmdname, argv0] form: a bare String is treated as a
  # command LINE (split on whitespace, or handed to the shell when it
  # contains metacharacters), which breaks as soon as the install root
  # contains a space. The array form execs the file directly.
  begin
    out, status = Open3.capture2e(env, [runner, runner])
  rescue SystemCallError => e
    # e.g. exec format error / permission race: report it like every
    # other failure instead of crashing with a raw exception.
    return fail_out("could not launch runner #{runner}: #{e.message}")
  end

  unless status.success?
    tail = out.lines.last(20).join
    tail = "(no output)" if tail.strip.empty?
    # exitstatus is nil for a signal death (e.g. SEGV) — say so
    # instead of the formerly-masked "runner exited :".
    how = if status.exitstatus
            status.exitstatus.to_s
          else
            sig = status.termsig
            # Signal.signame returns nil for an unknown number; fall back
            # to "?" rather than printing an empty name.
            sig_name = (sig && Signal.signame(sig)) || "?"
            "from signal #{sig} (#{sig_name})"
          end
    return fail_out("runner exited #{how}:\n#{tail}")
  end

  # The runner reports progress as lines beginning with "step ".
  losses = out.lines.select { |l| l.start_with?("step ") }.map(&:chomp)
  emit(run_id, run_dir, losses)
end