Module: Trainers::Optimization

Defined in: lib/trainers/optimization/optimizer.rb,
lib/trainers/optimization/scheduler.rb

Class Method Summary collapse

.create_optimizer(model, args) ⇒ Object

Builds AdamW with two param groups: parameters with weight decay (Linear weights, Embedding weights) and parameters without weight decay (biases, LayerNorm/layer_norm params).
.create_scheduler(type, optimizer, num_warmup_steps:, num_training_steps:) ⇒ Object

Dispatcher: pick scheduler by type symbol.
.get_constant_schedule_with_warmup(optimizer, num_warmup_steps:) ⇒ Object

Linear warmup then constant LR.
.get_cosine_schedule_with_warmup(optimizer, num_warmup_steps:, num_training_steps:, num_cycles: 0.5) ⇒ Object

Linear warmup then cosine decay to 0.
.get_linear_schedule_with_warmup(optimizer, num_warmup_steps:, num_training_steps:) ⇒ Object

Linear warmup then linear decay to 0.

Class Method Details

.create_optimizer(model, args) ⇒ `Object`

Builds AdamW with two param groups:

- Parameters with weight decay (Linear weights, Embedding weights)
- Parameters without weight decay (biases, LayerNorm/layer_norm params)

This split is critical for transformer fine-tuning — regularizing biases and normalization weights hurts convergence.

# File 'lib/trainers/optimization/optimizer.rb', line 11

# Builds a Torch::Optim::AdamW optimizer over the model's trainable parameters,
# split into at most two parameter groups:
#
# - parameters that receive +args.weight_decay+
# - parameters whose name contains "bias", "LayerNorm", "layer_norm" or
#   "layernorm", which get a weight decay of 0.0
#
# Parameters whose +requires_grad+ is falsy are skipped entirely, and a group
# is only emitted when it has at least one parameter.
#
# @param model [#named_parameters] object yielding (name, parameter) pairs
# @param args [#weight_decay, #learning_rate, #adam_beta1, #adam_beta2, #adam_epsilon]
#   hyperparameter holder
# @return [Torch::Optim::AdamW]
# @raise [RuntimeError] when no parameter has +requires_grad+ set
def self.create_optimizer(model, args)
  exempt_markers = ["bias", "LayerNorm", "layer_norm", "layernorm"]
  regularized = []
  exempt      = []

  model.named_parameters.each do |name, param|
    next unless param.requires_grad

    # Matching is by substring of the parameter name, not by layer type.
    bucket = exempt_markers.any? { |marker| name.include?(marker) } ? exempt : regularized
    bucket << param
  end

  param_groups = []
  param_groups << { params: regularized, weight_decay: args.weight_decay } if regularized.any?
  param_groups << { params: exempt, weight_decay: 0.0 } if exempt.any?

  raise "No trainable parameters found. Did you forget to unfreeze the model or apply LoRA?" if param_groups.empty?

  adam_options = {
    lr:    args.learning_rate,
    betas: [args.adam_beta1, args.adam_beta2],
    eps:   args.adam_epsilon
  }
  Torch::Optim::AdamW.new(param_groups, **adam_options)
end

.create_scheduler(type, optimizer, num_warmup_steps:, num_training_steps:) ⇒ `Object`

Dispatcher: pick scheduler by type symbol

# File 'lib/trainers/optimization/scheduler.rb', line 49

# Dispatcher: builds the learning-rate scheduler selected by +type+.
#
# +type+ is compared by its string form, so both Symbols (:linear) and
# Strings ("linear", e.g. read from a config file) are accepted.
#
# @param type [Symbol, String] one of :linear, :cosine, :constant
# @param optimizer [Object] optimizer handed through to the chosen scheduler
# @param num_warmup_steps [Integer] forwarded to every scheduler
# @param num_training_steps [Integer] forwarded to the :linear and :cosine
#   schedulers; not used for :constant
# @return [Object] whatever the selected +get_*_schedule_with_warmup+ returns
# @raise [ArgumentError] when +type+ names no known scheduler
def self.create_scheduler(type, optimizer, num_warmup_steps:, num_training_steps:)
  # to_s (rather than to_sym) normalizes Symbol/String input without
  # interning arbitrary caller-supplied text; nil becomes "" and falls
  # through to the ArgumentError below.
  case type.to_s
  when "linear"
    get_linear_schedule_with_warmup(optimizer,
      num_warmup_steps: num_warmup_steps,
      num_training_steps: num_training_steps)
  when "cosine"
    get_cosine_schedule_with_warmup(optimizer,
      num_warmup_steps: num_warmup_steps,
      num_training_steps: num_training_steps)
  when "constant"
    get_constant_schedule_with_warmup(optimizer,
      num_warmup_steps: num_warmup_steps)
  else
    raise ArgumentError, "Unknown scheduler type: #{type}. Use :linear, :cosine, or :constant"
  end
end

.get_constant_schedule_with_warmup(optimizer, num_warmup_steps:) ⇒ `Object`

Linear warmup then constant LR

# File 'lib/trainers/optimization/scheduler.rb', line 36

# Linear warmup followed by a constant factor of 1.0.
#
# The lambda given to LambdaLR returns +step / max(1, num_warmup_steps)+
# while +step < num_warmup_steps+, and 1.0 from then on.
#
# @param optimizer [Object] optimizer passed to LambdaLR
# @param num_warmup_steps [Integer] number of steps over which the factor ramps up
# @return [Torch::Optim::LRScheduler::LambdaLR]
def self.get_constant_schedule_with_warmup(optimizer, num_warmup_steps:)
  warmup_then_flat = lambda do |step|
    next 1.0 unless step < num_warmup_steps

    # max(1, ...) guards the division when num_warmup_steps is 0 or negative.
    step.to_f / [1, num_warmup_steps].max
  end

  Torch::Optim::LRScheduler::LambdaLR.new(optimizer, warmup_then_flat)
end

.get_cosine_schedule_with_warmup(optimizer, num_warmup_steps:, num_training_steps:, num_cycles: 0.5) ⇒ `Object`

Linear warmup then cosine decay to 0

# File 'lib/trainers/optimization/scheduler.rb', line 21

# Linear warmup followed by a cosine-shaped factor.
#
# The lambda given to LambdaLR returns +step / max(1, num_warmup_steps)+
# while +step < num_warmup_steps+. Afterwards it returns
# +max(0, 0.5 * (1 + cos(pi * num_cycles * 2 * progress)))+, where
# +progress+ is the fraction of the post-warmup steps already taken.
#
# @param optimizer [Object] optimizer passed to LambdaLR
# @param num_warmup_steps [Integer] number of steps over which the factor ramps up
# @param num_training_steps [Integer] total step count used to compute progress
# @param num_cycles [Numeric] cosine cycles over the decay span; the default
#   0.5 takes the factor from 1.0 to 0.0 at +num_training_steps+
# @return [Torch::Optim::LRScheduler::LambdaLR]
def self.get_cosine_schedule_with_warmup(optimizer, num_warmup_steps:, num_training_steps:, num_cycles: 0.5)
  warmup_then_cosine = lambda do |step|
    next step.to_f / [1, num_warmup_steps].max if step < num_warmup_steps

    steps_past_warmup = (step - num_warmup_steps).to_f
    # max(1, ...) guards the division when warmup covers the whole run.
    decay_span = [1, num_training_steps - num_warmup_steps].max
    progress = steps_past_warmup / decay_span

    cosine_factor = 0.5 * (1.0 + Math.cos(Math::PI * num_cycles * 2.0 * progress))
    [0.0, cosine_factor].max
  end

  Torch::Optim::LRScheduler::LambdaLR.new(optimizer, warmup_then_cosine)
end

.get_linear_schedule_with_warmup(optimizer, num_warmup_steps:, num_training_steps:) ⇒ `Object`

Linear warmup then linear decay to 0

# File 'lib/trainers/optimization/scheduler.rb', line 6

# Linear warmup followed by a linear decay to 0.
#
# The lambda given to LambdaLR returns +step / max(1, num_warmup_steps)+
# while +step < num_warmup_steps+. Afterwards it returns the remaining
# steps divided by the length of the decay span, floored at 0.0.
#
# @param optimizer [Object] optimizer passed to LambdaLR
# @param num_warmup_steps [Integer] number of steps over which the factor ramps up
# @param num_training_steps [Integer] step at which the factor reaches 0.0
# @return [Torch::Optim::LRScheduler::LambdaLR]
def self.get_linear_schedule_with_warmup(optimizer, num_warmup_steps:, num_training_steps:)
  warmup_then_linear = lambda do |step|
    next step.to_f / [1, num_warmup_steps].max if step < num_warmup_steps

    steps_left = num_training_steps - step
    # max(1.0, ...) guards the division when warmup covers the whole run.
    decay_span = [1.0, num_training_steps - num_warmup_steps].max

    # Floor at 0.0 so steps beyond num_training_steps never go negative.
    [0.0, steps_left.to_f / decay_span].max
  end

  Torch::Optim::LRScheduler::LambdaLR.new(optimizer, warmup_then_linear)
end

Module: Trainers::Optimization

Class Method Summary collapse

Class Method Details

.create_optimizer(model, args) ⇒ Object

.create_scheduler(type, optimizer, num_warmup_steps:, num_training_steps:) ⇒ Object

.get_constant_schedule_with_warmup(optimizer, num_warmup_steps:) ⇒ Object

.get_cosine_schedule_with_warmup(optimizer, num_warmup_steps:, num_training_steps:, num_cycles: 0.5) ⇒ Object

.get_linear_schedule_with_warmup(optimizer, num_warmup_steps:, num_training_steps:) ⇒ Object

.create_optimizer(model, args) ⇒ `Object`

.create_scheduler(type, optimizer, num_warmup_steps:, num_training_steps:) ⇒ `Object`

.get_constant_schedule_with_warmup(optimizer, num_warmup_steps:) ⇒ `Object`

.get_cosine_schedule_with_warmup(optimizer, num_warmup_steps:, num_training_steps:, num_cycles: 0.5) ⇒ `Object`

.get_linear_schedule_with_warmup(optimizer, num_warmup_steps:, num_training_steps:) ⇒ `Object`