Module: Trainers::Optimization

Defined in:
lib/trainers/optimization/optimizer.rb,
lib/trainers/optimization/scheduler.rb

Class Method Summary collapse

Class Method Details

.create_optimizer(model, args) ⇒ Object

Builds AdamW with two param groups:

- Parameters with weight decay (Linear weights, Embedding weights)
- Parameters without weight decay (biases, LayerNorm/layer_norm params)

This split is critical for transformer fine-tuning — regularizing biases and normalization weights hurts convergence.



11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# File 'lib/trainers/optimization/optimizer.rb', line 11

# Builds an AdamW optimizer over the model's trainable parameters, split into
# a weight-decayed group and a decay-free group.
#
# A parameter lands in the decay-free group when its name contains "bias",
# "LayerNorm", "layer_norm" or "layernorm"; every other parameter is decayed
# with args.weight_decay. Parameters whose requires_grad is falsy are skipped.
# Empty groups are not passed to the optimizer.
#
# @param model [#named_parameters] enumerates (name, param) pairs
# @param args [#weight_decay, #learning_rate, #adam_beta1, #adam_beta2, #adam_epsilon]
# @return [Torch::Optim::AdamW]
# @raise [RuntimeError] when the model has no trainable parameter
def self.create_optimizer(model, args)
  exempt_markers = ["bias", "LayerNorm", "layer_norm", "layernorm"]

  regularized = []
  exempt      = []

  model.named_parameters.each do |name, param|
    next unless param.requires_grad

    # Pick the destination bucket first, then append once.
    bucket = exempt_markers.none? { |marker| name.include?(marker) } ? regularized : exempt
    bucket << param
  end

  groups = []
  groups << { params: regularized, weight_decay: args.weight_decay } unless regularized.empty?
  groups << { params: exempt, weight_decay: 0.0 } unless exempt.empty?

  raise "No trainable parameters found. Did you forget to unfreeze the model or apply LoRA?" if groups.empty?

  Torch::Optim::AdamW.new(
    groups,
    lr:    args.learning_rate,
    betas: [args.adam_beta1, args.adam_beta2],
    eps:   args.adam_epsilon
  )
end

.create_scheduler(type, optimizer, num_warmup_steps:, num_training_steps:) ⇒ Object

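Dispatcher: picks the scheduler factory matching the given type (:linear, :cosine, or :constant) and raises ArgumentError for an unrecognized type.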



49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# File 'lib/trainers/optimization/scheduler.rb', line 49

# Dispatches to the scheduler factory selected by +type+.
#
# Accepts the type as a Symbol (:linear, :cosine, :constant) or as the
# equivalent String ("linear", "cosine", "constant"), so a value read from a
# config file or the command line can be passed through unchanged.
#
# @param type [Symbol, String] scheduler kind
# @param optimizer [Object] optimizer handed through to the chosen factory
# @param num_warmup_steps [Integer] forwarded to every factory
# @param num_training_steps [Integer] forwarded to the linear and cosine
#   factories; not used by the constant schedule
# @return [Object] whatever the chosen factory returns
# @raise [ArgumentError] when +type+ names no known scheduler
def self.create_scheduler(type, optimizer, num_warmup_steps:, num_training_steps:)
  # Only Strings are normalized; any other object is matched as-is so that
  # unsupported values still reach the ArgumentError below.
  kind = type.is_a?(String) ? type.to_sym : type

  case kind
  when :linear
    get_linear_schedule_with_warmup(optimizer,
      num_warmup_steps: num_warmup_steps,
      num_training_steps: num_training_steps)
  when :cosine
    get_cosine_schedule_with_warmup(optimizer,
      num_warmup_steps: num_warmup_steps,
      num_training_steps: num_training_steps)
  when :constant
    get_constant_schedule_with_warmup(optimizer,
      num_warmup_steps: num_warmup_steps)
  else
    raise ArgumentError, "Unknown scheduler type: #{type}. Use :linear, :cosine, or :constant"
  end
end

.get_constant_schedule_with_warmup(optimizer, num_warmup_steps:) ⇒ Object

Linear warmup then constant LR



36
37
38
39
40
41
42
43
44
45
46
# File 'lib/trainers/optimization/scheduler.rb', line 36

# Wraps the optimizer in a LambdaLR whose multiplier ramps linearly during
# warmup and then stays at 1.0.
#
# For step < num_warmup_steps the multiplier is step / max(1, num_warmup_steps);
# from num_warmup_steps onward it is 1.0.
#
# @param optimizer [Object] passed straight to LambdaLR
# @param num_warmup_steps [Integer] length of the linear ramp
# @return [Torch::Optim::LRScheduler::LambdaLR]
def self.get_constant_schedule_with_warmup(optimizer, num_warmup_steps:)
  multiplier = lambda do |step|
    # Past warmup: hold the multiplier at full scale.
    next 1.0 unless step < num_warmup_steps

    step.to_f / [1, num_warmup_steps].max
  end

  Torch::Optim::LRScheduler::LambdaLR.new(optimizer, multiplier)
end

.get_cosine_schedule_with_warmup(optimizer, num_warmup_steps:, num_training_steps:, num_cycles: 0.5) ⇒ Object

Linear warmup then cosine decay to 0



21
22
23
24
25
26
27
28
29
30
31
32
33
# File 'lib/trainers/optimization/scheduler.rb', line 21

# Wraps the optimizer in a LambdaLR whose multiplier ramps linearly during
# warmup and then follows a cosine curve.
#
# For step < num_warmup_steps the multiplier is step / max(1, num_warmup_steps).
# Afterwards, with progress = (step - warmup) / max(1, training - warmup), it
# is max(0, 0.5 * (1 + cos(PI * num_cycles * 2 * progress))). With the default
# num_cycles of 0.5 this reaches 0 at progress == 1.
#
# @param optimizer [Object] passed straight to LambdaLR
# @param num_warmup_steps [Integer] length of the linear ramp
# @param num_training_steps [Integer] step at which progress reaches 1
# @param num_cycles [Numeric] number of cosine cycles across the decay span
# @return [Torch::Optim::LRScheduler::LambdaLR]
def self.get_cosine_schedule_with_warmup(optimizer, num_warmup_steps:, num_training_steps:, num_cycles: 0.5)
  multiplier = lambda do |step|
    next step.to_f / [1, num_warmup_steps].max if step < num_warmup_steps

    decay_span = [1, num_training_steps - num_warmup_steps].max
    fraction   = (step - num_warmup_steps).to_f / decay_span
    cosine     = Math.cos(Math::PI * num_cycles * 2.0 * fraction)

    # Floor at zero so the multiplier never goes negative.
    [0.0, 0.5 * (1.0 + cosine)].max
  end

  Torch::Optim::LRScheduler::LambdaLR.new(optimizer, multiplier)
end

.get_linear_schedule_with_warmup(optimizer, num_warmup_steps:, num_training_steps:) ⇒ Object

Linear warmup then linear decay to 0



6
7
8
9
10
11
12
13
14
15
16
17
18
# File 'lib/trainers/optimization/scheduler.rb', line 6

# Wraps the optimizer in a LambdaLR whose multiplier ramps linearly during
# warmup and then decays linearly to 0.
#
# For step < num_warmup_steps the multiplier is step / max(1, num_warmup_steps).
# Afterwards it is (training - step) / max(1, training - warmup), floored at 0
# so steps beyond num_training_steps yield 0.0.
#
# @param optimizer [Object] passed straight to LambdaLR
# @param num_warmup_steps [Integer] length of the linear ramp
# @param num_training_steps [Integer] step at which the multiplier reaches 0
# @return [Torch::Optim::LRScheduler::LambdaLR]
def self.get_linear_schedule_with_warmup(optimizer, num_warmup_steps:, num_training_steps:)
  multiplier = lambda do |step|
    next step.to_f / [1, num_warmup_steps].max if step < num_warmup_steps

    steps_left = num_training_steps - step
    decay_span = [1.0, num_training_steps - num_warmup_steps].max

    # Floor at zero once training runs past num_training_steps.
    [0.0, steps_left.to_f / decay_span].max
  end

  Torch::Optim::LRScheduler::LambdaLR.new(optimizer, multiplier)
end