Module: Ignis::AI::Loss

Defined in:
lib/nnw/ai/loss.rb

Overview

Loss functions for training. Each returns a scalar Tensor with autograd support.

Class Method Summary

Class Method Details

.binary_cross_entropy(logits, targets) ⇒ Tensor

Binary cross-entropy with logits (sigmoid applied inside).

Parameters:

  • logits (Tensor)
  • targets (Tensor)

    (0.0 or 1.0)

Returns:

  • (Tensor)

    scalar loss



120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# File 'lib/nnw/ai/loss.rb', line 120

# Binary cross-entropy on raw logits (the sigmoid is applied by the kernel,
# not by the caller), reduced to a scalar with the mean_reduce kernel.
#
# Forward: bce_forward fills a per-element loss buffer shaped like logits,
# then mean_reduce collapses it into a one-element buffer. When logits
# requires grad, a backward closure that launches bce_backward is recorded
# on the Tape with logits as the only differentiable input.
#
# @param logits [Tensor] raw scores
# @param targets [Tensor] labels (0.0 or 1.0), same element count as logits
# @return [Tensor] non-leaf tensor wrapping the one-element mean buffer
# @raise [ArgumentError] if logits and targets differ in element count
def binary_cross_entropy(logits, targets)
  n = logits.numel
  # A single element count is handed to the kernels for both buffers, so the
  # two tensors must agree on it; reject a mismatch before touching the device.
  unless targets.numel == n
    raise ArgumentError, "binary_cross_entropy: logits has #{n} elements but targets has #{targets.numel}"
  end

  # Per-element loss buffer, zero-filled from the host before the kernel runs.
  losses_nv = Ignis::Shared::NvArray.new(shape: logits.shape, dtype: :float32,
                                        device_id: logits.device_id)
  losses_nv.from_host(Array.new(n, 0.0))

  # Grid is ceil(n / 256) with a block size of 256.
  kernel = Ignis::JIT::Kernels::Loss.bce_forward
  kernel.launch(grid: [(n + 255) / 256], block: [256],
                args: [logits.data, targets.data, losses_nv, n])

  # One-element output buffer for the mean; the reduction runs as grid 1 / block 1.
  mean_nv = Ignis::Shared::NvArray.new(shape: [1], dtype: :float32,
                                      device_id: logits.device_id)
  mean_nv.from_host([0.0])
  mean_k = Ignis::JIT::Kernels::Loss.mean_reduce
  mean_k.launch(grid: [1], block: [1], args: [losses_nv, mean_nv, n])

  result = Tensor.new(data: mean_nv, requires_grad: logits.requires_grad, is_leaf: false)

  if logits.requires_grad
    # Capture the raw buffers so the closure does not depend on the Tensor wrappers.
    saved_logits = logits.data
    saved_targets = targets.data
    Tape.record(result, inputs: [logits]) do |grad|
      grad_input = Ignis::Shared::NvArray.new(shape: logits.shape, dtype: :float32,
                                             device_id: logits.device_id)
      grad_input.from_host(Array.new(n, 0.0))
      bk = Ignis::JIT::Kernels::Loss.bce_backward
      # NOTE(review): unlike mse, no explicit 1/n factor is passed here —
      # confirm bce_backward applies the mean scaling itself.
      bk.launch(grid: [(n + 255) / 256], block: [256],
                args: [saved_logits, saved_targets, grad, grad_input, n])
      [grad_input]
    end
  end

  result
end

.cross_entropy(logits, targets, label_smoothing: 0.0) ⇒ Tensor

Cross-entropy loss (classification, language modeling). Fused log-softmax + NLL for numerical stability.

Parameters:

  • logits (Tensor)

    model output [batch_size, vocab_size]

  • targets (Tensor)

    target indices [batch_size] (int32)

  • label_smoothing (Float) (defaults to: 0.0)

    label smoothing factor (0.0 = none)

Returns:

  • (Tensor)

    scalar loss



15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# File 'lib/nnw/ai/loss.rb', line 15

# Cross-entropy loss averaged over rows, with optional label smoothing.
#
# The row count is read from logits.shape[0] and the class count from
# logits.shape[-1]. The forward kernel writes one loss per row plus a
# log-softmax buffer shaped like logits; mean_reduce then collapses the
# per-row losses into a one-element buffer. When logits requires grad, the
# log-softmax buffer and the target buffer are kept for a backward closure
# recorded on the Tape.
#
# @param logits [Tensor] model output [batch_size, vocab_size]
# @param targets [Tensor] target indices [batch_size] (int32)
# @param label_smoothing [Float] smoothing factor forwarded to both kernels
# @return [Tensor] non-leaf tensor wrapping the one-element mean buffer
def cross_entropy(logits, targets, label_smoothing: 0.0)
  rows = logits.shape[0]
  classes = logits.shape[-1]
  threads = 256

  # Builds a float32 buffer on the logits' device and fills it from the host.
  filled = lambda do |dims, count, value|
    Ignis::Shared::NvArray.new(shape: dims, dtype: :float32, device_id: logits.device_id)
                          .tap { |buf| buf.from_host(Array.new(count, value)) }
  end

  per_row_loss = filled.call([rows], rows, 0.0)
  log_probs = filled.call(logits.shape, logits.numel, 0.0)

  # Forward pass: grid is sized by row count, ceil(rows / 256).
  Ignis::JIT::Kernels::Loss.cross_entropy_forward.launch(
    grid: [(rows + threads - 1) / threads],
    block: [threads],
    args: [logits.data, targets.data, per_row_loss, log_probs, rows, classes, label_smoothing.to_f]
  )

  # Mean over the per-row losses.
  mean_buf = filled.call([1], 1, 0.0)
  Ignis::JIT::Kernels::Loss.mean_reduce.launch(grid: [1], block: [1],
                                               args: [per_row_loss, mean_buf, rows])

  loss = Tensor.new(data: mean_buf, requires_grad: logits.requires_grad, is_leaf: false)
  return loss unless logits.requires_grad

  target_buf = targets.data
  Tape.record(loss, inputs: [logits]) do |grad|
    # Every row gets the same factor: the upstream cotangent (read back from
    # the one-element grad) divided by the row count. Folding the upstream
    # value in keeps any downstream scaling of the loss (e.g. loss * k or
    # gradient-accumulation division) in the logits' gradient.
    per_row_scale = filled.call([rows], rows, grad.to_host[0].to_f / rows)
    logit_grads = filled.call(logits.shape, logits.numel, 0.0)

    # Backward pass: grid is sized by the total number of logits.
    Ignis::JIT::Kernels::Loss.cross_entropy_backward.launch(
      grid: [(rows * classes + threads - 1) / threads],
      block: [threads],
      args: [log_probs, target_buf, per_row_scale, logit_grads, rows, classes, label_smoothing.to_f]
    )
    [logit_grads]
  end

  loss
end

.kl_divergence(log_q, p) ⇒ Tensor

KL divergence: KL(p || q) = sum(p * log(p/q))

Parameters:

  • log_q (Tensor)

    log probabilities of model

  • p (Tensor)

    target distribution

Returns:

  • (Tensor)

    scalar loss



160
161
162
163
164
165
166
# File 'lib/nnw/ai/loss.rb', line 160

# Kullback-Leibler divergence KL(p || q) = sum(p * (log(p) - log_q)),
# composed from tensor operations so autograd is handled by those ops.
#
# @param log_q [Tensor] log-probabilities of the model distribution
# @param p [Tensor] target distribution (probabilities)
# @return [Tensor] the summed p * (log(p) - log_q)
def kl_divergence(log_q, p)
  # Clamp p at zero and add a small epsilon so the logarithm never sees 0.
  eps = Tensor.from_host([1e-8], shape: [1], device_id: p.device_id)

  # log(p) must be taken here: multiplying p by (relu(p) + eps) instead
  # yields p**2 - log_q, which is not the documented KL term.
  # NOTE(review): assumes Tensor#log exists with autograd support — confirm.
  log_p = (p.relu + eps).log

  (p * (log_p - log_q)).sum
end

.mse(predictions, targets) ⇒ Tensor

Mean squared error loss.

Parameters:

  • predictions (Tensor)
  • targets (Tensor)

Returns:

  • (Tensor)

    scalar loss



78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# File 'lib/nnw/ai/loss.rb', line 78

# Mean squared error between predictions and targets, reduced to a scalar.
#
# Forward: mse_forward fills a per-element loss buffer shaped like
# predictions, then mean_reduce collapses it into a one-element buffer.
# When predictions requires grad, a backward closure that launches
# mse_backward (with a 1/n scale for the mean) is recorded on the Tape with
# predictions as the only differentiable input.
#
# @param predictions [Tensor] model output
# @param targets [Tensor] reference values, same element count as predictions
# @return [Tensor] non-leaf tensor wrapping the one-element mean buffer
# @raise [ArgumentError] if predictions and targets differ in element count
def mse(predictions, targets)
  n = predictions.numel
  # A single element count is handed to the kernels for both buffers, so the
  # two tensors must agree on it; reject a mismatch before touching the device.
  unless targets.numel == n
    raise ArgumentError, "mse: predictions has #{n} elements but targets has #{targets.numel}"
  end

  # Per-element loss buffer, zero-filled from the host before the kernel runs.
  losses_nv = Ignis::Shared::NvArray.new(shape: predictions.shape, dtype: :float32,
                                        device_id: predictions.device_id)
  losses_nv.from_host(Array.new(n, 0.0))

  # Grid is ceil(n / 256) with a block size of 256.
  kernel = Ignis::JIT::Kernels::Loss.mse_forward
  kernel.launch(grid: [(n + 255) / 256], block: [256],
                args: [predictions.data, targets.data, losses_nv, n])

  # One-element output buffer for the mean; the reduction runs as grid 1 / block 1.
  mean_nv = Ignis::Shared::NvArray.new(shape: [1], dtype: :float32,
                                      device_id: predictions.device_id)
  mean_nv.from_host([0.0])
  mean_k = Ignis::JIT::Kernels::Loss.mean_reduce
  mean_k.launch(grid: [1], block: [1], args: [losses_nv, mean_nv, n])

  result = Tensor.new(data: mean_nv, requires_grad: predictions.requires_grad, is_leaf: false)

  if predictions.requires_grad
    # Capture the raw buffers so the closure does not depend on the Tensor wrappers.
    saved_pred = predictions.data
    saved_tgt = targets.data
    Tape.record(result, inputs: [predictions]) do |grad|
      grad_input = Ignis::Shared::NvArray.new(shape: predictions.shape, dtype: :float32,
                                             device_id: predictions.device_id)
      grad_input.from_host(Array.new(n, 0.0))
      bk = Ignis::JIT::Kernels::Loss.mse_backward
      # Mean-reduction factor; 1.0 / n is already a Float.
      scale = 1.0 / n
      bk.launch(grid: [(n + 255) / 256], block: [256],
                args: [saved_pred, saved_tgt, grad, grad_input, n, scale])
      [grad_input]
    end
  end

  result
end