Module: Ignis::JIT::Kernels::Loss

Defined in:
lib/nvruby/jit/kernels/loss.rb

Overview

Loss function CUDA kernels for training. All are fused implementations for numerical stability and performance.

Class Method Summary collapse

Class Method Details

.bce_backwardIgnis::JIT::Kernel

BCE backward: σ(x) - y

Returns:



165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
# File 'lib/nvruby/jit/kernels/loss.rb', line 165

def bce_backward
  source = <<~CUDA
    extern "C" __global__
    void bce_backward(const float* __restrict__ logits,
                      const float* __restrict__ targets,
                      const float* __restrict__ grad_output,
                      float* __restrict__ grad_input,
                      const int n) {
      int idx = blockIdx.x * blockDim.x + threadIdx.x;
      if (idx < n) {
        float sig = 1.0f / (1.0f + expf(-logits[idx]));
        grad_input[idx] = grad_output[idx] * (sig - targets[idx]);
      }
    }
  CUDA
  compile_cached(source, "bce_backward")
end

.bce_forwardIgnis::JIT::Kernel

Binary cross-entropy with logits: -[y*log(σ(x)) + (1-y)*log(1-σ(x))]

Returns:



143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
# File 'lib/nvruby/jit/kernels/loss.rb', line 143

def bce_forward
  source = <<~CUDA
    extern "C" __global__
    void bce_forward(const float* __restrict__ logits,
                     const float* __restrict__ targets,
                     float* __restrict__ losses,
                     const int n) {
      int idx = blockIdx.x * blockDim.x + threadIdx.x;
      if (idx < n) {
        float x = logits[idx];
        float y = targets[idx];
        // Numerically stable: max(x,0) - x*y + log(1+exp(-|x|))
        float max_val = fmaxf(x, 0.0f);
        losses[idx] = max_val - x * y + logf(1.0f + expf(-fabsf(x)));
      }
    }
  CUDA
  compile_cached(source, "bce_forward")
end

.cross_entropy_backwardIgnis::JIT::Kernel

Cross-entropy backward: softmax(logits) - one_hot(target) Combined softmax + gradient in one kernel

Returns:



68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# File 'lib/nvruby/jit/kernels/loss.rb', line 68

def cross_entropy_backward
  source = <<~CUDA
    extern "C" __global__
    void cross_entropy_backward(const float* __restrict__ log_softmax,
                                 const int* __restrict__ targets,
                                 const float* __restrict__ grad_output,
                                 float* __restrict__ grad_logits,
                                 const int batch_size,
                                 const int vocab_size,
                                 const float label_smoothing) {
      int idx = blockIdx.x * blockDim.x + threadIdx.x;
      int total = batch_size * vocab_size;
      if (idx < total) {
        int row = idx / vocab_size;
        int col = idx % vocab_size;
        int target = targets[row];

        float softmax_val = expf(log_softmax[idx]);
        float grad_scale = grad_output[row];

        if (label_smoothing > 0.0f) {
          float smooth_target = label_smoothing / (float)vocab_size;
          float hard_target = (col == target) ? (1.0f - label_smoothing + smooth_target) : smooth_target;
          grad_logits[idx] = grad_scale * (softmax_val - hard_target);
        } else {
          float indicator = (col == target) ? 1.0f : 0.0f;
          grad_logits[idx] = grad_scale * (softmax_val - indicator);
        }
      }
    }
  CUDA
  compile_cached(source, "cross_entropy_backward")
end

.cross_entropy_forwardIgnis::JIT::Kernel

Fused cross-entropy forward: log_softmax + NLL in a single kernel Avoids materializing full log-softmax output

Returns:



13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# File 'lib/nvruby/jit/kernels/loss.rb', line 13

def cross_entropy_forward
  source = <<~CUDA
    extern "C" __global__
    void cross_entropy_forward(const float* __restrict__ logits,
                                const int* __restrict__ targets,
                                float* __restrict__ losses,
                                float* __restrict__ log_softmax_out,
                                const int batch_size,
                                const int vocab_size,
                                const float label_smoothing) {
      int row = blockIdx.x * blockDim.x + threadIdx.x;
      if (row < batch_size) {
        const float* row_logits = logits + row * vocab_size;
        float* row_lsm = log_softmax_out + row * vocab_size;
        int target = targets[row];

        // Find max for numerical stability
        float max_val = row_logits[0];
        for (int j = 1; j < vocab_size; j++) {
          max_val = fmaxf(max_val, row_logits[j]);
        }

        // log_softmax = x - max - log(sum(exp(x - max)))
        float log_sum_exp = 0.0f;
        for (int j = 0; j < vocab_size; j++) {
          log_sum_exp += expf(row_logits[j] - max_val);
        }
        log_sum_exp = logf(log_sum_exp);

        // Compute log_softmax and store
        for (int j = 0; j < vocab_size; j++) {
          row_lsm[j] = row_logits[j] - max_val - log_sum_exp;
        }

        // NLL loss with optional label smoothing
        if (label_smoothing > 0.0f) {
          float smooth_loss = 0.0f;
          for (int j = 0; j < vocab_size; j++) {
            smooth_loss -= row_lsm[j];
          }
          smooth_loss /= (float)vocab_size;
          float nll = -row_lsm[target];
          losses[row] = (1.0f - label_smoothing) * nll + label_smoothing * smooth_loss;
        } else {
          losses[row] = -row_lsm[target];
        }
      }
    }
  CUDA
  compile_cached(source, "cross_entropy_forward")
end

.mean_reduceIgnis::JIT::Kernel

Mean reduction: compute mean of array

Returns:



185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
# File 'lib/nvruby/jit/kernels/loss.rb', line 185

def mean_reduce
  source = <<~CUDA
    extern "C" __global__
    void mean_reduce(const float* __restrict__ input,
                     float* __restrict__ output,
                     const int n) {
      // Single-thread simple reduction (for loss scalar)
      if (blockIdx.x == 0 && threadIdx.x == 0) {
        float sum = 0.0f;
        for (int i = 0; i < n; i++) {
          sum += input[i];
        }
        output[0] = sum / (float)n;
      }
    }
  CUDA
  compile_cached(source, "mean_reduce")
end

.mse_backwardIgnis::JIT::Kernel

MSE backward: 2 * (pred - target) / n

Returns:



123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
# File 'lib/nvruby/jit/kernels/loss.rb', line 123

def mse_backward
  source = <<~CUDA
    extern "C" __global__
    void mse_backward(const float* __restrict__ predictions,
                      const float* __restrict__ targets,
                      const float* __restrict__ grad_output,
                      float* __restrict__ grad_input,
                      const int n,
                      const float scale) {
      int idx = blockIdx.x * blockDim.x + threadIdx.x;
      if (idx < n) {
        grad_input[idx] = grad_output[idx] * 2.0f * (predictions[idx] - targets[idx]) * scale;
      }
    }
  CUDA
  compile_cached(source, "mse_backward")
end

.mse_forwardIgnis::JIT::Kernel

MSE forward: (pred - target)^2, per element

Returns:



104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# File 'lib/nvruby/jit/kernels/loss.rb', line 104

def mse_forward
  source = <<~CUDA
    extern "C" __global__
    void mse_forward(const float* __restrict__ predictions,
                     const float* __restrict__ targets,
                     float* __restrict__ losses,
                     const int n) {
      int idx = blockIdx.x * blockDim.x + threadIdx.x;
      if (idx < n) {
        float diff = predictions[idx] - targets[idx];
        losses[idx] = diff * diff;
      }
    }
  CUDA
  compile_cached(source, "mse_forward")
end