Module: Ignis::JIT::Kernels::Loss
- Defined in:
- lib/nvruby/jit/kernels/loss.rb
Overview
Loss function CUDA kernels for training. All are fused implementations for numerical stability and performance.
Class Method Summary collapse
-
.bce_backward ⇒ Ignis::JIT::Kernel
BCE backward: σ(x) - y.
-
.bce_forward ⇒ Ignis::JIT::Kernel
Binary cross-entropy with logits: -[y*log(σ(x)) + (1-y)*log(1-σ(x))].
-
.cross_entropy_backward ⇒ Ignis::JIT::Kernel
Cross-entropy backward: softmax(logits) - one_hot(target) Combined softmax + gradient in one kernel.
-
.cross_entropy_forward ⇒ Ignis::JIT::Kernel
Fused cross-entropy forward: log_softmax + NLL in a single kernel Avoids materializing full log-softmax output.
-
.mean_reduce ⇒ Ignis::JIT::Kernel
Mean reduction: compute mean of array.
-
.mse_backward ⇒ Ignis::JIT::Kernel
MSE backward: 2 * (pred - target) / n.
-
.mse_forward ⇒ Ignis::JIT::Kernel
MSE forward: (pred - target)^2, per element.
Class Method Details
.bce_backward ⇒ Ignis::JIT::Kernel
BCE backward: σ(x) - y
165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 |
# File 'lib/nvruby/jit/kernels/loss.rb', line 165 def bce_backward source = <<~CUDA extern "C" __global__ void bce_backward(const float* __restrict__ logits, const float* __restrict__ targets, const float* __restrict__ grad_output, float* __restrict__ grad_input, const int n) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < n) { float sig = 1.0f / (1.0f + expf(-logits[idx])); grad_input[idx] = grad_output[idx] * (sig - targets[idx]); } } CUDA compile_cached(source, "bce_backward") end |
.bce_forward ⇒ Ignis::JIT::Kernel
Binary cross-entropy with logits: -[y*log(σ(x)) + (1-y)*log(1-σ(x))]
143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 |
# File 'lib/nvruby/jit/kernels/loss.rb', line 143 def bce_forward source = <<~CUDA extern "C" __global__ void bce_forward(const float* __restrict__ logits, const float* __restrict__ targets, float* __restrict__ losses, const int n) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < n) { float x = logits[idx]; float y = targets[idx]; // Numerically stable: max(x,0) - x*y + log(1+exp(-|x|)) float max_val = fmaxf(x, 0.0f); losses[idx] = max_val - x * y + logf(1.0f + expf(-fabsf(x))); } } CUDA compile_cached(source, "bce_forward") end |
.cross_entropy_backward ⇒ Ignis::JIT::Kernel
Cross-entropy backward: softmax(logits) - one_hot(target) Combined softmax + gradient in one kernel
68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 |
# File 'lib/nvruby/jit/kernels/loss.rb', line 68 def cross_entropy_backward source = <<~CUDA extern "C" __global__ void cross_entropy_backward(const float* __restrict__ log_softmax, const int* __restrict__ targets, const float* __restrict__ grad_output, float* __restrict__ grad_logits, const int batch_size, const int vocab_size, const float label_smoothing) { int idx = blockIdx.x * blockDim.x + threadIdx.x; int total = batch_size * vocab_size; if (idx < total) { int row = idx / vocab_size; int col = idx % vocab_size; int target = targets[row]; float softmax_val = expf(log_softmax[idx]); float grad_scale = grad_output[row]; if (label_smoothing > 0.0f) { float smooth_target = label_smoothing / (float)vocab_size; float hard_target = (col == target) ? (1.0f - label_smoothing + smooth_target) : smooth_target; grad_logits[idx] = grad_scale * (softmax_val - hard_target); } else { float indicator = (col == target) ? 1.0f : 0.0f; grad_logits[idx] = grad_scale * (softmax_val - indicator); } } } CUDA compile_cached(source, "cross_entropy_backward") end |
.cross_entropy_forward ⇒ Ignis::JIT::Kernel
Fused cross-entropy forward: log_softmax + NLL in a single kernel Avoids materializing full log-softmax output
13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 |
# File 'lib/nvruby/jit/kernels/loss.rb', line 13 def cross_entropy_forward source = <<~CUDA extern "C" __global__ void cross_entropy_forward(const float* __restrict__ logits, const int* __restrict__ targets, float* __restrict__ losses, float* __restrict__ log_softmax_out, const int batch_size, const int vocab_size, const float label_smoothing) { int row = blockIdx.x * blockDim.x + threadIdx.x; if (row < batch_size) { const float* row_logits = logits + row * vocab_size; float* row_lsm = log_softmax_out + row * vocab_size; int target = targets[row]; // Find max for numerical stability float max_val = row_logits[0]; for (int j = 1; j < vocab_size; j++) { max_val = fmaxf(max_val, row_logits[j]); } // log_softmax = x - max - log(sum(exp(x - max))) float log_sum_exp = 0.0f; for (int j = 0; j < vocab_size; j++) { log_sum_exp += expf(row_logits[j] - max_val); } log_sum_exp = logf(log_sum_exp); // Compute log_softmax and store for (int j = 0; j < vocab_size; j++) { row_lsm[j] = row_logits[j] - max_val - log_sum_exp; } // NLL loss with optional label smoothing if (label_smoothing > 0.0f) { float smooth_loss = 0.0f; for (int j = 0; j < vocab_size; j++) { smooth_loss -= row_lsm[j]; } smooth_loss /= (float)vocab_size; float nll = -row_lsm[target]; losses[row] = (1.0f - label_smoothing) * nll + label_smoothing * smooth_loss; } else { losses[row] = -row_lsm[target]; } } } CUDA compile_cached(source, "cross_entropy_forward") end |
.mean_reduce ⇒ Ignis::JIT::Kernel
Mean reduction: compute mean of array
185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 |
# File 'lib/nvruby/jit/kernels/loss.rb', line 185 def mean_reduce source = <<~CUDA extern "C" __global__ void mean_reduce(const float* __restrict__ input, float* __restrict__ output, const int n) { // Single-thread simple reduction (for loss scalar) if (blockIdx.x == 0 && threadIdx.x == 0) { float sum = 0.0f; for (int i = 0; i < n; i++) { sum += input[i]; } output[0] = sum / (float)n; } } CUDA compile_cached(source, "mean_reduce") end |
.mse_backward ⇒ Ignis::JIT::Kernel
MSE backward: 2 * (pred - target) / n
123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 |
# File 'lib/nvruby/jit/kernels/loss.rb', line 123 def mse_backward source = <<~CUDA extern "C" __global__ void mse_backward(const float* __restrict__ predictions, const float* __restrict__ targets, const float* __restrict__ grad_output, float* __restrict__ grad_input, const int n, const float scale) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < n) { grad_input[idx] = grad_output[idx] * 2.0f * (predictions[idx] - targets[idx]) * scale; } } CUDA compile_cached(source, "mse_backward") end |
.mse_forward ⇒ Ignis::JIT::Kernel
MSE forward: (pred - target)^2, per element
104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 |
# File 'lib/nvruby/jit/kernels/loss.rb', line 104 def mse_forward source = <<~CUDA extern "C" __global__ void mse_forward(const float* __restrict__ predictions, const float* __restrict__ targets, float* __restrict__ losses, const int n) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < n) { float diff = predictions[idx] - targets[idx]; losses[idx] = diff * diff; } } CUDA compile_cached(source, "mse_forward") end |