Module: Ignis::JIT::Kernels::Loss

Defined in:: lib/nvruby/jit/kernels/loss.rb

Overview

Loss function CUDA kernels for training. All are fused implementations for numerical stability and performance.

Class Method Summary collapse

.bce_backward ⇒ Ignis::JIT::Kernel

BCE backward: σ(x) - y.
.bce_forward ⇒ Ignis::JIT::Kernel

Binary cross-entropy with logits: -[y*log(σ(x)) + (1-y)*log(1-σ(x))].
.cross_entropy_backward ⇒ Ignis::JIT::Kernel

Cross-entropy backward: softmax(logits) - one_hot(target) Combined softmax + gradient in one kernel.
.cross_entropy_forward ⇒ Ignis::JIT::Kernel

Fused cross-entropy forward: log_softmax + NLL in a single kernel Avoids materializing full log-softmax output.
.mean_reduce ⇒ Ignis::JIT::Kernel

Mean reduction: compute mean of array.
.mse_backward ⇒ Ignis::JIT::Kernel

MSE backward: 2 * (pred - target) / n.
.mse_forward ⇒ Ignis::JIT::Kernel

MSE forward: (pred - target)^2, per element.

Class Method Details

.bce_backward ⇒ `Ignis::JIT::Kernel`

BCE backward: σ(x) - y

Returns:

(Ignis::JIT::Kernel)

# File 'lib/nvruby/jit/kernels/loss.rb', line 165

def bce_backward
  source = <<~CUDA
    extern "C" __global__
    void bce_backward(const float* __restrict__ logits,
                      const float* __restrict__ targets,
                      const float* __restrict__ grad_output,
                      float* __restrict__ grad_input,
                      const int n) {
      int idx = blockIdx.x * blockDim.x + threadIdx.x;
      if (idx < n) {
        float sig = 1.0f / (1.0f + expf(-logits[idx]));
        grad_input[idx] = grad_output[idx] * (sig - targets[idx]);
      }
    }
  CUDA
  compile_cached(source, "bce_backward")
end

.bce_forward ⇒ `Ignis::JIT::Kernel`

Binary cross-entropy with logits: -[y*log(σ(x)) + (1-y)*log(1-σ(x))]

Returns:

(Ignis::JIT::Kernel)

# File 'lib/nvruby/jit/kernels/loss.rb', line 143

def bce_forward
  source = <<~CUDA
    extern "C" __global__
    void bce_forward(const float* __restrict__ logits,
                     const float* __restrict__ targets,
                     float* __restrict__ losses,
                     const int n) {
      int idx = blockIdx.x * blockDim.x + threadIdx.x;
      if (idx < n) {
        float x = logits[idx];
        float y = targets[idx];
        // Numerically stable: max(x,0) - x*y + log(1+exp(-|x|))
        float max_val = fmaxf(x, 0.0f);
        losses[idx] = max_val - x * y + logf(1.0f + expf(-fabsf(x)));
      }
    }
  CUDA
  compile_cached(source, "bce_forward")
end

.cross_entropy_backward ⇒ `Ignis::JIT::Kernel`

Cross-entropy backward: softmax(logits) - one_hot(target) Combined softmax + gradient in one kernel

Returns:

(Ignis::JIT::Kernel)

# File 'lib/nvruby/jit/kernels/loss.rb', line 68

def cross_entropy_backward
  source = <<~CUDA
    extern "C" __global__
    void cross_entropy_backward(const float* __restrict__ log_softmax,
                                 const int* __restrict__ targets,
                                 const float* __restrict__ grad_output,
                                 float* __restrict__ grad_logits,
                                 const int batch_size,
                                 const int vocab_size,
                                 const float label_smoothing) {
      int idx = blockIdx.x * blockDim.x + threadIdx.x;
      int total = batch_size * vocab_size;
      if (idx < total) {
        int row = idx / vocab_size;
        int col = idx % vocab_size;
        int target = targets[row];

        float softmax_val = expf(log_softmax[idx]);
        float grad_scale = grad_output[row];

        if (label_smoothing > 0.0f) {
          float smooth_target = label_smoothing / (float)vocab_size;
          float hard_target = (col == target) ? (1.0f - label_smoothing + smooth_target) : smooth_target;
          grad_logits[idx] = grad_scale * (softmax_val - hard_target);
        } else {
          float indicator = (col == target) ? 1.0f : 0.0f;
          grad_logits[idx] = grad_scale * (softmax_val - indicator);
        }
      }
    }
  CUDA
  compile_cached(source, "cross_entropy_backward")
end

.cross_entropy_forward ⇒ `Ignis::JIT::Kernel`

Fused cross-entropy forward: log_softmax + NLL in a single kernel Avoids materializing full log-softmax output

Returns:

(Ignis::JIT::Kernel)

# File 'lib/nvruby/jit/kernels/loss.rb', line 13

def cross_entropy_forward
  source = <<~CUDA
    extern "C" __global__
    void cross_entropy_forward(const float* __restrict__ logits,
                                const int* __restrict__ targets,
                                float* __restrict__ losses,
                                float* __restrict__ log_softmax_out,
                                const int batch_size,
                                const int vocab_size,
                                const float label_smoothing) {
      int row = blockIdx.x * blockDim.x + threadIdx.x;
      if (row < batch_size) {
        const float* row_logits = logits + row * vocab_size;
        float* row_lsm = log_softmax_out + row * vocab_size;
        int target = targets[row];

        // Find max for numerical stability
        float max_val = row_logits[0];
        for (int j = 1; j < vocab_size; j++) {
          max_val = fmaxf(max_val, row_logits[j]);
        }

        // log_softmax = x - max - log(sum(exp(x - max)))
        float log_sum_exp = 0.0f;
        for (int j = 0; j < vocab_size; j++) {
          log_sum_exp += expf(row_logits[j] - max_val);
        }
        log_sum_exp = logf(log_sum_exp);

        // Compute log_softmax and store
        for (int j = 0; j < vocab_size; j++) {
          row_lsm[j] = row_logits[j] - max_val - log_sum_exp;
        }

        // NLL loss with optional label smoothing
        if (label_smoothing > 0.0f) {
          float smooth_loss = 0.0f;
          for (int j = 0; j < vocab_size; j++) {
            smooth_loss -= row_lsm[j];
          }
          smooth_loss /= (float)vocab_size;
          float nll = -row_lsm[target];
          losses[row] = (1.0f - label_smoothing) * nll + label_smoothing * smooth_loss;
        } else {
          losses[row] = -row_lsm[target];
        }
      }
    }
  CUDA
  compile_cached(source, "cross_entropy_forward")
end

.mean_reduce ⇒ `Ignis::JIT::Kernel`

Mean reduction: compute mean of array

Returns:

(Ignis::JIT::Kernel)

# File 'lib/nvruby/jit/kernels/loss.rb', line 185

def mean_reduce
  source = <<~CUDA
    extern "C" __global__
    void mean_reduce(const float* __restrict__ input,
                     float* __restrict__ output,
                     const int n) {
      // Single-thread simple reduction (for loss scalar)
      if (blockIdx.x == 0 && threadIdx.x == 0) {
        float sum = 0.0f;
        for (int i = 0; i < n; i++) {
          sum += input[i];
        }
        output[0] = sum / (float)n;
      }
    }
  CUDA
  compile_cached(source, "mean_reduce")
end

.mse_backward ⇒ `Ignis::JIT::Kernel`

MSE backward: 2 * (pred - target) / n

Returns:

(Ignis::JIT::Kernel)

# File 'lib/nvruby/jit/kernels/loss.rb', line 123

def mse_backward
  source = <<~CUDA
    extern "C" __global__
    void mse_backward(const float* __restrict__ predictions,
                      const float* __restrict__ targets,
                      const float* __restrict__ grad_output,
                      float* __restrict__ grad_input,
                      const int n,
                      const float scale) {
      int idx = blockIdx.x * blockDim.x + threadIdx.x;
      if (idx < n) {
        grad_input[idx] = grad_output[idx] * 2.0f * (predictions[idx] - targets[idx]) * scale;
      }
    }
  CUDA
  compile_cached(source, "mse_backward")
end

.mse_forward ⇒ `Ignis::JIT::Kernel`

MSE forward: (pred - target)^2, per element

Returns:

(Ignis::JIT::Kernel)

# File 'lib/nvruby/jit/kernels/loss.rb', line 104

def mse_forward
  source = <<~CUDA
    extern "C" __global__
    void mse_forward(const float* __restrict__ predictions,
                     const float* __restrict__ targets,
                     float* __restrict__ losses,
                     const int n) {
      int idx = blockIdx.x * blockDim.x + threadIdx.x;
      if (idx < n) {
        float diff = predictions[idx] - targets[idx];
        losses[idx] = diff * diff;
      }
    }
  CUDA
  compile_cached(source, "mse_forward")
end

Module: Ignis::JIT::Kernels::Loss

Overview

Class Method Summary collapse

Class Method Details

.bce_backward ⇒ Ignis::JIT::Kernel

.bce_forward ⇒ Ignis::JIT::Kernel

.cross_entropy_backward ⇒ Ignis::JIT::Kernel

.cross_entropy_forward ⇒ Ignis::JIT::Kernel

.mean_reduce ⇒ Ignis::JIT::Kernel

.mse_backward ⇒ Ignis::JIT::Kernel

.mse_forward ⇒ Ignis::JIT::Kernel

.bce_backward ⇒ `Ignis::JIT::Kernel`

.bce_forward ⇒ `Ignis::JIT::Kernel`

.cross_entropy_backward ⇒ `Ignis::JIT::Kernel`

.cross_entropy_forward ⇒ `Ignis::JIT::Kernel`

.mean_reduce ⇒ `Ignis::JIT::Kernel`

.mse_backward ⇒ `Ignis::JIT::Kernel`

.mse_forward ⇒ `Ignis::JIT::Kernel`