Module: Ignis::JIT::Kernels::Normalization

Defined in: lib/nvruby/jit/kernels/normalization.rb

Overview

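Layer normalization and RMS normalization CUDA kernels. The LayerNorm forward kernel computes the mean and variance, then normalizes, scales, and shifts; its backward kernel computes gradients for the input, weight (gamma), and bias (beta). The RMSNorm kernels do the same without mean subtraction and without a bias term.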

Class Method Summary

.layer_norm_backward ⇒ Ignis::JIT::Kernel

LayerNorm backward: computes dL/dx, dL/dgamma, dL/dbeta.
.layer_norm_forward ⇒ Ignis::JIT::Kernel

LayerNorm forward: y = gamma * (x - mean) / sqrt(var + eps) + beta. Each row (last dim) is normalized independently.
.rms_norm_backward ⇒ Ignis::JIT::Kernel

RMSNorm backward: dL/dx and dL/dgamma (no bias in RMSNorm).
.rms_norm_forward ⇒ Ignis::JIT::Kernel

RMSNorm forward: y = gamma * x / sqrt(mean(x^2) + eps). Used in LLaMA/Mistral architectures.

Class Method Details

.layer_norm_backward ⇒ `Ignis::JIT::Kernel`

LayerNorm backward: computes dL/dx, dL/dgamma, dL/dbeta

Returns:

(Ignis::JIT::Kernel)

# File 'lib/nvruby/jit/kernels/normalization.rb', line 65

# LayerNorm backward: computes dL/dx, dL/dgamma, dL/dbeta.
#
# The generated kernel uses a 1-D launch with one thread per row: each thread
# walks the +norm_size+ elements of its row sequentially. +grad_gamma+ and
# +grad_beta+ are only ever accumulated into (via atomicAdd); the kernel never
# zeroes them, so they must hold the desired starting value before launch.
#
# NOTE(review): +mean+ and +rstd+ are presumably the per-row statistics saved
# by the layer_norm_forward kernel -- confirm against the caller.
#
# @return [Ignis::JIT::Kernel]
def layer_norm_backward
  source = <<~CUDA
    extern "C" __global__
    void layer_norm_backward(const float* __restrict__ grad_output,
                             const float* __restrict__ input,
                             const float* __restrict__ gamma,
                             const float* __restrict__ mean,
                             const float* __restrict__ rstd,
                             float* __restrict__ grad_input,
                             float* __restrict__ grad_gamma,
                             float* __restrict__ grad_beta,
                             const int outer_size,
                             const int norm_size) {
      int row = blockIdx.x * blockDim.x + threadIdx.x;
      if (row < outer_size) {
        // 64-bit row offset: row * norm_size in 32-bit int overflows once the
        // tensor holds more than 2^31 - 1 elements.
        const long long offset = (long long)row * (long long)norm_size;
        const float* go = grad_output + offset;
        const float* in_row = input + offset;
        float* gi = grad_input + offset;
        float m = mean[row];
        float rs = rstd[row];

        // Row-wide reductions needed by the dL/dx formula:
        //   sum_go   = sum_j go_j * gamma_j
        //   sum_go_x = sum_j go_j * gamma_j * x_hat_j
        float sum_go_x = 0.0f;
        float sum_go = 0.0f;
        for (int j = 0; j < norm_size; j++) {
          float x_hat = (in_row[j] - m) * rs;
          float go_gamma = go[j] * gamma[j];
          sum_go_x += go_gamma * x_hat;
          sum_go += go_gamma;
        }

        float inv_n = 1.0f / (float)norm_size;

        // Single pass: write grad_input and accumulate the parameter
        // gradients, so x_hat is computed once per element.
        // grad_gamma / grad_beta are shared by every row, hence atomicAdd.
        for (int j = 0; j < norm_size; j++) {
          float g = go[j];
          float x_hat = (in_row[j] - m) * rs;
          gi[j] = rs * (g * gamma[j] - inv_n * (sum_go + x_hat * sum_go_x));
          atomicAdd(&grad_gamma[j], g * x_hat);
          atomicAdd(&grad_beta[j], g);
        }
      }
    }
  CUDA
  compile_cached(source, "layer_norm_backward")
end

.layer_norm_forward ⇒ `Ignis::JIT::Kernel`

LayerNorm forward: y = gamma * (x - mean) / sqrt(var + eps) + beta. Each row (last dim) is normalized independently.

Returns:

(Ignis::JIT::Kernel)

# File 'lib/nvruby/jit/kernels/normalization.rb', line 14

# LayerNorm forward: y = gamma * (x - mean) / sqrt(var + eps) + beta.
#
# The generated kernel uses a 1-D launch with one thread per row: each thread
# computes the mean and (biased, divide-by-n) variance of its row in two
# sequential passes, then writes the normalized, scaled and shifted output.
# +mean_out+ and +rstd_out+ are optional: each is written only when the
# pointer is non-null.
#
# @return [Ignis::JIT::Kernel]
def layer_norm_forward
  source = <<~CUDA
    extern "C" __global__
    void layer_norm_forward(const float* __restrict__ input,
                            const float* __restrict__ gamma,
                            const float* __restrict__ beta,
                            float* __restrict__ output,
                            float* __restrict__ mean_out,
                            float* __restrict__ rstd_out,
                            const int outer_size,
                            const int norm_size,
                            const float eps) {
      int row = blockIdx.x * blockDim.x + threadIdx.x;
      if (row < outer_size) {
        // 64-bit row offset: row * norm_size in 32-bit int overflows once the
        // tensor holds more than 2^31 - 1 elements.
        const long long offset = (long long)row * (long long)norm_size;
        const float* in_row = input + offset;
        float* out_row = output + offset;

        // Compute mean
        float mean = 0.0f;
        for (int j = 0; j < norm_size; j++) {
          mean += in_row[j];
        }
        mean /= (float)norm_size;

        // Compute variance around the mean (second pass over the row)
        float var = 0.0f;
        for (int j = 0; j < norm_size; j++) {
          float diff = in_row[j] - mean;
          var += diff * diff;
        }
        var /= (float)norm_size;

        // Reciprocal standard deviation; eps keeps the argument positive
        // for constant rows.
        float rstd = rsqrtf(var + eps);

        // Save per-row statistics for the backward pass (optional outputs)
        if (mean_out) mean_out[row] = mean;
        if (rstd_out) rstd_out[row] = rstd;

        // Normalize, scale, shift
        for (int j = 0; j < norm_size; j++) {
          float normalized = (in_row[j] - mean) * rstd;
          out_row[j] = gamma[j] * normalized + beta[j];
        }
      }
    }
  CUDA
  compile_cached(source, "layer_norm_forward")
end

.rms_norm_backward ⇒ `Ignis::JIT::Kernel`

RMSNorm backward: dL/dx and dL/dgamma (no bias in RMSNorm). With x_hat_j = x_j * rstd and y_j = gamma_j * x_hat_j:

dL/dx_i     = rstd * (go_i*gamma_i - x_hat_i * S / n),  S = sum_j go_j*gamma_j*x_hat_j
dL/dgamma_j = sum_rows go_j * x_hat_j

Returns:

(Ignis::JIT::Kernel)

# File 'lib/nvruby/jit/kernels/normalization.rb', line 155

# RMSNorm backward: dL/dx and dL/dgamma (no bias in RMSNorm).
#
# With x_hat_j = x_j * rstd and y_j = gamma_j * x_hat_j:
#
#   dL/dx_i     = rstd * (go_i*gamma_i - x_hat_i * S / n),  S = sum_j go_j*gamma_j*x_hat_j
#   dL/dgamma_j = sum_rows go_j * x_hat_j
#
# The generated kernel uses a 1-D launch with one thread per row. +grad_gamma+
# is only ever accumulated into (via atomicAdd); the kernel never zeroes it,
# so it must hold the desired starting value before launch.
#
# NOTE(review): +rstd+ is presumably the per-row value saved by the
# rms_norm_forward kernel -- confirm against the caller.
#
# @return [Ignis::JIT::Kernel]
def rms_norm_backward
  source = <<~CUDA
    extern "C" __global__
    void rms_norm_backward(const float* __restrict__ grad_output,
                           const float* __restrict__ input,
                           const float* __restrict__ gamma,
                           const float* __restrict__ rstd,
                           float* __restrict__ grad_input,
                           float* __restrict__ grad_gamma,
                           const int outer_size,
                           const int norm_size) {
      int row = blockIdx.x * blockDim.x + threadIdx.x;
      if (row < outer_size) {
        // 64-bit row offset: row * norm_size in 32-bit int overflows once the
        // tensor holds more than 2^31 - 1 elements.
        const long long offset = (long long)row * (long long)norm_size;
        const float* go = grad_output + offset;
        const float* in_row = input + offset;
        float* gi = grad_input + offset;
        float r = rstd[row];

        // S = sum_j go_j * gamma_j * x_hat_j   (x_hat_j = x_j * r)
        float s = 0.0f;
        for (int j = 0; j < norm_size; j++) {
          s += go[j] * gamma[j] * (in_row[j] * r);
        }

        float inv_n = 1.0f / (float)norm_size;
        for (int j = 0; j < norm_size; j++) {
          float x_hat = in_row[j] * r;
          gi[j] = r * (go[j] * gamma[j] - x_hat * s * inv_n);
          // grad_gamma is shared by every row, hence atomicAdd.
          atomicAdd(&grad_gamma[j], go[j] * x_hat);
        }
      }
    }
  CUDA
  compile_cached(source, "rms_norm_backward")
end

.rms_norm_forward ⇒ `Ignis::JIT::Kernel`

RMSNorm forward: y = gamma * x / sqrt(mean(x^2) + eps). Used in LLaMA/Mistral architectures.

Returns:

(Ignis::JIT::Kernel)

# File 'lib/nvruby/jit/kernels/normalization.rb', line 118

# RMSNorm forward: y = gamma * x / sqrt(mean(x^2) + eps).
#
# The generated kernel uses a 1-D launch with one thread per row: each thread
# sums the squares of its row, derives the reciprocal RMS, and writes the
# scaled output. +rstd_out+ is optional and is written only when the pointer
# is non-null.
#
# @return [Ignis::JIT::Kernel]
def rms_norm_forward
  source = <<~CUDA
    extern "C" __global__
    void rms_norm_forward(const float* __restrict__ input,
                          const float* __restrict__ gamma,
                          float* __restrict__ output,
                          float* __restrict__ rstd_out,
                          const int outer_size,
                          const int norm_size,
                          const float eps) {
      int row = blockIdx.x * blockDim.x + threadIdx.x;
      if (row < outer_size) {
        // 64-bit row offset: row * norm_size in 32-bit int overflows once the
        // tensor holds more than 2^31 - 1 elements.
        const long long offset = (long long)row * (long long)norm_size;
        const float* in_row = input + offset;
        float* out_row = output + offset;

        // Sum of squares over the row
        float ss = 0.0f;
        for (int j = 0; j < norm_size; j++) {
          ss += in_row[j] * in_row[j];
        }
        // Reciprocal RMS: 1 / sqrt(mean(x^2) + eps)
        float rstd = rsqrtf(ss / (float)norm_size + eps);

        // Save for the backward pass (optional output)
        if (rstd_out) rstd_out[row] = rstd;

        for (int j = 0; j < norm_size; j++) {
          out_row[j] = gamma[j] * in_row[j] * rstd;
        }
      }
    }
  CUDA
  compile_cached(source, "rms_norm_forward")
end

Module: Ignis::JIT::Kernels::Normalization

Overview

Class Method Summary collapse

Class Method Details

.layer_norm_backward ⇒ Ignis::JIT::Kernel

.layer_norm_forward ⇒ Ignis::JIT::Kernel

.rms_norm_backward ⇒ Ignis::JIT::Kernel

.rms_norm_forward ⇒ Ignis::JIT::Kernel

.layer_norm_backward ⇒ `Ignis::JIT::Kernel`

.layer_norm_forward ⇒ `Ignis::JIT::Kernel`

.rms_norm_backward ⇒ `Ignis::JIT::Kernel`

.rms_norm_forward ⇒ `Ignis::JIT::Kernel`