Module: Ignis::JIT::Kernels::Normalization

Defined in:
lib/nvruby/jit/kernels/normalization.rb

Overview

Layer normalization CUDA kernels. Forward computes mean, variance, normalizes, scales, and shifts. Backward computes gradients for input, weight (gamma), and bias (beta).

Class Method Summary collapse

Class Method Details

.layer_norm_backwardIgnis::JIT::Kernel

LayerNorm backward: computes dL/dx, dL/dgamma, dL/dbeta

Returns:



65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# File 'lib/nvruby/jit/kernels/normalization.rb', line 65

def layer_norm_backward
  source = <<~CUDA
    extern "C" __global__
    void layer_norm_backward(const float* __restrict__ grad_output,
                             const float* __restrict__ input,
                             const float* __restrict__ gamma,
                             const float* __restrict__ mean,
                             const float* __restrict__ rstd,
                             float* __restrict__ grad_input,
                             float* __restrict__ grad_gamma,
                             float* __restrict__ grad_beta,
                             const int outer_size,
                             const int norm_size) {
      int row = blockIdx.x * blockDim.x + threadIdx.x;
      if (row < outer_size) {
        const float* go = grad_output + row * norm_size;
        const float* in_row = input + row * norm_size;
        float* gi = grad_input + row * norm_size;
        float m = mean[row];
        float rs = rstd[row];

        // Compute intermediate sums for efficient backward
        float sum_go_x = 0.0f;
        float sum_go = 0.0f;
        for (int j = 0; j < norm_size; j++) {
          float x_hat = (in_row[j] - m) * rs;
          sum_go_x += go[j] * gamma[j] * x_hat;
          sum_go += go[j] * gamma[j];
        }

        float inv_n = 1.0f / (float)norm_size;

        // Compute grad_input
        for (int j = 0; j < norm_size; j++) {
          float x_hat = (in_row[j] - m) * rs;
          gi[j] = rs * (go[j] * gamma[j] - inv_n * (sum_go + x_hat * sum_go_x));
        }

        // Accumulate grad_gamma and grad_beta (needs atomicAdd for multi-row)
        for (int j = 0; j < norm_size; j++) {
          float x_hat = (in_row[j] - m) * rs;
          atomicAdd(&grad_gamma[j], go[j] * x_hat);
          atomicAdd(&grad_beta[j], go[j]);
        }
      }
    }
  CUDA
  compile_cached(source, "layer_norm_backward")
end

.layer_norm_forwardIgnis::JIT::Kernel

LayerNorm forward: y = gamma * (x - mean) / sqrt(var + eps) + beta Each row (last dim) is normalized independently.

Returns:



14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# File 'lib/nvruby/jit/kernels/normalization.rb', line 14

def layer_norm_forward
  source = <<~CUDA
    extern "C" __global__
    void layer_norm_forward(const float* __restrict__ input,
                            const float* __restrict__ gamma,
                            const float* __restrict__ beta,
                            float* __restrict__ output,
                            float* __restrict__ mean_out,
                            float* __restrict__ rstd_out,
                            const int outer_size,
                            const int norm_size,
                            const float eps) {
      int row = blockIdx.x * blockDim.x + threadIdx.x;
      if (row < outer_size) {
        const float* in_row = input + row * norm_size;
        float* out_row = output + row * norm_size;

        // Compute mean
        float mean = 0.0f;
        for (int j = 0; j < norm_size; j++) {
          mean += in_row[j];
        }
        mean /= (float)norm_size;

        // Compute variance
        float var = 0.0f;
        for (int j = 0; j < norm_size; j++) {
          float diff = in_row[j] - mean;
          var += diff * diff;
        }
        var /= (float)norm_size;

        float rstd = rsqrtf(var + eps);

        // Save for backward pass
        if (mean_out) mean_out[row] = mean;
        if (rstd_out) rstd_out[row] = rstd;

        // Normalize, scale, shift
        for (int j = 0; j < norm_size; j++) {
          float normalized = (in_row[j] - mean) * rstd;
          out_row[j] = gamma[j] * normalized + beta[j];
        }
      }
    }
  CUDA
  compile_cached(source, "layer_norm_forward")
end

.rms_norm_backwardIgnis::JIT::Kernel

RMSNorm backward: dL/dx and dL/dgamma (no bias in RMSNorm). With x_hat_j = x_j * rstd and y_j = gamma_j * x_hat_j:

dL/dx_i     = rstd * (go_i*gamma_i - x_hat_i * S / n),  S = sum_j go_j*gamma_j*x_hat_j
dL/dgamma_j = sum_rows go_j * x_hat_j

Returns:



155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
# File 'lib/nvruby/jit/kernels/normalization.rb', line 155

def rms_norm_backward
  source = <<~CUDA
    extern "C" __global__
    void rms_norm_backward(const float* __restrict__ grad_output,
                           const float* __restrict__ input,
                           const float* __restrict__ gamma,
                           const float* __restrict__ rstd,
                           float* __restrict__ grad_input,
                           float* __restrict__ grad_gamma,
                           const int outer_size,
                           const int norm_size) {
      int row = blockIdx.x * blockDim.x + threadIdx.x;
      if (row < outer_size) {
        const float* go = grad_output + row * norm_size;
        const float* in_row = input + row * norm_size;
        float* gi = grad_input + row * norm_size;
        float r = rstd[row];

        // S = sum_j go_j * gamma_j * x_hat_j   (x_hat_j = x_j * r)
        float s = 0.0f;
        for (int j = 0; j < norm_size; j++) {
          s += go[j] * gamma[j] * (in_row[j] * r);
        }

        float inv_n = 1.0f / (float)norm_size;
        for (int j = 0; j < norm_size; j++) {
          float x_hat = in_row[j] * r;
          gi[j] = r * (go[j] * gamma[j] - x_hat * s * inv_n);
          atomicAdd(&grad_gamma[j], go[j] * x_hat);
        }
      }
    }
  CUDA
  compile_cached(source, "rms_norm_backward")
end

.rms_norm_forwardIgnis::JIT::Kernel

RMSNorm forward: y = gamma * x / sqrt(mean(x^2) + eps) Used in LLaMA/Mistral architectures

Returns:



118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
# File 'lib/nvruby/jit/kernels/normalization.rb', line 118

def rms_norm_forward
  source = <<~CUDA
    extern "C" __global__
    void rms_norm_forward(const float* __restrict__ input,
                          const float* __restrict__ gamma,
                          float* __restrict__ output,
                          float* __restrict__ rstd_out,
                          const int outer_size,
                          const int norm_size,
                          const float eps) {
      int row = blockIdx.x * blockDim.x + threadIdx.x;
      if (row < outer_size) {
        const float* in_row = input + row * norm_size;
        float* out_row = output + row * norm_size;

        float ss = 0.0f;
        for (int j = 0; j < norm_size; j++) {
          ss += in_row[j] * in_row[j];
        }
        float rstd = rsqrtf(ss / (float)norm_size + eps);

        if (rstd_out) rstd_out[row] = rstd;

        for (int j = 0; j < norm_size; j++) {
          out_row[j] = gamma[j] * in_row[j] * rstd;
        }
      }
    }
  CUDA
  compile_cached(source, "rms_norm_forward")
end