Module: Ignis::JIT::Kernels::Activations

Defined in:
lib/nvruby/jit/kernels/activations.rb

Overview

Activation function CUDA kernels for AI training and inference. Each kernel has a forward and backward variant for autograd compatibility. All kernels are compiled via NVRTC on first use and cached.

Class Method Summary collapse

Class Method Details

.gelu_backwardIgnis::JIT::Kernel

GELU backward: Φ(x) + x * φ(x)

Returns:



69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# File 'lib/nvruby/jit/kernels/activations.rb', line 69

def gelu_backward
  source = <<~CUDA
    extern "C" __global__
    void gelu_backward(const float* __restrict__ grad_output,
                       const float* __restrict__ input,
                       float* __restrict__ grad_input,
                       const int n) {
      int idx = blockIdx.x * blockDim.x + threadIdx.x;
      if (idx < n) {
        float x = input[idx];
        float s = 0.7978845608f * (x + 0.044715f * x * x * x);
        float tanh_s = tanhf(s);
        float cdf = 0.5f * (1.0f + tanh_s);
        float pdf_term = 0.5f * (1.0f - tanh_s * tanh_s) * 0.7978845608f * (1.0f + 3.0f * 0.044715f * x * x);
        grad_input[idx] = grad_output[idx] * (cdf + x * pdf_term);
      }
    }
  CUDA
  compile_cached(source, "gelu_backward")
end

.gelu_forwardIgnis::JIT::Kernel

GELU forward: x * Φ(x) using tanh approximation GELU(x) = 0.5 * x * (1 + tanh(sqrt(2/π) * (x + 0.044715 * x³)))

Returns:



50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# File 'lib/nvruby/jit/kernels/activations.rb', line 50

def gelu_forward
  source = <<~CUDA
    extern "C" __global__
    void gelu_forward(const float* __restrict__ input,
                      float* __restrict__ output,
                      const int n) {
      int idx = blockIdx.x * blockDim.x + threadIdx.x;
      if (idx < n) {
        float x = input[idx];
        float cdf = 0.5f * (1.0f + tanhf(0.7978845608f * (x + 0.044715f * x * x * x)));
        output[idx] = x * cdf;
      }
    }
  CUDA
  compile_cached(source, "gelu_forward")
end

.relu_backwardIgnis::JIT::Kernel

ReLU backward: grad * (x > 0 ? 1 : 0)

Returns:



31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# File 'lib/nvruby/jit/kernels/activations.rb', line 31

def relu_backward
  source = <<~CUDA
    extern "C" __global__
    void relu_backward(const float* __restrict__ grad_output,
                       const float* __restrict__ input,
                       float* __restrict__ grad_input,
                       const int n) {
      int idx = blockIdx.x * blockDim.x + threadIdx.x;
      if (idx < n) {
        grad_input[idx] = input[idx] > 0.0f ? grad_output[idx] : 0.0f;
      }
    }
  CUDA
  compile_cached(source, "relu_backward")
end

.relu_forward(n) ⇒ Ignis::JIT::Kernel

ReLU forward: max(0, x)

Parameters:

  • n (Integer)

    number of elements

Returns:



14
15
16
17
18
19
20
21
22
23
24
25
26
27
# File 'lib/nvruby/jit/kernels/activations.rb', line 14

def relu_forward(n)
  source = <<~CUDA
    extern "C" __global__
    void relu_forward(const float* __restrict__ input,
                      float* __restrict__ output,
                      const int n) {
      int idx = blockIdx.x * blockDim.x + threadIdx.x;
      if (idx < n) {
        output[idx] = fmaxf(input[idx], 0.0f);
      }
    }
  CUDA
  compile_cached(source, "relu_forward")
end

.sigmoid_backwardIgnis::JIT::Kernel

Sigmoid backward: sigmoid(x) * (1 - sigmoid(x)) * grad

Returns:



149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
# File 'lib/nvruby/jit/kernels/activations.rb', line 149

def sigmoid_backward
  source = <<~CUDA
    extern "C" __global__
    void sigmoid_backward(const float* __restrict__ grad_output,
                          const float* __restrict__ output,
                          float* __restrict__ grad_input,
                          const int n) {
      int idx = blockIdx.x * blockDim.x + threadIdx.x;
      if (idx < n) {
        float s = output[idx];
        grad_input[idx] = grad_output[idx] * s * (1.0f - s);
      }
    }
  CUDA
  compile_cached(source, "sigmoid_backward")
end

.sigmoid_forwardIgnis::JIT::Kernel

Sigmoid forward: 1 / (1 + exp(-x))

Returns:



132
133
134
135
136
137
138
139
140
141
142
143
144
145
# File 'lib/nvruby/jit/kernels/activations.rb', line 132

def sigmoid_forward
  source = <<~CUDA
    extern "C" __global__
    void sigmoid_forward(const float* __restrict__ input,
                         float* __restrict__ output,
                         const int n) {
      int idx = blockIdx.x * blockDim.x + threadIdx.x;
      if (idx < n) {
        output[idx] = 1.0f / (1.0f + expf(-input[idx]));
      }
    }
  CUDA
  compile_cached(source, "sigmoid_forward")
end

.silu_backwardIgnis::JIT::Kernel

SiLU backward: sigmoid(x) + x * sigmoid(x) * (1 - sigmoid(x))

sigmoid(x) * (1 + x * (1 - sigmoid(x)))

Returns:



112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
# File 'lib/nvruby/jit/kernels/activations.rb', line 112

def silu_backward
  source = <<~CUDA
    extern "C" __global__
    void silu_backward(const float* __restrict__ grad_output,
                       const float* __restrict__ input,
                       float* __restrict__ grad_input,
                       const int n) {
      int idx = blockIdx.x * blockDim.x + threadIdx.x;
      if (idx < n) {
        float x = input[idx];
        float sig = 1.0f / (1.0f + expf(-x));
        grad_input[idx] = grad_output[idx] * (sig * (1.0f + x * (1.0f - sig)));
      }
    }
  CUDA
  compile_cached(source, "silu_backward")
end

.silu_forwardIgnis::JIT::Kernel

SiLU forward: x * sigmoid(x)

Returns:



92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# File 'lib/nvruby/jit/kernels/activations.rb', line 92

def silu_forward
  source = <<~CUDA
    extern "C" __global__
    void silu_forward(const float* __restrict__ input,
                      float* __restrict__ output,
                      const int n) {
      int idx = blockIdx.x * blockDim.x + threadIdx.x;
      if (idx < n) {
        float x = input[idx];
        float sig = 1.0f / (1.0f + expf(-x));
        output[idx] = x * sig;
      }
    }
  CUDA
  compile_cached(source, "silu_forward")
end