Module: Ignis::JIT::Kernels::Activations
- Defined in:
- lib/nvruby/jit/kernels/activations.rb
Overview
Activation function CUDA kernels for AI training and inference. Each kernel has a forward and backward variant for autograd compatibility. All kernels are compiled via NVRTC on first use and cached.
Class Method Summary collapse
-
.gelu_backward ⇒ Ignis::JIT::Kernel
GELU backward: Φ(x) + x * φ(x).
-
.gelu_forward ⇒ Ignis::JIT::Kernel
GELU forward: x * Φ(x) using tanh approximation GELU(x) = 0.5 * x * (1 + tanh(sqrt(2/π) * (x + 0.044715 * x³))).
-
.relu_backward ⇒ Ignis::JIT::Kernel
ReLU backward: grad * (x > 0 ? 1 : 0).
-
.relu_forward(n) ⇒ Ignis::JIT::Kernel
ReLU forward: max(0, x).
-
.sigmoid_backward ⇒ Ignis::JIT::Kernel
Sigmoid backward: sigmoid(x) * (1 - sigmoid(x)) * grad.
-
.sigmoid_forward ⇒ Ignis::JIT::Kernel
Sigmoid forward: 1 / (1 + exp(-x)).
-
.silu_backward ⇒ Ignis::JIT::Kernel
SiLU backward: sigmoid(x) + x * sigmoid(x) * (1 - sigmoid(x)) = sigmoid(x) * (1 + x * (1 - sigmoid(x))).
-
.silu_forward ⇒ Ignis::JIT::Kernel
SiLU forward: x * sigmoid(x).
Class Method Details
.gelu_backward ⇒ Ignis::JIT::Kernel
GELU backward: Φ(x) + x * φ(x)
69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 |
# File 'lib/nvruby/jit/kernels/activations.rb', line 69 def gelu_backward source = <<~CUDA extern "C" __global__ void gelu_backward(const float* __restrict__ grad_output, const float* __restrict__ input, float* __restrict__ grad_input, const int n) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < n) { float x = input[idx]; float s = 0.7978845608f * (x + 0.044715f * x * x * x); float tanh_s = tanhf(s); float cdf = 0.5f * (1.0f + tanh_s); float pdf_term = 0.5f * (1.0f - tanh_s * tanh_s) * 0.7978845608f * (1.0f + 3.0f * 0.044715f * x * x); grad_input[idx] = grad_output[idx] * (cdf + x * pdf_term); } } CUDA compile_cached(source, "gelu_backward") end |
.gelu_forward ⇒ Ignis::JIT::Kernel
GELU forward: x * Φ(x) using tanh approximation GELU(x) = 0.5 * x * (1 + tanh(sqrt(2/π) * (x + 0.044715 * x³)))
50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
# File 'lib/nvruby/jit/kernels/activations.rb', line 50 def gelu_forward source = <<~CUDA extern "C" __global__ void gelu_forward(const float* __restrict__ input, float* __restrict__ output, const int n) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < n) { float x = input[idx]; float cdf = 0.5f * (1.0f + tanhf(0.7978845608f * (x + 0.044715f * x * x * x))); output[idx] = x * cdf; } } CUDA compile_cached(source, "gelu_forward") end |
.relu_backward ⇒ Ignis::JIT::Kernel
ReLU backward: grad * (x > 0 ? 1 : 0)
31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 |
# File 'lib/nvruby/jit/kernels/activations.rb', line 31 def relu_backward source = <<~CUDA extern "C" __global__ void relu_backward(const float* __restrict__ grad_output, const float* __restrict__ input, float* __restrict__ grad_input, const int n) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < n) { grad_input[idx] = input[idx] > 0.0f ? grad_output[idx] : 0.0f; } } CUDA compile_cached(source, "relu_backward") end |
.relu_forward(n) ⇒ Ignis::JIT::Kernel
ReLU forward: max(0, x)
14 15 16 17 18 19 20 21 22 23 24 25 26 27 |
# File 'lib/nvruby/jit/kernels/activations.rb', line 14 def relu_forward(n) source = <<~CUDA extern "C" __global__ void relu_forward(const float* __restrict__ input, float* __restrict__ output, const int n) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < n) { output[idx] = fmaxf(input[idx], 0.0f); } } CUDA compile_cached(source, "relu_forward") end |
.sigmoid_backward ⇒ Ignis::JIT::Kernel
Sigmoid backward: sigmoid(x) * (1 - sigmoid(x)) * grad
149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 |
# File 'lib/nvruby/jit/kernels/activations.rb', line 149 def sigmoid_backward source = <<~CUDA extern "C" __global__ void sigmoid_backward(const float* __restrict__ grad_output, const float* __restrict__ output, float* __restrict__ grad_input, const int n) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < n) { float s = output[idx]; grad_input[idx] = grad_output[idx] * s * (1.0f - s); } } CUDA compile_cached(source, "sigmoid_backward") end |
.sigmoid_forward ⇒ Ignis::JIT::Kernel
Sigmoid forward: 1 / (1 + exp(-x))
132 133 134 135 136 137 138 139 140 141 142 143 144 145 |
# File 'lib/nvruby/jit/kernels/activations.rb', line 132 def sigmoid_forward source = <<~CUDA extern "C" __global__ void sigmoid_forward(const float* __restrict__ input, float* __restrict__ output, const int n) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < n) { output[idx] = 1.0f / (1.0f + expf(-input[idx])); } } CUDA compile_cached(source, "sigmoid_forward") end |
.silu_backward ⇒ Ignis::JIT::Kernel
SiLU backward: sigmoid(x) + x * sigmoid(x) * (1 - sigmoid(x))
sigmoid(x) * (1 + x * (1 - sigmoid(x)))
112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 |
# File 'lib/nvruby/jit/kernels/activations.rb', line 112 def silu_backward source = <<~CUDA extern "C" __global__ void silu_backward(const float* __restrict__ grad_output, const float* __restrict__ input, float* __restrict__ grad_input, const int n) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < n) { float x = input[idx]; float sig = 1.0f / (1.0f + expf(-x)); grad_input[idx] = grad_output[idx] * (sig * (1.0f + x * (1.0f - sig))); } } CUDA compile_cached(source, "silu_backward") end |
.silu_forward ⇒ Ignis::JIT::Kernel
SiLU forward: x * sigmoid(x)
92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 |
# File 'lib/nvruby/jit/kernels/activations.rb', line 92 def silu_forward source = <<~CUDA extern "C" __global__ void silu_forward(const float* __restrict__ input, float* __restrict__ output, const int n) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < n) { float x = input[idx]; float sig = 1.0f / (1.0f + expf(-x)); output[idx] = x * sig; } } CUDA compile_cached(source, "silu_forward") end |