Module: Ignis::Epilogues::Kernels
- Defined in:
- lib/nvruby/epilogues.rb
Overview
JIT CUDA kernels for epilogues
Constant Summary collapse
- GELU_KERNEL =
<<~CUDA extern "C" __global__ void gelu_forward( const float* __restrict__ input, float* __restrict__ output, int n ) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < n) { float x = input[idx]; // Approximation: 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))) float x3 = x * x * x; float tanh_arg = 0.7978845608f * (x + 0.044715f * x3); output[idx] = 0.5f * x * (1.0f + tanhf(tanh_arg)); } } CUDA
- GELU_EXACT_KERNEL =
<<~CUDA extern "C" __global__ void gelu_exact_forward( const float* __restrict__ input, float* __restrict__ output, int n ) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < n) { float x = input[idx]; // Exact: x * 0.5 * (1 + erf(x / sqrt(2))) output[idx] = x * 0.5f * (1.0f + erff(x * 0.7071067811865476f)); } } CUDA
- SILU_KERNEL =
<<~CUDA extern "C" __global__ void silu_forward( const float* __restrict__ input, float* __restrict__ output, int n ) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < n) { float x = input[idx]; // SiLU: x * sigmoid(x) = x / (1 + exp(-x)) output[idx] = x / (1.0f + expf(-x)); } } CUDA
- RELU_KERNEL =
<<~CUDA extern "C" __global__ void relu_forward( const float* __restrict__ input, float* __restrict__ output, int n ) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < n) { output[idx] = fmaxf(0.0f, input[idx]); } } CUDA
- LEAKY_RELU_KERNEL =
<<~CUDA extern "C" __global__ void leaky_relu_forward( const float* __restrict__ input, float* __restrict__ output, int n, float negative_slope ) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < n) { float x = input[idx]; output[idx] = x > 0.0f ? x : x * negative_slope; } } CUDA
- BIAS_ADD_KERNEL =
<<~CUDA extern "C" __global__ void bias_add( const float* __restrict__ input, const float* __restrict__ bias, float* __restrict__ output, int rows, int cols ) { int idx = blockIdx.x * blockDim.x + threadIdx.x; int total = rows * cols; if (idx < total) { int col = idx % cols; output[idx] = input[idx] + bias[col]; } } CUDA
- GELU_BIAS_KERNEL =
<<~CUDA extern "C" __global__ void gelu_bias_forward( const float* __restrict__ input, const float* __restrict__ bias, float* __restrict__ output, int rows, int cols ) { int idx = blockIdx.x * blockDim.x + threadIdx.x; int total = rows * cols; if (idx < total) { int col = idx % cols; float x = input[idx] + bias[col]; float x3 = x * x * x; float tanh_arg = 0.7978845608f * (x + 0.044715f * x3); output[idx] = 0.5f * x * (1.0f + tanhf(tanh_arg)); } } CUDA
- SILU_BIAS_KERNEL =
<<~CUDA extern "C" __global__ void silu_bias_forward( const float* __restrict__ input, const float* __restrict__ bias, float* __restrict__ output, int rows, int cols ) { int idx = blockIdx.x * blockDim.x + threadIdx.x; int total = rows * cols; if (idx < total) { int col = idx % cols; float x = input[idx] + bias[col]; output[idx] = x / (1.0f + expf(-x)); } } CUDA
- RESIDUAL_ADD_KERNEL =
<<~CUDA extern "C" __global__ void residual_add( const float* __restrict__ input, const float* __restrict__ residual, float* __restrict__ output, int n ) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < n) { output[idx] = input[idx] + residual[idx]; } } CUDA
- SCALE_KERNEL =
<<~CUDA extern "C" __global__ void scale( const float* __restrict__ input, float* __restrict__ output, float scale_factor, int n ) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < n) { output[idx] = input[idx] * scale_factor; } } CUDA