Module: Ignis::Epilogues::Kernels

Defined in:
lib/nvruby/epilogues.rb

Overview

JIT CUDA kernels for epilogues

Constant Summary collapse

GELU_KERNEL =
<<~CUDA
  extern "C" __global__ void gelu_forward(
      const float* __restrict__ input,
      float* __restrict__ output,
      int n
  ) {
      int idx = blockIdx.x * blockDim.x + threadIdx.x;
      if (idx < n) {
          float x = input[idx];
          // Approximation: 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
          float x3 = x * x * x;
          float tanh_arg = 0.7978845608f * (x + 0.044715f * x3);
          output[idx] = 0.5f * x * (1.0f + tanhf(tanh_arg));
      }
  }
CUDA
GELU_EXACT_KERNEL =
<<~CUDA
  extern "C" __global__ void gelu_exact_forward(
      const float* __restrict__ input,
      float* __restrict__ output,
      int n
  ) {
      int idx = blockIdx.x * blockDim.x + threadIdx.x;
      if (idx < n) {
          float x = input[idx];
          // Exact: x * 0.5 * (1 + erf(x / sqrt(2)))
          output[idx] = x * 0.5f * (1.0f + erff(x * 0.7071067811865476f));
      }
  }
CUDA
SILU_KERNEL =
<<~CUDA
  extern "C" __global__ void silu_forward(
      const float* __restrict__ input,
      float* __restrict__ output,
      int n
  ) {
      int idx = blockIdx.x * blockDim.x + threadIdx.x;
      if (idx < n) {
          float x = input[idx];
          // SiLU: x * sigmoid(x) = x / (1 + exp(-x))
          output[idx] = x / (1.0f + expf(-x));
      }
  }
CUDA
RELU_KERNEL =
<<~CUDA
  extern "C" __global__ void relu_forward(
      const float* __restrict__ input,
      float* __restrict__ output,
      int n
  ) {
      int idx = blockIdx.x * blockDim.x + threadIdx.x;
      if (idx < n) {
          output[idx] = fmaxf(0.0f, input[idx]);
      }
  }
CUDA
LEAKY_RELU_KERNEL =
<<~CUDA
  extern "C" __global__ void leaky_relu_forward(
      const float* __restrict__ input,
      float* __restrict__ output,
      int n,
      float negative_slope
  ) {
      int idx = blockIdx.x * blockDim.x + threadIdx.x;
      if (idx < n) {
          float x = input[idx];
          output[idx] = x > 0.0f ? x : x * negative_slope;
      }
  }
CUDA
BIAS_ADD_KERNEL =
<<~CUDA
  extern "C" __global__ void bias_add(
      const float* __restrict__ input,
      const float* __restrict__ bias,
      float* __restrict__ output,
      int rows,
      int cols
  ) {
      int idx = blockIdx.x * blockDim.x + threadIdx.x;
      int total = rows * cols;
      if (idx < total) {
          int col = idx % cols;
          output[idx] = input[idx] + bias[col];
      }
  }
CUDA
GELU_BIAS_KERNEL =
<<~CUDA
  extern "C" __global__ void gelu_bias_forward(
      const float* __restrict__ input,
      const float* __restrict__ bias,
      float* __restrict__ output,
      int rows,
      int cols
  ) {
      int idx = blockIdx.x * blockDim.x + threadIdx.x;
      int total = rows * cols;
      if (idx < total) {
          int col = idx % cols;
          float x = input[idx] + bias[col];
          float x3 = x * x * x;
          float tanh_arg = 0.7978845608f * (x + 0.044715f * x3);
          output[idx] = 0.5f * x * (1.0f + tanhf(tanh_arg));
      }
  }
CUDA
SILU_BIAS_KERNEL =
<<~CUDA
  extern "C" __global__ void silu_bias_forward(
      const float* __restrict__ input,
      const float* __restrict__ bias,
      float* __restrict__ output,
      int rows,
      int cols
  ) {
      int idx = blockIdx.x * blockDim.x + threadIdx.x;
      int total = rows * cols;
      if (idx < total) {
          int col = idx % cols;
          float x = input[idx] + bias[col];
          output[idx] = x / (1.0f + expf(-x));
      }
  }
CUDA
RESIDUAL_ADD_KERNEL =
<<~CUDA
  extern "C" __global__ void residual_add(
      const float* __restrict__ input,
      const float* __restrict__ residual,
      float* __restrict__ output,
      int n
  ) {
      int idx = blockIdx.x * blockDim.x + threadIdx.x;
      if (idx < n) {
          output[idx] = input[idx] + residual[idx];
      }
  }
CUDA
SCALE_KERNEL =
<<~CUDA
  extern "C" __global__ void scale(
      const float* __restrict__ input,
      float* __restrict__ output,
      float scale_factor,
      int n
  ) {
      int idx = blockIdx.x * blockDim.x + threadIdx.x;
      if (idx < n) {
          output[idx] = input[idx] * scale_factor;
      }
  }
CUDA