Module: Ignis::JIT::Kernels::Activations

Defined in:: lib/nvruby/jit/kernels/activations.rb

Overview

Activation function CUDA kernels for AI training and inference. Each kernel has a forward and backward variant for autograd compatibility. All kernels are compiled via NVRTC on first use and cached.

Class Method Summary collapse

.gelu_backward ⇒ Ignis::JIT::Kernel

GELU backward: Φ(x) + x * φ(x).
.gelu_forward ⇒ Ignis::JIT::Kernel

GELU forward: x * Φ(x) using tanh approximation GELU(x) = 0.5 * x * (1 + tanh(sqrt(2/π) * (x + 0.044715 * x³))).
.relu_backward ⇒ Ignis::JIT::Kernel

ReLU backward: grad * (x > 0 ? 1 : 0).
.relu_forward(n) ⇒ Ignis::JIT::Kernel

ReLU forward: max(0, x).
.sigmoid_backward ⇒ Ignis::JIT::Kernel

Sigmoid backward: sigmoid(x) * (1 - sigmoid(x)) * grad.
.sigmoid_forward ⇒ Ignis::JIT::Kernel

Sigmoid forward: 1 / (1 + exp(-x)).
.silu_backward ⇒ Ignis::JIT::Kernel

SiLU backward: sigmoid(x) + x * sigmoid(x) * (1 - sigmoid(x)) = sigmoid(x) * (1 + x * (1 - sigmoid(x))).
.silu_forward ⇒ Ignis::JIT::Kernel

SiLU forward: x * sigmoid(x).

Class Method Details

.gelu_backward ⇒ `Ignis::JIT::Kernel`

GELU backward: Φ(x) + x * φ(x)

Returns:

(Ignis::JIT::Kernel)

# File 'lib/nvruby/jit/kernels/activations.rb', line 69

def gelu_backward
  source = <<~CUDA
    extern "C" __global__
    void gelu_backward(const float* __restrict__ grad_output,
                       const float* __restrict__ input,
                       float* __restrict__ grad_input,
                       const int n) {
      int idx = blockIdx.x * blockDim.x + threadIdx.x;
      if (idx < n) {
        float x = input[idx];
        float s = 0.7978845608f * (x + 0.044715f * x * x * x);
        float tanh_s = tanhf(s);
        float cdf = 0.5f * (1.0f + tanh_s);
        float pdf_term = 0.5f * (1.0f - tanh_s * tanh_s) * 0.7978845608f * (1.0f + 3.0f * 0.044715f * x * x);
        grad_input[idx] = grad_output[idx] * (cdf + x * pdf_term);
      }
    }
  CUDA
  compile_cached(source, "gelu_backward")
end

.gelu_forward ⇒ `Ignis::JIT::Kernel`

GELU forward: x * Φ(x) using tanh approximation GELU(x) = 0.5 * x * (1 + tanh(sqrt(2/π) * (x + 0.044715 * x³)))

Returns:

(Ignis::JIT::Kernel)

# File 'lib/nvruby/jit/kernels/activations.rb', line 50

def gelu_forward
  source = <<~CUDA
    extern "C" __global__
    void gelu_forward(const float* __restrict__ input,
                      float* __restrict__ output,
                      const int n) {
      int idx = blockIdx.x * blockDim.x + threadIdx.x;
      if (idx < n) {
        float x = input[idx];
        float cdf = 0.5f * (1.0f + tanhf(0.7978845608f * (x + 0.044715f * x * x * x)));
        output[idx] = x * cdf;
      }
    }
  CUDA
  compile_cached(source, "gelu_forward")
end

.relu_backward ⇒ `Ignis::JIT::Kernel`

ReLU backward: grad * (x > 0 ? 1 : 0)

Returns:

(Ignis::JIT::Kernel)

# File 'lib/nvruby/jit/kernels/activations.rb', line 31

def relu_backward
  source = <<~CUDA
    extern "C" __global__
    void relu_backward(const float* __restrict__ grad_output,
                       const float* __restrict__ input,
                       float* __restrict__ grad_input,
                       const int n) {
      int idx = blockIdx.x * blockDim.x + threadIdx.x;
      if (idx < n) {
        grad_input[idx] = input[idx] > 0.0f ? grad_output[idx] : 0.0f;
      }
    }
  CUDA
  compile_cached(source, "relu_backward")
end

.relu_forward(n) ⇒ `Ignis::JIT::Kernel`

ReLU forward: max(0, x)

Parameters:

n (Integer) —

number of elements

Returns:

(Ignis::JIT::Kernel)

# File 'lib/nvruby/jit/kernels/activations.rb', line 14

def relu_forward(n)
  source = <<~CUDA
    extern "C" __global__
    void relu_forward(const float* __restrict__ input,
                      float* __restrict__ output,
                      const int n) {
      int idx = blockIdx.x * blockDim.x + threadIdx.x;
      if (idx < n) {
        output[idx] = fmaxf(input[idx], 0.0f);
      }
    }
  CUDA
  compile_cached(source, "relu_forward")
end

.sigmoid_backward ⇒ `Ignis::JIT::Kernel`

Sigmoid backward: sigmoid(x) * (1 - sigmoid(x)) * grad

Returns:

(Ignis::JIT::Kernel)

# File 'lib/nvruby/jit/kernels/activations.rb', line 149

def sigmoid_backward
  source = <<~CUDA
    extern "C" __global__
    void sigmoid_backward(const float* __restrict__ grad_output,
                          const float* __restrict__ output,
                          float* __restrict__ grad_input,
                          const int n) {
      int idx = blockIdx.x * blockDim.x + threadIdx.x;
      if (idx < n) {
        float s = output[idx];
        grad_input[idx] = grad_output[idx] * s * (1.0f - s);
      }
    }
  CUDA
  compile_cached(source, "sigmoid_backward")
end

.sigmoid_forward ⇒ `Ignis::JIT::Kernel`

Sigmoid forward: 1 / (1 + exp(-x))

Returns:

(Ignis::JIT::Kernel)

# File 'lib/nvruby/jit/kernels/activations.rb', line 132

def sigmoid_forward
  source = <<~CUDA
    extern "C" __global__
    void sigmoid_forward(const float* __restrict__ input,
                         float* __restrict__ output,
                         const int n) {
      int idx = blockIdx.x * blockDim.x + threadIdx.x;
      if (idx < n) {
        output[idx] = 1.0f / (1.0f + expf(-input[idx]));
      }
    }
  CUDA
  compile_cached(source, "sigmoid_forward")
end

.silu_backward ⇒ `Ignis::JIT::Kernel`

SiLU backward: sigmoid(x) + x * sigmoid(x) * (1 - sigmoid(x))

sigmoid(x) * (1 + x * (1 - sigmoid(x)))

Returns:

(Ignis::JIT::Kernel)

# File 'lib/nvruby/jit/kernels/activations.rb', line 112

def silu_backward
  source = <<~CUDA
    extern "C" __global__
    void silu_backward(const float* __restrict__ grad_output,
                       const float* __restrict__ input,
                       float* __restrict__ grad_input,
                       const int n) {
      int idx = blockIdx.x * blockDim.x + threadIdx.x;
      if (idx < n) {
        float x = input[idx];
        float sig = 1.0f / (1.0f + expf(-x));
        grad_input[idx] = grad_output[idx] * (sig * (1.0f + x * (1.0f - sig)));
      }
    }
  CUDA
  compile_cached(source, "silu_backward")
end

.silu_forward ⇒ `Ignis::JIT::Kernel`

SiLU forward: x * sigmoid(x)

Returns:

(Ignis::JIT::Kernel)

# File 'lib/nvruby/jit/kernels/activations.rb', line 92

def silu_forward
  source = <<~CUDA
    extern "C" __global__
    void silu_forward(const float* __restrict__ input,
                      float* __restrict__ output,
                      const int n) {
      int idx = blockIdx.x * blockDim.x + threadIdx.x;
      if (idx < n) {
        float x = input[idx];
        float sig = 1.0f / (1.0f + expf(-x));
        output[idx] = x * sig;
      }
    }
  CUDA
  compile_cached(source, "silu_forward")
end

Module: Ignis::JIT::Kernels::Activations

Overview

Class Method Summary collapse

Class Method Details

.gelu_backward ⇒ Ignis::JIT::Kernel

.gelu_forward ⇒ Ignis::JIT::Kernel

.relu_backward ⇒ Ignis::JIT::Kernel

.relu_forward(n) ⇒ Ignis::JIT::Kernel

.sigmoid_backward ⇒ Ignis::JIT::Kernel

.sigmoid_forward ⇒ Ignis::JIT::Kernel

.silu_backward ⇒ Ignis::JIT::Kernel

sigmoid(x) * (1 + x * (1 - sigmoid(x)))

.silu_forward ⇒ Ignis::JIT::Kernel

.gelu_backward ⇒ `Ignis::JIT::Kernel`

.gelu_forward ⇒ `Ignis::JIT::Kernel`

.relu_backward ⇒ `Ignis::JIT::Kernel`

.relu_forward(n) ⇒ `Ignis::JIT::Kernel`

.sigmoid_backward ⇒ `Ignis::JIT::Kernel`

.sigmoid_forward ⇒ `Ignis::JIT::Kernel`

.silu_backward ⇒ `Ignis::JIT::Kernel`

.silu_forward ⇒ `Ignis::JIT::Kernel`