Module: Ignis::LinAlg::CuBLASLtBindings

Extended by:: FFI::Library

Defined in:: lib/nvruby/linalg/cublaslt_bindings.rb

Overview

cuBLASLt (Light) library FFI bindings for advanced GEMM optimization

cuBLASLt provides:

Descriptor-based matrix multiplication
Heuristic algorithm selection
Workspace optimization
Custom epilog operations (bias, activation fusion)

Defined Under Namespace

Classes: MatmulHeuristicResult

Constant Summary collapse

CUBLASLT_ORDER_COL = Matrix layout order

CUBLASLT_ORDER_ROW =

CUBLASLT_ORDER_COL32 =

CUBLASLT_ORDER_COL4_4R2_8C =

CUBLASLT_ORDER_COL32_2R_4R4 =

CUBLASLT_MATMUL_DESC_COMPUTE_TYPE = Matmul descriptor attributes

CUBLASLT_MATMUL_DESC_SCALE_TYPE =

CUBLASLT_MATMUL_DESC_POINTER_MODE =

CUBLASLT_MATMUL_DESC_TRANSA =

CUBLASLT_MATMUL_DESC_TRANSB =

CUBLASLT_MATMUL_DESC_TRANSC =

CUBLASLT_MATMUL_DESC_EPILOGUE =

CUBLASLT_MATMUL_DESC_BIAS_POINTER =

CUBLASLT_MATMUL_DESC_BIAS_DATA_TYPE =

CUBLASLT_MATMUL_DESC_A_SCALE_POINTER =

CUBLASLT_MATMUL_DESC_B_SCALE_POINTER =

CUBLASLT_MATMUL_DESC_C_SCALE_POINTER =

CUBLASLT_MATMUL_DESC_D_SCALE_POINTER =

CUBLASLT_MATMUL_DESC_AMAX_D_POINTER =

CUBLASLT_MATMUL_DESC_SM_COUNT_TARGET =

CUBLASLT_MATMUL_DESC_FAST_ACCUM =

CUBLASLT_MATRIX_LAYOUT_TYPE = Matrix layout attributes

CUBLASLT_MATRIX_LAYOUT_ORDER =

CUBLASLT_MATRIX_LAYOUT_ROWS =

CUBLASLT_MATRIX_LAYOUT_COLS =

CUBLASLT_MATRIX_LAYOUT_LD =

CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT =

CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET =

CUBLASLT_MATRIX_LAYOUT_PLANE_OFFSET =

CUBLASLT_MATMUL_PREF_SEARCH_MODE = Matmul preference attributes

CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES =

CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK =

CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_A_BYTES =

CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_B_BYTES =

CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_C_BYTES =

CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_D_BYTES =

CUBLASLT_MATMUL_PREF_MAX_WAVES_COUNT =

CUBLASLT_MATMUL_PREF_IMPL_MASK =

CUBLASLT_MATMUL_PREF_SM_COUNT_TARGET =

CUBLASLT_SEARCH_BEST_FIT = Search modes

CUBLASLT_SEARCH_LIMITED_BY_ALGO_ID =

CUBLASLT_NUMERICAL_IMPL_FLAGS_FMA = Numerical implementation flags for IMPL_MASK

0x01

CUBLASLT_NUMERICAL_IMPL_FLAGS_HMMA =

0x02

CUBLASLT_NUMERICAL_IMPL_FLAGS_IMMA =

0x04

CUBLASLT_NUMERICAL_IMPL_FLAGS_DMMA =

0x08

CUBLASLT_NUMERICAL_IMPL_FLAGS_TENSOR_OP_MASK =

0x0E

CUBLASLT_REDUCTION_SCHEME_NONE = Reduction schemes

CUBLASLT_REDUCTION_SCHEME_INPLACE =

CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE =

CUBLASLT_REDUCTION_SCHEME_OUTPUT_TYPE =

CUBLASLT_REDUCTION_SCHEME_MASK =

CUBLASLT_EPILOGUE_DEFAULT = Epilogue operations

CUBLASLT_EPILOGUE_RELU =

CUBLASLT_EPILOGUE_BIAS =

CUBLASLT_EPILOGUE_RELU_BIAS =

CUBLASLT_EPILOGUE_GELU =

CUBLASLT_EPILOGUE_GELU_BIAS =

CUDA_R_16F = CUDA data types (for compatibility); FP16

CUDA_R_32F = FP32

CUDA_R_64F = FP64

CUDA_R_16BF = BF16

CUDA_R_8F_E4M3 = FP8 E4M3

CUDA_R_8F_E5M2 = FP8 E5M2

Class Attribute Summary collapse

.lt_handle ⇒ FFI::Pointer?

CuBLASLt handle.
.workspace_ptr ⇒ FFI::Pointer?

Workspace memory.
.workspace_size ⇒ Integer

Workspace size in bytes.

Class Method Summary collapse

.check_status!(status, context = "cuBLASLt operation") ⇒ void

Check status and raise error if not success.
.compute_type_for_dtype(dtype) ⇒ Integer

Get compute type for dtype.
.dtype_to_cuda_type(dtype) ⇒ Integer

Convert dtype symbol to CUDA data type.
.ensure_loaded! ⇒ void

Ensure cuBLASLt is loaded.
.finalize! ⇒ void

Finalize cuBLASLt.
.get_handle ⇒ FFI::Pointer

Get or create cuBLASLt handle.
.get_workspace(min_size = 256 * 1024 * 1024) ⇒ FFI::Pointer

Get or allocate workspace.
.scale_type_for_dtype(dtype) ⇒ Integer

Get scale type for dtype (type of alpha/beta scalars).

Class Attribute Details

.lt_handle ⇒ `FFI::Pointer`?

Returns cuBLASLt handle.

Returns:

(FFI::Pointer, nil) —

cuBLASLt handle



126
127
128

# File 'lib/nvruby/linalg/cublaslt_bindings.rb', line 126

# Reader for the module-level cuBLASLt handle.
#
# @return [FFI::Pointer, nil] current value of @lt_handle; nil while unset
def lt_handle
  @lt_handle
end

.workspace_ptr ⇒ `FFI::Pointer`?

Returns Workspace memory.

Returns:

(FFI::Pointer, nil) —

Workspace memory



129
130
131

# File 'lib/nvruby/linalg/cublaslt_bindings.rb', line 129

# Reader for the module-level workspace pointer.
#
# @return [FFI::Pointer, nil] current value of @workspace_ptr; nil while unset
def workspace_ptr
  @workspace_ptr
end

.workspace_size ⇒ `Integer`

Returns Workspace size in bytes.

Returns:

(Integer) —

Workspace size in bytes



132
133
134

# File 'lib/nvruby/linalg/cublaslt_bindings.rb', line 132

# Reader for the recorded workspace size.
#
# @return [Integer] current value of @workspace_size, in bytes
def workspace_size
  @workspace_size
end

Class Method Details

.check_status!(status, context = "cuBLASLt operation") ⇒ `void`

This method returns an undefined value.

Check status and raise error if not success

Parameters:

status (Integer) —

cuBLAS status code
context (String) (defaults to: "cuBLASLt operation") —

Context for error message

Raises:

(CuBLASError)

# File 'lib/nvruby/linalg/cublaslt_bindings.rb', line 335

# Raises unless +status+ is zero (the success code).
#
# @param status [Integer] status code returned by a cuBLAS/cuBLASLt call
# @param context [String] text placed in front of the error message
# @return [void] nil when status is zero
# @raise [CuBLASError] when status is non-zero; the status is also passed
#   to the error through the +code:+ keyword
def self.check_status!(status, context = "cuBLASLt operation")
  unless status.zero?
    message = "#{context}: status=#{status}"
    raise CuBLASError.new(message, code: status)
  end
end

.compute_type_for_dtype(dtype) ⇒ `Integer`

Get compute type for dtype

Parameters:

dtype (Symbol) —

Data type

Returns:

(Integer) —

Compute type constant

# File 'lib/nvruby/linalg/cublaslt_bindings.rb', line 299

# Maps a dtype symbol to the cuBLAS compute-type constant used for matmul.
#
# @param dtype [Symbol] element type of the operands
# @return [Integer] one of the CuBLASBindings::CUBLAS_COMPUTE_* constants;
#   any dtype not listed below yields CUBLAS_COMPUTE_32F
def self.compute_type_for_dtype(dtype)
  case dtype
  when :float64, :double then CuBLASBindings::CUBLAS_COMPUTE_64F
  when :float32, :float  then CuBLASBindings::CUBLAS_COMPUTE_32F_FAST_TF32
  when :float16, :half   then CuBLASBindings::CUBLAS_COMPUTE_32F_FAST_16F
  else CuBLASBindings::CUBLAS_COMPUTE_32F
  end
end

.dtype_to_cuda_type(dtype) ⇒ `Integer`

Convert dtype symbol to CUDA data type

Parameters:

dtype (Symbol) —

Data type (:float16 / :half, :float32 / :float, :float64 / :double, :bfloat16); any other value maps to CUDA_R_32F

Returns:

(Integer) —

CUDA data type constant

# File 'lib/nvruby/linalg/cublaslt_bindings.rb', line 286

# Translates a dtype symbol into the matching CUDA_R_* data-type constant.
#
# @param dtype [Symbol] :float16/:half, :float32/:float, :float64/:double
#   or :bfloat16
# @return [Integer] CUDA data-type constant; CUDA_R_32F for :float32/:float
#   and for every dtype that is not recognised
def self.dtype_to_cuda_type(dtype)
  return CUDA_R_16F if %i[float16 half].include?(dtype)
  return CUDA_R_64F if %i[float64 double].include?(dtype)
  return CUDA_R_16BF if %i[bfloat16].include?(dtype)

  # :float32 / :float and the catch-all share the same result.
  CUDA_R_32F
end

.ensure_loaded! ⇒ `void`

This method returns an undefined value.

Ensure cuBLASLt is loaded

# File 'lib/nvruby/linalg/cublaslt_bindings.rb', line 136

# Loads the cuBLASLt shared library, attaches its functions and runs the
# one-time initialization. Once @loaded is set, further calls return
# immediately.
#
# Library selection:
# * when Ignis.configuration.cuda_bin_path is set, the greatest (by string
#   comparison) "cublasLt64_*.dll" found in that directory is loaded;
# * when no path is configured, or the directory contains no matching DLL,
#   the default library name "cublasLt64_12" is handed to ffi_lib so the
#   normal library search applies instead of leaving no library bound.
#
# @return [void]
def ensure_loaded!
  return if @loaded

  default_library = "cublasLt64_12"
  cuda_bin = Ignis.configuration.cuda_bin_path
  dll_path = nil

  if cuda_bin
    # Dir.glob treats "\" as an escape character, so on platforms that have
    # an alternate separator (Windows) the directory is converted to forward
    # slashes before it becomes part of the pattern.
    search_dir = File::ALT_SEPARATOR ? cuda_bin.to_s.tr(File::ALT_SEPARATOR, File::SEPARATOR) : cuda_bin
    dll_path = Dir.glob(File.join(search_dir, "cublasLt64_*.dll")).max

    unless dll_path
      Ignis.logger.warn { "No cublasLt64_*.dll found in #{cuda_bin}; falling back to #{default_library}" }
    end
  end

  # Always bind a library: previously a configured directory without a
  # matching DLL skipped ffi_lib entirely and failed later while attaching.
  ffi_lib(dll_path || default_library)

  attach_cublaslt_functions!
  initialize_cublaslt!

  @loaded = true
end

.finalize! ⇒ `void`

This method returns an undefined value.

Finalize cuBLASLt

# File 'lib/nvruby/linalg/cublaslt_bindings.rb', line 189

# Releases the workspace buffer and the cuBLASLt handle, then marks the
# bindings as not loaded. Cleanup is best-effort: any StandardError raised
# by cudaFree or cublasLtDestroy is discarded.
#
# The workspace is released only when @workspace_ptr is set and non-null;
# in that case the pointer is cleared and the recorded size reset to 0.
#
# @return [void]
def finalize!
  workspace = @workspace_ptr
  if workspace && !workspace.null?
    begin
      CUDA::RuntimeAPI.cudaFree(workspace)
    rescue StandardError
      nil
    end
    @workspace_ptr = nil
    @workspace_size = 0
  end

  handle = @lt_handle
  if handle
    begin
      cublasLtDestroy(handle)
    rescue StandardError
      nil
    end
    @lt_handle = nil
  end

  @loaded = false
end

.get_handle ⇒ `FFI::Pointer`

Get or create cuBLASLt handle

Returns:

(FFI::Pointer)

# File 'lib/nvruby/linalg/cublaslt_bindings.rb', line 155

# Returns the cuBLASLt handle, calling ensure_loaded! first so the library
# is loaded on first use.
#
# @return [FFI::Pointer] value of @lt_handle after ensure_loaded! returns
#   (presumably assigned by initialize_cublaslt!, which is not visible
#   here — confirm)
def get_handle
  ensure_loaded!
  @lt_handle
end

.get_workspace(min_size = 256 * 1024 * 1024) ⇒ `FFI::Pointer`

Get or allocate workspace

Parameters:

min_size (Integer) (defaults to: 256 * 1024 * 1024) —

Minimum workspace size in bytes

Returns:

(FFI::Pointer) —

Workspace pointer

# File 'lib/nvruby/linalg/cublaslt_bindings.rb', line 163

# Returns the workspace buffer, (re)allocating it when none exists yet or
# the current one is smaller than +min_size+.
#
# The existing buffer is freed before the new allocation is attempted. When
# cudaMalloc reports a non-zero status, a warning is logged, the stored
# pointer becomes FFI::Pointer::NULL and the recorded size becomes 0.
#
# @param min_size [Integer] minimum workspace size in bytes
# @return [FFI::Pointer] workspace pointer (FFI::Pointer::NULL after a
#   failed allocation)
def get_workspace(min_size = 256 * 1024 * 1024)
  ensure_loaded!

  # Fast path: the current buffer is already large enough.
  return @workspace_ptr unless @workspace_ptr.nil? || @workspace_size < min_size

  runtime = CUDA::RuntimeAPI
  runtime.cudaFree(@workspace_ptr) if @workspace_ptr

  out_ptr = FFI::MemoryPointer.new(:pointer)
  status = runtime.cudaMalloc(out_ptr, min_size)

  unless status.zero?
    Ignis.logger.warn { "Failed to allocate cuBLASLt workspace: #{status}" }
    @workspace_ptr = FFI::Pointer::NULL
    @workspace_size = 0
    return @workspace_ptr
  end

  @workspace_ptr = out_ptr.read_pointer
  @workspace_size = min_size
  Ignis.logger.info { "cuBLASLt workspace allocated: #{min_size / 1024 / 1024}MB" }

  @workspace_ptr
end

.scale_type_for_dtype(dtype) ⇒ `Integer`

Get scale type for dtype (type of alpha/beta scalars)

Per NVIDIA cuBLAS documentation, scaleType must match the accumulator type:

CUBLAS_COMPUTE_32F_FAST_16F requires CUDA_R_32F scale type
CUBLAS_COMPUTE_32F_FAST_TF32 requires CUDA_R_32F scale type
CUBLAS_COMPUTE_64F requires CUDA_R_64F scale type

Parameters:

dtype (Symbol) —

Data type of input matrices

Returns:

(Integer) —

CUDA scale type constant for alpha/beta

# File 'lib/nvruby/linalg/cublaslt_bindings.rb', line 321

# Picks the CUDA data type used for the alpha/beta scalars.
#
# Only double-precision inputs scale in FP64; every other dtype (FP16, BF16,
# FP32, TF32, …) accumulates in FP32 and therefore takes FP32 scalars.
#
# @param dtype [Symbol] data type of the input matrices
# @return [Integer] CUDA_R_64F for :float64/:double, otherwise CUDA_R_32F
def self.scale_type_for_dtype(dtype)
  double_precision = %i[float64 double].include?(dtype)
  double_precision ? CUDA_R_64F : CUDA_R_32F
end

Module: Ignis::LinAlg::CuBLASLtBindings

Overview

Defined Under Namespace

Constant Summary collapse

Class Attribute Summary collapse

Class Method Summary collapse

Class Attribute Details

.lt_handle ⇒ FFI::Pointer?

.workspace_ptr ⇒ FFI::Pointer?

.workspace_size ⇒ Integer

Class Method Details

.check_status!(status, context = "cuBLASLt operation") ⇒ void

.compute_type_for_dtype(dtype) ⇒ Integer

.dtype_to_cuda_type(dtype) ⇒ Integer

.ensure_loaded! ⇒ void

.finalize! ⇒ void

.get_handle ⇒ FFI::Pointer

.get_workspace(min_size = 256 * 1024 * 1024) ⇒ FFI::Pointer

.scale_type_for_dtype(dtype) ⇒ Integer

.lt_handle ⇒ `FFI::Pointer`?

.workspace_ptr ⇒ `FFI::Pointer`?

.workspace_size ⇒ `Integer`

.check_status!(status, context = "cuBLASLt operation") ⇒ `void`

.compute_type_for_dtype(dtype) ⇒ `Integer`

.dtype_to_cuda_type(dtype) ⇒ `Integer`

.ensure_loaded! ⇒ `void`

.finalize! ⇒ `void`

.get_handle ⇒ `FFI::Pointer`

.get_workspace(min_size = 256 * 1024 * 1024) ⇒ `FFI::Pointer`

.scale_type_for_dtype(dtype) ⇒ `Integer`