Class: Ignis::Collective::Transport::VMMIPCTransport

Inherits:

Base

Object
Base
Ignis::Collective::Transport::VMMIPCTransport

show all

Defined in:: lib/nvruby/collective/transport/vmm_ipc_transport.rb

Overview

VMM IPC Transport - Modern CUDA Virtual Memory Management IPC

Uses cuMemExportToShareableHandle with CU_MEM_HANDLE_TYPE_WIN32 for cross-process GPU memory sharing on Windows. Required for sharing memory allocated with cudaMallocAsync.

Workflow:

Sender: cuMemCreate -> cuMemExportToShareableHandle -> send HANDLE
Receiver: cuMemImportFromShareableHandle -> cuMemAddressReserve -> cuMemMap -> cuMemSetAccess

Instance Attribute Summary

Attributes inherited from Base

#dst_device, #src_device

Class Method Summary collapse

.available? ⇒ Boolean

Check if VMM IPC is available.
.transport_type ⇒ Symbol

Transport type identifier.

Instance Method Summary collapse

#close_imported_handle(device_ptr) ⇒ void

Close an imported handle.
#copy_async(dst_buffer, src_buffer, size, stream) ⇒ Object

Copy data via VMM IPC (used when both sender and receiver have mapped).
#create_shareable_allocation(size) ⇒ Hash

Create a VMM allocation on source device (shareable).
#destroy! ⇒ void

Clean up all handles.
#estimated_bandwidth ⇒ Float

Estimated bandwidth (GB/s).
#estimated_latency ⇒ Float

Estimated latency (microseconds).
#import_shareable_allocation(win32_handle, size) ⇒ FFI::Pointer

Import a shareable allocation on destination device.
#initialize! ⇒ void

Initialize the transport.
#recv_async(buffer, size, stream) ⇒ void

Async receive.
#send_async(buffer, size, stream) ⇒ Hash

Async send (export and copy).

Methods inherited from Base

#initialize, #ready?, #recv_sync, #send_sync, #synchronize!, #to_s

Constructor Details

This class inherits a constructor from Ignis::Collective::Transport::Base

Class Method Details

.available? ⇒ `Boolean`

Check if VMM IPC is available

Returns:

(Boolean) —

True if available

# File 'lib/nvruby/collective/transport/vmm_ipc_transport.rb', line 232

def self.available?
  begin
    VMMBindings.ensure_loaded!
    true
  rescue LoadError
    false
  end
end

.transport_type ⇒ `Symbol`

Returns Transport type identifier.

Returns:

(Symbol) —

Transport type identifier



20
21
22

# File 'lib/nvruby/collective/transport/vmm_ipc_transport.rb', line 20

def self.transport_type
  :vmm_ipc
end

Instance Method Details

#close_imported_handle(device_ptr) ⇒ `void`

This method returns an undefined value.

Close an imported handle

Parameters:

device_ptr (FFI::Pointer) —

The mapped device pointer

# File 'lib/nvruby/collective/transport/vmm_ipc_transport.rb', line 167

def close_imported_handle(device_ptr)
  va_ptr = device_ptr.address
  
  # Find the imported handle info
  @imported_handles.each do |handle_addr, info|
    next unless info[:va_ptr] == va_ptr

    # Unmap
    VMMBindings.cuMemUnmap(va_ptr, info[:size])
    
    # Free address
    VMMBindings.cuMemAddressFree(va_ptr, info[:size])
    
    # Release handle
    VMMBindings.cuMemRelease(info[:alloc_handle])

    @imported_handles.delete(handle_addr)
    break
  end
end

#copy_async(dst_buffer, src_buffer, size, stream) ⇒ `Object`

Copy data via VMM IPC (used when both sender and receiver have mapped)

Parameters:

dst_buffer (FFI::Pointer) —

Destination (on dst_device)
src_buffer (FFI::Pointer) —

Source (on src_device)
size (Integer) —

Size in bytes
stream (FFI::Pointer) —

CUDA stream

# File 'lib/nvruby/collective/transport/vmm_ipc_transport.rb', line 193

def copy_async(dst_buffer, src_buffer, size, stream)
  ensure_initialized!

  # For VMM IPC, once both sides have mapped the same allocation,
  # we can use regular cudaMemcpyAsync
  CUDA::RuntimeAPI.ensure_loaded!
  
  status = CUDA::RuntimeAPI.cudaMemcpyAsync(
    dst_buffer,
    src_buffer,
    size,
    CUDA::RuntimeAPI::MEMCPY_DEVICE_TO_DEVICE,
    stream
  )
  CUDA::RuntimeAPI.check_status!(status, "VMM IPC copy")
end

#create_shareable_allocation(size) ⇒ `Hash`

Create a VMM allocation on source device (shareable)

Parameters:

size (Integer) —

Size in bytes

Returns:

(Hash) —

:alloc_handle, :win32_handle

# File 'lib/nvruby/collective/transport/vmm_ipc_transport.rb', line 48

def create_shareable_allocation(size)
  ensure_initialized!

  # Get allocation granularity
  prop = VMMBindings.create_allocation_prop(device_id: @src_device, shareable: true)
  granularity_ptr = FFI::MemoryPointer.new(:size_t)
  
  status = VMMBindings.cuMemGetAllocationGranularity(
    granularity_ptr,
    prop,
    0  # CU_MEM_ALLOC_GRANULARITY_MINIMUM
  )
  VMMBindings.check_status!(status, "Get allocation granularity")

  granularity = granularity_ptr.read_size_t
  aligned_size = ((size + granularity - 1) / granularity) * granularity

  # Create allocation
  handle_ptr = FFI::MemoryPointer.new(:uint64)
  status = VMMBindings.cuMemCreate(handle_ptr, aligned_size, prop, 0)
  VMMBindings.check_status!(status, "VMM cuMemCreate")

  alloc_handle = handle_ptr.read_uint64

  # Reserve virtual address
  va_ptr_ptr = FFI::MemoryPointer.new(:uint64)
  status = VMMBindings.cuMemAddressReserve(va_ptr_ptr, aligned_size, 0, 0, 0)
  VMMBindings.check_status!(status, "VMM cuMemAddressReserve")

  va_ptr = va_ptr_ptr.read_uint64

  # Map allocation to virtual address
  status = VMMBindings.cuMemMap(va_ptr, aligned_size, 0, alloc_handle, 0)
  VMMBindings.check_status!(status, "VMM cuMemMap")

  # Set access for source device
  access_desc = VMMBindings.create_access_desc(device_id: @src_device, read_write: true)
  status = VMMBindings.cuMemSetAccess(va_ptr, aligned_size, access_desc, 1)
  VMMBindings.check_status!(status, "VMM cuMemSetAccess")

  # Export to Windows HANDLE
  win32_handle_ptr = FFI::MemoryPointer.new(:pointer)
  status = VMMBindings.cuMemExportToShareableHandle(
    win32_handle_ptr,
    alloc_handle,
    VMMBindings::CU_MEM_HANDLE_TYPE_WIN32,
    0
  )
  VMMBindings.check_status!(status, "VMM cuMemExportToShareableHandle")

  win32_handle = win32_handle_ptr.read_pointer

  # Cache for cleanup
  @exported_handles[va_ptr] = {
    alloc_handle: alloc_handle,
    win32_handle: win32_handle,
    size: aligned_size
  }

  {
    device_ptr: FFI::Pointer.new(:uint8, va_ptr),
    alloc_handle: alloc_handle,
    win32_handle: win32_handle,
    size: aligned_size
  }
end

#destroy! ⇒ `void`

This method returns an undefined value.

Clean up all handles

# File 'lib/nvruby/collective/transport/vmm_ipc_transport.rb', line 243

def destroy!
  # Cleanup exported handles
  @exported_handles.each do |va_ptr, info|
    VMMBindings.cuMemUnmap(va_ptr, info[:size]) rescue nil
    VMMBindings.cuMemAddressFree(va_ptr, info[:size]) rescue nil
    VMMBindings.cuMemRelease(info[:alloc_handle]) rescue nil
    # Note: win32_handle should be closed by caller
  end
  @exported_handles.clear

  # Cleanup imported handles
  @imported_handles.each do |_, info|
    VMMBindings.cuMemUnmap(info[:va_ptr], info[:size]) rescue nil
    VMMBindings.cuMemAddressFree(info[:va_ptr], info[:size]) rescue nil
    VMMBindings.cuMemRelease(info[:alloc_handle]) rescue nil
  end
  @imported_handles.clear

  @initialized = false
end

#estimated_bandwidth ⇒ `Float`

Returns Estimated bandwidth (GB/s).

Returns:

(Float) —

Estimated bandwidth (GB/s)



25
26
27

# File 'lib/nvruby/collective/transport/vmm_ipc_transport.rb', line 25

def estimated_bandwidth
  25.0  # PCIe-limited with IPC overhead
end

#estimated_latency ⇒ `Float`

Returns Estimated latency (microseconds).

Returns:

(Float) —

Estimated latency (microseconds)



30
31
32

# File 'lib/nvruby/collective/transport/vmm_ipc_transport.rb', line 30

def estimated_latency
  10.0  # IPC overhead higher than P2P
end

#import_shareable_allocation(win32_handle, size) ⇒ `FFI::Pointer`

Import a shareable allocation on destination device

Parameters:

win32_handle (FFI::Pointer) —

Windows HANDLE from sender
size (Integer) —

Size in bytes

Returns:

(FFI::Pointer) —

Device pointer mapped on destination GPU

# File 'lib/nvruby/collective/transport/vmm_ipc_transport.rb', line 119

def import_shareable_allocation(win32_handle, size)
  ensure_initialized!

  # Import the handle
  handle_ptr = FFI::MemoryPointer.new(:uint64)
  status = VMMBindings.cuMemImportFromShareableHandle(
    handle_ptr,
    win32_handle,
    VMMBindings::CU_MEM_HANDLE_TYPE_WIN32
  )
  VMMBindings.check_status!(status, "VMM cuMemImportFromShareableHandle")

  alloc_handle = handle_ptr.read_uint64

  # Get granularity for alignment
  prop_ptr = VMMBindings::CUmemAllocationProp.new
  status = VMMBindings.cuMemGetAllocationPropertiesFromHandle(prop_ptr, alloc_handle)
  VMMBindings.check_status!(status, "Get allocation properties")

  # Reserve virtual address on destination device
  va_ptr_ptr = FFI::MemoryPointer.new(:uint64)
  status = VMMBindings.cuMemAddressReserve(va_ptr_ptr, size, 0, 0, 0)
  VMMBindings.check_status!(status, "VMM cuMemAddressReserve (import)")

  va_ptr = va_ptr_ptr.read_uint64

  # Map
  status = VMMBindings.cuMemMap(va_ptr, size, 0, alloc_handle, 0)
  VMMBindings.check_status!(status, "VMM cuMemMap (import)")

  # Set access for destination device
  access_desc = VMMBindings.create_access_desc(device_id: @dst_device, read_write: true)
  status = VMMBindings.cuMemSetAccess(va_ptr, size, access_desc, 1)
  VMMBindings.check_status!(status, "VMM cuMemSetAccess (import)")

  # Cache for cleanup
  @imported_handles[win32_handle.address] = {
    alloc_handle: alloc_handle,
    va_ptr: va_ptr,
    size: size
  }

  FFI::Pointer.new(:uint8, va_ptr)
end

#initialize! ⇒ `void`

This method returns an undefined value.

Initialize the transport

# File 'lib/nvruby/collective/transport/vmm_ipc_transport.rb', line 36

def initialize!
  return if @initialized

  VMMBindings.ensure_loaded!
  @exported_handles = {}    # device_ptr -> {handle: CUmemGenericAllocationHandle, win32_handle: HANDLE}
  @imported_handles = {}    # win32_handle -> {alloc_handle, va_ptr, size}
  @initialized = true
end

#recv_async(buffer, size, stream) ⇒ `void`

This method returns an undefined value.

Async receive

Parameters:

buffer (FFI::Pointer) —

Destination buffer
size (Integer) —

Size
stream (FFI::Pointer) —

CUDA stream

Raises:

(NotImplementedError)



226
227
228

# File 'lib/nvruby/collective/transport/vmm_ipc_transport.rb', line 226

def recv_async(buffer, size, stream)
  raise NotImplementedError, "Use import_shareable_allocation and copy_async"
end

#send_async(buffer, size, stream) ⇒ `Hash`

Async send (export and copy)

Parameters:

buffer (FFI::Pointer) —

Source buffer
size (Integer) —

Size in bytes
stream (FFI::Pointer) —

CUDA stream

Returns:

(Hash) —

:size for receiver

Raises:

(NotImplementedError)

# File 'lib/nvruby/collective/transport/vmm_ipc_transport.rb', line 215

def send_async(buffer, size, stream)
  # For VMM IPC, we typically create a shareable allocation first
  # This method assumes buffer is already a VMM allocation
  raise NotImplementedError, "Use create_shareable_allocation and copy_async"
end

Class: Ignis::Collective::Transport::VMMIPCTransport

Overview

Instance Attribute Summary

Attributes inherited from Base

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from Base

Constructor Details

Class Method Details

.available? ⇒ Boolean

.transport_type ⇒ Symbol

Instance Method Details

#close_imported_handle(device_ptr) ⇒ void

#copy_async(dst_buffer, src_buffer, size, stream) ⇒ Object

#create_shareable_allocation(size) ⇒ Hash

#destroy! ⇒ void

#estimated_bandwidth ⇒ Float

#estimated_latency ⇒ Float

#import_shareable_allocation(win32_handle, size) ⇒ FFI::Pointer

#initialize! ⇒ void

#recv_async(buffer, size, stream) ⇒ void

#send_async(buffer, size, stream) ⇒ Hash

.available? ⇒ `Boolean`

.transport_type ⇒ `Symbol`

#close_imported_handle(device_ptr) ⇒ `void`

#copy_async(dst_buffer, src_buffer, size, stream) ⇒ `Object`

#create_shareable_allocation(size) ⇒ `Hash`

#destroy! ⇒ `void`

#estimated_bandwidth ⇒ `Float`

#estimated_latency ⇒ `Float`

#import_shareable_allocation(win32_handle, size) ⇒ `FFI::Pointer`

#initialize! ⇒ `void`

#recv_async(buffer, size, stream) ⇒ `void`

#send_async(buffer, size, stream) ⇒ `Hash`