Class: Ignis::Collective::Transport::VMMIPCTransport
- Defined in:
- lib/nvruby/collective/transport/vmm_ipc_transport.rb
Overview
VMM IPC Transport - Modern CUDA Virtual Memory Management IPC
Uses cuMemExportToShareableHandle with CU_MEM_HANDLE_TYPE_WIN32 for cross-process GPU memory sharing on Windows. Required for sharing memory allocated with cudaMallocAsync.
Workflow:
-
Sender: cuMemCreate -> cuMemExportToShareableHandle -> send HANDLE
-
Receiver: cuMemImportFromShareableHandle -> cuMemAddressReserve -> cuMemMap -> cuMemSetAccess
Instance Attribute Summary
Attributes inherited from Base
Class Method Summary collapse
-
.available? ⇒ Boolean
Check if VMM IPC is available.
-
.transport_type ⇒ Symbol
Transport type identifier.
Instance Method Summary collapse
-
#close_imported_handle(device_ptr) ⇒ void
Close an imported handle.
-
#copy_async(dst_buffer, src_buffer, size, stream) ⇒ Object
Copy data via VMM IPC (used when both sender and receiver have mapped).
-
#create_shareable_allocation(size) ⇒ Hash
Create a VMM allocation on source device (shareable).
-
#destroy! ⇒ void
Clean up all handles.
-
#estimated_bandwidth ⇒ Float
Estimated bandwidth (GB/s).
-
#estimated_latency ⇒ Float
Estimated latency (microseconds).
-
#import_shareable_allocation(win32_handle, size) ⇒ FFI::Pointer
Import a shareable allocation on destination device.
-
#initialize! ⇒ void
Initialize the transport.
-
#recv_async(buffer, size, stream) ⇒ void
Async receive.
-
#send_async(buffer, size, stream) ⇒ Hash
Async send (export and copy).
Methods inherited from Base
#initialize, #ready?, #recv_sync, #send_sync, #synchronize!, #to_s
Constructor Details
This class inherits a constructor from Ignis::Collective::Transport::Base
Class Method Details
.available? ⇒ Boolean
Check if VMM IPC is available
232 233 234 235 236 237 238 239 |
# File 'lib/nvruby/collective/transport/vmm_ipc_transport.rb', line 232 def self.available? begin VMMBindings.ensure_loaded! true rescue LoadError false end end |
.transport_type ⇒ Symbol
Returns Transport type identifier.
20 21 22 |
# File 'lib/nvruby/collective/transport/vmm_ipc_transport.rb', line 20 def self.transport_type :vmm_ipc end |
Instance Method Details
#close_imported_handle(device_ptr) ⇒ void
This method returns an undefined value.
Close an imported handle
167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 |
# File 'lib/nvruby/collective/transport/vmm_ipc_transport.rb', line 167 def close_imported_handle(device_ptr) va_ptr = device_ptr.address # Find the imported handle info @imported_handles.each do |handle_addr, info| next unless info[:va_ptr] == va_ptr # Unmap VMMBindings.cuMemUnmap(va_ptr, info[:size]) # Free address VMMBindings.cuMemAddressFree(va_ptr, info[:size]) # Release handle VMMBindings.cuMemRelease(info[:alloc_handle]) @imported_handles.delete(handle_addr) break end end |
#copy_async(dst_buffer, src_buffer, size, stream) ⇒ Object
Copy data via VMM IPC (used when both sender and receiver have mapped)
193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 |
# File 'lib/nvruby/collective/transport/vmm_ipc_transport.rb', line 193 def copy_async(dst_buffer, src_buffer, size, stream) ensure_initialized! # For VMM IPC, once both sides have mapped the same allocation, # we can use regular cudaMemcpyAsync CUDA::RuntimeAPI.ensure_loaded! status = CUDA::RuntimeAPI.cudaMemcpyAsync( dst_buffer, src_buffer, size, CUDA::RuntimeAPI::MEMCPY_DEVICE_TO_DEVICE, stream ) CUDA::RuntimeAPI.check_status!(status, "VMM IPC copy") end |
#create_shareable_allocation(size) ⇒ Hash
Create a VMM allocation on source device (shareable)
48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 |
# File 'lib/nvruby/collective/transport/vmm_ipc_transport.rb', line 48 def create_shareable_allocation(size) ensure_initialized! # Get allocation granularity prop = VMMBindings.create_allocation_prop(device_id: @src_device, shareable: true) granularity_ptr = FFI::MemoryPointer.new(:size_t) status = VMMBindings.cuMemGetAllocationGranularity( granularity_ptr, prop, 0 # CU_MEM_ALLOC_GRANULARITY_MINIMUM ) VMMBindings.check_status!(status, "Get allocation granularity") granularity = granularity_ptr.read_size_t aligned_size = ((size + granularity - 1) / granularity) * granularity # Create allocation handle_ptr = FFI::MemoryPointer.new(:uint64) status = VMMBindings.cuMemCreate(handle_ptr, aligned_size, prop, 0) VMMBindings.check_status!(status, "VMM cuMemCreate") alloc_handle = handle_ptr.read_uint64 # Reserve virtual address va_ptr_ptr = FFI::MemoryPointer.new(:uint64) status = VMMBindings.cuMemAddressReserve(va_ptr_ptr, aligned_size, 0, 0, 0) VMMBindings.check_status!(status, "VMM cuMemAddressReserve") va_ptr = va_ptr_ptr.read_uint64 # Map allocation to virtual address status = VMMBindings.cuMemMap(va_ptr, aligned_size, 0, alloc_handle, 0) VMMBindings.check_status!(status, "VMM cuMemMap") # Set access for source device access_desc = VMMBindings.create_access_desc(device_id: @src_device, read_write: true) status = VMMBindings.cuMemSetAccess(va_ptr, aligned_size, access_desc, 1) VMMBindings.check_status!(status, "VMM cuMemSetAccess") # Export to Windows HANDLE win32_handle_ptr = FFI::MemoryPointer.new(:pointer) status = VMMBindings.cuMemExportToShareableHandle( win32_handle_ptr, alloc_handle, VMMBindings::CU_MEM_HANDLE_TYPE_WIN32, 0 ) VMMBindings.check_status!(status, "VMM cuMemExportToShareableHandle") win32_handle = win32_handle_ptr.read_pointer # Cache for cleanup @exported_handles[va_ptr] = { alloc_handle: alloc_handle, win32_handle: win32_handle, size: aligned_size } { device_ptr: FFI::Pointer.new(:uint8, va_ptr), alloc_handle: alloc_handle, win32_handle: win32_handle, size: aligned_size } end |
#destroy! ⇒ void
This method returns an undefined value.
Clean up all handles
243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 |
# File 'lib/nvruby/collective/transport/vmm_ipc_transport.rb', line 243 def destroy! # Cleanup exported handles @exported_handles.each do |va_ptr, info| VMMBindings.cuMemUnmap(va_ptr, info[:size]) rescue nil VMMBindings.cuMemAddressFree(va_ptr, info[:size]) rescue nil VMMBindings.cuMemRelease(info[:alloc_handle]) rescue nil # Note: win32_handle should be closed by caller end @exported_handles.clear # Cleanup imported handles @imported_handles.each do |_, info| VMMBindings.cuMemUnmap(info[:va_ptr], info[:size]) rescue nil VMMBindings.cuMemAddressFree(info[:va_ptr], info[:size]) rescue nil VMMBindings.cuMemRelease(info[:alloc_handle]) rescue nil end @imported_handles.clear @initialized = false end |
#estimated_bandwidth ⇒ Float
Returns Estimated bandwidth (GB/s).
25 26 27 |
# File 'lib/nvruby/collective/transport/vmm_ipc_transport.rb', line 25 def estimated_bandwidth 25.0 # PCIe-limited with IPC overhead end |
#estimated_latency ⇒ Float
Returns Estimated latency (microseconds).
30 31 32 |
# File 'lib/nvruby/collective/transport/vmm_ipc_transport.rb', line 30 def estimated_latency 10.0 # IPC overhead higher than P2P end |
#import_shareable_allocation(win32_handle, size) ⇒ FFI::Pointer
Import a shareable allocation on destination device
119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 |
# File 'lib/nvruby/collective/transport/vmm_ipc_transport.rb', line 119 def import_shareable_allocation(win32_handle, size) ensure_initialized! # Import the handle handle_ptr = FFI::MemoryPointer.new(:uint64) status = VMMBindings.cuMemImportFromShareableHandle( handle_ptr, win32_handle, VMMBindings::CU_MEM_HANDLE_TYPE_WIN32 ) VMMBindings.check_status!(status, "VMM cuMemImportFromShareableHandle") alloc_handle = handle_ptr.read_uint64 # Get granularity for alignment prop_ptr = VMMBindings::CUmemAllocationProp.new status = VMMBindings.cuMemGetAllocationPropertiesFromHandle(prop_ptr, alloc_handle) VMMBindings.check_status!(status, "Get allocation properties") # Reserve virtual address on destination device va_ptr_ptr = FFI::MemoryPointer.new(:uint64) status = VMMBindings.cuMemAddressReserve(va_ptr_ptr, size, 0, 0, 0) VMMBindings.check_status!(status, "VMM cuMemAddressReserve (import)") va_ptr = va_ptr_ptr.read_uint64 # Map status = VMMBindings.cuMemMap(va_ptr, size, 0, alloc_handle, 0) VMMBindings.check_status!(status, "VMM cuMemMap (import)") # Set access for destination device access_desc = VMMBindings.create_access_desc(device_id: @dst_device, read_write: true) status = VMMBindings.cuMemSetAccess(va_ptr, size, access_desc, 1) VMMBindings.check_status!(status, "VMM cuMemSetAccess (import)") # Cache for cleanup @imported_handles[win32_handle.address] = { alloc_handle: alloc_handle, va_ptr: va_ptr, size: size } FFI::Pointer.new(:uint8, va_ptr) end |
#initialize! ⇒ void
This method returns an undefined value.
Initialize the transport
36 37 38 39 40 41 42 43 |
# File 'lib/nvruby/collective/transport/vmm_ipc_transport.rb', line 36 def initialize! return if @initialized VMMBindings.ensure_loaded! @exported_handles = {} # device_ptr -> {handle: CUmemGenericAllocationHandle, win32_handle: HANDLE} @imported_handles = {} # win32_handle -> {alloc_handle, va_ptr, size} @initialized = true end |
#recv_async(buffer, size, stream) ⇒ void
This method returns an undefined value.
Async receive
226 227 228 |
# File 'lib/nvruby/collective/transport/vmm_ipc_transport.rb', line 226 def recv_async(buffer, size, stream) raise NotImplementedError, "Use import_shareable_allocation and copy_async" end |
#send_async(buffer, size, stream) ⇒ Hash
Async send (export and copy)
215 216 217 218 219 |
# File 'lib/nvruby/collective/transport/vmm_ipc_transport.rb', line 215 def send_async(buffer, size, stream) # For VMM IPC, we typically create a shareable allocation first # This method assumes buffer is already a VMM allocation raise NotImplementedError, "Use create_shareable_allocation and copy_async" end |