Class: Ignis::Collective::HealthMonitor

Inherits:
Object
  • Object
show all
Defined in:
lib/nvruby/collective/health_monitor.rb

Overview

GPU health monitoring with proactive failure detection. Monitors heartbeat, memory, and thermal status for each GPU.

Examples:

Usage

monitor = HealthMonitor.new([0, 1, 2, 3])
monitor.start!

if monitor.unhealthy_devices.any?
  healer.heal!(monitor.unhealthy_devices)
end

Constant Summary collapse

HEARTBEAT_INTERVAL =

Health check interval (seconds)

5.0
MEMORY_THRESHOLD =

Minimum free memory percentage

0.05
TEMP_CRITICAL =

Critical temperature (Celsius)

90
TEMP_WARNING =

Warning temperature (Celsius)

80
FAILURE_THRESHOLD =

Consecutive failures before marking unhealthy

3
STATUS_HEALTHY =

Health status values

:healthy
STATUS_WARNING =
:warning
STATUS_CRITICAL =
:critical
STATUS_FAILED =
:failed

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(device_ids) ⇒ HealthMonitor

Create health monitor for specified GPUs

Parameters:

  • device_ids (Array<Integer>)

    GPU device IDs to monitor



50
51
52
53
54
55
56
57
58
# File 'lib/nvruby/collective/health_monitor.rb', line 50

# Create a health monitor for the given GPUs.
#
# Starts in the idle state: no monitor thread, monitoring flag off, and an
# empty handler list for each of the three callback events. Per-device
# status setup is delegated to +initialize_status!+.
#
# @param device_ids [Array<Integer>] GPU device IDs to monitor; a frozen
#   copy is stored, so later changes to the caller's array are not seen
def initialize(device_ids)
  @monitor_thread = nil
  @monitoring = false
  @callbacks = %i[on_failure on_recovery on_warning].to_h { |event| [event, []] }
  @device_status = {}
  @device_ids = device_ids.dup.freeze

  initialize_status!
end

Instance Attribute Details

#device_ids ⇒ Array<Integer> (readonly)

Returns the monitored device IDs.

Returns:

  • (Array<Integer>)

    Monitored device IDs



39
40
41
# File 'lib/nvruby/collective/health_monitor.rb', line 39

# IDs of the GPUs this monitor watches.
#
# @return [Array<Integer>] the frozen array stored in +@device_ids+
def device_ids
  @device_ids
end

#device_status ⇒ Hash<Integer, Hash> (readonly)

Returns the device health status.

Returns:

  • (Hash<Integer, Hash>)

    Device health status



42
43
44
# File 'lib/nvruby/collective/health_monitor.rb', line 42

# Per-device health records, keyed by device ID.
#
# Returns the internal hash itself, not a copy, so mutations by the caller
# are visible to the monitor. Use +device_details+ for a copy of a single
# device's record.
#
# @return [Hash<Integer, Hash>] device ID => status details
def device_status
  @device_status
end

#monitoring ⇒ Boolean (readonly)

Returns whether monitoring is active.

Returns:

  • (Boolean)

    Whether monitoring is active



45
46
47
# File 'lib/nvruby/collective/health_monitor.rb', line 45

# Whether background monitoring is currently switched on.
#
# @return [Boolean] the +@monitoring+ flag (set by +start!+, cleared by +stop!+)
def monitoring
  @monitoring
end

Instance Method Details

#check_now! ⇒ Hash<Integer, Symbol>

Run single health check (synchronous)

Returns:

  • (Hash<Integer, Symbol>)

    Device status map



79
80
81
82
# File 'lib/nvruby/collective/health_monitor.rb', line 79

# Run one synchronous health check over every monitored device, in the
# order the IDs were given, then report the resulting statuses.
#
# @return [Hash<Integer, Symbol>] the value of +status_summary+ after the checks
def check_now!
  @device_ids.each do |gpu_id|
    check_device!(gpu_id)
  end

  status_summary
end

#destroy! ⇒ void

This method returns an undefined value.

Clean up resources



159
160
161
162
163
# File 'lib/nvruby/collective/health_monitor.rb', line 159

# Release everything the monitor holds: stop monitoring, drop all device
# status records, and empty every callback list in place (the event keys
# themselves are kept).
#
# @return [void]
def destroy!
  stop!
  @device_status.clear
  @callbacks.each_value do |handlers|
    handlers.clear
  end
end

#device_details(device_id) ⇒ Hash

Get detailed status for a device

Parameters:

  • device_id (Integer)

    GPU device ID

Returns:

  • (Hash)

    Status details



141
142
143
# File 'lib/nvruby/collective/health_monitor.rb', line 141

# Look up the status record for one device.
#
# @param device_id [Integer] GPU device ID
# @return [Hash, nil] a shallow copy of the device's record, or nil when
#   no record exists for that ID
def device_details(device_id)
  record = @device_status[device_id]
  record.nil? ? nil : record.dup
end

#force_status!(device_id, status) ⇒ void

This method returns an undefined value.

Force device status (for testing or manual override)

Parameters:

  • device_id (Integer)

    GPU device ID

  • status (Symbol)

    New status



150
151
152
153
154
155
# File 'lib/nvruby/collective/health_monitor.rb', line 150

# Override a device's status by hand (for testing or manual intervention).
#
# Does nothing for a device ID that has no status record. Otherwise the
# record's +:status+ is replaced and +:forced+ is set to true.
#
# @param device_id [Integer] GPU device ID
# @param status [Symbol] new status; stored as given, without validation
# @return [void]
def force_status!(device_id, status)
  if @device_status.key?(device_id)
    record = @device_status[device_id]
    record[:status] = status
    record[:forced] = true
  end
end

#healthy?(device_id) ⇒ Boolean

Check if specific device is healthy

Parameters:

  • device_id (Integer)

    GPU device ID

Returns:

  • (Boolean)

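    True if the device's status is healthy or warning (a warning still counts as healthy); false otherwise, including for unknown devices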



88
89
90
91
92
93
# File 'lib/nvruby/collective/health_monitor.rb', line 88

# Report whether a device is considered usable.
#
# A device counts as healthy when its recorded status is STATUS_HEALTHY or
# STATUS_WARNING. Devices without a status record are never healthy.
#
# @param device_id [Integer] GPU device ID
# @return [Boolean] true for healthy/warning status, false otherwise
def healthy?(device_id)
  record = @device_status[device_id]

  if record
    [STATUS_HEALTHY, STATUS_WARNING].include?(record[:status])
  else
    false
  end
end

#healthy_devices ⇒ Array<Integer>

Get all healthy device IDs

Returns:

  • (Array<Integer>)

    Healthy devices



97
98
99
# File 'lib/nvruby/collective/health_monitor.rb', line 97

# List the monitored devices for which +healthy?+ is true, preserving the
# order of the monitored ID list.
#
# @return [Array<Integer>] IDs of healthy devices (a new array)
def healthy_devices
  @device_ids.select(&method(:healthy?))
end

#on_failure {|device_id, reason| ... } ⇒ void

This method returns an undefined value.

Register callback for device failure

Yields:

  • (device_id, reason)

    Called when device fails



117
118
119
# File 'lib/nvruby/collective/health_monitor.rb', line 117

# Register a callback for device failure.
#
# The block is appended to the +:on_failure+ handler list. A call without
# a block is rejected so that +nil+ never ends up in that list.
#
# @yield [device_id, reason] called when a device fails
# @raise [ArgumentError] if no block is given
# @return [void]
def on_failure(&block)
  raise ArgumentError, 'on_failure requires a block' unless block

  @callbacks[:on_failure] << block
end

#on_recovery {|device_id| ... } ⇒ void

This method returns an undefined value.

Register callback for device recovery

Yields:

  • (device_id)

    Called when device recovers



125
126
127
# File 'lib/nvruby/collective/health_monitor.rb', line 125

# Register a callback for device recovery.
#
# The block is appended to the +:on_recovery+ handler list. A call without
# a block is rejected so that +nil+ never ends up in that list.
#
# @yield [device_id] called when a device recovers
# @raise [ArgumentError] if no block is given
# @return [void]
def on_recovery(&block)
  raise ArgumentError, 'on_recovery requires a block' unless block

  @callbacks[:on_recovery] << block
end

#on_warning {|device_id, warning_type, value| ... } ⇒ void

This method returns an undefined value.

Register callback for warnings

Yields:

  • (device_id, warning_type, value)

    Called on warning



133
134
135
# File 'lib/nvruby/collective/health_monitor.rb', line 133

# Register a callback for warnings.
#
# The block is appended to the +:on_warning+ handler list. A call without
# a block is rejected so that +nil+ never ends up in that list.
#
# @yield [device_id, warning_type, value] called on warning
# @raise [ArgumentError] if no block is given
# @return [void]
def on_warning(&block)
  raise ArgumentError, 'on_warning requires a block' unless block

  @callbacks[:on_warning] << block
end

#start! ⇒ void

This method returns an undefined value.

Start background monitoring



62
63
64
65
66
67
# File 'lib/nvruby/collective/health_monitor.rb', line 62

# Begin background monitoring.
#
# Sets the monitoring flag and spawns a thread that runs +monitor_loop+.
# Calling it while monitoring is already on does nothing.
#
# @return [void]
def start!
  unless @monitoring
    @monitoring = true
    @monitor_thread = Thread.new { monitor_loop }
  end
end

#status_summary ⇒ Hash<Integer, Symbol>

Get status summary

Returns:

  • (Hash<Integer, Symbol>)

    Device → status



109
110
111
# File 'lib/nvruby/collective/health_monitor.rb', line 109

# Condense the per-device records down to just their +:status+ values.
#
# @return [Hash<Integer, Symbol>] device ID => status, as a new hash in the
#   same key order as the status records
def status_summary
  @device_status.each_with_object({}) do |(gpu_id, record), summary|
    summary[gpu_id] = record[:status]
  end
end

#stop! ⇒ void

This method returns an undefined value.

Stop background monitoring



71
72
73
74
75
# File 'lib/nvruby/collective/health_monitor.rb', line 71

# Stop background monitoring.
#
# Clears the monitoring flag, waits up to 2 seconds for the monitor thread
# to finish, then forgets the thread. Safe to call when no thread exists.
#
# Also safe to call from the monitor thread itself: in that case the wait
# is skipped, because Thread#join raises ThreadError when a thread joins
# itself, which previously aborted the call before the thread reference
# was cleared.
#
# NOTE(review): when the 2-second wait times out the thread reference is
# dropped while the thread may still be running — confirm that
# +monitor_loop+ exits promptly once the flag is cleared.
#
# @return [void]
def stop!
  @monitoring = false

  worker = @monitor_thread
  worker.join(2.0) if worker && !worker.equal?(Thread.current)

  @monitor_thread = nil
end

#to_s ⇒ String

Returns a human-readable summary.

Returns:

  • (String)

    Human-readable summary



166
167
168
169
170
# File 'lib/nvruby/collective/health_monitor.rb', line 166

# Human-readable summary: how many of the monitored devices are healthy.
#
# @return [String] e.g. "HealthMonitor[3/4 healthy]"
def to_s
  "HealthMonitor[#{healthy_devices.size}/#{@device_ids.size} healthy]"
end

#unhealthy_devices ⇒ Array<Integer>

Get all unhealthy device IDs

Returns:

  • (Array<Integer>)

    Unhealthy devices



103
104
105
# File 'lib/nvruby/collective/health_monitor.rb', line 103

# List the monitored devices for which +healthy?+ is false, preserving the
# order of the monitored ID list.
#
# @return [Array<Integer>] IDs of unhealthy devices (a new array)
def unhealthy_devices
  @device_ids.reject(&method(:healthy?))
end