Class: Ignis::Shared::RecoveryProtocol

Inherits:
Object
  • Object
show all
Defined in:
lib/nnw/shared/recovery_protocol.rb

Overview

RecoveryProtocol — Unified failure state machine shared by all three layers.

This is the spine of the recovery flow. The previous build had each layer implement its own recovery logic — they got out of sync. This singleton ensures all layers see the same state and transition atomically.

States:

HEALTHY → DEGRADED → RECOVERING → HEALTHY
                  ↘ FAILED (if recovery exceeds max_attempts)
FAILED → RECOVERING (manual reset)

Thread-safe: all operations protected by Monitor.

Constant Summary collapse

STATES =

Valid states for the recovery state machine.

%i[healthy degraded recovering failed].freeze
TRANSITIONS =

Valid transition map: current_state => [allowed_next_states]

{
  healthy:    [:degraded],
  degraded:   [:recovering],
  recovering: [:healthy, :failed],
  failed:     [:recovering]
}.freeze
CALLBACK_EVENTS =

Callback event names per state.

%i[on_degraded on_recovering on_healthy on_failed].freeze
DEFAULT_MAX_ATTEMPTS =

Default maximum recovery attempts before transitioning to FAILED.

3
DEFAULT_RECOVERY_TIMEOUT_MS =

Default recovery timeout in milliseconds.

30_000
MAX_HISTORY =

Maximum history entries.

20

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ RecoveryProtocol

Returns a new instance of RecoveryProtocol.



156
157
158
159
160
161
162
163
164
165
166
# File 'lib/nnw/shared/recovery_protocol.rb', line 156

# Builds a protocol instance in its initial, healthy configuration.
#
# Nothing is registered and nothing has failed yet: the callback table and
# the history start empty, the attempt counter is zero, and both tunables
# take their DEFAULT_* values.
def initialize
  # Lock that the public instance methods take around the state below.
  @monitor = Monitor.new

  # Tunables.
  @max_attempts = DEFAULT_MAX_ATTEMPTS
  @recovery_timeout_ms = DEFAULT_RECOVERY_TIMEOUT_MS

  # State-machine bookkeeping.
  @state = :healthy
  @attempt_count = 0
  @history = []
  @callbacks = {} # layer => { on_degraded: proc, ... }

  # Details of the failure currently being handled; nil until begin! sets them.
  @failed_gpu_id = nil
  @failed_reason = nil
end

Instance Attribute Details

#max_attempts ⇒ Object

Maximum number of recovery attempts before transitioning to FAILED (initialized to DEFAULT_MAX_ATTEMPTS).



154
155
156
# File 'lib/nnw/shared/recovery_protocol.rb', line 154

# Reader for the configured recovery-attempt limit.
#
# The constructor sets this to DEFAULT_MAX_ATTEMPTS. The value is read
# directly, without taking the monitor.
#
# @return [Integer]
def max_attempts
  @max_attempts
end

#recovery_timeout_ms ⇒ Object

Recovery timeout in milliseconds (initialized to DEFAULT_RECOVERY_TIMEOUT_MS).



154
155
156
# File 'lib/nnw/shared/recovery_protocol.rb', line 154

# Reader for the configured recovery timeout, in milliseconds.
#
# The constructor sets this to DEFAULT_RECOVERY_TIMEOUT_MS. The value is read
# directly, without taking the monitor.
#
# @return [Integer]
def recovery_timeout_ms
  @recovery_timeout_ms
end

Class Method Details

.attempt_count ⇒ Integer

Get current attempt count.

Returns:

  • (Integer)


121
122
123
# File 'lib/nnw/shared/recovery_protocol.rb', line 121

# Class-level shortcut for the singleton's attempt counter.
#
# @return [Integer] whatever the shared instance's #attempt_count reports
def self.attempt_count
  protocol = instance
  protocol.attempt_count
end

.begin!(gpu_id:, reason:) ⇒ Symbol

Begin recovery for a failed GPU.

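Transitions: HEALTHY → DEGRADED (fires on_degraded callbacks),

then DEGRADED → RECOVERING (fires on_recovering callbacks). If the attempt counter exceeds max_attempts, the second step goes to FAILED (fires on_failed callbacks) instead.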

Parameters:

  • gpu_id (Integer)

    the failed GPU device ID

  • reason (Symbol)

    reason for failure (:heartbeat_timeout, :memory_error, etc.)

Returns:

  • (Symbol)

    the new state

Raises:

  • (RuntimeError)

    if the current state is not :healthy



84
85
86
# File 'lib/nnw/shared/recovery_protocol.rb', line 84

# Class-level entry point for starting recovery; forwards both arguments,
# unchanged, to the singleton's #begin!.
#
# @param gpu_id [Integer] device ID of the failed GPU
# @param reason [Symbol] why it failed
# @return [Symbol] the value returned by the singleton's #begin!
def self.begin!(gpu_id:, reason:)
  protocol = instance
  protocol.begin!(gpu_id: gpu_id, reason: reason)
end

.complete!(recovered_gpus:) ⇒ Symbol

Complete recovery successfully.

Transitions: RECOVERING → HEALTHY (fires on_healthy callbacks). Resets attempt counter.

Parameters:

  • recovered_gpus (Array<Integer>)

    list of active GPU IDs after recovery

Returns:

  • (Symbol)

    the new state (:healthy)

Raises:

  • (RuntimeError)

    if the current state is not :recovering



95
96
97
# File 'lib/nnw/shared/recovery_protocol.rb', line 95

# Class-level entry point for finishing recovery; forwards the GPU list,
# unchanged, to the singleton's #complete!.
#
# @param recovered_gpus [Array<Integer>] active GPU IDs after recovery
# @return [Symbol] the value returned by the singleton's #complete!
def self.complete!(recovered_gpus:)
  protocol = instance
  protocol.complete!(recovered_gpus: recovered_gpus)
end

.failed_gpu_id ⇒ Integer?

Get the failed GPU ID (if in DEGRADED or RECOVERING state).

Returns:

  • (Integer, nil)


127
128
129
# File 'lib/nnw/shared/recovery_protocol.rb', line 127

# Class-level shortcut for the GPU ID recorded on the singleton.
#
# @return [Integer, nil] whatever the shared instance's #failed_gpu_id reports
def self.failed_gpu_id
  protocol = instance
  protocol.failed_gpu_id
end

.history ⇒ Array<Hash>

Get transition history.

Returns:

  • (Array<Hash>)

    last 20 transitions with to:, timestamp:, context:



115
116
117
# File 'lib/nnw/shared/recovery_protocol.rb', line 115

# Class-level shortcut for the singleton's transition history.
#
# @return [Array<Hash>] whatever the shared instance's #history returns
def self.history
  protocol = instance
  protocol.history
end

.instance ⇒ RecoveryProtocol

Returns the singleton instance, creating it on first access.

Returns:

  • (RecoveryProtocol)

41
42
43
# File 'lib/nnw/shared/recovery_protocol.rb', line 41

# Serializes first-time construction of the singleton. Without it, two
# threads that both see @instance as nil would each call `new` and could keep
# using different objects, so layers would no longer share one state machine.
INSTANCE_MUTEX = Mutex.new

# Returns the shared instance, building it on first use.
#
# @return [RecoveryProtocol] the one instance shared by every caller
def self.instance
  # Once the instance exists it is returned without locking. Otherwise the
  # check is repeated under the lock so that only one thread ever runs `new`.
  @instance || INSTANCE_MUTEX.synchronize { @instance ||= new }
end

.max_attempts ⇒ Integer

Configuration accessors.

Returns:

  • (Integer)


133
134
135
# File 'lib/nnw/shared/recovery_protocol.rb', line 133

# Class-level reader for the singleton's attempt limit.
#
# @return [Integer] whatever the shared instance's #max_attempts reports
def self.max_attempts
  protocol = instance
  protocol.max_attempts
end

.max_attempts=(value) ⇒ Object

Parameters:

  • value (Integer)


138
139
140
# File 'lib/nnw/shared/recovery_protocol.rb', line 138

# Class-level writer: hands the new limit to the singleton's max_attempts=.
#
# @param value [Integer] new attempt limit
def self.max_attempts=(value)
  protocol = instance
  protocol.max_attempts = value
end

.recovery_timeout_ms ⇒ Integer

Returns:

  • (Integer)


143
144
145
# File 'lib/nnw/shared/recovery_protocol.rb', line 143

# Class-level reader for the singleton's recovery timeout.
#
# @return [Integer] whatever the shared instance's #recovery_timeout_ms reports
def self.recovery_timeout_ms
  protocol = instance
  protocol.recovery_timeout_ms
end

.recovery_timeout_ms=(value) ⇒ Object

Parameters:

  • value (Integer)


148
149
150
# File 'lib/nnw/shared/recovery_protocol.rb', line 148

# Class-level writer: hands the new timeout to the singleton's
# recovery_timeout_ms=.
#
# @param value [Integer] new timeout in milliseconds
def self.recovery_timeout_ms=(value)
  protocol = instance
  protocol.recovery_timeout_ms = value
end

.register(layer:, on_degraded: nil, on_recovering: nil, on_healthy: nil, on_failed: nil) ⇒ void

This method returns an undefined value.

Register callbacks for a layer.

Parameters:

  • layer (Symbol)

    identifying the layer (:nvruby, :nvccl, :wnais)

  • on_degraded (Proc, nil) (defaults to: nil)

    called when state transitions to DEGRADED

  • on_recovering (Proc, nil) (defaults to: nil)

    called when state transitions to RECOVERING

  • on_healthy (Proc, nil) (defaults to: nil)

    called when state transitions to HEALTHY

  • on_failed (Proc, nil) (defaults to: nil)

    called when state transitions to FAILED



59
60
61
62
63
64
65
66
67
# File 'lib/nnw/shared/recovery_protocol.rb', line 59

# Class-level entry point for registering a layer's callbacks; forwards every
# argument, unchanged, to the singleton's #register.
#
# @param layer [Symbol] identifies the registering layer
# @param on_degraded [Proc, nil] handler for the on_degraded event
# @param on_recovering [Proc, nil] handler for the on_recovering event
# @param on_healthy [Proc, nil] handler for the on_healthy event
# @param on_failed [Proc, nil] handler for the on_failed event
# @return whatever the singleton's #register returns
def self.register(layer:, on_degraded: nil, on_recovering: nil, on_healthy: nil, on_failed: nil)
  handlers = {
    on_degraded: on_degraded,
    on_recovering: on_recovering,
    on_healthy: on_healthy,
    on_failed: on_failed
  }
  instance.register(layer: layer, **handlers)
end

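.reset! ⇒ Symbol

Manually reset from FAILED state to start recovery again.

Transitions: FAILED → RECOVERING (fires on_recovering callbacks). Resets attempt counter.

Note: the source listed below for this method (line 47) does not perform this transition. It assigns a brand-new instance to the singleton, which discards the previous state, callbacks and history, and it returns the new RecoveryProtocol rather than a Symbol. The transition described here is what the instance method #manual_reset! implements. Verify which behaviour .reset! is meant to have.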

Returns:

  • (Symbol)

    the new state (:recovering)

Raises:

  • (RuntimeError)

    if current state is not :failed



47
48
49
# File 'lib/nnw/shared/recovery_protocol.rb', line 47

# Swaps the singleton for a brand-new instance and returns it.
#
# Whatever the previous instance held is no longer reachable through the
# cached @instance after this call. No state check is made.
#
# NOTE(review): the published description of .reset! talks about a
# FAILED → RECOVERING transition; this body does not do that — confirm intent.
def self.reset!
  replacement = new
  @instance = replacement
end

.state ⇒ Symbol

Get the current state.

Returns:

  • (Symbol)

    one of :healthy, :degraded, :recovering, :failed



71
72
73
# File 'lib/nnw/shared/recovery_protocol.rb', line 71

# Class-level shortcut for the singleton's current state.
#
# @return [Symbol] whatever the shared instance's #state reports
def self.state
  protocol = instance
  protocol.state
end

Instance Method Details

#attempt_count ⇒ Integer

Returns:

  • (Integer)


185
186
187
# File 'lib/nnw/shared/recovery_protocol.rb', line 185

# Reads the attempt counter while holding the monitor.
#
# @return [Integer]
def attempt_count
  @monitor.synchronize do
    @attempt_count
  end
end

#begin!(gpu_id:, reason:) ⇒ Object



194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
# File 'lib/nnw/shared/recovery_protocol.rb', line 194

# Starts recovery for a GPU that has just failed.
#
# Only legal from :healthy. Records the failing GPU and the reason, bumps the
# attempt counter, moves to :degraded (firing on_degraded) and then moves
# straight on to :recovering (firing on_recovering) — or to :failed (firing
# on_failed) when the attempt counter is above max_attempts.
#
# The whole sequence, callbacks included, runs while holding the monitor.
#
# NOTE(review): TRANSITIONS lists only :recovering as a successor of
# :degraded, yet the exhausted branch requests :degraded → :failed. Whether
# that is rejected depends on transition_to!, which is not visible here —
# confirm.
# NOTE(review): every visible path back to :healthy zeroes the counter, so
# the counter looks like it is always 1 at the check below — confirm that the
# :failed branch is reachable when max_attempts >= 1.
#
# @param gpu_id [Integer] device ID of the failed GPU
# @param reason [Symbol] why it failed
# @return [Symbol] the state after both transitions
# @raise [RuntimeError] if the current state is not :healthy
def begin!(gpu_id:, reason:)
  @monitor.synchronize do
    raise "Cannot begin recovery: current state is #{@state.inspect}, expected :healthy" unless @state == :healthy

    @failed_gpu_id = gpu_id
    @failed_reason = reason
    @attempt_count += 1

    # Step 1: HEALTHY → DEGRADED, then tell the layers.
    transition_to!(:degraded, context: { gpu_id: gpu_id, reason: reason, attempt: @attempt_count })
    fire_callbacks(:on_degraded, gpu_id: gpu_id, reason: reason)

    # Step 2: leave DEGRADED at once. The destination, the event to fire and
    # the reason to report all depend on whether the attempt budget is spent.
    next_state, event, why =
      if @attempt_count > @max_attempts
        [:failed, :on_failed, :max_attempts_exceeded]
      else
        [:recovering, :on_recovering, reason]
      end

    transition_to!(next_state, context: { gpu_id: gpu_id, reason: why, attempt: @attempt_count })
    fire_callbacks(event, gpu_id: gpu_id, reason: why)

    @state
  end
end

#complete!(recovered_gpus:) ⇒ Object



223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
# File 'lib/nnw/shared/recovery_protocol.rb', line 223

# Marks the current recovery as successful.
#
# Only legal from :recovering. Zeroes the attempt counter, moves to :healthy,
# fires on_healthy, and then forgets the recorded failure details. All of
# this, callbacks included, runs while holding the monitor.
#
# @param recovered_gpus [Array<Integer>] active GPU IDs after recovery
# @return [Symbol] the state after the transition
# @raise [RuntimeError] if the current state is not :recovering
def complete!(recovered_gpus:)
  @monitor.synchronize do
    if @state != :recovering
      raise "Cannot complete recovery: current state is #{@state.inspect}, expected :recovering"
    end

    @attempt_count = 0
    transition_to!(:healthy, context: { recovered_gpus: recovered_gpus })
    fire_callbacks(:on_healthy, recovered_gpus: recovered_gpus)

    # Cleared only after the callbacks have run, so they still see the values.
    @failed_gpu_id = @failed_reason = nil

    @state
  end
end

#failed_gpu_id ⇒ Integer?

Returns:

  • (Integer, nil)


190
191
192
# File 'lib/nnw/shared/recovery_protocol.rb', line 190

# Reads the recorded failed-GPU ID while holding the monitor.
#
# @return [Integer, nil] nil when no failure is currently recorded
def failed_gpu_id
  @monitor.synchronize do
    @failed_gpu_id
  end
end

#history ⇒ Object



256
257
258
259
260
# File 'lib/nnw/shared/recovery_protocol.rb', line 256

# Returns a copy of the transition history, taken while holding the monitor.
#
# The copy is shallow: the returned array is new, but its entries are the
# same objects the protocol holds.
#
# @return [Array] a duplicate of the internal history list
def history
  @monitor.synchronize { @history.dup }
end

#manual_reset! ⇒ Object



241
242
243
244
245
246
247
248
249
250
251
252
253
254
# File 'lib/nnw/shared/recovery_protocol.rb', line 241

# Restarts recovery by hand after the protocol has given up.
#
# Only legal from :failed. Zeroes the attempt counter, moves to :recovering
# and fires on_recovering with `manual_reset: true`. All of this, callbacks
# included, runs while holding the monitor.
#
# @return [Symbol] the state after the transition
# @raise [RuntimeError] if the current state is not :failed
def manual_reset!
  @monitor.synchronize do
    raise "Cannot manual reset: current state is #{@state.inspect}, expected :failed" if @state != :failed

    @attempt_count = 0

    transition_to!(:recovering, context: { manual_reset: true })
    fire_callbacks(:on_recovering, manual_reset: true)

    @state
  end
end

#register(layer:, on_degraded: nil, on_recovering: nil, on_healthy: nil, on_failed: nil) ⇒ Object



168
169
170
171
172
173
174
175
176
177
# File 'lib/nnw/shared/recovery_protocol.rb', line 168

# Stores one layer's callbacks in the callback table.
#
# All four event slots are always written, so registering the same layer
# again replaces its previous entry wholesale and any omitted handler
# becomes nil.
#
# @param layer [Symbol] key under which the handlers are stored
# @param on_degraded [Proc, nil] handler for the on_degraded event
# @param on_recovering [Proc, nil] handler for the on_recovering event
# @param on_healthy [Proc, nil] handler for the on_healthy event
# @param on_failed [Proc, nil] handler for the on_failed event
# @return [Hash] the handler table that was stored for the layer
def register(layer:, on_degraded: nil, on_recovering: nil, on_healthy: nil, on_failed: nil)
  handlers = {
    on_degraded: on_degraded,
    on_recovering: on_recovering,
    on_healthy: on_healthy,
    on_failed: on_failed
  }

  # Only the write to the shared table needs the lock.
  @monitor.synchronize { @callbacks[layer] = handlers }
end

#state ⇒ Symbol

Returns:

  • (Symbol)


180
181
182
# File 'lib/nnw/shared/recovery_protocol.rb', line 180

# Reads the current state while holding the monitor.
#
# @return [Symbol]
def state
  @monitor.synchronize do
    @state
  end
end