Class: Protobuf::Nats::Server

Inherits:
Object
  • Object
show all
Includes:
Logging, Rpc::Server
Defined in:
lib/protobuf/nats/server.rb

Constant Summary collapse

MILLISECOND =
1000

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(options) ⇒ Server

Returns a new instance of Server.



18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
# File 'lib/protobuf/nats/server.rb', line 18

def initialize(options)
  @options = options
  @processing_requests = true
  @running = true
  @stopped = false
  @pause_mutex = ::Mutex.new

  @nats = @options[:client] || ::Protobuf::Nats::NatsClient.new
  @nats.connect(::Protobuf::Nats.config.connection_options)

  @thread_pool = ::Protobuf::Nats::ThreadPool.new(threads, :max_queue => max_queue_size)

  @subscription_manager = ::Protobuf::Nats::SuperSubscriptionManager.new(@nats) do |request_data, reply_id, subject|
    unless enqueue_request(request_data, reply_id)
      logger.error { "Thread pool is full! Dropping message for subject: #{subject}" }
    end
  end
  @server = options.fetch(:server, ::Socket.gethostname)

  # In-flight handler tracking for observability. Long-running handlers are
  # allowed (and never aborted); we only measure/report. id => monotonic
  # start time; @overdue_flagged dedupes the per-handler overdue event.
  @inflight = ::Concurrent::Map.new
  @overdue_flagged = ::Concurrent::Map.new
  @request_seq = ::Concurrent::AtomicFixnum.new(0)
end

Instance Attribute Details

#natsObject (readonly)

Returns the value of attribute nats.



14
15
16
# File 'lib/protobuf/nats/server.rb', line 14

def nats
  @nats
end

#subscription_managerObject (readonly)

Returns the value of attribute subscription_manager.



14
15
16
# File 'lib/protobuf/nats/server.rb', line 14

def subscription_manager
  @subscription_manager
end

#thread_poolObject (readonly)

Returns the value of attribute thread_pool.



14
15
16
# File 'lib/protobuf/nats/server.rb', line 14

def thread_pool
  @thread_pool
end

Instance Method Details

#detect_and_handle_a_pauseObject



341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
# File 'lib/protobuf/nats/server.rb', line 341

def detect_and_handle_a_pause
  @pause_mutex.synchronize do
    case
    # If we are taking requests and detect a pause file, then unsubscribe.
    when @processing_requests && paused?
      @processing_requests = false
      logger.warn("Pausing server!")
      unsubscribe

    # If we were paused and the pause file is no longer present, then subscribe again.
    when !@processing_requests && !paused?
      logger.warn("Resuming server: resubscribing to all services and restarting slow start!")
      @processing_requests = true
      subscribe
    end
  end
end

#do_not_subscribe_to_includes?(subscription_key) ⇒ Boolean

Returns:

  • (Boolean)


261
262
263
264
265
266
267
268
# File 'lib/protobuf/nats/server.rb', line 261

def do_not_subscribe_to_includes?(subscription_key)
  return false unless ::Protobuf::Nats.config.server_subscription_key_do_not_subscribe_to_when_includes_any_of.respond_to?(:any?)
  return false if ::Protobuf::Nats.config.server_subscription_key_do_not_subscribe_to_when_includes_any_of.empty?

  ::Protobuf::Nats.config.server_subscription_key_do_not_subscribe_to_when_includes_any_of.any? do |key|
    subscription_key.include?(key)
  end
end

#enqueue_request(request_data, reply_id) ⇒ Object



160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
# File 'lib/protobuf/nats/server.rb', line 160

def enqueue_request(request_data, reply_id)
  ::Protobuf::Nats.instrument "server.message_received"

  enqueued_at = monotonic
  request_id = @request_seq.increment
  was_enqueued = thread_pool.push do
    # nil response_data is the "handler failed, don't publish a success
    # response" sentinel (a successful encode is always a non-nil String,
    # even when empty).
    response_data = nil
    begin
      # Instrument the thread pool time-to-execute duration.
      processed_at = monotonic
      ::Protobuf::Nats.instrument("server.thread_pool_execution_delay", (processed_at - enqueued_at) * MILLISECOND)

      # Track this handler as in-flight (long handlers are allowed; this is
      # only for observability -- we never abort it unless overdue-reclaim
      # is explicitly enabled). Store the worker thread so reclaim can
      # target it; the start time drives age/overdue accounting.
      @inflight[request_id] = [processed_at, ::Thread.current]

      # Process request. Only the handler is wrapped here so a transport
      # failure on the success-response publish (below) cannot fall into
      # this rescue and emit a *second* (error) publish for a request whose
      # handler actually succeeded.
      begin
        response_data = handle_request(request_data, 'server' => @server)
      rescue => error
        response_data = nil # ensure the success-publish below is skipped
        logger.debug { "rescued error => #{error}" }  if logger.debug?
        # Logs the real error server-side (via the default log_error
        # callback) so it isn't lost; the client gets only a generic message.
        ::Protobuf::Nats.notify_error_callbacks(error)

        # The client has already received our ACK and is now blocked waiting
        # for the response message. If we don't send one it will hang until
        # response_timeout (60s by default). Publish an encoded RPC error so
        # the client fails fast instead. Use a generic message rather than
        # error.message so internal handler details aren't leaked over the
        # wire. (If the failure was the connection itself, this publish will
        # also fail and is swallowed below.)
        begin
          error_response = ::Protobuf::Rpc::PbError.new("Internal server error")
          nats.publish(reply_id, error_response.encode)
        rescue => publish_error
          logger.error "Failed to publish error response for #{reply_id}: #{publish_error.message}"
        end
      end

      # Publish the successful response. Kept outside the handler rescue so a
      # publish failure here is logged rather than triggering a duplicate
      # (error) response for a request that already succeeded.
      if response_data
        logger.debug { "Publishing response to #{reply_id}" } if logger.debug?
        begin
          nats.publish(reply_id, response_data)
        rescue => publish_error
          logger.error "Failed to publish response for #{reply_id}: #{publish_error.message}"
          ::Protobuf::Nats.notify_error_callbacks(publish_error)
        end
      end
    ensure
      @inflight.delete(request_id)
      @overdue_flagged.delete(request_id)

      # Instrument the request duration.
      completed_at = monotonic
      ::Protobuf::Nats.instrument("server.request_duration", (completed_at - enqueued_at) * MILLISECOND)

      # Informational slow-handler marker (opt-in; default off).
      if processed_at && slow_handler_threshold_ms.positive?
        handler_ms = (completed_at - processed_at) * MILLISECOND
        if handler_ms >= slow_handler_threshold_ms
          logger.warn "Slow handler for #{reply_id}: #{handler_ms.round}ms"
          ::Protobuf::Nats.instrument("server.slow_handler", handler_ms)
        end
      end
    end
  end

  # Publish an ACK to signal the server has picked up the work.
  begin
    if was_enqueued
      logger.debug { "[reply_id=#{reply_id}] Sending ACK" } if logger.debug?
      nats.publish(reply_id, ::Protobuf::Nats::Messages::ACK)
    else # Drop message if the thread pool is full
      ::Protobuf::Nats.instrument "server.thread_pool_saturated"
      ::Protobuf::Nats.instrument "server.message_dropped"
      logger.debug { "[reply_id=#{reply_id}] Sending NACK" } if logger.debug?

      # Let the client know we are not processing the message.
      nats.publish(reply_id, ::Protobuf::Nats::Messages::NACK)
    end
  rescue => e
    logger.error "Failed to send ACK/NACK for #{reply_id}: #{e.message}"
    ::Protobuf::Nats.notify_error_callbacks(e)
  end

  was_enqueued
end

#finish_slow_startObject

Slow start subscriptions by adding X rounds of subz every Y seconds, where X is subscriptions_per_rpc_endpoint and Y is slow_start_delay.



316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
# File 'lib/protobuf/nats/server.rb', line 316

def finish_slow_start
  logger.info "Slow start has started..."
  completed = 1

  # We have (X - 1) here because we always subscribe at least once.
  (subscriptions_per_rpc_endpoint - 1).times do
    unless @running
      logger.info "Slow start interrupted (server stopping) after #{completed}/#{subscriptions_per_rpc_endpoint} rounds"
      return
    end

    if paused?
      logger.info "Slow start interrupted (server paused) after #{completed}/#{subscriptions_per_rpc_endpoint} rounds"
      return
    end

    completed += 1
    sleep slow_start_delay
    subscribe_to_services_once
    logger.info "Slow start adding another round of subscriptions (#{completed}/#{subscriptions_per_rpc_endpoint})..."
  end

  logger.info "Slow start finished successfully (#{completed}/#{subscriptions_per_rpc_endpoint} rounds completed)."
end

#handler_countObject



49
50
51
# File 'lib/protobuf/nats/server.rb', line 49

def handler_count
  subscription_manager.handler_count
end

#handler_overdue_msObject

A handler still running past this is “overdue”: the client has already given up (its response_timeout), so the work is orphaned and holding a pool slot for nothing. Defaults above the client’s 60s response_timeout so legitimate ≤60s operations are never flagged.



63
64
65
# File 'lib/protobuf/nats/server.rb', line 63

def handler_overdue_ms
  @handler_overdue_ms ||= ::ENV.fetch("PB_NATS_SERVER_HANDLER_OVERDUE_MS", 65_000).to_i
end

#instrument_inflight_handlersObject

Periodic in-flight handler health. Long handlers are normal, so inflight_oldest_age_ms can legitimately approach the client’s response_timeout; only overdue_handler_count (work the client has already abandoned) signals a problem.



101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# File 'lib/protobuf/nats/server.rb', line 101

def instrument_inflight_handlers
  now = monotonic
  overdue_ms = handler_overdue_ms
  count = 0
  oldest_age_ms = 0.0
  overdue = 0

  @inflight.each_pair do |id, entry|
    started_at, handler_thread = entry
    count += 1
    age_ms = (now - started_at) * MILLISECOND
    oldest_age_ms = age_ms if age_ms > oldest_age_ms
    next unless overdue_ms.positive? && age_ms >= overdue_ms

    overdue += 1

    # Optionally reclaim the slot by aborting the orphaned handler (opt-in;
    # see #reclaim_overdue_handlers?). Done before the dedupe below so the
    # reclaim is attempted even after the overdue event was already emitted.
    if reclaim_overdue_handlers? && handler_thread&.alive?
      logger.warn "Reclaiming overdue handler (age=#{age_ms.round}ms, client already gave up) to free its pool slot"
      handler_thread.raise(::Protobuf::Nats::Errors::HandlerOverdue, "handler exceeded #{overdue_ms}ms; reclaimed")
      ::Protobuf::Nats.instrument("server.handler_reclaimed", age_ms)
    end

    # Emit the per-handler overdue event once (the client has already
    # given up; this handler's result is orphaned).
    next if @overdue_flagged[id]
    @overdue_flagged[id] = true
    logger.warn "Handler exceeded #{overdue_ms}ms (client already gave up); in-flight age=#{age_ms.round}ms"
    ::Protobuf::Nats.instrument("server.handler_overdue", age_ms)
  end

  ::Protobuf::Nats.instrument("server.pending_intake_queue_size", subscription_manager.pending_queue_size)
  ::Protobuf::Nats.instrument("server.inflight_count", count)
  ::Protobuf::Nats.instrument("server.inflight_oldest_age_ms", oldest_age_ms)
  ::Protobuf::Nats.instrument("server.overdue_handler_count", overdue)
end

#instrument_thread_pool_sizesObject



91
92
93
94
95
# File 'lib/protobuf/nats/server.rb', line 91

def instrument_thread_pool_sizes
  ::Protobuf::Nats.instrument("server.thread_pool_enqueued_size", thread_pool.enqueued_size)
  ::Protobuf::Nats.instrument("server.thread_pool_max_size", thread_pool.max_size)
  ::Protobuf::Nats.instrument("server.thread_pool_running_size", thread_pool.size)
end

#max_queue_sizeObject



140
141
142
# File 'lib/protobuf/nats/server.rb', line 140

def max_queue_size
  ::ENV.fetch("PB_NATS_SERVER_MAX_QUEUE_SIZE", @options[:threads]).to_i
end

#monotonicObject



45
46
47
# File 'lib/protobuf/nats/server.rb', line 45

def monotonic
  ::Protobuf::Nats.monotonic_time
end

#only_subscribe_to_includes?(subscription_key) ⇒ Boolean

Returns:

  • (Boolean)


270
271
272
273
274
275
276
277
# File 'lib/protobuf/nats/server.rb', line 270

def only_subscribe_to_includes?(subscription_key)
  return true unless ::Protobuf::Nats.config.server_subscription_key_only_subscribe_to_when_includes_any_of.respond_to?(:any?)
  return true if ::Protobuf::Nats.config.server_subscription_key_only_subscribe_to_when_includes_any_of.empty?

  ::Protobuf::Nats.config.server_subscription_key_only_subscribe_to_when_includes_any_of.any? do |key|
    subscription_key.include?(key)
  end
end

#pause_file_pathObject



279
280
281
# File 'lib/protobuf/nats/server.rb', line 279

def pause_file_path
  ::ENV.fetch("PB_NATS_SERVER_PAUSE_FILE_PATH", nil)
end

#paused?Boolean

Returns:

  • (Boolean)


359
360
361
# File 'lib/protobuf/nats/server.rb', line 359

def paused?
  !pause_file_path.nil? && ::File.exist?(pause_file_path)
end


283
284
285
286
287
288
289
# File 'lib/protobuf/nats/server.rb', line 283

def print_subscription_keys
  logger.info "Creating subscriptions:"

  with_each_subscription_key do |subscription_key|
    logger.info "  - #{subscription_key}"
  end
end

#reclaim_overdue_handlers?Boolean

Whether to actively reclaim (abort) an overdue handler’s pool slot. OFF by default: the documented contract is that handlers are never aborted, since killing a thread mid-handler can corrupt state. Enable only when you would rather shed orphaned work (whose client already gave up) than let it pin a pool slot – e.g. when overdue handlers are saturating the pool and the server is NACKing healthy traffic. Reclaim raises Errors::HandlerOverdue into the worker, which the handler rescue turns into an RPC error response.

Returns:

  • (Boolean)


74
75
76
77
78
79
# File 'lib/protobuf/nats/server.rb', line 74

def reclaim_overdue_handlers?
  # Memoize the raw string (never falsey, so ||= is safe) and derive the
  # boolean per call -- avoids the nil-guard dance for a false-able memo.
  @reclaim_overdue_handlers ||= ::ENV.fetch("PB_NATS_SERVER_RECLAIM_OVERDUE_HANDLERS", "false")
  @reclaim_overdue_handlers == "true"
end

#runObject



363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
# File 'lib/protobuf/nats/server.rb', line 363

def run
  nats.on_reconnect do
    logger.warn "Server NATS connection was reconnected"
  end

  nats.on_disconnect do
    logger.warn "Server NATS connection was disconnected"
  end

  nats.on_error do |error|
    # Runs on nats-pure's read/flush thread -- offload so a slow callback
    # can't stall the server's intake.
    ::Protobuf::Nats.notify_error_callbacks_async(error)
  end

  nats.on_close do
    logger.warn "Server NATS connection was closed"
  end

  print_subscription_keys
  if paused?
    yield if block_given?
  else
    subscribe { yield if block_given? }
  end

  loop do
    break unless @running
    detect_and_handle_a_pause
    instrument_thread_pool_sizes
    instrument_inflight_handlers
    thread_pool.replenish # respawn workers killed by non-StandardError
    sleep 1
  end

  unsubscribe

  logger.info "Shutting down subscription manager..."
  begin
    Timeout.timeout(10) do
      subscription_manager.shutdown(5)
    end
  rescue Timeout::Error
    logger.error "Subscription manager shutdown timed out!"
  rescue => e
    logger.error "Error during subscription manager shutdown: #{e.message}"
  end

  # Give in-flight handlers time to finish. Long operations are allowed
  # (up to ~the client's response_timeout), so the drain timeout tracks
  # handler_overdue_ms rather than a fixed 60s -- otherwise a legitimate
  # ~60s handler would be killed and its client left waiting.
  drain_timeout = shutdown_drain_timeout
  logger.info "Waiting up to #{drain_timeout.round}s for the thread pool to finish shutting down..."
  thread_pool.shutdown
  unless thread_pool.wait_for_termination(drain_timeout)
    abandoned = @inflight.size
    logger.warn "Thread pool did not shut down cleanly within #{drain_timeout.round}s! Abandoned #{abandoned} in-flight handler(s)."
    ::Protobuf::Nats.instrument "server.thread_pool_shutdown_timeout"
    ::Protobuf::Nats.instrument "server.shutdown_abandoned_handlers", abandoned
  end
ensure
  @stopped = true

  begin
    logger.info "Closing NATS connection..."
    @nats.close if @nats
  rescue => e
    logger.warn "Failed to close NATS connection: #{e.message}"
  end
end

#running?Boolean

Returns:

  • (Boolean)


435
436
437
# File 'lib/protobuf/nats/server.rb', line 435

def running?
  !@stopped
end

#service_klassesObject



156
157
158
# File 'lib/protobuf/nats/server.rb', line 156

def service_klasses
  ::Protobuf::Rpc::Service.implemented_services.map(&:safe_constantize)
end

#shutdown_drain_timeoutObject

How long to let in-flight handlers finish on shutdown. Tracks the overdue window (plus grace) so a legitimate long handler isn’t killed mid-flight.



83
84
85
86
87
88
89
# File 'lib/protobuf/nats/server.rb', line 83

def shutdown_drain_timeout
  @shutdown_drain_timeout ||= if ::ENV.key?("PB_NATS_SERVER_SHUTDOWN_DRAIN_TIMEOUT")
    ::ENV["PB_NATS_SERVER_SHUTDOWN_DRAIN_TIMEOUT"].to_f
  else
    (handler_overdue_ms / 1000.0) + 5
  end
end

#slow_handler_threshold_msObject

Informational SLA marker for slow handlers. Default 0 (off) so normal long-running operations are not flagged.



55
56
57
# File 'lib/protobuf/nats/server.rb', line 55

def slow_handler_threshold_ms
  @slow_handler_threshold_ms ||= ::ENV.fetch("PB_NATS_SERVER_SLOW_HANDLER_THRESHOLD_MS", 0).to_i
end

#slow_start_delayObject



144
145
146
# File 'lib/protobuf/nats/server.rb', line 144

def slow_start_delay
  @slow_start_delay ||= ::ENV.fetch("PB_NATS_SERVER_SLOW_START_DELAY", 10).to_i
end

#stopObject



439
440
441
# File 'lib/protobuf/nats/server.rb', line 439

def stop
  @running = false
end

#subscribeObject



443
444
445
446
447
# File 'lib/protobuf/nats/server.rb', line 443

def subscribe
  subscribe_to_services_once
  yield if block_given?
  finish_slow_start
end

#subscribe_to_services_onceObject



291
292
293
294
295
# File 'lib/protobuf/nats/server.rb', line 291

def subscribe_to_services_once
  with_each_subscription_key do |subscription_key_and_queue|
    subscription_manager.queue_subscribe(subscription_key_and_queue)
  end
end

#subscriptions_per_rpc_endpointObject



148
149
150
# File 'lib/protobuf/nats/server.rb', line 148

def subscriptions_per_rpc_endpoint
  @subscriptions_per_rpc_endpoint ||= ::ENV.fetch("PB_NATS_SERVER_SUBSCRIPTIONS_PER_RPC_ENDPOINT", 10).to_i
end

#threadsObject



152
153
154
# File 'lib/protobuf/nats/server.rb', line 152

def threads
  @options[:threads] || 10 # Default to 10 if not provided, consistent with original behavior
end

#unsubscribeObject



449
450
451
452
# File 'lib/protobuf/nats/server.rb', line 449

def unsubscribe
  logger.info "Unsubscribing from rpc routes..."
  subscription_manager.unsubscribe_all
end

#with_each_subscription_keyObject



297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
# File 'lib/protobuf/nats/server.rb', line 297

def with_each_subscription_key
  fail ::ArgumentError unless block_given?

  service_klasses.each do |service_klass|
    service_klass.rpcs.each do |service_method, _|
      # Skip services that are not implemented.
      next unless service_klass.method_defined?(service_method)
      subscription_key = ::Protobuf::Nats.subscription_key(service_klass, service_method)
      next if do_not_subscribe_to_includes?(subscription_key)
      next unless only_subscribe_to_includes?(subscription_key)

      yield subscription_key
    end
  end
end