Class: Protobuf::Nats::Server

Inherits:
Object
  • Object
show all
Includes:
Logging, Rpc::Server
Defined in:
lib/protobuf/nats/server.rb

Constant Summary collapse

MILLISECOND =
1000

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(options) ⇒ Server

Returns a new instance of Server.



18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
# File 'lib/protobuf/nats/server.rb', line 18

def initialize(options)
  @options = options
  @processing_requests = true
  @running = true
  @stopped = false
  @pause_mutex = ::Mutex.new

  @nats = @options[:client] || ::Protobuf::Nats::NatsClient.new
  @nats.connect(::Protobuf::Nats.config.connection_options)

  @thread_pool = ::Protobuf::Nats::ThreadPool.new(threads, :max_queue => max_queue_size)

  @subscription_manager = ::Protobuf::Nats::SuperSubscriptionManager.new(@nats) do |request_data, reply_id, subject|
    unless enqueue_request(request_data, reply_id)
      logger.error { "Thread pool is full! Dropping message for subject: #{subject}" }
    end
  end
  @server = options.fetch(:server, ::Socket.gethostname)

  # In-flight handler tracking for observability. Long-running handlers are
  # allowed (and never aborted); we only measure/report. id => monotonic
  # start time; @overdue_flagged dedupes the per-handler overdue event.
  @inflight = ::Concurrent::Map.new
  @overdue_flagged = ::Concurrent::Map.new
  @request_seq = ::Concurrent::AtomicFixnum.new(0)
end

Instance Attribute Details

#natsObject (readonly)

Returns the value of attribute nats.



14
15
16
# File 'lib/protobuf/nats/server.rb', line 14

def nats
  @nats
end

#subscription_managerObject (readonly)

Returns the value of attribute subscription_manager.



14
15
16
# File 'lib/protobuf/nats/server.rb', line 14

def subscription_manager
  @subscription_manager
end

#thread_poolObject (readonly)

Returns the value of attribute thread_pool.



14
15
16
# File 'lib/protobuf/nats/server.rb', line 14

def thread_pool
  @thread_pool
end

Instance Method Details

#detect_and_handle_a_pauseObject



291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
# File 'lib/protobuf/nats/server.rb', line 291

def detect_and_handle_a_pause
  @pause_mutex.synchronize do
    case
    # If we are taking requests and detect a pause file, then unsubscribe.
    when @processing_requests && paused?
      @processing_requests = false
      logger.warn("Pausing server!")
      unsubscribe

    # If we were paused and the pause file is no longer present, then subscribe again.
    when !@processing_requests && !paused?
      logger.warn("Resuming server: resubscribing to all services and restarting slow start!")
      @processing_requests = true
      subscribe
    end
  end
end

#do_not_subscribe_to_includes?(subscription_key) ⇒ Boolean

Returns:

  • (Boolean)


211
212
213
214
215
216
217
218
# File 'lib/protobuf/nats/server.rb', line 211

def do_not_subscribe_to_includes?(subscription_key)
  return false unless ::Protobuf::Nats.config.server_subscription_key_do_not_subscribe_to_when_includes_any_of.respond_to?(:any?)
  return false if ::Protobuf::Nats.config.server_subscription_key_do_not_subscribe_to_when_includes_any_of.empty?

  ::Protobuf::Nats.config.server_subscription_key_do_not_subscribe_to_when_includes_any_of.any? do |key|
    subscription_key.include?(key)
  end
end

#enqueue_request(request_data, reply_id) ⇒ Object



135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
# File 'lib/protobuf/nats/server.rb', line 135

def enqueue_request(request_data, reply_id)
  ::Protobuf::Nats.instrument "server.message_received"

  enqueued_at = monotonic
  request_id = @request_seq.increment
  was_enqueued = thread_pool.push do
    begin
      # Instrument the thread pool time-to-execute duration.
      processed_at = monotonic
      ::Protobuf::Nats.instrument("server.thread_pool_execution_delay", (processed_at - enqueued_at) * MILLISECOND)

      # Track this handler as in-flight (long handlers are allowed; this is
      # only for observability -- we never abort it).
      @inflight[request_id] = processed_at

      # Process request.
      response_data = handle_request(request_data, 'server' => @server)

      # Publish response.
      logger.debug { "Publishing response to #{reply_id}" } if logger.debug?
      nats.publish(reply_id, response_data)
    rescue => error
      logger.debug { "rescued error => #{error}" }  if logger.debug?
      ::Protobuf::Nats.notify_error_callbacks(error)

      # The client has already received our ACK and is now blocked waiting
      # for the response message. If we don't send one it will hang until
      # response_timeout (60s by default). Publish an encoded RPC error so
      # the client fails fast instead. (If the failure was the connection
      # itself, this publish will also fail and is swallowed below.)
      begin
        error_response = ::Protobuf::Rpc::PbError.new(error.message)
        nats.publish(reply_id, error_response.encode)
      rescue => publish_error
        logger.error "Failed to publish error response for #{reply_id}: #{publish_error.message}"
      end
    ensure
      @inflight.delete(request_id)
      @overdue_flagged.delete(request_id)

      # Instrument the request duration.
      completed_at = monotonic
      ::Protobuf::Nats.instrument("server.request_duration", (completed_at - enqueued_at) * MILLISECOND)

      # Informational slow-handler marker (opt-in; default off).
      if processed_at && slow_handler_threshold_ms.positive?
        handler_ms = (completed_at - processed_at) * MILLISECOND
        if handler_ms >= slow_handler_threshold_ms
          logger.warn "Slow handler for #{reply_id}: #{handler_ms.round}ms"
          ::Protobuf::Nats.instrument("server.slow_handler", handler_ms)
        end
      end
    end
  end

  # Publish an ACK to signal the server has picked up the work.
  begin
    if was_enqueued
      logger.debug { "[reply_id=#{reply_id}] Sending ACK" } if logger.debug?
      nats.publish(reply_id, ::Protobuf::Nats::Messages::ACK)
    else # Drop message if the thread pool is full
      ::Protobuf::Nats.instrument "server.thread_pool_saturated"
      ::Protobuf::Nats.instrument "server.message_dropped"
      logger.debug { "[reply_id=#{reply_id}] Sending NACK" } if logger.debug?

      # Let the client know we are not processing the message.
      nats.publish(reply_id, ::Protobuf::Nats::Messages::NACK)
    end
  rescue => e
    logger.error "Failed to send ACK/NACK for #{reply_id}: #{e.message}"
    ::Protobuf::Nats.notify_error_callbacks(e)
  end

  was_enqueued
end

#finish_slow_startObject

Slow start subscriptions by adding X rounds of subz every Y seconds, where X is subscriptions_per_rpc_endpoint and Y is slow_start_delay.



266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
# File 'lib/protobuf/nats/server.rb', line 266

def finish_slow_start
  logger.info "Slow start has started..."
  completed = 1

  # We have (X - 1) here because we always subscribe at least once.
  (subscriptions_per_rpc_endpoint - 1).times do
    unless @running
      logger.info "Slow start interrupted (server stopping) after #{completed}/#{subscriptions_per_rpc_endpoint} rounds"
      return
    end

    if paused?
      logger.info "Slow start interrupted (server paused) after #{completed}/#{subscriptions_per_rpc_endpoint} rounds"
      return
    end

    completed += 1
    sleep slow_start_delay
    subscribe_to_services_once
    logger.info "Slow start adding another round of subscriptions (#{completed}/#{subscriptions_per_rpc_endpoint})..."
  end

  logger.info "Slow start finished successfully (#{completed}/#{subscriptions_per_rpc_endpoint} rounds completed)."
end

#handler_countObject



49
50
51
# File 'lib/protobuf/nats/server.rb', line 49

def handler_count
  subscription_manager.handler_count
end

#handler_overdue_msObject

A handler still running past this is “overdue”: the client has already given up (its response_timeout), so the work is orphaned and holding a pool slot for nothing. Defaults above the client’s 60s response_timeout so legitimate ≤60s operations are never flagged.



63
64
65
# File 'lib/protobuf/nats/server.rb', line 63

def handler_overdue_ms
  @handler_overdue_ms ||= ::ENV.fetch("PB_NATS_SERVER_HANDLER_OVERDUE_MS", 65_000).to_i
end

#instrument_inflight_handlersObject

Periodic in-flight handler health. Long handlers are normal, so inflight_oldest_age_ms can legitimately approach the client’s response_timeout; only overdue_handler_count (work the client has already abandoned) signals a problem.



87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# File 'lib/protobuf/nats/server.rb', line 87

def instrument_inflight_handlers
  now = monotonic
  overdue_ms = handler_overdue_ms
  count = 0
  oldest_age_ms = 0.0
  overdue = 0

  @inflight.each_pair do |id, started_at|
    count += 1
    age_ms = (now - started_at) * MILLISECOND
    oldest_age_ms = age_ms if age_ms > oldest_age_ms
    next unless overdue_ms.positive? && age_ms >= overdue_ms

    overdue += 1
    # Emit the per-handler overdue event once (the client has already
    # given up; this handler's result is orphaned).
    next if @overdue_flagged[id]
    @overdue_flagged[id] = true
    logger.warn "Handler exceeded #{overdue_ms}ms (client already gave up); in-flight age=#{age_ms.round}ms"
    ::Protobuf::Nats.instrument("server.handler_overdue", age_ms)
  end

  ::Protobuf::Nats.instrument("server.pending_intake_queue_size", subscription_manager.pending_queue_size)
  ::Protobuf::Nats.instrument("server.inflight_count", count)
  ::Protobuf::Nats.instrument("server.inflight_oldest_age_ms", oldest_age_ms)
  ::Protobuf::Nats.instrument("server.overdue_handler_count", overdue)
end

#instrument_thread_pool_sizesObject



77
78
79
80
81
# File 'lib/protobuf/nats/server.rb', line 77

def instrument_thread_pool_sizes
  ::Protobuf::Nats.instrument("server.thread_pool_enqueued_size", thread_pool.enqueued_size)
  ::Protobuf::Nats.instrument("server.thread_pool_max_size", thread_pool.max_size)
  ::Protobuf::Nats.instrument("server.thread_pool_running_size", thread_pool.size)
end

#max_queue_sizeObject



115
116
117
# File 'lib/protobuf/nats/server.rb', line 115

def max_queue_size
  ::ENV.fetch("PB_NATS_SERVER_MAX_QUEUE_SIZE", @options[:threads]).to_i
end

#monotonicObject



45
46
47
# File 'lib/protobuf/nats/server.rb', line 45

def monotonic
  ::Protobuf::Nats.monotonic_time
end

#only_subscribe_to_includes?(subscription_key) ⇒ Boolean

Returns:

  • (Boolean)


220
221
222
223
224
225
226
227
# File 'lib/protobuf/nats/server.rb', line 220

def only_subscribe_to_includes?(subscription_key)
  return true unless ::Protobuf::Nats.config.server_subscription_key_only_subscribe_to_when_includes_any_of.respond_to?(:any?)
  return true if ::Protobuf::Nats.config.server_subscription_key_only_subscribe_to_when_includes_any_of.empty?

  ::Protobuf::Nats.config.server_subscription_key_only_subscribe_to_when_includes_any_of.any? do |key|
    subscription_key.include?(key)
  end
end

#pause_file_pathObject



229
230
231
# File 'lib/protobuf/nats/server.rb', line 229

def pause_file_path
  ::ENV.fetch("PB_NATS_SERVER_PAUSE_FILE_PATH", nil)
end

#paused?Boolean

Returns:

  • (Boolean)


309
310
311
# File 'lib/protobuf/nats/server.rb', line 309

def paused?
  !pause_file_path.nil? && ::File.exist?(pause_file_path)
end


233
234
235
236
237
238
239
# File 'lib/protobuf/nats/server.rb', line 233

def print_subscription_keys
  logger.info "Creating subscriptions:"

  with_each_subscription_key do |subscription_key|
    logger.info "  - #{subscription_key}"
  end
end

#runObject



313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
# File 'lib/protobuf/nats/server.rb', line 313

def run
  nats.on_reconnect do
    logger.warn "Server NATS connection was reconnected"
  end

  nats.on_disconnect do
    logger.warn "Server NATS connection was disconnected"
  end

  nats.on_error do |error|
    # Runs on nats-pure's read/flush thread -- offload so a slow callback
    # can't stall the server's intake.
    ::Protobuf::Nats.notify_error_callbacks_async(error)
  end

  nats.on_close do
    logger.warn "Server NATS connection was closed"
  end

  print_subscription_keys
  if paused?
    yield if block_given?
  else
    subscribe { yield if block_given? }
  end

  loop do
    break unless @running
    detect_and_handle_a_pause
    instrument_thread_pool_sizes
    instrument_inflight_handlers
    thread_pool.replenish # respawn workers killed by non-StandardError
    sleep 1
  end

  unsubscribe

  logger.info "Shutting down subscription manager..."
  begin
    Timeout.timeout(10) do
      subscription_manager.shutdown(5)
    end
  rescue Timeout::Error
    logger.error "Subscription manager shutdown timed out!"
  rescue => e
    logger.error "Error during subscription manager shutdown: #{e.message}"
  end

  # Give in-flight handlers time to finish. Long operations are allowed
  # (up to ~the client's response_timeout), so the drain timeout tracks
  # handler_overdue_ms rather than a fixed 60s -- otherwise a legitimate
  # ~60s handler would be killed and its client left waiting.
  drain_timeout = shutdown_drain_timeout
  logger.info "Waiting up to #{drain_timeout.round}s for the thread pool to finish shutting down..."
  thread_pool.shutdown
  unless thread_pool.wait_for_termination(drain_timeout)
    abandoned = @inflight.size
    logger.warn "Thread pool did not shut down cleanly within #{drain_timeout.round}s! Abandoned #{abandoned} in-flight handler(s)."
    ::Protobuf::Nats.instrument "server.thread_pool_shutdown_timeout"
    ::Protobuf::Nats.instrument "server.shutdown_abandoned_handlers", abandoned
  end
ensure
  @stopped = true

  begin
    logger.info "Closing NATS connection..."
    @nats.close if @nats
  rescue => e
    logger.warn "Failed to close NATS connection: #{e.message}"
  end
end

#running?Boolean

Returns:

  • (Boolean)


385
386
387
# File 'lib/protobuf/nats/server.rb', line 385

def running?
  !@stopped
end

#service_klassesObject



131
132
133
# File 'lib/protobuf/nats/server.rb', line 131

def service_klasses
  ::Protobuf::Rpc::Service.implemented_services.map(&:safe_constantize)
end

#shutdown_drain_timeoutObject

How long to let in-flight handlers finish on shutdown. Tracks the overdue window (plus grace) so a legitimate long handler isn’t killed mid-flight.



69
70
71
72
73
74
75
# File 'lib/protobuf/nats/server.rb', line 69

def shutdown_drain_timeout
  @shutdown_drain_timeout ||= if ::ENV.key?("PB_NATS_SERVER_SHUTDOWN_DRAIN_TIMEOUT")
    ::ENV["PB_NATS_SERVER_SHUTDOWN_DRAIN_TIMEOUT"].to_f
  else
    (handler_overdue_ms / 1000.0) + 5
  end
end

#slow_handler_threshold_msObject

Informational SLA marker for slow handlers. Default 0 (off) so normal long-running operations are not flagged.



55
56
57
# File 'lib/protobuf/nats/server.rb', line 55

def slow_handler_threshold_ms
  @slow_handler_threshold_ms ||= ::ENV.fetch("PB_NATS_SERVER_SLOW_HANDLER_THRESHOLD_MS", 0).to_i
end

#slow_start_delayObject



119
120
121
# File 'lib/protobuf/nats/server.rb', line 119

def slow_start_delay
  @slow_start_delay ||= ::ENV.fetch("PB_NATS_SERVER_SLOW_START_DELAY", 10).to_i
end

#stopObject



389
390
391
# File 'lib/protobuf/nats/server.rb', line 389

def stop
  @running = false
end

#subscribeObject



393
394
395
396
397
# File 'lib/protobuf/nats/server.rb', line 393

def subscribe
  subscribe_to_services_once
  yield if block_given?
  finish_slow_start
end

#subscribe_to_services_onceObject



241
242
243
244
245
# File 'lib/protobuf/nats/server.rb', line 241

def subscribe_to_services_once
  with_each_subscription_key do |subscription_key_and_queue|
    subscription_manager.queue_subscribe(subscription_key_and_queue)
  end
end

#subscriptions_per_rpc_endpointObject



123
124
125
# File 'lib/protobuf/nats/server.rb', line 123

def subscriptions_per_rpc_endpoint
  @subscriptions_per_rpc_endpoint ||= ::ENV.fetch("PB_NATS_SERVER_SUBSCRIPTIONS_PER_RPC_ENDPOINT", 10).to_i
end

#threadsObject



127
128
129
# File 'lib/protobuf/nats/server.rb', line 127

def threads
  @options[:threads] || 10 # Default to 10 if not provided, consistent with original behavior
end

#unsubscribeObject



399
400
401
402
# File 'lib/protobuf/nats/server.rb', line 399

def unsubscribe
  logger.info "Unsubscribing from rpc routes..."
  subscription_manager.unsubscribe_all
end

#with_each_subscription_keyObject



247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
# File 'lib/protobuf/nats/server.rb', line 247

def with_each_subscription_key
  fail ::ArgumentError unless block_given?

  service_klasses.each do |service_klass|
    service_klass.rpcs.each do |service_method, _|
      # Skip services that are not implemented.
      next unless service_klass.method_defined?(service_method)
      subscription_key = ::Protobuf::Nats.subscription_key(service_klass, service_method)
      next if do_not_subscribe_to_includes?(subscription_key)
      next unless only_subscribe_to_includes?(subscription_key)

      yield subscription_key
    end
  end
end