Class: Protobuf::Nats::Server
- Inherits:
-
Object
- Object
- Protobuf::Nats::Server
- Includes:
- Logging, Rpc::Server
- Defined in:
- lib/protobuf/nats/server.rb
Constant Summary collapse
- MILLISECOND =
1000
Instance Attribute Summary collapse
-
#nats ⇒ Object
readonly
Returns the value of attribute nats.
-
#subscription_manager ⇒ Object
readonly
Returns the value of attribute subscription_manager.
-
#thread_pool ⇒ Object
readonly
Returns the value of attribute thread_pool.
Instance Method Summary collapse
- #detect_and_handle_a_pause ⇒ Object
- #do_not_subscribe_to_includes?(subscription_key) ⇒ Boolean
- #enqueue_request(request_data, reply_id) ⇒ Object
-
#finish_slow_start ⇒ Object
Slow start subscriptions by adding X rounds of subz every Y seconds, where X is subscriptions_per_rpc_endpoint and Y is slow_start_delay.
- #handler_count ⇒ Object
-
#handler_overdue_ms ⇒ Object
A handler still running past this is “overdue”: the client has already given up (its response_timeout), so the work is orphaned and holding a pool slot for nothing.
-
#initialize(options) ⇒ Server
constructor
A new instance of Server.
-
#instrument_inflight_handlers ⇒ Object
Periodic in-flight handler health.
- #instrument_thread_pool_sizes ⇒ Object
- #max_queue_size ⇒ Object
- #monotonic ⇒ Object
- #only_subscribe_to_includes?(subscription_key) ⇒ Boolean
- #pause_file_path ⇒ Object
- #paused? ⇒ Boolean
- #print_subscription_keys ⇒ Object
-
#reclaim_overdue_handlers? ⇒ Boolean
Whether to actively reclaim (abort) an overdue handler’s pool slot.
- #run ⇒ Object
- #running? ⇒ Boolean
- #service_klasses ⇒ Object
-
#shutdown_drain_timeout ⇒ Object
How long to let in-flight handlers finish on shutdown.
-
#slow_handler_threshold_ms ⇒ Object
Informational SLA marker for slow handlers.
- #slow_start_delay ⇒ Object
- #stop ⇒ Object
- #subscribe ⇒ Object
- #subscribe_to_services_once ⇒ Object
- #subscriptions_per_rpc_endpoint ⇒ Object
- #threads ⇒ Object
- #unsubscribe ⇒ Object
- #with_each_subscription_key ⇒ Object
Constructor Details
#initialize(options) ⇒ Server
Returns a new instance of Server.
18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 |
# File 'lib/protobuf/nats/server.rb', line 18 def initialize() @options = @processing_requests = true @running = true @stopped = false @pause_mutex = ::Mutex.new @nats = @options[:client] || ::Protobuf::Nats::NatsClient.new @nats.connect(::Protobuf::Nats.config.) @thread_pool = ::Protobuf::Nats::ThreadPool.new(threads, :max_queue => max_queue_size) @subscription_manager = ::Protobuf::Nats::SuperSubscriptionManager.new(@nats) do |request_data, reply_id, subject| unless enqueue_request(request_data, reply_id) logger.error { "Thread pool is full! Dropping message for subject: #{subject}" } end end @server = .fetch(:server, ::Socket.gethostname) # In-flight handler tracking for observability. Long-running handlers are # allowed (and never aborted); we only measure/report. id => monotonic # start time; @overdue_flagged dedupes the per-handler overdue event. @inflight = ::Concurrent::Map.new @overdue_flagged = ::Concurrent::Map.new @request_seq = ::Concurrent::AtomicFixnum.new(0) end |
Instance Attribute Details
#nats ⇒ Object (readonly)
Returns the value of attribute nats.
14 15 16 |
# File 'lib/protobuf/nats/server.rb', line 14 def nats @nats end |
#subscription_manager ⇒ Object (readonly)
Returns the value of attribute subscription_manager.
14 15 16 |
# File 'lib/protobuf/nats/server.rb', line 14 def subscription_manager @subscription_manager end |
#thread_pool ⇒ Object (readonly)
Returns the value of attribute thread_pool.
14 15 16 |
# File 'lib/protobuf/nats/server.rb', line 14 def thread_pool @thread_pool end |
Instance Method Details
#detect_and_handle_a_pause ⇒ Object
341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 |
# File 'lib/protobuf/nats/server.rb', line 341 def detect_and_handle_a_pause @pause_mutex.synchronize do case # If we are taking requests and detect a pause file, then unsubscribe. when @processing_requests && paused? @processing_requests = false logger.warn("Pausing server!") unsubscribe # If we were paused and the pause file is no longer present, then subscribe again. when !@processing_requests && !paused? logger.warn("Resuming server: resubscribing to all services and restarting slow start!") @processing_requests = true subscribe end end end |
#do_not_subscribe_to_includes?(subscription_key) ⇒ Boolean
261 262 263 264 265 266 267 268 |
# File 'lib/protobuf/nats/server.rb', line 261 def do_not_subscribe_to_includes?(subscription_key) return false unless ::Protobuf::Nats.config.server_subscription_key_do_not_subscribe_to_when_includes_any_of.respond_to?(:any?) return false if ::Protobuf::Nats.config.server_subscription_key_do_not_subscribe_to_when_includes_any_of.empty? ::Protobuf::Nats.config.server_subscription_key_do_not_subscribe_to_when_includes_any_of.any? do |key| subscription_key.include?(key) end end |
#enqueue_request(request_data, reply_id) ⇒ Object
160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 |
# File 'lib/protobuf/nats/server.rb', line 160 def enqueue_request(request_data, reply_id) ::Protobuf::Nats.instrument "server.message_received" enqueued_at = monotonic request_id = @request_seq.increment was_enqueued = thread_pool.push do # nil response_data is the "handler failed, don't publish a success # response" sentinel (a successful encode is always a non-nil String, # even when empty). response_data = nil begin # Instrument the thread pool time-to-execute duration. processed_at = monotonic ::Protobuf::Nats.instrument("server.thread_pool_execution_delay", (processed_at - enqueued_at) * MILLISECOND) # Track this handler as in-flight (long handlers are allowed; this is # only for observability -- we never abort it unless overdue-reclaim # is explicitly enabled). Store the worker thread so reclaim can # target it; the start time drives age/overdue accounting. @inflight[request_id] = [processed_at, ::Thread.current] # Process request. Only the handler is wrapped here so a transport # failure on the success-response publish (below) cannot fall into # this rescue and emit a *second* (error) publish for a request whose # handler actually succeeded. begin response_data = handle_request(request_data, 'server' => @server) rescue => error response_data = nil # ensure the success-publish below is skipped logger.debug { "rescued error => #{error}" } if logger.debug? # Logs the real error server-side (via the default log_error # callback) so it isn't lost; the client gets only a generic message. ::Protobuf::Nats.notify_error_callbacks(error) # The client has already received our ACK and is now blocked waiting # for the response message. If we don't send one it will hang until # response_timeout (60s by default). Publish an encoded RPC error so # the client fails fast instead. Use a generic message rather than # error.message so internal handler details aren't leaked over the # wire. (If the failure was the connection itself, this publish will # also fail and is swallowed below.) begin error_response = ::Protobuf::Rpc::PbError.new("Internal server error") nats.publish(reply_id, error_response.encode) rescue => publish_error logger.error "Failed to publish error response for #{reply_id}: #{publish_error.}" end end # Publish the successful response. Kept outside the handler rescue so a # publish failure here is logged rather than triggering a duplicate # (error) response for a request that already succeeded. if response_data logger.debug { "Publishing response to #{reply_id}" } if logger.debug? begin nats.publish(reply_id, response_data) rescue => publish_error logger.error "Failed to publish response for #{reply_id}: #{publish_error.}" ::Protobuf::Nats.notify_error_callbacks(publish_error) end end ensure @inflight.delete(request_id) @overdue_flagged.delete(request_id) # Instrument the request duration. completed_at = monotonic ::Protobuf::Nats.instrument("server.request_duration", (completed_at - enqueued_at) * MILLISECOND) # Informational slow-handler marker (opt-in; default off). if processed_at && slow_handler_threshold_ms.positive? handler_ms = (completed_at - processed_at) * MILLISECOND if handler_ms >= slow_handler_threshold_ms logger.warn "Slow handler for #{reply_id}: #{handler_ms.round}ms" ::Protobuf::Nats.instrument("server.slow_handler", handler_ms) end end end end # Publish an ACK to signal the server has picked up the work. begin if was_enqueued logger.debug { "[reply_id=#{reply_id}] Sending ACK" } if logger.debug? nats.publish(reply_id, ::Protobuf::Nats::Messages::ACK) else # Drop message if the thread pool is full ::Protobuf::Nats.instrument "server.thread_pool_saturated" ::Protobuf::Nats.instrument "server.message_dropped" logger.debug { "[reply_id=#{reply_id}] Sending NACK" } if logger.debug? # Let the client know we are not processing the message. nats.publish(reply_id, ::Protobuf::Nats::Messages::NACK) end rescue => e logger.error "Failed to send ACK/NACK for #{reply_id}: #{e.}" ::Protobuf::Nats.notify_error_callbacks(e) end was_enqueued end |
#finish_slow_start ⇒ Object
Slow start subscriptions by adding X rounds of subz every Y seconds, where X is subscriptions_per_rpc_endpoint and Y is slow_start_delay.
316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 |
# File 'lib/protobuf/nats/server.rb', line 316 def finish_slow_start logger.info "Slow start has started..." completed = 1 # We have (X - 1) here because we always subscribe at least once. (subscriptions_per_rpc_endpoint - 1).times do unless @running logger.info "Slow start interrupted (server stopping) after #{completed}/#{subscriptions_per_rpc_endpoint} rounds" return end if paused? logger.info "Slow start interrupted (server paused) after #{completed}/#{subscriptions_per_rpc_endpoint} rounds" return end completed += 1 sleep slow_start_delay subscribe_to_services_once logger.info "Slow start adding another round of subscriptions (#{completed}/#{subscriptions_per_rpc_endpoint})..." end logger.info "Slow start finished successfully (#{completed}/#{subscriptions_per_rpc_endpoint} rounds completed)." end |
#handler_count ⇒ Object
49 50 51 |
# File 'lib/protobuf/nats/server.rb', line 49 def handler_count subscription_manager.handler_count end |
#handler_overdue_ms ⇒ Object
A handler still running past this is “overdue”: the client has already given up (its response_timeout), so the work is orphaned and holding a pool slot for nothing. Defaults above the client’s 60s response_timeout so legitimate ≤60s operations are never flagged.
63 64 65 |
# File 'lib/protobuf/nats/server.rb', line 63 def handler_overdue_ms @handler_overdue_ms ||= ::ENV.fetch("PB_NATS_SERVER_HANDLER_OVERDUE_MS", 65_000).to_i end |
#instrument_inflight_handlers ⇒ Object
Periodic in-flight handler health. Long handlers are normal, so inflight_oldest_age_ms can legitimately approach the client’s response_timeout; only overdue_handler_count (work the client has already abandoned) signals a problem.
101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 |
# File 'lib/protobuf/nats/server.rb', line 101 def instrument_inflight_handlers now = monotonic overdue_ms = handler_overdue_ms count = 0 oldest_age_ms = 0.0 overdue = 0 @inflight.each_pair do |id, entry| started_at, handler_thread = entry count += 1 age_ms = (now - started_at) * MILLISECOND oldest_age_ms = age_ms if age_ms > oldest_age_ms next unless overdue_ms.positive? && age_ms >= overdue_ms overdue += 1 # Optionally reclaim the slot by aborting the orphaned handler (opt-in; # see #reclaim_overdue_handlers?). Done before the dedupe below so the # reclaim is attempted even after the overdue event was already emitted. if reclaim_overdue_handlers? && handler_thread&.alive? logger.warn "Reclaiming overdue handler (age=#{age_ms.round}ms, client already gave up) to free its pool slot" handler_thread.raise(::Protobuf::Nats::Errors::HandlerOverdue, "handler exceeded #{overdue_ms}ms; reclaimed") ::Protobuf::Nats.instrument("server.handler_reclaimed", age_ms) end # Emit the per-handler overdue event once (the client has already # given up; this handler's result is orphaned). next if @overdue_flagged[id] @overdue_flagged[id] = true logger.warn "Handler exceeded #{overdue_ms}ms (client already gave up); in-flight age=#{age_ms.round}ms" ::Protobuf::Nats.instrument("server.handler_overdue", age_ms) end ::Protobuf::Nats.instrument("server.pending_intake_queue_size", subscription_manager.pending_queue_size) ::Protobuf::Nats.instrument("server.inflight_count", count) ::Protobuf::Nats.instrument("server.inflight_oldest_age_ms", oldest_age_ms) ::Protobuf::Nats.instrument("server.overdue_handler_count", overdue) end |
#instrument_thread_pool_sizes ⇒ Object
91 92 93 94 95 |
# File 'lib/protobuf/nats/server.rb', line 91 def instrument_thread_pool_sizes ::Protobuf::Nats.instrument("server.thread_pool_enqueued_size", thread_pool.enqueued_size) ::Protobuf::Nats.instrument("server.thread_pool_max_size", thread_pool.max_size) ::Protobuf::Nats.instrument("server.thread_pool_running_size", thread_pool.size) end |
#max_queue_size ⇒ Object
140 141 142 |
# File 'lib/protobuf/nats/server.rb', line 140 def max_queue_size ::ENV.fetch("PB_NATS_SERVER_MAX_QUEUE_SIZE", @options[:threads]).to_i end |
#monotonic ⇒ Object
45 46 47 |
# File 'lib/protobuf/nats/server.rb', line 45 def monotonic ::Protobuf::Nats.monotonic_time end |
#only_subscribe_to_includes?(subscription_key) ⇒ Boolean
270 271 272 273 274 275 276 277 |
# File 'lib/protobuf/nats/server.rb', line 270 def only_subscribe_to_includes?(subscription_key) return true unless ::Protobuf::Nats.config.server_subscription_key_only_subscribe_to_when_includes_any_of.respond_to?(:any?) return true if ::Protobuf::Nats.config.server_subscription_key_only_subscribe_to_when_includes_any_of.empty? ::Protobuf::Nats.config.server_subscription_key_only_subscribe_to_when_includes_any_of.any? do |key| subscription_key.include?(key) end end |
#pause_file_path ⇒ Object
279 280 281 |
# File 'lib/protobuf/nats/server.rb', line 279 def pause_file_path ::ENV.fetch("PB_NATS_SERVER_PAUSE_FILE_PATH", nil) end |
#paused? ⇒ Boolean
359 360 361 |
# File 'lib/protobuf/nats/server.rb', line 359 def paused? !pause_file_path.nil? && ::File.exist?(pause_file_path) end |
#print_subscription_keys ⇒ Object
283 284 285 286 287 288 289 |
# File 'lib/protobuf/nats/server.rb', line 283 def print_subscription_keys logger.info "Creating subscriptions:" with_each_subscription_key do |subscription_key| logger.info " - #{subscription_key}" end end |
#reclaim_overdue_handlers? ⇒ Boolean
Whether to actively reclaim (abort) an overdue handler’s pool slot. OFF by default: the documented contract is that handlers are never aborted, since killing a thread mid-handler can corrupt state. Enable only when you would rather shed orphaned work (whose client already gave up) than let it pin a pool slot – e.g. when overdue handlers are saturating the pool and the server is NACKing healthy traffic. Reclaim raises Errors::HandlerOverdue into the worker, which the handler rescue turns into an RPC error response.
74 75 76 77 78 79 |
# File 'lib/protobuf/nats/server.rb', line 74 def reclaim_overdue_handlers? # Memoize the raw string (never falsey, so ||= is safe) and derive the # boolean per call -- avoids the nil-guard dance for a false-able memo. @reclaim_overdue_handlers ||= ::ENV.fetch("PB_NATS_SERVER_RECLAIM_OVERDUE_HANDLERS", "false") @reclaim_overdue_handlers == "true" end |
#run ⇒ Object
363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 |
# File 'lib/protobuf/nats/server.rb', line 363 def run nats.on_reconnect do logger.warn "Server NATS connection was reconnected" end nats.on_disconnect do logger.warn "Server NATS connection was disconnected" end nats.on_error do |error| # Runs on nats-pure's read/flush thread -- offload so a slow callback # can't stall the server's intake. ::Protobuf::Nats.notify_error_callbacks_async(error) end nats.on_close do logger.warn "Server NATS connection was closed" end print_subscription_keys if paused? yield if block_given? else subscribe { yield if block_given? } end loop do break unless @running detect_and_handle_a_pause instrument_thread_pool_sizes instrument_inflight_handlers thread_pool.replenish # respawn workers killed by non-StandardError sleep 1 end unsubscribe logger.info "Shutting down subscription manager..." begin Timeout.timeout(10) do subscription_manager.shutdown(5) end rescue Timeout::Error logger.error "Subscription manager shutdown timed out!" rescue => e logger.error "Error during subscription manager shutdown: #{e.}" end # Give in-flight handlers time to finish. Long operations are allowed # (up to ~the client's response_timeout), so the drain timeout tracks # handler_overdue_ms rather than a fixed 60s -- otherwise a legitimate # ~60s handler would be killed and its client left waiting. drain_timeout = shutdown_drain_timeout logger.info "Waiting up to #{drain_timeout.round}s for the thread pool to finish shutting down..." thread_pool.shutdown unless thread_pool.wait_for_termination(drain_timeout) abandoned = @inflight.size logger.warn "Thread pool did not shut down cleanly within #{drain_timeout.round}s! Abandoned #{abandoned} in-flight handler(s)." ::Protobuf::Nats.instrument "server.thread_pool_shutdown_timeout" ::Protobuf::Nats.instrument "server.shutdown_abandoned_handlers", abandoned end ensure @stopped = true begin logger.info "Closing NATS connection..." @nats.close if @nats rescue => e logger.warn "Failed to close NATS connection: #{e.}" end end |
#running? ⇒ Boolean
435 436 437 |
# File 'lib/protobuf/nats/server.rb', line 435 def running? !@stopped end |
#service_klasses ⇒ Object
156 157 158 |
# File 'lib/protobuf/nats/server.rb', line 156 def service_klasses ::Protobuf::Rpc::Service.implemented_services.map(&:safe_constantize) end |
#shutdown_drain_timeout ⇒ Object
How long to let in-flight handlers finish on shutdown. Tracks the overdue window (plus grace) so a legitimate long handler isn’t killed mid-flight.
83 84 85 86 87 88 89 |
# File 'lib/protobuf/nats/server.rb', line 83 def shutdown_drain_timeout @shutdown_drain_timeout ||= if ::ENV.key?("PB_NATS_SERVER_SHUTDOWN_DRAIN_TIMEOUT") ::ENV["PB_NATS_SERVER_SHUTDOWN_DRAIN_TIMEOUT"].to_f else (handler_overdue_ms / 1000.0) + 5 end end |
#slow_handler_threshold_ms ⇒ Object
Informational SLA marker for slow handlers. Default 0 (off) so normal long-running operations are not flagged.
55 56 57 |
# File 'lib/protobuf/nats/server.rb', line 55 def slow_handler_threshold_ms @slow_handler_threshold_ms ||= ::ENV.fetch("PB_NATS_SERVER_SLOW_HANDLER_THRESHOLD_MS", 0).to_i end |
#slow_start_delay ⇒ Object
144 145 146 |
# File 'lib/protobuf/nats/server.rb', line 144 def slow_start_delay @slow_start_delay ||= ::ENV.fetch("PB_NATS_SERVER_SLOW_START_DELAY", 10).to_i end |
#stop ⇒ Object
439 440 441 |
# File 'lib/protobuf/nats/server.rb', line 439 def stop @running = false end |
#subscribe ⇒ Object
443 444 445 446 447 |
# File 'lib/protobuf/nats/server.rb', line 443 def subscribe subscribe_to_services_once yield if block_given? finish_slow_start end |
#subscribe_to_services_once ⇒ Object
291 292 293 294 295 |
# File 'lib/protobuf/nats/server.rb', line 291 def subscribe_to_services_once with_each_subscription_key do |subscription_key_and_queue| subscription_manager.queue_subscribe(subscription_key_and_queue) end end |
#subscriptions_per_rpc_endpoint ⇒ Object
148 149 150 |
# File 'lib/protobuf/nats/server.rb', line 148 def subscriptions_per_rpc_endpoint @subscriptions_per_rpc_endpoint ||= ::ENV.fetch("PB_NATS_SERVER_SUBSCRIPTIONS_PER_RPC_ENDPOINT", 10).to_i end |
#threads ⇒ Object
152 153 154 |
# File 'lib/protobuf/nats/server.rb', line 152 def threads @options[:threads] || 10 # Default to 10 if not provided, consistent with original behavior end |
#unsubscribe ⇒ Object
449 450 451 452 |
# File 'lib/protobuf/nats/server.rb', line 449 def unsubscribe logger.info "Unsubscribing from rpc routes..." subscription_manager.unsubscribe_all end |
#with_each_subscription_key ⇒ Object
297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 |
# File 'lib/protobuf/nats/server.rb', line 297 def with_each_subscription_key fail ::ArgumentError unless block_given? service_klasses.each do |service_klass| service_klass.rpcs.each do |service_method, _| # Skip services that are not implemented. next unless service_klass.method_defined?(service_method) subscription_key = ::Protobuf::Nats.subscription_key(service_klass, service_method) next if do_not_subscribe_to_includes?(subscription_key) next unless only_subscribe_to_includes?(subscription_key) yield subscription_key end end end |