Module: Legion::Extensions::Llm::Gateway::Runners::Inference
- Defined in:
- lib/legion/extensions/llm/gateway/runners/inference.rb
Class Method Summary
- .base_meter_fields(response, opts) ⇒ Object
- .build_meter_event(response, **opts) ⇒ Object
- .call_llm(method_name) ⇒ Object
- .chat(model: nil, provider: nil, **opts) ⇒ Object
- .dispatch_chat(message: nil, messages: nil, model: nil, provider: nil, **opts) ⇒ Object
- .dispatch_embed(text: nil, model: nil, provider: nil, **opts) ⇒ Object
- .dispatch_structured(messages: nil, schema: nil, model: nil, provider: nil, **opts) ⇒ Object
- .embed(text: nil, model: nil, provider: nil) ⇒ Object
- .extract_model(response, fallback) ⇒ Object
- .extract_provider(response, fallback) ⇒ Object
- .extract_tokens(response, field) ⇒ Object
- .fleet_available? ⇒ Boolean
- .log_deprecation(method) ⇒ Object
- .meter_response(response) ⇒ Object
- .pipeline_available? ⇒ Boolean
- .structured(messages: nil, schema: nil, model: nil, provider: nil) ⇒ Object
- .token_fields(response) ⇒ Object
Class Method Details
.base_meter_fields(response, opts) ⇒ Object
# File 'lib/legion/extensions/llm/gateway/runners/inference.rb', line 126

def base_meter_fields(response, opts)
  {
    request_type: opts[:request_type],
    provider: extract_provider(response, opts[:provider]),
    model_id: extract_model(response, opts[:model_id]),
    latency_ms: opts[:latency_ms],
    tier: opts[:tier],
    routing_reason: opts[:intent]
  }
end
.build_meter_event(response, **opts) ⇒ Object
# File 'lib/legion/extensions/llm/gateway/runners/inference.rb', line 122

def build_meter_event(response, **opts)
  Metering.build_event(**base_meter_fields(response, opts), **token_fields(response))
end
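Taken together, build_meter_event merges the routing fields from base_meter_fields with the token counts from token_fields before passing them to Metering.build_event. A minimal sketch of the merged keyword payload, assuming the module is loaded; FakeResponse, the provider and model names, and the option values below are placeholders, not real gateway objects:

Inference = Legion::Extensions::Llm::Gateway::Runners::Inference

# Stand-in response exposing the duck-typed fields the extractors look for.
FakeResponse = Struct.new(:provider, :model, :input_tokens, :output_tokens, :thinking_tokens,
                          keyword_init: true)
response = FakeResponse.new(provider: 'openai', model: 'gpt-4o-mini',
                            input_tokens: 120, output_tokens: 45, thinking_tokens: 0)
opts = { request_type: 'chat', latency_ms: 830, tier: 'standard', intent: 'summarize' }

# Roughly the keyword payload handed to Metering.build_event:
Inference.base_meter_fields(response, opts).merge(Inference.token_fields(response))
# => { request_type: 'chat', provider: 'openai', model_id: 'gpt-4o-mini', latency_ms: 830,
#      tier: 'standard', routing_reason: 'summarize',
#      input_tokens: 120, output_tokens: 45, thinking_tokens: 0 }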
.call_llm(method_name) ⇒ Object
# File 'lib/legion/extensions/llm/gateway/runners/inference.rb', line 107

def call_llm(method_name, **)
  unless defined?(Legion::LLM)
    Legion::Logging.warn('[Gateway::Inference] Legion::LLM not defined') if defined?(Legion::Logging) # rubocop:disable Legion/HelperMigration/DirectLogging, Legion/HelperMigration/LoggingGuard
    return { error: 'llm_not_available' }
  end

  direct = :"#{method_name}_direct"
  target = Legion::LLM.respond_to?(direct) ? direct : method_name
  Legion::LLM.public_send(target, **)
end
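call_llm prefers a `_direct` variant of the requested method when Legion::LLM defines one, and short-circuits with an error hash when Legion::LLM is not loaded. A hedged usage sketch (the message content is illustrative):

Inference = Legion::Extensions::Llm::Gateway::Runners::Inference

# Dispatches to Legion::LLM.chat_direct if it exists, otherwise Legion::LLM.chat.
Inference.call_llm(:chat, messages: [{ role: 'user', content: 'ping' }])

# When Legion::LLM is not defined at all, the call returns:
# => { error: 'llm_not_available' }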
.chat(model: nil, provider: nil, **opts) ⇒ Object
# File 'lib/legion/extensions/llm/gateway/runners/inference.rb', line 11

def chat(model: nil, provider: nil, **opts)
  if pipeline_available?
    log_deprecation(:chat)
    return Legion::LLM.chat(model: model, provider: provider, # rubocop:disable Legion/HelperMigration/DirectLlm
                            caller: { extension: 'lex-llm-gateway', operation: 'inference' },
                            **opts)
  end

  start_ms = ::Process.clock_gettime(::Process::CLOCK_MONOTONIC, :millisecond)
  response = dispatch_chat(model: model, provider: provider, **opts)
  elapsed_ms = ::Process.clock_gettime(::Process::CLOCK_MONOTONIC, :millisecond) - start_ms
  meter_response(response, request_type: 'chat', provider: provider, model_id: model,
                 latency_ms: elapsed_ms, **opts.slice(:tier, :intent))
  response
end
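A usage sketch for .chat. The model and provider values are placeholders: when pipeline_available? is true the call is forwarded to Legion::LLM.chat with a deprecation warning; otherwise it is dispatched locally and the measured latency and token usage are metered.

Inference = Legion::Extensions::Llm::Gateway::Runners::Inference

response = Inference.chat(
  model: 'gpt-4o-mini',                       # placeholder model id
  provider: 'openai',                         # placeholder provider
  message: 'Summarize this changelog in one sentence.',
  tier: 'standard',                           # tier and intent also flow into the meter event
  intent: 'summarize'
)
# The response object shape depends on the underlying Legion::LLM provider.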
.dispatch_chat(message: nil, messages: nil, model: nil, provider: nil, **opts) ⇒ Object
# File 'lib/legion/extensions/llm/gateway/runners/inference.rb', line 71

def dispatch_chat(message: nil, messages: nil, model: nil, provider: nil, **opts)
  tier = opts[:tier]
  Legion::Logging.debug("[Gateway::Inference] dispatch_chat tier=#{tier}") if defined?(Legion::Logging) # rubocop:disable Legion/HelperMigration/DirectLogging, Legion/HelperMigration/LoggingGuard

  if tier == 'fleet' && fleet_available?
    messages ||= [{ role: 'user', content: message }]
    Fleet.dispatch(model: model, messages: messages, intent: opts[:intent])
  else
    call_llm(:chat, message: message, messages: messages, model: model, provider: provider, **opts)
  end
end
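When opts[:tier] is 'fleet' and the Fleet runner is available, a single message is wrapped into a one-element messages array and routed through Fleet.dispatch; every other call goes through call_llm(:chat, ...). A minimal routing sketch, assuming both runners are loaded (all argument values are illustrative):

Inference = Legion::Extensions::Llm::Gateway::Runners::Inference

# Fleet path: message is wrapped as [{ role: 'user', content: ... }] for Fleet.dispatch.
Inference.dispatch_chat(message: 'status report', tier: 'fleet', intent: 'ops')

# Default path: routed through call_llm(:chat, ...) via Legion::LLM.
Inference.dispatch_chat(messages: [{ role: 'user', content: 'status report' }],
                        model: 'gpt-4o-mini', provider: 'openai')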
.dispatch_embed(text: nil, model: nil, provider: nil, **opts) ⇒ Object
# File 'lib/legion/extensions/llm/gateway/runners/inference.rb', line 83

def dispatch_embed(text: nil, model: nil, provider: nil, **opts)
  if opts[:tier] == 'fleet' && fleet_available?
    Fleet.dispatch(model: model, messages: [{ role: 'user', content: text }],
                   intent: opts[:intent], request_type: 'embed', text: text)
  else
    call_llm(:embed, text: text, model: model, provider: provider, **opts)
  end
end
.dispatch_structured(messages: nil, schema: nil, model: nil, provider: nil, **opts) ⇒ Object
# File 'lib/legion/extensions/llm/gateway/runners/inference.rb', line 92

def dispatch_structured(messages: nil, schema: nil, model: nil, provider: nil, **opts)
  if opts[:tier] == 'fleet' && fleet_available?
    Fleet.dispatch(model: model, messages: messages, intent: opts[:intent],
                   request_type: 'structured', schema: schema)
  else
    call_llm(:structured, messages: messages, schema: schema, model: model, provider: provider, **opts)
  end
end
.embed(text: nil, model: nil, provider: nil) ⇒ Object
# File 'lib/legion/extensions/llm/gateway/runners/inference.rb', line 26

def embed(text: nil, model: nil, provider: nil, **)
  if pipeline_available?
    log_deprecation(:embed)
    return Legion::LLM.embed(text, model: model, provider: provider, # rubocop:disable Legion/HelperMigration/DirectLlm
                             caller: { extension: 'lex-llm-gateway', operation: 'inference' },
                             **)
  end

  start_ms = ::Process.clock_gettime(::Process::CLOCK_MONOTONIC, :millisecond)
  response = dispatch_embed(text: text, model: model, provider: provider, **)
  elapsed_ms = ::Process.clock_gettime(::Process::CLOCK_MONOTONIC, :millisecond) - start_ms
  meter_response(response, request_type: 'embed', provider: provider, model_id: model, latency_ms: elapsed_ms)
  response
end
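A usage sketch for .embed; the model and provider names are placeholders and the return value depends on the embedding provider behind Legion::LLM:

Inference = Legion::Extensions::Llm::Gateway::Runners::Inference

response = Inference.embed(
  text: 'Legion is a distributed automation framework.',
  model: 'text-embedding-3-small',   # placeholder embedding model
  provider: 'openai'                 # placeholder provider
)
# Latency is measured around dispatch_embed and metered as request_type 'embed'.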
.extract_model(response, fallback) ⇒ Object
# File 'lib/legion/extensions/llm/gateway/runners/inference.rb', line 153

def extract_model(response, fallback)
  response.respond_to?(:model) ? response.model : fallback
end
.extract_provider(response, fallback) ⇒ Object
# File 'lib/legion/extensions/llm/gateway/runners/inference.rb', line 149

def extract_provider(response, fallback)
  response.respond_to?(:provider) ? response.provider : fallback
end
.extract_tokens(response, field) ⇒ Object
# File 'lib/legion/extensions/llm/gateway/runners/inference.rb', line 145

def extract_tokens(response, field)
  response.respond_to?(field) ? response.public_send(field).to_i : 0
end
.fleet_available? ⇒ Boolean
# File 'lib/legion/extensions/llm/gateway/runners/inference.rb', line 102

def fleet_available?
  defined?(Legion::Extensions::Llm::Gateway::Runners::Fleet) &&
    Fleet.respond_to?(:fleet_available?) && Fleet.fleet_available?
end
.log_deprecation(method) ⇒ Object
# File 'lib/legion/extensions/llm/gateway/runners/inference.rb', line 65

def log_deprecation(method)
  return unless defined?(Legion::Logging) # rubocop:disable Legion/HelperMigration/LoggingGuard

  Legion::Logging.warn("lex-llm-gateway is deprecated for #{method}, use Legion::LLM.#{method} directly") # rubocop:disable Legion/HelperMigration/DirectLogging
end
.meter_response(response) ⇒ Object
# File 'lib/legion/extensions/llm/gateway/runners/inference.rb', line 118

def meter_response(response, **)
  Metering.publish_or_spool(build_meter_event(response, **))
end
.pipeline_available? ⇒ Boolean
# File 'lib/legion/extensions/llm/gateway/runners/inference.rb', line 58

def pipeline_available?
  defined?(Legion::LLM::Pipeline::Executor) &&
    defined?(Legion::LLM) &&
    Legion::LLM.respond_to?(:pipeline_enabled?) &&
    Legion::LLM.pipeline_enabled?
end
.structured(messages: nil, schema: nil, model: nil, provider: nil) ⇒ Object
# File 'lib/legion/extensions/llm/gateway/runners/inference.rb', line 41

def structured(messages: nil, schema: nil, model: nil, provider: nil, **)
  if pipeline_available?
    log_deprecation(:structured)
    return Legion::LLM.structured(messages: messages, schema: schema, model: model, # rubocop:disable Legion/HelperMigration/DirectLlm
                                  provider: provider,
                                  caller: { extension: 'lex-llm-gateway', operation: 'inference' },
                                  **)
  end

  start_ms = ::Process.clock_gettime(::Process::CLOCK_MONOTONIC, :millisecond)
  response = dispatch_structured(messages: messages, schema: schema, model: model, provider: provider, **)
  elapsed_ms = ::Process.clock_gettime(::Process::CLOCK_MONOTONIC, :millisecond) - start_ms
  meter_response(response, request_type: 'structured', provider: provider, model_id: model, latency_ms: elapsed_ms)
  response
end
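A usage sketch for .structured. The schema hash below is an illustrative JSON-Schema-style payload and the model and provider names are placeholders; the exact schema format expected by the underlying Legion::LLM provider may differ:

Inference = Legion::Extensions::Llm::Gateway::Runners::Inference

schema = {
  type: 'object',
  properties: { sentiment: { type: 'string' }, confidence: { type: 'number' } },
  required: %w[sentiment confidence]
}

response = Inference.structured(
  messages: [{ role: 'user', content: 'Classify: "the deploy went smoothly"' }],
  schema: schema,
  model: 'gpt-4o-mini',   # placeholder model id
  provider: 'openai'      # placeholder provider
)
# Metered as request_type 'structured' with the measured latency.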
.token_fields(response) ⇒ Object
# File 'lib/legion/extensions/llm/gateway/runners/inference.rb', line 137

def token_fields(response)
  {
    input_tokens: extract_tokens(response, :input_tokens),
    output_tokens: extract_tokens(response, :output_tokens),
    thinking_tokens: extract_tokens(response, :thinking_tokens)
  }
end
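Token extraction is duck-typed: extract_tokens only reads a field the response actually responds to and coerces it with #to_i, so missing counters fall back to 0. A small sketch with a stand-in response object (PartialResponse is hypothetical, not a gateway class):

Inference = Legion::Extensions::Llm::Gateway::Runners::Inference

# Stand-in response that only exposes two of the three token counters.
PartialResponse = Struct.new(:input_tokens, :output_tokens, keyword_init: true)
resp = PartialResponse.new(input_tokens: '87', output_tokens: 19)

Inference.token_fields(resp)
# => { input_tokens: 87, output_tokens: 19, thinking_tokens: 0 }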