Module: Legion::Extensions::Llm::Gateway::Runners::Inference

Defined in:
lib/legion/extensions/llm/gateway/runners/inference.rb

Class Method Summary collapse

Class Method Details

.base_meter_fields(response, opts) ⇒ Object



126
127
128
129
130
131
132
133
134
135
# File 'lib/legion/extensions/llm/gateway/runners/inference.rb', line 126

# Assembles the non-token attributes of a metering event.
#
# Provider and model are resolved from the response when it exposes them,
# falling back to the caller-supplied values; all six keys are always present
# (nil when unknown) so downstream metering sees a stable shape.
#
# @param response [Object] provider response (may expose #provider / #model)
# @param opts [Hash] request metadata (:request_type, :provider, :model_id,
#   :latency_ms, :tier, :intent)
# @return [Hash] base metering fields
def base_meter_fields(response, opts)
  resolved_provider = extract_provider(response, opts[:provider])
  resolved_model    = extract_model(response, opts[:model_id])
  { request_type: opts[:request_type],
    provider: resolved_provider,
    model_id: resolved_model,
    latency_ms: opts[:latency_ms],
    tier: opts[:tier],
    routing_reason: opts[:intent] }
end

.build_meter_event(response, **opts) ⇒ Object



122
123
124
# File 'lib/legion/extensions/llm/gateway/runners/inference.rb', line 122

# Builds a complete metering event: base request fields plus token usage.
#
# @param response [Object] provider response
# @param opts [Hash] request metadata forwarded to #base_meter_fields
# @return [Object] whatever Metering.build_event produces
def build_meter_event(response, **opts)
  event_fields = base_meter_fields(response, opts).merge(token_fields(response))
  Metering.build_event(**event_fields)
end

.call_llm(method_name, **) ⇒ Object



107
108
109
110
111
112
113
114
115
116
# File 'lib/legion/extensions/llm/gateway/runners/inference.rb', line 107

# Invokes a Legion::LLM entry point, preferring the "<name>_direct" variant
# when one exists (bypasses any pipeline wrapping).
#
# @param method_name [Symbol] base Legion::LLM method (:chat, :embed, ...)
# @return [Object] the Legion::LLM result, or { error: 'llm_not_available' }
#   when Legion::LLM is not loaded
def call_llm(method_name, **kwargs)
  if defined?(Legion::LLM)
    direct_name = :"#{method_name}_direct"
    dispatch = Legion::LLM.respond_to?(direct_name) ? direct_name : method_name
    Legion::LLM.public_send(dispatch, **kwargs)
  else
    Legion::Logging.warn('[Gateway::Inference] Legion::LLM not defined') if defined?(Legion::Logging) # rubocop:disable Legion/HelperMigration/DirectLogging, Legion/HelperMigration/LoggingGuard
    { error: 'llm_not_available' }
  end
end

.chat(model: nil, provider: nil, **opts) ⇒ Object



11
12
13
14
15
16
17
18
19
20
21
22
23
24
# File 'lib/legion/extensions/llm/gateway/runners/inference.rb', line 11

# Runs a chat completion through the gateway.
#
# When the Legion::LLM pipeline is enabled this delegates directly (the
# gateway path is deprecated); otherwise it dispatches via #dispatch_chat
# and meters the request with its latency.
#
# @param model [String, nil] model identifier override
# @param provider [String, nil] provider override
# @return [Object] provider response
def chat(model: nil, provider: nil, **opts)
  if pipeline_available?
    log_deprecation(:chat)
    return Legion::LLM.chat(model: model, provider: provider, # rubocop:disable Legion/HelperMigration/DirectLlm
                            caller: { extension: 'lex-llm-gateway', operation: 'inference' }, **opts)
  end

  # Monotonic clock so latency is immune to wall-clock adjustments.
  started_at = ::Process.clock_gettime(::Process::CLOCK_MONOTONIC, :millisecond)
  response = dispatch_chat(model: model, provider: provider, **opts)
  latency = ::Process.clock_gettime(::Process::CLOCK_MONOTONIC, :millisecond) - started_at
  meter_response(response, request_type: 'chat', provider: provider, model_id: model,
                           latency_ms: latency, **opts.slice(:tier, :intent))
  response
end

.dispatch_chat(message: nil, messages: nil, model: nil, provider: nil, **opts) ⇒ Object



71
72
73
74
75
76
77
78
79
80
81
# File 'lib/legion/extensions/llm/gateway/runners/inference.rb', line 71

# Routes a chat request: tier 'fleet' goes to the Fleet runner when it is
# available, everything else falls through to Legion::LLM via #call_llm.
#
# @param message [String, nil] single user message (wrapped when no messages given)
# @param messages [Array<Hash>, nil] full chat history
# @return [Object] dispatch result
def dispatch_chat(message: nil, messages: nil, model: nil, provider: nil, **opts)
  tier = opts[:tier]
  Legion::Logging.debug("[Gateway::Inference] dispatch_chat tier=#{tier}") if defined?(Legion::Logging) # rubocop:disable Legion/HelperMigration/DirectLogging, Legion/HelperMigration/LoggingGuard
  unless tier == 'fleet' && fleet_available?
    return call_llm(:chat, message: message, messages: messages, model: model,
                           provider: provider, **opts)
  end

  chat_history = messages || [{ role: 'user', content: message }]
  Fleet.dispatch(model: model, messages: chat_history, intent: opts[:intent])
end

.dispatch_embed(text: nil, model: nil, provider: nil, **opts) ⇒ Object



83
84
85
86
87
88
89
90
# File 'lib/legion/extensions/llm/gateway/runners/inference.rb', line 83

# Routes an embed request: tier 'fleet' goes to the Fleet runner when it is
# available, otherwise Legion::LLM via #call_llm.
#
# @param text [String, nil] text to embed
# @return [Object] dispatch result
def dispatch_embed(text: nil, model: nil, provider: nil, **opts)
  fleet_route = opts[:tier] == 'fleet' && fleet_available?
  return call_llm(:embed, text: text, model: model, provider: provider, **opts) unless fleet_route

  Fleet.dispatch(model: model, messages: [{ role: 'user', content: text }],
                 intent: opts[:intent], request_type: 'embed', text: text)
end

.dispatch_structured(messages: nil, schema: nil, model: nil, provider: nil, **opts) ⇒ Object



92
93
94
95
96
97
98
99
100
# File 'lib/legion/extensions/llm/gateway/runners/inference.rb', line 92

# Routes a structured-output request: tier 'fleet' goes to the Fleet runner
# when it is available, otherwise Legion::LLM via #call_llm.
#
# @param messages [Array<Hash>, nil] chat messages
# @param schema [Object, nil] schema the response must conform to
# @return [Object] dispatch result
def dispatch_structured(messages: nil, schema: nil, model: nil, provider: nil, **opts)
  fleet_route = opts[:tier] == 'fleet' && fleet_available?
  unless fleet_route
    return call_llm(:structured, messages: messages, schema: schema, model: model,
                                 provider: provider, **opts)
  end

  Fleet.dispatch(model: model, messages: messages, intent: opts[:intent],
                 request_type: 'structured', schema: schema)
end

.embed(text: nil, model: nil, provider: nil, **) ⇒ Object



26
27
28
29
30
31
32
33
34
35
36
37
38
39
# File 'lib/legion/extensions/llm/gateway/runners/inference.rb', line 26

# Generates an embedding for +text+ through the gateway.
#
# When the Legion::LLM pipeline is enabled this delegates directly (the
# gateway path is deprecated); otherwise it dispatches via #dispatch_embed
# and meters the request with its latency.
#
# @param text [String, nil] text to embed
# @param model [String, nil] model identifier override
# @param provider [String, nil] provider override
# @return [Object] provider response
def embed(text: nil, model: nil, provider: nil, **opts)
  if pipeline_available?
    log_deprecation(:embed)
    return Legion::LLM.embed(text, model: model, provider: provider, # rubocop:disable Legion/HelperMigration/DirectLlm
                             caller: { extension: 'lex-llm-gateway', operation: 'inference' }, **opts)
  end

  start_ms = ::Process.clock_gettime(::Process::CLOCK_MONOTONIC, :millisecond)
  response = dispatch_embed(text: text, model: model, provider: provider, **opts)
  elapsed_ms = ::Process.clock_gettime(::Process::CLOCK_MONOTONIC, :millisecond) - start_ms
  # Forward tier/intent like #chat does, so metering records routing metadata
  # instead of always nil tier/routing_reason for embed requests.
  meter_response(response, request_type: 'embed', provider: provider, model_id: model,
                           latency_ms: elapsed_ms, **opts.slice(:tier, :intent))
  response
end

.extract_model(response, fallback) ⇒ Object



153
154
155
# File 'lib/legion/extensions/llm/gateway/runners/inference.rb', line 153

# Model identifier from the response when it exposes #model; otherwise the
# caller-supplied fallback.
#
# @param response [Object] provider response
# @param fallback [String, nil] value used when the response has no #model
# @return [Object] model identifier
def extract_model(response, fallback)
  return fallback unless response.respond_to?(:model)

  response.model
end

.extract_provider(response, fallback) ⇒ Object



149
150
151
# File 'lib/legion/extensions/llm/gateway/runners/inference.rb', line 149

# Provider name from the response when it exposes #provider; otherwise the
# caller-supplied fallback.
#
# @param response [Object] provider response
# @param fallback [String, nil] value used when the response has no #provider
# @return [Object] provider name
def extract_provider(response, fallback)
  return fallback unless response.respond_to?(:provider)

  response.provider
end

.extract_tokens(response, field) ⇒ Object



145
146
147
# File 'lib/legion/extensions/llm/gateway/runners/inference.rb', line 145

# Token count for +field+ from the response, coerced to Integer; 0 when the
# response does not expose that reader (or it returns nil).
#
# @param response [Object] provider response
# @param field [Symbol] token reader name (:input_tokens, :output_tokens, ...)
# @return [Integer] token count, 0 when unavailable
def extract_tokens(response, field)
  return 0 unless response.respond_to?(field)

  response.public_send(field).to_i
end

.fleet_available? ⇒ Boolean

Returns:

  • (Boolean)


102
103
104
105
# File 'lib/legion/extensions/llm/gateway/runners/inference.rb', line 102

# True when the Fleet runner constant is loaded and it reports itself
# available; falsey otherwise (short-circuits before touching Fleet when the
# constant is not defined).
#
# @return [Boolean]
def fleet_available?
  fleet_loaded = defined?(Legion::Extensions::Llm::Gateway::Runners::Fleet)
  fleet_loaded &&
    Fleet.respond_to?(:fleet_available?) &&
    Fleet.fleet_available?
end

.log_deprecation(method) ⇒ Object



65
66
67
68
69
# File 'lib/legion/extensions/llm/gateway/runners/inference.rb', line 65

# Emits a deprecation warning for a gateway method, pointing callers at the
# direct Legion::LLM equivalent. No-op when Legion::Logging is not loaded.
#
# @param method [Symbol] deprecated gateway method name
# @return [void]
def log_deprecation(method)
  return unless defined?(Legion::Logging) # rubocop:disable Legion/HelperMigration/LoggingGuard

  warning = "lex-llm-gateway is deprecated for #{method}, use Legion::LLM.#{method} directly"
  Legion::Logging.warn(warning) # rubocop:disable Legion/HelperMigration/DirectLogging
end

.meter_response(response, **) ⇒ Object



118
119
120
# File 'lib/legion/extensions/llm/gateway/runners/inference.rb', line 118

# Builds a metering event for the response and hands it to Metering, which
# publishes it or spools it for later delivery.
#
# @param response [Object] provider response
# @return [Object] result of Metering.publish_or_spool
def meter_response(response, **kwargs)
  event = build_meter_event(response, **kwargs)
  Metering.publish_or_spool(event)
end

.pipeline_available? ⇒ Boolean

Returns:

  • (Boolean)


58
59
60
61
62
63
# File 'lib/legion/extensions/llm/gateway/runners/inference.rb', line 58

# True when the Legion::LLM pipeline executor is loaded and the pipeline is
# enabled; falsey otherwise (short-circuits before calling into Legion::LLM
# when the constants are not defined).
#
# @return [Boolean]
def pipeline_available?
  constants_loaded = defined?(Legion::LLM::Pipeline::Executor) && defined?(Legion::LLM)
  constants_loaded &&
    Legion::LLM.respond_to?(:pipeline_enabled?) &&
    Legion::LLM.pipeline_enabled?
end

.structured(messages: nil, schema: nil, model: nil, provider: nil, **) ⇒ Object



41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# File 'lib/legion/extensions/llm/gateway/runners/inference.rb', line 41

# Runs a structured (schema-constrained) completion through the gateway.
#
# When the Legion::LLM pipeline is enabled this delegates directly (the
# gateway path is deprecated); otherwise it dispatches via
# #dispatch_structured and meters the request with its latency.
#
# @param messages [Array<Hash>, nil] chat messages
# @param schema [Object, nil] schema the response must conform to
# @param model [String, nil] model identifier override
# @param provider [String, nil] provider override
# @return [Object] provider response
def structured(messages: nil, schema: nil, model: nil, provider: nil, **opts)
  if pipeline_available?
    log_deprecation(:structured)
    return Legion::LLM.structured(messages: messages, schema: schema, model: model, # rubocop:disable Legion/HelperMigration/DirectLlm
                                  provider: provider,
                                  caller: { extension: 'lex-llm-gateway', operation: 'inference' }, **opts)
  end

  start_ms = ::Process.clock_gettime(::Process::CLOCK_MONOTONIC, :millisecond)
  response = dispatch_structured(messages: messages, schema: schema, model: model,
                                 provider: provider, **opts)
  elapsed_ms = ::Process.clock_gettime(::Process::CLOCK_MONOTONIC, :millisecond) - start_ms
  # Forward tier/intent like #chat does, so metering records routing metadata
  # instead of always nil tier/routing_reason for structured requests.
  meter_response(response, request_type: 'structured', provider: provider, model_id: model,
                           latency_ms: elapsed_ms, **opts.slice(:tier, :intent))
  response
end

.token_fields(response) ⇒ Object



137
138
139
140
141
142
143
# File 'lib/legion/extensions/llm/gateway/runners/inference.rb', line 137

# Token-usage fields for a metering event; each count is 0 when the response
# does not report it.
#
# @param response [Object] provider response
# @return [Hash{Symbol=>Integer}] input/output/thinking token counts
def token_fields(response)
  %i[input_tokens output_tokens thinking_tokens].to_h do |field|
    [field, extract_tokens(response, field)]
  end
end