Module: Legion::LLM::API::Namespaces::OpenAI::Responses

Extended by: Legion::Logging::Helper

Defined in: lib/legion/llm/api/namespaces/openai/responses.rb

Overview

Sinatra extension for /v1/responses — parse → translate → execute → respond. All translation lives in API::ClientTranslators::OpenAIResponses.

Class Method Summary

.flush_pending(messages, pending) ⇒ Object
.normalize_input_array(input) ⇒ Object

Helper kept at module-level for the input_tokens/count handler (and as a public seam for tests).
.registered(app) ⇒ Object

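Sinatra registration hook: defines the /v1/responses routes on the given app. (The source line carries a rubocop:disable Metrics/AbcSize directive.)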

Class Method Details

.flush_pending(messages, pending) ⇒ `Object`

# File 'lib/legion/llm/api/namespaces/openai/responses.rb', line 227

# Drains buffered function calls into +messages+ as one assistant message
# whose +tool_calls+ list holds every buffered call, then empties the buffer.
# Does nothing when the buffer is already empty.
#
# @param messages [Array<Hash>] message list to append to (mutated in place)
# @param pending [Array<Hash>] buffered calls, each read via :id, :name and
#   :arguments (mutated in place: cleared after flushing)
# @return [Array, nil] the emptied +pending+ array, or nil when there was
#   nothing to flush
def self.flush_pending(messages, pending)
  unless pending.empty?
    tool_calls = pending.map do |call|
      {
        id:       call[:id],
        type:     'function',
        function: { name: call[:name], arguments: call[:arguments] }
      }
    end

    messages << { role: 'assistant', content: '', tool_calls: tool_calls }
    pending.clear
  end
end

.normalize_input_array(input) ⇒ `Object`

Helper kept at module-level for the input_tokens/count handler (and as a public seam for tests). Mirrors the translator’s internal normalization but stays callable without instantiating a translator.

# File 'lib/legion/llm/api/namespaces/openai/responses.rb', line 195

# Flattens a Responses-style +input+ list into chat-style message hashes.
#
# Items are handled by their :type (hash-like items have their top-level keys
# symbolized first):
# - 'function_call' items are buffered; the buffer is emitted as a single
#   assistant message (see flush_pending) as soon as any other item is seen,
#   and once more after the last item.
# - 'function_call_output' items become a 'tool' message keyed by :call_id.
# - anything else is treated as a plain message: items without a :role are
#   skipped, the 'developer' role is rewritten to 'system', and non-Array
#   content is stringified. A nil content is dropped from the result hash.
#
# @param input [#each] the input items to normalize
# @return [Array<Hash>] the normalized messages, in input order
def self.normalize_input_array(input)
  normalized = []
  buffered_calls = []

  input.each do |raw|
    entry = raw.respond_to?(:transform_keys) ? raw.transform_keys(&:to_sym) : raw
    kind = entry[:type]&.to_s

    if kind == 'function_call'
      # Prefer call_id; fall back to the item's own id.
      call_id = entry[:call_id] || entry[:id]
      call_name = entry[:name].to_s
      # String arguments pass through untouched; anything else is JSON-encoded.
      encoded_args = if entry[:arguments].is_a?(String)
                       entry[:arguments]
                     else
                       Legion::JSON.dump(entry[:arguments] || {})
                     end
      buffered_calls << { id: call_id, name: call_name, arguments: encoded_args }
      next
    end

    # Every non-function_call item first releases the buffered calls.
    flush_pending(normalized, buffered_calls)

    if kind == 'function_call_output'
      normalized << { role: 'tool', tool_call_id: entry[:call_id], content: entry[:output].to_s }
      next
    end

    role = entry[:role]&.to_s
    next if role.nil?

    payload = entry[:content]
    payload = payload.to_s unless !payload || payload.is_a?(Array)

    normalized << { role: (role == 'developer' ? 'system' : role), content: payload }.compact
  end

  flush_pending(normalized, buffered_calls)
  normalized
end

.registered(app) ⇒ `Object`

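Sinatra registration hook: defines the /v1/responses routes on the given app. (The source line carries a rubocop:disable Metrics/AbcSize directive.)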

# File 'lib/legion/llm/api/namespaces/openai/responses.rb', line 20

# Registration hook that defines the /v1/responses family of routes on +app+.
#
# Routes defined here:
# - POST   /v1/responses                        - runs an inference request, streaming or not
# - GET    /v1/responses/:id                    - always reports not-found (status_code: 404)
# - DELETE /v1/responses/:id                    - always answers with deleted: true
# - POST   /v1/responses/:id/cancel             - always reports not-found (status_code: 404)
# - GET    /v1/responses/:id/input_items        - always answers with an empty list
# - POST   /v1/responses/:id/input_tokens/count - token estimate for the posted input
# - POST   /v1/responses/:id/compact            - always reports not-found (status_code: 404)
# - POST   /api/llm/inference/v1/responses      - alias re-dispatched to /v1/responses
#
# A StandardError raised while registering is handed to handle_exception and
# is not re-raised.
#
# @param app [Object] the application the routes are defined on; this method
#   calls +post+, +get+ and +delete+ on it
# @return [Object] result of the final log call, or of handle_exception when
#   registration failed; callers should not rely on it
def self.registered(app) # rubocop:disable Metrics/AbcSize
  log.debug('[llm][api][namespaces][openai][responses] registering routes')

  # Main endpoint: validate input, translate, execute, then respond either as
  # an SSE stream or as a single JSON document.
  app.post '/v1/responses' do
    require_llm!
    # Monotonic start time, handed to log_api_completion_summary as started_at.
    request_started_at = ::Process.clock_gettime(::Process::CLOCK_MONOTONIC)
    body = parse_request_body

    # Reject anything that is neither a String nor an Array before translating.
    input = body[:input]
    unless input.is_a?(Array) || input.is_a?(String)
      return openai_error('input is required (string or array)',
                          type: 'invalid_request_error', status_code: 400)
    end

    translator = Legion::LLM::API::ClientTranslators::OpenAIResponses.new
    canonical_request = translator.parse_request(body, env)
    # Default reasoning.summary to 'auto' when the caller asked
    # for reasoning but didn't pin a summary mode — OpenAI's
    # /v1/responses lane omits reasoning content otherwise (B3).
    body = translator.ensure_reasoning_summary(body)
    request_id = canonical_request.id
    # Model precedence: request body, then the configured default, then the literal 'default'.
    model = body[:model] || Legion::Settings[:llm][:default_model] || 'default'
    streaming = canonical_request.stream

    inference_request = translator.build_inference_request(
      canonical_request,
      request_id:    request_id,
      server_caller: build_server_caller(source: 'openai_responses', path: request.path, env: env)
    )

    log.info('[llm][api][namespaces][openai][responses] action=accepted ' \
             "request_id=#{request_id} model=#{model} stream=#{streaming}")

    executor = Legion::LLM::Inference::Executor.new(inference_request)

    # Debug switches read from the Rack env: canonical_format selects the
    # canonical emitter/renderer instead of the translator's OpenAI shape;
    # echo_request adds the canonical request to the output.
    canonical_format = Legion::LLM::API::DebugFormats.canonical_format?(env)
    echo_request = Legion::LLM::API::DebugFormats.echo_request?(env)

    if streaming
      content_type 'text/event-stream'
      headers 'Cache-Control' => 'no-cache', 'Connection' => 'keep-alive', 'X-Accel-Buffering' => 'no'
      stream do |out|
        emitter = if canonical_format
                    Legion::LLM::API::DebugFormats.canonical_event_emitter(out)
                  else
                    translator.events_emitter(out, request_id: request_id, model: model)
                  end
        Legion::LLM::API::DebugFormats.emit_echo_request_sse(out, canonical_request) if echo_request

        assembler = Legion::LLM::API::StreamAssembler.new(
          emitter:    emitter,
          request_id: request_id,
          model:      model
        )
        # Use call_responses when the executor provides it; otherwise fall
        # back to call_stream. Each yielded chunk is pushed into the assembler.
        pipeline_response = if executor.respond_to?(:call_responses)
                              executor.call_responses(body: body, stream: true) { |c| assembler.push(c) }
                            else
                              executor.call_stream { |c| assembler.push(c) }
                            end
        assembler.finalize(pipeline_response)
        log_api_completion_summary(
          namespace:         'namespaces][openai][responses',
          request_id:        request_id,
          pipeline_response: pipeline_response,
          stream:            true,
          started_at:        request_started_at
        )
      rescue Legion::LLM::API::StreamAssembler::StreamClosed
        # Client disconnected — caller treats as cancellation per G10.
      rescue IOError, Errno::EPIPE
        # Client disconnected mid-write before assembler caught it.
      rescue StandardError => e
        # Any other failure inside the stream block is reported and then
        # written to the stream as an SSE "error" event.
        handle_exception(e, level: :error, handled: false,
                            operation: 'llm.api.namespaces.openai.responses.stream', request_id: request_id)
        out << "event: error\ndata: #{Legion::JSON.dump({ type: 'server_error', message: e.message })}\n\n"
      end
    else
      # Non-streaming: same executor preference as above, without a chunk block.
      pipeline_response = if executor.respond_to?(:call_responses)
                            executor.call_responses(body: body, stream: false)
                          else
                            executor.call
                          end
      log_api_completion_summary(
        namespace:         'namespaces][openai][responses',
        request_id:        request_id,
        pipeline_response: pipeline_response,
        stream:            false,
        started_at:        request_started_at
      )

      if canonical_format
        # The canonical renderer supplies status, headers and body itself.
        status_code, response_headers, body_string = Legion::LLM::API::DebugFormats.render_canonical_response(
          pipeline_response, canonical_request: canonical_request, env: env
        )
        status status_code
        response_headers.each { |k, v| headers k => v }
        body_string
      else
        formatted = translator.format_response(pipeline_response, request_id: request_id, model: model)
        formatted = Legion::LLM::API::DebugFormats.attach_echo_request(formatted, canonical_request) if echo_request
        content_type :json
        status 200
        Legion::JSON.dump(formatted)
      end
    end
  # Error mapping for the route: auth -> 401, rate limit -> 429,
  # provider down/error -> 502, any other StandardError -> 500.
  rescue Legion::LLM::AuthError => e
    handle_exception(e, level: :error, handled: true, operation: 'llm.api.namespaces.openai.responses.auth')
    openai_error(e.message, type: 'authentication_error', status_code: 401)
  rescue Legion::LLM::RateLimitError => e
    handle_exception(e, level: :warn, handled: true, operation: 'llm.api.namespaces.openai.responses.rate_limit')
    openai_error(e.message, type: 'rate_limit_error', code: 'rate_limit_exceeded', status_code: 429)
  rescue Legion::LLM::ProviderDown, Legion::LLM::ProviderError => e
    handle_exception(e, level: :error, handled: true, operation: 'llm.api.namespaces.openai.responses.provider')
    openai_error(e.message, type: 'server_error', status_code: 502)
  rescue StandardError => e
    handle_exception(e, level: :error, handled: false, operation: 'llm.api.namespaces.openai.responses')
    openai_error(e.message, type: 'server_error', status_code: 500)
  end

  # No lookup happens here: every id is reported as not found.
  app.get '/v1/responses/:id' do
    openai_error("Response '#{params[:id]}' not found", type: 'invalid_request_error',
                                                        code: 'response_not_found', status_code: 404)
  end

  # Answers deleted: true for whatever id was given; nothing is looked up or removed in this block.
  app.delete '/v1/responses/:id' do
    content_type :json
    Legion::JSON.dump({ id: params[:id], object: 'response', deleted: true })
  end

  # Cancellation is unconditional not-found: no response is looked up here.
  app.post '/v1/responses/:id/cancel' do
    openai_error("Response '#{params[:id]}' not found or already completed",
                 type: 'invalid_request_error', status_code: 404)
  end

  # Always an empty, non-paginated list.
  app.get '/v1/responses/:id/input_items' do
    content_type :json
    Legion::JSON.dump({ object: 'list', data: [], has_more: false })
  end

  # Token estimate for the posted input. When the body has no :model, the :id
  # path segment is used as the model name. Array input is normalized through
  # Responses.normalize_input_array, String input becomes one user message,
  # and any other input is estimated as an empty message list.
  app.post '/v1/responses/:id/input_tokens/count' do
    body  = parse_request_body
    input = body[:input]
    model = body[:model] || params[:id]
    messages = case input
               when Array  then Responses.normalize_input_array(input)
               when String then [{ role: 'user', content: input }]
               else []
               end
    result = Legion::LLM::TokenEstimation.estimate(messages: messages, model: model.to_s)
    content_type :json
    Legion::JSON.dump(result)
  rescue StandardError => e
    handle_exception(e, level: :error, handled: false,
                        operation: 'llm.api.namespaces.openai.responses.count_tokens')
    openai_error(e.message, type: 'server_error', status_code: 500)
  end

  # Compaction is unconditional not-found as well.
  app.post '/v1/responses/:id/compact' do
    openai_error("Response '#{params[:id]}' not found", type: 'invalid_request_error', status_code: 404)
  end

  # Alias path: re-dispatches via `call` with PATH_INFO rewritten to /v1/responses.
  app.post '/api/llm/inference/v1/responses' do
    call env.merge('PATH_INFO' => '/v1/responses')
  end

  log.debug('[llm][api][namespaces][openai][responses] routes registered')
rescue StandardError => e
  # Registration failures are reported but not re-raised.
  handle_exception(e, level: :error, handled: false,
                      operation: 'llm.api.namespaces.openai.responses.register')
end