Class: Tep::Llm::OpenAI::CompletionsHandler

Inherits:

Handler

Object
Handler
Tep::Llm::OpenAI::CompletionsHandler

show all

Defined in: lib/tep/openai_server.rb

Overview

POST /v1/completions – the token-level, OpenAI-shaped completion endpoint (the primary completion route). Parses model, prompt (token ids), and max_tokens from the request body, calls backend.generate_from_tokens, and formats the standard text_completion response. Dispatches through APP.openai_backend, so the app’s subclass override is the one that answers.

Instance Method Summary collapse

#handle(req, res) ⇒ Object

Methods inherited from Handler

#is_regex?, #re_capture, #re_match?

Instance Method Details

#handle(req, res) ⇒ `Object`

# File 'lib/tep/openai_server.rb', line 526

# Serves POST /v1/completions for prompts supplied as token ids.
#
# Reads model, prompt (an int array), max_tokens and the optional
# temperature / top_p out of the raw JSON body. If the body carries a
# literal "stream": true, the response is switched to SSE and handed
# to a CompletionsStreamer, and "" is returned. Otherwise the backend
# is called synchronously, one inference event is emitted, and the
# text_completion JSON document is returned.
#
# @param req request object; raw_body and identity.subject are read
# @param res response object; headers are written, and start_stream
#   is called on the streaming path
# @return [String] the JSON response body, or "" when streaming
def handle(req, res)
  request_id = "cmpl-tep"
  payload    = req.raw_body
  model      = Tep::Json.get_str(payload, "model")
  prompt_ids = Tep::Json.get_int_array(payload, "prompt")

  params = Tep::Llm::OpenAI::Sampling.new
  # NOTE(review): max_tokens is overwritten unconditionally, unlike
  # the two floats below -- confirm what get_int yields for a missing
  # key and whether the backend expects that value.
  params.max_tokens = Tep::Json.get_int(payload, "max_tokens")
  # temperature / top_p are only copied when the key is present, so
  # an absent key leaves the Sampling default alone (get_float would
  # otherwise hand back 0.0 for a missing key).
  params.temperature = Tep::Json.get_float(payload, "temperature") if Tep::Json.has_key?(payload, "temperature")
  params.top_p = Tep::Json.get_float(payload, "top_p") if Tep::Json.has_key?(payload, "top_p")

  # Streaming is requested with "stream": true. Tep::Json offers no
  # bool getter, so the literal is searched for in the raw body, with
  # and without a space after the colon.
  streaming = Tep.str_find(payload, "\"stream\":true", 0) >= 0
  unless streaming
    streaming = Tep.str_find(payload, "\"stream\": true", 0) >= 0
  end

  if streaming
    # SSE path: the streamer receives everything it needs to generate
    # and report on its own; nothing is written to the body here.
    res.headers["Content-Type"]  = "text/event-stream"
    res.headers["Cache-Control"] = "no-cache"
    sse = Tep::Llm::OpenAI::CompletionsStreamer.new
    sse.model         = model
    sse.token_ids     = prompt_ids
    sse.sampling      = params
    sse.prompt_tokens = prompt_ids.length
    sse.t0            = Time.now.to_i
    sse.request_id    = request_id
    sse.principal_id  = req.identity.subject
    res.start_stream(sse)
    return ""
  end

  res.headers["Content-Type"] = "application/json"

  # Wall time is measured in whole epoch seconds and scaled to
  # microseconds, so wall_us only has second resolution.
  started_at = Time.now.to_i
  result     = Tep::APP.openai_backend.generate_from_tokens(model, prompt_ids, params)
  wall_us    = (Time.now.to_i - started_at) * 1_000_000

  # One inference event per request; its request_id is the same value
  # used as the response id, and principal_id is the request's
  # identity subject.
  event_extra = "{" +
    Tep::Json.encode_pair_str("request_id", request_id) + "," +
    Tep::Json.encode_pair_str("principal_id", req.identity.subject) +
  "}"
  Tep::APP.openai_events.inference(
    model, result.prompt_tokens, result.completion_tokens, wall_us, event_extra
  )

  # Generated token ids, when the backend returned any, are exposed as
  # choices[0].ids; with an empty list the field is left out entirely.
  ids_field = result.token_ids.length > 0 ? "\"ids\":" + Tep::Json.from_int_array(result.token_ids) + "," : ""

  choice_fields =
    Tep::Json.encode_pair_int("index", 0) + "," +
    Tep::Json.encode_pair_str("text", result.text) + "," +
    ids_field +
    Tep::Json.encode_pair_str("finish_reason", result.finish_reason)

  total_tokens = result.prompt_tokens + result.completion_tokens
  usage_fields =
    Tep::Json.encode_pair_int("prompt_tokens", result.prompt_tokens) + "," +
    Tep::Json.encode_pair_int("completion_tokens", result.completion_tokens) + "," +
    Tep::Json.encode_pair_int("total_tokens", total_tokens)

  "{" +
    Tep::Json.encode_pair_str("id", request_id) + "," +
    Tep::Json.encode_pair_str("object", "text_completion") + "," +
    Tep::Json.encode_pair_int("created", Time.now.to_i) + "," +
    Tep::Json.encode_pair_str("model", model) + "," +
    "\"choices\":[{" + choice_fields + "}]," +
    "\"usage\":{" + usage_fields + "}" +
  "}"
end