Class: Tep::Llm::OpenAI::CompletionsHandler
- Defined in:
- lib/tep/openai_server.rb
Overview
POST /v1/completions – the token-level OpenAI shape (the primary completion route). Parses model, prompt (an array of token ids), and max_tokens from the request body, calls backend.generate_from_tokens, and formats the standard text_completion response. The backend is looked up through APP.openai_backend, so the app’s subclass override is the one that answers.
Instance Method Summary collapse
Methods inherited from Handler
#is_regex?, #re_capture, #re_match?
Instance Method Details
#handle(req, res) ⇒ Object
502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 |
# File 'lib/tep/openai_server.rb', line 502

# Serves one POST /v1/completions request.
#
# Pulls "model", "prompt" (an array of token ids) and the sampling
# settings out of the raw JSON body, then takes one of two paths:
# hand the request to a CompletionsStreamer for an SSE response, or
# run the backend synchronously and return a text_completion JSON
# document.
#
# @param req request object; this method reads +raw_body+ and
#   +identity.subject+ from it
# @param res response object; this method writes +headers+ and, when
#   streaming, calls +start_stream+
# @return [String] "" when the response is streamed, otherwise the
#   JSON response body
def handle(req, res)
  payload = req.raw_body
  request_id = "cmpl-tep"

  model_name = Tep::Json.get_str(payload, "model")
  prompt_ids = Tep::Json.get_int_array(payload, "prompt")

  params = Tep::Llm::OpenAI::Sampling.new
  # NOTE(review): max_tokens is written unconditionally, unlike the
  # two float settings below -- confirm what Tep::Json.get_int yields
  # for a missing key and whether that is the intended default.
  params.max_tokens = Tep::Json.get_int(payload, "max_tokens")

  # Tep::Json.get_float reports 0.0 for an absent key, so the float
  # settings are only overwritten when the key is actually present;
  # otherwise the Sampling defaults (1.0) stay in place.
  params.temperature = Tep::Json.get_float(payload, "temperature") if Tep::Json.has_key?(payload, "temperature")
  params.top_p = Tep::Json.get_float(payload, "top_p") if Tep::Json.has_key?(payload, "top_p")

  # Streaming is requested with "stream": true in the body. Tep::Json
  # has no bool getter, so the literal is sniffed in its two common
  # spellings (same shape as examples/llm_gateway/app.rb). The second
  # probe only runs when the first one misses.
  streaming = Tep.str_find(payload, "\"stream\":true", 0) >= 0
  unless streaming
    streaming = Tep.str_find(payload, "\"stream\": true", 0) >= 0
  end

  if streaming
    # SSE path: the streamer pumps per-token frames plus the [DONE]
    # sentinel, then emits the inference event itself.
    res.headers["Content-Type"] = "text/event-stream"
    res.headers["Cache-Control"] = "no-cache"

    sse = Tep::Llm::OpenAI::CompletionsStreamer.new
    sse.model = model_name
    sse.token_ids = prompt_ids
    sse.sampling = params
    sse.prompt_tokens = prompt_ids.length
    sse.t0 = Time.now.to_i
    sse.request_id = request_id
    sse.principal_id = req.identity.subject
    res.start_stream(sse)
    return ""
  end

  res.headers["Content-Type"] = "application/json"

  # Wall-clock bracket for the inference event. Under spinel Time.now
  # only exposes integer epoch seconds, so the microsecond figure has
  # second resolution -- coarse, but adequate for seconds-scale LLM
  # serving until a microsecond clock helper exists.
  started_at = Time.now.to_i
  completion = Tep::APP.openai_backend.generate_from_tokens(model_name, prompt_ids, params)
  total_tokens = completion.prompt_tokens + completion.completion_tokens
  elapsed_us = (Time.now.to_i - started_at) * 1_000_000

  # One inference event per request. request_id mirrors the "id" in
  # the JSON response; principal_id is the identity populated by the
  # auth filter.
  event_extra = "{" +
    Tep::Json.encode_pair_str("request_id", request_id) + "," +
    Tep::Json.encode_pair_str("principal_id", req.identity.subject) +
    "}"
  Tep::APP.openai_events.inference(
    model_name,
    completion.prompt_tokens,
    completion.completion_tokens,
    elapsed_us,
    event_extra
  )

  # Assemble the text_completion document from three fragments:
  # top-level fields, the single choice, and the usage counters.
  head_json = Tep::Json.encode_pair_str("id", request_id) + "," +
    Tep::Json.encode_pair_str("object", "text_completion") + "," +
    Tep::Json.encode_pair_int("created", Time.now.to_i) + "," +
    Tep::Json.encode_pair_str("model", model_name)

  choice_json = "{" +
    Tep::Json.encode_pair_int("index", 0) + "," +
    Tep::Json.encode_pair_str("text", completion.text) + "," +
    Tep::Json.encode_pair_str("finish_reason", "stop") +
    "}"

  usage_json = "{" +
    Tep::Json.encode_pair_int("prompt_tokens", completion.prompt_tokens) + "," +
    Tep::Json.encode_pair_int("completion_tokens", completion.completion_tokens) + "," +
    Tep::Json.encode_pair_int("total_tokens", total_tokens) +
    "}"

  "{" + head_json + "," + "\"choices\":[" + choice_json + "]," + "\"usage\":" + usage_json + "}"
end