Class: Tep::Llm::OpenAI::CompletionsHandler
- Defined in:
- lib/tep/openai_server.rb
Overview
POST /v1/completions – the token-level, OpenAI-shaped route (the primary completion route). Parses model, prompt (an array of token ids), and max_tokens from the request body, calls backend.generate_from_tokens, and formats the standard text_completion response. Dispatches through APP.openai_backend, so the app’s subclass override is what answers the request.
Instance Method Summary collapse
Methods inherited from Handler
#is_regex?, #re_capture, #re_match?
Instance Method Details
#handle(req, res) ⇒ Object
526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 |
# File 'lib/tep/openai_server.rb', line 526
#
# Serves one POST /v1/completions request in the token-level OpenAI shape.
#
# Reads "model", "prompt" (an int array of token ids) and "max_tokens" from
# the raw JSON body, plus the optional "temperature" and "top_p" floats.
# When the body asks for streaming, the response is handed to a
# CompletionsStreamer and an empty string is returned; otherwise the backend
# is called synchronously, one inference event is emitted, and the
# text_completion JSON document is returned as a String.
#
# @param req request object; this method reads +raw_body+ and
#   +identity.subject+ from it
# @param res response object; this method writes +headers+ and, on the
#   streaming path, calls +start_stream+
# @return [String] the JSON response body, or "" on the streaming path
def handle(req, res)
  payload = req.raw_body
  model = Tep::Json.get_str(payload, "model")
  prompt_ids = Tep::Json.get_int_array(payload, "prompt")

  opts = Tep::Llm::OpenAI::Sampling.new
  # NOTE(review): unlike temperature / top_p below, max_tokens is assigned
  # even when the key is absent, so whatever get_int yields for a missing
  # key replaces the Sampling default -- confirm that is intended.
  opts.max_tokens = Tep::Json.get_int(payload, "max_tokens")
  # Tep::Json.get_float returns 0.0 for a missing key, so the float knobs
  # are only overwritten when the key is actually present; otherwise the
  # Sampling defaults (1.0) stay in place.
  if Tep::Json.has_key?(payload, "temperature")
    opts.temperature = Tep::Json.get_float(payload, "temperature")
  end
  if Tep::Json.has_key?(payload, "top_p")
    opts.top_p = Tep::Json.get_float(payload, "top_p")
  end

  # The same id is used for the streamer, the inference event and the
  # response's "id" field.
  request_id = "cmpl-tep"

  # OpenAI requests streaming with "stream": true. Tep::Json has no bool
  # getter, so the literal is sniffed in the raw body, with and without a
  # space after the colon (same shape as examples/llm_gateway/app.rb).
  if Tep.str_find(payload, "\"stream\":true", 0) >= 0 ||
     Tep.str_find(payload, "\"stream\": true", 0) >= 0
    # SSE path: the CompletionsStreamer pumps the per-token frames and the
    # [DONE] sentinel, then emits the inference event itself.
    res.headers["Content-Type"] = "text/event-stream"
    res.headers["Cache-Control"] = "no-cache"

    pump = Tep::Llm::OpenAI::CompletionsStreamer.new
    pump.model = model
    pump.token_ids = prompt_ids
    pump.sampling = opts
    pump.prompt_tokens = prompt_ids.length
    pump.t0 = Time.now.to_i
    pump.request_id = request_id
    pump.principal_id = req.identity.subject
    res.start_stream(pump)
    return ""
  end

  res.headers["Content-Type"] = "application/json"

  # Time.now only exposes integer epoch seconds under spinel, so the
  # wall_us reported below has one-second resolution. That is coarse but
  # acceptable for seconds-scale LLM serving; it is a placeholder until a
  # microsecond clock helper exists.
  started_at = Time.now.to_i
  result = Tep::APP.openai_backend.generate_from_tokens(model, prompt_ids, opts)

  prompt_count = result.prompt_tokens
  completion_count = result.completion_tokens
  total_count = prompt_count + completion_count
  elapsed_us = (Time.now.to_i - started_at) * 1_000_000

  # One inference event per request (#inference short-circuits internally
  # when events are disabled). request_id matches the response's id;
  # principal_id is the identity populated by the auth filter.
  id_pair = Tep::Json.encode_pair_str("request_id", request_id)
  principal_pair = Tep::Json.encode_pair_str("principal_id", req.identity.subject)
  event_extra = "{" + id_pair + "," + principal_pair + "}"
  Tep::APP.openai_events.inference(model, prompt_count, completion_count, elapsed_us, event_extra)

  # Envelope fields first, so "created" is stamped right after the event.
  envelope = Tep::Json.encode_pair_str("id", request_id) + "," +
             Tep::Json.encode_pair_str("object", "text_completion") + "," +
             Tep::Json.encode_pair_int("created", Time.now.to_i) + "," +
             Tep::Json.encode_pair_str("model", model)

  # IDs-only backends (no detokenizer) return the generated token ids,
  # which go out as choices[0].ids. Text backends leave token_ids empty and
  # the field is omitted, which is the standard OpenAI shape.
  ids_part = ""
  if result.token_ids.length > 0
    ids_part = "\"ids\":" + Tep::Json.from_int_array(result.token_ids) + ","
  end

  choice = "{" +
           Tep::Json.encode_pair_int("index", 0) + "," +
           Tep::Json.encode_pair_str("text", result.text) + "," +
           ids_part +
           Tep::Json.encode_pair_str("finish_reason", result.finish_reason) +
           "}"

  usage = "{" +
          Tep::Json.encode_pair_int("prompt_tokens", prompt_count) + "," +
          Tep::Json.encode_pair_int("completion_tokens", completion_count) + "," +
          Tep::Json.encode_pair_int("total_tokens", total_count) +
          "}"

  "{" + envelope + "," + "\"choices\":[" + choice + "]," + "\"usage\":" + usage + "}"
end