Class: CompletionsHandler

Inherits:
Tep::Handler
  • Object
show all
Defined in:
lib/toy/serve/openai/handlers.rb

Instance Method Summary collapse

Instance Method Details

#handle(req, res) ⇒ Object



44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# File 'lib/toy/serve/openai/handlers.rb', line 44

# Serves one completions request over pre-tokenized input and returns an
# OpenAI-style `text_completion` JSON document.
#
# Fields read from the JSON request body:
# * `prompt` - int array of token IDs; when it yields an empty array the
#   `prompt_ids` key is tried instead.
# * `max_tokens` - optional int; defaults to 16, a non-positive value falls
#   back to 16, and anything above 256 is capped at 256.
#
# Side effects:
# * Sets the `Content-Type` response header to `application/json`.
# * Sets status 400 and returns an `invalid_request_error` body when no
#   prompt IDs are found (no generation, no event in that case).
# * When `TinyNN.tnn_events_active == 1`, increments `STATE.req_seq` and
#   emits exactly one request event through `TinyNN.tnn_events_emit`.
#
# In the success body `choices[0].text` is always the empty string (the
# generated token IDs are returned under `choices[0].ids`) and
# `finish_reason` is always "length".
#
# @param req request object; only `req.body` is read
# @param res response object; receives the header and, on error, the status
# @return [String] newline-terminated JSON response body
def handle(req, res)
  res.headers["Content-Type"] = "application/json"
  body = req.body

  # Accept the prompt as a JSON int array. OpenAI spec allows
  # `prompt: <int-array>` for pre-tokenized input; we require it.
  # "prompt_ids" is accepted as an alternative key when "prompt" is empty.
  prompt_ids = SpinelKit::Json.get_int_array(body, "prompt")
  if prompt_ids.length == 0
    prompt_ids = SpinelKit::Json.get_int_array(body, "prompt_ids")
  end

  # Neither key produced any IDs: reject before doing any compute.
  if prompt_ids.length == 0
    res.set_status(400)
    return "{\"error\":{\"message\":\"prompt must be a non-empty int array " +
           "(this server speaks IDs only; tokenize client-side)\"," +
           "\"type\":\"invalid_request_error\"}}\n"
  end

  # Token budget: default 16, non-positive resets to 16, hard cap of 256.
  n_new = 16
  if SpinelKit::Json.has_key?(body, "max_tokens")
    n_new = SpinelKit::Json.get_int(body, "max_tokens")
  end
  if n_new <= 0; n_new = 16; end
  if n_new > 256; n_new = 256; end

  # Measure generation latency across the single compute call (cheap: two
  # tnn_events_now_seconds reads regardless of whether events are active).
  t_start = TinyNN.tnn_events_now_seconds
  new_ids = api_generate_ids(prompt_ids, n_new)
  t_end   = TinyNN.tnn_events_now_seconds
  prompt_len = prompt_ids.length
  completion_len = new_ids.length
  latency_us = ((t_end - t_start) * 1.0e6).to_i

  # ONE eval/serve/request event per completion (toy/v1). FILE-only side
  # effect: the response String below is byte-UNCHANGED. Guarded by the
  # process-global C events state (tnn_events_active==1 only when the runner
  # opened events.jsonl, i.e. TAO_RUN_DIR was set), so the handler needs no
  # EVENTS-path knowledge. NO emit on the 400 empty-prompt early-return.
  if TinyNN.tnn_events_active == 1
    STATE.req_seq = STATE.req_seq + 1
    req_id = "req-" + STATE.req_seq.to_s
    ev  = "{\"kind\":\"eval\",\"phase\":\"serve\""
    ev = ev + ",\"t\":"    + TinyNN.tnn_events_now_seconds.to_s
    ev = ev + ",\"name\":\"request\""
    ev = ev + ",\"extra\":{"
    # The model name is the one free-form string in the event, so it goes
    # through the same pair encoder the response uses instead of raw
    # concatenation; a name containing `"` or `\` would otherwise produce a
    # malformed event line.
    # NOTE(review): relies on encode_pair_str escaping its value and emitting
    # a compact "key":"value" pair - confirm against SpinelKit::Json.
    ev = ev +   SpinelKit::Json.encode_pair_str("model", STATE.model_name)
    ev = ev +   ",\"prompt_tokens\":"     + prompt_len.to_s
    ev = ev +   ",\"completion_tokens\":" + completion_len.to_s
    ev = ev +   ",\"latency_us\":"        + latency_us.to_s
    ev = ev +   ",\"sampling\":{\"max_tokens\":" + n_new.to_s + "}"
    # req_id is "req-" plus an integer, so it needs no escaping.
    ev = ev +   ",\"request_id\":\"" + req_id + "\""
    ev = ev + "}}"
    TinyNN.tnn_events_emit(ev)
  end

  # Success body. "text" stays empty because this endpoint only deals in IDs;
  # the generated tokens are returned under "ids".
  "{" +
    SpinelKit::Json.encode_pair_str("id", api_gen_id("cmpl")) + "," +
    SpinelKit::Json.encode_pair_str("object", "text_completion") + "," +
    SpinelKit::Json.encode_pair_int("created", api_now_unix) + "," +
    SpinelKit::Json.encode_pair_str("model", STATE.model_name) + "," +
    "\"choices\":[{\"index\":0," +
      SpinelKit::Json.encode_pair_str("text", "") + "," +
      "\"ids\":" + SpinelKit::Json.from_int_array(new_ids) + "," +
      "\"finish_reason\":\"length\"}]," +
    "\"usage\":{" +
      SpinelKit::Json.encode_pair_int("prompt_tokens", prompt_len) + "," +
      SpinelKit::Json.encode_pair_int("completion_tokens", completion_len) + "," +
      SpinelKit::Json.encode_pair_int("total_tokens", prompt_len + completion_len) +
    "}}\n"
end