Class: LlmMetaClient::ServerQuery

Inherits:
Object
Defined in:
lib/llm_meta_client/server_query.rb

Instance Method Summary

#call(id_token, api_key_uuid, model_id, context, user_content, tool_ids: [], generation_settings: {}) ⇒ Object

#stream(id_token, api_key_uuid, model_id, context, user_content, tool_ids: [], generation_settings: {}) ⇒ Object

Instance Method Details

#call(id_token, api_key_uuid, model_id, context, user_content, tool_ids: [], generation_settings: {}) ⇒ Object



# File 'lib/llm_meta_client/server_query.rb', line 45

def call(id_token, api_key_uuid, model_id, context, user_content, tool_ids: [], generation_settings: {})
  debug_log "Context: #{context}"
  context_and_user_content = "Context:#{context}, User Prompt: #{user_content}"
  debug_log "Request to LLM: \n===>\n#{context_and_user_content}\n===>"

  response = request(api_key_uuid, id_token, model_id, context_and_user_content, tool_ids, generation_settings)

  unless response.success?
    raise Exceptions::ServerError, build_error_message(response.code.to_i, response.parsed_response)
  end

  response_body = response.parsed_response

  raise Exceptions::InvalidResponseError, "LLM server returned non-JSON response" unless response_body.is_a?(Hash)

  content = response_body.dig("response", "message") || ""
  tool_calls = response_body.dig("response", "tool_calls")
  content = combine_with_tool_calls(content, tool_calls) if tool_calls.is_a?(Array) && tool_calls.any?

  raise Exceptions::EmptyResponseError, "LLM server returned empty response" if content.blank?

  debug_log "Response from LLM: \n<===\n#{content}\n<==>"

  content
end
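
A minimal usage sketch for #call. The credential strings, model id, and generation_settings key below are placeholders, and both the zero-argument ServerQuery.new and the LlmMetaClient::Exceptions namespace are assumptions not confirmed by this page (the source references Exceptions:: relative to its own namespace):

query = LlmMetaClient::ServerQuery.new # assumed zero-argument constructor

begin
  content = query.call(
    "ID_TOKEN",                      # placeholder identity token
    "API_KEY_UUID",                  # placeholder API key UUID
    "MODEL_ID",                      # placeholder model identifier
    "You are a concise assistant.",  # context
    "Summarize the release notes.",  # user_content
    generation_settings: { temperature: 0.2 } # assumed settings key
  )
  puts content
rescue LlmMetaClient::Exceptions::ServerError => e
  warn "LLM request failed: #{e.message}"
end

On success, #call returns the assistant content as a String; any tool calls are already merged into that string by combine_with_tool_calls. InvalidResponseError and EmptyResponseError can also be raised, as the source above shows.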

#stream(id_token, api_key_uuid, model_id, context, user_content, tool_ids: [], generation_settings: {}) ⇒ Object

Stream LLM responses incrementally. Yields each content delta event ({ event: "message", data: { "delta" => "…" } }) and any tool_calls event ({ event: "tool_calls", data: { "tool_calls" => […] } }) to the caller's block. Upstream "done" markers are absorbed (end of stream is signaled by the method returning); upstream "error" events raise ServerError. Returns the final assistant content. If tool calls fired, the returned string mirrors the synchronous #call format (the response with a markdown "Tool calls" section appended) so persistence stays consistent.



# File 'lib/llm_meta_client/server_query.rb', line 15

def stream(id_token, api_key_uuid, model_id, context, user_content, tool_ids: [], generation_settings: {})
  context_and_user_content = "Context:#{context}, User Prompt: #{user_content}"
  debug_log "Streaming request to LLM: \n===>\n#{context_and_user_content}\n===>"

  body = { prompt: context_and_user_content }
  body[:tool_ids] = tool_ids if tool_ids.present?
  body[:generation_settings] = generation_settings if generation_settings.present?

  assembled = +""
  collected_tool_calls = []
  request_stream(api_key_uuid, id_token, model_id, body) do |event|
    case event[:event]
    when "message"
      assembled << event[:data]["delta"].to_s
      yield event if block_given?
    when "tool_calls"
      collected_tool_calls = event[:data]["tool_calls"] || []
      yield event if block_given?
    when "done"
      # End-of-stream marker from upstream; no-op here.
    when "error"
      raise Exceptions::ServerError, format_stream_error(event[:data])
    else
      yield event if block_given?
    end
  end

  collected_tool_calls.any? ? combine_with_tool_calls(assembled, collected_tool_calls) : assembled
end
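
A minimal consumption sketch for #stream: the block prints each delta as it arrives and notes any tool calls, matching the event shapes described above. The placeholder credentials and the zero-argument ServerQuery.new constructor are assumptions, as in the #call example:

query = LlmMetaClient::ServerQuery.new # assumed zero-argument constructor

final_content = query.stream(
  "ID_TOKEN",                      # placeholder identity token
  "API_KEY_UUID",                  # placeholder API key UUID
  "MODEL_ID",                      # placeholder model identifier
  "You are a concise assistant.",  # context
  "Summarize the release notes."   # user_content
) do |event|
  case event[:event]
  when "message"
    print event[:data]["delta"] # incremental assistant text
  when "tool_calls"
    puts "\n[tool calls: #{event[:data]['tool_calls'].inspect}]"
  end
end

After the block returns, final_content holds the fully assembled response (with the tool-calls section appended if any fired), the same string #call would have returned.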