Class: Clacky::Client

Inherits:

Object

Object
Clacky::Client

show all

Defined in: lib/clacky/client.rb

Constant Summary collapse

MAX_RETRIES =

RETRY_DELAY = seconds

Instance Attribute Summary collapse

#provider_id ⇒ Object readonly
Returns the value of attribute provider_id.

Instance Method Summary collapse

#add_cache_control_to_message(msg) ⇒ Object
Wrap or extend the message's content with a cache_control marker.
#anthropic_connection ⇒ Object
#anthropic_format?(model = nil) ⇒ Boolean
Returns true when the client uses the Anthropic-native message format (decided at construction time from the provider/model preference or the explicit anthropic_format flag) and is not routed through Bedrock.
#apply_message_caching(messages) ⇒ Object
Add cache_control markers to the last 2 messages in the array.
#bedrock? ⇒ Boolean
Returns true when the client is using the AWS Bedrock Converse API.
#bedrock_connection ⇒ Object
#bedrock_endpoint(model) ⇒ Object
Bedrock Converse API endpoint path for a given model ID.
#check_html_response(response) ⇒ Object
Raise a friendly error if the response body is HTML (e.g. gateway error page returned with 200).
#deep_clone(obj) ⇒ Object
── Utilities ─────────────────────────────────────────────────────────────.
#extract_error_message(error_body, raw_body) ⇒ Object
#format_tool_results(response, tool_results, model:) ⇒ Object
Format tool results into canonical messages ready to append to @messages.
#handle_test_response(response) ⇒ Object
── Error handling ────────────────────────────────────────────────────────.
#initialize(api_key, base_url:, model:, anthropic_format: false, read_timeout: nil) ⇒ Client constructor
A new instance of Client.
#is_compression_instruction?(message) ⇒ Boolean
#openai_connection ⇒ Object
#parse_simple_anthropic_response(response) ⇒ Object
#parse_simple_bedrock_response(response) ⇒ Object
#parse_simple_openai_response(response) ⇒ Object
#raise_error(response) ⇒ Object
#reset_connections! ⇒ Object
#safe_json_parse(json_string, context: "response") ⇒ Hash, Array
Parse JSON with user-friendly error messages.
#send_anthropic_request(messages, model, tools, max_tokens, caching_enabled, reasoning_effort: nil, on_chunk: nil) ⇒ Object
── Anthropic request / response ──────────────────────────────────────────.
#send_bedrock_request(messages, model, tools, max_tokens, caching_enabled, reasoning_effort: nil, on_chunk: nil) ⇒ Object
── Bedrock Converse request / response ───────────────────────────────────.
#send_message(content, model:, max_tokens:) ⇒ Object
Send a single string message and return the reply text.
#send_messages(messages, model:, max_tokens:) ⇒ Object
Send a messages array and return the reply text.
#send_messages_with_tools(messages, model:, tools:, max_tokens:, enable_caching: false, reasoning_effort: nil, on_chunk: nil) ⇒ Object
Send messages with tool-calling support.
#send_openai_request(messages, model, tools, max_tokens, caching_enabled, reasoning_effort: nil, on_chunk: nil) ⇒ Object
── OpenAI request / response ─────────────────────────────────────────────.
#supports_prompt_caching?(model) ⇒ Boolean
Returns true for Claude models that support prompt caching (gen 3.5+ or gen 4+).
#test_connection(model:) ⇒ Object
Test API connection by sending a minimal request.

Constructor Details

#initialize(api_key, base_url:, model:, anthropic_format: false, read_timeout: nil) ⇒ `Client`

Returns a new instance of Client.

# File 'lib/clacky/client.rb', line 13

# Builds a client bound to one API key, base URL and model.
#
# @param api_key [String] credential placed in the request headers
# @param base_url [String] URL the HTTP connections are opened against
# @param model [String] model name used for Bedrock and provider detection
# @param anthropic_format [Boolean] explicit fallback flag, used when the
#   resolved provider does not itself prefer the Anthropic format
# @param read_timeout [Integer, nil] optional read-timeout override; nil keeps
#   the 300s default applied by the connection builders
def initialize(api_key, base_url:, model:, anthropic_format: false, read_timeout: nil)
  @api_key  = api_key
  @base_url = base_url
  @model    = model

  # Optional override for Faraday read_timeout (e.g. benchmark calls).
  @read_timeout = read_timeout

  # Detect Bedrock: ABSK key prefix (native AWS) or abs- model prefix (Clacky AI proxy)
  @use_bedrock = MessageFormat::Bedrock.bedrock_api_key?(api_key, model)

  # Resolve the provider once and remember it: it drives the format decision
  # below and lets the connection builders tune headers per provider.
  @provider_id = Providers.resolve_provider(base_url: @base_url, api_key: @api_key)

  # Provider+model preference wins; the explicit constructor flag is the
  # fallback for unknown providers / custom base_urls. This lets e.g.
  # OpenRouter's Claude models auto-route to the native /v1/messages endpoint
  # without requiring any change to user YAML.
  @use_anthropic_format =
    (@provider_id && Providers.anthropic_format_for_model?(@provider_id, @model)) ||
    anthropic_format
end

Instance Attribute Details

#provider_id ⇒ `Object` (readonly)

Returns the value of attribute provider_id.



11
12
13

# File 'lib/clacky/client.rb', line 11

# Read-only accessor for the provider id stored by the constructor.
#
# @return [Object] the current value of @provider_id
def provider_id
  @provider_id
end

Instance Method Details

#add_cache_control_to_message(msg) ⇒ `Object`

Wrap or extend the message's content with a cache_control marker.

# File 'lib/clacky/client.rb', line 443

# Returns a copy of +msg+ whose content carries an ephemeral cache_control marker.
#
# - String content is wrapped into a single text block that holds the marker.
# - Array content gets the marker merged into its last block only.
# - Any other content type leaves the message untouched (same object returned).
#
# @param msg [Hash] message with a :content key
# @return [Hash] a merged copy, or +msg+ itself when nothing can be marked
def add_cache_control_to_message(msg)
  body = msg[:content]

  if body.is_a?(String)
    wrapped = [{ type: "text", text: body, cache_control: { type: "ephemeral" } }]
    return msg.merge(content: wrapped)
  end
  return msg unless body.is_a?(Array)

  last_position = body.length - 1
  marked = body.each_with_index.map do |block, position|
    position == last_position ? block.merge(cache_control: { type: "ephemeral" }) : block
  end

  msg.merge(content: marked)
end

#anthropic_connection ⇒ `Object`

# File 'lib/clacky/client.rb', line 556

# Lazily builds and memoizes the Faraday connection for Anthropic-format calls.
#
# The memoized connection is rebuilt when none exists yet, or when the
# Clacky::ProxyConfig epoch differs from the one recorded at build time.
#
# NOTE(review): TLS certificate verification is disabled (ssl.verify = false).
# Confirm this is intentional before relying on this connection for
# untrusted networks.
#
# @return [Object] the connection returned by Faraday.new
def anthropic_connection
  current_epoch = Clacky::ProxyConfig.epoch
  epoch_changed = !@anthropic_connection_epoch.nil? &&
                  @anthropic_connection_epoch != current_epoch
  return @anthropic_connection unless @anthropic_connection.nil? || epoch_changed

  @anthropic_connection = Faraday.new(url: @base_url) do |conn|
    headers = conn.headers
    headers["Content-Type"]      = "application/json"
    headers["x-api-key"]         = @api_key
    headers["anthropic-version"] = "2023-06-01"
    headers["anthropic-dangerous-direct-browser-access"] = "true"

    # OpenRouter: send the Bearer header in addition to x-api-key.
    headers["Authorization"] = "Bearer #{@api_key}" if @provider_id == "openrouter"

    # Moonshot's Kimi Code (Coding Plan) endpoint enforces a User-Agent
    # prefix whitelist limited to first-party coding agents.
    headers["User-Agent"] = "claude-cli/1.0.51 (external, cli)" if @provider_id == "kimi-coding"

    options = conn.options
    options.timeout      = @read_timeout || 300
    options.open_timeout = 10
    conn.ssl.verify      = false
    conn.adapter Faraday.default_adapter
  end
  @anthropic_connection_epoch = current_epoch

  @anthropic_connection
end

#anthropic_format?(model = nil) ⇒ `Boolean`

Returns true when the client uses the Anthropic-native message format (decided at construction time from the provider/model preference or the explicit anthropic_format flag) and is not routed through Bedrock.

Returns:

(Boolean)



49
50
51

# File 'lib/clacky/client.rb', line 49

# Tells whether requests use the Anthropic message format.
#
# The answer is the construction-time @use_anthropic_format decision, vetoed
# when the client is in Bedrock mode.
#
# @param model [Object, nil] accepted for call-site compatibility; not read
# @return [Boolean, nil] falsy (false or nil) when the Anthropic format is
#   off, otherwise the negation of the Bedrock flag
def anthropic_format?(model = nil)
  anthropic = @use_anthropic_format
  return anthropic unless anthropic

  !@use_bedrock
end

#apply_message_caching(messages) ⇒ `Object`

Add cache_control markers to the last 2 messages in the array.

Why 2 markers:

Turn N   — marks messages[-2] and messages[-1]; server caches prefix up to [-1]
Turn N+1 — messages[-2] is Turn N's last message (still marked) → cache READ hit;
         messages[-1] is the new message (marked) → cache WRITE for Turn N+2

With only 1 marker (old behavior): Turn N marks messages[-1]; in Turn N+1 that same message is now messages[-2] and carries no marker → server sees a different prefix → cache MISS.

Compression instructions (system_injected: true) are skipped — we never want to cache those ephemeral injection messages.

# File 'lib/clacky/client.rb', line 426

# Marks up to two trailing messages with cache_control and returns the new list.
#
# Walks the array from the tail, skipping compression instructions, and picks
# the first two remaining positions. Only those messages are passed through
# add_cache_control_to_message; all others are returned as-is.
#
# @param messages [Array<Hash>] conversation messages
# @return [Array<Hash>] +messages+ itself when empty, otherwise a new array
def apply_message_caching(messages)
  return messages if messages.empty?

  # Lazy scan from the tail so the walk stops once two candidates are found.
  marked_positions = (0...messages.length)
                     .reverse_each
                     .lazy
                     .reject { |position| is_compression_instruction?(messages[position]) }
                     .first(2)

  messages.each_with_index.map do |message, position|
    marked_positions.include?(position) ? add_cache_control_to_message(message) : message
  end
end

#bedrock? ⇒ `Boolean`

Returns true when the client is using the AWS Bedrock Converse API.

Returns:

(Boolean)



43
44
45

# File 'lib/clacky/client.rb', line 43

# Reports the Bedrock flag computed by the constructor.
#
# @return [Boolean] the current value of @use_bedrock
def bedrock?
  @use_bedrock
end

#bedrock_connection ⇒ `Object`

# File 'lib/clacky/client.rb', line 522

# Lazily builds and memoizes the Faraday connection for Bedrock calls.
#
# The memoized connection is rebuilt when none exists yet, or when the
# Clacky::ProxyConfig epoch differs from the one recorded at build time.
#
# NOTE(review): TLS certificate verification is disabled (ssl.verify = false).
# Confirm this is intentional.
#
# @return [Object] the connection returned by Faraday.new
def bedrock_connection
  current_epoch = Clacky::ProxyConfig.epoch
  epoch_changed = !@bedrock_connection_epoch.nil? &&
                  @bedrock_connection_epoch != current_epoch
  return @bedrock_connection unless @bedrock_connection.nil? || epoch_changed

  @bedrock_connection = Faraday.new(url: @base_url) do |conn|
    headers = conn.headers
    headers["Content-Type"]  = "application/json"
    headers["Authorization"] = "Bearer #{@api_key}"

    options = conn.options
    options.timeout      = @read_timeout || 300
    options.open_timeout = 10
    conn.ssl.verify      = false
    conn.adapter Faraday.default_adapter
  end
  @bedrock_connection_epoch = current_epoch

  @bedrock_connection
end

#bedrock_endpoint(model) ⇒ `Object`

Bedrock Converse API endpoint path for a given model ID.



467
468
469

# File 'lib/clacky/client.rb', line 467

# Builds the Bedrock Converse endpoint path for a model id.
#
# @param model [#to_s] model id inserted verbatim into the path
# @return [String] "/model/<model>/converse"
def bedrock_endpoint(model)
  format("/model/%s/converse", model)
end

#check_html_response(response) ⇒ `Object`

Raise a friendly error if the response body is HTML (e.g. gateway error page returned with 200)

# File 'lib/clacky/client.rb', line 674

# Raises a retryable error when the response body is an HTML page
# (e.g. a gateway error page returned with status 200).
#
# Leading whitespace is ignored; the body counts as HTML when it starts with
# one of the doctype / html-tag prefixes listed below.
#
# @param response [#body] HTTP response
# @return [nil] when the body is not HTML
# @raise [RetryableError] when the body looks like HTML
def check_html_response(response)
  html_prefixes = ["<!DOCTYPE", "<!doctype", "<html", "<HTML"]
  leading = response.body.to_s.lstrip
  return unless leading.start_with?(*html_prefixes)

  raise RetryableError, "[LLM] #{I18n.t("llm.error.html_response")}"
end

#deep_clone(obj) ⇒ `Object`

── Utilities ─────────────────────────────────────────────────────────────

# File 'lib/clacky/client.rb', line 766

# Recursively copies nested Hash / Array containers.
#
# Hashes and Arrays are rebuilt at every depth; any other value (strings,
# numbers, nil, ...) is returned as the very same object, so leaves are shared
# between the original and the copy.
#
# @param obj [Object] structure to copy
# @return [Object] a structurally independent copy of the containers
def deep_clone(obj)
  if obj.is_a?(Hash)
    copy = {}
    obj.each_pair { |key, value| copy[key] = deep_clone(value) }
    copy
  elsif obj.is_a?(Array)
    obj.map { |element| deep_clone(element) }
  else
    obj
  end
end

#extract_error_message(error_body, raw_body) ⇒ `Object`

# File 'lib/clacky/client.rb', line 688

# Picks the most useful human-readable message out of an error response.
#
# Resolution order:
#   1. an HTML body (doctype / html tag, any letter case) gets a fixed hint
#   2. with no parsed Hash: "(empty response body)" for a blank body,
#      otherwise the raw body itself
#   3. a non-empty "upstreamMessage"
#   4. whatever extract_upstream_error finds in a Hash-valued "error"
#   5. "message"
#   6. a String-valued "error"
#   7. the first 201 characters of the raw body ("..." appended when cut)
#
# @param error_body [Hash, Object, nil] parsed JSON body, if parsing succeeded
# @param raw_body [String, nil] unparsed response body
# @return [Object] the chosen message (usually a String)
def extract_error_message(error_body, raw_body)
  if raw_body.is_a?(String)
    head = raw_body.strip
    # ASCII case-insensitive prefix test so "<!doctype html>" and "<HTML>" are
    # recognised as well, matching what check_html_response treats as HTML.
    looks_like_html = ["<!doctype", "<html"].any? do |prefix|
      head[0, prefix.length].to_s.casecmp(prefix) == 0
    end
    return "Invalid API endpoint or server error (received HTML instead of JSON)" if looks_like_html
  end

  return "(empty response body)" if raw_body.to_s.strip.empty? && !error_body.is_a?(Hash)
  return raw_body unless error_body.is_a?(Hash)

  error_body["upstreamMessage"]&.then { |m| return m unless m.empty? }

  if error_body["error"].is_a?(Hash)
    upstream_msg = extract_upstream_error(error_body["error"])
    return upstream_msg if upstream_msg
  end

  error_body["message"]&.then { |m| return m }
  return error_body["error"] if error_body["error"].is_a?(String)

  # Last resort: a bounded excerpt of the raw body.
  raw_text = raw_body.to_s
  raw_text[0..200] + (raw_text.length > 200 ? "..." : "")
end

#format_tool_results(response, tool_results, model:) ⇒ `Object`

Format tool results into canonical messages ready to append to @messages. Always returns canonical format (role: "tool") regardless of API type — conversion to API-native happens inside each send_*_request.

# File 'lib/clacky/client.rb', line 183

# Formats tool results into messages ready to append to the conversation.
#
# The work is delegated to the MessageFormat module matching the active API
# flavour (Bedrock, Anthropic, or OpenAI as the default). An empty result
# list short-circuits to [].
#
# @param response [Object] the model response that requested the tools
# @param tool_results [Array] results to format
# @param model [Object] accepted for interface compatibility; not read here
# @return [Array] formatted messages
def format_tool_results(response, tool_results, model:)
  return [] if tool_results.empty?

  formatter =
    if bedrock?
      MessageFormat::Bedrock
    elsif anthropic_format?
      MessageFormat::Anthropic
    else
      MessageFormat::OpenAI
    end

  formatter.format_tool_results(response, tool_results)
end

#handle_test_response(response) ⇒ `Object`

── Error handling ────────────────────────────────────────────────────────

# File 'lib/clacky/client.rb', line 601

# Converts the HTTP response of a connection test into a result hash.
#
# Status handling mirrors raise_error so both paths give the same diagnosis:
#   - 400 is reported as a rate limit only when the upstream message looks
#     like throttling; any other 400 is reported as a bad request
#   - 403 falls back to the default 403 text when no translation exists for
#     the specific error code (I18n.t handing back the key itself)
#
# @param response [#status, #body] HTTP response of the test request
# @return [Hash] { success: true, status: } on 200, otherwise
#   { success: false, status:, error:, error_code: }
def handle_test_response(response)
  return { success: true, status: response.status } if response.status == 200

  error_body = JSON.parse(response.body) rescue nil
  error_code = extract_error_code(error_body)

  translated =
    case response.status
    when 402
      I18n.t("llm.error.insufficient_credit")
    when 400
      upstream = extract_error_message(error_body, response.body).to_s
      if upstream.match?(/ThrottlingException|unavailable|quota/i)
        I18n.t("llm.error.rate_limit_400")
      else
        I18n.t("llm.error.bad_request")
      end
    when 401
      I18n.t("llm.error.invalid_api_key")
    when 403
      i18n_key = "llm.error.403.#{error_code || "default"}"
      text = I18n.t(i18n_key)
      # An untranslated key comes back unchanged; show the default text instead.
      text == i18n_key ? I18n.t("llm.error.403.default") : text
    when 404
      I18n.t("llm.error.endpoint_not_found")
    when 429
      I18n.t("llm.error.rate_limit_429")
    when 500..599
      I18n.t("llm.error.server_error", status: response.status)
    else
      extract_error_message(error_body, response.body)
    end

  {
    success:    false,
    status:     response.status,
    error:      translated,
    error_code: error_code
  }
end

#is_compression_instruction?(message) ⇒ `Boolean`

Returns:

(Boolean)



460
461
462

# File 'lib/clacky/client.rb', line 460

# Tells whether a message is a system-injected compression instruction.
#
# Only a Hash whose :system_injected entry equals true qualifies.
#
# @param message [Object] candidate message
# @return [Boolean]
def is_compression_instruction?(message)
  return false unless message.is_a?(Hash)

  message[:system_injected] == true
end

#openai_connection ⇒ `Object`

# File 'lib/clacky/client.rb', line 539

# Lazily builds and memoizes the Faraday connection for OpenAI-compatible calls.
#
# The memoized connection is rebuilt when none exists yet, or when the
# Clacky::ProxyConfig epoch differs from the one recorded at build time.
#
# NOTE(review): TLS certificate verification is disabled (ssl.verify = false).
# Confirm this is intentional.
#
# @return [Object] the connection returned by Faraday.new
def openai_connection
  current_epoch = Clacky::ProxyConfig.epoch
  epoch_changed = !@openai_connection_epoch.nil? &&
                  @openai_connection_epoch != current_epoch
  return @openai_connection unless @openai_connection.nil? || epoch_changed

  @openai_connection = Faraday.new(url: @base_url) do |conn|
    headers = conn.headers
    headers["Content-Type"]  = "application/json"
    headers["Authorization"] = "Bearer #{@api_key}"

    options = conn.options
    options.timeout      = @read_timeout || 300
    options.open_timeout = 10
    conn.ssl.verify      = false
    conn.adapter Faraday.default_adapter
  end
  @openai_connection_epoch = current_epoch

  @openai_connection
end

#parse_simple_anthropic_response(response) ⇒ `Object`

# File 'lib/clacky/client.rb', line 328

# Extracts the reply text from a non-streaming Anthropic-format response.
#
# Any non-200 status is handed to raise_error. Otherwise the body is parsed
# and the "text" of every content block typed "text" is concatenated.
#
# @param response [#status, #body] HTTP response
# @return [String] concatenated text ("" when there are no text blocks)
def parse_simple_anthropic_response(response)
  raise_error(response) unless response.status == 200

  payload = safe_json_parse(response.body, context: "LLM response")
  blocks = payload["content"] || []
  text_blocks = blocks.select { |block| block["type"] == "text" }
  text_blocks.map { |block| block["text"] }.join("")
end

#parse_simple_bedrock_response(response) ⇒ `Object`

# File 'lib/clacky/client.rb', line 269

# Extracts the reply text from a non-streaming Bedrock Converse response.
#
# Any non-200 status is handed to raise_error. Otherwise the body is parsed
# and the "text" of every block under output.message.content that has one
# is concatenated.
#
# @param response [#status, #body] HTTP response
# @return [String] concatenated text ("" when there are no text blocks)
def parse_simple_bedrock_response(response)
  raise_error(response) unless response.status == 200

  payload = safe_json_parse(response.body, context: "LLM response")
  blocks = payload.dig("output", "message", "content") || []
  with_text = blocks.select { |block| block["text"] }
  with_text.map { |block| block["text"] }.join("")
end

#parse_simple_openai_response(response) ⇒ `Object`

# File 'lib/clacky/client.rb', line 396

# Extracts the reply text from a non-streaming OpenAI-compatible response.
#
# Any non-200 status is handed to raise_error. A 200 response without
# choices[0].message.content is logged (when Clacky::Logger is loaded) and
# reported as an error that includes up to 1200 characters of the body.
#
# @param response [#status, #body] HTTP response
# @return [Object] the content of the first choice
# @raise [Clacky::Error] when the first choice has no content
def parse_simple_openai_response(response)
  raise_error(response) unless response.status == 200

  payload = safe_json_parse(response.body, context: "LLM response")
  reply = payload.dig("choices", 0, "message", "content")
  return reply unless reply.nil?

  snippet = response.body.to_s[0, 1200]
  if defined?(Clacky::Logger)
    Clacky::Logger.warn("[parse_simple_openai_response] no content. status=#{response.status} body=#{snippet}")
  end

  raise Clacky::Error,
    "Upstream OpenAI-compatible response missing choices[0].message.content. " \
    "Body snippet: #{snippet}"
end

#raise_error(response) ⇒ `Object`

# File 'lib/clacky/client.rb', line 626

# Translates a failed HTTP response into the matching exception. Always raises.
#
# The body is parsed best-effort as JSON, a message and an error code are
# extracted, the failure is logged, and then:
#   - error code "insufficient_credit" or status 402 → InsufficientCreditError
#   - 400 with a throttling-like message             → RetryableError
#   - any other 400                                   → BadRequestError
#   - 401 / 403 / 404 and unlisted statuses           → AgentError
#   - 429 and 5xx                                     → RetryableError
#
# @param response [#status, #body] the failed HTTP response
# @raise [InsufficientCreditError, RetryableError, BadRequestError, AgentError]
def raise_error(response)
  error_body    = JSON.parse(response.body) rescue nil
  error_message = extract_error_message(error_body, response.body)
  error_code    = extract_error_code(error_body)

  Clacky::Logger.warn("client.raise_error",
    status: response.status,
    body: response.body.to_s[0, 2000],
    error_message: error_message.to_s[0, 500],
    error_code: error_code
  )

  if error_code == "insufficient_credit" || response.status == 402
    raise InsufficientCreditError.new(
      "[LLM] #{I18n.t("llm.error.insufficient_credit")}",
      error_code: "insufficient_credit",
      provider_id: @provider_id,
      raw_message: error_message
    )
  end

  case response.status
  when 400
    # The extracted message is not guaranteed to be a String (a JSON "message"
    # may be an object), so stringify before pattern matching — the same guard
    # the logger call above applies.
    if error_message.to_s.match?(/ThrottlingException|unavailable|quota/i)
      raise RetryableError, "[LLM] #{I18n.t("llm.error.rate_limit_400")}"
    end

    raise BadRequestError.new(
      "[LLM] Client request error: #{error_message}",
      display_message: "[LLM] #{I18n.t("llm.error.bad_request")}",
      raw_message: error_message
    )
  when 401
    raise AgentError.new("[LLM] #{I18n.t("llm.error.invalid_api_key")}", raw_message: error_message)
  when 403
    # An untranslated key comes back unchanged; fall back to the default text.
    i18n_key = "llm.error.403.#{error_code}"
    translated = I18n.t(i18n_key)
    translated = I18n.t("llm.error.403.default") if translated == i18n_key
    raise AgentError.new("[LLM] #{translated}", raw_message: error_message)
  when 404
    raise AgentError.new("[LLM] #{I18n.t("llm.error.endpoint_not_found")}", raw_message: error_message)
  when 429 then raise RetryableError, "[LLM] #{I18n.t("llm.error.rate_limit_429")}"
  when 500..599 then raise RetryableError, "[LLM] #{I18n.t("llm.error.server_error", status: response.status)}"
  else raise AgentError.new("[LLM] #{I18n.t("llm.error.unexpected", status: response.status)}", raw_message: error_message)
  end
end

#reset_connections! ⇒ `Object`

# File 'lib/clacky/client.rb', line 516

# Drops all memoized HTTP connections so the next access rebuilds them.
#
# @return [nil]
def reset_connections!
  @bedrock_connection = @openai_connection = @anthropic_connection = nil
end

#safe_json_parse(json_string, context: "response") ⇒ `Hash`, `Array`

Parse JSON with user-friendly error messages.

Parameters:

json_string (String) —
the JSON string to parse
context (String) (defaults to: "response") —
a description of what's being parsed (e.g., "LLM response")

Returns:

(Hash, Array) —
the parsed JSON

Raises:

(RetryableError) —
if parsing fails (indicates a malformed LLM response)

# File 'lib/clacky/client.rb', line 731

# Parses JSON, turning parse failures into a user-friendly retryable error.
#
# Failures are usually caused by:
#   1. Incomplete/truncated LLM response (network issue, timeout)
#   2. LLM service returned malformed data
#   3. Proxy/gateway corruption
#
# A non-String input such as nil makes JSON.parse raise TypeError rather than
# JSON::ParserError; both are handled, so a missing body is reported as
# "received empty response" instead of escaping as a raw TypeError.
#
# @param json_string [String, nil] the JSON text to parse
# @param context [String] description of what is being parsed (e.g. "LLM response")
# @return [Hash, Array] the parsed JSON
# @raise [RetryableError] if parsing fails
def safe_json_parse(json_string, context: "response")
  JSON.parse(json_string)
rescue JSON::ParserError, TypeError
  received = json_string.to_s
  error_detail =
    if received.strip.empty?
      "received empty response"
    elsif received.bytesize > 500
      "response was truncated or malformed (#{received.bytesize} bytes received)"
    else
      "response format is invalid"
    end

  raise RetryableError, "[LLM] Failed to parse #{context}: #{error_detail}. " \
                       "This usually means the AI service returned incomplete or corrupted data. " \
                       "The request will be retried automatically."
end

#send_anthropic_request(messages, model, tools, max_tokens, caching_enabled, reasoning_effort: nil, on_chunk: nil) ⇒ `Object`

── Anthropic request / response ──────────────────────────────────────────

# File 'lib/clacky/client.rb', line 280

# Sends one tool-capable request in Anthropic format and parses the reply.
#
# With caching enabled the messages first receive cache_control markers. When
# +on_chunk+ is given the call is delegated to the streaming variant;
# otherwise a plain POST is made, non-200 statuses go through raise_error,
# HTML bodies are rejected, and the JSON body is parsed by
# MessageFormat::Anthropic.
#
# @param messages [Array<Hash>] conversation messages
# @param model [String] model name
# @param tools [Array] tool definitions
# @param max_tokens [Integer] completion token limit
# @param caching_enabled [Boolean] whether to add cache_control markers
# @param reasoning_effort [Object, nil] forwarded to the request builder
# @param on_chunk [Proc, nil] streaming progress callback
# @return [Object] result of the streaming helper or of parse_response
def send_anthropic_request(messages, model, tools, max_tokens, caching_enabled, reasoning_effort: nil, on_chunk: nil)
  # Apply cache_control to the messages that mark the cache breakpoint.
  messages = apply_message_caching(messages) if caching_enabled

  request_body = MessageFormat::Anthropic.build_request_body(
    messages, model, tools, max_tokens, caching_enabled,
    reasoning_effort: reasoning_effort
  )
  return send_anthropic_stream_request(request_body, on_chunk) if on_chunk

  http_response = anthropic_connection.post(anthropic_messages_path) do |request|
    request.body = request_body.to_json
  end

  raise_error(http_response) unless http_response.status == 200
  check_html_response(http_response)

  decoded = safe_json_parse(http_response.body, context: "LLM response")
  MessageFormat::Anthropic.parse_response(decoded)
end

#send_bedrock_request(messages, model, tools, max_tokens, caching_enabled, reasoning_effort: nil, on_chunk: nil) ⇒ `Object`

── Bedrock Converse request / response ───────────────────────────────────

# File 'lib/clacky/client.rb', line 221

# Sends one tool-capable request through the Bedrock Converse API.
#
# When +on_chunk+ is given the call is delegated to the streaming variant;
# otherwise a plain POST is made, non-200 statuses go through raise_error,
# HTML bodies are rejected, and the JSON body is parsed by
# MessageFormat::Bedrock.
#
# @param messages [Array<Hash>] conversation messages
# @param model [String] model id (also used to build the endpoint path)
# @param tools [Array] tool definitions
# @param max_tokens [Integer] completion token limit
# @param caching_enabled [Boolean] forwarded to the request builder
# @param reasoning_effort [Object, nil] forwarded to the request builder
# @param on_chunk [Proc, nil] streaming progress callback
# @return [Object] result of the streaming helper or of parse_response
def send_bedrock_request(messages, model, tools, max_tokens, caching_enabled, reasoning_effort: nil, on_chunk: nil)
  request_body = MessageFormat::Bedrock.build_request_body(
    messages, model, tools, max_tokens, caching_enabled,
    reasoning_effort: reasoning_effort
  )
  return send_bedrock_stream_request(request_body, model, on_chunk) if on_chunk

  http_response = bedrock_connection.post(bedrock_endpoint(model)) do |request|
    request.body = request_body.to_json
  end

  raise_error(http_response) unless http_response.status == 200
  check_html_response(http_response)

  decoded = safe_json_parse(http_response.body, context: "LLM response")
  MessageFormat::Bedrock.parse_response(decoded)
end

#send_message(content, model:, max_tokens:) ⇒ `Object`

Send a single string message and return the reply text.

# File 'lib/clacky/client.rb', line 83

# Sends a single user message and returns the reply text.
#
# Convenience wrapper: wraps +content+ into a one-element messages array and
# delegates to send_messages.
#
# @param content [String] the user message
# @param model [String] model name
# @param max_tokens [Integer] completion token limit
# @return [Object] whatever send_messages returns
def send_message(content, model:, max_tokens:)
  send_messages([{ role: "user", content: content }], model: model, max_tokens: max_tokens)
end

#send_messages(messages, model:, max_tokens:) ⇒ `Object`

Send a messages array and return the reply text.

# File 'lib/clacky/client.rb', line 89

# Sends a messages array (no tools) and returns the reply text.
#
# The request goes to Bedrock, the Anthropic-format endpoint, or the
# OpenAI-compatible "chat/completions" endpoint, in that order of
# precedence; each reply is parsed by the matching parse_simple_* helper.
#
# @param messages [Array<Hash>] conversation messages
# @param model [String] model name
# @param max_tokens [Integer] completion token limit
# @return [Object] the reply text extracted by the parser
def send_messages(messages, model:, max_tokens:)
  if bedrock?
    payload = MessageFormat::Bedrock.build_request_body(messages, model, [], max_tokens)
    reply = bedrock_connection.post(bedrock_endpoint(model)) { |request| request.body = payload.to_json }
    return parse_simple_bedrock_response(reply)
  end

  if anthropic_format?
    payload = MessageFormat::Anthropic.build_request_body(messages, model, [], max_tokens, false)
    reply = anthropic_connection.post(anthropic_messages_path) { |request| request.body = payload.to_json }
    return parse_simple_anthropic_response(reply)
  end

  payload = { model: model, max_tokens: max_tokens, messages: messages }
  reply = openai_connection.post("chat/completions") { |request| request.body = payload.to_json }
  parse_simple_openai_response(reply)
end

#send_messages_with_tools(messages, model:, tools:, max_tokens:, enable_caching: false, reasoning_effort: nil, on_chunk: nil) ⇒ `Object`

Send messages with tool-calling support. Returns canonical response hash: { content:, tool_calls:, finish_reason:, usage:, latency: }

Latency measurement:

Because the current HTTP path is *non-streaming* (plain POST, response
body read in one shot), TTFB (time to response headers) is not exposed
by Faraday's default adapter without extra plumbing. What we CAN measure
cheaply — and what users actually feel — is total request duration,
which for a non-streaming call equals the time from "hit Enter" to
"first token visible" (since we receive everything at once).

So we record `duration_ms` as the authoritative number. `ttft_ms` is the
time until the first `on_chunk` callback fires when a streaming callback is
supplied, and falls back to `duration_ms` otherwise (status bar uses
ttft_ms as its signal metric — see docs). Downstream consumers therefore
read the same `ttft_ms` field in both modes without any schema change.

Parameters:

on_chunk (Proc, nil) (defaults to: nil) —
optional streaming progress callback. Receives keyword args { input_tokens:, output_tokens: } with cumulative token counts. When nil, behaves exactly as the historical non-streaming path. When given but streaming is not yet wired for the active provider, a single synthetic invocation is fired after the response is received, so UI plumbing can be exercised end-to-end without the proxy work.

# File 'lib/clacky/client.rb', line 129

# Sends messages with tool-calling support and attaches latency metrics.
#
# The messages are deep-cloned before dispatch, so the request builders never
# touch the caller's array. The provider's response hash is returned with an
# added :latency entry:
#   ttft_ms       - time to the first on_chunk call, or duration_ms without one
#   duration_ms   - total wall time of the request (monotonic clock)
#   output_tokens - usage[:completion_tokens] as an Integer
#   tps           - output tokens per second, nil below 10 tokens
#   model, measured_at, streaming
#
# @param messages [Array<Hash>] conversation messages
# @param model [String] model name
# @param tools [Array] tool definitions
# @param max_tokens [Integer] completion token limit
# @param enable_caching [Boolean] request prompt caching (honoured only when
#   supports_prompt_caching? accepts the model)
# @param reasoning_effort [Object, nil] forwarded to the request builder
# @param on_chunk [Proc, nil] streaming progress callback (keyword arguments)
# @return [Hash] provider response plus :latency
def send_messages_with_tools(messages, model:, tools:, max_tokens:, enable_caching: false, reasoning_effort: nil, on_chunk: nil)
  monotonic_now = -> { Process.clock_gettime(Process::CLOCK_MONOTONIC) }

  caching_enabled = enable_caching && supports_prompt_caching?(model)
  request_messages = deep_clone(messages)

  # Wrap the callback so the arrival time of the first chunk is captured.
  first_chunk_at = nil
  timed_on_chunk =
    if on_chunk
      lambda do |**kwargs|
        first_chunk_at ||= monotonic_now.call
        on_chunk.call(**kwargs)
      end
    else
      on_chunk
    end

  started_at = monotonic_now.call
  # Every provider path below streams whenever a callback is supplied.
  streaming_used = !on_chunk.nil?
  response =
    if bedrock?
      send_bedrock_request(request_messages, model, tools, max_tokens, caching_enabled,
                           reasoning_effort: reasoning_effort, on_chunk: timed_on_chunk)
    elsif anthropic_format?
      send_anthropic_request(request_messages, model, tools, max_tokens, caching_enabled,
                             reasoning_effort: reasoning_effort, on_chunk: timed_on_chunk)
    else
      send_openai_request(request_messages, model, tools, max_tokens, caching_enabled,
                          reasoning_effort: reasoning_effort, on_chunk: timed_on_chunk)
    end
  finished_at = monotonic_now.call

  # Synthetic single callback for a provider without streaming support.
  # With the three providers above this branch cannot fire today, because
  # streaming_used is true whenever on_chunk is present.
  if on_chunk && !streaming_used
    usage = response[:usage] || {}
    safe_invoke_on_chunk(
      on_chunk,
      input_tokens:  usage[:prompt_tokens].to_i,
      output_tokens: usage[:completion_tokens].to_i
    )
  end

  duration_ms = ((finished_at - started_at) * 1000).round
  ttft_ms = first_chunk_at ? ((first_chunk_at - started_at) * 1000).round : duration_ms
  output_tokens = response[:usage]&.dig(:completion_tokens).to_i
  tps =
    if output_tokens >= 10 && duration_ms > 0
      (output_tokens * 1000.0 / duration_ms).round(1)
    end

  response[:latency] = {
    ttft_ms:     ttft_ms,
    duration_ms: duration_ms,
    output_tokens: output_tokens,
    tps:         tps,
    model:       model,
    measured_at: Time.now.to_f,
    streaming:   streaming_used
  }
  response
end

#send_openai_request(messages, model, tools, max_tokens, caching_enabled, reasoning_effort: nil, on_chunk: nil) ⇒ `Object`

── OpenAI request / response ─────────────────────────────────────────────

# File 'lib/clacky/client.rb', line 336

# Sends one tool-capable request to the OpenAI-compatible endpoint.
#
# With caching enabled the messages first receive cache_control markers. When
# +on_chunk+ is given the call is delegated to the streaming variant;
# otherwise a plain POST to "chat/completions" is made, non-200 statuses go
# through raise_error, HTML bodies are rejected, and the JSON body is parsed
# by MessageFormat::OpenAI.
#
# @param messages [Array<Hash>] conversation messages
# @param model [String] model name
# @param tools [Array] tool definitions
# @param max_tokens [Integer] completion token limit
# @param caching_enabled [Boolean] whether to add cache_control markers
# @param reasoning_effort [Object, nil] forwarded to the request builder
# @param on_chunk [Proc, nil] streaming progress callback
# @return [Object] result of the streaming helper or of parse_response
def send_openai_request(messages, model, tools, max_tokens, caching_enabled, reasoning_effort: nil, on_chunk: nil)
  # OpenRouter proxies Claude with the same cache_control field convention as
  # Anthropic direct, so the markers are applied here too when caching is on.
  messages = apply_message_caching(messages) if caching_enabled

  # Vision support is resolved against the request's actual model (which may
  # differ from @model after a runtime switch or fallback override), so the
  # conversion layer strips image_url blocks for non-vision models.
  vision_supported = Providers.supports?(@provider_id, :vision, model_name: model)
  request_body = MessageFormat::OpenAI.build_request_body(
    messages, model, tools, max_tokens, caching_enabled,
    vision_supported: vision_supported,
    reasoning_effort: reasoning_effort
  )
  return send_openai_stream_request(request_body, on_chunk) if on_chunk

  http_response = openai_connection.post("chat/completions") do |request|
    request.body = request_body.to_json
  end

  raise_error(http_response) unless http_response.status == 200
  check_html_response(http_response)

  decoded = safe_json_parse(http_response.body, context: "LLM response")
  MessageFormat::OpenAI.parse_response(decoded)
end

#supports_prompt_caching?(model) ⇒ `Boolean`

Returns true for Claude models that support prompt caching (gen 3.5+ or gen 4+).

Handles both direct model names (e.g. "claude-haiku-4-5") and Clacky AI Bedrock proxy names with "abs-" prefix (e.g. "abs-claude-haiku-4-5").

Why only Claude models:

- MiniMax uses automatic server-side caching (no cache_control needed from client)
- Kimi uses a proprietary prompt_cache_key param, not cache_control
- MiMo has no documented caching API
- Only Claude (direct, OpenRouter, or ClackyAI Bedrock proxy) consumes our
cache_control / cachePoint markers

Returns:

(Boolean)

# File 'lib/clacky/client.rb', line 208

# Tells whether +model+ is a Claude model that supports prompt caching
# (gen 3.5+ or gen 4+).
#
# The name is lower-cased and a leading "abs-" (ClackyAI Bedrock proxy
# prefix) is stripped before matching. Names without "claude" never qualify.
#
# @param model [#to_s] model name
# @return [Boolean]
def supports_prompt_caching?(model)
  normalized = model.to_s.downcase.sub(/^abs-/, "")

  # Match Claude gen 3.5+ (3.5/3.6/3.7…) or gen 4+ in any name format:
  #   claude-3.5-sonnet-...  claude-3-7-sonnet  claude-haiku-4-5  claude-sonnet-4-6
  normalized.include?("claude") &&
    normalized.match?(/claude(?:-3[-.]?[5-9]|.*-[4-9][-.]|.*-[4-9]$|-[4-9][-.]|-[4-9]$|-sonnet-[34])/)
end

#test_connection(model:) ⇒ `Object`

Test API connection by sending a minimal request. Returns { success: true } or { success: false, error: "..." }.

# File 'lib/clacky/client.rb', line 57

# Tests the API connection by sending a minimal "hi" request (16 max tokens).
#
# The request goes to whichever API flavour the client is configured for and
# the response is interpreted by handle_test_response. Exceptions are turned
# into a failure hash instead of propagating.
#
# @param model [String] model name to test against
# @return [Hash] { success: true, ... } or { success: false, error: "..." }
def test_connection(model:)
  response =
    if bedrock?
      payload = MessageFormat::Bedrock.build_request_body(
        [{ role: :user, content: "hi" }], model, [], 16
      ).to_json
      bedrock_connection.post(bedrock_endpoint(model)) { |request| request.body = payload }
    else
      # Anthropic-format and OpenAI-compatible endpoints take the same minimal body.
      payload = { model: model, max_tokens: 16,
                  messages: [{ role: "user", content: "hi" }] }.to_json
      if anthropic_format?
        anthropic_connection.post(anthropic_messages_path) { |request| request.body = payload }
      else
        openai_connection.post("chat/completions") { |request| request.body = payload }
      end
    end

  handle_test_response(response)
rescue Faraday::Error => e
  { success: false, error: "Connection error: #{e.message}" }
rescue => e
  Clacky::Logger.error("[test_connection] #{e.class}: #{e.message}", error: e)
  { success: false, error: e.message }
end

Class: Clacky::Client

Constant Summary collapse

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(api_key, base_url:, model:, anthropic_format: false, read_timeout: nil) ⇒ Client

Instance Attribute Details

#provider_id ⇒ Object (readonly)

Instance Method Details

#add_cache_control_to_message(msg) ⇒ Object

#anthropic_connection ⇒ Object

#anthropic_format?(model = nil) ⇒ Boolean

#apply_message_caching(messages) ⇒ Object

#bedrock? ⇒ Boolean

#bedrock_connection ⇒ Object

#bedrock_endpoint(model) ⇒ Object

#check_html_response(response) ⇒ Object

#deep_clone(obj) ⇒ Object

#extract_error_message(error_body, raw_body) ⇒ Object

#format_tool_results(response, tool_results, model:) ⇒ Object

#handle_test_response(response) ⇒ Object

#is_compression_instruction?(message) ⇒ Boolean

#openai_connection ⇒ Object

#parse_simple_anthropic_response(response) ⇒ Object

#parse_simple_bedrock_response(response) ⇒ Object

#parse_simple_openai_response(response) ⇒ Object

#raise_error(response) ⇒ Object

#reset_connections! ⇒ Object

#safe_json_parse(json_string, context: "response") ⇒ Hash, Array

#send_anthropic_request(messages, model, tools, max_tokens, caching_enabled, reasoning_effort: nil, on_chunk: nil) ⇒ Object

#send_bedrock_request(messages, model, tools, max_tokens, caching_enabled, reasoning_effort: nil, on_chunk: nil) ⇒ Object

#send_message(content, model:, max_tokens:) ⇒ Object

#send_messages(messages, model:, max_tokens:) ⇒ Object

#send_messages_with_tools(messages, model:, tools:, max_tokens:, enable_caching: false, reasoning_effort: nil, on_chunk: nil) ⇒ Object

#send_openai_request(messages, model, tools, max_tokens, caching_enabled, reasoning_effort: nil, on_chunk: nil) ⇒ Object

#supports_prompt_caching?(model) ⇒ Boolean

#test_connection(model:) ⇒ Object

#initialize(api_key, base_url:, model:, anthropic_format: false, read_timeout: nil) ⇒ `Client`

#provider_id ⇒ `Object` (readonly)

#add_cache_control_to_message(msg) ⇒ `Object`

#anthropic_connection ⇒ `Object`

#anthropic_format?(model = nil) ⇒ `Boolean`

#apply_message_caching(messages) ⇒ `Object`

#bedrock? ⇒ `Boolean`

#bedrock_connection ⇒ `Object`

#bedrock_endpoint(model) ⇒ `Object`

#check_html_response(response) ⇒ `Object`

#deep_clone(obj) ⇒ `Object`

#extract_error_message(error_body, raw_body) ⇒ `Object`

#format_tool_results(response, tool_results, model:) ⇒ `Object`

#handle_test_response(response) ⇒ `Object`

#is_compression_instruction?(message) ⇒ `Boolean`

#openai_connection ⇒ `Object`

#parse_simple_anthropic_response(response) ⇒ `Object`

#parse_simple_bedrock_response(response) ⇒ `Object`

#parse_simple_openai_response(response) ⇒ `Object`

#raise_error(response) ⇒ `Object`

#reset_connections! ⇒ `Object`

#safe_json_parse(json_string, context: "response") ⇒ `Hash`, `Array`

#send_anthropic_request(messages, model, tools, max_tokens, caching_enabled, reasoning_effort: nil, on_chunk: nil) ⇒ `Object`

#send_bedrock_request(messages, model, tools, max_tokens, caching_enabled, reasoning_effort: nil, on_chunk: nil) ⇒ `Object`

#send_message(content, model:, max_tokens:) ⇒ `Object`

#send_messages(messages, model:, max_tokens:) ⇒ `Object`

#send_messages_with_tools(messages, model:, tools:, max_tokens:, enable_caching: false, reasoning_effort: nil, on_chunk: nil) ⇒ `Object`

#send_openai_request(messages, model, tools, max_tokens, caching_enabled, reasoning_effort: nil, on_chunk: nil) ⇒ `Object`

#supports_prompt_caching?(model) ⇒ `Boolean`

#test_connection(model:) ⇒ `Object`