Class: Clacky::Client

Inherits:
Object
  • Object
show all
Defined in:
lib/clacky/client.rb

Constant Summary collapse

MAX_RETRIES =
10
RETRY_DELAY =
5 (seconds)

Instance Method Summary collapse

Constructor Details

#initialize(api_key, base_url:, model:, anthropic_format: false) ⇒ Client

Returns a new instance of Client.



11
12
13
14
15
16
17
18
19
20
21
22
23
24
# File 'lib/clacky/client.rb', line 11

# Build a client bound to one provider endpoint and default model.
#
# @param api_key [String] credential; also used to auto-detect Bedrock
# @param base_url [String] provider base URL for all HTTP connections
# @param model [String] default model identifier
# @param anthropic_format [Boolean] talk to the Anthropic Messages API
#   directly (ignored when Bedrock is detected — see #anthropic_format?)
def initialize(api_key, base_url:, model:, anthropic_format: false)
  @api_key = api_key
  @base_url = base_url
  @model = model
  @use_anthropic_format = anthropic_format
  # Detect Bedrock: ABSK key prefix (native AWS) or abs- model prefix (Clacky AI proxy)
  @use_bedrock = MessageFormat::Bedrock.bedrock_api_key?(api_key, model)

  # Determine vision support once at construction time.
  # Non-vision models (DeepSeek, Kimi, MiniMax, etc.) reject image_url
  # content blocks; the conversion layer strips them when this is false.
  provider_id = Providers.resolve_provider(base_url: @base_url, api_key: @api_key)
  @vision_supported = Providers.supports?(provider_id, :vision, model_name: @model)
end

Instance Method Details

#add_cache_control_to_message(msg) ⇒ Object

Wrap or extend the message’s content with a cache_control marker.



280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
# File 'lib/clacky/client.rb', line 280

# Wrap or extend the message's content with a cache_control marker.
# String content becomes a single marked text block; for array content
# only the final block receives the marker. Any other content shape is
# returned untouched.
def add_cache_control_to_message(msg)
  body = msg[:content]

  if body.is_a?(String)
    blocks = [{ type: "text", text: body, cache_control: { type: "ephemeral" } }]
  elsif body.is_a?(Array)
    last = body.length - 1
    blocks = body.each_with_index.map do |block, i|
      i == last ? block.merge(cache_control: { type: "ephemeral" }) : block
    end
  else
    return msg
  end

  msg.merge(content: blocks)
end

#anthropic_connectionObject



330
331
332
333
334
335
336
337
338
339
340
341
# File 'lib/clacky/client.rb', line 330

# Memoized Faraday connection for the Anthropic Messages API.
# Sends the key via the x-api-key header (Anthropic convention) plus the
# required anthropic-version header.
def anthropic_connection
  @anthropic_connection ||= Faraday.new(url: @base_url) do |conn|
    conn.headers["Content-Type"]   = "application/json"
    conn.headers["x-api-key"]      = @api_key
    conn.headers["anthropic-version"] = "2023-06-01"
    conn.headers["anthropic-dangerous-direct-browser-access"] = "true"
    conn.options.timeout      = 300 # LLM completions can take minutes
    conn.options.open_timeout = 10
    # NOTE(review): TLS verification is disabled — acceptable only behind a
    # trusted proxy; confirm this is intentional for production traffic.
    conn.ssl.verify           = false
    conn.adapter Faraday.default_adapter
  end
end

#anthropic_format?(model = nil) ⇒ Boolean

Returns true when the client is talking directly to the Anthropic API (determined at construction time via the anthropic_format flag).

Returns:

  • (Boolean)


33
34
35
# File 'lib/clacky/client.rb', line 33

# Returns true when the client is talking directly to the Anthropic API
# (determined at construction time via the anthropic_format flag).
# Bedrock takes precedence: a Bedrock client never uses Anthropic format.
#
# @param _model [String, nil] accepted for call-site compatibility; unused
# @return [Boolean]
def anthropic_format?(_model = nil)
  @use_anthropic_format && !@use_bedrock
end

#apply_message_caching(messages) ⇒ Object

Add cache_control markers to the last 2 messages in the array.

Why 2 markers:

Turn N   — marks messages[-2] and messages[-1]; server caches prefix up to [-1]
Turn N+1 — messages[-2] is Turn N's last message (still marked) → cache READ hit;
           messages[-1] is the new message (marked) → cache WRITE for Turn N+2

With only 1 marker (old behavior): Turn N marks messages[-1]; in Turn N+1 that same message is now [-2] and carries no marker → server sees a different prefix → cache MISS.

Compression instructions (system_injected: true) are skipped — we never want to cache those ephemeral injection messages.



263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
# File 'lib/clacky/client.rb', line 263

# Mark the last two cacheable messages with cache_control so the server
# keeps a warm prompt-cache prefix across consecutive turns. Compression
# instructions (system_injected) are never marked.
def apply_message_caching(messages)
  return messages if messages.empty?

  # Walk backwards from the tail, collecting up to 2 cacheable indices.
  marked = []
  index = messages.length - 1
  while index >= 0 && marked.length < 2
    marked << index unless is_compression_instruction?(messages[index])
    index -= 1
  end

  messages.each_with_index.map do |msg, i|
    marked.include?(i) ? add_cache_control_to_message(msg) : msg
  end
end

#bedrock?Boolean

Returns true when the client is using the AWS Bedrock Converse API.

Returns:

  • (Boolean)


27
28
29
# File 'lib/clacky/client.rb', line 27

# Whether this client targets the AWS Bedrock Converse API (detected from
# the API key / model prefix at construction time).
#
# @return [Boolean]
def bedrock?
  @use_bedrock
end

#bedrock_connectionObject



308
309
310
311
312
313
314
315
316
317
# File 'lib/clacky/client.rb', line 308

# Memoized Faraday connection for the Bedrock Converse API.
# Uses Bearer auth — presumably the ClackyAI proxy's convention rather
# than native SigV4; confirm against the proxy docs.
def bedrock_connection
  @bedrock_connection ||= Faraday.new(url: @base_url) do |conn|
    conn.headers["Content-Type"]  = "application/json"
    conn.headers["Authorization"] = "Bearer #{@api_key}"
    conn.options.timeout      = 300 # LLM completions can take minutes
    conn.options.open_timeout = 10
    # NOTE(review): TLS verification is disabled — confirm intentional.
    conn.ssl.verify           = false
    conn.adapter Faraday.default_adapter
  end
end

#bedrock_endpoint(model) ⇒ Object

Bedrock Converse API endpoint path for a given model ID.



304
305
306
# File 'lib/clacky/client.rb', line 304

# Bedrock Converse API endpoint path for a given model ID.
#
# @param model [String] Bedrock model identifier
# @return [String] relative request path for the Converse call
def bedrock_endpoint(model)
  format("/model/%s/converse", model)
end

#check_html_response(response) ⇒ Object

Raise a friendly error if the response body is HTML (e.g. gateway error page returned with 200)



380
381
382
383
384
385
# File 'lib/clacky/client.rb', line 380

# Raise a friendly retryable error when the body is an HTML page (some
# gateways return their error page with a 200 status).
def check_html_response(response)
  stripped = response.body.to_s.lstrip
  looks_like_html = %w[<!DOCTYPE <!doctype <html <HTML].any? { |tag| stripped.start_with?(tag) }
  return unless looks_like_html

  raise RetryableError, "[LLM] Service temporarily unavailable (received HTML error page), retrying..."
end

#deep_clone(obj) ⇒ Object

── Utilities ─────────────────────────────────────────────────────────────



428
429
430
431
432
433
434
# File 'lib/clacky/client.rb', line 428

# Recursively copy nested Hashes and Arrays so mutations of the copy do
# not leak into the original. Scalar leaves (strings, numbers, symbols)
# are shared, not duplicated — only container structure is isolated.
def deep_clone(obj)
  if obj.is_a?(Array)
    obj.map { |element| deep_clone(element) }
  elsif obj.is_a?(Hash)
    copy = {}
    obj.each { |key, value| copy[key] = deep_clone(value) }
    copy
  else
    obj
  end
end

#extract_error_message(error_body, raw_body) ⇒ Object



387
388
389
390
391
392
393
394
395
396
397
398
# File 'lib/clacky/client.rb', line 387

# Pick the most useful human-readable message out of a provider error
# payload, falling back to a truncated raw body.
#
# Precedence: HTML body → friendly hint; non-Hash body → raw body;
# upstreamMessage → error.message → message → error (string) → truncated raw.
#
# @param error_body [Hash, nil] parsed JSON body (nil when unparseable)
# @param raw_body [String, Object] the raw response body
# @return [String, Object]
def extract_error_message(error_body, raw_body)
  # Match check_html_response's detection: gateways emit both upper- and
  # lower-case doctype/html tags (the old check missed <!doctype and <HTML).
  if raw_body.is_a?(String) && raw_body.strip.start_with?("<!DOCTYPE", "<!doctype", "<html", "<HTML")
    return "Invalid API endpoint or server error (received HTML instead of JSON)"
  end

  return raw_body unless error_body.is_a?(Hash)

  upstream = error_body["upstreamMessage"]
  return upstream if upstream && !upstream.empty?

  if error_body["error"].is_a?(Hash)
    nested = error_body.dig("error", "message")
    return nested unless nested.nil?
  end

  message = error_body["message"]
  return message unless message.nil?

  return error_body["error"] if error_body["error"].is_a?(String)

  raw = raw_body.to_s
  raw.length > 200 ? "#{raw[0..200]}..." : raw
end

#format_tool_results(response, tool_results, model:) ⇒ Object

Format tool results into canonical messages ready to append to @messages. Always returns canonical format (role: “tool”) regardless of API type — conversion to API-native format happens inside each send_*_request.



145
146
147
148
149
150
151
152
153
154
155
# File 'lib/clacky/client.rb', line 145

# Format tool results into canonical messages ready to append to the
# conversation. Always returns canonical format; conversion to the
# API-native shape happens later inside each send_*_request.
def format_tool_results(response, tool_results, model:)
  return [] if tool_results.empty?

  formatter =
    if bedrock?
      MessageFormat::Bedrock
    elsif anthropic_format?
      MessageFormat::Anthropic
    else
      MessageFormat::OpenAI
    end
  formatter.format_tool_results(response, tool_results)
end

#handle_test_response(response) ⇒ Object

── Error handling ────────────────────────────────────────────────────────



345
346
347
348
349
350
# File 'lib/clacky/client.rb', line 345

# Translate a connection-test HTTP response into a result hash.
#
# @param response [#status, #body]
# @return [Hash] { success: true } on 200, otherwise
#   { success: false, error: "..." }
def handle_test_response(response)
  return { success: true } if response.status == 200

  # Explicit rescue instead of the `rescue nil` modifier: only swallow
  # parse failures (HTML/empty body), not arbitrary StandardErrors.
  error_body =
    begin
      JSON.parse(response.body)
    rescue JSON::ParserError, TypeError
      nil # extract_error_message handles a nil structured body
    end
  { success: false, error: extract_error_message(error_body, response.body) }
end

#is_compression_instruction?(message) ⇒ Boolean

Returns:

  • (Boolean)


297
298
299
# File 'lib/clacky/client.rb', line 297

# True for ephemeral compression-instruction messages injected by the
# system (flagged via :system_injected) — callers skip these when caching.
#
# @return [Boolean]
def is_compression_instruction?(message)
  return false unless message.is_a?(Hash)

  message[:system_injected] == true
end

#openai_connectionObject



319
320
321
322
323
324
325
326
327
328
# File 'lib/clacky/client.rb', line 319

# Memoized Faraday connection for OpenAI-compatible chat/completions APIs
# (OpenAI, OpenRouter, and other proxies using Bearer auth).
def openai_connection
  @openai_connection ||= Faraday.new(url: @base_url) do |conn|
    conn.headers["Content-Type"]  = "application/json"
    conn.headers["Authorization"] = "Bearer #{@api_key}"
    conn.options.timeout      = 300 # LLM completions can take minutes
    conn.options.open_timeout = 10
    # NOTE(review): TLS verification is disabled — confirm intentional.
    conn.ssl.verify           = false
    conn.adapter Faraday.default_adapter
  end
end

#parse_simple_anthropic_response(response) ⇒ Object



217
218
219
220
221
# File 'lib/clacky/client.rb', line 217

# Extract plain reply text from a non-streaming Anthropic response by
# concatenating every text content block.
def parse_simple_anthropic_response(response)
  raise_error(response) unless response.status == 200
  payload = safe_json_parse(response.body, context: "LLM response")
  blocks = payload["content"] || []
  text_blocks = blocks.select { |block| block["type"] == "text" }
  text_blocks.map { |block| block["text"] }.join
end

#parse_simple_bedrock_response(response) ⇒ Object



193
194
195
196
197
198
199
200
# File 'lib/clacky/client.rb', line 193

# Extract plain reply text from a Bedrock Converse response by joining
# all content blocks that carry a truthy "text" field.
def parse_simple_bedrock_response(response)
  raise_error(response) unless response.status == 200
  payload = safe_json_parse(response.body, context: "LLM response")
  blocks = payload.dig("output", "message", "content") || []
  text_blocks = blocks.select { |block| block["text"] }
  text_blocks.map { |block| block["text"] }.join
end

#parse_simple_openai_response(response) ⇒ Object



243
244
245
246
247
# File 'lib/clacky/client.rb', line 243

# Extract the reply text from an OpenAI-style chat/completions response.
#
# A 200 body missing "choices" previously crashed with NoMethodError;
# surface it as a retryable malformed-response error instead.
#
# @return [String, nil] message content (nil when the provider omits it)
def parse_simple_openai_response(response)
  raise_error(response) unless response.status == 200
  parsed_body = safe_json_parse(response.body, context: "LLM response")
  choice = parsed_body["choices"]&.first
  raise RetryableError, "[LLM] Malformed response: no choices returned" unless choice

  choice.dig("message", "content")
end

#raise_error(response) ⇒ Object



352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
# File 'lib/clacky/client.rb', line 352

# Map an HTTP error response onto the app's exception hierarchy:
# RetryableError (caller retries), BadRequestError (roll back history),
# AgentError (fatal for this request).
#
# @param response [#status, #body]
# @raise [RetryableError, BadRequestError, AgentError]
def raise_error(response)
  # Explicit rescue instead of the `rescue nil` modifier: only swallow
  # parse failures (HTML/empty body), not arbitrary StandardErrors.
  error_body =
    begin
      JSON.parse(response.body)
    rescue JSON::ParserError, TypeError
      nil # extract_error_message falls back to the raw body
    end
  error_message = extract_error_message(error_body, response.body)

  case response.status
  when 400
    # Well-behaved APIs (Anthropic, OpenAI) never put quota/availability issues in 400.
    # However, some proxy/relay providers do — so we inspect the message first.
    # Also, Bedrock returns ThrottlingException as 400 instead of 429.
    if error_message.match?(/ThrottlingException|unavailable|quota/i)
      hint = error_message.match?(/quota/i) ? " (possibly out of credits)" : ""
      raise RetryableError, "[LLM] Rate limit or service issue: #{error_message}#{hint}"
    end

    # True bad request — our message was malformed. Roll back history so the
    # broken message is not replayed on the next user turn.
    raise BadRequestError, "[LLM] Client request error: #{error_message}"
  when 401 then raise AgentError, "[LLM] Invalid API key"
  when 402 then raise AgentError, "[LLM] Billing or payment issue (possibly out of credits): #{error_message}"
  when 403 then raise AgentError, "[LLM] Access denied: #{error_message}"
  when 404 then raise AgentError, "[LLM] API endpoint not found: #{error_message}"
  when 429 then raise RetryableError, "[LLM] Rate limit exceeded, please wait a moment"
  when 500..599 then raise RetryableError, "[LLM] Service temporarily unavailable (#{response.status}), retrying..."
  else raise AgentError, "[LLM] Unexpected error (#{response.status}): #{error_message}"
  end
end

#safe_json_parse(json_string, context: "response") ⇒ Hash, Array

Parse JSON with user-friendly error messages.

Parameters:

  • json_string (String)

    the JSON string to parse

  • context (String) (defaults to: "response")

    a description of what’s being parsed (e.g., “LLM response”)

Returns:

  • (Hash, Array)

    the parsed JSON

Raises:

  • (RetryableError)

    if parsing fails (indicates a malformed LLM response)



405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
# File 'lib/clacky/client.rb', line 405

# Parse JSON with user-friendly error messages.
#
# @param json_string [String] the JSON string to parse
# @param context [String] description of what's being parsed (e.g. "LLM response")
# @return [Hash, Array] the parsed JSON
# @raise [RetryableError] if parsing fails (malformed LLM response)
def safe_json_parse(json_string, context: "response")
  JSON.parse(json_string)
rescue JSON::ParserError, TypeError
  # TypeError covers a nil body (JSON.parse(nil) raises TypeError, not
  # ParserError) — treated as an empty response below.
  # Transform technical JSON parsing errors into user-friendly messages.
  # These are usually caused by:
  #   1. Incomplete/truncated LLM response (network issue, timeout)
  #   2. LLM service returned malformed data
  #   3. Proxy/gateway corruption
  error_detail = if json_string.to_s.strip.empty?
    "received empty response"
  elsif json_string.to_s.bytesize > 500
    "response was truncated or malformed (#{json_string.to_s.bytesize} bytes received)"
  else
    "response format is invalid"
  end

  raise RetryableError, "[LLM] Failed to parse #{context}: #{error_detail}. " \
                       "This usually means the AI service returned incomplete or corrupted data. " \
                       "The request will be retried automatically."
end

#send_anthropic_request(messages, model, tools, max_tokens, caching_enabled) ⇒ Object

── Anthropic request / response ──────────────────────────────────────────



204
205
206
207
208
209
210
211
212
213
214
215
# File 'lib/clacky/client.rb', line 204

# POST a tool-enabled conversation to the Anthropic Messages API and
# return the canonical parsed response hash.
def send_anthropic_request(messages, model, tools, max_tokens, caching_enabled)
  # Apply cache_control to the message that marks the cache breakpoint
  messages = apply_message_caching(messages) if caching_enabled

  body     = MessageFormat::Anthropic.build_request_body(messages, model, tools, max_tokens, caching_enabled)
  response = anthropic_connection.post("v1/messages") { |r| r.body = body.to_json }

  raise_error(response) unless response.status == 200
  # Gateways can return an HTML error page with a 200 status — catch that
  # before attempting to parse JSON.
  check_html_response(response)
  parsed_body = safe_json_parse(response.body, context: "LLM response")
  MessageFormat::Anthropic.parse_response(parsed_body)
end

#send_bedrock_request(messages, model, tools, max_tokens, caching_enabled) ⇒ Object

── Bedrock Converse request / response ───────────────────────────────────



183
184
185
186
187
188
189
190
191
# File 'lib/clacky/client.rb', line 183

# POST a tool-enabled conversation to the Bedrock Converse API.
# Unlike the Anthropic/OpenAI paths, no apply_message_caching here —
# presumably Bedrock caching markers are emitted inside build_request_body
# (cachePoint blocks); confirm in MessageFormat::Bedrock.
def send_bedrock_request(messages, model, tools, max_tokens, caching_enabled)
  body     = MessageFormat::Bedrock.build_request_body(messages, model, tools, max_tokens, caching_enabled)
  response = bedrock_connection.post(bedrock_endpoint(model)) { |r| r.body = body.to_json }

  raise_error(response) unless response.status == 200
  # Gateways can return an HTML error page with a 200 status.
  check_html_response(response)
  parsed_body = safe_json_parse(response.body, context: "LLM response")
  MessageFormat::Bedrock.parse_response(parsed_body)
end

#send_message(content, model:, max_tokens:) ⇒ Object

Send a single string message and return the reply text.



67
68
69
70
# File 'lib/clacky/client.rb', line 67

# Convenience wrapper: send a single user string and return the reply text.
def send_message(content, model:, max_tokens:)
  single_turn = [{ role: "user", content: content }]
  send_messages(single_turn, model: model, max_tokens: max_tokens)
end

#send_messages(messages, model:, max_tokens:) ⇒ Object

Send a messages array and return the reply text.



73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# File 'lib/clacky/client.rb', line 73

# Send a messages array (no tools, no caching) and return the reply text
# for whichever API flavor this client targets.
# NOTE(review): the Bedrock branch passes 4 args to build_request_body
# while send_bedrock_request passes 5 — assumes a default for the caching
# argument; confirm in MessageFormat::Bedrock.
def send_messages(messages, model:, max_tokens:)
  if bedrock?
    body     = MessageFormat::Bedrock.build_request_body(messages, model, [], max_tokens)
    response = bedrock_connection.post(bedrock_endpoint(model)) { |r| r.body = body.to_json }
    parse_simple_bedrock_response(response)
  elsif anthropic_format?
    body     = MessageFormat::Anthropic.build_request_body(messages, model, [], max_tokens, false)
    response = anthropic_connection.post("v1/messages") { |r| r.body = body.to_json }
    parse_simple_anthropic_response(response)
  else
    body     = { model: model, max_tokens: max_tokens, messages: messages }
    response = openai_connection.post("chat/completions") { |r| r.body = body.to_json }
    parse_simple_openai_response(response)
  end
end

#send_messages_with_tools(messages, model:, tools:, max_tokens:, enable_caching: false) ⇒ Object

Send messages with tool-calling support. Returns canonical response hash: { content:, tool_calls:, finish_reason:, usage:, latency: }

Latency measurement:

Because the current HTTP path is *non-streaming* (plain POST, response
body read in one shot), TTFB (time to response headers) is not exposed
by Faraday's default adapter without extra plumbing. What we CAN measure
cheaply — and what users actually feel — is total request duration,
which for a non-streaming call equals the time from "hit Enter" to
"first token visible" (since we receive everything at once).

So we record `duration_ms` as the authoritative number and alias it to
`ttft_ms` for downstream consumers (status bar uses ttft_ms as its
signal metric — see docs). When we migrate to streaming later, this
same `ttft_ms` field will start carrying the *actual* first-token
latency without any schema change.


107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# File 'lib/clacky/client.rb', line 107

# Send messages with tool-calling support and attach latency telemetry.
# Returns the canonical response hash:
#   { content:, tool_calls:, finish_reason:, usage:, latency: }
# Messages are deep-cloned first so per-request mutations (cache markers,
# format conversion) never leak back into the caller's history.
def send_messages_with_tools(messages, model:, tools:, max_tokens:, enable_caching: false)
  caching_enabled = enable_caching && supports_prompt_caching?(model)
  cloned = deep_clone(messages)

  # Monotonic clock: immune to wall-clock adjustments during the request.
  t0 = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  response =
    if bedrock?
      send_bedrock_request(cloned, model, tools, max_tokens, caching_enabled)
    elsif anthropic_format?
      send_anthropic_request(cloned, model, tools, max_tokens, caching_enabled)
    else
      send_openai_request(cloned, model, tools, max_tokens, caching_enabled)
    end
  t1 = Process.clock_gettime(Process::CLOCK_MONOTONIC)

  duration_ms = ((t1 - t0) * 1000).round
  # Throughput is only meaningful with a reasonable output size; below ~10
  # tokens the sample is too small to be informative and the result is
  # wildly high (e.g. 1 token / 50ms → 20 tok/s is meaningless).
  # Canonical usage hashes from message_format/* all use :completion_tokens.
  output_tokens = response[:usage]&.dig(:completion_tokens).to_i
  tps = (output_tokens >= 10 && duration_ms > 0) ? (output_tokens * 1000.0 / duration_ms).round(1) : nil

  response[:latency] = {
    ttft_ms:     duration_ms,      # non-streaming: TTFT == full duration
    duration_ms: duration_ms,
    output_tokens: output_tokens,
    tps:         tps,
    model:       model,
    measured_at: Time.now.to_f,
    streaming:   false              # future flag — true when we migrate
  }
  response
end

#send_openai_request(messages, model, tools, max_tokens, caching_enabled) ⇒ Object

── OpenAI request / response ─────────────────────────────────────────────



225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
# File 'lib/clacky/client.rb', line 225

# POST a tool-enabled conversation to an OpenAI-compatible endpoint and
# return the canonical parsed response hash.
def send_openai_request(messages, model, tools, max_tokens, caching_enabled)
  # Apply cache_control markers to messages when caching is enabled.
  # OpenRouter proxies Claude with the same cache_control field convention as Anthropic direct.
  messages = apply_message_caching(messages) if caching_enabled

  body     = MessageFormat::OpenAI.build_request_body(
    messages, model, tools, max_tokens, caching_enabled,
    vision_supported: @vision_supported
  )
  response = openai_connection.post("chat/completions") { |r| r.body = body.to_json }

  raise_error(response) unless response.status == 200
  # Gateways can return an HTML error page with a 200 status.
  check_html_response(response)

  parsed_body = safe_json_parse(response.body, context: "LLM response")
  MessageFormat::OpenAI.parse_response(parsed_body)
end

#supports_prompt_caching?(model) ⇒ Boolean

Returns true for Claude models that support prompt caching (gen 3.5+ or gen 4+).

Handles both direct model names (e.g. “claude-haiku-4-5”) and Clacky AI Bedrock proxy names with “abs-” prefix (e.g. “abs-claude-haiku-4-5”).

Why only Claude models:

- MiniMax uses automatic server-side caching (no cache_control needed from client)
- Kimi uses a proprietary prompt_cache_key param, not cache_control
- MiMo has no documented caching API
- Only Claude (direct, OpenRouter, or ClackyAI Bedrock proxy) consumes our
  cache_control / cachePoint markers

Returns:

  • (Boolean)


170
171
172
173
174
175
176
177
178
# File 'lib/clacky/client.rb', line 170

# Returns true for Claude models that support prompt caching (gen 3.5+
# or gen 4+). Handles both direct names ("claude-haiku-4-5") and ClackyAI
# Bedrock proxy names with an "abs-" prefix. Only Claude consumes our
# cache_control/cachePoint markers — other providers use their own
# server-side or proprietary caching.
#
# @return [Boolean]
def supports_prompt_caching?(model)
  # Normalize: lowercase and strip the ClackyAI Bedrock proxy prefix.
  normalized = model.to_s.downcase.sub(/^abs-/, "")
  return false unless normalized.include?("claude")

  # Claude gen 3.5+ (3.5/3.6/3.7…) or gen 4+ in any name format:
  #   claude-3.5-sonnet-...  claude-3-7-sonnet  claude-haiku-4-5  claude-sonnet-4-6
  caching_generation = /claude(?:-3[-.]?[5-9]|.*-[4-9][-.]|.*-[4-9]$|-[4-9][-.]|-[4-9]$|-sonnet-[34])/
  normalized.match?(caching_generation)
end

#test_connection(model:) ⇒ Object

Test API connection by sending a minimal request. Returns { success: true } or { success: false, error: “…” }.



41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/clacky/client.rb', line 41

# Test API connectivity by sending a minimal 16-token request.
#
# Fix: the Bedrock branch used `role: :user` (symbol) while every other
# message built in this file uses string roles — normalized to "user"
# and the probe message is now built once for all three branches.
#
# @param model [String]
# @return [Hash] { success: true } or { success: false, error: "…" }
def test_connection(model:)
  probe = [{ role: "user", content: "hi" }]
  if bedrock?
    body = MessageFormat::Bedrock.build_request_body(probe, model, [], 16).to_json
    response = bedrock_connection.post(bedrock_endpoint(model)) { |r| r.body = body }
  elsif anthropic_format?
    body = { model: model, max_tokens: 16, messages: probe }.to_json
    response = anthropic_connection.post("v1/messages") { |r| r.body = body }
  else
    body = { model: model, max_tokens: 16, messages: probe }.to_json
    response = openai_connection.post("chat/completions") { |r| r.body = body }
  end
  handle_test_response(response)
rescue Faraday::Error => e
  # Transport-level failure (DNS, refused connection, timeout).
  { success: false, error: "Connection error: #{e.message}" }
rescue => e
  # Anything else is unexpected — log with backtrace context, report gracefully.
  Clacky::Logger.error("[test_connection] #{e.class}: #{e.message}", error: e)
  { success: false, error: e.message }
end