Module: Clacky::ModelPricing

Defined in:
lib/clacky/utils/model_pricing.rb

Overview

Module for handling AI model pricing. Supports different pricing tiers and prompt caching.

Constant Summary collapse

PRICING_TABLE =

Pricing per 1M tokens (MTok) in USD. All pricing is based on official API documentation.

{
  # Claude 4.5 models - tiered pricing based on prompt length
  "claude-opus-4.5" => {
    input: {
      default: 5.00,              # $5/MTok for prompts ≤ 200K tokens
      over_200k: 5.00             # same for all tiers
    },
    output: {
      default: 25.00,             # $25/MTok for prompts ≤ 200K tokens
      over_200k: 25.00            # same for all tiers
    },
    cache: {
      write: 6.25,                # $6.25/MTok cache write
      read: 0.50                  # $0.50/MTok cache read
    }
  },
  
  "claude-sonnet-4.5" => {
    input: {
      default: 3.00,              # $3/MTok for prompts ≤ 200K tokens
      over_200k: 6.00             # $6/MTok for prompts > 200K tokens
    },
    output: {
      default: 15.00,             # $15/MTok for prompts ≤ 200K tokens
      over_200k: 22.50            # $22.50/MTok for prompts > 200K tokens
    },
    cache: {
      write_default: 3.75,        # $3.75/MTok cache write (≤ 200K)
      write_over_200k: 7.50,      # $7.50/MTok cache write (> 200K)
      read_default: 0.30,         # $0.30/MTok cache read (≤ 200K)
      read_over_200k: 0.60        # $0.60/MTok cache read (> 200K)
    }
  },
  
  "claude-haiku-4.5" => {
    input: {
      default: 1.00,              # $1/MTok
      over_200k: 1.00             # same for all tiers
    },
    output: {
      default: 5.00,              # $5/MTok
      over_200k: 5.00             # same for all tiers
    },
    cache: {
      write: 1.25,                # $1.25/MTok cache write
      read: 0.10                  # $0.10/MTok cache read
    }
  },

  # Claude 3.5 models (for backwards compatibility)
  # Rates mirror claude-sonnet-4.5 / claude-haiku-4.5 above.
  "claude-3-5-sonnet-20241022" => {
    input: {
      default: 3.00,
      over_200k: 6.00
    },
    output: {
      default: 15.00,
      over_200k: 22.50
    },
    cache: {
      write_default: 3.75,
      write_over_200k: 7.50,
      read_default: 0.30,
      read_over_200k: 0.60
    }
  },

  "claude-3-5-sonnet-20240620" => {
    input: {
      default: 3.00,
      over_200k: 6.00
    },
    output: {
      default: 15.00,
      over_200k: 22.50
    },
    cache: {
      write_default: 3.75,
      write_over_200k: 7.50,
      read_default: 0.30,
      read_over_200k: 0.60
    }
  },

  "claude-3-5-haiku-20241022" => {
    input: {
      default: 1.00,
      over_200k: 1.00
    },
    output: {
      default: 5.00,
      over_200k: 5.00
    },
    cache: {
      write: 1.25,
      read: 0.10
    }
  },

  # DeepSeek V4 models
  # Source: https://api-docs.deepseek.com/quick_start/pricing (USD / 1M tokens)
  # DeepSeek billing model:
  #   - "cache miss input" = regular prompt_tokens rate
  #   - "cache hit input"  = cache_read rate (DeepSeek has no separate cache-write charge)
  #   - No tiered pricing (single rate regardless of context length)
  "deepseek-v4-flash" => {
    input: {
      default: 0.14,                  # $0.14/MTok cache miss
      over_200k: 0.14                 # no tiered pricing
    },
    output: {
      default: 0.28,                  # $0.28/MTok
      over_200k: 0.28
    },
    cache: {
      write: 0.14,                    # DeepSeek doesn't charge extra for writes; bill at miss rate
      read: 0.028                     # $0.028/MTok cache hit
    }
  },

  "deepseek-v4-pro" => {
    input: {
      default: 1.74,                  # $1.74/MTok cache miss
      over_200k: 1.74
    },
    output: {
      default: 3.48,                  # $3.48/MTok
      over_200k: 3.48
    },
    cache: {
      write: 1.74,                    # no separate write charge; bill at miss rate
      read: 0.145                     # $0.145/MTok cache hit
    }
  },

  # Kimi K2.5 / K2.6 multimodal models
  # Source: https://platform.moonshot.cn (USD / 1M tokens)
  # Kimi billing model (same shape as DeepSeek):
  #   - "cache miss input" = regular prompt_tokens rate
  #   - "cache hit input"  = cache_read rate (no separate cache-write charge)
  #   - No tiered pricing (single rate regardless of context length)
  "kimi-k2.5" => {
    input: {
      default: 0.60,                  # $0.60/MTok cache miss
      over_200k: 0.60                 # no tiered pricing
    },
    output: {
      default: 3.00,                  # $3.00/MTok
      over_200k: 3.00
    },
    cache: {
      write: 0.60,                    # Kimi doesn't charge extra for writes; bill at miss rate
      read: 0.10                      # $0.10/MTok cache hit
    }
  },

  "kimi-k2.6" => {
    input: {
      default: 0.95,                  # $0.95/MTok cache miss
      over_200k: 0.95
    },
    output: {
      default: 4.00,                  # $4.00/MTok
      over_200k: 4.00
    },
    cache: {
      write: 0.95,                    # no separate write charge; bill at miss rate
      read: 0.16                      # $0.16/MTok cache hit
    }
  },

  # OpenAI GPT-5.5 / GPT-5.4 — breakpoint at 272K input tokens
  # Source: https://openai.com/api/pricing/ (USD / 1M tokens)
  # Note: OpenAI's actual tiered-pricing threshold is 272K, not the
  # global 200K below.  Prompts between 200K–272K will slightly
  # over-estimate costs until a per-model threshold is implemented.
  "gpt-5.5" => {
    input: {
      default: 5.00,              # $5/MTok for prompts ≤ 272K tokens
      over_200k: 10.00            # $10/MTok for prompts > 272K tokens
    },
    output: {
      default: 30.00,             # $30/MTok for prompts ≤ 272K tokens
      over_200k: 45.00            # $45/MTok for prompts > 272K tokens
    },
    cache: {
      write_default: 5.00,        # $5/MTok cache write (≤ 272K)
      write_over_200k: 10.00,     # $10/MTok cache write (> 272K)
      read_default: 0.50,         # $0.50/MTok cache read (≤ 272K)
      read_over_200k: 1.00        # $1.00/MTok cache read (> 272K)
    }
  },

  "gpt-5.4" => {
    input: {
      default: 2.50,              # $2.50/MTok for prompts ≤ 272K tokens
      over_200k: 5.00             # $5/MTok for prompts > 272K tokens
    },
    output: {
      default: 15.00,             # $15/MTok for prompts ≤ 272K tokens
      over_200k: 22.50            # $22.50/MTok for prompts > 272K tokens
    },
    cache: {
      write_default: 2.50,        # $2.50/MTok cache write (≤ 272K)
      write_over_200k: 5.00,      # $5/MTok cache write (> 272K)
      read_default: 0.25,         # $0.25/MTok cache read (≤ 272K)
      read_over_200k: 0.50        # $0.50/MTok cache read (> 272K)
    }
  },

  # GPT-5.4 flat-rate models (no breakpoint, single rate regardless of context)
  "gpt-5.4-mini" => {
    input: {
      default: 0.75,              # $0.75/MTok
      over_200k: 0.75
    },
    output: {
      default: 4.50,              # $4.50/MTok
      over_200k: 4.50
    },
    cache: {
      write: 0.75,                # $0.75/MTok cache write
      read: 0.075                 # $0.075/MTok cache read (10% of input)
    }
  },

  "gpt-5.4-nano" => {
    input: {
      default: 0.20,              # $0.20/MTok
      over_200k: 0.20
    },
    output: {
      default: 1.25,              # $1.25/MTok
      over_200k: 1.25
    },
    cache: {
      write: 0.20,                # $0.20/MTok cache write
      read: 0.02                  # $0.02/MTok cache read (10% of input)
    }
  },

  # O-series reasoning models — flat-rate (200K context window)
  # Source: https://openai.com/api/pricing/
  "o3" => {
    input: {
      default: 2.00,              # $2/MTok
      over_200k: 2.00             # flat rate
    },
    output: {
      default: 8.00,              # $8/MTok
      over_200k: 8.00
    },
    cache: {
      write: 2.00,                # $2/MTok cache write (same as input)
      read: 0.50                  # $0.50/MTok cache read (25% of input)
    }
  },

  "o4-mini" => {
    input: {
      default: 1.10,              # $1.10/MTok
      over_200k: 1.10             # flat rate
    },
    output: {
      default: 4.40,              # $4.40/MTok
      over_200k: 4.40
    },
    cache: {
      write: 1.10,                # $1.10/MTok cache write (same as input)
      read: 0.275                 # $0.275/MTok cache read (25% of input)
    }
  },

  # GLM (Zhipu / Z.ai) — USD per 1M tokens.
  # Source: https://docs.z.ai/guides/overview/pricing (Z.ai international).
  # Pricing policy: we always bill at the Z.ai international flat rate,
  # regardless of which endpoint (mainland bigmodel.cn vs intl z.ai) the
  # user configured. Rationale:
  #   1. Mainland GLM uses tiered pricing (≤32K / >32K / >128K) where the
  #      >32K tier is hit by the vast majority of real requests, and is
  #      actually a few RMB cheaper than Z.ai's flat rate — displaying the
  #      (slightly higher) Z.ai rate gives users a "displayed ≤ actual"
  #      experience which is psychologically safer than the reverse.
  #   2. Single flat rate keeps the table shape consistent with every
  #      other provider here (no special-case tier logic for just GLM).
  # Cache-write: same convention as DeepSeek/Kimi — OpenAI-compatible
  # endpoints don't charge separately for cache writes (Z.ai's page lists
  # "Cached Input Storage: Limited-time Free"), so bill writes at the
  # regular input miss rate for safe "displayed ≤ actual" behaviour.
  "glm-5.1" => {
    input:  { default: 1.40, over_200k: 1.40 },
    output: { default: 4.40, over_200k: 4.40 },
    cache:  { write: 1.40, read: 0.26 }
  },

  "glm-5" => {
    input:  { default: 1.00, over_200k: 1.00 },
    output: { default: 3.20, over_200k: 3.20 },
    cache:  { write: 1.00, read: 0.20 }
  },

  "glm-5-turbo" => {
    input:  { default: 1.20, over_200k: 1.20 },
    output: { default: 4.00, over_200k: 4.00 },
    cache:  { write: 1.20, read: 0.24 }
  },

  # GLM-5V-Turbo is the multimodal sibling of GLM-5-Turbo (vision capable,
  # see providers.rb model_capabilities override). Same input/output rate
  # as 5-Turbo per Z.ai's Vision Models table.
  "glm-5v-turbo" => {
    input:  { default: 1.20, over_200k: 1.20 },
    output: { default: 4.00, over_200k: 4.00 },
    cache:  { write: 1.20, read: 0.24 }
  },

  "glm-4.7" => {
    input:  { default: 0.60, over_200k: 0.60 },
    output: { default: 2.20, over_200k: 2.20 },
    cache:  { write: 0.60, read: 0.11 }
  },

  # MiniMax — USD per 1M tokens.
  # Source: https://platform.minimaxi.com (Pay-as-You-Go).
  # MiniMax pricing is identical across mainland (.com) and international
  # (.io) endpoints, verified by the team. Same cache-write convention as
  # DeepSeek/Kimi/GLM: bill writes at the input miss rate (OpenAI-compatible
  # usage responses from MiniMax don't reliably carry a separate
  # cache_creation_input_tokens field, so a distinct write rate would be
  # dead code in practice).
  # Note: providers.rb uses the capitalised "MiniMax-M2.x" model id, but
  # the pricing table keys are lowercased to stay consistent with the
  # rest of this file; normalize_model_name() lowercases incoming model
  # names before lookup.
  "minimax-m2.5" => {
    input:  { default: 0.30, over_200k: 0.30 },
    output: { default: 1.20, over_200k: 1.20 },
    cache:  { write: 0.30, read: 0.03 }
  },

  "minimax-m2.7" => {
    input:  { default: 0.30, over_200k: 0.30 },
    output: { default: 1.20, over_200k: 1.20 },
    cache:  { write: 0.30, read: 0.06 }
  },

}.freeze
TIERED_PRICING_THRESHOLD =

Threshold for tiered pricing (200K tokens). NOTE: OpenAI GPT-5.5/GPT-5.4 use a 272K breakpoint, not 200K, so costs for prompts between 200K and 272K will be slightly over-estimated.

200_000

Class Method Summary collapse

Class Method Details

.calculate_cache_cost(pricing:, cache_write_tokens:, cache_read_tokens:, over_threshold:) ⇒ Object

Calculate cache-related costs



538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
# File 'lib/clacky/utils/model_pricing.rb', line 538

# Compute the USD cost attributable to prompt caching (writes + reads).
#
# @param pricing [Hash] pricing entry from PRICING_TABLE (must contain :cache)
# @param cache_write_tokens [Integer] tokens written to the prompt cache
# @param cache_read_tokens [Integer] tokens served from the prompt cache
# @param over_threshold [Boolean] whether the request falls in the >200K tier
# @return [Float] combined cache write + read cost in USD
def calculate_cache_cost(pricing:, cache_write_tokens:, cache_read_tokens:, over_threshold:)
  rates = pricing[:cache]

  # Pick the applicable per-MTok rate for one cache operation.
  # Flat-rate models carry a single :write / :read key; tiered models
  # split each into _default / _over_200k variants.
  rate_for = lambda do |flat_key, tiered_default, tiered_over|
    if rates.key?(flat_key)
      rates[flat_key]
    elsif over_threshold
      rates[tiered_over]
    else
      rates[tiered_default]
    end
  end

  total = 0.0

  if cache_write_tokens > 0
    total += (cache_write_tokens / 1_000_000.0) *
             rate_for.call(:write, :write_default, :write_over_200k)
  end

  if cache_read_tokens > 0
    total += (cache_read_tokens / 1_000_000.0) *
             rate_for.call(:read, :read_default, :read_over_200k)
  end

  total
end

.calculate_cost(model:, usage:) ⇒ Hash

Calculate cost for the given model and usage

Parameters:

  • model (String)

    Model identifier

  • usage (Hash)

    Usage statistics containing:

    • prompt_tokens: number of input tokens

    • completion_tokens: number of output tokens

    • cache_creation_input_tokens: tokens written to cache (optional)

    • cache_read_input_tokens: tokens read from cache (optional)

Returns:

  • (Hash)

    Hash containing:

    • cost: Cost in USD (Float) or nil if model pricing is unknown

    • source: Cost source (:price) or nil if unknown (Symbol or nil)



374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
# File 'lib/clacky/utils/model_pricing.rb', line 374

# Calculate cost for the given model and usage.
#
# @param model [String] model identifier
# @param usage [Hash] usage statistics containing:
#   - :prompt_tokens — number of input tokens
#   - :completion_tokens — number of output tokens
#   - :cache_creation_input_tokens — tokens written to cache (optional)
#   - :cache_read_input_tokens — tokens read from cache (optional)
# @return [Hash] containing:
#   - :cost — cost in USD (Float) or nil if model pricing is unknown
#   - :source — cost source (:price) or nil if unknown
def calculate_cost(model:, usage:)
  pricing_result = get_pricing_with_source(model)
  pricing = pricing_result[:pricing]
  source = pricing_result[:source]

  # If no pricing table matches this model, return nil cost.
  # Unknown models should display as N/A, never fall back to guesses.
  return { cost: nil, source: nil } unless pricing

  prompt_tokens = usage[:prompt_tokens] || 0
  completion_tokens = usage[:completion_tokens] || 0
  cache_write_tokens = usage[:cache_creation_input_tokens] || 0
  cache_read_tokens = usage[:cache_read_input_tokens] || 0

  # Determine if we're in the over_200k tier.
  # Note: prompt_tokens includes cache_read_tokens but NOT cache_write_tokens;
  # cache_write_tokens are additional tokens that were written to cache.
  total_input_tokens = prompt_tokens + cache_write_tokens
  over_threshold = total_input_tokens > TIERED_PRICING_THRESHOLD

  # Regular (non-cached) input tokens: prompt_tokens already includes
  # cache_read_tokens, so subtract them; cache_write_tokens are billed
  # separately via calculate_cache_cost. Clamp at zero so a provider
  # whose usage payload reports cache reads *outside* prompt_tokens
  # can never yield a negative input cost.
  regular_input_tokens = [prompt_tokens - cache_read_tokens, 0].max
  input_rate = over_threshold ? pricing[:input][:over_200k] : pricing[:input][:default]
  input_cost = (regular_input_tokens / 1_000_000.0) * input_rate

  # Calculate output cost
  output_rate = over_threshold ? pricing[:output][:over_200k] : pricing[:output][:default]
  output_cost = (completion_tokens / 1_000_000.0) * output_rate

  # Calculate cache write/read costs
  cache_cost = calculate_cache_cost(
    pricing: pricing,
    cache_write_tokens: cache_write_tokens,
    cache_read_tokens: cache_read_tokens,
    over_threshold: over_threshold
  )

  {
    cost: input_cost + output_cost + cache_cost,
    source: source
  }
end

.get_pricing(model) ⇒ Hash

Get pricing for a specific model. Returns nil when the model is unknown — there is no fallback/default pricing.

Parameters:

  • model (String)

    Model identifier

Returns:

  • (Hash)

    Pricing structure for the model



424
425
426
# File 'lib/clacky/utils/model_pricing.rb', line 424

# Look up the pricing entry for a model.
#
# @param model [String] model identifier
# @return [Hash, nil] pricing structure, or nil when the model is unknown
def get_pricing(model)
  result = get_pricing_with_source(model)
  result.fetch(:pricing)
end

.get_pricing_with_source(model) ⇒ Hash

Get pricing with source information

Parameters:

  • model (String)

    Model identifier

Returns:

  • (Hash)

    Hash containing:

    • pricing: Pricing structure or nil if model is unknown

    • source: :price (matched) or nil (unknown)



434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
# File 'lib/clacky/utils/model_pricing.rb', line 434

# Resolve pricing plus a source tag for a model.
#
# @param model [String] model identifier
# @return [Hash] containing:
#   - :pricing — pricing structure, or nil if the model is unknown
#   - :source — :price when matched, nil when unknown
def get_pricing_with_source(model)
  # Normalize model name (remove version suffixes, handle variations);
  # nil means no pricing table entry exists — cost is unknown.
  key = normalize_model_name(model)
  return { pricing: nil, source: nil } if key.nil?

  { pricing: PRICING_TABLE[key], source: :price }
end

.normalize_model_name(model) ⇒ Object

Normalize model name to match pricing table keys. Returns the canonical key on match, or nil when no pricing is available.



453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
# File 'lib/clacky/utils/model_pricing.rb', line 453

# Normalize a model name to its canonical pricing-table key.
#
# Matching strategy:
#   1. Exact (lowercased, stripped) match against PRICING_TABLE keys.
#   2. Pattern match for known families/aliases. Claude patterns are
#      deliberately fuzzy (to tolerate Bedrock cross-region prefixes such
#      as "jp.anthropic.claude-sonnet-4-6"); every other pattern is
#      strictly anchored so unrelated ids never borrow a nearby rate.
#
# Anchored patterns use \A/\z rather than ^/$: in Ruby, ^/$ match at
# every line boundary, so a multi-line string such as "junk\ngpt-5.4"
# would otherwise slip past a "strict" match.
#
# @param model [String, nil] raw model identifier as reported by the API
# @return [String, nil] canonical PRICING_TABLE key, or nil when no
#   pricing is available (cost will display as N/A)
def normalize_model_name(model)
  return nil if model.nil? || model.empty?

  model = model.downcase.strip

  # Direct match
  return model if PRICING_TABLE.key?(model)

  case model
  # Claude model variations — support both dot and dash separators
  # (e.g. "4.5", "4-5", "4-6") and Bedrock cross-region prefixes.
  when /claude.*opus.*4[.-]?[5-9]/i
    "claude-opus-4.5"
  when /claude.*sonnet.*4[.-]?[5-9]/i
    "claude-sonnet-4.5"
  when /claude.*haiku.*4[.-]?[5-9]/i
    "claude-haiku-4.5"
  when /claude-3-5-sonnet-20241022/i
    "claude-3-5-sonnet-20241022"
  when /claude-3-5-sonnet-20240620/i
    "claude-3-5-sonnet-20240620"
  when /claude-3-5-haiku-20241022/i
    "claude-3-5-haiku-20241022"
  # DeepSeek — the fuzzy pattern subsumes the literal id.
  when /deepseek.*v4.*pro/i
    "deepseek-v4-pro"
  when /deepseek.*v4.*flash/i
    "deepseek-v4-flash"
  # Legacy aliases: deepseek-chat and deepseek-reasoner are being
  # deprecated on 2026-07-24 and map to deepseek-v4-flash's
  # non-thinking / thinking modes respectively. Bill at flash rates.
  when /\Adeepseek-chat\z/i, /\Adeepseek-reasoner\z/i
    "deepseek-v4-flash"
  # Kimi K2.5 / K2.6 — strict match only. K2 text-only models
  # (kimi-k2-0905-preview, kimi-k2-thinking, etc.) are not yet
  # registered in providers.rb and will be added in a follow-up
  # issue together with their model_capabilities overrides.
  when /\Akimi-k2\.?5\z/i
    "kimi-k2.5"
  when /\Akimi-k2\.?6\z/i
    "kimi-k2.6"
  # GLM (Zhipu / Z.ai) — the five models registered in providers.rb.
  # Strict anchored match so unrelated strings like "glm-5-x-foo"
  # don't silently borrow a nearby model's rate.
  when /\Aglm-5\.1\z/i
    "glm-5.1"
  when /\Aglm-5v-turbo\z/i
    "glm-5v-turbo"
  when /\Aglm-5-turbo\z/i
    "glm-5-turbo"
  when /\Aglm-5\z/i
    "glm-5"
  when /\Aglm-4\.7\z/i
    "glm-4.7"
  # MiniMax — providers.rb uses capitalised "MiniMax-M2.x"; we match
  # case-insensitively and map to the lowercased table key.
  when /\Aminimax-m2\.5\z/i
    "minimax-m2.5"
  when /\Aminimax-m2\.7\z/i
    "minimax-m2.7"
  # OpenAI GPT-5.x — accept dashed/dotted/compact separator forms
  # ("gpt-5.5", "gpt-5-5", "gpt5.5", "gpt55") and nothing looser:
  # the previous [^.]* wildcard wrongly mapped unrelated ids such as
  # "gpt-5.4-turbo-mini" to the mini rate.
  when /\Agpt-?5[.-]?5\z/i
    "gpt-5.5"
  when /\Agpt-?5[.-]?4[.-]?mini\z/i
    "gpt-5.4-mini"
  when /\Agpt-?5[.-]?4[.-]?nano\z/i
    "gpt-5.4-nano"
  when /\Agpt-?5[.-]?4\z/i
    "gpt-5.4"
  # O-series reasoning models
  when /\Ao4[.-]?mini\z/i
    "o4-mini"
  when /\Ao3\z/i
    "o3"
  else
    nil # No pricing available for this model — cost will show as N/A
  end
end