Module: Clacky::ModelPricing

Defined in:: lib/clacky/utils/model_pricing.rb

Overview

Module for handling AI model pricing Supports different pricing tiers and prompt caching

Constant Summary collapse

PRICING_TABLE = Pricing per 1M tokens (MTok) in USD All pricing is based on official API documentation

{
  # Claude 4.5 models - tiered pricing based on prompt length
  "claude-opus-4.5" => {
    input: {
      default: 5.00,              # $5/MTok for prompts ≤ 200K tokens
      over_200k: 5.00             # same for all tiers
    },
    output: {
      default: 25.00,             # $25/MTok for prompts ≤ 200K tokens
      over_200k: 25.00            # same for all tiers
    },
    cache: {
      write: 6.25,                # $6.25/MTok cache write
      read: 0.50                  # $0.50/MTok cache read
    }
  },

  "claude-sonnet-4.5" => {
    input: {
      default: 3.00,              # $3/MTok for prompts ≤ 200K tokens
      over_200k: 6.00             # $6/MTok for prompts > 200K tokens
    },
    output: {
      default: 15.00,             # $15/MTok for prompts ≤ 200K tokens
      over_200k: 22.50            # $22.50/MTok for prompts > 200K tokens
    },
    cache: {
      write_default: 3.75,        # $3.75/MTok cache write (≤ 200K)
      write_over_200k: 7.50,      # $7.50/MTok cache write (> 200K)
      read_default: 0.30,         # $0.30/MTok cache read (≤ 200K)
      read_over_200k: 0.60        # $0.60/MTok cache read (> 200K)
    }
  },

  "claude-haiku-4.5" => {
    input: {
      default: 1.00,              # $1/MTok
      over_200k: 1.00             # same for all tiers
    },
    output: {
      default: 5.00,              # $5/MTok
      over_200k: 5.00             # same for all tiers
    },
    cache: {
      write: 1.25,                # $1.25/MTok cache write
      read: 0.10                  # $0.10/MTok cache read
    }
  },

  # Claude 3.5 models (for backwards compatibility)
  "claude-3-5-sonnet-20241022" => {
    input: {
      default: 3.00,
      over_200k: 6.00
    },
    output: {
      default: 15.00,
      over_200k: 22.50
    },
    cache: {
      write_default: 3.75,
      write_over_200k: 7.50,
      read_default: 0.30,
      read_over_200k: 0.60
    }
  },

  "claude-3-5-sonnet-20240620" => {
    input: {
      default: 3.00,
      over_200k: 6.00
    },
    output: {
      default: 15.00,
      over_200k: 22.50
    },
    cache: {
      write_default: 3.75,
      write_over_200k: 7.50,
      read_default: 0.30,
      read_over_200k: 0.60
    }
  },

  "claude-3-5-haiku-20241022" => {
    input: {
      default: 1.00,
      over_200k: 1.00
    },
    output: {
      default: 5.00,
      over_200k: 5.00
    },
    cache: {
      write: 1.25,
      read: 0.10
    }
  },

  # DeepSeek V4 models
  # Source: https://api-docs.deepseek.com/quick_start/pricing (USD / 1M tokens)
  # DeepSeek billing model:
  #   - "cache miss input" = regular prompt_tokens rate
  #   - "cache hit input"  = cache_read rate (DeepSeek has no separate cache-write charge)
  #   - No tiered pricing (single rate regardless of context length)
  # Cache-hit prices are 1/10 of launch (global, permanent since 2026-04-26).
  # v4-pro is on a 75% off promo through 2026-05-31 23:59 CST; the same
  # numbers become the permanent price after that date (= original × 1/4),
  # so we bill at the discounted rates both before and after the cutover.
  "deepseek-v4-flash" => {
    input: {
      default: 0.14,                  # $0.14/MTok cache miss
      over_200k: 0.14                 # no tiered pricing
    },
    output: {
      default: 0.28,                  # $0.28/MTok
      over_200k: 0.28
    },
    cache: {
      write: 0.14,                    # DeepSeek doesn't charge extra for writes; bill at miss rate
      read: 0.0028                    # $0.0028/MTok cache hit
    }
  },

  "deepseek-v4-pro" => {
    input: {
      default: 0.435,                 # $0.435/MTok cache miss (75% off; permanent after 5/31)
      over_200k: 0.435
    },
    output: {
      default: 0.87,                  # $0.87/MTok (75% off; permanent after 5/31)
      over_200k: 0.87
    },
    cache: {
      write: 0.435,                   # no separate write charge; bill at miss rate
      read: 0.003625                  # $0.003625/MTok cache hit (1/10 × 75% off)
    }
  },

  # Xiaomi MiMo — USD per 1M tokens, international (海外) list price.
  # Source: https://platform.xiaomimimo.com/docs/zh-CN/price/pay-as-you-go
  # Effective 2026-05-27 (V2.5 launch price cut). Cache write is "limited-
  # time free" per Xiaomi's notice; per the project's "displayed ≤ actual"
  # convention we bill writes at the input-miss rate so that when the
  # promo ends users won't see a cost spike. Cache hits use the explicit
  # cache-hit rate.
  #
  # As of 2026-06-01, mimo-v2-pro/omni are forwarded to the V2.5 series
  # and billed at V2.5 rates; mimo-v2-pro mirrors mimo-v2.5-pro and
  # mimo-v2-omni mirrors mimo-v2.5. Both will be retired 2026-06-30.
  "mimo-v2.5-pro" => {
    input:  { default: 0.435,   over_200k: 0.435 },
    output: { default: 0.87,    over_200k: 0.87 },
    cache:  { write: 0.435,     read: 0.0036 }
  },

  "mimo-v2.5" => {
    input:  { default: 0.14,    over_200k: 0.14 },
    output: { default: 0.28,    over_200k: 0.28 },
    cache:  { write: 0.14,      read: 0.0028 }
  },

  "mimo-v2-pro" => {
    input:  { default: 0.435,   over_200k: 0.435 },
    output: { default: 0.87,    over_200k: 0.87 },
    cache:  { write: 0.435,     read: 0.0036 }
  },

  "mimo-v2-omni" => {
    input:  { default: 0.14,    over_200k: 0.14 },
    output: { default: 0.28,    over_200k: 0.28 },
    cache:  { write: 0.14,      read: 0.0028 }
  },

  "mimo-v2-flash" => {
    input:  { default: 0.10,    over_200k: 0.10 },
    output: { default: 0.30,    over_200k: 0.30 },
    cache:  { write: 0.10,      read: 0.01 }
  },

  # Kimi K2.5 / K2.6 multimodal models
  # Source: https://platform.moonshot.cn (USD / 1M tokens)
  # Kimi billing model (same shape as DeepSeek):
  #   - "cache miss input" = regular prompt_tokens rate
  #   - "cache hit input"  = cache_read rate (no separate cache-write charge)
  #   - No tiered pricing (single rate regardless of context length)
  "kimi-k2.5" => {
    input: {
      default: 0.60,                  # $0.60/MTok cache miss
      over_200k: 0.60                 # no tiered pricing
    },
    output: {
      default: 3.00,                  # $3.00/MTok
      over_200k: 3.00
    },
    cache: {
      write: 0.60,                    # Kimi doesn't charge extra for writes; bill at miss rate
      read: 0.10                      # $0.10/MTok cache hit
    }
  },

  "kimi-k2.6" => {
    input: {
      default: 0.95,                  # $0.95/MTok cache miss
      over_200k: 0.95
    },
    output: {
      default: 4.00,                  # $4.00/MTok
      over_200k: 4.00
    },
    cache: {
      write: 0.95,                    # no separate write charge; bill at miss rate
      read: 0.16                      # $0.16/MTok cache hit
    }
  },

  # Google Gemini 3 series (via Vertex AI). Tiered at 200K input tokens
  # for Pro; Flash has flat pricing.
  "gemini-3.1-pro" => {
    input: {
      default: 2.00,
      over_200k: 4.00
    },
    output: {
      default: 12.00,
      over_200k: 18.00
    },
    cache: {
      write: 2.00,
      read: 0.50
    }
  },

  "gemini-3-flash" => {
    input: {
      default: 0.50,
      over_200k: 0.50
    },
    output: {
      default: 3.00,
      over_200k: 3.00
    },
    cache: {
      write: 0.50,
      read: 0.05
    }
  },

  # OpenAI GPT-5.5 / GPT-5.4 — breakpoint at 272K input tokens
  # Source: https://openai.com/api/pricing/ (USD / 1M tokens)
  # Note: OpenAI's actual tiered-pricing threshold is 272K, not the
  # global 200K below.  Prompts between 200K–272K will slightly
  # over-estimate costs until a per-model threshold is implemented.
  "gpt-5.5" => {
    input: {
      default: 5.00,              # $5/MTok for prompts ≤ 272K tokens
      over_200k: 10.00            # $10/MTok for prompts > 272K tokens
    },
    output: {
      default: 30.00,             # $30/MTok for prompts ≤ 272K tokens
      over_200k: 45.00            # $45/MTok for prompts > 272K tokens
    },
    cache: {
      write_default: 5.00,        # $5/MTok cache write (≤ 272K)
      write_over_200k: 10.00,     # $10/MTok cache write (> 272K)
      read_default: 0.50,         # $0.50/MTok cache read (≤ 272K)
      read_over_200k: 1.00        # $1.00/MTok cache read (> 272K)
    }
  },

  "gpt-5.4" => {
    input: {
      default: 2.50,              # $2.50/MTok for prompts ≤ 272K tokens
      over_200k: 5.00             # $5/MTok for prompts > 272K tokens
    },
    output: {
      default: 15.00,             # $15/MTok for prompts ≤ 272K tokens
      over_200k: 22.50           # $22.50/MTok for prompts > 272K tokens
    },
    cache: {
      write_default: 2.50,        # $2.50/MTok cache write (≤ 272K)
      write_over_200k: 5.00,      # $5/MTok cache write (> 272K)
      read_default: 0.25,         # $0.25/MTok cache read (≤ 272K)
      read_over_200k: 0.50        # $0.50/MTok cache read (> 272K)
    }
  },

  # GPT-5.4 flat-rate models (no breakpoint, single rate regardless of context)
  "gpt-5.4-mini" => {
    input: {
      default: 0.75,              # $0.75/MTok
      over_200k: 0.75
    },
    output: {
      default: 4.50,              # $4.50/MTok
      over_200k: 4.50
    },
    cache: {
      write: 0.75,                # $0.75/MTok cache write
      read: 0.075                 # $0.075/MTok cache read (10% of input)
    }
  },

  "gpt-5.4-nano" => {
    input: {
      default: 0.20,              # $0.20/MTok
      over_200k: 0.20
    },
    output: {
      default: 1.25,              # $1.25/MTok
      over_200k: 1.25
    },
    cache: {
      write: 0.20,                # $0.20/MTok cache write
      read: 0.02                  # $0.02/MTok cache read (10% of input)
    }
  },

  # O-series reasoning models — flat-rate (200K context window)
  # Source: https://openai.com/api/pricing/
  "o3" => {
    input: {
      default: 2.00,              # $2/MTok
      over_200k: 2.00             # flat rate
    },
    output: {
      default: 8.00,              # $8/MTok
      over_200k: 8.00
    },
    cache: {
      write: 2.00,                # $2/MTok cache write (same as input)
      read: 0.50                  # $0.50/MTok cache read (25% of input)
    }
  },

  "o4-mini" => {
    input: {
      default: 1.10,              # $1.10/MTok
      over_200k: 1.10             # flat rate
    },
    output: {
      default: 4.40,              # $4.40/MTok
      over_200k: 4.40
    },
    cache: {
      write: 1.10,                # $1.10/MTok cache write (same as input)
      read: 0.275                 # $0.275/MTok cache read (25% of input)
    }
  },

  # GLM (Zhipu / Z.ai) — USD per 1M tokens.
  # Source: https://docs.z.ai/guides/overview/pricing (Z.ai international).
  # Pricing policy: we always bill at the Z.ai international flat rate,
  # regardless of which endpoint (mainland bigmodel.cn vs intl z.ai) the
  # user configured. Rationale:
  #   1. Mainland GLM uses tiered pricing (≤32K / >32K / >128K) where the
  #      >32K tier is hit by the vast majority of real requests, and is
  #      actually a few RMB cheaper than Z.ai's flat rate — displaying the
  #      (slightly higher) Z.ai rate gives users a "displayed ≤ actual"
  #      experience which is psychologically safer than the reverse.
  #   2. Single flat rate keeps the table shape consistent with every
  #      other provider here (no special-case tier logic for just GLM).
  # Cache-write: same convention as DeepSeek/Kimi — OpenAI-compatible
  # endpoints don't charge separately for cache writes (Z.ai's page lists
  # "Cached Input Storage: Limited-time Free"), so bill writes at the
  # regular input miss rate for safe "displayed ≤ actual" behaviour.
  "glm-5.1" => {
    input:  { default: 1.40, over_200k: 1.40 },
    output: { default: 4.40, over_200k: 4.40 },
    cache:  { write: 1.40, read: 0.26 }
  },

  "glm-5" => {
    input:  { default: 1.00, over_200k: 1.00 },
    output: { default: 3.20, over_200k: 3.20 },
    cache:  { write: 1.00, read: 0.20 }
  },

  "glm-5-turbo" => {
    input:  { default: 1.20, over_200k: 1.20 },
    output: { default: 4.00, over_200k: 4.00 },
    cache:  { write: 1.20, read: 0.24 }
  },

  # GLM-5V-Turbo is the multimodal sibling of GLM-5-Turbo (vision capable,
  # see providers.rb model_capabilities override). Same input/output rate
  # as 5-Turbo per Z.ai's Vision Models table.
  "glm-5v-turbo" => {
    input:  { default: 1.20, over_200k: 1.20 },
    output: { default: 4.00, over_200k: 4.00 },
    cache:  { write: 1.20, read: 0.24 }
  },

  "glm-4.7" => {
    input:  { default: 0.60, over_200k: 0.60 },
    output: { default: 2.20, over_200k: 2.20 },
    cache:  { write: 0.60, read: 0.11 }
  },

  # MiniMax — USD per 1M tokens.
  # Source: https://platform.minimaxi.com (Pay-as-You-Go).
  # MiniMax pricing is identical across mainland (.com) and international
  # (.io) endpoints, verified by the team. Same cache-write convention as
  # DeepSeek/Kimi/GLM: bill writes at the input miss rate (OpenAI-compatible
  # usage responses from MiniMax don't reliably carry a separate
  # cache_creation_input_tokens field, so a distinct write rate would be
  # dead code in practice).
  # Note: providers.rb uses the capitalised "MiniMax-M2.x" model id, but
  # the pricing table keys are lowercased to stay consistent with the
  # rest of this file; normalize_model_name() lowercases incoming model
  # names before lookup.
  "minimax-m2.5" => {
    input:  { default: 0.30, over_200k: 0.30 },
    output: { default: 1.20, over_200k: 1.20 },
    cache:  { write: 0.30, read: 0.03 }
  },

  # M3 (released 2026-06-01) is MiniMax's multimodal flagship. Official
  # pricing is tiered by context length (≤512K vs 512K–1M); per the
  # project's "displayed ≤ actual" convention we record only the lowest
  # (≤512K) tier as a flat rate — the global TIERED_PRICING_THRESHOLD is
  # 200K, so applying the 512K–1M rate to the 200K–512K band would over-
  # charge. Listed at original (non-promotional) prices: input $0.60,
  # output $2.40, cache read $0.12 per 1M tokens.
  "minimax-m3" => {
    input:  { default: 0.60, over_200k: 0.60 },
    output: { default: 2.40, over_200k: 2.40 },
    cache:  { write: 0.60, read: 0.12 }
  },

  "minimax-m2.7" => {
    input:  { default: 0.30, over_200k: 0.30 },
    output: { default: 1.20, over_200k: 1.20 },
    cache:  { write: 0.30, read: 0.06 }
  },

  # Qwen (Alibaba DashScope) - USD per 1M tokens, international (Singapore) list price.
  # Source: Alibaba Cloud Model Studio international console per-model pages.
  #
  # Pricing convention:
  #   - These rates are used for user-facing cost ESTIMATION, so we always use
  #     the standard LIST price and intentionally ignore any limited-time promo
  #     discounts. A promo lowers the user's actual bill, never raises it, so
  #     estimating at list price keeps the estimate a safe upper bound and avoids
  #     churn whenever a promo starts or ends.
  #   - We record the model's LOWEST context tier (e.g. input<=256k / <=128k) as a
  #     flat rate, since the global TIERED_PRICING_THRESHOLD is 200K and does not
  #     match Qwen's per-model breakpoints.
  #   - cache.write = official explicit-cache-create price.
  #   - cache.read  = official explicit-cache-hit price.
  #   - When a model has NO published explicit-cache price (e.g. qwen3.6-27b,
  #     qwen-plus-latest), cache.write/read fall back to the input rate.
  # qwen3.7-max: NOT tiered (single flat tier per Alibaba's definition).
  #   List price: input 2.5, output 7.5, explicit write 3.125, explicit read 0.25.
  "qwen3.7-max" => {
    input:  { default: 2.5, over_200k: 2.5 },
    output: { default: 7.5, over_200k: 7.5 },
    cache:  { write: 3.125, read: 0.25 }
  },

  # qwen3.7-plus: list price (<=256k tier):
  #   input 0.4, output 1.6, explicit write 0.5, explicit read 0.04.
  "qwen3.7-plus" => {
    input:  { default: 0.4, over_200k: 0.4 },
    output: { default: 1.6, over_200k: 1.6 },
    cache:  { write: 0.5, read: 0.04 }
  },

  # qwen3.6-plus: list price (<=256k tier). Official explicit-cache prices.
  #   input 0.50, output 3.00, explicit write 0.625, explicit read 0.05
  "qwen3.6-plus" => {
    input:  { default: 0.50, over_200k: 0.50 },
    output: { default: 3.00, over_200k: 3.00 },
    cache:  { write: 0.625, read: 0.05 }
  },

  # qwen3.6-max (qwen3.6-max-preview): list price (<=128k tier).
  #   input 1.30, output 7.80, explicit write 1.625, explicit read 0.13
  "qwen3.6-max" => {
    input:  { default: 1.30, over_200k: 1.30 },
    output: { default: 7.80, over_200k: 7.80 },
    cache:  { write: 1.625, read: 0.13 }
  },

  # qwen3.6-27b: list price, no explicit-cache pricing published.
  #   Cache write/read fall back to the input rate (no cache discount).
  "qwen3.6-27b" => {
    input:  { default: 0.60, over_200k: 0.60 },
    output: { default: 3.60, over_200k: 3.60 },
    cache:  { write: 0.60, read: 0.60 }
  },

  # qwen3.6-flash: list price (<=256k tier).
  #   input 0.25, output 1.50, explicit write 0.3125, explicit read 0.025
  "qwen3.6-flash" => {
    input:  { default: 0.25, over_200k: 0.25 },
    output: { default: 1.50, over_200k: 1.50 },
    cache:  { write: 0.3125, read: 0.025 }
  },

  # qwen-plus-latest: list price (<=256k tier), no explicit-cache pricing.
  #   Cache write/read fall back to the input rate (no cache discount).
  "qwen-plus-latest" => {
    input:  { default: 0.40, over_200k: 0.40 },
    output: { default: 1.20, over_200k: 1.20 },
    cache:  { write: 0.40, read: 0.40 }
  },

  # qwen3-vl-plus: replaces the retiring qwen-vl-plus. List price
  #   (128k<input<=256k tier). input 0.60, output 4.80,
  #   explicit write 0.75, explicit read 0.06.
  "qwen3-vl-plus" => {
    input:  { default: 0.60, over_200k: 0.60 },
    output: { default: 4.80, over_200k: 4.80 },
    cache:  { write: 0.75, read: 0.06 }
  },

}.freeze

TIERED_PRICING_THRESHOLD = Threshold for tiered pricing (200K tokens) NOTE: OpenAI GPT-5.5/GPT-5.4 use a 272K breakpoint, not 200K. Costs for prompts between 200K–272K will be slightly over-estimated.

200_000

Class Method Summary collapse

.calculate_cache_cost(pricing:, cache_write_tokens:, cache_read_tokens:, over_threshold:) ⇒ Object

Calculate cache-related costs.
.calculate_cost(model:, usage:) ⇒ Hash

Calculate cost for the given model and usage.
.get_pricing(model) ⇒ Hash

Get pricing for a specific model Falls back to default pricing if model not found.
.get_pricing_with_source(model) ⇒ Hash

Get pricing with source information.
.normalize_model_name(model) ⇒ Object

Normalize model name to match pricing table keys.

Class Method Details

.calculate_cache_cost(pricing:, cache_write_tokens:, cache_read_tokens:, over_threshold:) ⇒ `Object`

Calculate cache-related costs

# File 'lib/clacky/utils/model_pricing.rb', line 756

def calculate_cache_cost(pricing:, cache_write_tokens:, cache_read_tokens:, over_threshold:)
  cache_cost = 0.0

  # Cache write cost
  if cache_write_tokens > 0
    write_rate = if pricing[:cache].key?(:write)
                   # Simple pricing (Opus 4.5, Haiku 4.5)
                   pricing[:cache][:write]
                 elsif over_threshold
                   # Tiered pricing (Sonnet 4.5)
                   pricing[:cache][:write_over_200k]
                 else
                   pricing[:cache][:write_default]
                 end

    cache_cost += (cache_write_tokens / 1_000_000.0) * write_rate
  end

  # Cache read cost
  if cache_read_tokens > 0
    read_rate = if pricing[:cache].key?(:read)
                  # Simple pricing (Opus 4.5, Haiku 4.5)
                  pricing[:cache][:read]
                elsif over_threshold
                  # Tiered pricing (Sonnet 4.5)
                  pricing[:cache][:read_over_200k]
                else
                  pricing[:cache][:read_default]
                end

    cache_cost += (cache_read_tokens / 1_000_000.0) * read_rate
  end

  cache_cost
end

.calculate_cost(model:, usage:) ⇒ `Hash`

Calculate cost for the given model and usage

Parameters:

model (String) —

Model identifier
usage (Hash) —
Usage statistics containing:
- prompt_tokens: number of input tokens
- completion_tokens: number of output tokens
- cache_creation_input_tokens: tokens written to cache (optional)
- cache_read_input_tokens: tokens read from cache (optional)

Returns:

(Hash) —
Hash containing:
- cost: Cost in USD (Float) or nil if model pricing is unknown
- source: Cost source (:price) or nil if unknown (Symbol or nil)

# File 'lib/clacky/utils/model_pricing.rb', line 545

def calculate_cost(model:, usage:)
  pricing_result = get_pricing_with_source(model)
  pricing = pricing_result[:pricing]
  source = pricing_result[:source]

  # If no pricing table matches this model, return nil cost.
  # Unknown models should display as N/A, never fall back to guesses.
  return { cost: nil, source: nil } unless pricing

  prompt_tokens = usage[:prompt_tokens] || 0
  completion_tokens = usage[:completion_tokens] || 0
  cache_write_tokens = usage[:cache_creation_input_tokens] || 0
  cache_read_tokens = usage[:cache_read_input_tokens] || 0

  # Determine if we're in the over_200k tier
  # Note: prompt_tokens includes cache_read_tokens but NOT cache_write_tokens
  # cache_write_tokens are additional tokens that were written to cache
  total_input_tokens = prompt_tokens + cache_write_tokens
  over_threshold = total_input_tokens > TIERED_PRICING_THRESHOLD

  # Calculate regular input cost (non-cached tokens)
  # prompt_tokens already includes cache_read_tokens, so we need to subtract them
  # cache_write_tokens are not part of prompt_tokens, so they're handled separately in cache_cost
  regular_input_tokens = prompt_tokens - cache_read_tokens
  input_rate = over_threshold ? pricing[:input][:over_200k] : pricing[:input][:default]
  input_cost = (regular_input_tokens / 1_000_000.0) * input_rate

  # Calculate output cost
  output_rate = over_threshold ? pricing[:output][:over_200k] : pricing[:output][:default]
  output_cost = (completion_tokens / 1_000_000.0) * output_rate

  # Calculate cache costs
  cache_cost = calculate_cache_cost(
    pricing: pricing,
    cache_write_tokens: cache_write_tokens,
    cache_read_tokens: cache_read_tokens,
    over_threshold: over_threshold
  )

  {
    cost: input_cost + output_cost + cache_cost,
    source: source
  }
end

.get_pricing(model) ⇒ `Hash`

Get pricing for a specific model Falls back to default pricing if model not found

Parameters:

model (String) —

Model identifier

Returns:

(Hash) —

Pricing structure for the model



595
596
597

# File 'lib/clacky/utils/model_pricing.rb', line 595

def get_pricing(model)
  get_pricing_with_source(model)[:pricing]
end

.get_pricing_with_source(model) ⇒ `Hash`

Get pricing with source information

Parameters:

model (String) —

Model identifier

Returns:

(Hash) —
Hash containing:
- pricing: Pricing structure or nil if model is unknown
- source: :price (matched) or nil (unknown)

# File 'lib/clacky/utils/model_pricing.rb', line 605

def get_pricing_with_source(model)
  # Normalize model name (remove version suffixes, handle variations)
  normalized_model = normalize_model_name(model)

  if normalized_model
    # Found specific pricing for this model
    {
      pricing: PRICING_TABLE[normalized_model],
      source: :price
    }
  else
    # No matching pricing table entry — cost is unknown
    { pricing: nil, source: nil }
  end
end

.normalize_model_name(model) ⇒ `Object`

Normalize model name to match pricing table keys. Returns the canonical key on match, or nil when no pricing is available.

# File 'lib/clacky/utils/model_pricing.rb', line 624

def normalize_model_name(model)
  return nil if model.nil? || model.empty?

  model = model.downcase.strip

  # Direct match
  return model if PRICING_TABLE.key?(model)

  # Check for Claude model variations
  # Support both dot and dash separators (e.g., "4.5", "4-5", "4-6")
  # Also handles Bedrock cross-region prefixes (e.g. "jp.anthropic.claude-sonnet-4-6")
  case model
  when /claude.*opus.*4[.-]?[5-9]/i
    "claude-opus-4.5"
  when /claude.*sonnet.*4[.-]?[5-9]/i
    "claude-sonnet-4.5"
  when /claude.*haiku.*4[.-]?[5-9]/i
    "claude-haiku-4.5"
  when /claude-3-5-sonnet-20241022/i
    "claude-3-5-sonnet-20241022"
  when /claude-3-5-sonnet-20240620/i
    "claude-3-5-sonnet-20240620"
  when /claude-3-5-haiku-20241022/i
    "claude-3-5-haiku-20241022"
  when /deepseek-v4-pro/i, /deepseek.*v4.*pro/i
    "deepseek-v4-pro"
  when /deepseek-v4-flash/i, /deepseek.*v4.*flash/i
    "deepseek-v4-flash"
  # Legacy aliases: deepseek-chat and deepseek-reasoner are being
  # deprecated on 2026-07-24 and map to deepseek-v4-flash's
  # non-thinking / thinking modes respectively. Bill at flash rates.
  when /^deepseek-chat$/i, /^deepseek-reasoner$/i
    "deepseek-v4-flash"
  # Xiaomi MiMo — strict anchored match per registered model id in
  # providers.rb (currently mimo-v2.5-pro / mimo-v2-pro / mimo-v2-omni).
  # mimo-v2.5 / mimo-v2-flash are also priced ahead of provider-side
  # registration. Per Xiaomi's 2026-06 schedule, mimo-v2-pro/omni are
  # transparently routed to V2.5 — keys are listed independently so
  # both old and new ids resolve to the right rate.
  when /^mimo-v2\.?5-pro$/i
    "mimo-v2.5-pro"
  when /^mimo-v2\.?5$/i
    "mimo-v2.5"
  when /^mimo-v2-pro$/i
    "mimo-v2-pro"
  when /^mimo-v2-omni$/i
    "mimo-v2-omni"
  when /^mimo-v2-flash$/i
    "mimo-v2-flash"
  # Kimi K2.5 / K2.6 — strict match only. K2 text-only models
  # (kimi-k2-0905-preview, kimi-k2-thinking, etc.) are not yet
  # registered in providers.rb and will be added in a follow-up
  # issue together with their model_capabilities overrides.
  when /^kimi-k2\.?5$/i
    "kimi-k2.5"
  when /^kimi-k2\.?6$/i
    "kimi-k2.6"
  # GLM (Zhipu / Z.ai) — the five models registered in providers.rb.
  # GLM-5V-Turbo is the vision variant; all five share the same Z.ai
  # international flat-rate pricing regardless of which endpoint
  # (mainland bigmodel.cn vs intl z.ai) the user configured.
  # Strict anchored match so unrelated strings like "glm-5-x-foo"
  # don't silently borrow a nearby model's rate.
  when /^glm-5\.1$/i
    "glm-5.1"
  when /^glm-5v-turbo$/i
    "glm-5v-turbo"
  when /^glm-5-turbo$/i
    "glm-5-turbo"
  when /^glm-5$/i
    "glm-5"
  when /^glm-4\.7$/i
    "glm-4.7"
  # MiniMax — model ids in providers.rb use capitalised "MiniMax-M2.x"
  # but we match case-insensitively and map to the lowercased table key.
  when /^minimax-m3$/i
    "minimax-m3"
  when /^minimax-m2\.5$/i
    "minimax-m2.5"
  when /^minimax-m2\.7$/i
    "minimax-m2.7"

  # Qwen (Alibaba DashScope) — strict anchored match per registered
  # model id in providers.rb. qwen3.7-* is the latest flagship line;
  # qwen3.6-* are the previous generation; qwen-plus-latest is the
  # rolling alias for the latest Qwen-Plus release; qwen3-vl-plus is
  # the multimodal SKU (replaces the retired qwen-vl-plus/max).
  when /^qwen3\.7-max$/i
    "qwen3.7-max"
  when /^qwen3\.7-plus$/i
    "qwen3.7-plus"
  when /^qwen3\.6-plus$/i
    "qwen3.6-plus"
  when /^qwen3\.6-max$/i
    "qwen3.6-max"
  when /^qwen3\.6-27b$/i
    "qwen3.6-27b"
  when /^qwen3\.6-flash$/i
    "qwen3.6-flash"
  when /^qwen-plus-latest$/i
    "qwen-plus-latest"
  when /^qwen3-vl-plus$/i
    "qwen3-vl-plus"

  # Google Gemini 3 series. Match the platform aliases (or-gemini-*)
  # and the bare upstream ids returned by Vertex.
  when /^or-gemini-3-1-pro$/i, /^gemini-3\.1-pro(-preview)?$/i
    "gemini-3.1-pro"
  when /^or-gemini-3-5-flash$/i, /^gemini-3\.5-flash$/i, /^gemini-3-flash(-preview)?$/i
    "gemini-3-flash"

  # OpenAI GPT-5.x models — match various dashed/dotted/compact forms
  # (e.g. "gpt-5.5", "gpt-5-5", "gpt5.5", "gpt55")
  when /^gpt-?5\.?5$/i, /^gpt-?5[\.-]?5$/i
    "gpt-5.5"
  when /^gpt-?5\.?4[^.]*mini$/i, /^gpt-?5\.?4[\.-]?mini$/i
    "gpt-5.4-mini"
  when /^gpt-?5\.?4[^.]*nano$/i, /^gpt-?5\.?4[\.-]?nano$/i
    "gpt-5.4-nano"
  when /^gpt-?5\.?4$/i, /^gpt-?5[\.-]?4$/i
    "gpt-5.4"
  # O-series reasoning models
  when /^o4[\.-]?mini$/i
    "o4-mini"
  when /^o3$/i
    "o3"
  else
    nil  # No pricing available for this model — cost will show as N/A
  end
end

Module: Clacky::ModelPricing

Overview

Constant Summary collapse

Class Method Summary collapse

Class Method Details

.calculate_cache_cost(pricing:, cache_write_tokens:, cache_read_tokens:, over_threshold:) ⇒ Object

.calculate_cost(model:, usage:) ⇒ Hash

.get_pricing(model) ⇒ Hash

.get_pricing_with_source(model) ⇒ Hash

.normalize_model_name(model) ⇒ Object

.calculate_cache_cost(pricing:, cache_write_tokens:, cache_read_tokens:, over_threshold:) ⇒ `Object`

.calculate_cost(model:, usage:) ⇒ `Hash`

.get_pricing(model) ⇒ `Hash`

.get_pricing_with_source(model) ⇒ `Hash`

.normalize_model_name(model) ⇒ `Object`