Module: Clacky::ModelPricing

Defined in:
lib/clacky/utils/model_pricing.rb

Overview

Module for handling AI model pricing. Supports different pricing tiers and prompt caching.

Constant Summary collapse

PRICING_TABLE =

Pricing per 1M tokens (MTok) in USD. All pricing is based on official API documentation.

{
  # Claude 4.5 models - tiered pricing based on prompt length
  "claude-opus-4.5" => {
    input: {
      default: 5.00,              # $5/MTok for prompts ≤ 200K tokens
      over_200k: 5.00             # same for all tiers
    },
    output: {
      default: 25.00,             # $25/MTok for prompts ≤ 200K tokens
      over_200k: 25.00            # same for all tiers
    },
    cache: {
      write: 6.25,                # $6.25/MTok cache write
      read: 0.50                  # $0.50/MTok cache read
    }
  },

  "claude-sonnet-4.5" => {
    input: {
      default: 3.00,              # $3/MTok for prompts ≤ 200K tokens
      over_200k: 6.00             # $6/MTok for prompts > 200K tokens
    },
    output: {
      default: 15.00,             # $15/MTok for prompts ≤ 200K tokens
      over_200k: 22.50            # $22.50/MTok for prompts > 200K tokens
    },
    cache: {
      write_default: 3.75,        # $3.75/MTok cache write (≤ 200K)
      write_over_200k: 7.50,      # $7.50/MTok cache write (> 200K)
      read_default: 0.30,         # $0.30/MTok cache read (≤ 200K)
      read_over_200k: 0.60        # $0.60/MTok cache read (> 200K)
    }
  },

  "claude-haiku-4.5" => {
    input: {
      default: 1.00,              # $1/MTok
      over_200k: 1.00             # same for all tiers
    },
    output: {
      default: 5.00,              # $5/MTok
      over_200k: 5.00             # same for all tiers
    },
    cache: {
      write: 1.25,                # $1.25/MTok cache write
      read: 0.10                  # $0.10/MTok cache read
    }
  },

  # Claude 3.5 models (for backwards compatibility)
  "claude-3-5-sonnet-20241022" => {
    input: {
      default: 3.00,
      over_200k: 6.00
    },
    output: {
      default: 15.00,
      over_200k: 22.50
    },
    cache: {
      write_default: 3.75,
      write_over_200k: 7.50,
      read_default: 0.30,
      read_over_200k: 0.60
    }
  },

  "claude-3-5-sonnet-20240620" => {
    input: {
      default: 3.00,
      over_200k: 6.00
    },
    output: {
      default: 15.00,
      over_200k: 22.50
    },
    cache: {
      write_default: 3.75,
      write_over_200k: 7.50,
      read_default: 0.30,
      read_over_200k: 0.60
    }
  },

  "claude-3-5-haiku-20241022" => {
    input: {
      default: 1.00,
      over_200k: 1.00
    },
    output: {
      default: 5.00,
      over_200k: 5.00
    },
    cache: {
      write: 1.25,
      read: 0.10
    }
  },

  # DeepSeek V4 models
  # Source: https://api-docs.deepseek.com/quick_start/pricing (USD / 1M tokens)
  # DeepSeek billing model:
  #   - "cache miss input" = regular prompt_tokens rate
  #   - "cache hit input"  = cache_read rate (DeepSeek has no separate cache-write charge)
  #   - No tiered pricing (single rate regardless of context length)
  # Cache-hit prices are 1/10 of launch (global, permanent since 2026-04-26).
  # v4-pro is on a 75% off promo through 2026-05-31 23:59 CST; the same
  # numbers become the permanent price after that date (= original × 1/4),
  # so we bill at the discounted rates both before and after the cutover.
  "deepseek-v4-flash" => {
    input: {
      default: 0.14,                  # $0.14/MTok cache miss
      over_200k: 0.14                 # no tiered pricing
    },
    output: {
      default: 0.28,                  # $0.28/MTok
      over_200k: 0.28
    },
    cache: {
      write: 0.14,                    # DeepSeek doesn't charge extra for writes; bill at miss rate
      read: 0.0028                    # $0.0028/MTok cache hit
    }
  },

  "deepseek-v4-pro" => {
    input: {
      default: 0.435,                 # $0.435/MTok cache miss (75% off; permanent after 5/31)
      over_200k: 0.435
    },
    output: {
      default: 0.87,                  # $0.87/MTok (75% off; permanent after 5/31)
      over_200k: 0.87
    },
    cache: {
      write: 0.435,                   # no separate write charge; bill at miss rate
      read: 0.003625                  # $0.003625/MTok cache hit (1/10 × 75% off)
    }
  },

  # Xiaomi MiMo — USD per 1M tokens, international (海外) list price.
  # Source: https://platform.xiaomimimo.com/docs/zh-CN/price/pay-as-you-go
  # Effective 2026-05-27 (V2.5 launch price cut). Cache write is "limited-
  # time free" per Xiaomi's notice; per the project's "displayed ≤ actual"
  # convention we bill writes at the input-miss rate so that when the
  # promo ends users won't see a cost spike. Cache hits use the explicit
  # cache-hit rate.
  #
  # As of 2026-06-01, mimo-v2-pro/omni are forwarded to the V2.5 series
  # and billed at V2.5 rates; mimo-v2-pro mirrors mimo-v2.5-pro and
  # mimo-v2-omni mirrors mimo-v2.5. Both will be retired 2026-06-30.
  "mimo-v2.5-pro" => {
    input:  { default: 0.435,   over_200k: 0.435 },
    output: { default: 0.87,    over_200k: 0.87 },
    cache:  { write: 0.435,     read: 0.0036 }
  },

  "mimo-v2.5" => {
    input:  { default: 0.14,    over_200k: 0.14 },
    output: { default: 0.28,    over_200k: 0.28 },
    cache:  { write: 0.14,      read: 0.0028 }
  },

  "mimo-v2-pro" => {
    input:  { default: 0.435,   over_200k: 0.435 },
    output: { default: 0.87,    over_200k: 0.87 },
    cache:  { write: 0.435,     read: 0.0036 }
  },

  "mimo-v2-omni" => {
    input:  { default: 0.14,    over_200k: 0.14 },
    output: { default: 0.28,    over_200k: 0.28 },
    cache:  { write: 0.14,      read: 0.0028 }
  },

  "mimo-v2-flash" => {
    input:  { default: 0.10,    over_200k: 0.10 },
    output: { default: 0.30,    over_200k: 0.30 },
    cache:  { write: 0.10,      read: 0.01 }
  },

  # Kimi K2.5 / K2.6 multimodal models
  # Source: https://platform.moonshot.cn (USD / 1M tokens)
  # Kimi billing model (same shape as DeepSeek):
  #   - "cache miss input" = regular prompt_tokens rate
  #   - "cache hit input"  = cache_read rate (no separate cache-write charge)
  #   - No tiered pricing (single rate regardless of context length)
  "kimi-k2.5" => {
    input: {
      default: 0.60,                  # $0.60/MTok cache miss
      over_200k: 0.60                 # no tiered pricing
    },
    output: {
      default: 3.00,                  # $3.00/MTok
      over_200k: 3.00
    },
    cache: {
      write: 0.60,                    # Kimi doesn't charge extra for writes; bill at miss rate
      read: 0.10                      # $0.10/MTok cache hit
    }
  },

  "kimi-k2.6" => {
    input: {
      default: 0.95,                  # $0.95/MTok cache miss
      over_200k: 0.95
    },
    output: {
      default: 4.00,                  # $4.00/MTok
      over_200k: 4.00
    },
    cache: {
      write: 0.95,                    # no separate write charge; bill at miss rate
      read: 0.16                      # $0.16/MTok cache hit
    }
  },

  # Google Gemini 3 series (via Vertex AI). Tiered at 200K input tokens
  # for Pro; Flash has flat pricing.
  "gemini-3.1-pro" => {
    input: {
      default: 2.00,
      over_200k: 4.00
    },
    output: {
      default: 12.00,
      over_200k: 18.00
    },
    cache: {
      write: 2.00,
      read: 0.50
    }
  },

  "gemini-3-flash" => {
    input: {
      default: 0.50,
      over_200k: 0.50
    },
    output: {
      default: 3.00,
      over_200k: 3.00
    },
    cache: {
      write: 0.50,
      read: 0.05
    }
  },

  # OpenAI GPT-5.5 / GPT-5.4 — breakpoint at 272K input tokens
  # Source: https://openai.com/api/pricing/ (USD / 1M tokens)
  # Note: OpenAI's actual tiered-pricing threshold is 272K, not the
  # global 200K below.  Prompts between 200K–272K will slightly
  # over-estimate costs until a per-model threshold is implemented.
  "gpt-5.5" => {
    input: {
      default: 5.00,              # $5/MTok for prompts ≤ 272K tokens
      over_200k: 10.00            # $10/MTok for prompts > 272K tokens
    },
    output: {
      default: 30.00,             # $30/MTok for prompts ≤ 272K tokens
      over_200k: 45.00            # $45/MTok for prompts > 272K tokens
    },
    cache: {
      write_default: 5.00,        # $5/MTok cache write (≤ 272K)
      write_over_200k: 10.00,     # $10/MTok cache write (> 272K)
      read_default: 0.50,         # $0.50/MTok cache read (≤ 272K)
      read_over_200k: 1.00        # $1.00/MTok cache read (> 272K)
    }
  },

  "gpt-5.4" => {
    input: {
      default: 2.50,              # $2.50/MTok for prompts ≤ 272K tokens
      over_200k: 5.00             # $5/MTok for prompts > 272K tokens
    },
    output: {
      default: 15.00,             # $15/MTok for prompts ≤ 272K tokens
      over_200k: 22.50           # $22.50/MTok for prompts > 272K tokens
    },
    cache: {
      write_default: 2.50,        # $2.50/MTok cache write (≤ 272K)
      write_over_200k: 5.00,      # $5/MTok cache write (> 272K)
      read_default: 0.25,         # $0.25/MTok cache read (≤ 272K)
      read_over_200k: 0.50        # $0.50/MTok cache read (> 272K)
    }
  },

  # GPT-5.4 flat-rate models (no breakpoint, single rate regardless of context)
  "gpt-5.4-mini" => {
    input: {
      default: 0.75,              # $0.75/MTok
      over_200k: 0.75
    },
    output: {
      default: 4.50,              # $4.50/MTok
      over_200k: 4.50
    },
    cache: {
      write: 0.75,                # $0.75/MTok cache write
      read: 0.075                 # $0.075/MTok cache read (10% of input)
    }
  },

  "gpt-5.4-nano" => {
    input: {
      default: 0.20,              # $0.20/MTok
      over_200k: 0.20
    },
    output: {
      default: 1.25,              # $1.25/MTok
      over_200k: 1.25
    },
    cache: {
      write: 0.20,                # $0.20/MTok cache write
      read: 0.02                  # $0.02/MTok cache read (10% of input)
    }
  },

  # O-series reasoning models — flat-rate (200K context window)
  # Source: https://openai.com/api/pricing/
  "o3" => {
    input: {
      default: 2.00,              # $2/MTok
      over_200k: 2.00             # flat rate
    },
    output: {
      default: 8.00,              # $8/MTok
      over_200k: 8.00
    },
    cache: {
      write: 2.00,                # $2/MTok cache write (same as input)
      read: 0.50                  # $0.50/MTok cache read (25% of input)
    }
  },

  "o4-mini" => {
    input: {
      default: 1.10,              # $1.10/MTok
      over_200k: 1.10             # flat rate
    },
    output: {
      default: 4.40,              # $4.40/MTok
      over_200k: 4.40
    },
    cache: {
      write: 1.10,                # $1.10/MTok cache write (same as input)
      read: 0.275                 # $0.275/MTok cache read (25% of input)
    }
  },

  # GLM (Zhipu / Z.ai) — USD per 1M tokens.
  # Source: https://docs.z.ai/guides/overview/pricing (Z.ai international).
  # Pricing policy: we always bill at the Z.ai international flat rate,
  # regardless of which endpoint (mainland bigmodel.cn vs intl z.ai) the
  # user configured. Rationale:
  #   1. Mainland GLM uses tiered pricing (≤32K / >32K / >128K) where the
  #      >32K tier is hit by the vast majority of real requests, and is
  #      actually a few RMB cheaper than Z.ai's flat rate — displaying the
  #      (slightly higher) Z.ai rate gives users a "displayed ≤ actual"
  #      experience which is psychologically safer than the reverse.
  #   2. Single flat rate keeps the table shape consistent with every
  #      other provider here (no special-case tier logic for just GLM).
  # Cache-write: same convention as DeepSeek/Kimi — OpenAI-compatible
  # endpoints don't charge separately for cache writes (Z.ai's page lists
  # "Cached Input Storage: Limited-time Free"), so bill writes at the
  # regular input miss rate for safe "displayed ≤ actual" behaviour.
  "glm-5.1" => {
    input:  { default: 1.40, over_200k: 1.40 },
    output: { default: 4.40, over_200k: 4.40 },
    cache:  { write: 1.40, read: 0.26 }
  },

  "glm-5" => {
    input:  { default: 1.00, over_200k: 1.00 },
    output: { default: 3.20, over_200k: 3.20 },
    cache:  { write: 1.00, read: 0.20 }
  },

  "glm-5-turbo" => {
    input:  { default: 1.20, over_200k: 1.20 },
    output: { default: 4.00, over_200k: 4.00 },
    cache:  { write: 1.20, read: 0.24 }
  },

  # GLM-5V-Turbo is the multimodal sibling of GLM-5-Turbo (vision capable,
  # see providers.rb model_capabilities override). Same input/output rate
  # as 5-Turbo per Z.ai's Vision Models table.
  "glm-5v-turbo" => {
    input:  { default: 1.20, over_200k: 1.20 },
    output: { default: 4.00, over_200k: 4.00 },
    cache:  { write: 1.20, read: 0.24 }
  },

  "glm-4.7" => {
    input:  { default: 0.60, over_200k: 0.60 },
    output: { default: 2.20, over_200k: 2.20 },
    cache:  { write: 0.60, read: 0.11 }
  },

  # MiniMax — USD per 1M tokens.
  # Source: https://platform.minimaxi.com (Pay-as-You-Go).
  # MiniMax pricing is identical across mainland (.com) and international
  # (.io) endpoints, verified by the team. Same cache-write convention as
  # DeepSeek/Kimi/GLM: bill writes at the input miss rate (OpenAI-compatible
  # usage responses from MiniMax don't reliably carry a separate
  # cache_creation_input_tokens field, so a distinct write rate would be
  # dead code in practice).
  # Note: providers.rb uses the capitalised "MiniMax-M2.x" model id, but
  # the pricing table keys are lowercased to stay consistent with the
  # rest of this file; normalize_model_name() lowercases incoming model
  # names before lookup.
  "minimax-m2.5" => {
    input:  { default: 0.30, over_200k: 0.30 },
    output: { default: 1.20, over_200k: 1.20 },
    cache:  { write: 0.30, read: 0.03 }
  },

  # M3 (released 2026-06-01) is MiniMax's multimodal flagship. Official
  # pricing is tiered by context length (≤512K vs 512K–1M); per the
  # project's "displayed ≤ actual" convention we record only the lowest
  # (≤512K) tier as a flat rate — the global TIERED_PRICING_THRESHOLD is
  # 200K, so applying the 512K–1M rate to the 200K–512K band would over-
  # charge. Listed at original (non-promotional) prices: input $0.60,
  # output $2.40, cache read $0.12 per 1M tokens.
  "minimax-m3" => {
    input:  { default: 0.60, over_200k: 0.60 },
    output: { default: 2.40, over_200k: 2.40 },
    cache:  { write: 0.60, read: 0.12 }
  },

  "minimax-m2.7" => {
    input:  { default: 0.30, over_200k: 0.30 },
    output: { default: 1.20, over_200k: 1.20 },
    cache:  { write: 0.30, read: 0.06 }
  },

  # Qwen (Alibaba DashScope) - USD per 1M tokens, international (Singapore) list price.
  # Source: Alibaba Cloud Model Studio international console per-model pages.
  #
  # Pricing convention:
  #   - These rates are used for user-facing cost ESTIMATION, so we always use
  #     the standard LIST price and intentionally ignore any limited-time promo
  #     discounts. A promo lowers the user's actual bill, never raises it, so
  #     estimating at list price keeps the estimate a safe upper bound and avoids
  #     churn whenever a promo starts or ends.
  #   - We record the model's LOWEST context tier (e.g. input<=256k / <=128k) as a
  #     flat rate, since the global TIERED_PRICING_THRESHOLD is 200K and does not
  #     match Qwen's per-model breakpoints.
  #   - cache.write = official explicit-cache-create price.
  #   - cache.read  = official explicit-cache-hit price.
  #   - When a model has NO published explicit-cache price (e.g. qwen3.6-27b,
  #     qwen-plus-latest), cache.write/read fall back to the input rate.
  # qwen3.7-max: NOT tiered (single flat tier per Alibaba's definition).
  #   List price: input 2.5, output 7.5, explicit write 3.125, explicit read 0.25.
  "qwen3.7-max" => {
    input:  { default: 2.5, over_200k: 2.5 },
    output: { default: 7.5, over_200k: 7.5 },
    cache:  { write: 3.125, read: 0.25 }
  },

  # qwen3.7-plus: list price (<=256k tier):
  #   input 0.4, output 1.6, explicit write 0.5, explicit read 0.04.
  "qwen3.7-plus" => {
    input:  { default: 0.4, over_200k: 0.4 },
    output: { default: 1.6, over_200k: 1.6 },
    cache:  { write: 0.5, read: 0.04 }
  },

  # qwen3.6-plus: list price (<=256k tier). Official explicit-cache prices.
  #   input 0.50, output 3.00, explicit write 0.625, explicit read 0.05
  "qwen3.6-plus" => {
    input:  { default: 0.50, over_200k: 0.50 },
    output: { default: 3.00, over_200k: 3.00 },
    cache:  { write: 0.625, read: 0.05 }
  },

  # qwen3.6-max (qwen3.6-max-preview): list price (<=128k tier).
  #   input 1.30, output 7.80, explicit write 1.625, explicit read 0.13
  "qwen3.6-max" => {
    input:  { default: 1.30, over_200k: 1.30 },
    output: { default: 7.80, over_200k: 7.80 },
    cache:  { write: 1.625, read: 0.13 }
  },

  # qwen3.6-27b: list price, no explicit-cache pricing published.
  #   Cache write/read fall back to the input rate (no cache discount).
  "qwen3.6-27b" => {
    input:  { default: 0.60, over_200k: 0.60 },
    output: { default: 3.60, over_200k: 3.60 },
    cache:  { write: 0.60, read: 0.60 }
  },

  # qwen3.6-flash: list price (<=256k tier).
  #   input 0.25, output 1.50, explicit write 0.3125, explicit read 0.025
  "qwen3.6-flash" => {
    input:  { default: 0.25, over_200k: 0.25 },
    output: { default: 1.50, over_200k: 1.50 },
    cache:  { write: 0.3125, read: 0.025 }
  },

  # qwen-plus-latest: list price (<=256k tier), no explicit-cache pricing.
  #   Cache write/read fall back to the input rate (no cache discount).
  "qwen-plus-latest" => {
    input:  { default: 0.40, over_200k: 0.40 },
    output: { default: 1.20, over_200k: 1.20 },
    cache:  { write: 0.40, read: 0.40 }
  },

  # qwen3-vl-plus: replaces the retiring qwen-vl-plus. List price
  #   (128k<input<=256k tier). input 0.60, output 4.80,
  #   explicit write 0.75, explicit read 0.06.
  "qwen3-vl-plus" => {
    input:  { default: 0.60, over_200k: 0.60 },
    output: { default: 4.80, over_200k: 4.80 },
    cache:  { write: 0.75, read: 0.06 }
  },

}.freeze
TIERED_PRICING_THRESHOLD =

Threshold for tiered pricing (200K tokens). NOTE: OpenAI GPT-5.5/GPT-5.4 use a 272K breakpoint, not 200K, so costs for prompts between 200K and 272K will be slightly over-estimated.

200_000

Class Method Summary collapse

Class Method Details

.calculate_cache_cost(pricing:, cache_write_tokens:, cache_read_tokens:, over_threshold:) ⇒ Object

Calculate cache-related costs



756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
# File 'lib/clacky/utils/model_pricing.rb', line 756

# Sum the cache-write and cache-read charges for one request.
#
# A pricing entry's :cache hash comes in two shapes:
#   - flat:   { write:, read: }                       — one rate for all prompt sizes
#   - tiered: { write_default:, write_over_200k:,
#               read_default:,  read_over_200k: }     — rate depends on the tier
# A flat key, when present, always wins over the tiered keys.
#
# @param pricing [Hash] pricing entry containing a :cache hash
# @param cache_write_tokens [Integer] tokens written to the cache
# @param cache_read_tokens [Integer] tokens read from the cache
# @param over_threshold [Boolean] whether the request is in the over-200K tier
# @return [Float] combined cache cost in USD (0.0 when both counts are zero)
def calculate_cache_cost(pricing:, cache_write_tokens:, cache_read_tokens:, over_threshold:)
  tier_suffix = over_threshold ? :over_200k : :default
  charges = [[:write, cache_write_tokens], [:read, cache_read_tokens]]

  charges.reduce(0.0) do |subtotal, (kind, token_count)|
    # Nothing to bill for this kind; the rate table is not even consulted.
    next subtotal unless token_count > 0

    cache_rates = pricing[:cache]
    per_mtok = if cache_rates.key?(kind)
                 cache_rates[kind]
               else
                 cache_rates[:"#{kind}_#{tier_suffix}"]
               end

    subtotal + (token_count / 1_000_000.0) * per_mtok
  end
end

.calculate_cost(model:, usage:) ⇒ Hash

Calculate cost for the given model and usage

Parameters:

  • model (String)

    Model identifier

  • usage (Hash)

    Usage statistics containing:

    • prompt_tokens: number of input tokens

    • completion_tokens: number of output tokens

    • cache_creation_input_tokens: tokens written to cache (optional)

    • cache_read_input_tokens: tokens read from cache (optional)

Returns:

  • (Hash)

    Hash containing:

    • cost: Cost in USD (Float) or nil if model pricing is unknown

    • source: Cost source (:price) or nil if unknown (Symbol or nil)



545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
# File 'lib/clacky/utils/model_pricing.rb', line 545

# Calculate the USD cost of one request for the given model and usage.
#
# @param model [String] model identifier
# @param usage [Hash] usage statistics with optional keys
#   :prompt_tokens, :completion_tokens, :cache_creation_input_tokens and
#   :cache_read_input_tokens; missing or nil values count as 0
# @return [Hash] { cost: Float or nil, source: Symbol or nil }; both values
#   are nil when no pricing entry matches the model
def calculate_cost(model:, usage:)
  pricing_result = get_pricing_with_source(model)
  pricing = pricing_result[:pricing]

  # If no pricing table matches this model, return nil cost.
  # Unknown models should display as N/A, never fall back to guesses.
  return { cost: nil, source: nil } unless pricing

  prompt_tokens = usage[:prompt_tokens] || 0
  completion_tokens = usage[:completion_tokens] || 0
  cache_write_tokens = usage[:cache_creation_input_tokens] || 0
  cache_read_tokens = usage[:cache_read_input_tokens] || 0

  # Determine whether we're in the over_200k tier.
  # prompt_tokens is expected to include cache_read_tokens but NOT
  # cache_write_tokens, so the writes are added on top for the tier check.
  over_threshold = (prompt_tokens + cache_write_tokens) > TIERED_PRICING_THRESHOLD
  tier = over_threshold ? :over_200k : :default

  # Regular (non-cached) input: cache reads are billed separately at the
  # cache-read rate, so they are subtracted here. Clamp at zero: if a
  # payload reports cache reads that are not included in prompt_tokens,
  # the difference would be negative and would silently discount the total.
  regular_input_tokens = [prompt_tokens - cache_read_tokens, 0].max
  input_cost = (regular_input_tokens / 1_000_000.0) * pricing[:input][tier]

  output_cost = (completion_tokens / 1_000_000.0) * pricing[:output][tier]

  # Cache writes and reads carry their own rates.
  cache_cost = calculate_cache_cost(
    pricing: pricing,
    cache_write_tokens: cache_write_tokens,
    cache_read_tokens: cache_read_tokens,
    over_threshold: over_threshold
  )

  {
    cost: input_cost + output_cost + cache_cost,
    source: pricing_result[:source]
  }
end

.get_pricing(model) ⇒ Hash

Get pricing for a specific model. Returns nil when the model is not found (there is no default-pricing fallback).

Parameters:

  • model (String)

    Model identifier

Returns:

  • (Hash)

    Pricing structure for the model



595
596
597
# File 'lib/clacky/utils/model_pricing.rb', line 595

# Pricing structure for +model+, without the source tag.
#
# @param model [String] model identifier
# @return [Hash, nil] whatever get_pricing_with_source reports under :pricing
def get_pricing(model)
  lookup = get_pricing_with_source(model)
  lookup[:pricing]
end

.get_pricing_with_source(model) ⇒ Hash

Get pricing with source information

Parameters:

  • model (String)

    Model identifier

Returns:

  • (Hash)

    Hash containing:

    • pricing: Pricing structure or nil if model is unknown

    • source: :price (matched) or nil (unknown)



605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
# File 'lib/clacky/utils/model_pricing.rb', line 605

# Look up pricing for +model+ and tag the result with where it came from.
#
# @param model [String] model identifier
# @return [Hash] { pricing: <PRICING_TABLE entry>, source: :price } when the
#   name normalizes to a table key, otherwise { pricing: nil, source: nil }
def get_pricing_with_source(model)
  # Map aliases / vendor-prefixed ids onto a canonical table key.
  table_key = normalize_model_name(model)

  # No matching pricing table entry — cost is unknown.
  return { pricing: nil, source: nil } unless table_key

  { pricing: PRICING_TABLE[table_key], source: :price }
end

.normalize_model_name(model) ⇒ Object

Normalize model name to match pricing table keys. Returns the canonical key on match, or nil when no pricing is available.



624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
# File 'lib/clacky/utils/model_pricing.rb', line 624

# Normalize a model name to its PRICING_TABLE key.
#
# The name is lowercased and stripped, then tried as a direct table key,
# then matched against per-provider alias patterns. Claude and DeepSeek V4
# patterns are intentionally unanchored so vendor prefixes/suffixes still
# resolve; every other pattern is anchored with \A / \z so it has to start
# at the beginning of the name and finish at its end. (The previous ^ / $
# anchors are per-line in Ruby, which let a name with an embedded newline
# such as "junk\nglm-5" borrow another model's rate.)
#
# @param model [String, nil] model identifier as reported by the provider
# @return [String, nil] canonical pricing-table key, or nil when no pricing
#   is available for the model
def normalize_model_name(model)
  return nil if model.nil? || model.empty?

  # Everything below works on the lowercased name, so the patterns do not
  # need a case-insensitive flag.
  model = model.downcase.strip

  # Direct match
  return model if PRICING_TABLE.key?(model)

  # Check for Claude model variations
  # Support both dot and dash separators (e.g., "4.5", "4-5", "4-6")
  # Also handles Bedrock cross-region prefixes (e.g. "jp.anthropic.claude-sonnet-4-6")
  case model
  when /claude.*opus.*4[.-]?[5-9]/
    "claude-opus-4.5"
  when /claude.*sonnet.*4[.-]?[5-9]/
    "claude-sonnet-4.5"
  when /claude.*haiku.*4[.-]?[5-9]/
    "claude-haiku-4.5"
  when /claude-3-5-sonnet-20241022/
    "claude-3-5-sonnet-20241022"
  when /claude-3-5-sonnet-20240620/
    "claude-3-5-sonnet-20240620"
  when /claude-3-5-haiku-20241022/
    "claude-3-5-haiku-20241022"
  # The loose form also covers the literal "deepseek-v4-pro" / "-flash" ids.
  when /deepseek.*v4.*pro/
    "deepseek-v4-pro"
  when /deepseek.*v4.*flash/
    "deepseek-v4-flash"
  # Legacy aliases: deepseek-chat and deepseek-reasoner are being
  # deprecated on 2026-07-24 and map to deepseek-v4-flash's
  # non-thinking / thinking modes respectively. Bill at flash rates.
  when /\Adeepseek-chat\z/, /\Adeepseek-reasoner\z/
    "deepseek-v4-flash"
  # Xiaomi MiMo — strict anchored match per registered model id in
  # providers.rb (currently mimo-v2.5-pro / mimo-v2-pro / mimo-v2-omni).
  # mimo-v2.5 / mimo-v2-flash are also priced ahead of provider-side
  # registration. Per Xiaomi's 2026-06 schedule, mimo-v2-pro/omni are
  # transparently routed to V2.5 — keys are listed independently so
  # both old and new ids resolve to the right rate.
  when /\Amimo-v2\.?5-pro\z/
    "mimo-v2.5-pro"
  when /\Amimo-v2\.?5\z/
    "mimo-v2.5"
  when /\Amimo-v2-pro\z/
    "mimo-v2-pro"
  when /\Amimo-v2-omni\z/
    "mimo-v2-omni"
  when /\Amimo-v2-flash\z/
    "mimo-v2-flash"
  # Kimi K2.5 / K2.6 — strict match only. K2 text-only models
  # (kimi-k2-0905-preview, kimi-k2-thinking, etc.) are not yet
  # registered in providers.rb and will be added in a follow-up
  # issue together with their model_capabilities overrides.
  when /\Akimi-k2\.?5\z/
    "kimi-k2.5"
  when /\Akimi-k2\.?6\z/
    "kimi-k2.6"
  # GLM (Zhipu / Z.ai) — the five models registered in providers.rb.
  # GLM-5V-Turbo is the vision variant; all five share the same Z.ai
  # international flat-rate pricing regardless of which endpoint
  # (mainland bigmodel.cn vs intl z.ai) the user configured.
  # Strict anchored match so unrelated strings like "glm-5-x-foo"
  # don't silently borrow a nearby model's rate.
  when /\Aglm-5\.1\z/
    "glm-5.1"
  when /\Aglm-5v-turbo\z/
    "glm-5v-turbo"
  when /\Aglm-5-turbo\z/
    "glm-5-turbo"
  when /\Aglm-5\z/
    "glm-5"
  when /\Aglm-4\.7\z/
    "glm-4.7"
  # MiniMax — model ids in providers.rb use capitalised "MiniMax-M2.x";
  # the name was lowercased above, so it maps onto the lowercased table key.
  when /\Aminimax-m3\z/
    "minimax-m3"
  when /\Aminimax-m2\.5\z/
    "minimax-m2.5"
  when /\Aminimax-m2\.7\z/
    "minimax-m2.7"

  # Qwen (Alibaba DashScope) — strict anchored match per registered
  # model id in providers.rb. qwen3.7-* is the latest flagship line;
  # qwen3.6-* are the previous generation; qwen-plus-latest is the
  # rolling alias for the latest Qwen-Plus release; qwen3-vl-plus is
  # the multimodal SKU (replaces the retired qwen-vl-plus/max).
  when /\Aqwen3\.7-max\z/
    "qwen3.7-max"
  when /\Aqwen3\.7-plus\z/
    "qwen3.7-plus"
  when /\Aqwen3\.6-plus\z/
    "qwen3.6-plus"
  when /\Aqwen3\.6-max\z/
    "qwen3.6-max"
  when /\Aqwen3\.6-27b\z/
    "qwen3.6-27b"
  when /\Aqwen3\.6-flash\z/
    "qwen3.6-flash"
  when /\Aqwen-plus-latest\z/
    "qwen-plus-latest"
  when /\Aqwen3-vl-plus\z/
    "qwen3-vl-plus"

  # Google Gemini 3 series. Match the platform aliases (or-gemini-*)
  # and the bare upstream ids returned by Vertex.
  when /\Aor-gemini-3-1-pro\z/, /\Agemini-3\.1-pro(-preview)?\z/
    "gemini-3.1-pro"
  when /\Aor-gemini-3-5-flash\z/, /\Agemini-3\.5-flash\z/, /\Agemini-3-flash(-preview)?\z/
    "gemini-3-flash"

  # OpenAI GPT-5.x models — match various dashed/dotted/compact forms
  # (e.g. "gpt-5.5", "gpt-5-5", "gpt5.5", "gpt55")
  when /\Agpt-?5[.-]?5\z/
    "gpt-5.5"
  # mini / nano are tested before the bare gpt-5.4 id.
  when /\Agpt-?5\.?4[^.]*mini\z/, /\Agpt-?5\.?4[.-]?mini\z/
    "gpt-5.4-mini"
  when /\Agpt-?5\.?4[^.]*nano\z/, /\Agpt-?5\.?4[.-]?nano\z/
    "gpt-5.4-nano"
  when /\Agpt-?5[.-]?4\z/
    "gpt-5.4"
  # O-series reasoning models
  when /\Ao4[.-]?mini\z/
    "o4-mini"
  when /\Ao3\z/
    "o3"
  else
    nil  # No pricing available for this model — cost will show as N/A
  end
end