Module: Clacky::ModelPricing
- Defined in:
- lib/clacky/utils/model_pricing.rb
Overview
Module for handling AI model pricing Supports different pricing tiers and prompt caching
Constant Summary collapse
- PRICING_TABLE =
Pricing per 1M tokens (MTok) in USD All pricing is based on official API documentation
{ # Claude 4.5 models - tiered pricing based on prompt length "claude-opus-4.5" => { input: { default: 5.00, # $5/MTok for prompts ≤ 200K tokens over_200k: 5.00 # same for all tiers }, output: { default: 25.00, # $25/MTok for prompts ≤ 200K tokens over_200k: 25.00 # same for all tiers }, cache: { write: 6.25, # $6.25/MTok cache write read: 0.50 # $0.50/MTok cache read } }, "claude-sonnet-4.5" => { input: { default: 3.00, # $3/MTok for prompts ≤ 200K tokens over_200k: 6.00 # $6/MTok for prompts > 200K tokens }, output: { default: 15.00, # $15/MTok for prompts ≤ 200K tokens over_200k: 22.50 # $22.50/MTok for prompts > 200K tokens }, cache: { write_default: 3.75, # $3.75/MTok cache write (≤ 200K) write_over_200k: 7.50, # $7.50/MTok cache write (> 200K) read_default: 0.30, # $0.30/MTok cache read (≤ 200K) read_over_200k: 0.60 # $0.60/MTok cache read (> 200K) } }, "claude-haiku-4.5" => { input: { default: 1.00, # $1/MTok over_200k: 1.00 # same for all tiers }, output: { default: 5.00, # $5/MTok over_200k: 5.00 # same for all tiers }, cache: { write: 1.25, # $1.25/MTok cache write read: 0.10 # $0.10/MTok cache read } }, # Claude 3.5 models (for backwards compatibility) "claude-3-5-sonnet-20241022" => { input: { default: 3.00, over_200k: 6.00 }, output: { default: 15.00, over_200k: 22.50 }, cache: { write_default: 3.75, write_over_200k: 7.50, read_default: 0.30, read_over_200k: 0.60 } }, "claude-3-5-sonnet-20240620" => { input: { default: 3.00, over_200k: 6.00 }, output: { default: 15.00, over_200k: 22.50 }, cache: { write_default: 3.75, write_over_200k: 7.50, read_default: 0.30, read_over_200k: 0.60 } }, "claude-3-5-haiku-20241022" => { input: { default: 1.00, over_200k: 1.00 }, output: { default: 5.00, over_200k: 5.00 }, cache: { write: 1.25, read: 0.10 } }, # DeepSeek V4 models # Source: https://api-docs.deepseek.com/quick_start/pricing (USD / 1M tokens) # DeepSeek billing model: # - "cache miss input" = regular prompt_tokens rate # - "cache hit input" = cache_read rate (DeepSeek has no separate cache-write charge) # - No tiered pricing (single rate regardless of context length) "deepseek-v4-flash" => { input: { default: 0.14, # $0.14/MTok cache miss over_200k: 0.14 # no tiered pricing }, output: { default: 0.28, # $0.28/MTok over_200k: 0.28 }, cache: { write: 0.14, # DeepSeek doesn't charge extra for writes; bill at miss rate read: 0.0028 # $0.0028/MTok cache hit } }, "deepseek-v4-pro" => { input: { default: 1.74, # $1.74/MTok cache miss over_200k: 1.74 }, output: { default: 3.48, # $3.48/MTok over_200k: 3.48 }, cache: { write: 1.74, # no separate write charge; bill at miss rate read: 0.0145 # $0.0145/MTok cache hit } }, # Kimi K2.5 / K2.6 multimodal models # Source: https://platform.moonshot.cn (USD / 1M tokens) # Kimi billing model (same shape as DeepSeek): # - "cache miss input" = regular prompt_tokens rate # - "cache hit input" = cache_read rate (no separate cache-write charge) # - No tiered pricing (single rate regardless of context length) "kimi-k2.5" => { input: { default: 0.60, # $0.60/MTok cache miss over_200k: 0.60 # no tiered pricing }, output: { default: 3.00, # $3.00/MTok over_200k: 3.00 }, cache: { write: 0.60, # Kimi doesn't charge extra for writes; bill at miss rate read: 0.10 # $0.10/MTok cache hit } }, "kimi-k2.6" => { input: { default: 0.95, # $0.95/MTok cache miss over_200k: 0.95 }, output: { default: 4.00, # $4.00/MTok over_200k: 4.00 }, cache: { write: 0.95, # no separate write charge; bill at miss rate read: 0.16 # $0.16/MTok cache hit } }, # OpenAI GPT-5.5 / GPT-5.4 — breakpoint at 272K input tokens # Source: https://openai.com/api/pricing/ (USD / 1M tokens) # Note: OpenAI's actual tiered-pricing threshold is 272K, not the # global 200K below. Prompts between 200K–272K will slightly # over-estimate costs until a per-model threshold is implemented. "gpt-5.5" => { input: { default: 5.00, # $5/MTok for prompts ≤ 272K tokens over_200k: 10.00 # $10/MTok for prompts > 272K tokens }, output: { default: 30.00, # $30/MTok for prompts ≤ 272K tokens over_200k: 45.00 # $45/MTok for prompts > 272K tokens }, cache: { write_default: 5.00, # $5/MTok cache write (≤ 272K) write_over_200k: 10.00, # $10/MTok cache write (> 272K) read_default: 0.50, # $0.50/MTok cache read (≤ 272K) read_over_200k: 1.00 # $1.00/MTok cache read (> 272K) } }, "gpt-5.4" => { input: { default: 2.50, # $2.50/MTok for prompts ≤ 272K tokens over_200k: 5.00 # $5/MTok for prompts > 272K tokens }, output: { default: 15.00, # $15/MTok for prompts ≤ 272K tokens over_200k: 22.50 # $22.50/MTok for prompts > 272K tokens }, cache: { write_default: 2.50, # $2.50/MTok cache write (≤ 272K) write_over_200k: 5.00, # $5/MTok cache write (> 272K) read_default: 0.25, # $0.25/MTok cache read (≤ 272K) read_over_200k: 0.50 # $0.50/MTok cache read (> 272K) } }, # GPT-5.4 flat-rate models (no breakpoint, single rate regardless of context) "gpt-5.4-mini" => { input: { default: 0.75, # $0.75/MTok over_200k: 0.75 }, output: { default: 4.50, # $4.50/MTok over_200k: 4.50 }, cache: { write: 0.75, # $0.75/MTok cache write read: 0.075 # $0.075/MTok cache read (10% of input) } }, "gpt-5.4-nano" => { input: { default: 0.20, # $0.20/MTok over_200k: 0.20 }, output: { default: 1.25, # $1.25/MTok over_200k: 1.25 }, cache: { write: 0.20, # $0.20/MTok cache write read: 0.02 # $0.02/MTok cache read (10% of input) } }, # O-series reasoning models — flat-rate (200K context window) # Source: https://openai.com/api/pricing/ "o3" => { input: { default: 2.00, # $2/MTok over_200k: 2.00 # flat rate }, output: { default: 8.00, # $8/MTok over_200k: 8.00 }, cache: { write: 2.00, # $2/MTok cache write (same as input) read: 0.50 # $0.50/MTok cache read (25% of input) } }, "o4-mini" => { input: { default: 1.10, # $1.10/MTok over_200k: 1.10 # flat rate }, output: { default: 4.40, # $4.40/MTok over_200k: 4.40 }, cache: { write: 1.10, # $1.10/MTok cache write (same as input) read: 0.275 # $0.275/MTok cache read (25% of input) } }, # GLM (Zhipu / Z.ai) — USD per 1M tokens. # Source: https://docs.z.ai/guides/overview/pricing (Z.ai international). # Pricing policy: we always bill at the Z.ai international flat rate, # regardless of which endpoint (mainland bigmodel.cn vs intl z.ai) the # user configured. Rationale: # 1. Mainland GLM uses tiered pricing (≤32K / >32K / >128K) where the # >32K tier is hit by the vast majority of real requests, and is # actually a few RMB cheaper than Z.ai's flat rate — displaying the # (slightly higher) Z.ai rate gives users a "displayed ≤ actual" # experience which is psychologically safer than the reverse. # 2. Single flat rate keeps the table shape consistent with every # other provider here (no special-case tier logic for just GLM). # Cache-write: same convention as DeepSeek/Kimi — OpenAI-compatible # endpoints don't charge separately for cache writes (Z.ai's page lists # "Cached Input Storage: Limited-time Free"), so bill writes at the # regular input miss rate for safe "displayed ≤ actual" behaviour. "glm-5.1" => { input: { default: 1.40, over_200k: 1.40 }, output: { default: 4.40, over_200k: 4.40 }, cache: { write: 1.40, read: 0.26 } }, "glm-5" => { input: { default: 1.00, over_200k: 1.00 }, output: { default: 3.20, over_200k: 3.20 }, cache: { write: 1.00, read: 0.20 } }, "glm-5-turbo" => { input: { default: 1.20, over_200k: 1.20 }, output: { default: 4.00, over_200k: 4.00 }, cache: { write: 1.20, read: 0.24 } }, # GLM-5V-Turbo is the multimodal sibling of GLM-5-Turbo (vision capable, # see providers.rb model_capabilities override). Same input/output rate # as 5-Turbo per Z.ai's Vision Models table. "glm-5v-turbo" => { input: { default: 1.20, over_200k: 1.20 }, output: { default: 4.00, over_200k: 4.00 }, cache: { write: 1.20, read: 0.24 } }, "glm-4.7" => { input: { default: 0.60, over_200k: 0.60 }, output: { default: 2.20, over_200k: 2.20 }, cache: { write: 0.60, read: 0.11 } }, # MiniMax — USD per 1M tokens. # Source: https://platform.minimaxi.com (Pay-as-You-Go). # MiniMax pricing is identical across mainland (.com) and international # (.io) endpoints, verified by the team. Same cache-write convention as # DeepSeek/Kimi/GLM: bill writes at the input miss rate (OpenAI-compatible # usage responses from MiniMax don't reliably carry a separate # cache_creation_input_tokens field, so a distinct write rate would be # dead code in practice). # Note: providers.rb uses the capitalised "MiniMax-M2.x" model id, but # the pricing table keys are lowercased to stay consistent with the # rest of this file; normalize_model_name() lowercases incoming model # names before lookup. "minimax-m2.5" => { input: { default: 0.30, over_200k: 0.30 }, output: { default: 1.20, over_200k: 1.20 }, cache: { write: 0.30, read: 0.03 } }, "minimax-m2.7" => { input: { default: 0.30, over_200k: 0.30 }, output: { default: 1.20, over_200k: 1.20 }, cache: { write: 0.30, read: 0.06 } }, # Qwen (Alibaba DashScope) — USD per 1M tokens, Singapore region list price. # Source: Alibaba Cloud Model Studio international pricing. # Cache convention (mirrors DeepSeek/Kimi/GLM "displayed ≤ actual"): # - DashScope has two cache modes; implicit is auto-on, explicit is opt-in. # Implicit: write @ 100% input, read @ 20% input (no setup, no guarantee) # Explicit: write @ 125% input, read @ 10% input (cache_control marker) # - We bill writes at the regular input rate (matches implicit, and avoids # surprising users with the explicit 25% surcharge). # - We bill reads at 20% (implicit rate) — the conservative side; users on # explicit caching will see real bills slightly *lower* than displayed. "qwen3.6-plus" => { input: { default: 0.40, over_200k: 0.40 }, output: { default: 2.40, over_200k: 2.40 }, cache: { write: 0.40, read: 0.08 } }, "qwen3.6-max" => { input: { default: 1.20, over_200k: 1.20 }, output: { default: 6.00, over_200k: 6.00 }, cache: { write: 1.20, read: 0.24 } }, "qwen3.6-27b" => { input: { default: 0.20, over_200k: 0.20 }, output: { default: 0.80, over_200k: 0.80 }, cache: { write: 0.20, read: 0.04 } }, "qwen3.6-flash" => { input: { default: 0.15, over_200k: 0.15 }, output: { default: 0.90, over_200k: 0.90 }, cache: { write: 0.15, read: 0.03 } }, "qwen-plus-latest" => { input: { default: 0.40, over_200k: 0.40 }, output: { default: 1.20, over_200k: 1.20 }, cache: { write: 0.40, read: 0.08 } }, "qwen-vl-plus" => { input: { default: 0.14, over_200k: 0.14 }, output: { default: 0.41, over_200k: 0.41 }, cache: { write: 0.14, read: 0.028 } }, "qwen-vl-max" => { input: { default: 0.52, over_200k: 0.52 }, output: { default: 2.08, over_200k: 2.08 }, cache: { write: 0.52, read: 0.104 } }, }.freeze
- TIERED_PRICING_THRESHOLD =
Threshold for tiered pricing (200K tokens) NOTE: OpenAI GPT-5.5/GPT-5.4 use a 272K breakpoint, not 200K. Costs for prompts between 200K–272K will be slightly over-estimated.
200_000
Class Method Summary collapse
-
.calculate_cache_cost(pricing:, cache_write_tokens:, cache_read_tokens:, over_threshold:) ⇒ Object
Calculate cache-related costs.
-
.calculate_cost(model:, usage:) ⇒ Hash
Calculate cost for the given model and usage.
-
.get_pricing(model) ⇒ Hash
Get pricing for a specific model Falls back to default pricing if model not found.
-
.get_pricing_with_source(model) ⇒ Hash
Get pricing with source information.
-
.normalize_model_name(model) ⇒ Object
Normalize model name to match pricing table keys.
Class Method Details
.calculate_cache_cost(pricing:, cache_write_tokens:, cache_read_tokens:, over_threshold:) ⇒ Object
Calculate cache-related costs
609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 |
# File 'lib/clacky/utils/model_pricing.rb', line 609 def calculate_cache_cost(pricing:, cache_write_tokens:, cache_read_tokens:, over_threshold:) cache_cost = 0.0 # Cache write cost if cache_write_tokens > 0 write_rate = if pricing[:cache].key?(:write) # Simple pricing (Opus 4.5, Haiku 4.5) pricing[:cache][:write] elsif over_threshold # Tiered pricing (Sonnet 4.5) pricing[:cache][:write_over_200k] else pricing[:cache][:write_default] end cache_cost += (cache_write_tokens / 1_000_000.0) * write_rate end # Cache read cost if cache_read_tokens > 0 read_rate = if pricing[:cache].key?(:read) # Simple pricing (Opus 4.5, Haiku 4.5) pricing[:cache][:read] elsif over_threshold # Tiered pricing (Sonnet 4.5) pricing[:cache][:read_over_200k] else pricing[:cache][:read_default] end cache_cost += (cache_read_tokens / 1_000_000.0) * read_rate end cache_cost end |
.calculate_cost(model:, usage:) ⇒ Hash
Calculate cost for the given model and usage
426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 |
# File 'lib/clacky/utils/model_pricing.rb', line 426 def calculate_cost(model:, usage:) pricing_result = get_pricing_with_source(model) pricing = pricing_result[:pricing] source = pricing_result[:source] # If no pricing table matches this model, return nil cost. # Unknown models should display as N/A, never fall back to guesses. return { cost: nil, source: nil } unless pricing prompt_tokens = usage[:prompt_tokens] || 0 completion_tokens = usage[:completion_tokens] || 0 cache_write_tokens = usage[:cache_creation_input_tokens] || 0 cache_read_tokens = usage[:cache_read_input_tokens] || 0 # Determine if we're in the over_200k tier # Note: prompt_tokens includes cache_read_tokens but NOT cache_write_tokens # cache_write_tokens are additional tokens that were written to cache total_input_tokens = prompt_tokens + cache_write_tokens over_threshold = total_input_tokens > TIERED_PRICING_THRESHOLD # Calculate regular input cost (non-cached tokens) # prompt_tokens already includes cache_read_tokens, so we need to subtract them # cache_write_tokens are not part of prompt_tokens, so they're handled separately in cache_cost regular_input_tokens = prompt_tokens - cache_read_tokens input_rate = over_threshold ? pricing[:input][:over_200k] : pricing[:input][:default] input_cost = (regular_input_tokens / 1_000_000.0) * input_rate # Calculate output cost output_rate = over_threshold ? pricing[:output][:over_200k] : pricing[:output][:default] output_cost = (completion_tokens / 1_000_000.0) * output_rate # Calculate cache costs cache_cost = calculate_cache_cost( pricing: pricing, cache_write_tokens: cache_write_tokens, cache_read_tokens: cache_read_tokens, over_threshold: over_threshold ) { cost: input_cost + output_cost + cache_cost, source: source } end |
.get_pricing(model) ⇒ Hash
Get pricing for a specific model Falls back to default pricing if model not found
476 477 478 |
# File 'lib/clacky/utils/model_pricing.rb', line 476 def get_pricing(model) get_pricing_with_source(model)[:pricing] end |
.get_pricing_with_source(model) ⇒ Hash
Get pricing with source information
486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 |
# File 'lib/clacky/utils/model_pricing.rb', line 486 def get_pricing_with_source(model) # Normalize model name (remove version suffixes, handle variations) normalized_model = normalize_model_name(model) if normalized_model # Found specific pricing for this model { pricing: PRICING_TABLE[normalized_model], source: :price } else # No matching pricing table entry — cost is unknown { pricing: nil, source: nil } end end |
.normalize_model_name(model) ⇒ Object
Normalize model name to match pricing table keys. Returns the canonical key on match, or nil when no pricing is available.
505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 |
# File 'lib/clacky/utils/model_pricing.rb', line 505 def normalize_model_name(model) return nil if model.nil? || model.empty? model = model.downcase.strip # Direct match return model if PRICING_TABLE.key?(model) # Check for Claude model variations # Support both dot and dash separators (e.g., "4.5", "4-5", "4-6") # Also handles Bedrock cross-region prefixes (e.g. "jp.anthropic.claude-sonnet-4-6") case model when /claude.*opus.*4[.-]?[5-9]/i "claude-opus-4.5" when /claude.*sonnet.*4[.-]?[5-9]/i "claude-sonnet-4.5" when /claude.*haiku.*4[.-]?[5-9]/i "claude-haiku-4.5" when /claude-3-5-sonnet-20241022/i "claude-3-5-sonnet-20241022" when /claude-3-5-sonnet-20240620/i "claude-3-5-sonnet-20240620" when /claude-3-5-haiku-20241022/i "claude-3-5-haiku-20241022" when /deepseek-v4-pro/i, /deepseek.*v4.*pro/i "deepseek-v4-pro" when /deepseek-v4-flash/i, /deepseek.*v4.*flash/i "deepseek-v4-flash" # Legacy aliases: deepseek-chat and deepseek-reasoner are being # deprecated on 2026-07-24 and map to deepseek-v4-flash's # non-thinking / thinking modes respectively. Bill at flash rates. when /^deepseek-chat$/i, /^deepseek-reasoner$/i "deepseek-v4-flash" # Kimi K2.5 / K2.6 — strict match only. K2 text-only models # (kimi-k2-0905-preview, kimi-k2-thinking, etc.) are not yet # registered in providers.rb and will be added in a follow-up # issue together with their model_capabilities overrides. when /^kimi-k2\.?5$/i "kimi-k2.5" when /^kimi-k2\.?6$/i "kimi-k2.6" # GLM (Zhipu / Z.ai) — the five models registered in providers.rb. # GLM-5V-Turbo is the vision variant; all five share the same Z.ai # international flat-rate pricing regardless of which endpoint # (mainland bigmodel.cn vs intl z.ai) the user configured. # Strict anchored match so unrelated strings like "glm-5-x-foo" # don't silently borrow a nearby model's rate. when /^glm-5\.1$/i "glm-5.1" when /^glm-5v-turbo$/i "glm-5v-turbo" when /^glm-5-turbo$/i "glm-5-turbo" when /^glm-5$/i "glm-5" when /^glm-4\.7$/i "glm-4.7" # MiniMax — model ids in providers.rb use capitalised "MiniMax-M2.x" # but we match case-insensitively and map to the lowercased table key. when /^minimax-m2\.5$/i "minimax-m2.5" when /^minimax-m2\.7$/i "minimax-m2.7" # Qwen (Alibaba DashScope) — strict anchored match per registered # model id in providers.rb. qwen3.6-* are the new flagship line; # qwen-plus-latest is the rolling alias for the latest Qwen-Plus # release; qwen-vl-* are the multimodal SKUs. when /^qwen3\.6-plus$/i "qwen3.6-plus" when /^qwen3\.6-max$/i "qwen3.6-max" when /^qwen3\.6-27b$/i "qwen3.6-27b" when /^qwen3\.6-flash$/i "qwen3.6-flash" when /^qwen-plus-latest$/i "qwen-plus-latest" when /^qwen-vl-plus$/i "qwen-vl-plus" when /^qwen-vl-max$/i "qwen-vl-max" # OpenAI GPT-5.x models — match various dashed/dotted/compact forms # (e.g. "gpt-5.5", "gpt-5-5", "gpt5.5", "gpt55") when /^gpt-?5\.?5$/i, /^gpt-?5[\.-]?5$/i "gpt-5.5" when /^gpt-?5\.?4[^.]*mini$/i, /^gpt-?5\.?4[\.-]?mini$/i "gpt-5.4-mini" when /^gpt-?5\.?4[^.]*nano$/i, /^gpt-?5\.?4[\.-]?nano$/i "gpt-5.4-nano" when /^gpt-?5\.?4$/i, /^gpt-?5[\.-]?4$/i "gpt-5.4" # O-series reasoning models when /^o4[\.-]?mini$/i "o4-mini" when /^o3$/i "o3" else nil # No pricing available for this model — cost will show as N/A end end |