Module: CompletionKit::McpTools::Judges
- Extended by:
- Base
- Defined in:
- app/services/completion_kit/mcp_tools/judges.rb
Constant Summary collapse
# Registry of the MCP tools exposed by this module, keyed by tool name.
# Each entry carries the description shown to the client, a JSON-schema
# style `inputSchema`, and the name of the class method that handles the call.
TOOLS = {
  "judges_suggest" => {
    description: "Ask the model to rewrite the metric's judge instruction in N variants targeted at the recent disagreements. Each variant is saved as a draft JudgeVersion with source=\"suggestion\". Returns the persisted drafts. Stripe-metering hooks fire via ActiveSupport::Notifications under completion_kit.judge_suggestion.generated.",
    inputSchema: {
      type: "object",
      properties: {
        metric_id: { type: "integer" },
        count: { type: "integer", description: "How many variants to request (default 3, max 5)." },
        model: { type: "string", description: "Override the model used to generate variants. Defaults to CompletionKit.config.judge_model." }
      },
      required: ["metric_id"]
    },
    handler: :suggest
  },
  "judges_replay" => {
    description: "Run the current judge against a dataset (judge-only run). Wraps runs_create with prompt_id omitted and output_column supplied. Re-judges existing dataset outputs so you can compare against human verdicts.",
    inputSchema: {
      type: "object",
      properties: {
        name: { type: "string" },
        metric_id: { type: "integer" },
        dataset_id: { type: "integer" },
        judge_model: { type: "string" },
        output_column: { type: "string", description: "Dataset column with the existing outputs to grade. Defaults to actual_output." }
      },
      required: ["name", "metric_id", "dataset_id", "judge_model"]
    },
    handler: :replay
  },
  "judges_compare" => {
    # The handler reads all three ids and looks both versions up under the
    # given metric, so the description states that all three are required.
    description: "Compare two judge versions' calibration stats side by side. Requires metric_id plus judge_version_a_id and judge_version_b_id; both versions must belong to that metric.",
    inputSchema: {
      type: "object",
      properties: {
        metric_id: { type: "integer" },
        judge_version_a_id: { type: "integer" },
        judge_version_b_id: { type: "integer" }
      },
      required: ["metric_id", "judge_version_a_id", "judge_version_b_id"]
    },
    handler: :compare
  }
}.freeze
Class Method Summary collapse
- .compare(args) ⇒ Object
- .delta_payload(a, b) ⇒ Object
- .judge_version_payload(version, stats) ⇒ Object
- .pair_delta(a, b) ⇒ Object
- .recommendation_for(a, b) ⇒ Object
- .replay(args) ⇒ Object
- .suggest(args) ⇒ Object
Methods included from Base
call, definitions, error_result, text_result
Class Method Details
.compare(args) ⇒ Object
78 79 80 81 82 83 84 85 86 87 88 89 90 91 |
# File 'app/services/completion_kit/mcp_tools/judges.rb', line 78

# Handler for the "judges_compare" tool: loads two judge versions of one
# metric, computes calibration stats for each, and returns them side by side
# together with their deltas and a recommendation.
#
# Both versions are looked up through a relation scoped to the metric's id,
# so a version id belonging to a different metric is not matched
# (presumably surfacing as ActiveRecord::RecordNotFound — confirm how Base handles it).
#
# @param args [Hash] tool arguments; reads "metric_id", "judge_version_a_id"
#   and "judge_version_b_id"
# @return [Object] the value produced by text_result for the comparison payload
def self.compare(args)
  metric = CompletionKit::Metric.find(args["metric_id"])
  metric_versions = CompletionKit::JudgeVersion.where(metric_id: metric.id)

  version_a = metric_versions.find(args["judge_version_a_id"])
  version_b = metric_versions.find(args["judge_version_b_id"])

  stats_a, stats_b = [version_a, version_b].map do |version|
    CompletionKit::MetricCalibrationStats.for(metric, judge_version: version)
  end

  payload = {
    metric_id: metric.id,
    a: judge_version_payload(version_a, stats_a),
    b: judge_version_payload(version_b, stats_b),
    delta: delta_payload(stats_a, stats_b),
    recommendation: recommendation_for(stats_a, stats_b)
  }
  text_result(payload)
end
.delta_payload(a, b) ⇒ Object
106 107 108 109 110 111 112 113 |
# File 'app/services/completion_kit/mcp_tools/judges.rb', line 106

# Builds the "delta" section of a comparison: one pair_delta entry each for
# agreement, MAE and kappa, plus the raw sample sizes of both sides.
#
# @param a [Object] stats for version A; must respond to agreement_point,
#   mae, kappa and sample_size
# @param b [Object] stats for version B, same interface as +a+
# @return [Hash] keys :agreement, :mae, :kappa, :sample_size
def self.delta_payload(a, b)
  agreement = pair_delta(a.agreement_point, b.agreement_point)
  mae = pair_delta(a.mae, b.mae)
  kappa = pair_delta(a.kappa, b.kappa)
  sample_sizes = { a: a.sample_size, b: b.sample_size }

  { agreement: agreement, mae: mae, kappa: kappa, sample_size: sample_sizes }
end
.judge_version_payload(version, stats) ⇒ Object
93 94 95 96 97 98 99 100 101 102 103 104 |
# File 'app/services/completion_kit/mcp_tools/judges.rb', line 93

# Flattens a judge version and its calibration stats into a single hash.
# Version attributes come first, followed by the stats attributes, each
# under a key of the same name.
#
# @param version [Object] responds to id, state, current, source, created_at
# @param stats [Object] responds to sample_size, agreement_point,
#   agreement_low, agreement_high, borderline_rate, mae, kappa
# @return [Hash{Symbol => Object}] attribute name => value
def self.judge_version_payload(version, stats)
  version_fields = %i[id state current source created_at]
  stats_fields = %i[
    sample_size agreement_point agreement_low agreement_high
    borderline_rate mae kappa
  ]

  payload = version_fields.each_with_object({}) do |field, acc|
    acc[field] = version.public_send(field)
  end
  stats_fields.each_with_object(payload) do |field, acc|
    acc[field] = stats.public_send(field)
  end
end
.pair_delta(a, b) ⇒ Object
115 116 117 |
# File 'app/services/completion_kit/mcp_tools/judges.rb', line 115

# Pairs two values with their difference (b minus a).
# The difference is nil when either value is nil.
#
# @param a [Numeric, nil] value for side A
# @param b [Numeric, nil] value for side B
# @return [Hash] { a:, b:, delta: }
def self.pair_delta(a, b)
  # Stays nil when the guard skips the assignment.
  difference = b - a unless a.nil? || b.nil?
  { a: a, b: b, delta: difference }
end
.recommendation_for(a, b) ⇒ Object
119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 |
# File 'app/services/completion_kit/mcp_tools/judges.rb', line 119

# Turns two sets of calibration stats into a recommendation hash.
#
# Decision order:
# 1. combined sample size below 30        -> "need_more_data"
# 2. either agreement_point missing       -> "no_change"
# 3. B beats A by more than 0.03          -> "recommend"
# 4. B trails A by more than 0.03         -> "hold"
# 5. otherwise                            -> "no_change" (within noise)
#
# @param a [Object] stats for version A; responds to sample_size and agreement_point
# @param b [Object] stats for version B, same interface as +a+
# @return [Hash] { state: String, reason: String }
def self.recommendation_for(a, b)
  combined = a.sample_size + b.sample_size
  if combined < 30
    return { state: "need_more_data", reason: "Combined n=#{combined}; need 30+ to make a call." }
  end

  if a.agreement_point.nil? || b.agreement_point.nil?
    return { state: "no_change", reason: "Not enough verdicts on one of the versions to compare." }
  end

  lift = b.agreement_point - a.agreement_point
  # Agreement difference expressed in whole percentage points for the message.
  points = (lift * 100).round

  return { state: "recommend", reason: "B agreement +#{points}pt over A." } if lift > 0.03
  return { state: "hold", reason: "B agreement #{points}pt vs A." } if lift < -0.03

  { state: "no_change", reason: "Agreement within noise (#{points}pt)." }
end
.replay(args) ⇒ Object
61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
# File 'app/services/completion_kit/mcp_tools/judges.rb', line 61

# Handler for the "judges_replay" tool: creates a run over a dataset that
# grades an existing output column with the given metric.
#
# @param args [Hash] tool arguments; reads "name", "metric_id", "dataset_id",
#   "judge_model" and the optional "output_column"
# @return [Object] text_result with the reloaded run's JSON on success,
#   or error_result with the run's validation messages when saving fails
def self.replay(args)
  metric = CompletionKit::Metric.find(args["metric_id"])
  dataset = CompletionKit::Dataset.find(args["dataset_id"])

  run = CompletionKit::Run.new(
    name: args["name"],
    dataset: dataset,
    judge_model: args["judge_model"],
    # A blank or missing column falls back to the default output column.
    output_column: args["output_column"].presence || "actual_output"
  )

  if run.save
    run.replace_metrics!([metric.id])
    # Reload so the serialized run reflects the metrics attached above.
    text_result(run.reload.as_json)
  else
    # full_messages yields an array of message strings, joined into one error line.
    error_result(run.errors.full_messages.join(", "))
  end
end
.suggest(args) ⇒ Object
50 51 52 53 54 55 56 57 58 59 |
# File 'app/services/completion_kit/mcp_tools/judges.rb', line 50

# Handler for the "judges_suggest" tool: asks the variant generator for
# rewritten judge instructions for a metric and persists the results.
#
# The requested count is capped at 5; a missing, zero or negative count
# falls back to JudgeVariantGenerator::DEFAULT_VARIANT_COUNT.
#
# @param args [Hash] tool arguments; reads "metric_id", "count" and "model"
# @return [Object] text_result with the persisted versions as JSON, or
#   error_result when the generator yields no variants
def self.suggest(args)
  metric = CompletionKit::Metric.find(args["metric_id"])

  requested = [args["count"].to_i, 5].min
  count = requested.positive? ? requested : CompletionKit::JudgeVariantGenerator::DEFAULT_VARIANT_COUNT

  generator = CompletionKit::JudgeVariantGenerator.new(metric, count: count, model: args["model"])
  variants = generator.call
  if variants.empty?
    return error_result("Variant generator returned no parseable variants. Try again or change the model.")
  end

  persisted = generator.persist!(variants)
  text_result(persisted.map(&:as_json))
end