Module: CompletionKit::McpTools::Judges
- Extended by:
- Base
- Defined in:
- app/services/completion_kit/mcp_tools/judges.rb
Constant Summary
collapse
- TOOLS =
{
  # Tool registry for the judge-related MCP tools. Each entry pairs a tool
  # name with its description, its input schema, and a handler symbol that
  # matches a class method of the same name in this module (presumably
  # dispatched by Base — confirm against Base.call).
  "judges_replay" => {
    description: "Run the current judge against a dataset (judge-only run). Wraps runs_create with prompt_id omitted and output_column supplied. Re-judges existing dataset outputs so you can compare against human verdicts.",
    inputSchema: {
      type: "object",
      properties: {
        name: { type: "string" },
        metric_id: { type: "integer" },
        dataset_id: { type: "integer" },
        judge_model: { type: "string" },
        output_column: { type: "string", description: "Dataset column with the existing outputs to grade. Defaults to actual_output." }
      },
      # output_column is optional; the handler falls back to "actual_output".
      required: ["name", "metric_id", "dataset_id", "judge_model"]
    },
    handler: :replay
  },
  "judges_compare" => {
    # The description must agree with `required` below: the handler reads all
    # three ids and scopes both version lookups to the given metric, so there
    # is no "versions only" calling mode.
    description: "Compare two metric versions' agreement stats side by side. Pass metric_id together with metric_version_a_id and metric_version_b_id; both versions must belong to that metric.",
    inputSchema: {
      type: "object",
      properties: {
        metric_id: { type: "integer" },
        metric_version_a_id: { type: "integer" },
        metric_version_b_id: { type: "integer" }
      },
      required: ["metric_id", "metric_version_a_id", "metric_version_b_id"]
    },
    handler: :compare
  }
}.freeze
Class Method Summary
collapse
Methods included from Base
call, definitions, error_result, text_result
Class Method Details
.compare(args) ⇒ Object
54
55
56
57
58
59
60
61
62
63
64
65
66
67
|
# File 'app/services/completion_kit/mcp_tools/judges.rb', line 54
# Handler for the "judges_compare" tool: reports agreement stats for two
# versions of one metric next to each other.
#
# @param args [Hash] string-keyed tool arguments; reads "metric_id",
#   "metric_version_a_id" and "metric_version_b_id"
# @return [Object] whatever text_result returns for the comparison payload
#   (metric_id, a, b, delta, recommendation)
def self.compare(args)
  metric = CompletionKit::Metric.find(args["metric_id"])

  # Each version lookup is restricted to the metric found above, so an id
  # belonging to another metric is not matched.
  find_version = lambda do |key|
    CompletionKit::MetricVersion.where(metric_id: metric.id).find(args[key])
  end
  version_a = find_version.call("metric_version_a_id")
  version_b = find_version.call("metric_version_b_id")

  stats_a, stats_b = [version_a, version_b].map do |version|
    CompletionKit::MetricAgreementStats.for(metric, metric_version: version)
  end

  payload = {
    metric_id: metric.id,
    a: metric_version_payload(version_a, stats_a),
    b: metric_version_payload(version_b, stats_b),
    delta: delta_payload(stats_a, stats_b),
    recommendation: recommendation_for(stats_a, stats_b)
  }
  text_result(payload)
end
|
.delta_payload(a, b) ⇒ Object
82
83
84
85
86
87
88
89
|
# File 'app/services/completion_kit/mcp_tools/judges.rb', line 82
# Builds the "delta" section of a comparison from two stats objects.
#
# @param a [#agreement_point, #mae, #kappa, #sample_size] stats for version A
# @param b [#agreement_point, #mae, #kappa, #sample_size] stats for version B
# @return [Hash] :agreement, :mae and :kappa as pair_delta hashes, plus
#   :sample_size holding both raw sample sizes (no difference is computed)
def self.delta_payload(a, b)
  agreement = pair_delta(a.agreement_point, b.agreement_point)
  mae = pair_delta(a.mae, b.mae)
  kappa = pair_delta(a.kappa, b.kappa)
  sample_sizes = { a: a.sample_size, b: b.sample_size }

  { agreement: agreement, mae: mae, kappa: kappa, sample_size: sample_sizes }
end
|
.metric_version_payload(version, stats) ⇒ Object
69
70
71
72
73
74
75
76
77
78
79
80
|
# File 'app/services/completion_kit/mcp_tools/judges.rb', line 69
# Flattens one metric version and its agreement stats into a single hash.
#
# @param version [#id, #state, #current, #source, #created_at] the version
# @param stats [#sample_size, #agreement_point, #agreement_low,
#   #agreement_high, #borderline_rate, #mae, #kappa] stats for that version
# @return [Hash] version attributes followed by the stats values
def self.metric_version_payload(version, stats)
  version_fields = {
    id: version.id,
    state: version.state,
    current: version.current,
    source: version.source,
    created_at: version.created_at
  }
  stats_fields = {
    sample_size: stats.sample_size,
    agreement_point: stats.agreement_point,
    agreement_low: stats.agreement_low,
    agreement_high: stats.agreement_high,
    borderline_rate: stats.borderline_rate,
    mae: stats.mae,
    kappa: stats.kappa
  }

  version_fields.merge(stats_fields)
end
|
.pair_delta(a, b) ⇒ Object
91
92
93
|
# File 'app/services/completion_kit/mcp_tools/judges.rb', line 91
# Pairs two values with their difference (b minus a).
#
# @param a [Numeric, nil] value for side A
# @param b [Numeric, nil] value for side B
# @return [Hash] { a:, b:, delta: } where delta is nil if either side is nil
def self.pair_delta(a, b)
  # Stays nil when the guard skips the assignment.
  difference = b - a unless a.nil? || b.nil?
  { a: a, b: b, delta: difference }
end
|
.recommendation_for(a, b) ⇒ Object
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
|
# File 'app/services/completion_kit/mcp_tools/judges.rb', line 95
# Turns two sets of agreement stats into a recommendation.
#
# Decision order:
# 1. fewer than 30 samples across both versions -> "need_more_data"
# 2. either agreement_point missing              -> "no_change"
# 3. B above A by more than 0.03                 -> "recommend"
# 4. B below A by more than 0.03                 -> "hold"
# 5. otherwise                                   -> "no_change"
#
# @param a [#sample_size, #agreement_point] stats for version A
# @param b [#sample_size, #agreement_point] stats for version B
# @return [Hash] { state: String, reason: String }
def self.recommendation_for(a, b)
  combined = a.sample_size + b.sample_size
  if combined < 30
    return { state: "need_more_data", reason: "Combined n=#{combined}; need 30+ to make a call." }
  end

  agreement_a = a.agreement_point
  agreement_b = b.agreement_point
  if agreement_a.nil? || agreement_b.nil?
    return { state: "no_change", reason: "Not enough verdicts on one of the versions to compare." }
  end

  lift = agreement_b - agreement_a
  # The reason text shows the lift scaled by 100 and rounded to a whole number.
  points = (lift * 100).round

  return { state: "recommend", reason: "B agreement +#{points}pt over A." } if lift > 0.03
  return { state: "hold", reason: "B agreement #{points}pt vs A." } if lift < -0.03

  { state: "no_change", reason: "Agreement within noise (#{points}pt)." }
end
|
.replay(args) ⇒ Object
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
|
# File 'app/services/completion_kit/mcp_tools/judges.rb', line 37
# Handler for the "judges_replay" tool: creates a run over a dataset's
# existing outputs and attaches the given metric to it.
#
# @param args [Hash] string-keyed tool arguments; reads "name", "metric_id",
#   "dataset_id", "judge_model" and, optionally, "output_column"
# @return [Object] text_result of the reloaded run's JSON on success, or
#   error_result with the run's joined error messages when saving fails
def self.replay(args)
  metric = CompletionKit::Metric.find(args["metric_id"])
  dataset = CompletionKit::Dataset.find(args["dataset_id"])

  # A blank or missing column name falls back to "actual_output".
  column = args["output_column"].presence || "actual_output"
  run = CompletionKit::Run.new(
    name: args["name"],
    dataset: dataset,
    judge_model: args["judge_model"],
    output_column: column
  )
  return error_result(run.errors.full_messages.join(", ")) unless run.save

  # NOTE(review): the run is already saved when metrics are attached; if
  # replace_metrics! raises, the run presumably remains without metrics —
  # confirm whether that is acceptable.
  run.replace_metrics!([metric.id])
  text_result(run.reload.as_json)
end
|