Class: Qualspec::Suite::Results
- Inherits:
-
Object
- Object
- Qualspec::Suite::Results
- Defined in:
- lib/qualspec/suite/runner.rb
Overview
Results container with multi-dimensional support
Instance Attribute Summary collapse
-
#candidate_models ⇒ Object
readonly
Returns the value of attribute candidate_models.
-
#costs ⇒ Object
readonly
Returns the value of attribute costs.
-
#evaluations ⇒ Object
readonly
Returns the value of attribute evaluations.
-
#finished_at ⇒ Object
readonly
Returns the value of attribute finished_at.
-
#metadata_captured ⇒ Object
Returns the value of attribute metadata_captured.
-
#prompts ⇒ Object
readonly
Returns the value of attribute prompts.
-
#responses ⇒ Object
readonly
Returns the value of attribute responses.
-
#started_at ⇒ Object
readonly
Returns the value of attribute started_at.
-
#suite_name ⇒ Object
readonly
Returns the value of attribute suite_name.
-
#timing ⇒ Object
readonly
Returns the value of attribute timing.
Instance Method Summary collapse
-
#cost_by_candidate ⇒ Object
Total cost per candidate.
-
#costs_tracked? ⇒ Boolean
Whether per-call cost/token metadata was captured this run.
- #finish! ⇒ Object
-
#initialize(suite_name) ⇒ Results
constructor
A new instance of Results.
- #record_evaluation(candidate:, scenario:, criteria:, evaluation:, variant: 'default', temperature: nil, winner: nil) ⇒ Object
- #record_response(candidate:, scenario:, response:, variant: 'default', temperature: nil, duration_ms: nil, cost: nil, variant_data: nil) ⇒ Object
-
#scores_by_candidate ⇒ Object
Group scores by candidate, aggregating across all variants.
-
#scores_by_scenario ⇒ Object
Detailed breakdown by scenario, then by candidate (each candidate's scores are averaged across all of its evaluations for that scenario).
-
#scores_by_scenario_variant ⇒ Object
Cross-tabulation: scenario × variant.
-
#scores_by_temperature ⇒ Object
Temperature sensitivity analysis.
-
#scores_by_variant ⇒ Object
Group scores by variant.
- #timing_by_candidate ⇒ Object
- #to_h ⇒ Object
-
#value_ranking ⇒ Object
Rank candidates by quality-per-dollar (avg score / total cost), best first.
Constructor Details
#initialize(suite_name) ⇒ Results
Returns a new instance of Results.
233 234 235 236 237 238 239 240 241 242 243 244 |
# File 'lib/qualspec/suite/runner.rb', line 233
# Sets up an empty results container for one suite run and records the start time.
#
# @param suite_name [Object] suite identifier, stored unchanged in @suite_name
def initialize(suite_name)
  @suite_name = suite_name

  # Run timestamps; @finished_at stays nil until it is assigned (see #finish!).
  @started_at = Time.now
  @finished_at = nil

  # Per-run collections, all empty at construction.
  @evaluations = []
  @responses = {}        # Nested: {candidate => {scenario => {variant => {temp => response}}}}
  @timing = {}
  @costs = {}
  @candidate_models = {} # {candidate_name => model_string}
  @prompts = {}          # {scenario_name => prompt_string}

  # Set true when the suite enables track_cost.
  @metadata_captured = false
end
Instance Attribute Details
#candidate_models ⇒ Object (readonly)
Returns the value of attribute candidate_models.
229 230 231 |
# File 'lib/qualspec/suite/runner.rb', line 229
# Reader for @candidate_models.
#
# @return [Object] the stored object itself, not a copy
def candidate_models
  @candidate_models
end
#costs ⇒ Object (readonly)
Returns the value of attribute costs.
229 230 231 |
# File 'lib/qualspec/suite/runner.rb', line 229
# Reader for @costs.
#
# @return [Object] the stored object itself, not a copy
def costs
  @costs
end
#evaluations ⇒ Object (readonly)
Returns the value of attribute evaluations.
229 230 231 |
# File 'lib/qualspec/suite/runner.rb', line 229
# Reader for @evaluations.
#
# @return [Object] the stored object itself, not a copy
def evaluations
  @evaluations
end
#finished_at ⇒ Object (readonly)
Returns the value of attribute finished_at.
229 230 231 |
# File 'lib/qualspec/suite/runner.rb', line 229
# Reader for @finished_at.
#
# @return [Object] whatever is currently stored in @finished_at
def finished_at
  @finished_at
end
#metadata_captured ⇒ Object
Returns the value of attribute metadata_captured.
231 232 233 |
# File 'lib/qualspec/suite/runner.rb', line 231
# Reader for @metadata_captured. The listing had lost the method name
# ("def @metadata_captured end"), which is not valid Ruby; the name is restored here.
#
# @return [Object] whatever is currently stored in @metadata_captured
def metadata_captured
  @metadata_captured
end
#prompts ⇒ Object (readonly)
Returns the value of attribute prompts.
229 230 231 |
# File 'lib/qualspec/suite/runner.rb', line 229
# Reader for @prompts.
#
# @return [Object] the stored object itself, not a copy
def prompts
  @prompts
end
#responses ⇒ Object (readonly)
Returns the value of attribute responses.
229 230 231 |
# File 'lib/qualspec/suite/runner.rb', line 229
# Reader for @responses.
#
# @return [Object] the stored object itself, not a copy
def responses
  @responses
end
#started_at ⇒ Object (readonly)
Returns the value of attribute started_at.
229 230 231 |
# File 'lib/qualspec/suite/runner.rb', line 229
# Reader for @started_at.
#
# @return [Object] whatever is currently stored in @started_at
def started_at
  @started_at
end
#suite_name ⇒ Object (readonly)
Returns the value of attribute suite_name.
229 230 231 |
# File 'lib/qualspec/suite/runner.rb', line 229
# Reader for @suite_name.
#
# @return [Object] whatever is currently stored in @suite_name
def suite_name
  @suite_name
end
#timing ⇒ Object (readonly)
Returns the value of attribute timing.
229 230 231 |
# File 'lib/qualspec/suite/runner.rb', line 229
# Reader for @timing.
#
# @return [Object] the stored object itself, not a copy
def timing
  @timing
end
Instance Method Details
#cost_by_candidate ⇒ Object
Total cost per candidate. Raises if cost tracking wasn't enabled.
252 253 254 255 |
# File 'lib/qualspec/suite/runner.rb', line 252
# Total cost per candidate.
#
# Calls ensure_cost_tracking! first (defined outside this listing), then
# returns a shallow copy of @costs rather than the internal object.
#
# @return [Object] a +dup+ of @costs
def cost_by_candidate
  ensure_cost_tracking!

  snapshot = @costs.dup
  snapshot
end
#costs_tracked? ⇒ Boolean
Whether per-call cost/token metadata was captured this run.
247 248 249 |
# File 'lib/qualspec/suite/runner.rb', line 247
# Whether per-call cost/token metadata was captured this run.
#
# @return [Object] the current value of @metadata_captured, returned as-is
def costs_tracked?
  @metadata_captured
end
#finish! ⇒ Object
308 309 310 |
# File 'lib/qualspec/suite/runner.rb', line 308
# Stamps the run as finished by storing the current time in @finished_at.
#
# @return [Time] the timestamp that was stored
def finish!
  stamp = Time.now
  @finished_at = stamp
end
#record_evaluation(candidate:, scenario:, criteria:, evaluation:, variant: 'default', temperature: nil, winner: nil) ⇒ Object
292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 |
# File 'lib/qualspec/suite/runner.rb', line 292
# Appends one flattened evaluation record to @evaluations.
#
# +evaluation+ is read through #score, #pass?, #reasoning and #error; the
# number of criteria is taken as Array(criteria).size.
#
# @return [Array] @evaluations after the append
def record_evaluation(candidate:, scenario:, criteria:, evaluation:, variant: 'default', temperature: nil, winner: nil)
  entry = {
    candidate: candidate,
    scenario: scenario,
    variant: variant,
    temperature: temperature,
    criteria: criteria,
    criteria_count: Array(criteria).size,
    score: evaluation.score,
    pass: evaluation.pass?,
    reasoning: evaluation.reasoning,
    error: evaluation.error,
    winner: winner
  }

  @evaluations.push(entry)
end
#record_response(candidate:, scenario:, response:, variant: 'default', temperature: nil, duration_ms: nil, cost: nil, variant_data: nil) ⇒ Object
271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 |
# File 'lib/qualspec/suite/runner.rb', line 271
# Stores one candidate response, plus its timing and cost when supplied.
#
# The response is filed under candidate -> scenario -> variant -> temperature.
# Timing is keyed by "scenario/variant" only, so a later call for the same
# pair replaces the earlier duration. Cost is added to the candidate's running
# total only when it is positive.
#
# @return [Numeric, nil] the candidate's new cost total, or nil when no
#   positive cost was given
def record_response(candidate:, scenario:, response:, variant: 'default', temperature: nil, duration_ms: nil, cost: nil, variant_data: nil)
  # Walk the nested structure, creating each level on first use.
  by_scenario = (@responses[candidate] ||= {})
  by_variant = (by_scenario[scenario] ||= {})
  by_temperature = (by_variant[variant] ||= {})
  by_temperature[temperature] = { content: response, variant_data: variant_data }

  (@timing[candidate] ||= {})["#{scenario}/#{variant}"] = duration_ms if duration_ms

  return if cost.nil? || !cost.positive?

  @costs[candidate] = (@costs[candidate] || 0.0) + cost
end
#scores_by_candidate ⇒ Object
Group scores by candidate, aggregating across all variants
313 314 315 316 317 318 319 320 321 322 323 324 325 326 |
# File 'lib/qualspec/suite/runner.rb', line 313
# Group scores by candidate, aggregating across all variants.
#
# @return [Hash] candidate => { passed:, total:, pass_rate:, avg_score: },
#   with pass_rate as a percentage rounded to 1 decimal and avg_score rounded to 2
def scores_by_candidate
  grouped = @evaluations.group_by { |entry| entry[:candidate] }

  grouped.transform_values do |entries|
    count = entries.size
    pass_count = entries.count { |entry| entry[:pass] }

    if count.positive?
      mean = entries.sum { |entry| entry[:score] }.to_f / count
      rate = (pass_count.to_f / count * 100).round(1)
    else
      mean = 0
      rate = 0
    end

    { passed: pass_count, total: count, pass_rate: rate, avg_score: mean.round(2) }
  end
end
#scores_by_scenario ⇒ Object
Detailed breakdown by scenario, then by candidate (each candidate's scores are averaged across all of its evaluations for that scenario)
368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 |
# File 'lib/qualspec/suite/runner.rb', line 368
# Per-scenario breakdown, one entry per candidate.
#
# For each scenario/candidate pair the score is the mean of all matching
# evaluations (rounded to 2 decimals) and pass is true only if every one
# passed. reasoning, variant and temperature come from the first matching
# evaluation only.
#
# @return [Hash] scenario => { candidate => { score:, pass:, reasoning:, variant:, temperature: } }
def scores_by_scenario
  per_scenario = @evaluations.group_by { |entry| entry[:scenario] }

  per_scenario.transform_values do |scenario_entries|
    scenario_entries.group_by { |entry| entry[:candidate] }.transform_values do |entries|
      lead = entries.first
      mean = entries.sum { |entry| entry[:score] }.to_f / entries.size

      {
        score: mean.round(2),
        pass: entries.none? { |entry| !entry[:pass] },
        reasoning: lead[:reasoning],
        variant: lead[:variant],
        temperature: lead[:temperature]
      }
    end
  end
end
#scores_by_scenario_variant ⇒ Object
Cross-tabulation: scenario × variant
386 387 388 389 390 391 392 393 394 395 396 397 398 |
# File 'lib/qualspec/suite/runner.rb', line 386
# Cross-tabulation: scenario x variant.
#
# Keys are [scenario, variant] pairs; each maps candidates to the score, pass
# flag, reasoning and temperature of that candidate's first evaluation in the cell.
#
# NOTE(review): only the first evaluation per candidate is reported. If several
# evaluations share a scenario/variant (e.g. multiple temperatures) the others
# are not reflected here. Confirm this is intended.
#
# @return [Hash] [scenario, variant] => { candidate => { score:, pass:, reasoning:, temperature: } }
def scores_by_scenario_variant
  reported = %i[score pass reasoning temperature]
  cells = @evaluations.group_by { |entry| [entry[:scenario], entry[:variant]] }

  cells.transform_values do |cell_entries|
    cell_entries.group_by { |entry| entry[:candidate] }.transform_values do |entries|
      lead = entries.first
      reported.zip(lead.values_at(*reported)).to_h
    end
  end
end
#scores_by_temperature ⇒ Object
Temperature sensitivity analysis
345 346 347 348 349 350 351 352 353 |
# File 'lib/qualspec/suite/runner.rb', line 345
# Temperature sensitivity analysis.
#
# Groups evaluations by their :temperature value (nil is a group of its own)
# and reports the mean score and pass percentage for each group.
#
# @return [Hash] temperature => { avg_score:, pass_rate: }
def scores_by_temperature
  @evaluations.group_by { |entry| entry[:temperature] }.transform_values do |entries|
    count = entries.size
    score_total = entries.sum { |entry| entry[:score] }
    pass_count = entries.count { |entry| entry[:pass] }

    {
      avg_score: (score_total.to_f / count).round(2),
      pass_rate: (pass_count.to_f / count * 100).round(1)
    }
  end
end
#scores_by_variant ⇒ Object
Group scores by variant
329 330 331 332 333 334 335 336 337 338 339 340 341 342 |
# File 'lib/qualspec/suite/runner.rb', line 329
# Group scores by variant.
#
# @return [Hash] variant => { passed:, total:, pass_rate:, avg_score: },
#   with pass_rate as a percentage rounded to 1 decimal and avg_score rounded to 2
def scores_by_variant
  grouped = @evaluations.group_by { |entry| entry[:variant] }

  grouped.transform_values do |entries|
    count = entries.size
    pass_count = entries.count { |entry| entry[:pass] }

    if count.positive?
      mean = entries.sum { |entry| entry[:score] }.to_f / count
      rate = (pass_count.to_f / count * 100).round(1)
    else
      mean = 0
      rate = 0
    end

    { passed: pass_count, total: count, pass_rate: rate, avg_score: mean.round(2) }
  end
end
#timing_by_candidate ⇒ Object
355 356 357 358 359 360 361 362 363 364 365 |
# File 'lib/qualspec/suite/runner.rb', line 355
# Summarises recorded durations per candidate.
#
# The mean is computed in floating point and then rounded. Dividing two
# Integers first would truncate (1 ms + 2 ms => 1), leaving nothing for
# the rounding step to do.
#
# @return [Hash] candidate => { total_ms:, avg_ms:, count: }; avg_ms is the
#   mean rounded to the nearest integer, or 0 when nothing was recorded
def timing_by_candidate
  @timing.transform_values do |scenarios|
    total_ms = scenarios.values.sum
    avg_ms = scenarios.empty? ? 0 : total_ms.to_f / scenarios.size

    {
      total_ms: total_ms,
      avg_ms: avg_ms.round,
      count: scenarios.size
    }
  end
end
#to_h ⇒ Object
400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 |
# File 'lib/qualspec/suite/runner.rb', line 400
# Serialises the whole results container into a plain Hash.
#
# Timestamps are rendered with #iso8601 (finished_at stays nil while unset);
# the summary/timing/scenario sections come from the corresponding
# scores_by_* and timing_by_candidate methods. @costs, @evaluations and
# @responses are embedded as-is, not copied.
#
# @return [Hash]
def to_h
  finished = @finished_at.nil? ? nil : @finished_at.iso8601
  summary = {
    by_candidate: scores_by_candidate,
    by_variant: scores_by_variant,
    by_temperature: scores_by_temperature
  }

  {
    suite_name: @suite_name,
    started_at: @started_at.iso8601,
    finished_at: finished,
    summary: summary,
    timing: timing_by_candidate,
    costs: @costs,
    by_scenario: scores_by_scenario,
    by_scenario_variant: scores_by_scenario_variant,
    evaluations: @evaluations,
    responses: @responses
  }
end
#value_ranking ⇒ Object
Rank candidates by quality-per-dollar (avg score / total cost), best first. Candidates with zero recorded cost sort last. Raises a helpful error if cost tracking wasn't enabled for the run.
260 261 262 263 264 265 266 267 268 269 |
# File 'lib/qualspec/suite/runner.rb', line 260
# Rank candidates by quality-per-dollar (avg score / total cost), best first.
#
# Calls ensure_cost_tracking! first (defined outside this listing). A
# candidate with no positive recorded cost gets score_per_dollar nil and is
# always placed after every priced candidate. Ties keep the order produced
# by scores_by_candidate.
#
# @return [Hash] candidate => { avg_score:, cost:, score_per_dollar: };
#   score_per_dollar is rounded to the nearest integer, or nil when cost is 0
def value_ranking
  ensure_cost_tracking!

  ranked = scores_by_candidate.map do |candidate, stats|
    cost = @costs[candidate].to_f # nil (nothing recorded) becomes 0.0
    score_per_dollar = cost.positive? ? (stats[:avg_score] / cost).round : nil
    [candidate, { avg_score: stats[:avg_score], cost: cost, score_per_dollar: score_per_dollar }]
  end

  # Three-part key: priced (0) before unpriced (1), then higher ratio first,
  # then original position. A plain `-(ratio || 0)` key would let an unpriced
  # candidate tie with a priced one whose ratio rounds to 0, and sort_by does
  # not promise any order for ties.
  ordered = ranked.each_with_index.sort_by do |(_, v), position|
    [v[:score_per_dollar] ? 0 : 1, -(v[:score_per_dollar] || 0), position]
  end

  ordered.map(&:first).to_h
end