Class: Qualspec::Suite::Results

Inherits:
Object
  • Object
show all
Defined in:
lib/qualspec/suite/runner.rb

Overview

Results container with multi-dimensional support

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(suite_name) ⇒ Results

Returns a new instance of Results.



233
234
235
236
237
238
239
240
241
242
243
244
# File 'lib/qualspec/suite/runner.rb', line 233

# Builds an empty results container for one suite run and records the
# moment of construction as the run's start time.
#
# @param suite_name [Object] stored unchanged in @suite_name
#   (presumably the suite's name as a String; confirm against the caller)
def initialize(suite_name)
  @suite_name = suite_name
  @evaluations = [] # One hash per record_evaluation call, in call order
  @responses = {} # Nested: {candidate => {scenario => {variant => {temp => response}}}}
  @timing = {} # {candidate => {"scenario/variant" => duration_ms}}, filled by record_response
  @costs = {} # {candidate => running cost total}, filled by record_response
  @candidate_models = {} # {candidate_name => model_string}
  @prompts = {}          # {scenario_name => prompt_string}
  @started_at = Time.now
  @finished_at = nil # stays nil until finish! is called
  @metadata_captured = false # set true when the suite enables track_cost
end

Instance Attribute Details

#candidate_models ⇒ Object (readonly)

Returns the value of attribute candidate_models.



229
230
231
# File 'lib/qualspec/suite/runner.rb', line 229

# Reader for @candidate_models, which the constructor creates as an empty
# Hash annotated {candidate_name => model_string}.
# Returns the stored object itself, not a copy.
def candidate_models
  @candidate_models
end

#costs ⇒ Object (readonly)

Returns the value of attribute costs.



229
230
231
# File 'lib/qualspec/suite/runner.rb', line 229

# Reader for @costs, the Hash in which record_response accumulates a
# running cost total per candidate.
# Returns the stored object itself, not a copy, and performs no
# cost-tracking check (unlike cost_by_candidate).
def costs
  @costs
end

#evaluations ⇒ Object (readonly)

Returns the value of attribute evaluations.



229
230
231
# File 'lib/qualspec/suite/runner.rb', line 229

# Reader for @evaluations, the Array to which record_evaluation appends one
# Hash per evaluation. Returns the stored Array itself, not a copy.
def evaluations
  @evaluations
end

#finished_at ⇒ Object (readonly)

Returns the value of attribute finished_at.



229
230
231
# File 'lib/qualspec/suite/runner.rb', line 229

# Reader for @finished_at: nil from construction until finish! stores a
# Time in it.
def finished_at
  @finished_at
end

#metadata_captured ⇒ Object

Returns the value of attribute metadata_captured.



231
232
233
# File 'lib/qualspec/suite/runner.rb', line 231

# Reader for the @metadata_captured flag. The constructor initializes it to
# false; costs_tracked? returns this same value.
def metadata_captured
  @metadata_captured
end

#prompts ⇒ Object (readonly)

Returns the value of attribute prompts.



229
230
231
# File 'lib/qualspec/suite/runner.rb', line 229

# Reader for @prompts, which the constructor creates as an empty Hash
# annotated {scenario_name => prompt_string}.
# Returns the stored object itself, not a copy.
def prompts
  @prompts
end

#responses ⇒ Object (readonly)

Returns the value of attribute responses.



229
230
231
# File 'lib/qualspec/suite/runner.rb', line 229

# Reader for @responses, the nested Hash filled by record_response:
# {candidate => {scenario => {variant => {temperature => {content:, variant_data:}}}}}.
# Returns the stored object itself, not a copy.
def responses
  @responses
end

#started_at ⇒ Object (readonly)

Returns the value of attribute started_at.



229
230
231
# File 'lib/qualspec/suite/runner.rb', line 229

# Reader for @started_at, the Time.now value captured when the container
# was constructed.
def started_at
  @started_at
end

#suite_name ⇒ Object (readonly)

Returns the value of attribute suite_name.



229
230
231
# File 'lib/qualspec/suite/runner.rb', line 229

# Reader for @suite_name, the value passed to the constructor.
def suite_name
  @suite_name
end

#timing ⇒ Object (readonly)

Returns the value of attribute timing.



229
230
231
# File 'lib/qualspec/suite/runner.rb', line 229

# Reader for @timing, the raw duration table filled by record_response:
# {candidate => {"scenario/variant" => duration_ms}}.
# Returns the stored object itself; see timing_by_candidate for totals.
def timing
  @timing
end

Instance Method Details

#cost_by_candidate ⇒ Object

Total cost per candidate. Raises if cost tracking wasn't enabled.



252
253
254
255
# File 'lib/qualspec/suite/runner.rb', line 252

# Total recorded cost per candidate.
#
# Runs ensure_cost_tracking! first (defined outside this view; per this
# page's description it raises when cost tracking was not enabled).
#
# @return [Hash] a shallow copy of @costs, so adding or removing keys on the
#   result does not touch the container's own table
def cost_by_candidate
  ensure_cost_tracking!
  @costs.dup
end

#costs_tracked? ⇒ Boolean

Whether per-call cost/token metadata was captured this run.

Returns:

  • (Boolean)


247
248
249
# File 'lib/qualspec/suite/runner.rb', line 247

# Predicate view of the @metadata_captured flag. Returns the stored value
# unchanged, which is false straight after construction.
def costs_tracked?
  @metadata_captured
end

#finish! ⇒ Object



308
309
310
# File 'lib/qualspec/suite/runner.rb', line 308

# Marks the run as finished by storing the current wall-clock time in
# @finished_at. A repeated call overwrites the earlier timestamp.
#
# @return [Time] the timestamp that was just stored
def finish!
  @finished_at = Time.now
end

#record_evaluation(candidate:, scenario:, criteria:, evaluation:, variant: 'default', temperature: nil, winner: nil) ⇒ Object



292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
# File 'lib/qualspec/suite/runner.rb', line 292

# Appends one flattened evaluation record to @evaluations.
#
# The evaluation object is read through #score, #pass?, #reasoning and
# #error; everything else is copied from the keyword arguments.
#
# @param candidate [Object] candidate identifier
# @param scenario [Object] scenario identifier
# @param criteria [Object] criteria as given; counted via Array(criteria)
# @param evaluation [#score, #pass?, #reasoning, #error] judged result
# @param variant [Object] variant label, 'default' when omitted
# @param temperature [Object, nil] temperature the response was sampled at
# @param winner [Object, nil] stored verbatim under :winner
# @return [Array] the @evaluations array after the append
def record_evaluation(candidate:, scenario:, criteria:, evaluation:, variant: 'default', temperature: nil, winner: nil)
  dimensions = { candidate: candidate, scenario: scenario, variant: variant, temperature: temperature }
  rubric = { criteria: criteria, criteria_count: Array(criteria).size }
  verdict = {
    score: evaluation.score,
    pass: evaluation.pass?,
    reasoning: evaluation.reasoning,
    error: evaluation.error,
    winner: winner
  }

  # merge keeps receiver keys first, so the record's key order is stable.
  @evaluations.push(dimensions.merge(rubric).merge(verdict))
end

#record_response(candidate:, scenario:, response:, variant: 'default', temperature: nil, duration_ms: nil, cost: nil, variant_data: nil) ⇒ Object



271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
# File 'lib/qualspec/suite/runner.rb', line 271

# Stores one candidate response plus its optional duration and cost.
#
# The response lands in the nested @responses table keyed by candidate,
# scenario, variant and temperature. A duration is filed in @timing under
# "scenario/variant"; a positive cost is added to the candidate's running
# total in @costs.
#
# @param candidate [Object] candidate identifier
# @param scenario [Object] scenario identifier
# @param response [Object] response content, stored under :content
# @param variant [Object] variant label, 'default' when omitted
# @param temperature [Object, nil] innermost key of the responses table
# @param duration_ms [Numeric, nil] skipped when nil or false
# @param cost [Numeric, nil] ignored unless strictly positive
# @param variant_data [Object, nil] stored next to the content
# @return [Numeric, nil] the candidate's new cost total, or nil when no cost
#   was recorded
def record_response(candidate:, scenario:, response:, variant: 'default', temperature: nil, duration_ms: nil, cost: nil, variant_data: nil)
  # Walk down the nested table, creating each level on first use.
  by_scenario = (@responses[candidate] ||= {})
  by_variant = (by_scenario[scenario] ||= {})
  by_temperature = (by_variant[variant] ||= {})
  by_temperature[temperature] = { content: response, variant_data: variant_data }

  (@timing[candidate] ||= {})["#{scenario}/#{variant}"] = duration_ms if duration_ms

  return unless cost&.positive?

  @costs[candidate] = (@costs[candidate] || 0.0) + cost
end

#scores_by_candidate ⇒ Object

Group scores by candidate, aggregating across all variants



313
314
315
316
317
318
319
320
321
322
323
324
325
326
# File 'lib/qualspec/suite/runner.rb', line 313

# Per-candidate rollup over every recorded evaluation, pooling all variants
# and temperatures.
#
# @return [Hash] candidate => { passed:, total:, pass_rate:, avg_score: }
#   where pass_rate is a percentage rounded to one decimal and avg_score is
#   rounded to two decimals
def scores_by_candidate
  rollup = {}

  @evaluations.group_by { |entry| entry[:candidate] }.each do |candidate, entries|
    count = entries.size
    pass_count = entries.count { |entry| entry[:pass] }

    if count.zero?
      mean = 0
      rate = 0
    else
      mean = entries.sum { |entry| entry[:score] }.to_f / count
      rate = (pass_count.to_f / count * 100).round(1)
    end

    rollup[candidate] = { passed: pass_count, total: count, pass_rate: rate, avg_score: mean.round(2) }
  end

  rollup
end

#scores_by_scenario ⇒ Object

Detailed breakdown by scenario + variant



368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
# File 'lib/qualspec/suite/runner.rb', line 368

# Scenario-by-candidate breakdown.
#
# For each scenario and candidate, the score is the mean over all of that
# pair's evaluations (rounded to two decimals) and :pass is true only when
# every evaluation passed. The reasoning, variant and temperature are taken
# from the first evaluation recorded for the pair.
#
# @return [Hash] scenario => { candidate => { score:, pass:, reasoning:,
#   variant:, temperature: } }
def scores_by_scenario
  per_scenario = @evaluations.group_by { |entry| entry[:scenario] }

  per_scenario.transform_values do |scenario_entries|
    per_candidate = scenario_entries.group_by { |entry| entry[:candidate] }

    per_candidate.transform_values do |entries|
      head = entries.first
      score_total = entries.sum { |entry| entry[:score] }

      {
        score: (score_total.to_f / entries.size).round(2),
        pass: entries.none? { |entry| !entry[:pass] },
        reasoning: head[:reasoning],
        variant: head[:variant],
        temperature: head[:temperature]
      }
    end
  end
end

#scores_by_scenario_variant ⇒ Object

Cross-tabulation: scenario × variant



386
387
388
389
390
391
392
393
394
395
396
397
398
# File 'lib/qualspec/suite/runner.rb', line 386

# Cross-tabulation keyed by [scenario, variant] pairs.
#
# Nothing is averaged here: each cell reports the FIRST evaluation recorded
# for that candidate within the scenario/variant pair.
#
# @return [Hash] [scenario, variant] => { candidate => { score:, pass:,
#   reasoning:, temperature: } }
def scores_by_scenario_variant
  reported_fields = %i[score pass reasoning temperature]
  cells = @evaluations.group_by { |entry| [entry[:scenario], entry[:variant]] }

  cells.transform_values do |cell_entries|
    cell_entries.group_by { |entry| entry[:candidate] }.transform_values do |entries|
      head = entries.first
      reported_fields.each_with_object({}) { |field, row| row[field] = head[field] }
    end
  end
end

#scores_by_temperature ⇒ Object

Temperature sensitivity analysis



345
346
347
348
349
350
351
352
353
# File 'lib/qualspec/suite/runner.rb', line 345

# Temperature sensitivity: mean score and pass percentage for each distinct
# temperature value seen in the recorded evaluations (nil is a group too).
#
# @return [Hash] temperature => { avg_score:, pass_rate: } with avg_score
#   rounded to two decimals and pass_rate a percentage rounded to one
def scores_by_temperature
  sensitivity = {}

  @evaluations.group_by { |entry| entry[:temperature] }.each do |temperature, entries|
    count = entries.size
    score_total = entries.sum { |entry| entry[:score] }
    pass_count = entries.count { |entry| entry[:pass] }

    sensitivity[temperature] = {
      avg_score: (score_total.to_f / count).round(2),
      pass_rate: (pass_count.to_f / count * 100).round(1)
    }
  end

  sensitivity
end

#scores_by_variant ⇒ Object

Group scores by variant



329
330
331
332
333
334
335
336
337
338
339
340
341
342
# File 'lib/qualspec/suite/runner.rb', line 329

# Per-variant rollup over every recorded evaluation, pooling all candidates
# and temperatures.
#
# @return [Hash] variant => { passed:, total:, pass_rate:, avg_score: }
#   where pass_rate is a percentage rounded to one decimal and avg_score is
#   rounded to two decimals
def scores_by_variant
  per_variant = @evaluations.group_by { |entry| entry[:variant] }

  per_variant.transform_values do |entries|
    count = entries.size
    pass_count = entries.count { |entry| entry[:pass] }
    any_entries = !count.zero?

    mean = any_entries ? entries.sum { |entry| entry[:score] }.to_f / count : 0
    rate = any_entries ? (pass_count.to_f / count * 100).round(1) : 0

    { passed: pass_count, total: count, pass_rate: rate, avg_score: mean.round(2) }
  end
end

#timing_by_candidate ⇒ Object



355
356
357
358
359
360
361
362
363
364
365
# File 'lib/qualspec/suite/runner.rb', line 355

# Summarizes the recorded response durations for each candidate.
#
# @return [Hash] candidate => { total_ms:, avg_ms:, count: } where total_ms
#   is the sum of that candidate's recorded durations, count is how many
#   durations were recorded, and avg_ms is their mean rounded to the nearest
#   Integer (0 when nothing was recorded)
def timing_by_candidate
  @timing.transform_values do |durations_by_run|
    durations = durations_by_run.values
    total_ms = durations.sum

    # Divide as Float: with Integer durations, Integer#/ would floor the mean
    # before rounding (e.g. 1 ms and 2 ms would average to 1 instead of 2).
    avg_ms = durations.empty? ? 0 : (total_ms.to_f / durations.size).round

    { total_ms: total_ms, avg_ms: avg_ms, count: durations.size }
  end
end

#to_h ⇒ Object



400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
# File 'lib/qualspec/suite/runner.rb', line 400

# Flattens the whole run into a single Hash, e.g. for serialization.
#
# Timestamps are emitted as ISO 8601 strings (finished_at stays nil until
# the run is finished). :costs, :evaluations and :responses reference the
# container's own objects rather than copies.
#
# @return [Hash] with keys :suite_name, :started_at, :finished_at, :summary,
#   :timing, :costs, :by_scenario, :by_scenario_variant, :evaluations,
#   :responses (in that order)
def to_h
  report = { suite_name: @suite_name }
  report[:started_at] = @started_at.iso8601
  report[:finished_at] = @finished_at&.iso8601

  report[:summary] = {
    by_candidate: scores_by_candidate,
    by_variant: scores_by_variant,
    by_temperature: scores_by_temperature
  }

  report[:timing] = timing_by_candidate
  report[:costs] = @costs
  report[:by_scenario] = scores_by_scenario
  report[:by_scenario_variant] = scores_by_scenario_variant
  report[:evaluations] = @evaluations
  report[:responses] = @responses
  report
end

#value_ranking ⇒ Object

Rank candidates by quality-per-dollar (avg score / total cost), best first. Candidates with zero recorded cost sort last. Raises a helpful error if cost tracking wasn't enabled for the run.



260
261
262
263
264
265
266
267
268
269
# File 'lib/qualspec/suite/runner.rb', line 260

# Ranks candidates by quality per dollar (average score / total recorded
# cost), best first. Candidates with no positive recorded cost have a nil
# :score_per_dollar and always come after every costed candidate.
#
# Runs ensure_cost_tracking! first (defined outside this view; per this
# page's description it raises when cost tracking was not enabled).
#
# @return [Hash] candidate => { avg_score:, cost:, score_per_dollar: } in
#   ranked order; :score_per_dollar is rounded to an Integer, or nil when
#   the candidate's cost is zero
def value_ranking
  ensure_cost_tracking!

  ranked = scores_by_candidate.map do |candidate, stats|
    cost = @costs[candidate].to_f
    score_per_dollar = cost.positive? ? (stats[:avg_score] / cost).round : nil
    [candidate, { avg_score: stats[:avg_score], cost: cost, score_per_dollar: score_per_dollar }]
  end

  ranked.sort_by do |_, row|
    if row[:cost].positive?
      # Rank on the unrounded ratio so a ratio that rounds to 0 still beats
      # uncosted candidates and near-equal ratios are not collapsed into ties.
      [0, -(row[:avg_score] / row[:cost])]
    else
      # Explicit "last" bucket instead of treating nil as a ratio of 0.
      [1, 0]
    end
  end.to_h
end