Class: Qualspec::Suite::Results

Inherits:

Object

Object
Qualspec::Suite::Results

show all

Defined in: lib/qualspec/suite/runner.rb

Overview

Results container with multi-dimensional support

Instance Attribute Summary collapse

#candidate_models ⇒ Object readonly
Returns the value of attribute candidate_models.
#costs ⇒ Object readonly
Returns the value of attribute costs.
#evaluations ⇒ Object readonly
Returns the value of attribute evaluations.
#finished_at ⇒ Object readonly
Returns the value of attribute finished_at.
#metadata_captured ⇒ Object
Returns the value of attribute metadata_captured.
#prompts ⇒ Object readonly
Returns the value of attribute prompts.
#responses ⇒ Object readonly
Returns the value of attribute responses.
#started_at ⇒ Object readonly
Returns the value of attribute started_at.
#suite_name ⇒ Object readonly
Returns the value of attribute suite_name.
#timing ⇒ Object readonly
Returns the value of attribute timing.

Instance Method Summary collapse

#cost_by_candidate ⇒ Object
Total cost per candidate.
#costs_tracked? ⇒ Boolean
Whether per-call cost/token metadata was captured this run.
#finish! ⇒ Object
#initialize(suite_name) ⇒ Results constructor
A new instance of Results.
#record_evaluation(candidate:, scenario:, criteria:, evaluation:, variant: 'default', temperature: nil, winner: nil) ⇒ Object
#record_response(candidate:, scenario:, response:, variant: 'default', temperature: nil, duration_ms: nil, cost: nil, variant_data: nil) ⇒ Object
#scores_by_candidate ⇒ Object
Group scores by candidate, aggregating across all variants.
#scores_by_scenario ⇒ Object
Detailed breakdown by scenario, then by candidate.
#scores_by_scenario_variant ⇒ Object
Cross-tabulation: scenario × variant.
#scores_by_temperature ⇒ Object
Temperature sensitivity analysis.
#scores_by_variant ⇒ Object
Group scores by variant.
#timing_by_candidate ⇒ Object
#to_h ⇒ Object
#value_ranking ⇒ Object
Rank candidates by quality-per-dollar (avg score / total cost), best first.

Constructor Details

#initialize(suite_name) ⇒ `Results`

Returns a new instance of Results.

# File 'lib/qualspec/suite/runner.rb', line 233

# Builds an empty results container for one suite run and stamps its start time.
#
# @param suite_name [Object] identifier of the suite; stored unchanged
def initialize(suite_name)
  # Run identity and wall-clock bookkeeping.
  @suite_name = suite_name
  @started_at = Time.now
  @finished_at = nil

  # Raw data accumulated while the suite executes.
  @evaluations = []
  @responses = {} # Nested: {candidate => {scenario => {variant => {temp => response}}}}
  @timing = {}
  @costs = {}

  # Lookup tables.
  @candidate_models = {} # {candidate_name => model_string}
  @prompts = {}          # {scenario_name => prompt_string}

  # Flipped to true when the suite enables track_cost.
  @metadata_captured = false
end

Instance Attribute Details

#candidate_models ⇒ `Object` (readonly)

Returns the value of attribute candidate_models.



229
230
231

# File 'lib/qualspec/suite/runner.rb', line 229

# Reader for the candidate lookup table. The constructor initialises it to an
# empty Hash annotated as {candidate_name => model_string}.
#
# @return [Hash] the internal hash itself (not a copy)
def candidate_models
  @candidate_models
end

#costs ⇒ `Object` (readonly)

Returns the value of attribute costs.



229
230
231

# File 'lib/qualspec/suite/runner.rb', line 229

# Reader for the running cost totals keyed by candidate, as accumulated by
# #record_response.
#
# @return [Hash] the internal hash itself (not a copy)
def costs
  @costs
end

#evaluations ⇒ `Object` (readonly)

Returns the value of attribute evaluations.



229
230
231

# File 'lib/qualspec/suite/runner.rb', line 229

# Reader for the flat list of evaluation records appended by
# #record_evaluation (one Hash per recorded evaluation).
#
# @return [Array<Hash>] the internal array itself (not a copy)
def evaluations
  @evaluations
end

#finished_at ⇒ `Object` (readonly)

Returns the value of attribute finished_at.



229
230
231

# File 'lib/qualspec/suite/runner.rb', line 229

# Reader for the completion timestamp.
#
# @return [Time, nil] nil until #finish! has been called
def finished_at
  @finished_at
end

#metadata_captured ⇒ `Object`

Returns the value of attribute metadata_captured.



231
232
233

# File 'lib/qualspec/suite/runner.rb', line 231

# Reader for the cost-tracking flag. The constructor sets it to false and
# notes that it is set true when the suite enables track_cost; the same
# value is exposed by #costs_tracked?.
#
# @return [Object] the stored flag, unchanged
def metadata_captured
  @metadata_captured
end

#prompts ⇒ `Object` (readonly)

Returns the value of attribute prompts.



229
230
231

# File 'lib/qualspec/suite/runner.rb', line 229

# Reader for the prompt lookup table. The constructor initialises it to an
# empty Hash annotated as {scenario_name => prompt_string}.
#
# @return [Hash] the internal hash itself (not a copy)
def prompts
  @prompts
end

#responses ⇒ `Object` (readonly)

Returns the value of attribute responses.



229
230
231

# File 'lib/qualspec/suite/runner.rb', line 229

# Reader for the nested response store filled by #record_response:
# {candidate => {scenario => {variant => {temperature => {content:, variant_data:}}}}}.
#
# @return [Hash] the internal hash itself (not a copy)
def responses
  @responses
end

#started_at ⇒ `Object` (readonly)

Returns the value of attribute started_at.



229
230
231

# File 'lib/qualspec/suite/runner.rb', line 229

# Reader for the timestamp captured with Time.now when the container was
# constructed.
#
# @return [Time]
def started_at
  @started_at
end

#suite_name ⇒ `Object` (readonly)

Returns the value of attribute suite_name.



229
230
231

# File 'lib/qualspec/suite/runner.rb', line 229

# Reader for the suite identifier passed to the constructor.
#
# @return [Object] the value given to #initialize, unchanged
def suite_name
  @suite_name
end

#timing ⇒ `Object` (readonly)

Returns the value of attribute timing.



229
230
231

# File 'lib/qualspec/suite/runner.rb', line 229

# Reader for the raw durations recorded by #record_response:
# {candidate => {"scenario/variant" => duration_ms}}.
#
# @return [Hash] the internal hash itself (not a copy)
def timing
  @timing
end

Instance Method Details

#cost_by_candidate ⇒ `Object`

Total cost per candidate. Raises if cost tracking wasn't enabled.

# File 'lib/qualspec/suite/runner.rb', line 252

# Total cost per candidate.
#
# @return [Hash] a shallow copy (dup) of the per-candidate totals, so callers
#   cannot add or remove keys on the internal hash
def cost_by_candidate
  # Defined outside this view; per the method's documentation it raises when
  # cost tracking wasn't enabled for the run.
  ensure_cost_tracking!
  @costs.dup
end

#costs_tracked? ⇒ `Boolean`

Whether per-call cost/token metadata was captured this run.

Returns:

(Boolean)



247
248
249

# File 'lib/qualspec/suite/runner.rb', line 247

# Whether per-call cost/token metadata was captured this run.
#
# @return [Boolean] the stored metadata_captured flag, returned as-is
#   (false unless something has assigned it since construction)
def costs_tracked?
  @metadata_captured
end

#finish! ⇒ `Object`



308
309
310

# File 'lib/qualspec/suite/runner.rb', line 308

# Marks the run as finished by stamping the completion time with Time.now.
# Calling it again overwrites the previous stamp.
#
# @return [Time] the timestamp just stored
def finish!
  @finished_at = Time.now
end

#record_evaluation(candidate:, scenario:, criteria:, evaluation:, variant: 'default', temperature: nil, winner: nil) ⇒ `Object`

# File 'lib/qualspec/suite/runner.rb', line 292

# Flattens one judged result into a Hash and appends it to the evaluations list.
#
# @param candidate [Object] candidate identifier
# @param scenario [Object] scenario identifier
# @param criteria [Object] a single criterion or a list; counted via Array()
# @param evaluation [#score, #pass?, #reasoning, #error] result object to flatten
# @param variant [Object] variant key (defaults to 'default')
# @param temperature [Object, nil] sampling temperature, stored unchanged
# @param winner [Object, nil] stored unchanged
# @return [Array<Hash>] the evaluations list after the append
def record_evaluation(candidate:, scenario:, criteria:, evaluation:, variant: 'default', temperature: nil, winner: nil)
  # Dimensions that identify where this evaluation belongs.
  dimensions = { candidate: candidate, scenario: scenario, variant: variant, temperature: temperature }

  # What was judged and how it came out.
  verdict = {
    criteria: criteria,
    criteria_count: Array(criteria).size,
    score: evaluation.score,
    pass: evaluation.pass?,
    reasoning: evaluation.reasoning,
    error: evaluation.error,
    winner: winner
  }

  @evaluations.push(dimensions.merge(verdict))
end

#record_response(candidate:, scenario:, response:, variant: 'default', temperature: nil, duration_ms: nil, cost: nil, variant_data: nil) ⇒ `Object`

# File 'lib/qualspec/suite/runner.rb', line 271

# Stores one candidate response and, when supplied, its duration and cost.
#
# @param candidate [Object] candidate identifier
# @param scenario [Object] scenario identifier
# @param response [Object] response payload, stored under :content
# @param variant [Object] variant key (defaults to 'default')
# @param temperature [Object, nil] innermost key of the response store
# @param duration_ms [Numeric, nil] recorded only when truthy
# @param cost [Numeric, nil] accumulated only when strictly positive
# @param variant_data [Object, nil] stored alongside the content
# @return [Numeric, nil] the candidate's new running cost, or nil when no
#   cost was added
def record_response(candidate:, scenario:, response:, variant: 'default', temperature: nil, duration_ms: nil, cost: nil, variant_data: nil)
  # Walk (creating levels as needed) candidate -> scenario -> variant, then
  # file the payload under its temperature.
  by_scenario = (@responses[candidate] ||= {})
  by_variant = (by_scenario[scenario] ||= {})
  by_temperature = (by_variant[variant] ||= {})
  by_temperature[temperature] = { content: response, variant_data: variant_data }

  # NOTE(review): the timing key omits temperature, so a later call for the
  # same scenario/variant at another temperature replaces the earlier
  # duration. Confirm whether that is intended.
  (@timing[candidate] ||= {})["#{scenario}/#{variant}"] = duration_ms if duration_ms

  # nil, zero and negative costs leave the running total untouched.
  @costs[candidate] = (@costs[candidate] || 0.0) + cost if cost&.positive?
end

#scores_by_candidate ⇒ `Object`

Group scores by candidate, aggregating across all variants

# File 'lib/qualspec/suite/runner.rb', line 313

# Group scores by candidate, aggregating across all variants.
#
# @return [Hash] {candidate => {passed:, total:, pass_rate:, avg_score:}},
#   with pass_rate as a percentage rounded to one decimal and avg_score
#   rounded to two decimals
def scores_by_candidate
  grouped = @evaluations.group_by { |entry| entry[:candidate] }

  grouped.transform_values do |rows|
    count = rows.size
    wins = rows.count { |row| row[:pass] }
    mean = count.zero? ? 0 : rows.sum { |row| row[:score] }.to_f / count
    rate = count.zero? ? 0 : (wins.to_f / count * 100).round(1)

    { passed: wins, total: count, pass_rate: rate, avg_score: mean.round(2) }
  end
end

#scores_by_scenario ⇒ `Object`

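Detailed breakdown by scenario, then by candidate. Scores are averaged over each candidate's evaluations for the scenario; reasoning, variant and temperature are taken from the first of them.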

# File 'lib/qualspec/suite/runner.rb', line 368

# Per-scenario breakdown, one entry per candidate within each scenario.
#
# @return [Hash] {scenario => {candidate => {score:, pass:, reasoning:,
#   variant:, temperature:}}}
def scores_by_scenario
  per_scenario = @evaluations.group_by { |entry| entry[:scenario] }

  per_scenario.transform_values do |scenario_rows|
    scenario_rows.group_by { |entry| entry[:candidate] }.transform_values do |rows|
      # Descriptive fields come from the candidate's first evaluation only.
      lead = rows.first
      mean = rows.sum { |row| row[:score] }.to_f / rows.size

      {
        score: mean.round(2),
        # Passes only if no evaluation in the group failed.
        pass: rows.none? { |row| !row[:pass] },
        reasoning: lead[:reasoning],
        variant: lead[:variant],
        temperature: lead[:temperature]
      }
    end
  end
end

#scores_by_scenario_variant ⇒ `Object`

Cross-tabulation: scenario × variant

# File 'lib/qualspec/suite/runner.rb', line 386

# Cross-tabulation keyed by [scenario, variant], then by candidate.
#
# @return [Hash] {[scenario, variant] => {candidate => {score:, pass:,
#   reasoning:, temperature:}}}
def scores_by_scenario_variant
  cells = @evaluations.group_by { |entry| entry.values_at(:scenario, :variant) }

  cells.transform_values do |cell_rows|
    cell_rows.group_by { |entry| entry[:candidate] }.transform_values do |rows|
      # NOTE(review): only the first evaluation per candidate is reported;
      # any later ones in the same cell (e.g. other temperatures) are not
      # aggregated. Confirm that is intended.
      score, pass, reasoning, temperature = rows.first.values_at(:score, :pass, :reasoning, :temperature)

      { score: score, pass: pass, reasoning: reasoning, temperature: temperature }
    end
  end
end

#scores_by_temperature ⇒ `Object`

Temperature sensitivity analysis

# File 'lib/qualspec/suite/runner.rb', line 345

# Temperature sensitivity analysis: average score and pass rate for every
# temperature value seen in the recorded evaluations (nil included).
#
# @return [Hash] {temperature => {avg_score:, pass_rate:}}
def scores_by_temperature
  @evaluations.group_by { |entry| entry[:temperature] }.each_with_object({}) do |(temperature, rows), report|
    sample_size = rows.size
    score_total = rows.sum { |row| row[:score] }
    pass_total = rows.count { |row| row[:pass] }

    report[temperature] = {
      avg_score: (score_total.to_f / sample_size).round(2),
      pass_rate: (pass_total.to_f / sample_size * 100).round(1)
    }
  end
end

#scores_by_variant ⇒ `Object`

Group scores by variant

# File 'lib/qualspec/suite/runner.rb', line 329

# Group scores by variant, pooling every candidate and scenario.
#
# @return [Hash] {variant => {passed:, total:, pass_rate:, avg_score:}},
#   with pass_rate as a percentage rounded to one decimal and avg_score
#   rounded to two decimals
def scores_by_variant
  @evaluations.group_by { |entry| entry[:variant] }.each_with_object({}) do |(variant, rows), summary|
    total = rows.size
    passed = rows.count { |row| row[:pass] }

    if total.positive?
      avg_score = rows.sum { |row| row[:score] }.to_f / total
      pass_rate = (passed.to_f / total * 100).round(1)
    else
      avg_score = 0
      pass_rate = 0
    end

    summary[variant] = { passed: passed, total: total, pass_rate: pass_rate, avg_score: avg_score.round(2) }
  end
end

#timing_by_candidate ⇒ `Object`

# File 'lib/qualspec/suite/runner.rb', line 355

# Summarises the recorded response durations per candidate.
#
# @return [Hash] {candidate => {total_ms:, avg_ms:, count:}} where total_ms
#   is the sum of the recorded durations, avg_ms is the mean rounded to the
#   nearest whole millisecond, and count is the number of recorded entries
def timing_by_candidate
  @timing.transform_values do |scenarios|
    total_ms = scenarios.values.sum
    # Divide as a Float: with Integer durations, plain `/` truncates, so the
    # trailing `round` never rounded up (1 ms and 2 ms averaged to 1, not 2).
    avg_ms = scenarios.empty? ? 0 : total_ms.to_f / scenarios.size
    {
      total_ms: total_ms,
      avg_ms: avg_ms.round,
      count: scenarios.size
    }
  end
end

#to_h ⇒ `Object`

# File 'lib/qualspec/suite/runner.rb', line 400

# Serialises the whole run into a plain Hash: run identity and ISO 8601
# timestamps, aggregate summaries, timing, costs, per-scenario breakdowns,
# and the raw evaluations and responses.
#
# @return [Hash] costs, evaluations and responses are the internal objects
#   themselves, not copies
def to_h
  started = @started_at.iso8601
  finished = @finished_at&.iso8601 # nil while the run is still open

  summary = {
    by_candidate: scores_by_candidate,
    by_variant: scores_by_variant,
    by_temperature: scores_by_temperature
  }

  report = { suite_name: @suite_name, started_at: started, finished_at: finished, summary: summary }
  report[:timing] = timing_by_candidate
  report[:costs] = @costs
  report[:by_scenario] = scores_by_scenario
  report[:by_scenario_variant] = scores_by_scenario_variant
  report[:evaluations] = @evaluations
  report[:responses] = @responses
  report
end

#value_ranking ⇒ `Object`

Rank candidates by quality-per-dollar (avg score / total cost), best first. Candidates with zero recorded cost sort last. Raises a helpful error if cost tracking wasn't enabled for the run.

# File 'lib/qualspec/suite/runner.rb', line 260

# Rank candidates by quality-per-dollar (avg score / total cost), best first.
# Candidates with zero recorded cost sort last.
#
# ensure_cost_tracking! is defined outside this view; per this method's
# documentation it raises a helpful error when cost tracking wasn't enabled.
#
# @return [Hash] {candidate => {avg_score:, cost:, score_per_dollar:}} in
#   ranked order; score_per_dollar is rounded to an Integer, or nil when the
#   candidate has no positive recorded cost
def value_ranking
  ensure_cost_tracking!

  ranked = scores_by_candidate.map do |candidate, stats|
    cost = @costs[candidate].to_f # nil (never recorded) becomes 0.0
    score_per_dollar = cost.positive? ? (stats[:avg_score] / cost).round : nil
    [candidate, { avg_score: stats[:avg_score], cost: cost, score_per_dollar: score_per_dollar }]
  end

  # Two-part key: costed candidates (0) come before uncosted ones (1), then
  # descending score-per-dollar. Mapping nil to 0 alone would tie an uncosted
  # candidate with a costed one whose ratio rounds to 0, and sort_by is not
  # stable, so the uncosted one could rank higher.
  ranked.sort_by { |_, v| [v[:score_per_dollar] ? 0 : 1, -(v[:score_per_dollar] || 0)] }.to_h
end

Class: Qualspec::Suite::Results

Overview

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(suite_name) ⇒ Results

Instance Attribute Details

#candidate_models ⇒ Object (readonly)

#costs ⇒ Object (readonly)

#evaluations ⇒ Object (readonly)

#finished_at ⇒ Object (readonly)

#metadata_captured ⇒ Object

#prompts ⇒ Object (readonly)

#responses ⇒ Object (readonly)

#started_at ⇒ Object (readonly)

#suite_name ⇒ Object (readonly)

#timing ⇒ Object (readonly)

Instance Method Details

#cost_by_candidate ⇒ Object

#costs_tracked? ⇒ Boolean

#finish! ⇒ Object

#record_evaluation(candidate:, scenario:, criteria:, evaluation:, variant: 'default', temperature: nil, winner: nil) ⇒ Object

#record_response(candidate:, scenario:, response:, variant: 'default', temperature: nil, duration_ms: nil, cost: nil, variant_data: nil) ⇒ Object

#scores_by_candidate ⇒ Object

#scores_by_scenario ⇒ Object

#scores_by_scenario_variant ⇒ Object

#scores_by_temperature ⇒ Object

#scores_by_variant ⇒ Object

#timing_by_candidate ⇒ Object

#to_h ⇒ Object

#value_ranking ⇒ Object