Class: Riffer::Evals::RunResult

Inherits:
Object
  • Object
show all
Defined in:
lib/riffer/evals/run_result.rb

Overview

Represents the complete result of an evaluation run.

Contains all individual results and provides aggregate metrics.

# Build a run result from the evaluated input/output pair, the individual
# results, and the metrics they are judged against. results[i] is checked
# against metrics[i], so both arrays must be in the same order.
run_result = Riffer::Evals::RunResult.new(
  input: "question",
  output: "answer",
  context: {},
  results: [result1, result2],
  metrics: [metric1, metric2]
)

run_result.passed?          # => true/false
run_result.aggregate_score  # => 0.87
run_result.failures         # => [result1] (results that failed thresholds)

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(input:, output:, context:, results:, metrics:) ⇒ RunResult

Initializes a new run result.

: (input: String, output: String, context: Hash[Symbol, untyped]?, results: Array, metrics: Array) -> void



39
40
41
42
43
44
45
# File 'lib/riffer/evals/run_result.rb', line 39

# Stores the evaluated input/output pair, the evaluation context, and the
# individual results together with the metrics they are judged against.
#
# Every argument is assigned as given: nothing is validated, copied, or
# frozen here. Other methods of this class pair results[i] with metrics[i],
# so both arrays should be in the same order.
#
#: (input: String, output: String, context: Hash[Symbol, untyped]?, results: Array, metrics: Array) -> void
def initialize(input:, output:, context:, results:, metrics:)
  @input = input
  @output = output
  @context = context
  @results = results
  @metrics = metrics
end

Instance Attribute Details

#context ⇒ Object (readonly)

The context used during evaluation.



28
29
30
# File 'lib/riffer/evals/run_result.rb', line 28

# Reader for the evaluation context. Returns the object that was passed as
# context: to the constructor, unchanged (no copy is made).
def context
  @context
end

#input ⇒ Object (readonly)

The input that was evaluated.



22
23
24
# File 'lib/riffer/evals/run_result.rb', line 22

# Reader for the evaluated input. Returns the object that was passed as
# input: to the constructor, unchanged (no copy is made).
def input
  @input
end

#metrics ⇒ Object (readonly)

The metrics that were evaluated.



34
35
36
# File 'lib/riffer/evals/run_result.rb', line 34

# Reader for the metrics. Returns the object that was passed as metrics: to
# the constructor, unchanged (no copy is made). Other methods of this class
# look metrics up by position to match them with the result at the same index.
def metrics
  @metrics
end

#output ⇒ Object (readonly)

The output that was evaluated.



25
26
27
# File 'lib/riffer/evals/run_result.rb', line 25

# Reader for the evaluated output. Returns the object that was passed as
# output: to the constructor, unchanged (no copy is made).
def output
  @output
end

#results ⇒ Object (readonly)

Individual evaluation results.



31
32
33
# File 'lib/riffer/evals/run_result.rb', line 31

# Reader for the individual evaluation results. Returns the object that was
# passed as results: to the constructor, unchanged (no copy is made).
def results
  @results
end

Instance Method Details

#aggregate_score ⇒ Object

Calculates the weighted aggregate score.

Scores are normalized so that higher is always better for aggregation. For evaluators where lower is better (e.g., toxicity), the score is inverted (1 - score) before aggregation.

: () -> Float



71
72
73
74
75
76
77
78
79
80
81
82
83
84
# File 'lib/riffer/evals/run_result.rb', line 71

# Calculates the weighted aggregate score across all results.
#
# Each result is paired with the metric at the same position. Scores are
# normalized so that higher is always better: when a result's
# higher_is_better is false, its score is inverted (1.0 - score) before it
# is multiplied by the metric's weight.
#
# Returns 0.0 when there are no results or when the metric weights sum to
# zero; otherwise returns the weighted sum divided by the total weight,
# always as a Float.
#
#: () -> Float
def aggregate_score
  return 0.0 if results.empty?

  total_weight = metrics.sum(&:weight)
  return 0.0 if total_weight.zero?

  weighted_sum = results.zip(metrics).sum do |result, metric|
    # Normalize score: for higher_is_better=false, invert so higher is better
    normalized_score = result.higher_is_better ? result.score : (1.0 - result.score)
    normalized_score * metric.weight
  end

  # Force float division: when scores and weights are all Integers the sum
  # is an Integer too, and a plain `/` would truncate (e.g. 1 / 2 => 0).
  weighted_sum.to_f / total_weight
end

#failures ⇒ Object

Returns results that failed their metric thresholds.

: () -> Array



57
58
59
60
61
62
# File 'lib/riffer/evals/run_result.rb', line 57

# Returns the results whose metric (the one at the same position in
# #metrics) reports that the result does not pass.
#
# The list is computed on the first call and cached in @failures; later
# calls return the cached array without re-checking the metrics.
#
#: () -> Array
def failures
  return @failures if @failures

  # Dropping every result that passes leaves exactly the failing ones.
  @failures = results.reject.with_index do |outcome, position|
    metrics[position].passes?(outcome)
  end
end

#passed? ⇒ Boolean

Checks if all metrics passed their thresholds.

: () -> bool

Returns:

  • (Boolean)


50
51
52
# File 'lib/riffer/evals/run_result.rb', line 50

# True when #failures is empty, i.e. no result failed its metric's check.
# A run with no results has no failures and therefore also counts as passed.
#
#: () -> bool
def passed?
  failures.empty?
end

#to_h ⇒ Object

Returns a hash representation of the run result.

: () -> Hash[Symbol, untyped]



89
90
91
92
93
94
95
96
97
98
# File 'lib/riffer/evals/run_result.rb', line 89

# Returns a Hash snapshot of the run: the raw input, output and context,
# each result converted with its own #to_h, plus the computed pass flag and
# aggregate score. The metrics themselves are not included.
#
#: () -> Hash[Symbol, untyped]
def to_h
  summary = { input: input, output: output, context: context }
  summary[:results] = results.map(&:to_h)
  summary[:passed] = passed?
  summary[:aggregate_score] = aggregate_score
  summary
end