Module: RubyLLM::Contract::Concerns::EvalHost

Includes:
ContextHelpers, ProductionModeContext
Included in:
Pipeline::Base, Step::Base
Defined in:
lib/ruby_llm/contract/concerns/eval_host.rb

Constant Summary collapse

SAMPLE_RESPONSE_COMPARE_WARNING =
"[ruby_llm-contract] compare_with ignores sample_response. " \
"Without model: or context: { adapter: ... }, both sides will be skipped " \
"and the A/B comparison is not meaningful.".freeze

Instance Method Summary collapse

Instance Method Details

#clear_file_sourced_evals! ⇒ Object



30
31
32
33
34
35
# File 'lib/ruby_llm/contract/concerns/eval_host.rb', line 30

# Removes every eval definition that was registered from a source file,
# then empties the bookkeeping set itself. Used on code reload so that
# file-sourced evals do not accumulate stale duplicates.
#
# No-op (returns nil) when the host has never tracked either collection.
def clear_file_sourced_evals!
  return unless defined?(@eval_definitions) && defined?(@file_sourced_evals)

  @file_sourced_evals.each do |eval_name|
    @eval_definitions.delete(eval_name)
  end
  @file_sourced_evals.clear
end

#compare_models(eval_name, models: [], candidates: [], context: {}, runs: 1, production_mode: nil) ⇒ Object

Raises:

  • (ArgumentError)


74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# File 'lib/ruby_llm/contract/concerns/eval_host.rb', line 74

# Runs the named eval once (or `runs:` times) per candidate model config and
# bundles the per-candidate reports into an Eval::ModelComparison.
#
# Exactly one of models:/candidates: may be non-empty; passing both raises.
# production_mode: supplies a fallback config and is rejected on pipelines.
def compare_models(eval_name, models: [], candidates: [], context: {}, runs: 1, production_mode: nil)
  raise ArgumentError, "Pass either models: or candidates:, not both" if models.any? && candidates.any?

  runs = coerce_runs(runs)

  context = safe_context(context)
  candidate_configs = normalize_candidates(models, candidates)
  reject_production_mode_on_pipeline!(production_mode)
  fallback_config = normalize_production_mode(production_mode)

  reports = {}
  configs = {}
  candidate_configs.each do |candidate|
    label = Eval::ModelComparison.candidate_label(candidate)
    candidate_ctx = build_candidate_context(context, candidate, fallback_config)
    run_reports = (1..runs).map { run_single_eval(eval_name, candidate_ctx) }
    # A single run is reported directly; multiple runs are aggregated.
    reports[label] =
      if runs == 1
        run_reports.first
      else
        Eval::AggregatedReport.new(run_reports)
      end
    configs[label] = candidate
  end

  Eval::ModelComparison.new(
    eval_name: eval_name, reports: reports, configs: configs, fallback: fallback_config
  )
end

#compare_with(other_step, eval:, model: nil, context: {}) ⇒ Object

Compare this step (candidate) with another step (baseline) using the baseline’s eval definition as single source of truth.

Requires a real adapter or model in context. sample_response is intentionally NOT used, because A/B testing with canned data gives identical results for both sides rather than a real comparison.

Raises:

  • (ArgumentError)


60
61
62
63
64
65
66
67
68
69
70
71
72
# File 'lib/ruby_llm/contract/concerns/eval_host.rb', line 60

# A/B compares this step (candidate) against other_step (baseline), using
# the baseline's eval definition as the single dataset for both sides.
#
# A real adapter or model must be present in context — sample_response is
# deliberately ignored here (canned data would make both sides identical).
#
# Raises ArgumentError when the baseline has no eval with the given name.
# Returns an Eval::PromptDiff of candidate vs baseline reports.
def compare_with(other_step, eval:, model: nil, context: {})
  ctx = comparison_context(context, model)
  defn = baseline_eval_definition(other_step, eval)
  raise ArgumentError, "No eval '#{eval}' on baseline step #{other_step}" unless defn

  dataset = defn.build_dataset
  warn_sample_response_compare(ctx, defn)

  # Each side runs against an isolated copy of the context so neither
  # run can leak state into the other.
  candidate_report = Eval::Runner.run(step: self, dataset: dataset, context: isolate_context(ctx))
  baseline_report = Eval::Runner.run(step: other_step, dataset: dataset, context: isolate_context(ctx))

  Eval::PromptDiff.new(candidate: candidate_report, baseline: baseline_report)
end

#define_eval(name) ⇒ Object



14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# File 'lib/ruby_llm/contract/concerns/eval_host.rb', line 14

# Registers (or replaces) an eval definition under the given name on this
# host class. Warns on redefinition unless a code reload is in progress;
# reload-time definitions are additionally tracked as file-sourced so
# clear_file_sourced_evals! can remove them later.
def define_eval(name, &)
  @eval_definitions ||= {}
  @file_sourced_evals ||= Set.new
  key = name.to_s
  reloading = Thread.current[:ruby_llm_contract_reloading]

  if !reloading && @eval_definitions.key?(key)
    warn "[ruby_llm-contract] Redefining eval '#{key}' on #{self}. " \
         "This replaces the previous definition."
  end

  @eval_definitions[key] = Eval::EvalDefinition.new(key, step_class: self, &)
  @file_sourced_evals.add(key) if reloading
  Contract.register_eval_host(self)
  register_subclasses(self)
end

#eval_defined? ⇒ Boolean

Returns:

  • (Boolean)


41
42
43
# File 'lib/ruby_llm/contract/concerns/eval_host.rb', line 41

# Whether this host (including inherited definitions) has at least one eval.
def eval_defined?
  all_eval_definitions.any?
end

#eval_names ⇒ Object



37
38
39
# File 'lib/ruby_llm/contract/concerns/eval_host.rb', line 37

# Names of every eval definition visible on this host.
def eval_names
  all_eval_definitions.each_key.to_a
end

#run_eval(name = nil, context: {}, concurrency: nil) ⇒ Object



45
46
47
48
49
50
51
52
# File 'lib/ruby_llm/contract/concerns/eval_host.rb', line 45

# Runs a single named eval, or every eval owned by this host when no name
# is given. The context is sanitized before being handed to the runner.
def run_eval(name = nil, context: {}, concurrency: nil)
  ctx = safe_context(context)
  return run_single_eval(name, ctx, concurrency: concurrency) if name

  run_all_own_evals(ctx, concurrency: concurrency)
end