Module: RubyLLM::Contract::Concerns::EvalHost

Includes:
ContextHelpers
Included in:
Pipeline::Base, Step::Base
Defined in:
lib/ruby_llm/contract/concerns/eval_host.rb

Constant Summary collapse

SAMPLE_RESPONSE_COMPARE_WARNING =
"[ruby_llm-contract] compare_with ignores sample_response. " \
"Without model: or context: { adapter: ... }, both sides will be skipped " \
"and the A/B comparison is not meaningful.".freeze

Instance Method Summary collapse

Instance Method Details

#clear_file_sourced_evals! ⇒ Object



29
30
31
32
33
34
# File 'lib/ruby_llm/contract/concerns/eval_host.rb', line 29

# Drops every eval definition that was registered while a file reload
# was in progress, then forgets the reload bookkeeping. Safe no-op when
# neither ivar has been initialized yet (no eval was ever defined).
def clear_file_sourced_evals!
  return unless defined?(@file_sourced_evals) && defined?(@eval_definitions)

  @file_sourced_evals.to_a.each do |eval_key|
    @eval_definitions.delete(eval_key)
  end
  @file_sourced_evals.clear
end

#compare_models(eval_name, models: [], candidates: [], context: {}) ⇒ Object

Raises:

  • (ArgumentError)


73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# File 'lib/ruby_llm/contract/concerns/eval_host.rb', line 73

# Runs the same eval once per candidate model configuration and bundles
# the per-model reports into an Eval::ModelComparison.
#
# Exactly one of models:/candidates: may be supplied; passing both is an
# ArgumentError. Each candidate runs in an isolated copy of the context
# with its model (and optional reasoning_effort) merged in.
def compare_models(eval_name, models: [], candidates: [], context: {})
  raise ArgumentError, "Pass either models: or candidates:, not both" if models.any? && candidates.any?

  base_context = safe_context(context)

  # One [label, config, report] triple per candidate, evaluated in order.
  results = normalize_candidates(models, candidates).map do |config|
    label = Eval::ModelComparison.candidate_label(config)
    per_model = isolate_context(base_context).merge(model: config[:model])
    per_model[:reasoning_effort] = config[:reasoning_effort] if config[:reasoning_effort]
    [label, config, run_single_eval(eval_name, per_model)]
  end

  Eval::ModelComparison.new(
    eval_name: eval_name,
    reports: results.to_h { |label, _config, report| [label, report] },
    configs: results.to_h { |label, config, _report| [label, config] }
  )
end

#compare_with(other_step, eval:, model: nil, context: {}) ⇒ Object

Compare this step (candidate) with another step (baseline) using the baseline’s eval definition as single source of truth.

Requires a real adapter or model in context. sample_response is intentionally NOT used, because A/B testing with canned data gives identical results for both sides rather than a real comparison.

Raises:

  • (ArgumentError)


59
60
61
62
63
64
65
66
67
68
69
70
71
# File 'lib/ruby_llm/contract/concerns/eval_host.rb', line 59

# A/B compares this step (candidate) against +other_step+ (baseline),
# using the baseline's eval definition as the single source of truth for
# the dataset. Both sides run over the same dataset in isolated copies
# of the comparison context; the result is an Eval::PromptDiff.
#
# Raises ArgumentError when the baseline step has no eval named +eval+.
def compare_with(other_step, eval:, model: nil, context: {})
  ctx = comparison_context(context, model)
  defn = baseline_eval_definition(other_step, eval)
  raise ArgumentError, "No eval '#{eval}' on baseline step #{other_step}" unless defn

  dataset = defn.build_dataset
  # sample_response is intentionally ignored here; warn when that makes
  # the comparison meaningless (see SAMPLE_RESPONSE_COMPARE_WARNING).
  warn_sample_response_compare(ctx, defn)

  candidate_report = Eval::Runner.run(step: self, dataset: dataset, context: isolate_context(ctx))
  baseline_report = Eval::Runner.run(step: other_step, dataset: dataset, context: isolate_context(ctx))

  Eval::PromptDiff.new(candidate: candidate_report, baseline: baseline_report)
end

#define_eval(name, &block) ⇒ Object



13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
# File 'lib/ruby_llm/contract/concerns/eval_host.rb', line 13

# Registers (or replaces) an eval definition under +name+ on this host.
#
# The block is captured into an Eval::EvalDefinition keyed by the
# stringified name. Replacing an existing definition outside of a
# hot reload (tracked via the :ruby_llm_contract_reloading thread-local)
# emits a warning. During a reload the key is additionally remembered in
# @file_sourced_evals so clear_file_sourced_evals! can drop it later.
def define_eval(name, &block)
  @eval_definitions ||= {}
  @file_sourced_evals ||= Set.new
  key = name.to_s

  if @eval_definitions.key?(key) && !Thread.current[:ruby_llm_contract_reloading]
    warn("[ruby_llm-contract] Redefining eval '#{key}' on #{self}. This replaces the previous definition.")
  end

  @eval_definitions[key] = Eval::EvalDefinition.new(key, step_class: self, &block)
  @file_sourced_evals.add(key) if Thread.current[:ruby_llm_contract_reloading]
  Contract.register_eval_host(self)
  register_subclasses(self)
end

#eval_defined? ⇒ Boolean

Returns:

  • (Boolean)


40
41
42
# File 'lib/ruby_llm/contract/concerns/eval_host.rb', line 40

# True when at least one eval definition is visible on this host.
def eval_defined?
  all_eval_definitions.any?
end

#eval_names ⇒ Object



36
37
38
# File 'lib/ruby_llm/contract/concerns/eval_host.rb', line 36

# Names (string keys) of every eval definition visible on this host.
def eval_names
  all_eval_definitions.each_key.to_a
end

#run_eval(name = nil, context: {}, concurrency: nil) ⇒ Object



44
45
46
47
48
49
50
51
# File 'lib/ruby_llm/contract/concerns/eval_host.rb', line 44

# Runs one named eval, or — when +name+ is nil — every eval owned
# directly by this host. The context is sanitized once up front and
# +concurrency+ is forwarded untouched to the underlying runner.
def run_eval(name = nil, context: {}, concurrency: nil)
  ctx = safe_context(context)
  return run_single_eval(name, ctx, concurrency: concurrency) if name

  run_all_own_evals(ctx, concurrency: concurrency)
end