Class: SkillBench::Services::RunnerService
- Inherits:
-
Object
- Object
- SkillBench::Services::RunnerService
- Defined in:
- lib/skill_bench/services/runner_service.rb
Overview
Orchestrates the execution of an eval with baseline and context runs.
Class Method Summary collapse
-
.call(eval_name:, skill_names:) ⇒ Hash
Runs an eval with the given parameters.
Instance Method Summary collapse
-
#call ⇒ Hash
Executes the eval: resolves entities, runs baseline and context, evaluates.
-
#initialize(eval_name:, skill_names:) ⇒ RunnerService
constructor
A new instance of RunnerService.
Constructor Details
#initialize(eval_name:, skill_names:) ⇒ RunnerService
Returns a new instance of RunnerService.
37 38 39 40 |
# File 'lib/skill_bench/services/runner_service.rb', line 37
#
# Builds a runner for one eval and the skills to benchmark against it.
#
# @param eval_name [Object] identifier of the eval to run; stored as given.
# @param skill_names [Array, String, nil] skill identifiers. An Array is kept
#   as-is; a single value is wrapped in a one-element Array and nil becomes
#   an empty Array.
# @return [RunnerService] a new instance of RunnerService.
def initialize(eval_name:, skill_names:)
  @eval_name = eval_name
  # #call joins the skill names with ', ', so normalise to an Array up front
  # rather than failing with NoMethodError after the agent runs have finished.
  @skill_names = Array(skill_names)
end
Class Method Details
.call(eval_name:, skill_names:) ⇒ Hash
Runs an eval with the given parameters.
31 32 33 |
# File 'lib/skill_bench/services/runner_service.rb', line 31
#
# Convenience entry point: builds a runner from the given keywords and
# immediately invokes its instance-level #call.
#
# @param eval_name [Object] forwarded unchanged to the constructor.
# @param skill_names [Object] forwarded unchanged to the constructor.
# @return [Hash] whatever the instance-level #call returns.
def self.call(eval_name:, skill_names:)
  runner = new(eval_name: eval_name, skill_names: skill_names)
  runner.call
end
Instance Method Details
#call ⇒ Hash
Executes the eval: resolves entities, runs baseline and context, evaluates.
47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 |
# File 'lib/skill_bench/services/runner_service.rb', line 47
#
# Executes the eval: resolves the eval, skills and provider, spawns a baseline
# agent run and a skill-context agent run, hands both outputs to
# Evaluation::Runner, then records the result and computes the trend.
#
# Every failure is returned, not raised, by this method itself: the first
# step that reports a problem short-circuits and the matching *_error_result
# helper builds the return value.
#
# @return [Hash] on success a Hash with :success (true), :eval_name,
#   :skill_name (skill names joined with ", "), :provider_name and :response
#   (the evaluation response merged with :trend, :baseline_iterations and
#   :context_iterations); otherwise whatever the error helper returns.
def call
  eval_entity = resolve_eval
  skill_entities = resolve_skills
  provider = resolve_provider

  provider_config = resolve_provider_config(provider)
  unless provider_config[:success]
    return config_error_result(provider_config[:error], eval_entity, provider)
  end

  config = provider_config[:config]

  # Baseline pass: agent runs with the baseline system prompt only.
  baseline_run = spawn_agent(eval_entity, build_baseline_system_prompt, provider, config)
  if baseline_run[:status] == :error
    return agent_error_result(baseline_run, 'baseline', eval_entity, provider)
  end

  # A whitespace-only skill context is reported as an error before the
  # context pass is attempted.
  combined_context = load_combined_skill_context(skill_entities)
  return empty_context_error_result(eval_entity, provider) if combined_context.strip.empty?

  # Context pass: same eval, prompt built from the eval and the skills.
  context_run = spawn_agent(
    eval_entity,
    build_context_system_prompt(eval_entity, skill_entities),
    provider,
    config
  )
  if context_run[:status] == :error
    return agent_error_result(context_run, 'context', eval_entity, provider)
  end

  criteria = eval_entity.criteria
  judge_params = build_judge_params(provider, config)
  verdict = Evaluation::Runner.call(
    task: eval_entity.task,
    criteria: criteria,
    skill_context: combined_context,
    baseline_output: format_output(baseline_run),
    context_output: format_output(context_run),
    judge_params: judge_params
  )
  return enrich_error_result(verdict, eval_entity, provider) unless verdict[:success]

  trend_outcome = record_and_compute_trend(verdict)
  return enrich_error_result(trend_outcome, eval_entity, provider) unless trend_outcome[:success]

  {
    success: true,
    eval_name: eval_name,
    skill_name: skill_names.join(', '),
    provider_name: provider.name,
    response: verdict[:response].merge(
      trend: trend_outcome[:trend],
      # Runs that report no iterations are normalised to an empty list.
      baseline_iterations: baseline_run[:iterations] || [],
      context_iterations: context_run[:iterations] || []
    )
  }
end