Class: Qualspec::Judge

Inherits: Object
Defined in:
lib/qualspec/judge.rb

Constant Summary collapse

# System prompt for judging a single response against criteria.
# Instructs the model to return strict JSON: {"score": 0-10, "reasoning": "..."}.
# Frozen: heredocs are not frozen by the frozen_string_literal magic comment,
# so an explicit .freeze prevents accidental mutation of this shared constant.
DEFAULT_SYSTEM_PROMPT = <<~PROMPT.freeze
  You are an evaluation judge. You will be given a response and one or more evaluation criteria.
  Your job is to score how well the response meets the criteria.

  Scoring:
  - 0: Completely fails to meet the criteria
  - 1-3: Mostly fails, with minor positive elements
  - 4-6: Partially meets criteria, significant room for improvement
  - 7-8: Mostly meets criteria with minor issues
  - 9: Meets criteria well
  - 10: Perfectly meets all criteria

  Be strict but fair. Consider each criterion carefully.

  You MUST respond with valid JSON in this exact format:
  {"score": <0-10>, "reasoning": "Brief explanation of the score"}

  Your reasoning should be concise (1-2 sentences max).
PROMPT
# System prompt for comparative judging of multiple candidate responses.
# Instructs the model to score every candidate, echo the exact candidate
# names, and declare a "winner" (or "tie"). Frozen for the same reason as
# DEFAULT_SYSTEM_PROMPT: heredocs are mutable without an explicit .freeze.
COMPARISON_SYSTEM_PROMPT = <<~PROMPT.freeze
  You are an evaluation judge comparing multiple AI responses to the same prompt.
  Score each response on how well it meets the criteria.

  Scoring (0-10):
  - 0: Completely fails
  - 1-3: Mostly fails
  - 4-6: Partially meets criteria
  - 7-8: Mostly meets criteria
  - 9-10: Excellent

  Be strict but fair. Compare responses relative to each other.

  IMPORTANT: Use the EXACT candidate names as given in the prompt.

  You MUST respond with valid JSON with scores for each candidate AND declare a winner.
  Example format (use actual names from prompt, not these placeholders):
  {
    "actual-name-1": {"score": 8, "reasoning": "..."},
    "actual-name-2": {"score": 6, "reasoning": "..."},
    "winner": "actual-name-1"
  }

  Use "winner": "tie" if scores are equal or too close to call.
PROMPT
# Minimum score (out of 10) a response must reach to count as passing;
# used as the fallback when no pass_threshold is given (see #initialize).
DEFAULT_PASS_THRESHOLD =
7

Instance Method Summary collapse

Constructor Details

#initialize(client: nil, model: nil, system_prompt: nil, pass_threshold: nil) ⇒ Judge

Returns a new instance of Judge.



53
54
55
56
57
58
# File 'lib/qualspec/judge.rb', line 53

# Build a judge. Every keyword is optional; each falls back to the global
# Qualspec configuration and finally to the class-level defaults.
#
# @param client [Object, nil] chat client; defaults to Qualspec.client
# @param model [String, nil] judge model name from configuration when omitted
# @param system_prompt [String, nil] overrides the configured/default prompt
# @param pass_threshold [Integer, nil] defaults to DEFAULT_PASS_THRESHOLD
def initialize(client: nil, model: nil, system_prompt: nil, pass_threshold: nil)
  @pass_threshold = pass_threshold || DEFAULT_PASS_THRESHOLD
  @client = client || Qualspec.client
  @model = model || Qualspec.configuration.judge_model
  # Explicit prompt wins, then the configured one, then the baked-in default.
  @system_prompt = system_prompt || Qualspec.configuration.judge_system_prompt || DEFAULT_SYSTEM_PROMPT
end

Instance Method Details

#evaluate(response:, criterion:, context: nil, pass_threshold: nil) ⇒ Object

Evaluate a single response



61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# File 'lib/qualspec/judge.rb', line 61

# Evaluate a single response against the given criterion text.
#
# @param response [String] the model output being judged (assumed text)
# @param criterion [String] the criteria the judge scores against
# @param context [String, nil] optional extra context for the judge
# @param pass_threshold [Integer, nil] per-call override of the instance threshold
# @return [Evaluation] parsed score/reasoning; on a request failure this
#   returns a failing Evaluation (score 0) carrying the error message
#   instead of raising.
def evaluate(response:, criterion:, context: nil, pass_threshold: nil)
  effective_threshold = pass_threshold || @pass_threshold

  chat_messages = [
    { role: 'system', content: @system_prompt },
    { role: 'user', content: build_user_prompt(response, criterion, context) }
  ]

  raw = @client.chat(model: @model, messages: chat_messages, json_mode: true)

  parse_result(raw, criterion, effective_threshold)
rescue Client::RequestError => e
  # Surface transport failures as a failing evaluation rather than raising.
  Evaluation.new(criterion: criterion, score: 0, pass: false, reasoning: nil, error: e.message)
end

#evaluate_comparison(responses:, criteria:, context: nil, pass_threshold: nil) ⇒ Object

Evaluate multiple candidate responses together (comparative judging)



86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# File 'lib/qualspec/judge.rb', line 86

# Evaluate multiple candidate responses in a single comparative judging call.
#
# @param responses [Hash] candidate name => response text (assumed shape —
#   keys are used verbatim as candidate names in the prompt)
# @param criteria [String, Array<String>] one or more criteria; numbered
#   "1. ...", "2. ..." before being shown to the judge
# @param context [String, nil] optional extra context for the judge
# @param pass_threshold [Integer, nil] per-call override of the instance threshold
# @return [Hash] candidate => Evaluation; on a request failure every
#   candidate maps to a failing (score-0) Evaluation with the error message.
def evaluate_comparison(responses:, criteria:, context: nil, pass_threshold: nil)
  effective_threshold = pass_threshold || @pass_threshold

  numbered = []
  Array(criteria).each_with_index { |criterion, idx| numbered << "#{idx + 1}. #{criterion}" }
  criteria_text = numbered.join("\n")

  prompt = build_comparison_prompt(responses, criteria_text, context)

  raw = @client.chat(
    model: @model,
    messages: [
      { role: 'system', content: COMPARISON_SYSTEM_PROMPT },
      { role: 'user', content: prompt }
    ],
    json_mode: true
  )

  parse_comparison_result(raw, criteria_text, effective_threshold, responses.keys)
rescue Client::RequestError => e
  # A failed request fails every candidate; each gets its own Evaluation.
  responses.keys.each_with_object({}) do |candidate, results|
    results[candidate] = Evaluation.new(
      criterion: criteria_text,
      score: 0,
      pass: false,
      reasoning: nil,
      error: e.message
    )
  end
end

#evaluate_rubric(response:, rubric:, context: nil, pass_threshold: nil) ⇒ Object



116
117
118
119
120
121
# File 'lib/qualspec/judge.rb', line 116

# Evaluate a response against a rubric's criteria, delegating to #evaluate.
#
# @param response [String] the model output being judged
# @param rubric [Symbol, Object] a registered rubric name (looked up via
#   Rubric.find) or a rubric object responding to #criteria
# @param context [String, nil] optional extra context for the judge
# @param pass_threshold [Integer, nil] per-call override of the instance threshold
# @return [Evaluation] result of judging the numbered criteria as one block
def evaluate_rubric(response:, rubric:, context: nil, pass_threshold: nil)
  resolved = rubric.is_a?(Symbol) ? Rubric.find(rubric) : rubric
  numbered_lines = resolved.criteria.each_with_index.map { |criterion, idx| "#{idx + 1}. #{criterion}" }

  evaluate(
    response: response,
    criterion: numbered_lines.join("\n"),
    context: context,
    pass_threshold: pass_threshold
  )
end