Class: Qualspec::Judge
- Inherits: Object
- Defined in: lib/qualspec/judge.rb
Constant Summary
- DEFAULT_SYSTEM_PROMPT =
    <<~PROMPT
      You are an evaluation judge. You will be given a response and one or more
      evaluation criteria. Your job is to score how well the response meets the criteria.

      Scoring:
      - 0: Completely fails to meet the criteria
      - 1-3: Mostly fails, with minor positive elements
      - 4-6: Partially meets criteria, significant room for improvement
      - 7-8: Mostly meets criteria with minor issues
      - 9: Meets criteria well
      - 10: Perfectly meets all criteria

      Be strict but fair. Consider each criterion carefully.

      You MUST respond with valid JSON in this exact format:
      {"score": <0-10>, "reasoning": "Brief explanation of the score"}

      Your reasoning should be concise (1-2 sentences max).
    PROMPT
- COMPARISON_SYSTEM_PROMPT =
    <<~PROMPT
      You are an evaluation judge comparing multiple AI responses to the same prompt.
      Score each response on how well it meets the criteria.

      Scoring (0-10):
      - 0: Completely fails
      - 1-3: Mostly fails
      - 4-6: Partially meets criteria
      - 7-8: Mostly meets criteria
      - 9-10: Excellent

      Be strict but fair. Compare responses relative to each other.

      IMPORTANT: Use the EXACT candidate names as given in the prompt.

      You MUST respond with valid JSON with scores for each candidate AND declare a winner.
      Example format (use actual names from prompt, not these placeholders):
      {
        "actual-name-1": {"score": 8, "reasoning": "..."},
        "actual-name-2": {"score": 6, "reasoning": "..."},
        "winner": "actual-name-1"
      }

      Use "winner": "tie" if scores are equal or too close to call.
    PROMPT
- DEFAULT_PASS_THRESHOLD = 7
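The prompt constants above pin down the judge's reply contract: a single JSON object with an integer score on the 0-10 scale and a short reasoning string. A minimal sketch of consuming such a reply in plain Ruby, with no qualspec gem required. The pass rule shown (score at or above the threshold passes) is an assumption; the actual comparison lives in `parse_result`, whose body is not shown on this page.

```ruby
require 'json'

DEFAULT_PASS_THRESHOLD = 7

# A reply shaped per DEFAULT_SYSTEM_PROMPT's required format.
reply = '{"score": 8, "reasoning": "Clear and accurate."}'
parsed = JSON.parse(reply)

# Assumption: a score at or above the threshold counts as a pass.
pass = parsed['score'] >= DEFAULT_PASS_THRESHOLD
```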
Instance Method Summary
- #evaluate(response:, criterion:, context: nil, pass_threshold: nil) ⇒ Object
  Evaluate a single response.
- #evaluate_comparison(responses:, criteria:, context: nil, pass_threshold: nil) ⇒ Object
  Evaluate multiple candidate responses together (comparative judging).
- #evaluate_rubric(response:, rubric:, context: nil, pass_threshold: nil) ⇒ Object
  Evaluate a response against a rubric's criteria.
- #initialize(client: nil, model: nil, system_prompt: nil, pass_threshold: nil) ⇒ Judge (constructor)
  A new instance of Judge.
Constructor Details
#initialize(client: nil, model: nil, system_prompt: nil, pass_threshold: nil) ⇒ Judge
Returns a new instance of Judge.
# File 'lib/qualspec/judge.rb', line 53

def initialize(client: nil, model: nil, system_prompt: nil, pass_threshold: nil)
  @client = client || Qualspec.client
  @model = model || Qualspec.configuration.judge_model
  @system_prompt = system_prompt || Qualspec.configuration.judge_system_prompt || DEFAULT_SYSTEM_PROMPT
  @pass_threshold = pass_threshold || DEFAULT_PASS_THRESHOLD
end
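Each constructor argument overrides the library-wide default via Ruby's `||` fallback chain: an explicit argument wins, then the configured value, then the constant. A hypothetical, self-contained sketch of that chain for the pass threshold; the method name `resolve_threshold` is illustrative only and not part of the qualspec API.

```ruby
DEFAULT_PASS_THRESHOLD = 7

# Illustrative helper: explicit argument, then configuration, then constant.
def resolve_threshold(explicit: nil, configured: nil)
  explicit || configured || DEFAULT_PASS_THRESHOLD
end

resolve_threshold(explicit: 9)   # explicit argument wins
resolve_threshold(configured: 8) # falls back to configuration
resolve_threshold                # falls back to the constant
```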
Instance Method Details
#evaluate(response:, criterion:, context: nil, pass_threshold: nil) ⇒ Object
Evaluate a single response
# File 'lib/qualspec/judge.rb', line 61

def evaluate(response:, criterion:, context: nil, pass_threshold: nil)
  threshold = pass_threshold || @pass_threshold
  user_prompt = build_user_prompt(response, criterion, context)

  result = @client.chat(
    model: @model,
    messages: [
      { role: 'system', content: @system_prompt },
      { role: 'user', content: user_prompt }
    ],
    json_mode: true
  )

  parse_result(result, criterion, threshold)
rescue Client::RequestError => e
  Evaluation.new(
    criterion: criterion,
    score: 0,
    pass: false,
    reasoning: nil,
    error: e.message
  )
end
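The interesting step above is `parse_result`, whose body is not shown on this page. A hypothetical reconstruction of its data flow, runnable without the gem: parse the judge's JSON reply and fold it into an `Evaluation` value. The `Struct` definition and the pass rule here are assumptions for illustration, not the library's actual implementation.

```ruby
require 'json'

# Stand-in for Qualspec's Evaluation value object (assumed shape).
Evaluation = Struct.new(:criterion, :score, :pass, :reasoning, :error,
                        keyword_init: true)

# Hypothetical sketch of parse_result's data flow.
def parse_result(raw, criterion, threshold)
  data = JSON.parse(raw)
  Evaluation.new(
    criterion: criterion,
    score: data['score'],
    pass: data['score'] >= threshold, # assumed pass rule
    reasoning: data['reasoning'],
    error: nil
  )
end

result = parse_result('{"score": 6, "reasoning": "Partially meets the bar."}',
                      'Is the answer accurate?', 7)
```

Note that on a `Client::RequestError` the method above returns a zero-score, failing `Evaluation` carrying the error message instead of raising, so callers always receive the same value type.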
#evaluate_comparison(responses:, criteria:, context: nil, pass_threshold: nil) ⇒ Object
Evaluate multiple candidate responses together (comparative judging)
# File 'lib/qualspec/judge.rb', line 86

def evaluate_comparison(responses:, criteria:, context: nil, pass_threshold: nil)
  threshold = pass_threshold || @pass_threshold
  criteria_text = Array(criteria).map.with_index { |c, i| "#{i + 1}. #{c}" }.join("\n")
  user_prompt = build_comparison_prompt(responses, criteria_text, context)

  result = @client.chat(
    model: @model,
    messages: [
      { role: 'system', content: COMPARISON_SYSTEM_PROMPT },
      { role: 'user', content: user_prompt }
    ],
    json_mode: true
  )

  parse_comparison_result(result, criteria_text, threshold, responses.keys)
rescue Client::RequestError => e
  # Return error evaluations for all candidates
  responses.keys.to_h do |candidate|
    [candidate, Evaluation.new(
      criterion: criteria_text,
      score: 0,
      pass: false,
      reasoning: nil,
      error: e.message
    )]
  end
end
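COMPARISON_SYSTEM_PROMPT makes the judge key its reply by candidate name and add a top-level `"winner"` entry. A sketch of splitting such a reply apart in plain Ruby; the candidate names are placeholders, and `parse_comparison_result`'s actual handling is not shown on this page.

```ruby
require 'json'

# A reply shaped per COMPARISON_SYSTEM_PROMPT (placeholder candidate names).
raw = <<~JSON
  {
    "candidate-a": {"score": 8, "reasoning": "Covers every criterion."},
    "candidate-b": {"score": 5, "reasoning": "Misses the second criterion."},
    "winner": "candidate-a"
  }
JSON

data = JSON.parse(raw)
winner = data.delete('winner')                    # "candidate-a", or "tie"
scores = data.transform_values { |v| v['score'] } # per-candidate scores
```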
#evaluate_rubric(response:, rubric:, context: nil, pass_threshold: nil) ⇒ Object
# File 'lib/qualspec/judge.rb', line 116

def evaluate_rubric(response:, rubric:, context: nil, pass_threshold: nil)
  rubric_obj = rubric.is_a?(Symbol) ? Rubric.find(rubric) : rubric
  criteria_text = rubric_obj.criteria.map.with_index { |c, i| "#{i + 1}. #{c}" }.join("\n")
  evaluate(response: response, criterion: criteria_text, context: context, pass_threshold: pass_threshold)
end
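The numbering step above is plain Ruby and easy to verify in isolation: the rubric's criteria become one numbered string that `#evaluate` then treats as a single criterion. The sample criteria below are illustrative, not from the library.

```ruby
# Illustrative rubric criteria, numbered the same way evaluate_rubric does.
criteria = ['Answers the question', 'Cites sources', 'Stays concise']
criteria_text = criteria.map.with_index { |c, i| "#{i + 1}. #{c}" }.join("\n")
```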