Module: Diogenes::Evaluation::Gates

Defined in: lib/diogenes/evaluation/gates.rb
Constant Summary

# The ordered list of evaluation gates.
#
# Each entry in the table below is a plain attribute hash that is splatted
# into Gate.new as keyword arguments, so every gate is built the same way.
# The resulting array itself is frozen.
#
# Attributes supplied for every gate:
#   key          - Symbol identifying the gate
#   name         - short human-readable label
#   principle    - the design principle the gate is tied to
#   explanation  - longer rationale text
#   question     - yes/no prompt, ending in "(Y/n)"
#   fail_message - guidance text; presumably shown when the gate is not
#                  passed (confirm against the code that consumes Gate)
ALL = [
  {
    key: :failure_mode,
    name: "Failure Mode",
    principle: "Least surprise at scale",
    explanation: "When AI gets things wrong — and it will — what happens? " \
      "This gate tests whether failure is detectable and recoverable. " \
      "Embarrassing failures are manageable. " \
      "Silent, cumulative, or catastrophic ones are not.",
    question: "When this feature produces wrong output, " \
      "can the user detect and recover without your direct intervention? (Y/n)",
    fail_message: "Failures that go undetected or unrecoverable create compounding harm over time. " \
      "Add explicit uncertainty signals, limit the feature scope, " \
      "or move to a human-reviewed workflow."
  },
  {
    key: :user_verifiable,
    name: "User Verifiable",
    principle: "Trust requires verification",
    explanation: "Can a typical user of this feature actually tell when the output is wrong? " \
      "If they need domain expertise they don't have to evaluate AI output, " \
      "you're shipping a confidence gap, not a feature.",
    question: "Can a typical user judge whether the output is correct — without needing " \
      "domain expertise they don't already have? (Y/n)",
    fail_message: "Shipping outputs users cannot evaluate creates misplaced trust. " \
      "Consider adding confidence scores, human review, " \
      "or limiting the feature to domains where users have the relevant expertise."
  },
  {
    key: :human_in_loop,
    name: "Human in the Loop",
    principle: "Human-centered design, genuinely",
    explanation: "A loop requires a human with time, context, and authority to act. " \
      "If the human is approving outputs they cannot evaluate, " \
      "that is a rubber stamp — not a loop.",
    question: "Is there a specific person with enough time, context, and authority " \
      "to review and override outputs before they affect users? (Y/n)",
    fail_message: "A human in the loop who lacks time, context, or authority is safety theater. " \
      "Either invest in a real review capacity " \
      "or design the feature so it doesn't require human sign-off."
  },
  {
    key: :observability,
    name: "Observability",
    principle: "Craftsmanship — you wouldn't ship blind",
    explanation: "Silent degradation is worse than no feature. " \
      "If you won't know when this AI feature starts producing worse outputs in production, " \
      "you'll find out through a user complaint — after the damage is done.",
    question: "Do you have (or will you have before shipping) monitoring " \
      "to detect when this feature's quality degrades in production? (Y/n)",
    fail_message: "Shipping an AI feature without observability means betting your users " \
      "will tell you when it breaks. " \
      "Set up quality monitoring, sampling, or feedback loops before you ship."
  },
  {
    key: :right_tool,
    name: "Right Tool",
    principle: "Convention over configuration",
    explanation: "AI is powerful, but it is not always the right answer. " \
      "Deterministic software is more reliable, cheaper to operate, and easier to audit. " \
      "This gate asks you to make the case for AI explicitly.",
    question: "Have you genuinely considered and ruled out " \
      "a simpler, deterministic software solution for this specific need? (Y/n)",
    fail_message: "The best AI feature is sometimes no AI feature. " \
      "Revisit whether the problem can be solved with rules, search, filtering, " \
      "or structured data before adding an LLM dependency."
  }
].map { |attributes| Gate.new(**attributes) }.freeze