Module: Legion::LLM::API::Namespaces::OpenAI::Moderations

Extended by:
Legion::Logging::Helper
Defined in:
lib/legion/llm/api/namespaces/openai/moderations.rb

Constant Summary collapse

# Harm categories the classifier is asked to score. Their symbolised names are
# the keys of the per-category hashes below.
MODERATION_CATEGORIES = %w[
  hate hate/threatening
  harassment harassment/threatening
  self-harm self-harm/intent self-harm/instructions
  sexual sexual/minors
  violence violence/graphic
].freeze

# Baseline score for every category (symbol key => 0.0001).
DEFAULT_SCORES = MODERATION_CATEGORIES.each_with_object({}) { |name, table| table[name.to_sym] = 0.0001 }.freeze

# Baseline flag for every category (symbol key => false).
DEFAULT_FLAGS = MODERATION_CATEGORIES.each_with_object({}) { |name, table| table[name.to_sym] = false }.freeze

# Largest number of inputs accepted in a single moderation request.
MAX_MODERATION_INPUTS = 32

Class Method Summary collapse

Class Method Details

.build_moderation_prompt ⇒ Object



141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# File 'lib/legion/llm/api/namespaces/openai/moderations.rb', line 141

# Builds the system prompt that turns a chat model into a content-safety
# classifier. The prompt lists every entry of MODERATION_CATEGORIES and
# prescribes the single-line answer format that parse_moderation_response
# reads back.
#
# @return [String] the prompt text with leading/trailing whitespace stripped
def self.build_moderation_prompt
  category_list = MODERATION_CATEGORIES.map { |c| "- #{c}" }.join("\n")
  # Derived from the same constant as the bullet list so the advertised answer
  # format can never drift from the categories actually being parsed.
  category_format = MODERATION_CATEGORIES.map { |c| "#{c}=true|false(score)" }.join(', ')

  <<~PROMPT.strip
    You are a content safety classifier. Evaluate ONLY the content enclosed in <content_to_evaluate> tags against the following harm categories. Do not evaluate any text outside those tags.

    Categories:
    #{category_list}

    Response format (use EXACTLY this structure):
    MODERATION_RESULT: safe|flagged. Categories: #{category_format}

    Rules:
    - Evaluate ONLY the content inside <content_to_evaluate>...</content_to_evaluate> tags
    - Set the overall result to "flagged" if ANY category is true, otherwise "safe"
    - Scores are floats between 0.0 and 1.0 representing confidence
    - A category is true if the score is >= 0.5
    - Be conservative: only flag content that clearly violates a category
  PROMPT
end

.evaluate_single(text, model:, caller:) ⇒ Object



110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
# File 'lib/legion/llm/api/namespaces/openai/moderations.rb', line 110

# Classifies one piece of content by sending it, wrapped in
# <content_to_evaluate> tags, through the inference pipeline with the
# moderation system prompt, then parsing the model's answer.
#
# @param text [#to_s] content to classify; coerced to a String once up front
# @param model [String] model name passed as the routing target
# @param caller [Object] caller descriptor forwarded to the inference request
# @return [Hash] +{ flagged:, categories:, category_scores: }+
# @raise [Legion::LLM::LLMError] re-raised untouched so the route block can map
#   it to an HTTP error instead of reporting unassessed content as safe
def self.evaluate_single(text, model:, caller:)
  request_id = "modr_req_#{SecureRandom.hex(12)}"

  # Coerce once so the logging below cannot raise on a non-String input and
  # throw away a verdict the model already produced.
  content = text.to_s

  # The content is untrusted: defuse any embedded envelope tag so the input
  # cannot close the envelope early and place text outside the evaluated region.
  enclosed = content.gsub(%r{<\s*(/?)\s*content_to_evaluate\s*>}i) do
    "&lt;#{Regexp.last_match(1)}content_to_evaluate&gt;"
  end

  inference_request = Legion::LLM::Inference::Request.build(
    id:       request_id,
    system:   build_moderation_prompt,
    messages: [{ role: 'user', content: "<content_to_evaluate>#{enclosed}</content_to_evaluate>" }],
    routing:  { model: model },
    tokens:   { max: 512 },
    caller:   caller,
    stream:   false,
    cache:    { strategy: :default, cacheable: true }
  )

  pipeline_response = Legion::LLM::Inference::Executor.new(inference_request).call

  response_text = extract_response_text(pipeline_response)
  log.debug("[llm][api][namespaces][openai][moderations] action=evaluated request_id=#{request_id} text_length=#{content.length}")

  parse_moderation_response(response_text)
rescue Legion::LLM::LLMError
  # Must precede the generic rescue below: LLM errors propagate to the route
  # block, which turns them into an HTTP error response.
  raise
rescue StandardError => e
  # NOTE(review): any non-LLM failure is logged and answered with the "safe"
  # defaults, i.e. this path fails open — confirm that is the intended policy.
  handle_exception(e, level: :warn, handled: true, operation: 'llm.api.namespaces.openai.moderations.evaluate_single',
                  request_id: request_id)
  safe_defaults
end

.extract_response_text(pipeline_response) ⇒ Object



193
194
195
196
197
198
199
200
201
202
203
# File 'lib/legion/llm/api/namespaces/openai/moderations.rb', line 193

# Pulls the plain text out of a pipeline response's message.
#
# A Hash message is read through its :content / 'content' key; any other
# message is stringified. Array content is treated as a list of blocks whose
# :text / 'text' values (or string forms, for non-Hash blocks) are concatenated.
#
# @param pipeline_response [#message] object returned by the inference executor
# @return [String] extracted text, '' when there is no message
def self.extract_response_text(pipeline_response)
  message = pipeline_response.message
  return '' if message.nil?

  payload =
    case message
    when Hash then message[:content] || message['content']
    else message.to_s
    end
  return payload.to_s unless payload.is_a?(Array)

  # Blocks that carry no text (nil/false) are dropped rather than stringified.
  pieces = payload.filter_map do |block|
    block.is_a?(Hash) ? (block[:text] || block['text']) : block.to_s
  end
  pieces.join
end

.parse_moderation_response(text) ⇒ Object



162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
# File 'lib/legion/llm/api/namespaces/openai/moderations.rb', line 162

# Parses the classifier's single-line answer
# ("MODERATION_RESULT: safe|flagged. Categories: name=true|false(score), ...")
# into a result hash.
#
# Categories missing from the answer keep their defaults. Scores are clamped
# to 0.0..1.0. The overall +flagged+ value is true when the answer says
# "flagged" OR any parsed category is true, so the per-category flags are
# authoritative whatever the headline verdict says.
#
# @param text [#to_s] raw model output
# @return [Hash] +{ flagged:, categories:, category_scores: }+; the safe
#   defaults when the marker is missing or parsing raises
def self.parse_moderation_response(text)
  body = text.to_s
  return safe_defaults unless body.include?('MODERATION_RESULT')

  categories = DEFAULT_FLAGS.dup
  scores     = DEFAULT_SCORES.dup

  MODERATION_CATEGORIES.each do |cat|
    # Optional whitespace is tolerated around "=", and around/inside the parentheses.
    match = body.match(/#{Regexp.escape(cat)}\s*=\s*(true|false)\s*\(\s*([0-9.]+)\s*\)/i)
    next unless match

    key = cat.to_sym
    categories[key] = match[1].casecmp('true').zero?
    scores[key]     = match[2].to_f.clamp(0.0, 1.0)
  end

  # A true category always flags the result, even when the headline verdict is
  # "safe" or is something unexpected that matches neither keyword.
  flagged = body.match?(/MODERATION_RESULT:\s*flagged/i) || categories.value?(true)

  { flagged: flagged, categories: categories, category_scores: scores }
rescue StandardError => e
  handle_exception(e, level: :warn, handled: true, operation: 'llm.api.namespaces.openai.moderations.parse_response')
  safe_defaults
end

.registered(app) ⇒ Object



35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# File 'lib/legion/llm/api/namespaces/openai/moderations.rb', line 35

# Extension hook: registers the POST /v1/moderations route on +app+.
#
# The route validates the request, evaluates each input with
# evaluate_single (one call per input, in order) and returns a JSON body of
# the form +{ id:, model:, results: [...] }+ with status 200.
#
# Error mapping done by the route's rescue clauses:
#   Legion::LLM::AuthError                   -> 401 authentication_error
#   Legion::LLM::RateLimitError              -> 429 rate_limit_error
#   Legion::LLM::ProviderDown/ProviderError  -> 502 server_error
#   any other StandardError                  -> 500 server_error
#
# NOTE(review): require_llm!, parse_request_body, openai_error and
# build_server_caller are not defined in this file — presumably helpers
# available on the app instance; confirm against the app's helper modules.
#
# @param app [Object] application object that responds to +post+
# @return [Object] not meaningful; a failure during registration is reported
#   through handle_exception and is not re-raised
def self.registered(app)
  log.debug('[llm][api][namespaces][openai][moderations] registering routes')

  # rubocop:disable Metrics/BlockLength
  app.post '/v1/moderations' do
    require_llm!

    # The endpoint is only usable when at least one registered provider
    # advertises the :moderation or :chat capability; otherwise answer 501.
    capable = Legion::LLM::Call::Registry.providers.any? do |_name, entry|
      caps = entry&.dig(:capabilities) || []
      caps.include?(:moderation) || caps.include?(:chat)
    end
    unless capable
      return openai_error('No provider with moderation or chat capability is configured',
                          type: 'not_implemented_error', code: 'capability_unavailable', status_code: 501)
    end

    body = parse_request_body

    raw_input = body[:input]

    # Reject a missing input as well as an empty string / empty array.
    if raw_input.nil? || (raw_input.respond_to?(:empty?) && raw_input.empty?)
      return openai_error('input is required and must be a non-empty string or array',
                          type: 'invalid_request_error', status_code: 400)
    end

    # Normalise to an array: a single value becomes a one-element list.
    inputs = raw_input.is_a?(Array) ? raw_input : [raw_input.to_s]

    # Each input costs one evaluate_single call, so the batch size is capped.
    if inputs.size > MAX_MODERATION_INPUTS
      return openai_error("input array exceeds maximum size of #{MAX_MODERATION_INPUTS}",
                          type: 'invalid_request_error', code: 'too_many_inputs', status_code: 400)
    end

    # Falls back to 'text-moderation-latest' when the request names no model.
    model = (body[:model] || 'text-moderation-latest').to_s

    log.info("[llm][api][namespaces][openai][moderations] action=accepted inputs=#{inputs.size} model=#{model}")

    effective_caller = build_server_caller(source: 'openai_moderations', path: request.path, env: env)

    # Elements are stringified here, so evaluate_single always receives a String.
    results = inputs.map do |text|
      Legion::LLM::API::Namespaces::OpenAI::Moderations.evaluate_single(
        text.to_s, model: model, caller: effective_caller
      )
    end

    response_body = {
      id:      "modr-#{SecureRandom.hex(16)}",
      model:   model,
      results: results
    }

    log.info("[llm][api][namespaces][openai][moderations] action=complete inputs=#{inputs.size} flagged=#{results.count { |r| r[:flagged] }}")
    content_type :json
    status 200
    Legion::JSON.dump(response_body)
  rescue Legion::LLM::AuthError => e
    handle_exception(e, level: :error, handled: true, operation: 'llm.api.namespaces.openai.moderations.auth')
    openai_error(e.message, type: 'authentication_error', status_code: 401)
  rescue Legion::LLM::RateLimitError => e
    handle_exception(e, level: :warn, handled: true, operation: 'llm.api.namespaces.openai.moderations.rate_limit')
    openai_error(e.message, type: 'rate_limit_error', code: 'rate_limit_exceeded', status_code: 429)
  rescue Legion::LLM::ProviderDown, Legion::LLM::ProviderError => e
    handle_exception(e, level: :error, handled: true, operation: 'llm.api.namespaces.openai.moderations.provider')
    openai_error(e.message, type: 'server_error', status_code: 502)
  rescue StandardError => e
    # Catch-all: the exception's own message is sent back to the client.
    handle_exception(e, level: :error, handled: false, operation: 'llm.api.namespaces.openai.moderations')
    openai_error(e.message, type: 'server_error', status_code: 500)
  end
  # rubocop:enable Metrics/BlockLength

  log.debug('[llm][api][namespaces][openai][moderations] routes registered')
rescue StandardError => e
  # A failure while registering is reported here and not re-raised.
  handle_exception(e, level: :error, handled: false, operation: 'llm.api.namespaces.openai.moderations.register')
end

.safe_defaults ⇒ Object



189
190
191
# File 'lib/legion/llm/api/namespaces/openai/moderations.rb', line 189

# Result returned when no usable classifier answer is available: nothing
# flagged, every category at its default flag and score.
#
# @return [Hash] a fresh hash on every call; the nested hashes are unfrozen
#   copies of DEFAULT_FLAGS / DEFAULT_SCORES, so callers may mutate them
def self.safe_defaults
  category_flags  = DEFAULT_FLAGS.dup
  category_scores = DEFAULT_SCORES.dup

  {
    flagged:         false,
    categories:      category_flags,
    category_scores: category_scores
  }
end