Module: Philiprehberger::SanitizeHtml

Defined in:
lib/philiprehberger/sanitize_html.rb,
lib/philiprehberger/sanitize_html/version.rb

Defined Under Namespace

Classes: Error

Constant Summary collapse

# Tag names .clean falls back to when the caller passes neither `tags:` nor
# a `profile:`.
DEFAULT_ALLOWED_TAGS =
%w[
  p br strong em b i u a ul ol li blockquote code pre
  h1 h2 h3 h4 h5 h6
].freeze
# Per-tag attribute allowlist .clean falls back to when the caller passes
# neither `attributes:` nor a `profile:`. Only the outer hash is frozen; the
# inner arrays are not.
# NOTE(review): 'img' is listed here although DEFAULT_ALLOWED_TAGS has no img —
# presumably for callers who pass a custom `tags:` including img; confirm.
DEFAULT_ALLOWED_ATTRIBUTES =
{
  'a' => %w[href title],
  'img' => %w[src alt]
}.freeze
# Tag names treated as dangerous.
# NOTE(review): presumably consumed by remove_dangerous_tags, which is not
# visible in this excerpt — confirm how (and whether content is dropped too).
DANGEROUS_TAGS =
%w[script style iframe].freeze
# Matches a string that starts, after optional leading whitespace, with "on"
# (case-insensitive).
# NOTE(review): presumably used to detect inline event-handler attribute names
# such as onclick; the usage site is not visible here — confirm.
EVENT_ATTRIBUTE_PATTERN =
/\A\s*on/i
# Protocol names used by .clean and .sanitize_url when the caller supplies no
# `allowed_protocols:`.
DEFAULT_ALLOWED_PROTOCOLS =
%w[http https mailto].freeze
# MIME types used by .clean and .sanitize_url when the caller supplies no
# `allowed_data_mimes:`. Empty by default.
DEFAULT_ALLOWED_DATA_MIMES =
[].freeze
# List of CSS property names.
# NOTE(review): no usage is visible in this excerpt — presumably an allowlist
# applied when filtering inline `style` declarations; confirm at the call site.
SAFE_CSS_PROPERTIES =
%w[
  color background-color font-size font-family font-weight font-style
  text-align text-decoration text-indent text-transform
  line-height letter-spacing word-spacing
  margin margin-top margin-right margin-bottom margin-left
  padding padding-top padding-right padding-bottom padding-left
  border border-top border-right border-bottom border-left
  border-color border-style border-width border-radius
  width height max-width max-height min-width min-height
  display float clear vertical-align
  list-style list-style-type
  white-space overflow
  opacity visibility
].freeze
# Case-insensitive, unanchored match for any of:
#   * `expression(`
#   * `javascript:`
#   * `url(` followed by an optional quote and `javascript:`
# Whitespace is tolerated before `(` and `:` and around the optional quote.
# Anything matching the third alternative also matches the second, so the
# third is redundant but harmless.
DANGEROUS_CSS_PATTERN =
/expression\s*\(|javascript\s*:|url\s*\(\s*['"]?\s*javascript\s*:/i
# Named presets selectable through .clean's `profile:` keyword. Each entry
# supplies the `:tags` and `:attributes` that .clean uses when the caller does
# not pass explicit `tags:` / `attributes:`.
#
# Only the outer hash is frozen; the nested hashes and arrays are not, so they
# must not be mutated by callers.
PROFILES =
{
  # No tags and no attributes allowed.
  strict: {
    tags: [],
    attributes: {}
  },
  # Basic text formatting and lists; no links, no attributes.
  moderate: {
    tags: %w[p br strong em b i u ul ol li blockquote],
    attributes: {}
  },
  # Widest preset: adds images, div/span, tables, definition lists, sub/sup, hr.
  permissive: {
    tags: %w[
      p br strong em b i u a ul ol li blockquote code pre
      h1 h2 h3 h4 h5 h6 img div span table thead tbody tr th td
      dl dt dd sub sup hr
    ],
    attributes: {
      'a' => %w[href title],
      'img' => %w[src alt width height],
      'td' => %w[colspan rowspan],
      'th' => %w[colspan rowspan]
    }
  },
  # Links, images, headings, code, hr and tables, with link/image attributes only.
  markdown: {
    tags: %w[
      p br strong em b i u a ul ol li blockquote code pre
      h1 h2 h3 h4 h5 h6 img hr table thead tbody tr th td
    ],
    attributes: {
      'a' => %w[href title],
      'img' => %w[src alt]
    }
  },
  # Same configuration as :strict. .clean special-cases this key: when no
  # explicit `tags:` / `attributes:` are given it returns strip_tags(html)
  # instead of running the tag-processing pipeline.
  text_only: {
    tags: [],
    attributes: {}
  }
}.freeze
# Version string of this library.
VERSION =
'0.5.0'

Class Method Summary collapse

Class Method Details

.clean(html, tags: nil, attributes: nil, profile: nil, allowed_protocols: nil, allowed_data_mimes: nil, on_tag: nil, max_length: nil, link_rel: nil) ⇒ String

Sanitize HTML by removing disallowed tags and attributes.

Parameters:

  • html (String)

    the HTML string to sanitize

  • tags (Array<String>) (defaults to: nil)

    allowed tag names

  • attributes (Hash{String => Array<String>}) (defaults to: nil)

    allowed attributes per tag

  • profile (Symbol, nil) (defaults to: nil)

    predefined security profile (:strict, :moderate, :permissive, :markdown, :text_only)

  • allowed_protocols (Array<String>, nil) (defaults to: nil)

    allowed URL protocols for href/src attributes

  • allowed_data_mimes (Array<String>, nil) (defaults to: nil)

    allowed MIME types for data: URIs

  • on_tag (Proc, nil) (defaults to: nil)

    callback for custom tag processing, receives (tag_name, attributes_hash)

  • max_length (Integer, nil) (defaults to: nil)

    maximum allowed input length; raises Error when exceeded

  • link_rel (String, nil) (defaults to: nil)

    when set, every emitted `<a>` tag is given this exact `rel` attribute, replacing any existing rel and bypassing attribute filtering

Returns:

  • (String)

    sanitized HTML



95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# File 'lib/philiprehberger/sanitize_html.rb', line 95

# Sanitize HTML by removing disallowed tags and attributes.
#
# Precedence for the tag/attribute allowlists: explicit keyword argument,
# then the selected profile, then the DEFAULT_* constants.
#
# @param html [String, nil] the HTML string to sanitize
# @param tags [Array<String>, nil] allowed tag names
# @param attributes [Hash{String => Array<String>}, nil] allowed attributes per tag
# @param profile [Symbol, nil] key of PROFILES to take defaults from
# @param allowed_protocols [Array<String>, nil] allowed URL protocols
# @param allowed_data_mimes [Array<String>, nil] allowed MIME types for data: URIs
# @param on_tag [Proc, nil] forwarded unchanged to process_tags
# @param max_length [Integer, nil] forwarded to enforce_max_length!
# @param link_rel [String, nil] forwarded unchanged to process_tags
# @return [String] sanitized HTML; '' for nil or empty input
# @raise [Error] when +profile+ is given but is not a key of PROFILES
def self.clean(html, tags: nil, attributes: nil, profile: nil, # rubocop:disable Metrics/ParameterLists
               allowed_protocols: nil, allowed_data_mimes: nil, on_tag: nil,
               max_length: nil, link_rel: nil)
  return '' if html.nil? || html.empty?

  enforce_max_length!(html, max_length)

  if profile
    preset = PROFILES.fetch(profile) { raise Error, "Unknown profile: #{profile}" }

    # :text_only without overrides skips the tag pipeline entirely.
    return strip_tags(html) if profile == :text_only && tags.nil? && attributes.nil?

    tags ||= preset[:tags]
    attributes ||= preset[:attributes]
  end

  scrubbed = remove_dangerous_tags(normalize_entities(html))

  process_tags(
    scrubbed,
    tags || DEFAULT_ALLOWED_TAGS,
    attributes || DEFAULT_ALLOWED_ATTRIBUTES,
    allowed_protocols || DEFAULT_ALLOWED_PROTOCOLS,
    allowed_data_mimes || DEFAULT_ALLOWED_DATA_MIMES,
    on_tag,
    link_rel
  )
end

.escape(html, max_length: nil) ⇒ String

Escape HTML special characters by converting &, <, >, " and ' to entities.

Parameters:

  • html (String)

    the HTML string to escape

  • max_length (Integer, nil) (defaults to: nil)

    maximum allowed input length; raises Error when exceeded

Returns:

  • (String)

    entity-encoded HTML



157
158
159
160
161
162
163
164
165
166
167
# File 'lib/philiprehberger/sanitize_html.rb', line 157

# Entity-encode the five HTML-significant characters: & < > " '.
#
# @param html [String, nil] the HTML string to escape
# @param max_length [Integer, nil] forwarded to enforce_max_length!
# @return [String] entity-encoded text; '' for nil or empty input
def self.escape(html, max_length: nil)
  return '' if html.nil? || html.empty?

  enforce_max_length!(html, max_length)

  # Order matters: '&' goes first so the ampersands introduced by the later
  # replacements are not escaped a second time.
  [
    ['&', '&amp;'],
    ['<', '&lt;'],
    ['>', '&gt;'],
    ['"', '&quot;'],
    ["'", '&#39;']
  ].reduce(html) { |escaped, (char, entity)| escaped.gsub(char, entity) }
end

.sanitize_url(url, allowed_protocols: DEFAULT_ALLOWED_PROTOCOLS, allowed_data_mimes: DEFAULT_ALLOWED_DATA_MIMES) ⇒ String?

Validate a single URL against an allowlist of protocols and optional data: MIME types. Returns the trimmed URL when it is safe to use, or `nil` when the URL is blank or the protocol is not permitted.

Fragment-only (`#foo`), query-only (`?q=1`), and path-relative (`/foo`) URLs are always considered safe. Protocol-relative URLs (`//example.com`) are treated as paths and also considered safe. Explicit protocols are lowercased before the allowlist check; `data:` URIs are permitted only when the MIME type appears in `allowed_data_mimes`.

Parameters:

  • url (String)

    the URL to inspect

  • allowed_protocols (Array<String>) (defaults to: DEFAULT_ALLOWED_PROTOCOLS)

    permitted protocol names

  • allowed_data_mimes (Array<String>) (defaults to: DEFAULT_ALLOWED_DATA_MIMES)

    permitted data: MIME types

Returns:

  • (String, nil)

    the stripped URL when safe, otherwise nil



183
184
185
186
187
188
189
# File 'lib/philiprehberger/sanitize_html.rb', line 183

# Validate a single URL and return it trimmed, or nil when it is rejected.
#
# The input is coerced with +to_s+ and stripped; a blank result is rejected
# without consulting valid_url?. All other acceptance rules live in valid_url?.
#
# @param url [String, nil] the URL to inspect
# @param allowed_protocols [Array<String>] permitted protocol names
# @param allowed_data_mimes [Array<String>] permitted data: MIME types
# @return [String, nil] the stripped URL when accepted, otherwise nil
def self.sanitize_url(url,
                      allowed_protocols: DEFAULT_ALLOWED_PROTOCOLS,
                      allowed_data_mimes: DEFAULT_ALLOWED_DATA_MIMES)
  candidate = url.to_s.strip
  accepted = !candidate.empty? && valid_url?(candidate, allowed_protocols, allowed_data_mimes)

  accepted ? candidate : nil
end

.strip(html, max_length: nil) ⇒ String

Remove all HTML tags, returning only text content.

Parameters:

  • html (String)

    the HTML string to strip

  • max_length (Integer, nil) (defaults to: nil)

    maximum allowed input length; raises Error when exceeded

Returns:

  • (String)

    plain text with no HTML tags



127
128
129
130
131
132
133
134
135
136
# File 'lib/philiprehberger/sanitize_html.rb', line 127

# Remove all HTML tags, returning only text content.
#
# Pipeline: normalize_entities -> remove_dangerous_tags -> drop every
# remaining `<...>` span -> decode_entities.
#
# @param html [String, nil] the HTML string to strip
# @param max_length [Integer, nil] forwarded to enforce_max_length!
# @return [String] the result of decode_entities on the tag-free text;
#   '' for nil or empty input
def self.strip(html, max_length: nil)
  return '' if html.nil? || html.empty?

  enforce_max_length!(html, max_length)

  tagless = remove_dangerous_tags(normalize_entities(html)).gsub(/<[^>]*>/, '')
  decode_entities(tagless)
end

.strip_tags(html, max_length: nil) ⇒ String

Convert HTML to plain text by removing all tags and decoding entities.

Removes dangerous tags (script, style, iframe) along with their content, strips all remaining tags while preserving inner text, and decodes HTML entities so the result is a plain string. Returns an empty string for nil or empty input.

Parameters:

  • html (String, nil)

    the HTML string to convert

  • max_length (Integer, nil) (defaults to: nil)

    maximum allowed input length; raises Error when exceeded

Returns:

  • (String)

    plain text with no HTML tags or entities



148
149
150
# File 'lib/philiprehberger/sanitize_html.rb', line 148

# Convert HTML to plain text.
#
# Thin wrapper: forwards +html+ and +max_length+ unchanged to .strip and
# returns whatever .strip returns.
#
# @param html [String, nil] the HTML string to convert
# @param max_length [Integer, nil] passed through to .strip
# @return [String] the result of .strip for the same arguments
def self.strip_tags(html, max_length: nil)
  strip(html, max_length: max_length)
end