Module: Philiprehberger::SanitizeHtml

Defined in:
lib/philiprehberger/sanitize_html.rb,
lib/philiprehberger/sanitize_html/version.rb

Defined Under Namespace

Classes: Error

Constant Summary collapse

# Tag names .clean falls back to when neither `tags:` nor a `profile:`
# supplies a list.
DEFAULT_ALLOWED_TAGS =
%w[
  p br strong em b i u a ul ol li blockquote code pre
  h1 h2 h3 h4 h5 h6
].freeze
# Per-tag attribute allow-list .clean falls back to when neither
# `attributes:` nor a `profile:` supplies one.
# NOTE(review): 'img' has an entry here although DEFAULT_ALLOWED_TAGS does not
# list img — presumably it only matters when the caller allows <img>; confirm.
DEFAULT_ALLOWED_ATTRIBUTES =
{
  'a' => %w[href title],
  'img' => %w[src alt]
}.freeze
# Tag names treated as dangerous. The .strip_tags documentation states these
# are removed together with their content; presumably consumed by
# remove_dangerous_tags (not visible here).
DANGEROUS_TAGS =
%w[script style iframe].freeze
# Matches a string that starts with "on" (case-insensitive, optional leading
# whitespace), e.g. "onclick". Presumably used to reject event-handler
# attributes — the consumer is not visible here.
EVENT_ATTRIBUTE_PATTERN =
/\A\s*on/i
# Protocol list .clean falls back to when `allowed_protocols:` is not given.
DEFAULT_ALLOWED_PROTOCOLS =
%w[http https mailto].freeze
# MIME-type list .clean falls back to when `allowed_data_mimes:` is not given.
# Empty by default.
DEFAULT_ALLOWED_DATA_MIMES =
[].freeze
# CSS property names considered safe. The code that consults this list is not
# visible here.
SAFE_CSS_PROPERTIES =
%w[
  color background-color font-size font-family font-weight font-style
  text-align text-decoration text-indent text-transform
  line-height letter-spacing word-spacing
  margin margin-top margin-right margin-bottom margin-left
  padding padding-top padding-right padding-bottom padding-left
  border border-top border-right border-bottom border-left
  border-color border-style border-width border-radius
  width height max-width max-height min-width min-height
  display float clear vertical-align
  list-style list-style-type
  white-space overflow
  opacity visibility
].freeze
# Case-insensitive match for `expression(`, `javascript:` and
# `url(javascript:` (optional whitespace/quotes allowed where the pattern
# shows them). The third alternative is already covered by the second, since
# any `url(... javascript:` text also contains `javascript:`.
DANGEROUS_CSS_PATTERN =
/expression\s*\(|javascript\s*:|url\s*\(\s*['"]?\s*javascript\s*:/i
# Named presets selectable through .clean's `profile:` keyword. Each preset
# provides the :tags and :attributes that .clean uses when the caller does not
# pass its own. Only the outer hash is frozen; the nested hashes and arrays
# are not.
PROFILES =
{
  # No tags and no attributes allowed.
  strict: {
    tags: [],
    attributes: {}
  },
  # Basic text formatting and lists; no attributes.
  moderate: {
    tags: %w[p br strong em b i u ul ol li blockquote],
    attributes: {}
  },
  # Widest preset: adds images, containers, tables and definition lists.
  permissive: {
    tags: %w[
      p br strong em b i u a ul ol li blockquote code pre
      h1 h2 h3 h4 h5 h6 img div span table thead tbody tr th td
      dl dt dd sub sup hr
    ],
    attributes: {
      'a' => %w[href title],
      'img' => %w[src alt width height],
      'td' => %w[colspan rowspan],
      'th' => %w[colspan rowspan]
    }
  },
  # Same tag set as the defaults plus img, hr and table elements.
  markdown: {
    tags: %w[
      p br strong em b i u a ul ol li blockquote code pre
      h1 h2 h3 h4 h5 h6 img hr table thead tbody tr th td
    ],
    attributes: {
      'a' => %w[href title],
      'img' => %w[src alt]
    }
  },
  # Same configuration as :strict, but .clean short-circuits to .strip_tags
  # for this profile when no explicit tags/attributes are passed.
  text_only: {
    tags: [],
    attributes: {}
  }
}.freeze
# Library version string.
VERSION =
'0.3.0'

Class Method Summary collapse

Class Method Details

.clean(html, tags: nil, attributes: nil, profile: nil, allowed_protocols: nil, allowed_data_mimes: nil, on_tag: nil) ⇒ String

Sanitize HTML by removing disallowed tags and attributes.

Parameters:

  • html (String)

    the HTML string to sanitize

  • tags (Array<String>) (defaults to: nil)

    allowed tag names

  • attributes (Hash{String => Array<String>}) (defaults to: nil)

    allowed attributes per tag

  • profile (Symbol, nil) (defaults to: nil)

    predefined security profile (:strict, :moderate, :permissive, :markdown, :text_only)

  • allowed_protocols (Array<String>, nil) (defaults to: nil)

    allowed URL protocols for href/src attributes

  • allowed_data_mimes (Array<String>, nil) (defaults to: nil)

    allowed MIME types for data: URIs

  • on_tag (Proc, nil) (defaults to: nil)

    callback for custom tag processing, receives (tag_name, attributes_hash)

Returns:

  • (String)

    sanitized HTML



92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# File 'lib/philiprehberger/sanitize_html.rb', line 92

# Sanitize an HTML string against an allow-list of tags and attributes.
#
# Explicit `tags:` / `attributes:` win over the chosen profile, and the
# profile wins over the module defaults.
#
# @param html [String, nil] markup to sanitize
# @param tags [Array<String>, nil] allowed tag names
# @param attributes [Hash{String => Array<String>}, nil] allowed attributes per tag
# @param profile [Symbol, nil] key of PROFILES to take tags/attributes from
# @param allowed_protocols [Array<String>, nil] allowed URL protocols
# @param allowed_data_mimes [Array<String>, nil] allowed MIME types for data: URIs
# @param on_tag [Proc, nil] callback handed through to process_tags
# @return [String] sanitized HTML; '' for nil or empty input
# @raise [Error] when `profile` is given but is not a key of PROFILES
def self.clean(html, tags: nil, attributes: nil, profile: nil,
               allowed_protocols: nil, allowed_data_mimes: nil, on_tag: nil)
  return '' if html.nil? || html.empty?

  if profile
    preset = PROFILES.fetch(profile) { raise Error, "Unknown profile: #{profile}" }

    # :text_only without caller overrides is plain tag stripping.
    no_overrides = tags.nil? && attributes.nil?
    return strip_tags(html) if profile == :text_only && no_overrides

    tags ||= preset[:tags]
    attributes ||= preset[:attributes]
  end

  # Dangerous tags are removed before the allow-list pass runs.
  markup = remove_dangerous_tags(normalize_entities(html))
  process_tags(
    markup,
    tags || DEFAULT_ALLOWED_TAGS,
    attributes || DEFAULT_ALLOWED_ATTRIBUTES,
    allowed_protocols || DEFAULT_ALLOWED_PROTOCOLS,
    allowed_data_mimes || DEFAULT_ALLOWED_DATA_MIMES,
    on_tag
  )
end

.escape(html) ⇒ String

Escape HTML special characters by converting &, <, >, ", and ' to their entity equivalents.

Parameters:

  • html (String)

    the HTML string to escape

Returns:

  • (String)

    entity-encoded HTML



146
147
148
149
150
151
152
153
154
# File 'lib/philiprehberger/sanitize_html.rb', line 146

# Replace the five HTML-significant characters (& < > " ') with entities.
#
# @param html [String, nil] text to escape
# @return [String] entity-encoded text; '' for nil or empty input
def self.escape(html)
  return '' if html.nil? || html.empty?

  # Order matters: the ampersand goes first so the entities produced by the
  # later substitutions are not escaped a second time.
  substitutions = {
    '&' => '&amp;',
    '<' => '&lt;',
    '>' => '&gt;',
    '"' => '&quot;',
    "'" => '&#39;'
  }
  substitutions.reduce(html) { |escaped, (char, entity)| escaped.gsub(char, entity) }
end

.strip(html) ⇒ String

Remove all HTML tags, returning only text content.

Parameters:

  • html (String)

    the HTML string to strip

Returns:

  • (String)

    plain text with no HTML tags



120
121
122
123
124
125
126
127
# File 'lib/philiprehberger/sanitize_html.rb', line 120

# Reduce HTML to its text content.
#
# Runs the input through normalize_entities and remove_dangerous_tags,
# deletes every remaining `<...>` span, then passes the result through
# decode_entities.
#
# @param html [String, nil] markup to strip
# @return [String] text with tags removed; '' for nil or empty input
def self.strip(html)
  return '' if html.nil? || html.empty?

  tagless = remove_dangerous_tags(normalize_entities(html)).gsub(/<[^>]*>/, '')
  decode_entities(tagless)
end

.strip_tags(html) ⇒ String

Convert HTML to plain text by removing all tags and decoding entities.

Removes dangerous tags (script, style, iframe) along with their content, strips all remaining tags while preserving inner text, and decodes HTML entities so the result is a plain string. Returns an empty string for nil or empty input.

Parameters:

  • html (String, nil)

    the HTML string to convert

Returns:

  • (String)

    plain text with no HTML tags or entities



138
139
140
# File 'lib/philiprehberger/sanitize_html.rb', line 138

# Convert HTML to plain text.
#
# Thin delegator: hands `html` straight to .strip and returns its result,
# so nil/empty handling and all processing are exactly those of .strip.
#
# @param html [String, nil] the HTML string to convert
# @return [String] whatever .strip returns for the same input
def self.strip_tags(html)
  strip(html)
end