Module: Coradoc::Html::Entity

Defined in:
lib/coradoc/html/entity.rb

Overview

HTML entity handling

Constant Summary collapse

NAMED_ENTITIES =

Named HTML entities

# Lookup table from entity name (without the leading "&" and trailing ";")
# to the character that entity stands for. Used in both directions:
# name -> character when decoding, and character -> name (via Hash#key)
# when converting a character back to a named entity.
{
  # Non-breaking space and markup-significant characters
  'nbsp' => "\u00A0",
  'lt' => '<',
  'gt' => '>',
  'amp' => '&',
  'quot' => '"',
  'apos' => "'",
  # Currency signs
  'cent' => "\u00A2",
  'pound' => "\u00A3",
  'yen' => "\u00A5",
  'euro' => "\u20AC",
  # Copyright / registered / trademark signs
  'copy' => "\u00A9",
  'reg' => "\u00AE",
  'trade' => "\u2122",
  # Dashes and ellipsis
  'mdash' => "\u2014",
  'ndash' => "\u2013",
  'hellip' => "\u2026",
  # Angle and curly quotation marks
  'laquo' => "\u00AB",
  'raquo' => "\u00BB",
  'ldquo' => "\u201C",
  'rdquo' => "\u201D",
  'lsquo' => "\u2018",
  'rsquo' => "\u2019"
}.freeze

Class Method Summary collapse

Class Method Details

.decode(text) ⇒ Object

Decode HTML entities to text



57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# File 'lib/coradoc/html/entity.rb', line 57

# Decode HTML character references (named, decimal and hexadecimal) to text.
#
# Every reference is resolved exactly once in a single left-to-right pass,
# so already-escaped input is not decoded twice: "&amp;lt;" becomes "&lt;",
# not "<".
#
# References that cannot be resolved are left untouched:
# * named references whose name is not a key of NAMED_ENTITIES
# * numeric references outside the Unicode range (> U+10FFFF) or inside the
#   surrogate block (U+D800..U+DFFF), which cannot be represented in UTF-8
#
# @param text [String, nil, Object] text that may contain references
# @return [String, Object] a new decoded string; '' when +text+ is nil;
#   +text+ itself when it is not a String
def decode(text)
  return '' if text.nil?
  return text unless text.is_a?(String)

  # One alternation, three capture groups: decimal digits, hex digits, name.
  text.gsub(/&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|([a-zA-Z][a-zA-Z0-9]*));/) do |reference|
    decimal, hex, name = ::Regexp.last_match.captures

    if name
      # Unknown names fall back to the original reference text.
      NAMED_ENTITIES.fetch(name, reference)
    else
      codepoint = decimal ? decimal.to_i : hex.to_i(16)
      # pack('U') raises RangeError above U+10FFFF and produces an invalid
      # string for surrogates, so such references are kept verbatim.
      representable = codepoint <= 0x10FFFF && !(0xD800..0xDFFF).cover?(codepoint)
      representable ? [codepoint].pack('U') : reference
    end
  end
end

.encode(text, options = {}) ⇒ Object

Encode text to HTML entities



35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# File 'lib/coradoc/html/entity.rb', line 35

# Escape the characters that are significant in HTML markup.
#
# "&", "<", ">" and '"' are always escaped. The ampersand is handled first
# so the ampersands introduced by later replacements are not escaped again.
#
# @param text [String, nil, Object] text to escape
# @param options [Hash] optional switches
# @option options [Boolean] :encode_quotes also escape "'" as "&#39;"
# @option options [Boolean] :encode_nbsp also escape U+00A0 as "&nbsp;"
# @return [String, Object] a new escaped string; '' when +text+ is nil;
#   +text+ itself when it is not a String
def encode(text, options = {})
  return '' if text.nil?
  return text unless text.is_a?(String)

  # Ordered list of [raw character, entity] pairs; order matters for "&".
  substitutions = [
    ['&', '&amp;'],
    ['<', '&lt;'],
    ['>', '&gt;'],
    ['"', '&quot;']
  ]
  substitutions << ["'", '&#39;'] if options[:encode_quotes]
  substitutions << ["\u00A0", '&nbsp;'] if options[:encode_nbsp]

  substitutions.reduce(text) { |result, (raw, entity)| result.gsub(raw, entity) }
end

.has_entities?(text) ⇒ Boolean

Check if text contains HTML entities

Returns:

  • (Boolean)


109
110
111
112
113
# File 'lib/coradoc/html/entity.rb', line 109

# Check whether text contains at least one HTML character reference.
#
# Recognised forms:
# * named:       "&name;" where name starts with a letter and continues with
#                letters or digits (so "&frac12;" and "&sup2;" are detected)
# * decimal:     "&#169;"
# * hexadecimal: "&#xA9;" or "&#XA9;"
#
# The check is purely syntactic; the name is not looked up in any table.
#
# @param text [String, Object] value to inspect
# @return [Boolean] true when a reference is present; false otherwise,
#   including when +text+ is not a String
def has_entities?(text)
  return false unless text.is_a?(String)

  text.match?(/&(?:[a-zA-Z][a-zA-Z0-9]*|#\d+|#[xX][0-9a-fA-F]+);/)
end

.normalize(text) ⇒ Object

Normalize entities: decode all references, then re-encode markup characters and any character that has a named entity; all other characters are left as literal text



116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# File 'lib/coradoc/html/entity.rb', line 116

# Bring every character reference in the text to one canonical spelling.
#
# The text is fully decoded first, then rebuilt character by character:
# * "&", "<", ">", '"' and "'" go through encode with default options
# * every other character goes through to_named_entity, which yields the
#   named reference when one exists and the character itself otherwise
#
# @param text [String, nil, Object] text to normalize
# @return [String, Object] the normalized string; '' when +text+ is nil;
#   +text+ itself when it is not a String
def normalize(text)
  return '' if text.nil?
  return text unless text.is_a?(String)

  markup_chars = ['&', '<', '>', '"', "'"]

  decode(text).each_char.map do |char|
    markup_chars.include?(char) ? encode(char) : to_named_entity(char)
  end.join
end

.to_named_entity(char) ⇒ Object

Convert character to named entity if available



89
90
91
92
# File 'lib/coradoc/html/entity.rb', line 89

# Look up the named character reference for a character.
#
# @param char [String] the character to look up among the values of
#   NAMED_ENTITIES
# @return [String] "&name;" for the first entry whose value equals +char+,
#   or +char+ unchanged when no entry matches
def to_named_entity(char)
  name = NAMED_ENTITIES.key(char)
  return char unless name

  "&#{name};"
end

.to_numeric_entity(char, format: :decimal) ⇒ Object

Convert character to numeric entity



95
96
97
98
99
100
101
102
103
104
105
106
# File 'lib/coradoc/html/entity.rb', line 95

# Build the numeric character reference for a character.
#
# The code point is taken with String#ord, so only the first character of
# +char+ is considered.
#
# @param char [String] the character to convert
# @param format [Symbol] :decimal for "&#N;", :hex or :hexadecimal for
#   "&#xN;" with lowercase hex digits
# @return [String] the numeric reference, or +char+ unchanged when +format+
#   is not one of the recognised symbols
def to_numeric_entity(char, format: :decimal)
  codepoint = char.ord

  return "&##{codepoint};" if format == :decimal
  return "&#x#{codepoint.to_s(16)};" if %i[hex hexadecimal].include?(format)

  char
end