Module: Coradoc::Html::Entity
- Defined in:
- lib/coradoc/html/entity.rb
Overview
HTML entity handling
Constant Summary
- NAMED_ENTITIES =
Named HTML entities
# Lookup table from entity name (written without the surrounding "&" and ";")
# to the character that entity stands for. The hash itself is frozen.
{ 'nbsp' => "\u00A0", 'lt' => '<', 'gt' => '>', 'amp' => '&', 'quot' => '"', 'apos' => "'", 'cent' => "\u00A2", 'pound' => "\u00A3", 'yen' => "\u00A5", 'euro' => "\u20AC", 'copy' => "\u00A9", 'reg' => "\u00AE", 'trade' => "\u2122", 'mdash' => "\u2014", 'ndash' => "\u2013", 'hellip' => "\u2026", 'laquo' => "\u00AB", 'raquo' => "\u00BB", 'ldquo' => "\u201C", 'rdquo' => "\u201D", 'lsquo' => "\u2018", 'rsquo' => "\u2019" }.freeze
Class Method Summary
-
.decode(text) ⇒ Object
Decode HTML entities to text.
-
.encode(text, options = {}) ⇒ Object
Encode text to HTML entities.
-
.has_entities?(text) ⇒ Boolean
Check if text contains HTML entities.
-
.normalize(text) ⇒ Object
Normalize entities (decode everything, then re-encode markup characters and any character that has a named entity; all other characters are left as literal text).
-
.to_named_entity(char) ⇒ Object
Convert character to named entity if available.
-
.to_numeric_entity(char, format: :decimal) ⇒ Object
Convert character to numeric entity.
Class Method Details
.decode(text) ⇒ Object
Decode HTML entities to text
57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 |
# File 'lib/coradoc/html/entity.rb', line 57
# Decode HTML entities in +text+ to the characters they represent.
#
# Named references are looked up in NAMED_ENTITIES; decimal (&#NNN;) and
# hexadecimal (&#xHHH;) references are converted from their code point.
# Every reference is resolved exactly once in a single left-to-right pass, so
# already-escaped input such as "&amp;lt;" decodes to "&lt;" instead of being
# decoded a second time into "<".
#
# @param text [String, nil, Object] the text to decode
# @return [String, Object] '' for nil, +text+ itself when it is not a String,
#   otherwise a new String with the entities decoded
def decode(text)
  return '' if text.nil?
  return text unless text.is_a?(String)

  text.gsub(/&(?:#(\d+)|#x([0-9a-fA-F]+)|([a-zA-Z][a-zA-Z0-9]*));/) do |entity|
    decimal, hex, name = Regexp.last_match.captures

    if name
      # Names missing from the table are left exactly as written.
      NAMED_ENTITIES.fetch(name, entity)
    else
      codepoint = decimal ? decimal.to_i : hex.to_i(16)
      # Surrogates and values beyond U+10FFFF are not valid characters;
      # packing them would yield invalid UTF-8 (or raise), so keep the
      # reference untouched instead.
      valid = codepoint <= 0x10FFFF && !(0xD800..0xDFFF).cover?(codepoint)
      valid ? [codepoint].pack('U') : entity
    end
  end
end
.encode(text, options = {}) ⇒ Object
Encode text to HTML entities
35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 |
# File 'lib/coradoc/html/entity.rb', line 35
# Encode the HTML-significant characters of +text+ as entities.
#
# "&", "<", ">" and the double quote are always encoded. All replacements are
# applied in one pass, so the "&" of a freshly produced entity is never
# encoded again.
#
# @param text [String, nil, Object] the text to encode
# @param options [Hash] optional extras
# @option options [Boolean] :encode_quotes also encode the apostrophe
# @option options [Boolean] :encode_nbsp also encode U+00A0 as &nbsp;
# @return [String, Object] '' for nil, +text+ itself when it is not a String,
#   otherwise a new encoded String
def encode(text, options = {})
  return '' if text.nil?
  return text unless text.is_a?(String)

  replacements = { '&' => '&amp;', '<' => '&lt;', '>' => '&gt;', '"' => '&quot;' }
  # NOTE(review): the apostrophe replacement literal is not recoverable from
  # the rendered listing; &#39; is used because it is valid in both HTML 4 and
  # HTML 5 - confirm against the real source file.
  replacements["'"] = '&#39;' if options[:encode_quotes]
  replacements["\u00A0"] = '&nbsp;' if options[:encode_nbsp]

  text.gsub(Regexp.union(replacements.keys), replacements)
end
.has_entities?(text) ⇒ Boolean
Check if text contains HTML entities
109 110 111 112 113 |
# File 'lib/coradoc/html/entity.rb', line 109
# Check whether +text+ contains at least one HTML entity reference.
#
# Recognises named references (&name;), decimal references (&#NNN;) and
# hexadecimal references (&#xHHH;). Names may contain digits after the first
# letter so that entities such as &frac12; or &sup2; are detected as well.
#
# @param text [Object] the value to inspect
# @return [Boolean] true when +text+ is a String containing an entity
#   reference, false otherwise (including for non-String input)
def has_entities?(text)
  return false unless text.is_a?(String)

  text.match?(/&(?:[a-zA-Z][a-zA-Z0-9]*|#\d+|#x[0-9a-fA-F]+);/)
end
.normalize(text) ⇒ Object
Normalize entities (decode everything, then re-encode markup characters and any character that has a named entity; all other characters are left as literal text)
116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 |
# File 'lib/coradoc/html/entity.rb', line 116
# Bring the entities in +text+ into one canonical form.
#
# The text is fully decoded first. Each resulting character is then either
# passed through +encode+ (for "&", "<", ">", the double quote and the
# apostrophe) or through +to_named_entity+, which returns the character
# unchanged when it has no named entity.
#
# @param text [String, nil, Object] the text to normalize
# @return [String, Object] '' for nil, +text+ itself when it is not a String,
#   otherwise the normalized String
def normalize(text)
  return '' if text.nil?
  return text unless text.is_a?(String)

  markup_chars = ['&', '<', '>', '"', "'"]

  pieces = decode(text).chars.map do |char|
    markup_chars.include?(char) ? encode(char) : to_named_entity(char)
  end
  pieces.join
end
.to_named_entity(char) ⇒ Object
Convert character to named entity if available
89 90 91 92 |
# File 'lib/coradoc/html/entity.rb', line 89
# Look up the named entity for a character.
#
# @param char [String] the character to look up
# @return [String] "&name;" when NAMED_ENTITIES has an entry whose value is
#   +char+, otherwise +char+ unchanged
def to_named_entity(char)
  name = NAMED_ENTITIES.key(char)
  return char unless name

  "&#{name};"
end
.to_numeric_entity(char, format: :decimal) ⇒ Object
Convert character to numeric entity
95 96 97 98 99 100 101 102 103 104 105 106 |
# File 'lib/coradoc/html/entity.rb', line 95
# Convert a character to a numeric entity reference.
#
# Only the first character of +char+ is used, because the code point is
# obtained with +ord+.
#
# @param char [String, nil] the character to convert
# @param format [Symbol] :decimal for "&#NNN;", :hex or :hexadecimal for
#   "&#xhhh;" (lowercase hex digits)
# @return [String] the numeric entity, '' for nil or empty input, or +char+
#   unchanged when +format+ is not recognised
def to_numeric_entity(char, format: :decimal)
  # ord raises on nil and on an empty string; return an empty result instead,
  # matching how the sibling methods treat nil.
  return '' if char.nil? || char == ''

  codepoint = char.ord

  case format
  when :decimal
    "&##{codepoint};"
  when :hex, :hexadecimal
    "&#x#{codepoint.to_s(16)};"
  else
    char
  end
end