Module: Canon::TreeDiff::Core::XmlEntityDecoder

Defined in:
lib/canon/tree_diff/core/xml_entity_decoder.rb

Overview

Decodes XML entity references to Unicode characters.

Handles:

  • Named entities: &amp; &lt; &gt; &quot; &apos;

  • Decimal numeric entities: &#digits;

  • Hexadecimal numeric entities: &#xH+;

Constant Summary collapse

# Matches one complete entity reference, including the leading "&" and the
# trailing ";": one of the five named entities (amp, lt, gt, quot, apos), a
# decimal numeric reference (#digits), or a hexadecimal numeric reference
# (#x or #X followed by hex digits).
ENTITY_PATTERN =
/&(?:amp|lt|gt|quot|apos|#[0-9]+|#[xX][0-9a-fA-F]+);/

Class Method Summary collapse

Class Method Details

.decode_codepoint(code_point) ⇒ Object



42
43
44
45
46
47
48
# File 'lib/canon/tree_diff/core/xml_entity_decoder.rb', line 42

# Converts a numeric code point into the corresponding UTF-8 character.
#
# Returns an empty string when the code point cannot be represented as a
# valid character:
#   * zero, negative, or greater than 0x10FFFF (outside the Unicode range);
#   * a UTF-16 surrogate (0xD800..0xDFFF). Surrogates are not Unicode scalar
#     values, and packing one with "U" yields a string that is not valid
#     UTF-8, so they are rejected rather than emitted.
#
# @param code_point [Integer] the code point to decode
# @return [String] a one-character string, or "" if the code point is invalid
def decode_codepoint(code_point)
  return "" unless code_point.positive? && code_point <= 0x10FFFF
  # Surrogate halves would produce an invalidly encoded string.
  return "" if code_point.between?(0xD800, 0xDFFF)

  [code_point].pack("U")
end

.decode_entity(entity) ⇒ Object



24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# File 'lib/canon/tree_diff/core/xml_entity_decoder.rb', line 24

# Decodes a single entity reference (e.g. "&amp;" or "&#x41;").
#
# The leading "&" and trailing ";" are stripped and the remainder is
# resolved as a named entity, a decimal numeric reference, or a hexadecimal
# numeric reference ("#x" or "#X" prefix). Numeric references are handed to
# decode_codepoint. Anything unrecognized is returned unchanged.
#
# @param entity [String] the full entity reference text
# @return [String] the decoded text, or +entity+ itself when not recognized
def decode_entity(entity)
  body = entity[1..-2]

  named = {
    "amp" => "&",
    "lt" => "<",
    "gt" => ">",
    "quot" => '"',
    "apos" => "'",
  }[body]
  return named if named

  if (decimal = /\A#([0-9]+)\z/.match(body))
    decode_codepoint(decimal[1].to_i)
  elsif (hex = /\A#[xX]([0-9a-fA-F]+)\z/.match(body))
    decode_codepoint(hex[1].to_i(16))
  else
    entity
  end
end

.decode_xml_entities(text) ⇒ Object



17
18
19
20
21
22
# File 'lib/canon/tree_diff/core/xml_entity_decoder.rb', line 17

# Replaces every entity reference matched by ENTITY_PATTERN in +text+ with
# the result of decode_entity.
#
# The input is returned as-is when it is nil, empty, or contains no "&",
# since there is nothing to substitute in those cases.
#
# @param text [String, nil] text that may contain entity references
# @return [String, nil] the decoded text, or +text+ itself when no work is needed
def decode_xml_entities(text)
  nothing_to_decode = text.nil? || text.empty? || !text.include?("&")
  return text if nothing_to_decode

  text.gsub(ENTITY_PATTERN) { |reference| decode_entity(reference) }
end