Class: Canon::Html::DataModel

Inherits:
DataModel show all
Defined in:
lib/canon/html/data_model.rb

Overview

Builds an XPath data model from HTML. Provides HTML-specific parsing with lowercase element/attribute names, whitespace-sensitive element handling, and fragment parsing.

Class Method Summary collapse

Class Method Details

.build_attribute_nodes(nokogiri_element, element) ⇒ Object

Build attribute nodes for an element



186
187
188
189
190
191
192
193
194
195
196
197
198
# File 'lib/canon/html/data_model.rb', line 186

# Build attribute nodes for an element.
#
# Walks +nokogiri_element.attributes+ and adds one
# Canon::Xml::Nodes::AttributeNode per attribute to +element+, carrying the
# attribute's name, value and (when the attribute has a namespace) the
# namespace href and prefix. Namespace declarations are not turned into
# attribute nodes.
#
# @param nokogiri_element [#attributes] source element
# @param element [#add_attribute] data-model element receiving the nodes
def self.build_attribute_nodes(nokogiri_element, element)
  nokogiri_element.attributes.each do |name, attr|
    # Skip only real namespace declarations ("xmlns" and "xmlns:prefix").
    # A plain prefix test would also drop ordinary attributes whose name
    # merely begins with those letters (e.g. "xmlnsfoo").
    next if name == "xmlns" || name.start_with?("xmlns:")

    namespace = attr.namespace
    attr_node = Canon::Xml::Nodes::AttributeNode.new(
      name: attr.name,
      value: attr.value,
      namespace_uri: namespace&.href,
      prefix: namespace&.prefix,
    )
    element.add_attribute(attr_node)
  end
end

.build_comment_node(nokogiri_comment) ⇒ Object

Build comment node from Nokogiri comment



227
228
229
# File 'lib/canon/html/data_model.rb', line 227

# Wrap the content of a Nokogiri comment in a data-model comment node.
#
# @param nokogiri_comment [#content] source comment
# @return [Canon::Xml::Nodes::CommentNode]
def self.build_comment_node(nokogiri_comment)
  comment_text = nokogiri_comment.content
  Canon::Xml::Nodes::CommentNode.new(value: comment_text)
end

.build_element_node(nokogiri_element) ⇒ Object

Build element node from Nokogiri element



124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# File 'lib/canon/html/data_model.rb', line 124

# Convert a Nokogiri element, including its subtree, into an ElementNode.
#
# The new node gets the element's name plus namespace href/prefix (nil when
# the element has no namespace). It is then populated in this order:
# namespace nodes, attribute nodes, child nodes. Children for which
# build_node_from_nokogiri returns nil are left out.
#
# @param nokogiri_element [#name, #namespace, #children] source element
# @return [Canon::Xml::Nodes::ElementNode]
def self.build_element_node(nokogiri_element)
  own_namespace = nokogiri_element.namespace

  Canon::Xml::Nodes::ElementNode.new(
    name: nokogiri_element.name,
    namespace_uri: own_namespace&.href,
    prefix: own_namespace&.prefix,
  ).tap do |element|
    # In-scope namespaces first (inherited ones included), then attributes.
    build_namespace_nodes(nokogiri_element, element)
    build_attribute_nodes(nokogiri_element, element)

    # Children are converted recursively and appended in source order.
    nokogiri_element.children.each do |source_child|
      converted = build_node_from_nokogiri(source_child)
      next unless converted

      element.add_child(converted)
    end
  end
end

.build_from_nokogiri(nokogiri_doc) ⇒ Object

Build XPath data model from Nokogiri document or fragment



80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# File 'lib/canon/html/data_model.rb', line 80

# Build the XPath data model for a Nokogiri document or fragment.
#
# When +nokogiri_doc+ is a Nokogiri::XML::Document with a root element, that
# element is converted and added to the RootNode first. The remaining
# top-level children (e.g. comments and PIs outside the document element)
# follow in their original order. Anything else is handled as a fragment:
# every top-level child is converted in order. DTD nodes are always skipped,
# as are children that build_node_from_nokogiri turns into nil.
#
# @param nokogiri_doc [#children] parsed document or document fragment
# @return [Canon::Xml::Nodes::RootNode]
def self.build_from_nokogiri(nokogiri_doc)
  root = Canon::Xml::Nodes::RootNode.new

  # Only real documents expose a single document element; fragments may
  # contain several top-level nodes and are walked as-is.
  document_element =
    nokogiri_doc.is_a?(Nokogiri::XML::Document) ? nokogiri_doc.root : nil
  root.add_child(build_element_node(document_element)) if document_element

  nokogiri_doc.children.each do |top_level|
    # The document element has already been added above.
    next if document_element && top_level == document_element
    next if top_level.is_a?(Nokogiri::XML::DTD)

    converted = build_node_from_nokogiri(top_level)
    root.add_child(converted) if converted
  end

  root
end

.build_namespace_nodes(nokogiri_element, element) ⇒ Object

Build namespace nodes for an element



147
148
149
150
151
152
153
154
155
156
157
158
# File 'lib/canon/html/data_model.rb', line 147

# Attach a NamespaceNode to +element+ for every namespace that
# collect_in_scope_namespaces reports for +nokogiri_element+.
#
# @param nokogiri_element source element passed to collect_in_scope_namespaces
# @param element [#add_namespace] data-model element receiving the nodes
def self.build_namespace_nodes(nokogiri_element, element)
  collect_in_scope_namespaces(nokogiri_element).each_pair do |prefix, uri|
    element.add_namespace(
      Canon::Xml::Nodes::NamespaceNode.new(prefix: prefix, uri: uri),
    )
  end
end

.build_node_from_nokogiri(nokogiri_node) ⇒ Object

Build node from Nokogiri node



110
111
112
113
114
115
116
117
118
119
120
121
# File 'lib/canon/html/data_model.rb', line 110

# Dispatch a Nokogiri node to the matching builder.
#
# Elements, text nodes, comments and processing instructions are handed to
# build_element_node, build_text_node, build_comment_node and build_pi_node
# respectively. Any other kind of node yields nil.
#
# @param nokogiri_node node to convert
# @return the builder's result, or nil when the node type is not handled
def self.build_node_from_nokogiri(nokogiri_node)
  if nokogiri_node.is_a?(Nokogiri::XML::Element)
    build_element_node(nokogiri_node)
  elsif nokogiri_node.is_a?(Nokogiri::XML::Text)
    build_text_node(nokogiri_node)
  elsif nokogiri_node.is_a?(Nokogiri::XML::Comment)
    build_comment_node(nokogiri_node)
  elsif nokogiri_node.is_a?(Nokogiri::XML::ProcessingInstruction)
    build_pi_node(nokogiri_node)
  end
end

.build_pi_node(nokogiri_pi) ⇒ Object

Build PI node from Nokogiri PI



232
233
234
235
236
237
# File 'lib/canon/html/data_model.rb', line 232

# Convert a Nokogiri processing instruction into a data-model PI node,
# using the instruction's name as the target and its content as the data.
#
# @param nokogiri_pi [#name, #content] source processing instruction
# @return [Canon::Xml::Nodes::ProcessingInstructionNode]
def self.build_pi_node(nokogiri_pi)
  pi_target = nokogiri_pi.name
  pi_data = nokogiri_pi.content
  Canon::Xml::Nodes::ProcessingInstructionNode.new(target: pi_target, data: pi_data)
end

.build_text_node(nokogiri_text) ⇒ Object

Build text node from Nokogiri text node. HTML-specific: handles whitespace-sensitive elements (pre, code, textarea, script, style) and preserves whitespace between inline element siblings.



203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
# File 'lib/canon/html/data_model.rb', line 203

# Convert a Nokogiri text node into a TextNode, dropping insignificant
# whitespace.
#
# Whitespace-only text directly inside an element is discarded (nil is
# returned) unless one of the following holds:
# * the text contains NBSP (U+00A0), which is never treated as insignificant
# * the parent is pre, code, textarea, script or style (case-insensitive)
# * WhitespaceSensitivity.inline_whitespace_significant? accepts the node
# All other text is kept verbatim.
#
# @param nokogiri_text [#content, #parent] source text node
# @return [Canon::Xml::Nodes::TextNode, nil]
def self.build_text_node(nokogiri_text)
  text = nokogiri_text.content

  # Candidate for removal: blank, parented by an element, and free of NBSP.
  removable = text.strip.empty? &&
    nokogiri_text.parent.is_a?(Nokogiri::XML::Element) &&
    !text.include?("\u00A0")

  if removable
    preserved =
      %w[pre code textarea script style].include?(nokogiri_text.parent.name.downcase) ||
      Canon::Comparison::WhitespaceSensitivity.inline_whitespace_significant?(nokogiri_text)
    return nil unless preserved
  end

  # Nokogiri has already converted CDATA and resolved entities at this point.
  Canon::Xml::Nodes::TextNode.new(value: text)
end

.collect_in_scope_namespaces(nokogiri_element) ⇒ Object

Collect all in-scope namespaces for an element



161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
# File 'lib/canon/html/data_model.rb', line 161

# Gather the namespace declarations in scope for an element.
#
# Starting at +nokogiri_element+ and following +parent+ links until a
# Nokogiri::XML::Document (or the end of the chain) is reached, records each
# element's namespace definitions keyed by prefix. The default namespace is
# stored under "". The first declaration found for a prefix wins, so a
# declaration nearer the element overrides an ancestor's. An "xml" entry is
# added when none was declared.
#
# @param nokogiri_element element whose in-scope namespaces are wanted
# @return [Hash{String => String}] prefix => namespace URI
def self.collect_in_scope_namespaces(nokogiri_element)
  in_scope = {}
  node = nokogiri_element

  # Climb from the element towards the document.
  until !node || node.is_a?(Nokogiri::XML::Document)
    if node.is_a?(Nokogiri::XML::Element)
      node.namespace_definitions.each do |definition|
        key = definition.prefix || ""
        # Keep the innermost declaration for each prefix.
        in_scope[key] = definition.href unless in_scope.key?(key)
      end
    end
    node = node.parent
  end

  # The xml prefix is always available.
  in_scope["xml"] ||= "http://www.w3.org/XML/1998/namespace"
  in_scope
end

.from_html(html_string, version: :html4) ⇒ Canon::Xml::Nodes::RootNode

Build XPath data model from HTML string

Parameters:

  • html_string (String)

    HTML content to parse

  • version (Symbol) (defaults to: :html4)

    HTML version (:html4 or :html5)

Returns:



16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# File 'lib/canon/html/data_model.rb', line 16

# Build XPath data model from an HTML string.
#
# Input that contains an <html> tag (matched case-insensitively) is treated
# as a full document: it is parsed with the document parser, and copies of
# the <body> children are collected into a fresh DocumentFragment. Going
# through the document parser avoids the fragment parser incorrectly moving
# head elements (like meta) into the body. If no <body> is found, or the
# input has no <html> tag, the fragment parser is used directly so that no
# implicit wrapper elements are added.
#
# HTML has no strict namespace requirements like XML, so no relative
# namespace URI check is performed before conversion.
#
# @param html_string [String] HTML content to parse
# @param version [Symbol] :html5 selects the HTML5 parser; any other value
#   (default :html4) selects the HTML4 parser
# @return the result of build_from_nokogiri for the parsed content
def self.from_html(html_string, version: :html4)
  html5 = version == :html5

  # Version-specific entry points, chosen once and reused below.
  parse_fragment = lambda do
    if html5
      Nokogiri::HTML5.fragment(html_string)
    else
      Nokogiri::HTML4.fragment(html_string)
    end
  end
  parse_document = lambda do
    if html5
      Nokogiri::HTML5(html_string)
    else
      Nokogiri::HTML4(html_string)
    end
  end
  new_fragment = lambda do |owner|
    if html5
      Nokogiri::HTML5::DocumentFragment.new(owner)
    else
      Nokogiri::HTML4::DocumentFragment.new(owner)
    end
  end

  parsed =
    if html_string.match?(%r{<html[\s>]}i)
      # Full document: parse it as such, then keep only the body content.
      document = parse_document.call
      body = document.at_css("body")
      if body
        container = new_fragment.call(document)
        body.children.each { |body_child| container.add_child(body_child.dup) }
        container
      else
        # No body found, fall back to fragment parsing.
        parse_fragment.call
      end
    else
      parse_fragment.call
    end

  # Convert to XPath data model (reuses the XML node infrastructure).
  build_from_nokogiri(parsed)
end

.parse(html_string, version: :html4) ⇒ Object

Alias for compatibility



68
69
70
# File 'lib/canon/html/data_model.rb', line 68

# Compatibility alias: forwards both arguments to from_html unchanged.
#
# @param html_string HTML content to parse
# @param version [Symbol] passed through as from_html's +version+ (default :html4)
# @return whatever from_html returns
def self.parse(html_string, version: :html4)
  data_model = from_html(html_string, version: version)
  data_model
end

.serialize(node) ⇒ Object

Serialize HTML node to string



73
74
75
76
77
# File 'lib/canon/html/data_model.rb', line 73

# Serialize an HTML node to a string.
#
# HTML nodes share the XML serialization, so this simply delegates to
# Canon::Xml::DataModel.serialize.
#
# @param node node to serialize
# @return whatever Canon::Xml::DataModel.serialize returns for +node+
def self.serialize(node)
  xml_data_model = Canon::Xml::DataModel
  xml_data_model.serialize(node)
end