Class: Canon::Html::DataModel

Inherits:
DataModel show all
Defined in:
lib/canon/html/data_model.rb

Overview

Builds an XPath data model from HTML. Provides HTML-specific parsing with lowercase element/attribute names, whitespace-sensitive element handling, and fragment parsing.

Class Method Summary collapse

Class Method Details

.build_attribute_nodes(nokogiri_element, element) ⇒ Object

Build attribute nodes for an element



195
196
197
198
199
200
201
202
203
204
205
206
207
# File 'lib/canon/html/data_model.rb', line 195

# Build attribute nodes for an element.
#
# Every attribute of +nokogiri_element+ is copied onto +element+ as a
# Canon::Xml::Nodes::AttributeNode, except namespace declarations.
#
# @param nokogiri_element [Nokogiri::XML::Element] source element; its
#   +attributes+ are iterated as name/attribute pairs
# @param element [#add_attribute] target node that receives the attribute nodes
# @return [Object] the result of iterating the attributes; callers in this
#   file ignore it
def self.build_attribute_nodes(nokogiri_element, element)
  nokogiri_element.attributes.each do |name, attr|
    # A namespace declaration is exactly "xmlns" (default namespace) or
    # "xmlns:<prefix>". A bare prefix match would also discard ordinary
    # attributes whose names merely begin with those letters ("xmlnsfoo").
    next if name == "xmlns" || name.start_with?("xmlns:")

    attr_ns = attr.namespace
    element.add_attribute(
      Canon::Xml::Nodes::AttributeNode.new(
        name: attr.name,
        value: attr.value,
        namespace_uri: attr_ns&.href,
        prefix: attr_ns&.prefix,
      ),
    )
  end
end

.build_comment_node(nokogiri_comment) ⇒ Object

Build comment node from Nokogiri comment



231
232
233
# File 'lib/canon/html/data_model.rb', line 231

# Wrap the text of a Nokogiri comment in a Canon comment node.
#
# @param nokogiri_comment [Nokogiri::XML::Comment] comment whose +content+
#   becomes the node value
# @return [Canon::Xml::Nodes::CommentNode]
def self.build_comment_node(nokogiri_comment)
  comment_text = nokogiri_comment.content
  Canon::Xml::Nodes::CommentNode.new(value: comment_text)
end

.build_element_node(nokogiri_element) ⇒ Object

Build element node from Nokogiri element



133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
# File 'lib/canon/html/data_model.rb', line 133

# Convert a Nokogiri element, including its subtree, into an element node.
#
# The new node gets the element's name and namespace (URI and prefix, both
# nil when the element has no namespace), followed by its namespace nodes,
# its attribute nodes and finally its converted children, in that order.
#
# @param nokogiri_element [Nokogiri::XML::Element] element to convert
# @return [Canon::Xml::Nodes::ElementNode] the populated element node
def self.build_element_node(nokogiri_element)
  Canon::Xml::Nodes::ElementNode.new(
    name: nokogiri_element.name,
    namespace_uri: nokogiri_element.namespace&.href,
    prefix: nokogiri_element.namespace&.prefix,
  ).tap do |built|
    # Namespace nodes first (own and inherited declarations), then attributes.
    build_namespace_nodes(nokogiri_element, built)
    build_attribute_nodes(nokogiri_element, built)

    # Children are converted one by one; a nil result means the child was
    # dropped by the converter and is not attached.
    nokogiri_element.children.each do |kid|
      converted = build_node_from_nokogiri(kid)
      next unless converted

      built.add_child(converted)
    end
  end
end

.build_from_nokogiri(nokogiri_doc) ⇒ Object

Build XPath data model from Nokogiri document or fragment



89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
# File 'lib/canon/html/data_model.rb', line 89

# Build the XPath data model for a parsed Nokogiri document or fragment.
#
# When the input exposes a non-nil +root+ (a document), that element is
# converted and attached first; the remaining top-level children (comments,
# processing instructions, ...) follow in their original order. Inputs
# without a root (fragments) contribute all of their top-level children.
# DTD nodes are always skipped, as are children the converter returns nil for.
#
# @param nokogiri_doc [#children] parsed document or document fragment
# @return [Canon::Xml::Nodes::RootNode] root of the resulting tree
def self.build_from_nokogiri(nokogiri_doc)
  tree = Canon::Xml::Nodes::RootNode.new

  # Fragments either do not respond to #root or have none.
  doc_element = nokogiri_doc.respond_to?(:root) ? nokogiri_doc.root : nil
  tree.add_child(build_element_node(doc_element)) if doc_element

  nokogiri_doc.children.each do |child|
    # The document element was already attached above.
    next if doc_element && child == doc_element
    next if child.is_a?(Nokogiri::XML::DTD)

    converted = build_node_from_nokogiri(child)
    tree.add_child(converted) if converted
  end

  tree
end

.build_namespace_nodes(nokogiri_element, element) ⇒ Object

Build namespace nodes for an element



156
157
158
159
160
161
162
163
164
165
166
167
# File 'lib/canon/html/data_model.rb', line 156

# Attach one namespace node per in-scope namespace to +element+.
#
# The prefix => URI pairs come from collect_in_scope_namespaces and are
# added in the order that method yields them.
#
# @param nokogiri_element [Nokogiri::XML::Element] element whose in-scope
#   namespaces are collected
# @param element [#add_namespace] target node receiving the namespace nodes
# @return [Object] the collection that was iterated; callers in this file
#   ignore it
def self.build_namespace_nodes(nokogiri_element, element)
  collect_in_scope_namespaces(nokogiri_element).each do |prefix, uri|
    element.add_namespace(
      Canon::Xml::Nodes::NamespaceNode.new(prefix: prefix, uri: uri),
    )
  end
end

.build_node_from_nokogiri(nokogiri_node) ⇒ Object

Build node from Nokogiri node



119
120
121
122
123
124
125
126
127
128
129
130
# File 'lib/canon/html/data_model.rb', line 119

# Dispatch a Nokogiri node to the builder matching its type.
#
# Elements, text nodes, comments and processing instructions are converted;
# any other node type yields nil.
#
# @param nokogiri_node [Object] node to convert
# @return [Object, nil] the built node, or nil for unsupported node types
def self.build_node_from_nokogiri(nokogiri_node)
  if nokogiri_node.is_a?(Nokogiri::XML::Element)
    build_element_node(nokogiri_node)
  elsif nokogiri_node.is_a?(Nokogiri::XML::Text)
    build_text_node(nokogiri_node)
  elsif nokogiri_node.is_a?(Nokogiri::XML::Comment)
    build_comment_node(nokogiri_node)
  elsif nokogiri_node.is_a?(Nokogiri::XML::ProcessingInstruction)
    build_pi_node(nokogiri_node)
  end
end

.build_pi_node(nokogiri_pi) ⇒ Object

Build PI node from Nokogiri PI



236
237
238
239
240
241
# File 'lib/canon/html/data_model.rb', line 236

# Convert a Nokogiri processing instruction into a PI node.
#
# @param nokogiri_pi [Nokogiri::XML::ProcessingInstruction] source PI; its
#   +name+ becomes the target and its +content+ becomes the data
# @return [Canon::Xml::Nodes::ProcessingInstructionNode]
def self.build_pi_node(nokogiri_pi)
  pi_target = nokogiri_pi.name
  pi_data = nokogiri_pi.content
  Canon::Xml::Nodes::ProcessingInstructionNode.new(target: pi_target, data: pi_data)
end

.build_text_node(nokogiri_text) ⇒ Object

Build text node from Nokogiri text node. HTML-specific: handles whitespace-sensitive elements (pre, code, textarea, script, style).



211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
# File 'lib/canon/html/data_model.rb', line 211

# Convert a Nokogiri text node into a text node, dropping insignificant
# whitespace.
#
# Text consisting only of whitespace is discarded when its parent is an
# element, unless that element is one of pre, code, textarea, script or
# style (matched case-insensitively), where whitespace is significant.
# All other text is kept with the content exactly as Nokogiri reports it.
#
# @param nokogiri_text [Nokogiri::XML::Text] text node to convert
# @return [Canon::Xml::Nodes::TextNode, nil] the text node, or nil when the
#   text is skipped
def self.build_text_node(nokogiri_text)
  text = nokogiri_text.content

  # The parent is only consulted for whitespace-only text.
  if text.strip.empty?
    owner = nokogiri_text.parent
    if owner.is_a?(Nokogiri::XML::Element)
      significant = %w[pre code textarea script style].include?(owner.name.downcase)
      return nil unless significant
    end
  end

  Canon::Xml::Nodes::TextNode.new(value: text)
end

.collect_in_scope_namespaces(nokogiri_element) ⇒ Object

Collect all in-scope namespaces for an element



170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
# File 'lib/canon/html/data_model.rb', line 170

# Gather the namespaces in scope for an element.
#
# Walks from the element up through its ancestors, stopping at the document
# (or when there is no parent), and records every namespace declaration
# found on elements along the way. The default namespace is stored under
# the empty-string prefix. A declaration nearer to the starting element
# wins over one with the same prefix further up. The "xml" prefix is always
# present in the result.
#
# @param nokogiri_element [Nokogiri::XML::Element] element to start from
# @return [Hash{String => String}] prefix => namespace URI
def self.collect_in_scope_namespaces(nokogiri_element)
  in_scope = {}
  node = nokogiri_element

  while node
    break if node.is_a?(Nokogiri::XML::Document)

    if node.is_a?(Nokogiri::XML::Element)
      node.namespace_definitions.each do |decl|
        key = decl.prefix || ""
        # First one seen is the innermost; outer declarations never replace it.
        in_scope[key] = decl.href unless in_scope.key?(key)
      end
    end

    node = node.parent
  end

  # The xml prefix is bound implicitly, so make sure it is present.
  in_scope["xml"] = "http://www.w3.org/XML/1998/namespace" unless in_scope["xml"]

  in_scope
end

.from_html(html_string, version: :html4) ⇒ Canon::Xml::Nodes::RootNode

Build XPath data model from HTML string

Parameters:

  • html_string (String)

    HTML content to parse

  • version (Symbol) (defaults to: :html4)

    HTML version (:html4 or :html5)

Returns:

  • (Canon::Xml::Nodes::RootNode)



24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# File 'lib/canon/html/data_model.rb', line 24

# Build the XPath data model from an HTML string.
#
# Input containing an <html> tag is treated as a full document: it is parsed
# with the document parser and, when a <body> is found, copies of the body's
# children are placed into a fresh fragment. Head content is therefore not
# part of the result; the source notes this avoids the fragment parser
# moving head elements (such as meta) into the body. Everything else, and
# documents without a body, go through the fragment parser so that no
# implicit wrapper elements are added.
#
# @param html_string [String] HTML content to parse
# @param version [Symbol] :html5 selects the HTML5 parser; any other value
#   selects the HTML4 parser
# @return [Canon::Xml::Nodes::RootNode] presumably, as produced by
#   build_from_nokogiri
def self.from_html(html_string, version: :html4)
  html5 = version == :html5

  # Only input with an <html> tag is parsed as a document to locate its body.
  body = nil
  if html_string.match?(%r{<html[\s>]}i)
    full_doc = html5 ? Nokogiri::HTML5(html_string) : Nokogiri::HTML4(html_string)
    body = full_doc.at_css("body")
  end

  doc =
    if body
      # Copy the body's children into a fragment owned by the parsed document.
      frag = if html5
               Nokogiri::HTML5::DocumentFragment.new(full_doc)
             else
               Nokogiri::HTML4::DocumentFragment.new(full_doc)
             end
      body.children.each { |child| frag.add_child(child.dup) }
      frag
    elsif html5
      # Plain fragment, or a document in which no body was found.
      Nokogiri::HTML5.fragment(html_string)
    else
      Nokogiri::HTML4.fragment(html_string)
    end

  # HTML has no strict namespace requirements like XML, so there is no
  # relative namespace URI check before conversion.
  build_from_nokogiri(doc)
end

.parse(html_string, version: :html4) ⇒ Object

Alias for compatibility



76
77
78
# File 'lib/canon/html/data_model.rb', line 76

# Compatibility alias for from_html.
#
# @param html_string [String] HTML content to parse
# @param version [Symbol] HTML version, forwarded unchanged (defaults to :html4)
# @return [Object] whatever from_html returns
def self.parse(html_string, version: :html4)
  from_html(
    html_string,
    version: version,
  )
end

.serialize(node) ⇒ Object

Serialize HTML node to string



81
82
83
84
85
86
# File 'lib/canon/html/data_model.rb', line 81

# Serialize a data-model node to a string.
#
# HTML nodes share the XML serializer: this method requires
# "../xml/data_model" (relative to this file) and forwards +node+ unchanged
# to Canon::Xml::DataModel.serialize.
#
# @param node [Object] node to serialize; passed through as-is
# @return [Object] whatever Canon::Xml::DataModel.serialize returns
#   (presumably a String -- confirm against the XML implementation)
def self.serialize(node)
  # HTML nodes use the same serialization as XML
  # Delegate to XML serialization implementation
  # NOTE(review): the require sits inside the method, presumably to defer
  # loading or avoid a circular require -- confirm before hoisting it.
  require_relative "../xml/data_model"
  Canon::Xml::DataModel.serialize(node)
end