Class: Canon::Html::DataModel

Inherits:
DataModel show all
Defined in:
lib/canon/html/data_model.rb

Overview

Builds an XPath data model from HTML. Provides HTML-specific parsing with lowercase element/attribute names, whitespace-sensitive element handling, and fragment parsing.

Class Method Summary collapse

Class Method Details

.build_attribute_nodes(nokogiri_element, element) ⇒ Object

Build attribute nodes for an element



173
174
175
176
177
178
179
180
181
182
183
184
185
# File 'lib/canon/html/data_model.rb', line 173

# Build attribute nodes for an element and attach them to +element+.
#
# Namespace declarations are skipped: an entry is ignored when its key is
# exactly "xmlns" or begins with "xmlns:". An ordinary attribute whose name
# merely starts with the letters "xmlns" (for example "xmlnsfoo") is kept.
#
# @param nokogiri_element [#attributes] source element; +attributes+ must
#   yield name/attribute pairs
# @param element [#add_attribute] data-model element receiving the nodes
# @return [Object] whatever +attributes.each+ returns
def self.build_attribute_nodes(nokogiri_element, element)
  nokogiri_element.attributes.each do |name, attr|
    # Match namespace declarations exactly, not by bare string prefix,
    # so unrelated attributes are not silently dropped.
    next if name == "xmlns" || name.start_with?("xmlns:")

    attr_namespace = attr.namespace
    attr_node = Canon::Xml::Nodes::AttributeNode.new(
      name: attr.name,
      value: attr.value,
      namespace_uri: attr_namespace&.href,
      prefix: attr_namespace&.prefix,
    )
    element.add_attribute(attr_node)
  end
end

.build_comment_node(nokogiri_comment) ⇒ Object

Build comment node from Nokogiri comment



209
210
211
# File 'lib/canon/html/data_model.rb', line 209

# Wrap the text of a Nokogiri comment in a data-model comment node.
#
# @param nokogiri_comment [#content] source comment
# @return [Canon::Xml::Nodes::CommentNode] node whose value is the
#   comment's content
def self.build_comment_node(nokogiri_comment)
  comment_text = nokogiri_comment.content
  Canon::Xml::Nodes::CommentNode.new(value: comment_text)
end

.build_element_node(nokogiri_element) ⇒ Object

Build element node from Nokogiri element



111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# File 'lib/canon/html/data_model.rb', line 111

# Convert a Nokogiri element, including its subtree, into a data-model
# element node.
#
# The new node is populated in a fixed order: namespace nodes first, then
# attribute nodes, then children. Children for which
# build_node_from_nokogiri returns nil are left out.
#
# @param nokogiri_element [#name, #namespace, #children] source element
# @return [Canon::Xml::Nodes::ElementNode] the populated element node
def self.build_element_node(nokogiri_element)
  source_ns = nokogiri_element.namespace

  Canon::Xml::Nodes::ElementNode.new(
    name: nokogiri_element.name,
    namespace_uri: source_ns&.href,
    prefix: source_ns&.prefix,
  ).tap do |built|
    build_namespace_nodes(nokogiri_element, built)
    build_attribute_nodes(nokogiri_element, built)

    nokogiri_element.children.each do |source_child|
      converted = build_node_from_nokogiri(source_child)
      next unless converted

      built.add_child(converted)
    end
  end
end

.build_from_nokogiri(nokogiri_doc) ⇒ Object

Build XPath data model from Nokogiri document or fragment



67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# File 'lib/canon/html/data_model.rb', line 67

# Build the XPath data model for a parsed Nokogiri document or fragment.
#
# When the input responds to +root+ and has a root element, that element is
# converted first and the remaining top-level children (comments,
# processing instructions, ...) are appended after it. Otherwise every
# top-level child is converted in order. DTD nodes are always skipped, as
# are children for which build_node_from_nokogiri returns nil.
#
# NOTE(review): because the root element is appended first, top-level nodes
# that precede it in the source end up after it in the result -- confirm
# this ordering is intended.
#
# @param nokogiri_doc [#children] parsed document or document fragment
# @return [Canon::Xml::Nodes::RootNode] root of the data model
def self.build_from_nokogiri(nokogiri_doc)
  tree = Canon::Xml::Nodes::RootNode.new

  # Only documents expose a single root element; fragments do not.
  doc_element = nokogiri_doc.root if nokogiri_doc.respond_to?(:root)
  tree.add_child(build_element_node(doc_element)) if doc_element

  nokogiri_doc.children.each do |top_level|
    # The root element (if any) has already been added above.
    next if doc_element && top_level == doc_element
    next if top_level.is_a?(Nokogiri::XML::DTD)

    converted = build_node_from_nokogiri(top_level)
    tree.add_child(converted) if converted
  end

  tree
end

.build_namespace_nodes(nokogiri_element, element) ⇒ Object

Build namespace nodes for an element



134
135
136
137
138
139
140
141
142
143
144
145
# File 'lib/canon/html/data_model.rb', line 134

# Attach one namespace node to +element+ for every namespace that
# collect_in_scope_namespaces reports for +nokogiri_element+.
#
# @param nokogiri_element [Object] source element whose in-scope
#   namespaces are collected
# @param element [#add_namespace] data-model element receiving the nodes
# @return [Object] the collection returned by collect_in_scope_namespaces
def self.build_namespace_nodes(nokogiri_element, element)
  collect_in_scope_namespaces(nokogiri_element).each do |ns_prefix, ns_uri|
    element.add_namespace(
      Canon::Xml::Nodes::NamespaceNode.new(prefix: ns_prefix, uri: ns_uri),
    )
  end
end

.build_node_from_nokogiri(nokogiri_node) ⇒ Object

Build node from Nokogiri node



97
98
99
100
101
102
103
104
105
106
107
108
# File 'lib/canon/html/data_model.rb', line 97

# Dispatch a Nokogiri node to the builder matching its class.
#
# Classes are tested in this order: Element, Text, Comment,
# ProcessingInstruction. Any other kind of node yields nil.
#
# @param nokogiri_node [Object] node to convert
# @return [Object, nil] the built data-model node, or nil when the node's
#   class is not one of the four handled here
def self.build_node_from_nokogiri(nokogiri_node)
  return build_element_node(nokogiri_node) if nokogiri_node.is_a?(Nokogiri::XML::Element)
  return build_text_node(nokogiri_node) if nokogiri_node.is_a?(Nokogiri::XML::Text)
  return build_comment_node(nokogiri_node) if nokogiri_node.is_a?(Nokogiri::XML::Comment)
  return build_pi_node(nokogiri_node) if nokogiri_node.is_a?(Nokogiri::XML::ProcessingInstruction)

  nil
end

.build_pi_node(nokogiri_pi) ⇒ Object

Build PI node from Nokogiri PI



214
215
216
217
218
219
# File 'lib/canon/html/data_model.rb', line 214

# Convert a Nokogiri processing instruction into a data-model PI node.
#
# @param nokogiri_pi [#name, #content] source processing instruction; its
#   name becomes the target and its content becomes the data
# @return [Canon::Xml::Nodes::ProcessingInstructionNode]
def self.build_pi_node(nokogiri_pi)
  pi_target = nokogiri_pi.name
  pi_data = nokogiri_pi.content

  Canon::Xml::Nodes::ProcessingInstructionNode.new(target: pi_target, data: pi_data)
end

.build_text_node(nokogiri_text) ⇒ Object

Build text node from Nokogiri text node. HTML-specific: handles whitespace-sensitive elements (pre, code, textarea, script, style).



189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
# File 'lib/canon/html/data_model.rb', line 189

# Convert a Nokogiri text node into a data-model text node.
#
# Whitespace-only text is dropped (nil is returned) when its parent is an
# element, unless that parent is one of pre, code, textarea, script or
# style (compared case-insensitively), where whitespace is significant.
# Whitespace-only text whose parent is not an element is kept.
#
# NOTE(review): only the direct parent is inspected, so whitespace-only
# text nested deeper inside e.g. a <pre> is still dropped -- confirm this
# is intended.
#
# @param nokogiri_text [#content, #parent] source text node
# @return [Canon::Xml::Nodes::TextNode, nil] the text node, or nil when the
#   text is insignificant whitespace
def self.build_text_node(nokogiri_text)
  text = nokogiri_text.content

  if text.strip.empty?
    holder = nokogiri_text.parent
    if holder.is_a?(Nokogiri::XML::Element) &&
       !%w[pre code textarea script style].include?(holder.name.downcase)
      return
    end
  end

  # The content is used as delivered by Nokogiri; no further decoding here.
  Canon::Xml::Nodes::TextNode.new(value: text)
end

.collect_in_scope_namespaces(nokogiri_element) ⇒ Object

Collect all in-scope namespaces for an element



148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
# File 'lib/canon/html/data_model.rb', line 148

# Collect every namespace in scope for an element as a prefix => URI hash.
#
# Starts at +nokogiri_element+ and follows +parent+ links until there is
# no parent or a Document is reached. Declarations found nearer the
# starting element win over those on ancestors. A nil prefix (default
# namespace) is stored under the empty string. The "xml" prefix is filled
# in with the standard XML namespace URI when no truthy value is present.
#
# @param nokogiri_element [#parent] element to start from
# @return [Hash{String => String}] in-scope namespaces keyed by prefix
def self.collect_in_scope_namespaces(nokogiri_element)
  in_scope = {}
  node = nokogiri_element

  while node
    break if node.is_a?(Nokogiri::XML::Document)

    if node.is_a?(Nokogiri::XML::Element)
      node.namespace_definitions.each do |definition|
        key = definition.prefix || ""
        # First writer wins: nearer declarations shadow ancestors'.
        in_scope[key] = definition.href unless in_scope.key?(key)
      end
    end

    node = node.parent
  end

  in_scope["xml"] ||= "http://www.w3.org/XML/1998/namespace"
  in_scope
end

.from_html(html_string, version: :html4) ⇒ Canon::Xml::Nodes::RootNode

Build XPath data model from HTML string

Parameters:

  • html_string (String)

    HTML content to parse

  • version (Symbol) (defaults to: :html4)

    HTML version (:html4 or :html5)

Returns:



24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# File 'lib/canon/html/data_model.rb', line 24

# Build an XPath data model from an HTML string.
#
# The markup is always parsed with Nokogiri's fragment parser, whether or
# not it contains an <html> tag. Nokogiri::HTML5 is used when +version+ is
# :html5; any other value selects Nokogiri::HTML4.
#
# The input is not pre-scanned, so it reaches the parser unmodified and
# this method adds no failure modes of its own (for example on strings
# holding bytes that are invalid for their encoding).
#
# @param html_string [String] HTML content to parse
# @param version [Symbol] HTML version (:html4 or :html5)
# @return [Canon::Xml::Nodes::RootNode] root of the data model, as returned
#   by build_from_nokogiri
def self.from_html(html_string, version: :html4)
  parser = version == :html5 ? Nokogiri::HTML5 : Nokogiri::HTML4

  # Convert to the XPath data model (reuses the XML node infrastructure).
  build_from_nokogiri(parser.fragment(html_string))
end

.parse(html_string, version: :html4) ⇒ Object

Alias for compatibility



54
55
56
# File 'lib/canon/html/data_model.rb', line 54

# Alias for from_html, kept for compatibility.
#
# @param html_string [String] HTML content, forwarded unchanged
# @param version [Symbol] HTML version, forwarded unchanged
#   (defaults to :html4)
# @return [Object] whatever from_html returns
def self.parse(html_string, version: :html4)
  from_html(html_string, version: version)
end

.serialize(node) ⇒ Object

Serialize HTML node to string



59
60
61
62
63
64
# File 'lib/canon/html/data_model.rb', line 59

# Serialize a data-model node to a string by delegating to the XML
# serializer.
#
# @param node [Object] node to serialize, passed through unchanged
# @return [Object] whatever Canon::Xml::DataModel.serialize returns
def self.serialize(node)
  # HTML nodes use the same serialization as XML, so this simply
  # delegates to the XML implementation.
  # The XML data model is required at call time rather than at file load.
  # NOTE(review): presumably to avoid a circular require -- confirm.
  require_relative "../xml/data_model"
  Canon::Xml::DataModel.serialize(node)
end