Class: Canon::Xml::DataModel

Inherits:
DataModel show all
Defined in:
lib/canon/xml/data_model.rb

Overview

Builds XPath data model from XML

Class Method Summary collapse

Class Method Details

.build_attribute_nodes(nokogiri_element, element) ⇒ Object

Build attribute nodes for an element



193
194
195
196
197
198
199
200
201
202
203
# File 'lib/canon/xml/data_model.rb', line 193

# Copy every attribute of a parsed element onto the data-model element.
#
# One Nodes::AttributeNode is created per entry of
# +nokogiri_element.attributes+ and handed to +element.add_attribute+.
# Attributes without a namespace get nil for both namespace_uri and prefix.
#
# @param nokogiri_element [#attributes] parsed source element
# @param element [#add_attribute] data-model element receiving the attributes
def self.build_attribute_nodes(nokogiri_element, element)
  nokogiri_element.attributes.each_value do |source_attr|
    # Namespace is optional, so read it once and navigate safely below.
    attr_ns = source_attr.namespace
    element.add_attribute(
      Nodes::AttributeNode.new(
        name: source_attr.name,
        value: source_attr.value,
        namespace_uri: attr_ns&.href,
        prefix: attr_ns&.prefix,
      ),
    )
  end
end

.build_comment_node(nokogiri_comment) ⇒ Object

Build comment node from Nokogiri comment



222
223
224
# File 'lib/canon/xml/data_model.rb', line 222

# Wrap the text of a parsed comment in a Nodes::CommentNode.
#
# @param nokogiri_comment [#content] parsed comment node
# @return [Nodes::CommentNode] node whose value is the comment's content
def self.build_comment_node(nokogiri_comment)
  comment_text = nokogiri_comment.content
  Nodes::CommentNode.new(value: comment_text)
end

.build_element_node(nokogiri_element, preserve_whitespace: false) ⇒ Object

Build element node from Nokogiri element



129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# File 'lib/canon/xml/data_model.rb', line 129

# Convert a parsed element, with everything beneath it, into an ElementNode.
#
# The new node receives, in this order: its namespace nodes (including
# inherited declarations), its attribute nodes, and one child per source
# child that build_node_from_nokogiri converts to a non-nil node.
#
# @param nokogiri_element [Nokogiri::XML::Element] parsed source element
# @param preserve_whitespace [Boolean] passed through to child conversion
# @return [Nodes::ElementNode] the populated element node
def self.build_element_node(nokogiri_element, preserve_whitespace: false)
  element_ns = nokogiri_element.namespace

  Nodes::ElementNode.new(
    name: nokogiri_element.name,
    namespace_uri: element_ns&.href,
    prefix: element_ns&.prefix,
  ).tap do |built|
    build_namespace_nodes(nokogiri_element, built)
    build_attribute_nodes(nokogiri_element, built)

    nokogiri_element.children.each do |source_child|
      child_node = build_node_from_nokogiri(
        source_child,
        preserve_whitespace: preserve_whitespace,
      )
      # Unsupported or skipped children come back as nil.
      next unless child_node

      built.add_child(child_node)
    end
  end
end

.build_from_nokogiri(nokogiri_doc, preserve_whitespace: false) ⇒ Object

Build XPath data model from Nokogiri document or fragment



78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# File 'lib/canon/xml/data_model.rb', line 78

# Build the data-model root from a parsed document or document fragment.
#
# When the input responds to +root+ and has one, that element is converted
# and appended first; every other top-level child is then converted in
# order. Inputs without a root (e.g. fragments) simply have all of their
# top-level children converted. DTD nodes are always skipped, as are
# children that convert to nil.
#
# NOTE(review): because the document element is appended before the
# remaining top-level nodes, comments/PIs that precede it in the source end
# up after it in the root's child list — confirm consumers expect this.
#
# @param nokogiri_doc [#children] parsed document or fragment
# @param preserve_whitespace [Boolean] passed through to node conversion
# @return [Nodes::RootNode] root node holding the converted children
def self.build_from_nokogiri(nokogiri_doc, preserve_whitespace: false)
  model_root = Nodes::RootNode.new

  # Stays nil for fragments and for documents without a root element.
  document_element = nokogiri_doc.root if nokogiri_doc.respond_to?(:root)
  if document_element
    model_root.add_child(
      build_element_node(document_element,
                         preserve_whitespace: preserve_whitespace),
    )
  end

  nokogiri_doc.children.each do |top_level|
    # The document element was already handled above.
    next if document_element && top_level == document_element
    next if top_level.is_a?(Nokogiri::XML::DTD)

    converted = build_node_from_nokogiri(
      top_level,
      preserve_whitespace: preserve_whitespace,
    )
    next unless converted

    model_root.add_child(converted)
  end

  model_root
end

.build_namespace_nodes(nokogiri_element, element) ⇒ Object

Build namespace nodes for an element



153
154
155
156
157
158
159
160
161
162
163
164
# File 'lib/canon/xml/data_model.rb', line 153

# Attach one NamespaceNode per in-scope namespace to the data-model element.
#
# The prefix => URI pairs come from collect_in_scope_namespaces, so they
# cover declarations inherited from ancestors as well as the element's own.
#
# @param nokogiri_element [Nokogiri::XML::Element] parsed source element
# @param element [#add_namespace] data-model element receiving the nodes
def self.build_namespace_nodes(nokogiri_element, element)
  collect_in_scope_namespaces(nokogiri_element).each do |ns_prefix, ns_uri|
    element.add_namespace(
      Nodes::NamespaceNode.new(prefix: ns_prefix, uri: ns_uri),
    )
  end
end

.build_node_from_nokogiri(nokogiri_node, preserve_whitespace: false) ⇒ Object

Build node from Nokogiri node



111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# File 'lib/canon/xml/data_model.rb', line 111

# Dispatch a parsed node to the builder that matches its class.
#
# Elements and text nodes receive the preserve_whitespace flag; comments
# and processing instructions do not need it. Any other node class yields
# nil, which callers treat as "nothing to add".
#
# @param nokogiri_node [Object] parsed node of any kind
# @param preserve_whitespace [Boolean] forwarded to element/text builders
# @return [Object, nil] the built node, or nil for unhandled node classes
def self.build_node_from_nokogiri(nokogiri_node,
preserve_whitespace: false)
  whitespace_opts = { preserve_whitespace: preserve_whitespace }

  case nokogiri_node
  when Nokogiri::XML::Element then build_element_node(nokogiri_node, **whitespace_opts)
  when Nokogiri::XML::Text then build_text_node(nokogiri_node, **whitespace_opts)
  when Nokogiri::XML::Comment then build_comment_node(nokogiri_node)
  when Nokogiri::XML::ProcessingInstruction then build_pi_node(nokogiri_node)
  end
end

.build_pi_node(nokogiri_pi) ⇒ Object

Build PI node from Nokogiri PI



227
228
229
230
231
232
# File 'lib/canon/xml/data_model.rb', line 227

# Turn a parsed processing instruction into a ProcessingInstructionNode.
#
# The source node's name becomes the PI target and its content the PI data.
#
# @param nokogiri_pi [#name, #content] parsed processing instruction
# @return [Nodes::ProcessingInstructionNode]
def self.build_pi_node(nokogiri_pi)
  pi_target = nokogiri_pi.name
  pi_data = nokogiri_pi.content
  Nodes::ProcessingInstructionNode.new(target: pi_target, data: pi_data)
end

.build_text_node(nokogiri_text, preserve_whitespace: false) ⇒ Object

Build text node from Nokogiri text node



206
207
208
209
210
211
212
213
214
215
216
217
218
219
# File 'lib/canon/xml/data_model.rb', line 206

# Convert a parsed text node into a TextNode, optionally dropping
# formatting-only whitespace.
#
# A text node is skipped (nil is returned) only when all of these hold:
# preserve_whitespace is false, the content is empty after stripping, and
# the node's parent is an element. In every other case the content is kept
# verbatim.
#
# @param nokogiri_text [#content, #parent] parsed text node
# @param preserve_whitespace [Boolean] keep whitespace-only text when true
# @return [Nodes::TextNode, nil] the text node, or nil when it is skipped
def self.build_text_node(nokogiri_text, preserve_whitespace: false)
  raw_text = nokogiri_text.content

  unless preserve_whitespace
    # Whitespace-only text directly inside an element is treated as
    # formatting between tags and left out of the model.
    whitespace_only = raw_text.strip.empty?
    return nil if whitespace_only && nokogiri_text.parent.is_a?(Nokogiri::XML::Element)
  end

  # The parser has already resolved CDATA sections and entities.
  Nodes::TextNode.new(value: raw_text)
end

.check_for_relative_namespace_uris(doc) ⇒ Object

Check for relative namespace URIs (prohibited by C14N 1.1)



54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# File 'lib/canon/xml/data_model.rb', line 54

# Reject documents that declare a namespace with a relative URI.
#
# Every element reached by +doc.traverse+ has its own namespace
# declarations inspected. Declarations whose href is nil or empty are
# ignored; the first remaining href that relative_uri? flags aborts the
# walk with an error.
#
# @param doc [#traverse] parsed document to validate
# @raise [Canon::Error] when a relative namespace URI is declared
def self.check_for_relative_namespace_uris(doc)
  doc.traverse do |visited|
    next unless visited.is_a?(Nokogiri::XML::Element)

    visited.namespace_definitions.each do |declaration|
      href = declaration.href
      next if href.nil? || href.empty?
      next unless relative_uri?(href)

      raise Canon::Error,
            "Relative namespace URI not allowed: #{href}"
    end
  end
end

.collect_in_scope_namespaces(nokogiri_element) ⇒ Object

Collect all in-scope namespaces for an element



168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
# File 'lib/canon/xml/data_model.rb', line 168

# Gather the namespace bindings visible from an element.
#
# Starting at the element and following +parent+ links until there is no
# parent or a Document is reached, each Element's declarations are recorded
# as prefix => href. A nil prefix (default namespace) is stored under "".
# Because the walk goes from the element outwards, the first binding seen
# for a prefix is the nearest one and is never replaced by an ancestor's.
# The "xml" prefix is bound to its standard URI unless something set it.
#
# @param nokogiri_element [#parent] element whose scope is inspected
# @return [Hash{String => String}] prefix => namespace URI
def self.collect_in_scope_namespaces(nokogiri_element)
  in_scope = {}
  cursor = nokogiri_element

  loop do
    break unless cursor
    break if cursor.is_a?(Nokogiri::XML::Document)

    if cursor.is_a?(Nokogiri::XML::Element)
      cursor.namespace_definitions.each do |declaration|
        key = declaration.prefix || ""
        # Nearest declaration wins; keep an existing key even if its value is nil.
        in_scope[key] = declaration.href unless in_scope.key?(key)
      end
    end

    cursor = cursor.parent
  end

  in_scope["xml"] ||= "http://www.w3.org/XML/1998/namespace"
  in_scope
end

.from_xml(xml_string, preserve_whitespace: false) ⇒ Nodes::RootNode

Build XPath data model from XML string

Parameters:

  • xml_string (String)

    XML content to parse

  • preserve_whitespace (Boolean) (defaults to: false)

    Whether to preserve whitespace-only text nodes

Returns:



23
24
25
26
27
28
29
30
31
32
33
34
35
# File 'lib/canon/xml/data_model.rb', line 23

# Parse an XML string and convert it into the XPath data model.
#
# Parsing runs with network access disabled and in strict mode. The parsed
# document is then checked for relative namespace URIs (prohibited by
# C14N 1.1) before being converted.
#
# @param xml_string [String] XML content to parse
# @param preserve_whitespace [Boolean] whether to preserve whitespace-only
#   text nodes
# @return [Nodes::RootNode] root of the built data model
# @raise [Canon::Error] when a namespace declaration uses a relative URI
def self.from_xml(xml_string, preserve_whitespace: false)
  # Option setters return the options object, so they can be chained.
  parsed = Nokogiri::XML(xml_string) { |options| options.nonet.strict }

  check_for_relative_namespace_uris(parsed)

  build_from_nokogiri(parsed, preserve_whitespace: preserve_whitespace)
end

.parse(xml_string) ⇒ Object

Alias for compatibility with base class interface



38
39
40
# File 'lib/canon/xml/data_model.rb', line 38

# Alias of from_xml, kept for compatibility with the base class interface.
#
# The optional keyword lets callers of the generic entry point ask for
# whitespace preservation too; omitting it behaves exactly as before.
#
# @param xml_string [String] XML content to parse
# @param preserve_whitespace [Boolean] forwarded to from_xml (default false)
# @return [Object] whatever from_xml returns for the given input
def self.parse(xml_string, preserve_whitespace: false)
  from_xml(xml_string, preserve_whitespace: preserve_whitespace)
end

.relative_uri?(uri) ⇒ Boolean

Check if a URI is relative

Returns:

  • (Boolean)


71
72
73
74
# File 'lib/canon/xml/data_model.rb', line 71

# Report whether a URI is relative, i.e. does not start with a scheme.
#
# A scheme is a letter followed by letters, digits, "+", "." or "-", and
# terminated by ":" (the shape defined by RFC 3986).
#
# @param uri [String] URI to inspect
# @return [Boolean] true when the string has no leading scheme
def self.relative_uri?(uri)
  # \A anchors to the start of the whole string. The line anchor ^ would also
  # match after an embedded newline and let "x\nhttp://..." pass as absolute.
  uri !~ %r{\A[a-zA-Z][a-zA-Z0-9+.-]*:}
end

.serialize(node) ⇒ String

Serialize XML node to string

Parameters:

Returns:

  • (String)

    Serialized XML string



46
47
48
49
50
# File 'lib/canon/xml/data_model.rb', line 46

# Render a node as a string by calling its +to_s+.
#
# This only satisfies the base class interface; the original notes mark it
# as a placeholder until it delegates to the real XML serialization.
#
# @param node [#to_s] node to serialize
# @return [String] result of +node.to_s+
def self.serialize(node)
  node.to_s
end