Class: Canon::Xml::DataModel

Inherits:
DataModel show all
Defined in:
lib/canon/xml/data_model.rb

Class Method Summary collapse

Class Method Details

.build_attribute_nodes(nokogiri_element, element) ⇒ Object



234
235
236
237
238
239
240
241
242
243
244
# File 'lib/canon/xml/data_model.rb', line 234

# Copies each attribute of a Nokogiri element onto the data-model element.
#
# @param nokogiri_element [#attributes] source element; +attributes+ must
#   respond to +each_value+
# @param element [#add_attribute] data-model element that receives one
#   Nodes::AttributeNode per source attribute
def self.build_attribute_nodes(nokogiri_element, element)
  nokogiri_element.attributes.each_value do |attribute|
    # An attribute without a namespace yields nil for both URI and prefix.
    ns = attribute.namespace
    element.add_attribute(
      Nodes::AttributeNode.new(
        name: attribute.name,
        value: attribute.value,
        namespace_uri: ns&.href,
        prefix: ns&.prefix,
      ),
    )
  end
end

.build_comment_node(nokogiri_comment) ⇒ Object



257
258
259
# File 'lib/canon/xml/data_model.rb', line 257

# Wraps the content of a Nokogiri comment in a data-model comment node.
#
# @param nokogiri_comment [#content] source comment
# @return [Nodes::CommentNode]
def self.build_comment_node(nokogiri_comment)
  comment_text = nokogiri_comment.content
  Nodes::CommentNode.new(value: comment_text)
end

.build_element_node(nokogiri_element, preserve_whitespace: false) ⇒ Object



182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
# File 'lib/canon/xml/data_model.rb', line 182

# Converts a Nokogiri element, with its namespaces, attributes and
# children, into a Nodes::ElementNode.
#
# @param nokogiri_element source element
# @param preserve_whitespace [Boolean] forwarded unchanged to the child
#   conversion
# @return [Nodes::ElementNode]
def self.build_element_node(nokogiri_element, preserve_whitespace: false)
  ns = nokogiri_element.namespace
  element = Nodes::ElementNode.new(
    name: nokogiri_element.name,
    namespace_uri: ns&.href,
    prefix: ns&.prefix,
  )

  build_namespace_nodes(nokogiri_element, element)
  build_attribute_nodes(nokogiri_element, element)

  child_options = { preserve_whitespace: preserve_whitespace }
  nokogiri_element.children.each do |child|
    built = build_node_from_nokogiri(child, **child_options)
    # The child builder returns nil for nodes that are not carried over.
    next unless built

    element.add_child(built)
  end

  element
end

.build_from_moxml(moxml_doc, preserve_whitespace: false) ⇒ Object



275
276
277
278
279
280
281
282
283
284
# File 'lib/canon/xml/data_model.rb', line 275

# Builds a data-model tree from a Moxml document.
#
# Only the document element is converted; when the document does not
# respond to +root+ or has no root, an empty RootNode is returned.
#
# @param moxml_doc parsed Moxml document
# @param preserve_whitespace [Boolean] forwarded to the element builder
# @return [Nodes::RootNode]
def self.build_from_moxml(moxml_doc, preserve_whitespace: false)
  root = Nodes::RootNode.new
  return root unless moxml_doc.respond_to?(:root)

  document_element = moxml_doc.root
  return root unless document_element

  root.add_child(
    build_moxml_element_node(document_element,
                             preserve_whitespace: preserve_whitespace),
  )
  root
end

.build_from_nokogiri(nokogiri_doc, preserve_whitespace: false) ⇒ Object



139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
# File 'lib/canon/xml/data_model.rb', line 139

# Builds a data-model tree from a Nokogiri document (or any node set owner
# that responds to +children+).
#
# When a document element exists it is converted and attached first; every
# other top-level child follows in the order +children+ yields them. DTD
# nodes are always skipped.
#
# @param nokogiri_doc parsed Nokogiri document
# @param preserve_whitespace [Boolean] forwarded to the node builders
# @return [Nodes::RootNode]
def self.build_from_nokogiri(nokogiri_doc, preserve_whitespace: false)
  root = Nodes::RootNode.new
  options = { preserve_whitespace: preserve_whitespace }

  document_element = nokogiri_doc.respond_to?(:root) ? nokogiri_doc.root : nil
  root.add_child(build_element_node(document_element, **options)) if document_element

  nokogiri_doc.children.each do |child|
    # The document element was already attached above.
    next if document_element && child == document_element
    next if child.is_a?(Nokogiri::XML::DTD)

    built = build_node_from_nokogiri(child, **options)
    root.add_child(built) if built
  end

  root
end

.build_moxml_attribute_nodes(moxml_element, element) ⇒ Object



340
341
342
343
344
345
346
347
348
# File 'lib/canon/xml/data_model.rb', line 340

# Copies each attribute of a Moxml element onto the data-model element.
#
# Only the name and value are passed on; no namespace URI or prefix is set
# on the resulting attribute nodes.
#
# @param moxml_element [#attributes] source element
# @param element [#add_attribute] data-model element receiving the nodes
def self.build_moxml_attribute_nodes(moxml_element, element)
  moxml_element.attributes.each do |attribute|
    element.add_attribute(
      Nodes::AttributeNode.new(name: attribute.name, value: attribute.value),
    )
  end
end

.build_moxml_comment_node(moxml_comment) ⇒ Object



360
361
362
# File 'lib/canon/xml/data_model.rb', line 360

# Wraps the text of a Moxml comment in a data-model comment node.
#
# @param moxml_comment [#text] source comment
# @return [Nodes::CommentNode]
def self.build_moxml_comment_node(moxml_comment)
  comment_text = moxml_comment.text
  Nodes::CommentNode.new(value: comment_text)
end

.build_moxml_element_node(moxml_element, preserve_whitespace: false) ⇒ Object



300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
# File 'lib/canon/xml/data_model.rb', line 300

# Converts a Moxml element, with its namespace declarations, attributes and
# children, into a Nodes::ElementNode.
#
# @param moxml_element source element
# @param preserve_whitespace [Boolean] forwarded unchanged to the child
#   conversion
# @return [Nodes::ElementNode]
def self.build_moxml_element_node(moxml_element, preserve_whitespace: false)
  namespace = moxml_element.namespace
  element = Nodes::ElementNode.new(
    name: moxml_element.name,
    namespace_uri: namespace&.uri,
    prefix: namespace&.prefix,
  )

  build_moxml_namespace_nodes(moxml_element, element)
  build_moxml_attribute_nodes(moxml_element, element)

  child_options = { preserve_whitespace: preserve_whitespace }
  moxml_element.children.each do |child|
    built = build_moxml_node(child, **child_options)
    # The child builder returns nil for nodes that are not carried over.
    next unless built

    element.add_child(built)
  end

  element
end

.build_moxml_namespace_nodes(moxml_element, element) ⇒ Object



321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
# File 'lib/canon/xml/data_model.rb', line 321

# Adds a namespace node for every namespace declared on the Moxml element,
# then makes sure the reserved "xml" prefix is present.
#
# A declaration without a prefix is stored under the empty-string prefix.
#
# @param moxml_element [#namespace_definitions] source element
# @param element [#add_namespace, #namespaces] data-model element
def self.build_moxml_namespace_nodes(moxml_element, element)
  moxml_element.namespace_definitions.each do |definition|
    element.add_namespace(
      Nodes::NamespaceNode.new(prefix: definition.prefix || "", uri: definition.uri),
    )
  end

  # Add the implicit xml namespace only when nothing already uses the prefix.
  if element.namespaces.none? { |existing| existing.prefix == "xml" }
    element.add_namespace(
      Nodes::NamespaceNode.new(
        prefix: "xml",
        uri: "http://www.w3.org/XML/1998/namespace",
      ),
    )
  end
end

.build_moxml_node(node, preserve_whitespace: false) ⇒ Object



286
287
288
289
290
291
292
293
294
295
296
297
298
# File 'lib/canon/xml/data_model.rb', line 286

# Dispatches a Moxml node to the builder matching its class.
#
# @param node Moxml node of any kind
# @param preserve_whitespace [Boolean] forwarded to the element and text
#   builders
# @return [Object, nil] the built data-model node, or nil when the node is
#   none of Element, Text, Comment or ProcessingInstruction
def self.build_moxml_node(node, preserve_whitespace: false)
  options = { preserve_whitespace: preserve_whitespace }

  case node
  when Moxml::Element then build_moxml_element_node(node, **options)
  when Moxml::Text then build_moxml_text_node(node, **options)
  when Moxml::Comment then build_moxml_comment_node(node)
  when Moxml::ProcessingInstruction then build_moxml_pi_node(node)
  end
end

.build_moxml_pi_node(moxml_pi) ⇒ Object



364
365
366
367
368
369
# File 'lib/canon/xml/data_model.rb', line 364

# Converts a Moxml processing instruction into its data-model node.
#
# @param moxml_pi [#target, #data] source processing instruction
# @return [Nodes::ProcessingInstructionNode]
def self.build_moxml_pi_node(moxml_pi)
  pi_target = moxml_pi.target
  pi_data = moxml_pi.data
  Nodes::ProcessingInstructionNode.new(target: pi_target, data: pi_data)
end

.build_moxml_text_node(moxml_text, preserve_whitespace: false) ⇒ Object



350
351
352
353
354
355
356
357
358
# File 'lib/canon/xml/data_model.rb', line 350

# Converts a Moxml text node into a Nodes::TextNode.
#
# Whitespace-only text whose parent is a Moxml::Element is dropped unless
# +preserve_whitespace+ is set. The same string is used for both +value+
# and +original+.
#
# @param moxml_text [#text, #parent] source text node
# @param preserve_whitespace [Boolean] keep whitespace-only text
# @return [Nodes::TextNode, nil] nil when the text is dropped
def self.build_moxml_text_node(moxml_text, preserve_whitespace: false)
  text = moxml_text.text

  ignorable = !preserve_whitespace &&
    text.strip.empty? &&
    moxml_text.parent.is_a?(Moxml::Element)
  return if ignorable

  Nodes::TextNode.new(value: text, original: text)
end

.build_namespace_nodes(nokogiri_element, element) ⇒ Object



201
202
203
204
205
206
207
208
209
210
211
# File 'lib/canon/xml/data_model.rb', line 201

# Adds one namespace node to +element+ for every prefix/URI pair returned
# by collect_in_scope_namespaces for the Nokogiri element.
#
# @param nokogiri_element source element
# @param element [#add_namespace] data-model element receiving the nodes
def self.build_namespace_nodes(nokogiri_element, element)
  collect_in_scope_namespaces(nokogiri_element).each do |prefix, uri|
    element.add_namespace(Nodes::NamespaceNode.new(prefix: prefix, uri: uri))
  end
end

.build_node_from_nokogiri(nokogiri_node, preserve_whitespace: false) ⇒ Object



166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
# File 'lib/canon/xml/data_model.rb', line 166

# Dispatches a Nokogiri node to the builder matching its class.
#
# @param nokogiri_node Nokogiri node of any kind
# @param preserve_whitespace [Boolean] forwarded to the element and text
#   builders
# @return [Object, nil] the built data-model node, or nil when the node is
#   none of Element, Text, Comment or ProcessingInstruction
def self.build_node_from_nokogiri(nokogiri_node, preserve_whitespace: false)
  options = { preserve_whitespace: preserve_whitespace }

  case nokogiri_node
  when Nokogiri::XML::Element then build_element_node(nokogiri_node, **options)
  when Nokogiri::XML::Text then build_text_node(nokogiri_node, **options)
  when Nokogiri::XML::Comment then build_comment_node(nokogiri_node)
  when Nokogiri::XML::ProcessingInstruction then build_pi_node(nokogiri_node)
  end
end

.build_pi_node(nokogiri_pi) ⇒ Object



261
262
263
264
265
266
# File 'lib/canon/xml/data_model.rb', line 261

# Converts a Nokogiri processing instruction into its data-model node,
# using the node's +name+ as target and its +content+ as data.
#
# @param nokogiri_pi [#name, #content] source processing instruction
# @return [Nodes::ProcessingInstructionNode]
def self.build_pi_node(nokogiri_pi)
  pi_target = nokogiri_pi.name
  pi_data = nokogiri_pi.content
  Nodes::ProcessingInstructionNode.new(target: pi_target, data: pi_data)
end

.build_text_node(nokogiri_text, preserve_whitespace: false) ⇒ Object



246
247
248
249
250
251
252
253
254
255
# File 'lib/canon/xml/data_model.rb', line 246

# Converts a Nokogiri text node into a Nodes::TextNode.
#
# Whitespace-only text whose parent is a Nokogiri element is dropped unless
# +preserve_whitespace+ is set. +value+ holds the node content and
# +original+ holds the node's +to_xml+ output.
#
# @param nokogiri_text [#content, #parent, #to_xml] source text node
# @param preserve_whitespace [Boolean] keep whitespace-only text
# @return [Nodes::TextNode, nil] nil when the text is dropped
def self.build_text_node(nokogiri_text, preserve_whitespace: false)
  text = nokogiri_text.content

  ignorable = !preserve_whitespace &&
    text.strip.empty? &&
    nokogiri_text.parent.is_a?(Nokogiri::XML::Element)
  return if ignorable

  Nodes::TextNode.new(value: text, original: nokogiri_text.to_xml)
end

.check_for_relative_namespace_uris(doc) ⇒ Object



125
126
127
128
129
130
131
132
133
134
135
136
137
# File 'lib/canon/xml/data_model.rb', line 125

# Walks the document and rejects any namespace declaration whose URI is
# relative according to relative_uri?.
#
# Declarations with a nil or empty href are skipped.
#
# @param doc [#traverse] parsed Nokogiri document
# @raise [Canon::Error] on the first relative namespace URI found
def self.check_for_relative_namespace_uris(doc)
  doc.traverse do |node|
    next unless node.is_a?(Nokogiri::XML::Element)

    node.namespace_definitions.each do |definition|
      href = definition.href
      next if href.nil? || href.empty?
      next unless relative_uri?(href)

      raise Canon::Error, "Relative namespace URI not allowed: #{href}"
    end
  end
end

.collect_in_scope_namespaces(nokogiri_element) ⇒ Object



213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
# File 'lib/canon/xml/data_model.rb', line 213

# Collects the namespace declarations visible from a Nokogiri element by
# walking from the element up through its ancestors.
#
# The nearest declaration of a prefix wins. A declaration without a prefix
# is stored under "". The "xml" prefix is added when no declaration
# supplied a value for it.
#
# @param nokogiri_element element to start from
# @return [Hash{String => String}] prefix => namespace URI
def self.collect_in_scope_namespaces(nokogiri_element)
  in_scope = {}
  node = nokogiri_element

  # Stop at the document node or when there is no parent left.
  until !node || node.is_a?(Nokogiri::XML::Document)
    if node.is_a?(Nokogiri::XML::Element)
      node.namespace_definitions.each do |definition|
        key = definition.prefix || ""
        # Closer declarations were recorded first and must not be replaced.
        in_scope[key] = definition.href unless in_scope.key?(key)
      end
    end
    node = node.parent
  end

  in_scope["xml"] = "http://www.w3.org/XML/1998/namespace" unless in_scope["xml"]
  in_scope
end

.extract_xml_encoding(xml_string) ⇒ Object



92
93
94
95
96
97
98
99
# File 'lib/canon/xml/data_model.rb', line 92

# Reads the encoding named in a leading XML declaration.
#
# Matching is done on a binary-tagged copy of the input, so the bytes are
# inspected as-is regardless of the string's encoding tag.
#
# @param xml_string [String] raw XML
# @return [String, nil] the declared encoding name, or nil when the input
#   does not start with an XML declaration carrying an encoding
def self.extract_xml_encoding(xml_string)
  raw = xml_string.dup.force_encoding("BINARY")
  declaration =
    /\A\s*<\?xml[^>]*\bencoding\s*=\s*["']([^"']+)["'][^>]*\?>/i.match(raw)

  declaration[1] if declaration
end

.from_moxml_xml(xml_string, preserve_whitespace:) ⇒ Object

— Moxml path —



270
271
272
273
# File 'lib/canon/xml/data_model.rb', line 270

# Parses the XML through Canon::XmlParsing and builds the data model from
# the resulting document.
#
# @param xml_string [String] XML source
# @param preserve_whitespace [Boolean] forwarded to build_from_moxml
# @return [Nodes::RootNode] whatever build_from_moxml returns
def self.from_moxml_xml(xml_string, preserve_whitespace:)
  build_from_moxml(
    Canon::XmlParsing.parse(xml_string),
    preserve_whitespace: preserve_whitespace,
  )
end

.from_nokogiri_xml(xml_string, preserve_whitespace:) ⇒ Object

— Nokogiri path —



115
116
117
118
119
120
121
122
123
# File 'lib/canon/xml/data_model.rb', line 115

# Parses the XML with Nokogiri (with the +nonet+ parse option applied),
# rejects relative namespace URIs, and builds the data model.
#
# Any errors Nokogiri recorded on the document are attached to the result
# as strings via +parse_errors=+; nothing is assigned when there are none.
#
# @param xml_string [String] XML source
# @param preserve_whitespace [Boolean] forwarded to build_from_nokogiri
# @return [Nodes::RootNode] whatever build_from_nokogiri returns
# @raise [Canon::Error] propagated from check_for_relative_namespace_uris
def self.from_nokogiri_xml(xml_string, preserve_whitespace:)
  document = Nokogiri::XML(xml_string) { |config| config.nonet }
  check_for_relative_namespace_uris(document)

  model = build_from_nokogiri(document,
                              preserve_whitespace: preserve_whitespace)

  messages = Array(document.errors).map(&:to_s)
  model.parse_errors = messages unless messages.empty?
  model
end

.from_xml(xml_string, preserve_whitespace: false) ⇒ Object



19
20
21
22
23
24
25
26
27
28
29
# File 'lib/canon/xml/data_model.rb', line 19

# Builds the data model from an XML string using the active backend.
#
# The input is passed through normalize_encoding first; the Nokogiri path
# is used when Canon::XmlBackend.nokogiri? is truthy, the Moxml path
# otherwise.
#
# @param xml_string [String] XML source
# @param preserve_whitespace [Boolean] forwarded to the backend builder
# @return [Object] the result of the selected backend builder
def self.from_xml(xml_string, preserve_whitespace: false)
  source = normalize_encoding(xml_string)
  options = { preserve_whitespace: preserve_whitespace }

  return from_nokogiri_xml(source, **options) if Canon::XmlBackend.nokogiri?

  from_moxml_xml(source, **options)
end

.normalize_encoding(xml_string) ⇒ Object



31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# File 'lib/canon/xml/data_model.rb', line 31

# Returns XML whose bytes, encoding tag and XML declaration agree on UTF-8
# wherever that can be achieved.
#
# With a declared non-UTF-8 encoding:
# * if the bytes already read as UTF-8, the UTF-8-tagged string is returned
#   with its declaration rewritten to UTF-8;
# * otherwise the string is transcoded from the declared encoding, and the
#   declaration is rewritten only when the result is valid UTF-8, so the
#   declaration never names an encoding the bytes are no longer in.
#
# Without a declaration, a non-UTF-8-tagged string is reinterpreted as
# UTF-8 when its bytes allow it, else transcoded from its tagged encoding.
#
# @param xml_string [String, Object] XML source; non-strings pass through
# @return [String, Object] normalised XML, or the input unchanged
def self.normalize_encoding(xml_string)
  return xml_string unless xml_string.is_a?(String)

  declared_encoding = extract_xml_encoding(xml_string)

  if declared_encoding
    return xml_string if declared_encoding.upcase == "UTF-8"

    utf8_reinterpreted = try_utf8_reinterpretation(xml_string)
    # Rewrite the declaration on the reinterpreted string so the result is
    # tagged UTF-8, matching the declaration it now carries.
    return update_xml_declaration(utf8_reinterpreted, "UTF-8") if utf8_reinterpreted

    transcoded = transcode_to_utf8(xml_string, declared_encoding)
    # transcode_to_utf8 is best-effort and may hand back unconverted bytes;
    # only claim UTF-8 in the declaration when the bytes really are UTF-8.
    if transcoded.encoding == Encoding::UTF_8 && transcoded.valid_encoding?
      return update_xml_declaration(transcoded, "UTF-8")
    end

    return transcoded
  end

  return xml_string if xml_string.encoding.name == "UTF-8"

  try_utf8_reinterpretation(xml_string) ||
    transcode_to_utf8(xml_string, xml_string.encoding.name)
end

.parse(xml_string) ⇒ Object



101
102
103
# File 'lib/canon/xml/data_model.rb', line 101

# Parses an XML string into the data model by delegating to from_xml with
# its default options.
#
# @param xml_string [String] XML source
# @return [Object] whatever from_xml returns
def self.parse(xml_string)
  from_xml(xml_string)
end

.relative_uri?(uri) ⇒ Boolean

Returns:

  • (Boolean)


109
110
111
# File 'lib/canon/xml/data_model.rb', line 109

# Tells whether a URI lacks a scheme and is therefore relative.
#
# A URI counts as absolute only when the string itself begins with a scheme
# (a letter followed by letters, digits, "+", "." or "-") and a colon. The
# match is anchored with \A so that a scheme appearing at the start of a
# later line of a multi-line value does not make the value look absolute.
#
# @param uri [String, nil] URI to test
# @return [Boolean] true when no leading scheme is present (including nil)
def self.relative_uri?(uri)
  !%r{\A[a-zA-Z][a-zA-Z0-9+.-]*:}.match?(uri)
end

.serialize(node) ⇒ Object



105
106
107
# File 'lib/canon/xml/data_model.rb', line 105

# Serialises a node by calling +to_s+ on it.
#
# @param node [#to_s] node to serialise
# @return [String] the node's +to_s+ result
def self.serialize(node)
  node.to_s
end

.transcode_to_utf8(xml_string, source_encoding) ⇒ Object



62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# File 'lib/canon/xml/data_model.rb', line 62

# Best-effort conversion of XML bytes to a UTF-8 string.
#
# * When the bytes are not valid in +source_encoding+, or they already read
#   as valid UTF-8, a UTF-8-tagged copy of the same bytes is returned.
# * Otherwise the bytes are transcoded from +source_encoding+, replacing
#   invalid or unmappable characters with "?".
# * If the encoding name is unknown or no converter exists, the input is
#   returned unchanged instead of raising.
#
# @param xml_string [String] XML source
# @param source_encoding [String] name of the encoding the bytes are in
# @return [String] UTF-8 string, or +xml_string+ itself on failure
def self.transcode_to_utf8(xml_string, source_encoding)
  as_utf8 = xml_string.dup.force_encoding("UTF-8")
  return as_utf8 if source_encoding == "UTF-8"

  # Raises ArgumentError when Ruby does not know the encoding name.
  as_source = xml_string.dup.force_encoding(source_encoding)
  return as_utf8 unless as_source.valid_encoding?
  # Bytes that already form valid UTF-8 are kept rather than re-encoded.
  return as_utf8 if as_utf8.valid_encoding?

  as_source.encode("UTF-8", source_encoding,
                   invalid: :replace,
                   undef: :replace,
                   replace: "?")
rescue EncodingError, ArgumentError
  # ArgumentError covers unknown encoding names, which are not
  # EncodingError subclasses; treat them like any other failed conversion.
  xml_string
end

.try_utf8_reinterpretation(xml_string) ⇒ Object



83
84
85
86
87
88
89
90
# File 'lib/canon/xml/data_model.rb', line 83

# Tries to read the string's bytes as UTF-8 without converting them.
#
# A string already tagged UTF-8 is returned as-is, without a validity
# check. Otherwise a UTF-8-tagged copy is returned when its bytes form
# valid UTF-8.
#
# @param xml_string [String] XML source
# @return [String, nil] the UTF-8 string, or nil when the bytes are not
#   valid UTF-8
def self.try_utf8_reinterpretation(xml_string)
  return xml_string if xml_string.encoding == Encoding::UTF_8

  candidate = xml_string.dup.force_encoding("UTF-8")
  candidate.valid_encoding? ? candidate : nil
end

.update_xml_declaration(xml_string, new_encoding) ⇒ Object



56
57
58
59
60
# File 'lib/canon/xml/data_model.rb', line 56

# Rewrites the encoding named in the leading XML declaration.
#
# Only an +encoding+ pseudo-attribute inside a declaration at the very start
# of the string (optionally preceded by whitespace) is replaced; an
# +encoding="..."+ attribute elsewhere in the document is left alone. The
# replacement is always written with double quotes.
#
# @param xml_string [String] XML source
# @param new_encoding [String] encoding name to write into the declaration
# @return [String] a new string; equal to the input when there is no
#   leading declaration with an encoding
def self.update_xml_declaration(xml_string, new_encoding)
  declaration_encoding =
    /\A(\s*<\?xml[^>]*?)\bencoding\s*=\s*["'][^"']+["']/i

  # Block form keeps any backslashes in new_encoding literal.
  xml_string.sub(declaration_encoding) do
    %(#{Regexp.last_match(1)}encoding="#{new_encoding}")
  end
end