Class: Scrapetor::Node

Inherits:
Object
  • Object
show all
Defined in:
lib/scrapetor/node.rb

Overview

Featherweight node wrapper. Holds a document reference and a backing Nokolexbor element. Selector ops delegate to the backing engine in Phase 1; the native extension (Phase 2) replaces this with arena-DOM + bytecode VM.

Instance Method Summary collapse

Constructor Details

#initialize(doc, backing) ⇒ Node

Returns a new instance of Node.



11
12
13
14
# File 'lib/scrapetor/node.rb', line 11

# Stores the owning document and the backing engine element.
#
# @param doc [Object] document this node belongs to (kept in @doc)
# @param backing [Object] engine element that calls are delegated to (kept in @nlx)
def initialize(doc, backing)
  @doc = doc
  @nlx = backing
end

Instance Method Details

#==(other) ⇒ Object Also known as: eql?



502
503
504
# File 'lib/scrapetor/node.rb', line 502

# Two wrappers are equal when both are Nodes around equal backing elements.
#
# @param other [Object] object to compare against
# @return [Boolean] false for any non-Node, otherwise the backing comparison
def ==(other)
  return false unless other.is_a?(Node)

  @nlx == other.backing_node
end

#[](key) ⇒ Object



86
87
88
# File 'lib/scrapetor/node.rb', line 86

# Reads an attribute from the backing element.
#
# @param key [#to_s] attribute name; stringified before lookup
# @return [Object] whatever the backing element returns for that name
def [](key)
  @nlx[key.to_s]
end

#[]=(key, value) ⇒ Object Also known as: set_attribute

—– Mutation API (delegated to Nokolexbor) —–



257
258
259
260
# File 'lib/scrapetor/node.rb', line 257

# Writes an attribute on the backing element.
#
# @param key [#to_s] attribute name; stringified before assignment
# @param value [Object, nil] stored as its string form; nil is stored as nil
# @return [Object] the value exactly as passed in
def []=(key, value)
  value.tap { @nlx[key.to_s] = value&.to_s }
end

#absolute_url(base = nil) ⇒ Object



90
91
92
93
# File 'lib/scrapetor/node.rb', line 90

# Resolves this node's link target through URL.absolute.
#
# The link is the "href" attribute, or "src" when "href" is missing.
#
# @param base [Object, nil] base to resolve against; defaults to the document's base_url
# @return [Object] whatever URL.absolute returns
def absolute_url(base = nil)
  link = @nlx["href"] || @nlx["src"]
  resolve_against = base || @doc.base_url
  URL.absolute(link, resolve_against)
end

#add_child(node_or_html) ⇒ Object Also known as: <<, add_child!



283
284
285
# File 'lib/scrapetor/node.rb', line 283

# Appends a child to the backing element.
#
# @param node_or_html [Object] passed through unwrap_mut before insertion
# @return [Object] the backing engine's result, passed through wrap_result
def add_child(node_or_html)
  payload = unwrap_mut(node_or_html)
  inserted = @nlx.add_child(payload)
  wrap_result(inserted)
end

#add_class(klass) ⇒ Object Also known as: append_class

—– Class manipulation —–



314
315
316
317
# File 'lib/scrapetor/node.rb', line 314

# Adds a class name on the backing element.
#
# @param klass [#to_s] class name; stringified before delegation
# @return [self] for chaining
def add_class(klass)
  tap { @nlx.add_class(klass.to_s) }
end

#add_next_sibling(node_or_html) ⇒ Object Also known as: after



294
295
296
# File 'lib/scrapetor/node.rb', line 294

# Inserts a sibling directly after the backing element.
#
# @param node_or_html [Object] passed through unwrap_mut before insertion
# @return [Object] the backing engine's result, passed through wrap_result
def add_next_sibling(node_or_html)
  payload = unwrap_mut(node_or_html)
  inserted = @nlx.add_next_sibling(payload)
  wrap_result(inserted)
end

#add_previous_sibling(node_or_html) ⇒ Object Also known as: before



289
290
291
# File 'lib/scrapetor/node.rb', line 289

# Inserts a sibling directly before the backing element.
#
# @param node_or_html [Object] passed through unwrap_mut before insertion
# @return [Object] the backing engine's result, passed through wrap_result
def add_previous_sibling(node_or_html)
  payload = unwrap_mut(node_or_html)
  inserted = @nlx.add_previous_sibling(payload)
  wrap_result(inserted)
end

#ancestors(selector = nil) ⇒ Object



410
411
412
413
414
415
416
417
418
419
# File 'lib/scrapetor/node.rb', line 410

# Collects this node's ancestors, nearest first, by following #parent
# until it returns a falsy value.
#
# @param selector [String, nil] when given, keep only ancestors for which
#   matches?(selector) is truthy
# @return [NodeSet, Object] a NodeSet of all ancestors when selector is nil;
#   otherwise whatever NodeSet#select returns
def ancestors(selector = nil)
  lineage = []
  node = parent
  while node
    lineage.push(node)
    node = node.parent
  end
  everyone = NodeSet.new(@doc, lineage.map { |anc| anc.backing_node })
  return everyone if selector.nil?

  everyone.select { |anc| anc.matches?(selector) }
end

#at(selector, *_extra) ⇒ Object Also known as: at_css



127
128
129
130
131
# File 'lib/scrapetor/node.rb', line 127

# First match for a CSS selector under this node.
#
# @param selector [String] CSS selector handed to the backing element's at_css
# @param _extra [Array] ignored
# @return [Node, String, nil] String results are returned untouched, a falsy
#   result is returned as-is, anything else is wrapped in a Node
def at(selector, *_extra)
  found = @nlx.at_css(selector)
  return found if !found || found.is_a?(String)

  Node.new(@doc, found)
end

#at_xpath(expr) ⇒ Object



436
437
438
439
# File 'lib/scrapetor/node.rb', line 436

# First result of an XPath query.
#
# @param expr [String] expression handed to #xpath
# @return [Object, nil] the first element when #xpath returns an Array;
#   any other result is returned unchanged
def at_xpath(expr)
  found = xpath(expr)
  return found unless found.is_a?(Array)

  found.first
end

#attr(key) ⇒ Object



82
83
84
# File 'lib/scrapetor/node.rb', line 82

# Reads an attribute from the backing element (same lookup as #[]).
#
# @param key [#to_s] attribute name; stringified before lookup
# @return [Object] whatever the backing element returns for that name
def attr(key)
  @nlx[key.to_s]
end

#attribute(name) ⇒ Object



456
457
458
# File 'lib/scrapetor/node.rb', line 456

# Finds the backing attribute node with the given name.
#
# @param name [#to_s] attribute name; compared in string form
# @return [Object, nil] the first attribute node whose name matches, else nil
def attribute(name)
  wanted = name.to_s
  @nlx.attribute_nodes.find { |node| node.name == wanted }
end

#attribute_nodesObject



452
453
454
# File 'lib/scrapetor/node.rb', line 452

# Returns the backing element's attribute nodes, unwrapped.
#
# @return [Object] the collection returned by the backing element
def attribute_nodes
  @nlx.attribute_nodes
end

#attributesObject

Nokogiri-compat: return all element attributes as a Hash.



50
51
52
53
54
# File 'lib/scrapetor/node.rb', line 50

# All attributes as a name => value Hash, built from the backing
# element's attribute nodes.
#
# @return [Hash] attribute names mapped to their values
def attributes
  @nlx.attribute_nodes.each_with_object({}) do |node, table|
    table[node.name] = node.value
  end
end

#backing_nodeObject



251
252
253
# File 'lib/scrapetor/node.rb', line 251

# Exposes the backing engine element this wrapper delegates to.
#
# @return [Object] the element passed to the constructor
def backing_node
  @nlx
end

#batch_css(selectors) ⇒ Object

Batch API: array of selector strings → array of results, one C round-trip total. Delegates to the underlying Element’s batch_css; falls back to N individual css() calls if the backing node doesn’t expose batch.



152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
# File 'lib/scrapetor/node.rb', line 152

# Runs several CSS selectors in one call.
#
# When the backing element exposes batch_css, its results are post-processed
# one by one: an Array that is empty or starts with a String is passed
# through, any other Array is wrapped in a NodeSet, and non-Array results
# are returned untouched. Without batch support, each selector goes
# through #css individually.
#
# @param selectors [Array<String>] selector strings
# @return [Array] one result per selector
def batch_css(selectors)
  return selectors.map { |sel| css(sel) } unless @nlx.respond_to?(:batch_css)

  @nlx.batch_css(selectors).map do |entry|
    next entry unless entry.is_a?(Array)

    # ::text / ::attr results are string arrays; only element arrays get wrapped.
    plain = entry.empty? || entry.first.is_a?(String)
    plain ? entry : NodeSet.new(@doc, entry)
  end
end

#blank?Boolean

Returns:

  • (Boolean)


448
449
450
# File 'lib/scrapetor/node.rb', line 448

# True when the node's text, stringified and stripped, is empty.
#
# @return [Boolean]
def blank?
  text.to_s.strip.empty?
end

#cdata?Boolean

Returns:

  • (Boolean)


474
475
476
# File 'lib/scrapetor/node.rb', line 474

# True when #node_type is 4, the code this class treats as a CDATA section.
#
# @return [Boolean]
def cdata?
  node_type == 4
end

#childObject

Nokogiri-compat: `Node#child` returns the first child regardless of node type (text / element / comment). Used by parsers that poke at the immediate inner content (e.g. heading nodes whose text lives in a text-node child).



352
353
354
355
# File 'lib/scrapetor/node.rb', line 352

# First child of the backing element, whatever its node type.
#
# @return [Node, nil] wrapped first child, or nil when there are no children
def child
  first = @nlx.children.to_a.first
  return first unless first

  Node.new(@doc, first)
end

#childrenObject



205
206
207
208
# File 'lib/scrapetor/node.rb', line 205

# Element children only: backing children that respond to element? and
# answer truthy are kept, everything else is dropped.
#
# @return [NodeSet] the element children, bound to this node's document
def children
  element = ->(kid) { kid.respond_to?(:element?) && kid.element? }
  NodeSet.new(@doc, @nlx.children.to_a.select(&element))
end

#classesObject



329
330
331
# File 'lib/scrapetor/node.rb', line 329

# Class names from the "class" attribute, split on whitespace.
#
# @return [Array<String>] non-empty class tokens; [] when the attribute is missing
def classes
  @nlx["class"].to_s.scan(/\S+/)
end

#clean_textObject



20
21
22
# File 'lib/scrapetor/node.rb', line 20

# Passes the node's text through Cleaner.clean.
#
# @return [Object] whatever Cleaner.clean returns (its rules are not visible here)
def clean_text
  Cleaner.clean(text)
end

#comment?Boolean

Returns:

  • (Boolean)


465
466
467
# File 'lib/scrapetor/node.rb', line 465

# True when #node_type is 8, the code this class treats as a comment.
#
# @return [Boolean]
def comment?
  node_type == 8
end

#content=(text) ⇒ Object



273
274
275
276
# File 'lib/scrapetor/node.rb', line 273

# Replaces the backing element's content.
#
# @param text [#to_s] new content; stringified before assignment
# @return [Object] the argument exactly as passed in
def content=(text)
  text.tap { |raw| @nlx.content = raw.to_s }
end

#css(selector, *_extra) ⇒ Object Also known as: search

Nokogiri-compat: `node.css(selector, ns_or_handler)`. Extra args are XPath-only and harmless to ignore for CSS.



113
114
115
116
117
118
119
120
121
122
123
124
125
# File 'lib/scrapetor/node.rb', line 113

# Runs a CSS selector under this node.
#
# Pseudo-element queries (`::text`, `::attr(name)`) come back from the
# backing engine as a flat Array and are returned unchanged. The decision
# is made from the selector string, not the result shape, so a zero-match
# element query still comes back wrapped.
#
# @param selector [String] CSS selector
# @param _extra [Array] ignored
# @return [NodeSet, Array] NodeSet for element queries, raw Array for
#   pseudo-element queries
def css(selector, *_extra)
  raw = @nlx.css(selector)
  passthrough = raw.is_a?(Array) && selector_pseudo_element?(selector)
  passthrough ? raw : NodeSet.new(@doc, raw.to_a)
end

#css_pathObject

Build a minimal CSS path back to this node (id-based when available, falling back to tag + :nth-of-type indexing).



379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
# File 'lib/scrapetor/node.rb', line 379

# Builds a CSS path from the nearest id-bearing ancestor (or the topmost
# element) down to this node.
#
# Walking upward, each element contributes "tag:nth-of-type(n)", where n
# counts earlier element siblings with the same tag name. The walk stops
# at the first element with a non-empty id, which contributes "#id".
#
# @return [String] segments joined with " > "; "" when the backing node
#   is not an element
def css_path
  segments = []
  node = @nlx
  while node && node.respond_to?(:name) && node.element?
    ident = node["id"]
    if ident && !ident.empty?
      segments.unshift("##{ident}")
      break
    end

    position = 1
    earlier = node.previous_sibling
    while earlier
      same_tag = earlier.respond_to?(:element?) && earlier.element? && earlier.name == node.name
      position += 1 if same_tag
      earlier = earlier.previous_sibling
    end

    segments.unshift("#{node.name}:nth-of-type(#{position})")
    node = node.parent
  end
  segments.join(" > ")
end

#dateObject



105
106
107
108
109
# File 'lib/scrapetor/node.rb', line 105

# Parses the node's text as a date.
#
# @return [Date, nil] result of Date.parse on the stringified text, or nil
#   when parsing raises ArgumentError or TypeError
def date
  begin
    Date.parse(text.to_s)
  rescue ArgumentError, TypeError
    nil
  end
end

#documentObject



486
487
488
# File 'lib/scrapetor/node.rb', line 486

# The document this node was constructed with.
#
# @return [Object] the doc argument given to the constructor
def document
  @doc
end

#document?Boolean

Returns:

  • (Boolean)


74
# File 'lib/scrapetor/node.rb', line 74

# A Node is never a document.
#
# @return [Boolean] always false
def document?
  false
end

#each_attributeObject

Iterate over attributes as Nokogiri does.



77
78
79
80
# File 'lib/scrapetor/node.rb', line 77

# Iterates over the backing attribute nodes, yielding a [name, value]
# pair (one Array argument) for each.
#
# @yieldparam pair [Array(String, String)] attribute name and value
# @return [Enumerator, Object] an Enumerator when no block is given,
#   otherwise the result of iterating the attribute nodes
def each_attribute
  return to_enum(:each_attribute) unless block_given?

  @nlx.attribute_nodes.each do |node|
    pair = [node.name, node.value]
    yield pair
  end
end

#element?Boolean

Returns:

  • (Boolean)


70
71
72
# File 'lib/scrapetor/node.rb', line 70

# Whether the backing node is an element.
#
# @return [Boolean] the backing node's own answer, or true when it does
#   not respond to element?
def element?
  return true unless @nlx.respond_to?(:element?)

  @nlx.element?
end

#element_childrenObject Also known as: elements



362
363
364
365
# File 'lib/scrapetor/node.rb', line 362

# Element children of the backing node (text, comments and other
# non-element children are skipped).
#
# @return [NodeSet] the element children, bound to this node's document
def element_children
  elements = @nlx.children.to_a.select do |candidate|
    next false unless candidate.respond_to?(:element?)

    candidate.element?
  end
  NodeSet.new(@doc, elements)
end

#extract(map) ⇒ Object

Per-result extract: routes to the underlying Element’s C-side extract entry point (one C call assembles the whole hash).



185
186
187
188
189
190
191
192
193
# File 'lib/scrapetor/node.rb', line 185

# Extracts several fields in one go.
#
# Uses the backing element's own extract when it has one; otherwise
# builds the Hash here with one at_css lookup per entry.
#
# @param map [#each_pair] field name => selector
# @return [Object, Hash] the backing result, or field name => at_css result
def extract(map)
  return @nlx.extract(map) if @nlx.respond_to?(:extract)

  fields = {}
  map.each_pair do |field, sel|
    fields[field] = at_css(sel)
  end
  fields
end

#extract_css(map) ⇒ Object

Hash-form batch: takes { name => selector } and returns { name => result }.



175
176
177
178
179
180
181
# File 'lib/scrapetor/node.rb', line 175

# Hash form of #batch_css: runs every selector value in one batch and
# re-keys the results by the original names.
#
# @param map [Hash] name => selector
# @return [Hash] name => batch result for that selector
def extract_css(map)
  names = map.keys
  outcomes = batch_css(map.values)
  names.zip(outcomes).to_h
end

#extract_each(outer_selector, fields) ⇒ Object

extract_each: under this node, run the outer + inner field plans entirely in C. One round-trip, Array<Hash> back.



197
198
199
200
201
202
203
# File 'lib/scrapetor/node.rb', line 197

# Extracts a field map for every match of an outer selector.
#
# Uses the backing element's extract_each when it has one; otherwise
# runs #css and calls extract(fields) on each match.
#
# @param outer_selector [String] selector for the repeating container
# @param fields [Object] field plan passed to each match's extract
# @return [Object, Array] the backing result, or one extract result per match
def extract_each(outer_selector, fields)
  return @nlx.extract_each(outer_selector, fields) if @nlx.respond_to?(:extract_each)

  css(outer_selector).map { |match| match.extract(fields) }
end

#fingerprintObject



247
248
249
# File 'lib/scrapetor/node.rb', line 247

# Passes this node to Fingerprint.structural.
#
# @return [Object] whatever Fingerprint.structural returns (not visible here)
def fingerprint
  Fingerprint.structural(self)
end

#first_element_childObject



343
344
345
346
# File 'lib/scrapetor/node.rb', line 343

# First child of the backing node that is an element.
#
# @return [Node, nil] wrapped element, or nil when there is none
def first_element_child
  element = ->(kid) { kid.respond_to?(:element?) && kid.element? }
  first = @nlx.children.to_a.find(&element)
  return first unless first

  Node.new(@doc, first)
end

#fragment?Boolean

Returns:

  • (Boolean)


482
483
484
# File 'lib/scrapetor/node.rb', line 482

# A Node is never reported as a fragment.
#
# @return [Boolean] always false
def fragment?
  false
end

#get_attribute(key) ⇒ Object



263
264
265
# File 'lib/scrapetor/node.rb', line 263

# Reads an attribute from the backing element (same lookup as #[]).
#
# @param key [#to_s] attribute name; stringified before lookup
# @return [Object] whatever the backing element returns for that name
def get_attribute(key)
  @nlx[key.to_s]
end

#has_attribute?(name) ⇒ Boolean Also known as: key?, attribute?

Returns:

  • (Boolean)


64
65
66
# File 'lib/scrapetor/node.rb', line 64

# True when the backing element returns a non-nil value for the attribute.
#
# @param name [#to_s] attribute name; stringified before lookup
# @return [Boolean]
def has_attribute?(name)
  !@nlx[name.to_s].nil?
end

#has_class?(klass) ⇒ Boolean

Returns:

  • (Boolean)


333
334
335
# File 'lib/scrapetor/node.rb', line 333

# True when the stringified class name is one of the tokens in #classes.
#
# @param klass [#to_s] class name to look for
# @return [Boolean]
def has_class?(klass)
  classes.include?(klass.to_s)
end

#hashObject



507
508
509
# File 'lib/scrapetor/node.rb', line 507

# Hash code taken from the backing element, so that it agrees with #==,
# which also compares backing elements.
#
# @return [Integer] the backing element's hash
def hash
  @nlx.hash
end

#inner_htmlObject



30
31
32
# File 'lib/scrapetor/node.rb', line 30

# Inner HTML as reported by the backing element.
#
# @return [Object] the backing element's inner_html
def inner_html
  @nlx.inner_html
end

#inner_html=(html) ⇒ Object



278
279
280
281
# File 'lib/scrapetor/node.rb', line 278

# Replaces the backing element's inner HTML.
#
# @param html [#to_s] new markup; stringified before assignment
# @return [Object] the argument exactly as passed in
def inner_html=(html)
  html.tap { |markup| @nlx.inner_html = markup.to_s }
end

#keysObject



56
57
58
# File 'lib/scrapetor/node.rb', line 56

# Attribute names, in the order the backing element lists them.
#
# @return [Array<String>]
def keys
  @nlx.attribute_nodes.map { |node| node.name }
end

#last_element_childObject



357
358
359
360
# File 'lib/scrapetor/node.rb', line 357

# Last child of the backing node that is an element.
#
# @return [Node, nil] wrapped element, or nil when there is none
def last_element_child
  last = @nlx.children.to_a.reverse_each.find do |kid|
    kid.respond_to?(:element?) && kid.element?
  end
  return last unless last

  Node.new(@doc, last)
end

#matches?(selector) ⇒ Boolean

Returns:

  • (Boolean)


421
422
423
424
# File 'lib/scrapetor/node.rb', line 421

# Whether this node is among the document-wide matches for a selector.
#
# Runs the selector against the owning document and looks for a result
# whose backing node equals this node's backing element.
#
# @param selector [String] CSS selector
# @return [Boolean]
def matches?(selector)
  hits = @doc.css(selector).to_a
  hits.any? { |hit| hit.backing_node == @nlx }
end

#moneyObject



95
96
97
# File 'lib/scrapetor/node.rb', line 95

# Passes the node's text to Money.parse.
#
# @return [Object] whatever Money.parse returns (its rules are not visible here)
def money
  Money.parse(text)
end

#nameObject Also known as: node_name, tag_name



39
40
41
# File 'lib/scrapetor/node.rb', line 39

# Node name as reported by the backing element.
#
# @return [Object] the backing element's name
def name
  @nlx.name
end

#next_element_siblingObject Also known as: next_element



229
230
231
232
233
234
235
# File 'lib/scrapetor/node.rb', line 229

# Next sibling that is an element, skipping text/comment siblings.
#
# @return [Node, nil] wrapped element sibling, or nil when none follows
def next_element_sibling
  candidate = @nlx.next_sibling
  # Keep stepping while the candidate exists but is not an element.
  while candidate && (!candidate.respond_to?(:element?) || !candidate.element?)
    candidate = candidate.next_sibling
  end
  candidate ? Node.new(@doc, candidate) : candidate
end

#next_siblingObject Also known as: next

Nokogiri-compatible: returns the literal next node (may be a text / comment node). Use `next_element_sibling` (or `next_element`) to skip non-element siblings.



219
220
221
222
# File 'lib/scrapetor/node.rb', line 219

# The literal next sibling of the backing node (may be a text or comment node).
#
# @return [Node, nil] wrapped sibling, or nil at the end of the sibling list
def next_sibling
  neighbor = @nlx.next_sibling
  return neighbor unless neighbor

  Node.new(@doc, neighbor)
end

#node_typeObject Also known as: type



368
369
370
# File 'lib/scrapetor/node.rb', line 368

# Numeric node type of the backing node.
#
# @return [Object] the backing node's node_type, or 1 when it does not
#   respond to node_type
def node_type
  return 1 unless @nlx.respond_to?(:node_type)

  @nlx.node_type
end

#numberObject



99
100
101
102
103
# File 'lib/scrapetor/node.rb', line 99

# Extracts a number from the node's text.
#
# Everything except digits, "." and "-" is stripped first. Text that
# leaves no digit behind (e.g. "N/A.", "Loading...", "-") yields nil
# rather than a misleading 0 / 0.0.
#
# @return [Integer, Float, nil] Float when the stripped text contains a
#   ".", Integer otherwise, nil when there is no digit at all
def number
  digits = text.to_s.gsub(/[^\d.\-]/, "")
  # Without at least one digit, to_i / to_f would silently produce zero.
  return nil unless digits.match?(/\d/)

  digits.include?(".") ? digits.to_f : digits.to_i
end

#outer_htmlObject Also known as: to_html



34
35
36
# File 'lib/scrapetor/node.rb', line 34

# Serializes this node (including its own tag) via the backing element.
#
# Accepts and forwards optional serializer arguments so that callers going
# through the to_html alias with arguments (as serialize and write_to do)
# no longer hit an ArgumentError. Calling with no arguments behaves as before.
#
# @param args [Array] optional arguments forwarded to the backing to_html
# @return [Object] the backing element's to_html result
def outer_html(*args)
  @nlx.to_html(*args)
end

#parentObject



210
211
212
213
214
# File 'lib/scrapetor/node.rb', line 210

# Parent of the backing node.
#
# @return [Node, nil] wrapped parent; nil when there is no parent, or when
#   Dom::Document is defined and the parent is an instance of it
def parent
  up = @nlx.parent
  return nil if up.nil?
  return nil if defined?(Dom::Document) && up.is_a?(Dom::Document)

  Node.new(@doc, up)
end

#pathObject



373
374
375
# File 'lib/scrapetor/node.rb', line 373

# Path of the backing node, when the backing engine provides one.
#
# @return [Object, nil] the backing node's path, or nil when it does not
#   respond to path
def path
  @nlx.respond_to?(:path) ? @nlx.path : nil
end

#previous_element_siblingObject Also known as: previous_element



238
239
240
241
242
243
244
# File 'lib/scrapetor/node.rb', line 238

# Previous sibling that is an element, skipping text/comment siblings.
#
# @return [Node, nil] wrapped element sibling, or nil when none precedes
def previous_element_sibling
  candidate = @nlx.previous_sibling
  # Keep stepping while the candidate exists but is not an element.
  while candidate && (!candidate.respond_to?(:element?) || !candidate.element?)
    candidate = candidate.previous_sibling
  end
  candidate ? Node.new(@doc, candidate) : candidate
end

#previous_siblingObject Also known as: prev, previous



224
225
226
227
# File 'lib/scrapetor/node.rb', line 224

# The literal previous sibling of the backing node (may be a text or
# comment node).
#
# @return [Node, nil] wrapped sibling, or nil at the start of the sibling list
def previous_sibling
  neighbor = @nlx.previous_sibling
  return neighbor unless neighbor

  Node.new(@doc, neighbor)
end

#processing_instruction?Boolean

Returns:

  • (Boolean)


478
479
480
# File 'lib/scrapetor/node.rb', line 478

# True when #node_type is 7, the code this class treats as a processing
# instruction.
#
# @return [Boolean]
def processing_instruction?
  node_type == 7
end

#removeObject Also known as: unlink, delete



305
306
307
308
# File 'lib/scrapetor/node.rb', line 305

# Calls remove on the backing element.
#
# @return [self] for chaining
def remove
  tap { @nlx.remove }
end

#remove_attribute(key) ⇒ Object Also known as: delete_attribute



267
268
269
270
# File 'lib/scrapetor/node.rb', line 267

# Removes an attribute on the backing element.
#
# @param key [#to_s] attribute name; stringified before delegation
# @return [self] for chaining
def remove_attribute(key)
  tap { @nlx.remove_attribute(key.to_s) }
end

#remove_class(klass = nil) ⇒ Object



320
321
322
323
324
325
326
327
# File 'lib/scrapetor/node.rb', line 320

# Removes one class name, or the whole "class" attribute.
#
# @param klass [#to_s, nil] class name to remove; nil removes the "class"
#   attribute entirely
# @return [self] for chaining
def remove_class(klass = nil)
  klass.nil? ? @nlx.remove_attribute("class") : @nlx.remove_class(klass.to_s)
  self
end

#replace(node_or_html) ⇒ Object Also known as: replace_with, swap



299
300
301
# File 'lib/scrapetor/node.rb', line 299

# Replaces the backing element with another node or markup.
#
# @param node_or_html [Object] passed through unwrap_mut before replacement
# @return [Object] the backing engine's result, passed through wrap_result
def replace(node_or_html)
  payload = unwrap_mut(node_or_html)
  swapped = @nlx.replace(payload)
  wrap_result(swapped)
end

#rootObject



490
491
492
# File 'lib/scrapetor/node.rb', line 490

# Root as reported by the owning document.
#
# @return [Object] the document's root
def root
  @doc.root
end

#serialize(*args) ⇒ Object



498
499
500
# File 'lib/scrapetor/node.rb', line 498

# Serializes this node, forwarding any serializer arguments.
#
# Goes straight to the backing element's to_html (as to_xml does) so that
# supplied arguments are forwarded rather than passed to a zero-argument
# wrapper method.
#
# @param args [Array] optional arguments forwarded to the backing to_html
# @return [Object] the backing element's to_html result
def serialize(*args)
  @nlx.to_html(*args)
end

#textObject Also known as: content, inner_text



16
17
18
# File 'lib/scrapetor/node.rb', line 16

# Text content as reported by the backing element.
#
# @return [Object] the backing element's text
def text
  @nlx.text
end

#text?Boolean Also known as: text_node?

Returns:

  • (Boolean)


469
470
471
# File 'lib/scrapetor/node.rb', line 469

# True when #node_type is 3, the code this class treats as a text node.
#
# @return [Boolean]
def text?
  node_type == 3
end

#to_xml(*args) ⇒ Object



460
461
462
# File 'lib/scrapetor/node.rb', line 460

# Serializes via the backing element's to_html, forwarding all arguments.
# No separate XML serializer is involved here.
#
# @param args [Array] forwarded to the backing to_html
# @return [Object] the backing element's to_html result
def to_xml(*args)
  @nlx.to_html(*args)
end

#traverse {|_self| ... } ⇒ Object

Yields:

  • (_self)

Yield Parameters:

  • _self (Scrapetor::Node) — the node being visited: this node first, then each element descendant



404
405
406
407
408
# File 'lib/scrapetor/node.rb', line 404

# Walks this node and its element descendants, parent before children.
#
# Only element children are descended into (via #element_children).
#
# @yieldparam node [Node] the node being visited
# @return [Enumerator, Object] an Enumerator when no block is given,
#   otherwise the result of iterating the element children
def traverse(&block)
  return to_enum(:traverse) if block.nil?

  block.call(self)
  element_children.each { |kid| kid.traverse(&block) }
end

#valuesObject



60
61
62
# File 'lib/scrapetor/node.rb', line 60

# Attribute values, in the order the backing element lists them.
#
# @return [Array<String>]
def values
  @nlx.attribute_nodes.map { |node| node.value }
end

#visible_textObject



24
25
26
27
28
# File 'lib/scrapetor/node.rb', line 24

# Cleaned text with script/style/noscript content left out.
#
# Works on a dup of the backing element so removals are made on the copy.
# When the copy does not respond to css, nothing is stripped.
#
# @return [Object] Cleaner.clean applied to the copy's remaining text
def visible_text
  copy = @nlx.dup
  if copy.respond_to?(:css)
    copy.css("script, style, noscript").each { |hidden| hidden.remove }
  end
  Cleaner.clean(copy.text)
end

#wrap(html_or_node) ⇒ Object



441
442
443
444
445
446
# File 'lib/scrapetor/node.rb', line 441

# Wraps this node in the given markup or node.
#
# A Node argument is unwrapped to its backing element first, so the
# backing engine never receives this wrapper class. Other arguments
# (e.g. an HTML string) are forwarded unchanged. When the backing element
# does not respond to wrap, nothing happens.
#
# @param html_or_node [String, Node, Object] wrapper markup or node
# @return [self] for chaining
def wrap(html_or_node)
  if @nlx.respond_to?(:wrap)
    wrapper = html_or_node.is_a?(Node) ? html_or_node.backing_node : html_or_node
    @nlx.wrap(wrapper)
  end
  self
end

#write_to(io, *args) ⇒ Object



494
495
496
# File 'lib/scrapetor/node.rb', line 494

# Writes this node's serialized HTML to an IO.
#
# Serializes through the backing element's to_html (as to_xml does) so
# that supplied arguments are forwarded rather than passed to a
# zero-argument wrapper method.
#
# @param io [#write] destination
# @param args [Array] optional arguments forwarded to the backing to_html
# @return [Object] whatever io.write returns
def write_to(io, *args)
  io.write(@nlx.to_html(*args))
end

#xpath(expr) ⇒ Object

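XPath helpers. Evaluates an XPath expression against this node by delegating to Scrapetor::XPath.evaluate (relative expressions are scoped to this node). See Scrapetor::Document#xpath for the supported subset. Note: an earlier version of this comment said that, because the native engine doesn’t yet implement XPath, these helpers return empty results rather than raising NoMethodError on Node (to keep callers that probe both engines from crashing); the code shown below delegates to the evaluator instead.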



432
433
434
# File 'lib/scrapetor/node.rb', line 432

# Evaluates an XPath expression with this node as the context, by
# delegating to Scrapetor::XPath.evaluate.
#
# @param expr [String] XPath expression
# @return [Object] whatever the evaluator returns (its result shape is
#   not visible here)
def xpath(expr)
  Scrapetor::XPath.evaluate(self, expr)
end

#xpath_pathObject

XPath path to this node.



400
401
402
# File 'lib/scrapetor/node.rb', line 400

# XPath-style location of this node; simply returns #path.
#
# @return [Object, nil] the same value as #path
def xpath_path
  path
end