Class: Scrapetor::NodeSet

Inherits:
Object
  • Object
show all
Includes:
Enumerable
Defined in:
lib/scrapetor/node_set.rb

Instance Method Summary collapse

Constructor Details

#initialize(doc, backing_nodes) ⇒ NodeSet

Returns a new instance of NodeSet.



7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
# File 'lib/scrapetor/node_set.rb', line 7

# Build a NodeSet over +backing_nodes+ for document +doc+.
#
# +backing_nodes+ is either a LazyIds handle from the native extension
# (stored in @lazy_ids, with @nodes set to nil) or any other collection
# (stored as-is in @nodes).
def initialize(doc, backing_nodes)
  @doc = doc
  # The `defined?` probe keeps construction working when the native
  # extension constant is absent; referencing it unguarded would raise
  # NameError.
  lazy = defined?(Scrapetor::Native::DocumentWrapper::LazyIds) &&
         backing_nodes.is_a?(Scrapetor::Native::DocumentWrapper::LazyIds)
  unless lazy
    @nodes = backing_nodes
    return
  end
  @lazy_ids = backing_nodes
  @nodes = nil
end

Instance Method Details

#&(other) ⇒ Object



287
288
289
290
# File 'lib/scrapetor/node_set.rb', line 287

# Set intersection: a new set (same class, same document) holding the
# backing nodes present in both this set and +other+.
#
# +other+ may be anything responding to +backing_nodes+, or any value
# `Array()` can coerce. Node wrappers inside a coerced operand are
# unwrapped to their backing node first, so `set & set.to_a` compares
# backing nodes with backing nodes rather than with wrappers.
def &(other)
  keep = if other.respond_to?(:backing_nodes)
           other.backing_nodes
         else
           Array(other).map { |item| item.is_a?(Node) ? item.backing_node : item }
         end
  self.class.new(@doc, backing_nodes & keep)
end

#+(other) ⇒ Object



163
164
165
166
# File 'lib/scrapetor/node_set.rb', line 163

# Concatenation: a new set (same class, same document) with this set's
# backing nodes followed by those of +other+.
#
# +other+ may be anything responding to +backing_nodes+, or any value
# `Array()` can coerce. Node wrappers inside a coerced operand are
# unwrapped to their backing node; storing the wrapper itself would get
# it wrapped in a second Node when the result is iterated.
def +(other)
  other_nodes = if other.respond_to?(:backing_nodes)
                  other.backing_nodes
                else
                  Array(other).map { |item| item.is_a?(Node) ? item.backing_node : item }
                end
  self.class.new(@doc, backing_nodes + other_nodes)
end

#-(other) ⇒ Object



282
283
284
285
# File 'lib/scrapetor/node_set.rb', line 282

# Set difference: a new set (same class, same document) holding this
# set's backing nodes minus those found in +other+.
#
# +other+ may be anything responding to +backing_nodes+, or any value
# `Array()` can coerce. Node wrappers inside a coerced operand are
# unwrapped to their backing node first; otherwise `set - set.to_a`
# would compare wrappers against backing nodes and drop nothing.
def -(other)
  drop = if other.respond_to?(:backing_nodes)
           other.backing_nodes
         else
           Array(other).map { |item| item.is_a?(Node) ? item.backing_node : item }
         end
  self.class.new(@doc, backing_nodes - drop)
end

#[](index, length = nil) ⇒ Object Also known as: slice



58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# File 'lib/scrapetor/node_set.rb', line 58

# Element reference / slicing.
#
# * +set[start, length]+ and +set[range]+ return a new set (same class)
#   built from the sliced backing nodes; an out-of-range slice yields an
#   empty set.
# * +set[index]+ returns a single wrapped Node, or nil when nothing is
#   stored at that position.
def [](index, length = nil)
  if length || index.is_a?(Range)
    picked = length ? backing_nodes[index, length] : backing_nodes[index]
    return self.class.new(@doc, picked || [])
  end

  unless @lazy_ids
    backing = @nodes[index]
    return backing && Node.new(@doc, backing)
  end

  # Lazy path: build just the one Element that was asked for.
  id = @lazy_ids.ids[index]
  return nil unless id
  Node.new(@doc, Scrapetor::Native::Element.new(@lazy_ids.native, id, @lazy_ids.wrapper))
end

#at(selector) ⇒ Object Also known as: at_css



105
106
107
# File 'lib/scrapetor/node_set.rb', line 105

# Run +at(selector)+ on the first node of the set.
# Returns nil when the set has no first node.
def at(selector)
  head = first
  head.nil? ? nil : head.at(selector)
end

#attr(name) ⇒ Object Also known as: attribute



154
155
156
# File 'lib/scrapetor/node_set.rb', line 154

# Read attribute +name+ from the first node of the set.
# Returns nil when the set has no first node.
def attr(name)
  head = first
  head.nil? ? nil : head.attr(name)
end

#backing_nodes ⇒ Object



177
178
179
180
# File 'lib/scrapetor/node_set.rb', line 177

# The raw (unwrapped) nodes behind this set. A lazy set is materialized
# on demand; otherwise the stored collection is returned as-is.
def backing_nodes
  @lazy_ids ? materialize : @nodes
end

#children ⇒ Object

Aggregate of children across all nodes in the set. Mirrors Nokogiri’s NodeSet#children — every child of every node, including text and comment nodes, flattened into a single NodeSet. Pulls children straight from the backing element (rather than going through Node#children, which filters to elements only) so callers that iterate mixed-content can still see the text segments.



137
138
139
140
141
142
143
144
145
146
# File 'lib/scrapetor/node_set.rb', line 137

# Flatten the children of every backing node into one NodeSet.
# Backing nodes that do not respond to +children+ contribute nothing.
def children
  gathered = backing_nodes.each_with_object([]) do |backing, acc|
    next unless backing.respond_to?(:children)
    offspring = backing.children
    offspring = offspring.to_a if offspring.respond_to?(:to_a)
    offspring.each { |child| acc << child }
  end
  NodeSet.new(@doc, gathered)
end

#css(selector) ⇒ Object Also known as: search



110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
# File 'lib/scrapetor/node_set.rb', line 110

# Run +selector+ against every backing node that responds to +css+ and
# pool the hits.
#
# Returns a plain Array when the selector ends in a pseudo-element from
# the list matched below (`::text`, `::attr(...)`, ...); otherwise a
# NodeSet. The shape is decided from the selector text, not from the
# results, so a zero-match query still gets the right container.
def css(selector)
  selector_text = selector.to_s
  wants_strings = selector_text.include?("::") &&
                  selector_text.match?(/::(?:text|attr\([^)]+\)|first-letter|first-line|before|after)\s*\z/i)

  hits = backing_nodes.each_with_object([]) do |backing, acc|
    next unless backing.respond_to?(:css)
    found = backing.css(selector)
    found = found.to_a if found.respond_to?(:to_a)
    found.each { |hit| acc << hit }
  end

  wants_strings ? hits : NodeSet.new(@doc, hits)
end

#each ⇒ Object



23
24
25
26
27
28
29
30
31
32
33
34
# File 'lib/scrapetor/node_set.rb', line 23

# Yield each member wrapped in a Node. Without a block, returns an
# Enumerator. A lazy set builds one Element per id as it goes and does
# not materialize the whole backing list.
def each
  return enum_for(:each) unless block_given?

  unless @lazy_ids
    return @nodes.each { |backing| yield Node.new(@doc, backing) }
  end

  wrapper = @lazy_ids.wrapper
  native = @lazy_ids.native
  @lazy_ids.ids.each do |id|
    element = Scrapetor::Native::Element.new(native, id, wrapper)
    yield Node.new(@doc, element)
  end
end

#each_with_index ⇒ Object



222
223
224
225
# File 'lib/scrapetor/node_set.rb', line 222

# Yield each member (wrapped in a Node) together with its position.
# Without a block, returns an Enumerator.
def each_with_index
  return enum_for(:each_with_index) unless block_given?

  backing_nodes.each_with_index do |backing, position|
    yield Node.new(@doc, backing), position
  end
end

#empty? ⇒ Boolean

Returns:

  • (Boolean)


84
85
86
# File 'lib/scrapetor/node_set.rb', line 84

# True when the set holds no members. Checks the id list of a lazy set
# directly, so no Elements are built.
def empty?
  source = @lazy_ids ? @lazy_ids.ids : @nodes
  source.empty?
end

#extract(fields) ⇒ Object

Map every node through its `extract(fields)` extraction. This lets the standard SERP-result pattern collapse to:

doc.css(".result").extract(title: ".t", price: ".p")
# => [{title: ..., price: ...}, ...]


297
298
299
# File 'lib/scrapetor/node_set.rb', line 297

# Call +extract(fields)+ on every node and collect the results, one
# entry per node, in set order.
def extract(fields)
  map do |node|
    node.extract(fields)
  end
end

#find_all ⇒ Object



248
249
250
251
# File 'lib/scrapetor/node_set.rb', line 248

# Same filtering as +select+: forwards each candidate to the caller's
# block and returns whatever +select+ returns. Without a block, returns
# an Enumerator.
def find_all
  if block_given?
    select { |candidate| yield(candidate) }
  else
    enum_for(:find_all)
  end
end

#first ⇒ Object



36
37
38
39
40
41
42
43
44
45
# File 'lib/scrapetor/node_set.rb', line 36

# First member of the set.
#
# * +first+    -> the first member wrapped in a Node, or nil if the set
#                 is empty.
# * +first(n)+ -> an Array of up to +n+ leading members, each wrapped in
#                 a Node (empty Array for an empty set).
#
# The counted form keeps `first(n)` usable: a zero-argument override
# would shadow Enumerable#first and make `set.first(2)` raise
# ArgumentError. Invalid counts are rejected by the underlying
# +first(n)+ call on the stored collection.
def first(n = nil)
  unless n.nil?
    return @nodes.first(n).map { |backing| Node.new(@doc, backing) } unless @lazy_ids

    # Lazy path: only build Elements for the ids actually requested.
    # Assumes the id list supports Array-style +first(n)+.
    native = @lazy_ids.native
    wrapper = @lazy_ids.wrapper
    return @lazy_ids.ids.first(n).map do |id|
      Node.new(@doc, Scrapetor::Native::Element.new(native, id, wrapper))
    end
  end

  if @lazy_ids
    id = @lazy_ids.ids.first
    return nil unless id
    Node.new(@doc, Scrapetor::Native::Element.new(@lazy_ids.native, id, @lazy_ids.wrapper))
  else
    backing = @nodes.first
    backing && Node.new(@doc, backing)
  end
end

#include?(node) ⇒ Boolean

Returns:

  • (Boolean)


277
278
279
280
# File 'lib/scrapetor/node_set.rb', line 277

# Membership test. Accepts either a Node wrapper (compared via its
# backing node) or a raw backing node.
def include?(node)
  needle = node
  needle = node.backing_node if node.is_a?(Node)
  backing_nodes.include?(needle)
end

#index(node) ⇒ Object



272
273
274
275
# File 'lib/scrapetor/node_set.rb', line 272

# Position of +node+ within the set, or nil when absent. Accepts either
# a Node wrapper (located via its backing node) or a raw backing node.
def index(node)
  needle = node
  needle = node.backing_node if node.is_a?(Node)
  backing_nodes.index(needle)
end

#last ⇒ Object



47
48
49
50
51
52
53
54
55
56
# File 'lib/scrapetor/node_set.rb', line 47

# Last member of the set wrapped in a Node, or nil if the set is empty.
# A lazy set builds only the single Element it needs.
def last
  unless @lazy_ids
    tail = @nodes.last
    return tail && Node.new(@doc, tail)
  end

  tail_id = @lazy_ids.ids.last
  return nil unless tail_id
  Node.new(@doc, Scrapetor::Native::Element.new(@lazy_ids.native, tail_id, @lazy_ids.wrapper))
end

#map ⇒ Object



88
89
90
91
92
93
94
95
96
97
# File 'lib/scrapetor/node_set.rb', line 88

# Yield each member wrapped in a Node and collect the block's results
# into an Array. Without a block, returns an Enumerator. A lazy set
# builds one Element per id and does not materialize the backing list.
def map
  return enum_for(:map) unless block_given?
  return @nodes.map { |backing| yield Node.new(@doc, backing) } unless @lazy_ids

  wrapper = @lazy_ids.wrapper
  native = @lazy_ids.native
  @lazy_ids.ids.map do |id|
    yield Node.new(@doc, Scrapetor::Native::Element.new(native, id, wrapper))
  end
end

#materialize ⇒ Object

Force the lazy-ids path to allocate its Element wrappers. Used by operations that need the original backing nodes (set algebra, +/-/&, removal).



185
186
187
188
189
190
# File 'lib/scrapetor/node_set.rb', line 185

# Convert a lazy set into a concrete one: build an Element for every
# id, store the list in @nodes and drop the lazy handle. Calling it on
# an already-concrete set just returns @nodes.
def materialize
  if @lazy_ids
    @nodes = @lazy_ids.ids.map do |id|
      Scrapetor::Native::Element.new(@lazy_ids.native, id, @lazy_ids.wrapper)
    end
    @lazy_ids = nil
  end
  @nodes
end

#pop ⇒ Object



260
261
262
263
264
# File 'lib/scrapetor/node_set.rb', line 260

# Remove the last backing node from the set and return it wrapped in a
# Node (nil when the set was empty). Materializes a lazy set first.
def pop
  materialize
  popped = @nodes.pop
  popped ? Node.new(@doc, popped) : popped
end

#push(node) ⇒ Object Also known as: <<



253
254
255
256
257
# File 'lib/scrapetor/node_set.rb', line 253

# Append +node+ to the set and return the set itself, so calls can be
# chained. A Node wrapper is stored as its backing node; anything else
# is stored unchanged. Materializes a lazy set first.
def push(node)
  materialize
  backing = node.is_a?(Node) ? node.backing_node : node
  @nodes << backing
  self
end

#reject ⇒ Object



238
239
240
241
242
243
244
245
246
# File 'lib/scrapetor/node_set.rb', line 238

# New set (same class, same document) of the members for which the
# block returns a falsy value. The block receives each member wrapped
# in a Node. Without a block, returns an Enumerator.
def reject
  return enum_for(:reject) unless block_given?

  survivors = backing_nodes.reject do |backing|
    yield(Node.new(@doc, backing))
  end
  self.class.new(@doc, survivors)
end

#remove ⇒ Object Also known as: unlink

—– Bulk mutation passthroughs —–

Nokogiri NodeSet exposes a handful of bulk operations that map onto iterating the underlying nodes. We keep parity so callers can do `doc.css('br').remove` etc. without crashing.



198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
# File 'lib/scrapetor/node_set.rb', line 198

# Remove every member of the set and return the set itself.
def remove
  # Phase 1: promote every backing node that supports it while the tree
  # is still intact. Promotion must finish for all members before any
  # removal happens, because the first deletion invalidates the
  # position-index paths the Native::Element fallback relies on.
  promoted = backing_nodes.map do |backing|
    backing.respond_to?(:promote_to_dom!) ? backing.promote_to_dom! : backing
  end

  # Phase 2: remove. Targets without their own #remove go through a
  # Node wrapper.
  promoted.each do |target|
    victim = target.respond_to?(:remove) ? target : Node.new(@doc, target)
    victim.remove
  end

  self
end

#reverse ⇒ Object



159
160
161
# File 'lib/scrapetor/node_set.rb', line 159

# New set (same class, same document) with the members in reverse
# order. The receiver is left as it was.
def reverse
  flipped = backing_nodes.reverse
  self.class.new(@doc, flipped)
end

#select ⇒ Object Also known as: filter



227
228
229
230
231
232
233
234
235
# File 'lib/scrapetor/node_set.rb', line 227

# New set (same class, same document) of the members for which the
# block returns a truthy value. The block receives each member wrapped
# in a Node. Without a block, returns an Enumerator.
def select
  return enum_for(:select) unless block_given?

  chosen = backing_nodes.select do |backing|
    yield(Node.new(@doc, backing))
  end
  self.class.new(@doc, chosen)
end

#shift ⇒ Object



266
267
268
269
270
# File 'lib/scrapetor/node_set.rb', line 266

# Remove the first backing node from the set and return it wrapped in a
# Node (nil when the set was empty). Materializes a lazy set first.
def shift
  materialize
  shifted = @nodes.shift
  shifted ? Node.new(@doc, shifted) : shifted
end

#size ⇒ Object Also known as: length, count



78
79
80
# File 'lib/scrapetor/node_set.rb', line 78

# Number of members. Counts the id list of a lazy set directly, so no
# Elements are built.
def size
  source = @lazy_ids ? @lazy_ids.ids : @nodes
  source.size
end

#text ⇒ Object Also known as: inner_text, content



99
100
101
# File 'lib/scrapetor/node_set.rb', line 99

# Concatenated text of every backing node, in set order, with no
# separator. An empty set yields an empty String.
def text
  pieces = backing_nodes.map { |backing| backing.text }
  pieces.join
end

#to_a ⇒ Object Also known as: to_ary



168
169
170
# File 'lib/scrapetor/node_set.rb', line 168

# The members as a plain Array, built by passing each one through +map+
# unchanged.
def to_a
  map(&:itself)
end

#to_html ⇒ Object Also known as: inner_html, to_s



148
149
150
# File 'lib/scrapetor/node_set.rb', line 148

# Concatenated markup of every backing node, in set order. Nodes that
# do not respond to +to_html+ contribute their +to_s+ instead.
def to_html
  fragments = backing_nodes.map do |backing|
    backing.respond_to?(:to_html) ? backing.to_html : backing.to_s
  end
  fragments.join
end