Class: Scrapetor::NodeSet
- Inherits:
-
Object
- Object
- Scrapetor::NodeSet
- Includes:
- Enumerable
- Defined in:
- lib/scrapetor/node_set.rb
Instance Method Summary collapse
- #&(other) ⇒ Object
- #+(other) ⇒ Object
- #-(other) ⇒ Object
- #[](index, length = nil) ⇒ Object (also: #slice)
- #at(selector) ⇒ Object (also: #at_css)
- #attr(name) ⇒ Object (also: #attribute)
- #backing_nodes ⇒ Object
-
#children ⇒ Object
Aggregate of children across all nodes in the set.
- #css(selector) ⇒ Object (also: #search)
- #each ⇒ Object
- #each_with_index ⇒ Object
- #empty? ⇒ Boolean
-
#extract(fields) ⇒ Object
Map every node through its `extract(fields)` extraction.
- #find_all ⇒ Object
- #first ⇒ Object
- #include?(node) ⇒ Boolean
- #index(node) ⇒ Object
-
#initialize(doc, backing_nodes) ⇒ NodeSet
constructor
A new instance of NodeSet.
- #last ⇒ Object
- #map ⇒ Object
-
#materialize ⇒ Object
Force the lazy-ids path to allocate its Element wrappers.
- #pop ⇒ Object
- #push(node) ⇒ Object (also: #<<)
- #reject ⇒ Object
-
#remove ⇒ Object
(also: #unlink)
Bulk mutation passthroughs.
- #reverse ⇒ Object
- #select ⇒ Object (also: #filter)
- #shift ⇒ Object
- #size ⇒ Object (also: #length, #count)
- #text ⇒ Object (also: #inner_text, #content)
- #to_a ⇒ Object (also: #to_ary)
- #to_html ⇒ Object (also: #inner_html, #to_s)
Constructor Details
#initialize(doc, backing_nodes) ⇒ NodeSet
Returns a new instance of NodeSet.
7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 |
# File 'lib/scrapetor/node_set.rb', line 7
#
# Builds a NodeSet for +doc+ on top of +backing_nodes+.
#
# A Scrapetor::Native::DocumentWrapper::LazyIds argument is kept in
# @lazy_ids (with @nodes set to nil); anything else is stored in @nodes.
def initialize(doc, backing_nodes)
  @doc = doc
  # The `defined?` check runs first so construction still works when the
  # native extension isn't loaded (e.g. install-time build failure, or the
  # gem is required before its C extension is in place). Without it a plain
  # NodeSet construction raises NameError on the missing constant, which is
  # the v0.1.x crash a production audit run surfaced.
  lazy_backed = defined?(Scrapetor::Native::DocumentWrapper::LazyIds) &&
                backing_nodes.is_a?(Scrapetor::Native::DocumentWrapper::LazyIds)
  if lazy_backed
    @lazy_ids = backing_nodes
    @nodes = nil
  else
    @nodes = backing_nodes
  end
end
Instance Method Details
#&(other) ⇒ Object
287 288 289 290 |
# File 'lib/scrapetor/node_set.rb', line 287
#
# Set intersection: a new set (same class as the receiver) holding the
# backing nodes that are also present in +other+.
#
# +other+ may be anything responding to +backing_nodes+, or a value that
# `Array()` can coerce. In the coerced case, Node wrappers are unwrapped to
# their backing node (as #include?, #index and #push already do) so that a
# plain array of Nodes is compared against backing nodes rather than
# against wrappers.
def &(other)
  keep =
    if other.respond_to?(:backing_nodes)
      other.backing_nodes
    else
      Array(other).map { |n| n.is_a?(Node) ? n.backing_node : n }
    end
  self.class.new(@doc, backing_nodes & keep)
end
#+(other) ⇒ Object
163 164 165 166 |
# File 'lib/scrapetor/node_set.rb', line 163
#
# Concatenation: a new set (same class as the receiver) holding this set's
# backing nodes followed by those of +other+.
#
# +other+ may be anything responding to +backing_nodes+, or a value that
# `Array()` can coerce. In the coerced case, Node wrappers are unwrapped to
# their backing node (as #include?, #index and #push already do); otherwise
# a wrapper would be stored as a backing node and wrapped a second time on
# iteration.
def +(other)
  other_nodes =
    if other.respond_to?(:backing_nodes)
      other.backing_nodes
    else
      Array(other).map { |n| n.is_a?(Node) ? n.backing_node : n }
    end
  self.class.new(@doc, backing_nodes + other_nodes)
end
#-(other) ⇒ Object
282 283 284 285 |
# File 'lib/scrapetor/node_set.rb', line 282
#
# Set difference: a new set (same class as the receiver) holding this set's
# backing nodes minus those found in +other+.
#
# +other+ may be anything responding to +backing_nodes+, or a value that
# `Array()` can coerce. In the coerced case, Node wrappers are unwrapped to
# their backing node (as #include?, #index and #push already do); a wrapper
# left as-is is compared against backing nodes instead of its own backing
# node.
def -(other)
  drop =
    if other.respond_to?(:backing_nodes)
      other.backing_nodes
    else
      Array(other).map { |n| n.is_a?(Node) ? n.backing_node : n }
    end
  self.class.new(@doc, backing_nodes - drop)
end
#[](index, length = nil) ⇒ Object Also known as: slice
58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 |
# File 'lib/scrapetor/node_set.rb', line 58
#
# Element reference.
#
# * +set[start, length]+ and +set[range]+ return a new set (same class as
#   the receiver) built from the matching slice of the backing nodes, or an
#   empty set when the slice is nil.
# * +set[index]+ returns a Node wrapping the entry at +index+, or a falsy
#   value when there is none. On the lazy-ids path only the requested
#   Element is allocated.
def [](index, length = nil)
  return self.class.new(@doc, backing_nodes[index, length] || []) if length
  return self.class.new(@doc, backing_nodes[index] || []) if index.is_a?(Range)

  unless @lazy_ids
    raw = @nodes[index]
    return raw && Node.new(@doc, raw)
  end

  id = @lazy_ids.ids[index]
  return nil unless id
  Node.new(@doc, Scrapetor::Native::Element.new(@lazy_ids.native, id, @lazy_ids.wrapper))
end
#at(selector) ⇒ Object Also known as: at_css
105 106 107 |
# File 'lib/scrapetor/node_set.rb', line 105
#
# Returns the first truthy result of calling +at(selector)+ on the nodes of
# this set, visiting them in set order, or nil when no node yields one
# (including when the set is empty).
#
# Earlier revisions only consulted the first node (`first&.at(selector)`),
# so a match that lived under any later node was silently missed. Iteration
# stops at the first hit, so later nodes are not queried.
def at(selector)
  each do |node|
    hit = node.at(selector)
    return hit if hit
  end
  nil
end
#attr(name) ⇒ Object Also known as: attribute
154 155 156 |
# File 'lib/scrapetor/node_set.rb', line 154
#
# Reads attribute +name+ from the first node of the set by delegating to
# that node's +attr+. Returns nil when #first returns nil.
def attr(name)
  head = first
  head.nil? ? nil : head.attr(name)
end
#backing_nodes ⇒ Object
177 178 179 180 |
# File 'lib/scrapetor/node_set.rb', line 177
#
# The raw (unwrapped) nodes behind this set. When the set is still on the
# lazy-ids path this calls #materialize first and returns its result.
def backing_nodes
  @lazy_ids ? materialize : @nodes
end
#children ⇒ Object
Aggregate of children across all nodes in the set. Mirrors Nokogiri’s NodeSet#children — every child of every node, including text and comment nodes, flattened into a single NodeSet. Pulls children straight from the backing element (rather than going through Node#children, which filters to elements only) so callers that iterate mixed-content can still see the text segments.
137 138 139 140 141 142 143 144 145 146 |
# File 'lib/scrapetor/node_set.rb', line 137
#
# Flattens the children of every backing node into one NodeSet.
#
# Children are read from the backing element directly; backing nodes that
# do not respond to +children+ are skipped. A children collection that
# responds to +to_a+ is converted before being appended.
def children
  gathered = backing_nodes.each_with_object([]) do |element, acc|
    next unless element.respond_to?(:children)
    kids = element.children
    kids = kids.to_a if kids.respond_to?(:to_a)
    kids.each { |kid| acc << kid }
  end
  NodeSet.new(@doc, gathered)
end
#css(selector) ⇒ Object Also known as: search
110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 |
# File 'lib/scrapetor/node_set.rb', line 110
#
# Runs +selector+ against every backing node that responds to +css+ and
# concatenates the hits in set order.
#
# Returns a plain Array when the selector ends in one of the recognised
# pseudo-elements (see the pattern below); otherwise a NodeSet.
def css(selector)
  # Decide the result shape from the selector itself, up front. Inferring
  # it from the shape of the hits (the previous approach) misclassifies
  # zero-match queries as string-shaped and breaks `.at_css` chained off an
  # empty NodeSet.
  pe = selector.to_s
  string_result = pe.include?("::") &&
                  pe =~ /::(?:text|attr\([^)]+\)|first-letter|first-line|before|after)\s*\z/i

  hits = backing_nodes.each_with_object([]) do |element, acc|
    next unless element.respond_to?(:css)
    found = element.css(selector)
    found = found.to_a if found.respond_to?(:to_a)
    found.each { |hit| acc << hit }
  end

  string_result ? hits : NodeSet.new(@doc, hits)
end
#each ⇒ Object
23 24 25 26 27 28 29 30 31 32 33 34 |
# File 'lib/scrapetor/node_set.rb', line 23
#
# Yields every member of the set wrapped in a Node. Without a block an
# enumerator is returned. On the lazy-ids path one Native::Element is
# allocated per id as it is yielded.
def each
  return enum_for(:each) unless block_given?

  unless @lazy_ids
    return @nodes.each { |raw| yield Node.new(@doc, raw) }
  end

  wrapper = @lazy_ids.wrapper
  native = @lazy_ids.native
  @lazy_ids.ids.each do |id|
    yield Node.new(@doc, Scrapetor::Native::Element.new(native, id, wrapper))
  end
end
#each_with_index ⇒ Object
222 223 224 225 |
# File 'lib/scrapetor/node_set.rb', line 222
#
# Yields each member (wrapped in a Node) together with its zero-based
# position. Without a block an enumerator is returned.
def each_with_index
  if block_given?
    backing_nodes.each_with_index do |raw, position|
      yield Node.new(@doc, raw), position
    end
  else
    enum_for(:each_with_index)
  end
end
#empty? ⇒ Boolean
84 85 86 |
# File 'lib/scrapetor/node_set.rb', line 84
#
# True when the set has no members. Answers from the id list on the
# lazy-ids path, so nothing is materialized.
def empty?
  source = @lazy_ids ? @lazy_ids.ids : @nodes
  source.empty?
end
#extract(fields) ⇒ Object
Map every node through its `extract(fields)` extraction. Lets the standard SERP-result pattern collapse to:
doc.css(".result").extract(title: ".t", price: ".p")
# => [{title: ..., price: ...}, ...]
297 298 299 |
# File 'lib/scrapetor/node_set.rb', line 297
#
# Calls +extract(fields)+ on every node of the set and returns the results
# in set order, e.g.
#
#   doc.css(".result").extract(title: ".t", price: ".p")
#   # => [{title: ..., price: ...}, ...]
def extract(fields)
  map do |node|
    node.extract(fields)
  end
end
#find_all ⇒ Object
248 249 250 251 |
# File 'lib/scrapetor/node_set.rb', line 248
#
# Block form of filtering: forwards every node to the given block through
# #select and returns what #select returns. Without a block an enumerator
# is returned.
def find_all
  if block_given?
    select { |node| yield(node) }
  else
    enum_for(:find_all)
  end
end
#first ⇒ Object
36 37 38 39 40 41 42 43 44 45 |
# File 'lib/scrapetor/node_set.rb', line 36
#
# Returns the first member of the set, or the first +n+ members.
#
# * +first+ returns a Node wrapping the first entry, or a falsy value when
#   the set is empty.
# * +first(n)+ returns an Array of at most +n+ Nodes, in set order.
#
# The class includes Enumerable, whose +first+ accepts an optional count;
# the earlier zero-arity override made `set.first(3)` raise ArgumentError.
# The count form restores that contract and leaves the no-argument form
# unchanged. On the lazy-ids path only the requested Elements are allocated.
#
# Raises whatever Array#first raises for an invalid +n+ (e.g. a negative
# count).
def first(n = nil)
  unless n.nil?
    if @lazy_ids
      native = @lazy_ids.native
      wrapper = @lazy_ids.wrapper
      return @lazy_ids.ids.first(n).map do |id|
        Node.new(@doc, Scrapetor::Native::Element.new(native, id, wrapper))
      end
    end
    return @nodes.first(n).map { |raw| Node.new(@doc, raw) }
  end

  if @lazy_ids
    id = @lazy_ids.ids.first
    return nil unless id
    Node.new(@doc, Scrapetor::Native::Element.new(@lazy_ids.native, id, @lazy_ids.wrapper))
  else
    raw = @nodes.first
    raw && Node.new(@doc, raw)
  end
end
#include?(node) ⇒ Boolean
277 278 279 280 |
# File 'lib/scrapetor/node_set.rb', line 277
#
# True when +node+ is a member of the set. A Node wrapper is unwrapped to
# its backing node before the lookup; any other value is looked up as-is.
def include?(node)
  needle = node
  needle = node.backing_node if node.is_a?(Node)
  backing_nodes.include?(needle)
end
#index(node) ⇒ Object
272 273 274 275 |
# File 'lib/scrapetor/node_set.rb', line 272
#
# Position of +node+ within the set, or nil when it is not a member. A Node
# wrapper is unwrapped to its backing node before the lookup.
def index(node)
  needle = node
  needle = node.backing_node if node.is_a?(Node)
  backing_nodes.index(needle)
end
#last ⇒ Object
47 48 49 50 51 52 53 54 55 56 |
# File 'lib/scrapetor/node_set.rb', line 47
#
# Returns a Node wrapping the last entry of the set, or a falsy value when
# the set is empty. On the lazy-ids path only that one Element is
# allocated.
def last
  unless @lazy_ids
    raw = @nodes.last
    return raw && Node.new(@doc, raw)
  end

  id = @lazy_ids.ids.last
  return nil unless id
  Node.new(@doc, Scrapetor::Native::Element.new(@lazy_ids.native, id, @lazy_ids.wrapper))
end
#map ⇒ Object
88 89 90 91 92 93 94 95 96 97 |
# File 'lib/scrapetor/node_set.rb', line 88
#
# Yields every member wrapped in a Node and returns an Array of the block's
# results. Without a block an enumerator is returned. On the lazy-ids path
# Elements are allocated per id while mapping.
def map
  return enum_for(:map) unless block_given?

  unless @lazy_ids
    return @nodes.map { |raw| yield Node.new(@doc, raw) }
  end

  wrapper = @lazy_ids.wrapper
  native = @lazy_ids.native
  @lazy_ids.ids.map do |id|
    yield Node.new(@doc, Scrapetor::Native::Element.new(native, id, wrapper))
  end
end
#materialize ⇒ Object
Force the lazy-ids path to allocate its Element wrappers. Used by operations that need the original backing nodes (set algebra, +/-/&, removal).
185 186 187 188 189 190 |
# File 'lib/scrapetor/node_set.rb', line 185
#
# Converts a lazy-ids set into a plain array of Native::Element wrappers,
# stores it in @nodes, clears @lazy_ids and returns the array. When the set
# is not lazy, @nodes is returned untouched.
def materialize
  if @lazy_ids
    lazy = @lazy_ids
    @nodes = lazy.ids.map do |id|
      Scrapetor::Native::Element.new(lazy.native, id, lazy.wrapper)
    end
    @lazy_ids = nil
  end
  @nodes
end
#pop ⇒ Object
260 261 262 263 264 |
# File 'lib/scrapetor/node_set.rb', line 260
#
# Removes the last entry from the set and returns it wrapped in a Node, or
# a falsy value when the set is empty. Materializes a lazy set first.
def pop
  materialize
  raw = @nodes.pop
  return raw unless raw
  Node.new(@doc, raw)
end
#push(node) ⇒ Object Also known as: <<
253 254 255 256 257 |
# File 'lib/scrapetor/node_set.rb', line 253
#
# Appends +node+ to the set in place and returns the set itself. A Node
# wrapper is unwrapped to its backing node; any other value is stored
# as-is. Materializes a lazy set first.
def push(node)
  materialize
  raw = node.is_a?(Node) ? node.backing_node : node
  @nodes << raw
  self
end
#reject ⇒ Object
238 239 240 241 242 243 244 245 246 |
# File 'lib/scrapetor/node_set.rb', line 238
#
# Returns a new set (same class as the receiver) with the members for
# which the block returns a falsy value. The block receives each member
# wrapped in a Node. Without a block an enumerator is returned.
def reject
  return enum_for(:reject) unless block_given?

  survivors = backing_nodes.reject do |raw|
    yield(Node.new(@doc, raw))
  end
  self.class.new(@doc, survivors)
end
#remove ⇒ Object Also known as: unlink
Bulk mutation passthroughs.
Nokogiri NodeSet exposes a handful of bulk operations that map onto iterating the underlying nodes. We keep parity so callers can do `doc.css('br').remove` etc. without crashing.
198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 |
# File 'lib/scrapetor/node_set.rb', line 198
#
# Removes every member of the set from its document and returns the set.
def remove
  # Phase one: promote every backing node to its Dom equivalent while the
  # tree is still intact, so path-based lookup resolves correctly. A naive
  # "iterate + remove" works on a mutable Dom but invalidates the
  # position-index paths the Native::Element fallback relies on after the
  # first deletion.
  targets = backing_nodes.map do |raw|
    raw.respond_to?(:promote_to_dom!) ? raw.promote_to_dom! : raw
  end

  # Phase two: remove. Targets without their own #remove are removed
  # through a Node wrapper.
  targets.each do |target|
    remover = target.respond_to?(:remove) ? target : Node.new(@doc, target)
    remover.remove
  end
  self
end
#reverse ⇒ Object
159 160 161 |
# File 'lib/scrapetor/node_set.rb', line 159
#
# Returns a new set (same class as the receiver) with the members in
# reverse order. The receiver is left unchanged.
def reverse
  flipped = backing_nodes.reverse
  self.class.new(@doc, flipped)
end
#select ⇒ Object Also known as: filter
227 228 229 230 231 232 233 234 235 |
# File 'lib/scrapetor/node_set.rb', line 227
#
# Returns a new set (same class as the receiver) with the members for
# which the block returns a truthy value. The block receives each member
# wrapped in a Node. Without a block an enumerator is returned.
def select
  return enum_for(:select) unless block_given?

  chosen = backing_nodes.select do |raw|
    yield(Node.new(@doc, raw))
  end
  self.class.new(@doc, chosen)
end
#shift ⇒ Object
266 267 268 269 270 |
# File 'lib/scrapetor/node_set.rb', line 266
#
# Removes the first entry from the set and returns it wrapped in a Node, or
# a falsy value when the set is empty. Materializes a lazy set first.
def shift
  materialize
  raw = @nodes.shift
  return raw unless raw
  Node.new(@doc, raw)
end
#size ⇒ Object Also known as: length, count
78 79 80 |
# File 'lib/scrapetor/node_set.rb', line 78
#
# Number of members in the set. Answers from the id list on the lazy-ids
# path, so nothing is materialized.
def size
  source = @lazy_ids ? @lazy_ids.ids : @nodes
  source.size
end
#text ⇒ Object Also known as: inner_text, content
99 100 101 |
# File 'lib/scrapetor/node_set.rb', line 99
#
# Concatenates the +text+ of every backing node, in set order, with no
# separator.
def text
  pieces = backing_nodes.map { |raw| raw.text }
  pieces.join
end
#to_a ⇒ Object Also known as: to_ary
168 169 170 |
# File 'lib/scrapetor/node_set.rb', line 168
#
# Returns the members of the set as a plain Array, each one as #map yields
# it.
def to_a
  map(&:itself)
end
#to_html ⇒ Object Also known as: inner_html, to_s
148 149 150 |
# File 'lib/scrapetor/node_set.rb', line 148
#
# Concatenates the markup of every backing node, in set order, with no
# separator. Backing nodes that do not respond to +to_html+ contribute
# their +to_s+ instead.
def to_html
  fragments = backing_nodes.map do |raw|
    next raw.to_html if raw.respond_to?(:to_html)
    raw.to_s
  end
  fragments.join
end