Class: Scrapetor::Dom::Document
- Inherits:
-
Object
- Object
- Scrapetor::Dom::Document
show all
- Includes:
- NodeMethods
- Defined in:
- lib/scrapetor/dom.rb
Instance Attribute Summary collapse
Attributes included from NodeMethods
#parent
Instance Method Summary
collapse
#add_next_sibling, #add_previous_sibling, #comment?, #doctype?, #document, #next_element_sibling, #next_sibling, #previous_element_sibling, #previous_sibling, #remove, #replace, #text?
Constructor Details
Returns a new instance of Document.
424
425
426
427
428
429
430
431
|
# File 'lib/scrapetor/dom.rb', line 424
# Sets up an empty document: an empty child list, no doctype, no parent, and
# all three lookup indexes (class / tag / id) left unset.
def initialize
  @children, @doctype, @parent = [], nil, nil
  @class_index, @tag_index, @id_index = nil, nil, nil
end
|
Instance Attribute Details
#children ⇒ Object
Returns the value of attribute children.
422
423
424
|
# File 'lib/scrapetor/dom.rb', line 422
# Reader for the document's direct child nodes (the object held in @children).
def children; @children; end
|
#doctype ⇒ Object
Returns the value of attribute doctype.
422
423
424
|
# File 'lib/scrapetor/dom.rb', line 422
# Reader for the stored doctype value; nil when none has been set.
def doctype; @doctype; end
|
Instance Method Details
#add_child(node_or_html) ⇒ Object
512
513
514
515
516
|
# File 'lib/scrapetor/dom.rb', line 512
# Appends one or more nodes to the end of the document.
#
# The argument is first passed through Dom.normalize_replacement (with this
# document as +parent:+). Every node in the result is re-parented to the
# document and pushed onto the child list, in order.
#
# @param node_or_html the value handed to Dom.normalize_replacement
#   (presumably a node or an HTML string -- confirm against that helper)
# @return the last node of the normalized list (nil if that list is empty)
def add_child(node_or_html)
  nodes = Dom.normalize_replacement(node_or_html, parent: self)
  nodes.each do |node|
    node.parent = self
    @children << node
  end
  # The memoized class/tag/id indexes were built from the previous tree; drop
  # them so the next lookup rebuilds and can see the nodes just added.
  @class_index = @tag_index = @id_index = nil
  nodes.last
end
|
#at_css(selector) ⇒ Object
Also known as:
at
504
505
506
|
# File 'lib/scrapetor/dom.rb', line 504
# Runs #css with the given selector and returns only the first result
# (nil when the result set is empty).
def at_css(selector)
  matches = css(selector)
  matches.first
end
|
#at_xpath(expr) ⇒ Object
510
|
# File 'lib/scrapetor/dom.rb', line 510
# Runs #xpath with the given expression and returns only the first result
# (nil when the result set is empty).
def at_xpath(expr)
  results = xpath(expr)
  results.first
end
|
#body ⇒ Object
492
493
494
|
# File 'lib/scrapetor/dom.rb', line 492
# Returns the first "body" match found by querying each element child of the
# document with css("body"); non-element children are skipped. nil if nothing
# matches.
def body
  matches = @children.flat_map do |child|
    next [] unless child.element?

    child.css("body")
  end
  matches.first
end
|
#build_indexes! ⇒ Object
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
|
# File 'lib/scrapetor/dom.rb', line 455
# Visits every element below the document once and (re)builds the three
# lookup indexes, overwriting whatever was cached before:
#
# * class index: class token  => elements whose "class" attribute has it
# * tag index:   element name => elements with that name
# * id index:    id value     => the first element carrying that id
#
# Elements are recorded in pre-order (a parent before its descendants,
# siblings left to right). The walk uses an explicit stack rather than
# recursion so that very deeply nested markup cannot raise SystemStackError.
#
# @return [Hash] { class: ..., tag: ..., id: ... } holding the same hash
#   objects that are stored in the instance variables
def build_indexes!
  cls = Hash.new { |h, k| h[k] = [] }
  tag = Hash.new { |h, k| h[k] = [] }
  ids = {}

  pending = []
  # Pushes the element children of +node+ in reverse, so the leftmost child
  # is the next one popped. Nodes without #children contribute nothing.
  enqueue = lambda do |node|
    next unless node.respond_to?(:children)

    elements = []
    node.children.each { |c| elements << c if c.element? }
    pending.concat(elements.reverse)
  end

  enqueue.call(self)
  until pending.empty?
    el = pending.pop
    tag[el.name] << el

    id_attr = el["id"]
    # First occurrence wins when an id value is duplicated.
    ids[id_attr] ||= el if id_attr && !id_attr.empty?

    class_attr = el["class"]
    if class_attr
      # Leading whitespace makes split yield an empty first token; skip it.
      class_attr.split(/\s+/).each { |t| cls[t] << el unless t.empty? }
    end

    enqueue.call(el)
  end

  @class_index = cls
  @tag_index = tag
  @id_index = ids
  { class: cls, tag: tag, id: ids }
end
|
#class_index ⇒ Object
Lazy structural indexes. Built on first access during a fallback selector evaluation so the per-query candidate set drops from “every element in document order” to “elements that already carry the anchor class / tag / id”. On a 100KB document with ~5000 elements that’s the difference between a 5ms walk and a ~50µs lookup.
443
444
445
|
# File 'lib/scrapetor/dom.rb', line 443
# Memoized class-token index. When nothing is cached yet, runs build_indexes!
# and keeps the :class entry of its result.
def class_index
  return @class_index if @class_index

  @class_index = build_indexes![:class]
end
|
#css(selector) ⇒ Object
500
501
502
|
# File 'lib/scrapetor/dom.rb', line 500
# Evaluates a CSS selector with this document as the search context by
# handing both to Dom::Selectors.css and returning its result unchanged.
def css(selector)
  engine = Dom::Selectors
  engine.css(self, selector)
end
|
#document? ⇒ Boolean
434
|
# File 'lib/scrapetor/dom.rb', line 434
# A Document always identifies itself as a document node.
def document?
  true
end
|
#element? ⇒ Boolean
433
|
# File 'lib/scrapetor/dom.rb', line 433
# A Document never identifies itself as an element node.
def element?
  false
end
|
#head ⇒ Object
488
489
490
|
# File 'lib/scrapetor/dom.rb', line 488
# Returns the first "head" match found by querying each element child of the
# document with css("head"); non-element children are skipped. nil if nothing
# matches.
def head
  matches = @children.flat_map do |child|
    next [] unless child.element?

    child.css("head")
  end
  matches.first
end
|
#html_element ⇒ Object
484
485
486
|
# File 'lib/scrapetor/dom.rb', line 484
# Returns the first direct child that is an element named "html"; when there
# is none, falls back to #root.
def html_element
  @children.each do |child|
    return child if child.element? && child.name == "html"
  end
  root
end
|
#id_index ⇒ Object
451
452
453
|
# File 'lib/scrapetor/dom.rb', line 451
# Memoized id index. When nothing is cached yet, runs build_indexes! and keeps
# the :id entry of its result.
def id_index
  return @id_index if @id_index

  @id_index = build_indexes![:id]
end
|
#name ⇒ Object
435
|
# File 'lib/scrapetor/dom.rb', line 435
# Node name reported by a Document: the fixed string "#document".
def name
  "#document"
end
|
#root ⇒ Object
480
481
482
|
# File 'lib/scrapetor/dom.rb', line 480
# Returns the first direct child for which element? is truthy, or nil when the
# document has no such child.
def root
  @children.find { |child| child.element? }
end
|
#tag_index ⇒ Object
447
448
449
|
# File 'lib/scrapetor/dom.rb', line 447
# Memoized tag-name index. When nothing is cached yet, runs build_indexes! and
# keeps the :tag entry of its result.
def tag_index
  return @tag_index if @tag_index

  @tag_index = build_indexes![:tag]
end
|
#text ⇒ Object
496
497
498
|
# File 'lib/scrapetor/dom.rb', line 496
# Concatenates the #text of every direct child, in order, into one string.
def text
  pieces = @children.map { |child| child.text }
  pieces.join
end
|
#to_html ⇒ Object
Also known as:
to_s
518
519
520
521
522
523
|
# File 'lib/scrapetor/dom.rb', line 518
# Serializes the document: an optional "<!DOCTYPE ...>" prefix (only when a
# doctype is set) followed by the #to_html output of each child, in order.
# Always returns a fresh, mutable string.
def to_html
  html = +""
  if @doctype
    html << "<!DOCTYPE #{@doctype}>"
  end
  # String#<< returns its receiver, so the buffer itself is the fold result.
  @children.reduce(html) { |buffer, child| buffer << child.to_html }
end
|
#traverse {|_self| ... } ⇒ Object
526
527
528
529
530
531
532
533
534
535
536
537
|
# File 'lib/scrapetor/dom.rb', line 526
# Walks the document, handing nodes to the block: first the document itself,
# then each direct child in order. A child that responds to #traverse is asked
# to walk itself with the same block; any other child is yielded directly.
#
# @return [Enumerator] when called without a block
# @return [self] when called with a block
def traverse(&block)
  return to_enum(:traverse) if block.nil?

  block.call(self)
  @children.each do |child|
    next child.traverse(&block) if child.respond_to?(:traverse)

    block.call(child)
  end
  self
end
|
#xpath(_expr) ⇒ Object
509
|
# File 'lib/scrapetor/dom.rb', line 509
# XPath stub: the expression is ignored and a new empty array is returned on
# every call.
def xpath(_expr)
  []
end
|