Class: Scrapetor::Dom::Document

Inherits:
Object
  • Object
show all
Includes:
NodeMethods
Defined in:
lib/scrapetor/dom.rb

Instance Attribute Summary collapse

Attributes included from NodeMethods

#parent

Instance Method Summary collapse

Methods included from NodeMethods

#add_next_sibling, #add_previous_sibling, #comment?, #doctype?, #document, #next_element_sibling, #next_sibling, #previous_element_sibling, #previous_sibling, #remove, #replace, #text?

Constructor Details

#initialize ⇒ Document

Returns a new instance of Document.



424
425
426
427
428
429
430
431
# File 'lib/scrapetor/dom.rb', line 424

# Sets up an empty document: no children, no doctype, no parent, and no
# lookup indexes yet (the index readers build them on first use).
def initialize
  # Tree state.
  @children = []
  @parent = @doctype = nil
  # Index caches; chained right-to-left so the variables are created in the
  # order class, tag, id.
  @id_index = @tag_index = @class_index = nil
end

Instance Attribute Details

#children ⇒ Object

Returns the value of attribute children.



422
423
424
# File 'lib/scrapetor/dom.rb', line 422

# Reader for the collection of direct child nodes held by this document.
def children; @children; end

#doctype ⇒ Object

Returns the value of attribute doctype.



422
423
424
# File 'lib/scrapetor/dom.rb', line 422

# Reader for the stored doctype; nil when the document has none.
def doctype; @doctype; end

Instance Method Details

#add_child(node_or_html) ⇒ Object



512
513
514
515
516
# File 'lib/scrapetor/dom.rb', line 512

# Appends new children to the end of the document.
#
# The argument is passed to Dom.normalize_replacement, which returns the list
# of nodes to attach. Each node is re-parented to this document and appended
# in order. The cached class/tag/id indexes are then dropped, because they
# were built from the old tree and would not contain the new nodes; the index
# readers rebuild them on next access.
#
# @param node_or_html [Object] anything Dom.normalize_replacement accepts
#   (presumably a node or an HTML string -- confirm against that helper)
# @return [Object, nil] the last node appended, or nil if nothing was added
def add_child(node_or_html)
  nodes = Dom.normalize_replacement(node_or_html, parent: self)
  nodes.each do |node|
    node.parent = self
    @children << node
  end
  # Invalidate the lazily built indexes so they never describe a stale tree.
  @class_index = @tag_index = @id_index = nil
  nodes.last
end

#at_css(selector) ⇒ Object Also known as: at



504
505
506
# File 'lib/scrapetor/dom.rb', line 504

# Runs the CSS query and returns only its first result (nil when the query
# produces nothing).
def at_css(selector)
  matches = css(selector)
  matches.first
end

#at_xpath(expr) ⇒ Object



510
# File 'lib/scrapetor/dom.rb', line 510

# First result of #xpath for the expression, or nil when there is none.
def at_xpath(expr)
  results = xpath(expr)
  results.first
end

#body ⇒ Object



492
493
494
# File 'lib/scrapetor/dom.rb', line 492

# Queries every top-level element child for "body" and returns the first hit
# across all of them; non-element children are skipped. nil when none match.
def body
  found = @children.flat_map do |child|
    next [] unless child.element?

    child.css("body")
  end
  found.first
end

#build_indexes! ⇒ Object



455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
# File 'lib/scrapetor/dom.rb', line 455

# Rebuilds the class, tag and id lookup tables in one depth-first walk over
# the element tree and caches them on the document.
#
# * tag index:   element name => elements with that name, in walk order
# * class index: class token  => elements carrying that token, in walk order
#                (each element appears at most once per token)
# * id index:    id value     => the first element encountered with that id
#
# Only element children are indexed and descended into.
#
# @return [Hash] the fresh tables under the keys :class, :tag and :id
def build_indexes!
  cls = Hash.new { |h, k| h[k] = [] }
  tag = Hash.new { |h, k| h[k] = [] }
  ids = {}
  walk = ->(node) {
    return unless node.respond_to?(:children)
    node.children.each do |c|
      next unless c.element?
      tag[c.name] << c
      id_attr = c["id"]
      # First occurrence wins when an id is duplicated in the document.
      ids[id_attr] ||= c if id_attr && !id_attr.empty?
      class_attr = c["class"]
      if class_attr
        # Leading whitespace yields an empty first token, and a repeated
        # token (class="btn btn") must not list the same element twice.
        class_attr.split(/\s+/).reject(&:empty?).uniq.each { |t| cls[t] << c }
      end
      walk.call(c)
    end
  }
  walk.call(self)
  @class_index = cls
  @tag_index = tag
  @id_index = ids
  { class: cls, tag: tag, id: ids }
end

#class_index ⇒ Object

Lazy structural indexes. Built on first access during a fallback selector evaluation so the per-query candidate set drops from “every element in document order” to “elements that already carry the anchor class / tag / id”. On a 100KB document with ~5000 elements that’s the difference between a 5ms walk and a ~50µs lookup.



443
444
445
# File 'lib/scrapetor/dom.rb', line 443

# Class-token lookup table, built via #build_indexes! the first time it is
# needed and reused afterwards.
def class_index
  return @class_index if @class_index

  @class_index = build_indexes![:class]
end

#css(selector) ⇒ Object



500
501
502
# File 'lib/scrapetor/dom.rb', line 500

# Evaluates a CSS selector with this document as the search context by
# delegating to Dom::Selectors.
def css(selector)
  engine = Dom::Selectors
  engine.css(self, selector)
end

#document? ⇒ Boolean

Returns:

  • (Boolean)


434
# File 'lib/scrapetor/dom.rb', line 434

# A Document always identifies itself as a document node.
def document?
  true
end

#element? ⇒ Boolean

Returns:

  • (Boolean)


433
# File 'lib/scrapetor/dom.rb', line 433

# A Document is never an element node.
def element?
  false
end

#head ⇒ Object



488
489
490
# File 'lib/scrapetor/dom.rb', line 488

# Queries every top-level element child for "head" and returns the first hit
# across all of them; non-element children are skipped. nil when none match.
def head
  found = @children.flat_map do |child|
    next [] unless child.element?

    child.css("head")
  end
  found.first
end

#html_element ⇒ Object



484
485
486
# File 'lib/scrapetor/dom.rb', line 484

# The top-level element named "html" when there is one; otherwise whatever
# #root returns.
def html_element
  explicit_html = @children.find do |child|
    child.element? && child.name == "html"
  end
  explicit_html || root
end

#id_index ⇒ Object



451
452
453
# File 'lib/scrapetor/dom.rb', line 451

# Id lookup table, built via #build_indexes! the first time it is needed and
# reused afterwards.
def id_index
  return @id_index if @id_index

  @id_index = build_indexes![:id]
end

#name ⇒ Object



435
# File 'lib/scrapetor/dom.rb', line 435

# Fixed node name reported by every document.
def name
  "#document"
end

#root ⇒ Object



480
481
482
# File 'lib/scrapetor/dom.rb', line 480

# First top-level child that is an element, or nil when there is none.
def root
  @children.find { |child| child.element? }
end

#tag_index ⇒ Object



447
448
449
# File 'lib/scrapetor/dom.rb', line 447

# Tag-name lookup table, built via #build_indexes! the first time it is
# needed and reused afterwards.
def tag_index
  return @tag_index if @tag_index

  @tag_index = build_indexes![:tag]
end

#text ⇒ Object



496
497
498
# File 'lib/scrapetor/dom.rb', line 496

# Concatenation of the #text of every direct child, in child order.
def text
  fragments = @children.map { |child| child.text }
  fragments.join
end

#to_html ⇒ Object Also known as: to_s



518
519
520
521
522
523
# File 'lib/scrapetor/dom.rb', line 518

# Serialises the document: an optional <!DOCTYPE ...> prefix followed by the
# #to_html output of each child, in order.
def to_html
  markup = +""
  markup << "<!DOCTYPE #{@doctype}>" if @doctype
  # each_with_object hands back the buffer it was given once iteration ends.
  @children.each_with_object(markup) { |child, buffer| buffer << child.to_html }
end

#traverse {|_self| ... } ⇒ Object

Yields:

  • (_self)

Yield Parameters:



526
527
528
529
530
531
532
533
534
535
536
537
# File 'lib/scrapetor/dom.rb', line 526

# Visits the document first, then each child in order. A child that exposes
# its own #traverse is asked to walk itself with the same block; any other
# child is yielded directly.
#
# @yield [node] every visited node, starting with the document itself
# @return [Enumerator] when called without a block
# @return [self] when called with a block
def traverse(&block)
  return to_enum(:traverse) unless block

  yield self
  @children.each do |child|
    next child.traverse(&block) if child.respond_to?(:traverse)

    yield child
  end
  self
end

#xpath(_expr) ⇒ Object



509
# File 'lib/scrapetor/dom.rb', line 509

# Placeholder XPath query: the expression is ignored and the result is always
# an empty array.
def xpath(_expr)
  []
end