Module: Scrapetor::Dom::Parser

Defined in:
lib/scrapetor/dom/parser.rb

Overview

Build a Dom::Document from raw HTML via the SAX tokenizer.

Constant Summary collapse

VOID_TAGS =
Scrapetor::Dom::VOID.to_h { |t| [t, true] }.freeze

Class Method Summary collapse

Class Method Details

.fragment(html) ⇒ Object

Parse a fragment — return an Array of nodes (no Document wrapper).



42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# File 'lib/scrapetor/dom/parser.rb', line 42

# Parse +html+ as a fragment and return its top-level nodes.
#
# Events from the SAX tokenizer are replayed onto a stack of currently open
# elements. A synthetic "__fragment__" element sits at the bottom of the
# stack so top-level nodes have somewhere to attach; it is never returned.
#
# @param html [Object] input handed unchanged to Scrapetor::SAX::Tokenizer.new
# @return [Object] the synthetic root's children collection, with each
#   node's parent reset to nil
def self.fragment(html)
  root = Element.new("__fragment__")
  open_nodes = [root]
  tokenizer = Scrapetor::SAX::Tokenizer.new(html)

  tokenizer.each_event do |event|
    kind, *payload = event

    case kind
    when :text
      open_nodes.last.add_child(Text.new(payload.first))
    when :comment
      open_nodes.last.add_child(Comment.new(payload.first))
    when :start
      tag, attributes = payload
      node = Element.new(tag, attributes || {})
      open_nodes.last.add_child(node)
      # Void tags cannot contain children, so they are never left open.
      open_nodes.push(node) unless VOID_TAGS[node.name]
    when :end
      tag = payload.first
      depth = open_nodes.rindex do |candidate|
        candidate.is_a?(Element) && candidate.name == tag
      end
      # Unmatched close tags are ignored; depth 0 is the synthetic root,
      # which must stay on the stack even if the input closes its name.
      if depth && depth > 0
        # Drop the matching element and everything opened after it.
        open_nodes.pop(open_nodes.length - depth)
      end
    end
  end

  # Detach the results from the synthetic root before handing them back.
  root.children.tap do |top_level|
    top_level.each { |child| child.parent = nil }
  end
end

.parse(html) ⇒ Object



9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# File 'lib/scrapetor/dom/parser.rb', line 9

# Build a Dom::Document from +html+ by replaying SAX tokenizer events.
#
# A stack of open nodes, with the document at the bottom, tracks where the
# next child should be attached.
#
# @param html [Object] input handed unchanged to Scrapetor::SAX::Tokenizer.new
# @return [Dom::Document] the populated document
def self.parse(html)
  document = Dom::Document.new
  open_nodes = [document]

  Scrapetor::SAX::Tokenizer.new(html).each_event do |event|
    kind, *payload = event

    # :doc_start, :doc_end and any other event kinds need no handling here.
    case kind
    when :doctype
      document.doctype = payload.first
    when :text
      open_nodes.last.add_child(Text.new(payload.first))
    when :comment
      open_nodes.last.add_child(Comment.new(payload.first))
    when :start
      tag, attributes = payload
      node = Element.new(tag, attributes || {})
      open_nodes.last.add_child(node)
      # Void tags cannot contain children, so they are never left open.
      open_nodes.push(node) unless VOID_TAGS[node.name]
    when :end
      tag = payload.first
      depth = open_nodes.rindex do |candidate|
        candidate.is_a?(Element) && candidate.name == tag
      end
      # Close the nearest matching element together with everything opened
      # after it; a close tag with no open match is ignored.
      open_nodes.pop(open_nodes.length - depth) if depth
    end
  end

  document
end