Module: Scrapetor::Dom::Parser
- Defined in:
- lib/scrapetor/dom/parser.rb
Overview
Build a Dom::Document from raw HTML via the SAX tokenizer.
Constant Summary collapse
Class Method Summary collapse
- .fragment(html) ⇒ Object
Parse a fragment — return an Array of nodes (no Document wrapper).
- .parse(html) ⇒ Object
Class Method Details
.fragment(html) ⇒ Object
Parse a fragment — return an Array of nodes (no Document wrapper).
42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 |
# File 'lib/scrapetor/dom/parser.rb', line 42
#
# Parse an HTML fragment and return its top-level nodes without wrapping
# them in a Document.
#
# Nodes are collected under a temporary "__fragment__" element while the
# tokenizer events are replayed; that element's children are then detached
# (their parent is set to nil) and returned.
#
# @param html raw HTML handed straight to Scrapetor::SAX::Tokenizer
#   (presumably a String -- confirm against the tokenizer)
# @return the children collection of the temporary wrapper element
def self.fragment(html)
  root = Element.new("__fragment__")
  open_nodes = [root]

  Scrapetor::SAX::Tokenizer.new(html).each_event do |event|
    kind, *payload = event

    case kind
    when :start
      tag_name, attributes = payload
      node = Element.new(tag_name, attributes || {})
      open_nodes.last.add_child(node)
      # Elements whose name has a truthy VOID_TAGS entry never become the
      # current insertion point, so they cannot receive children.
      open_nodes << node unless VOID_TAGS[node.name]
    when :end
      closing = payload[0]
      depth = open_nodes.rindex { |open| open.is_a?(Element) && open.name == closing }
      # Close the innermost matching element together with everything opened
      # after it. An end tag with no match is ignored, and index 0 (the
      # synthetic wrapper) is never removed.
      open_nodes.slice!(depth..) if depth&.positive?
    when :text
      open_nodes.last.add_child(Text.new(payload[0]))
    when :comment
      open_nodes.last.add_child(Comment.new(payload[0]))
    end
  end

  # Detach the results from the throwaway wrapper before handing them back.
  root.children.tap do |top_level|
    top_level.each { |child| child.parent = nil }
  end
end
.parse(html) ⇒ Object
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 |
# File 'lib/scrapetor/dom/parser.rb', line 9
#
# Build a Dom::Document from raw HTML by replaying the events emitted by
# the SAX tokenizer.
#
# A stack of currently open nodes (starting with the document itself)
# tracks where the next child is attached.
#
# @param html raw HTML handed straight to Scrapetor::SAX::Tokenizer
#   (presumably a String -- confirm against the tokenizer)
# @return the populated Dom::Document
def self.parse(html)
  document = Dom::Document.new
  open_nodes = [document]

  Scrapetor::SAX::Tokenizer.new(html).each_event do |event|
    kind, *payload = event

    case kind
    when :doc_start, :doc_end
      # Nothing to do for document boundary events.
    when :doctype
      document.doctype = payload[0]
    when :start
      tag_name, attributes = payload
      node = Element.new(tag_name, attributes || {})
      open_nodes.last.add_child(node)
      # Elements whose name has a truthy VOID_TAGS entry never become the
      # current insertion point, so they cannot receive children.
      open_nodes << node unless VOID_TAGS[node.name]
    when :end
      closing = payload[0]
      depth = open_nodes.rindex { |open| open.is_a?(Element) && open.name == closing }
      # Close the innermost matching element together with everything opened
      # after it; an end tag with no matching open element is ignored.
      open_nodes.slice!(depth..) if depth
    when :text
      open_nodes.last.add_child(Text.new(payload[0]))
    when :comment
      open_nodes.last.add_child(Comment.new(payload[0]))
    end
  end

  document
end