Module: Scrapetor::Microdata
- Defined in:
- lib/scrapetor/microdata.rb
Overview
Microdata extractor (HTML5 itemscope / itemprop / itemtype).
Walks the DOM looking for itemscope elements and emits a nested hash structure of items + properties. The format mirrors what schema.org/docs/datamodel.html describes:
{
  "type" => "https://schema.org/Product",  # from itemtype
  "id" => "...",                            # from itemid
  "properties" => {
    "name" => "Widget",
    "price" => "19.99",
    "offer" => { "type" => "https://schema.org/Offer", ... }
  }
}
Class Method Summary
- .build_item(node) ⇒ Object
- .descendant_of_nested_itemscope?(el, scope) ⇒ Boolean
- .extract(doc) ⇒ Object
- .gather_props(scope, props) ⇒ Object
- .has_itemscope_ancestor?(node) ⇒ Boolean
- .property_value(el) ⇒ Object
Class Method Details
.build_item(node) ⇒ Object
39 40 41 42 43 44 45 46 47 |
# File 'lib/scrapetor/microdata.rb', line 39
#
# Builds the hash representation of a single microdata item.
#
# The result always has a "properties" key; "type" and "id" are only
# present when the node's "itemtype" / "itemid" lookups return a truthy
# value. Keys are inserted in the order "type", "id", "properties".
#
# @param node [#[], #css] the element carrying the item; attributes are
#   read via +node[...]+ and the node is handed to +gather_props+
# @return [Hash] the item hash
def self.build_item(node)
  item_type = node["itemtype"]
  item_id = node["itemid"]

  # gather_props fills the hash it is given in place.
  properties = {}
  gather_props(node, properties)

  result = {}
  result["type"] = item_type if item_type
  result["id"] = item_id if item_id
  result["properties"] = properties
  result
end
.descendant_of_nested_itemscope?(el, scope) ⇒ Boolean
69 70 71 72 73 74 75 76 |
# File 'lib/scrapetor/microdata.rb', line 69
#
# Tells whether +el+ sits inside another itemscope element that lies
# strictly between +el+ and +scope+.
#
# The walk starts at +el+'s parent and stops as soon as it reaches +scope+
# (compared with +!=+) or runs out of parents. +el+ itself is never
# inspected, so an element that is both a property and a nested item is
# still reported as belonging to +scope+.
#
# @param el [Object] the candidate property element
# @param scope [Object] the item element whose properties are being gathered
# @return [Boolean] true when an intermediate ancestor has a truthy
#   "itemscope" attribute lookup, false otherwise
def self.descendant_of_nested_itemscope?(el, scope)
  # Guard the first hop the same way every later hop is guarded, so an
  # object without #parent ends the walk instead of raising NoMethodError.
  cur = el.respond_to?(:parent) ? el.parent : nil
  while cur && cur != scope
    # Some ancestors may not support attribute lookup; skip over those.
    return true if cur.respond_to?(:[]) && cur["itemscope"]

    cur = cur.respond_to?(:parent) ? cur.parent : nil
  end
  false
end
.extract(doc) ⇒ Object
20 21 22 23 24 25 26 27 28 |
# File 'lib/scrapetor/microdata.rb', line 20
#
# Extracts every top-level microdata item from a document.
#
# An itemscope element is skipped only when it is a named property of an
# enclosing item, i.e. it has a non-blank "itemprop" attribute AND an
# itemscope ancestor; such items are emitted as property values of their
# parent instead. A nested itemscope WITHOUT an itemprop is not reachable
# through any parent's properties, so it is emitted as its own item here
# (the HTML microdata model treats an item without itemprop as top-level).
# Skipping it, as this method used to, silently dropped the item and all
# of its properties.
#
# @param doc [#css] document (or node) whose +css+ method returns the
#   elements matching a CSS selector
# @return [Array<Hash>] item hashes as produced by +build_item+, in the
#   order the elements were returned by +doc.css+
def self.extract(doc)
  items = []
  doc.css("[itemscope]").each do |node|
    named_property = !node["itemprop"].to_s.strip.empty?
    # Named nested items are reached via the parent's properties.
    next if named_property && has_itemscope_ancestor?(node)

    items << build_item(node)
  end
  items
end
.gather_props(scope, props) ⇒ Object
49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 |
# File 'lib/scrapetor/microdata.rb', line 49
#
# Collects the properties that belong directly to +scope+ into +props+.
#
# Every element under +scope+ matching "[itemprop]" is considered, except
# those that sit inside a nested itemscope (they belong to the nested
# item). The "itemprop" attribute is split on whitespace into names; the
# element's value is stored under each name. When a name is seen again on
# a later element, the stored value is promoted to an Array and the new
# value appended.
#
# Names are de-duplicated per element: a repeated token such as
# itemprop="name name" used to record the same value twice and turn a
# single value into a two-element array.
#
# @param scope [#css] the item element whose properties are gathered
# @param props [Hash] accumulator, mutated in place (name => value or
#   name => Array of values)
# @return [Object] whatever +each+ on the +scope.css+ result returns;
#   callers in this file use only the mutation of +props+
def self.gather_props(scope, props)
  scope.css("[itemprop]").each do |el|
    # Only direct descendants in microdata terms: an itemprop on a
    # descendant of a nested itemscope belongs to the nested item.
    next if descendant_of_nested_itemscope?(el, scope)

    names = (el["itemprop"] || "").split(/\s+/).reject(&:empty?).uniq
    next if names.empty?

    value = property_value(el)
    names.each do |name|
      if props.key?(name)
        # Second occurrence of a name: switch to a list of values.
        props[name] = [props[name]] unless props[name].is_a?(Array)
        props[name] << value
      else
        props[name] = value
      end
    end
  end
end
.has_itemscope_ancestor?(node) ⇒ Boolean
30 31 32 33 34 35 36 37 |
# File 'lib/scrapetor/microdata.rb', line 30
#
# Tells whether any ancestor of +node+ is an itemscope element.
#
# The walk follows +parent+ links until one is nil or an object that does
# not respond to +parent+. +node+ itself is not inspected.
#
# @param node [Object] the element to test
# @return [Boolean] true when some ancestor has a truthy "itemscope"
#   attribute lookup, false otherwise
def self.has_itemscope_ancestor?(node)
  # Guard the first hop the same way every later hop is guarded, so an
  # object without #parent ends the walk instead of raising NoMethodError.
  ancestor = node.respond_to?(:parent) ? node.parent : nil
  while ancestor
    # Some ancestors may not support attribute lookup; skip over those.
    return true if ancestor.respond_to?(:[]) && ancestor["itemscope"]

    ancestor = ancestor.respond_to?(:parent) ? ancestor.parent : nil
  end
  false
end
.property_value(el) ⇒ Object
78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 |
# File 'lib/scrapetor/microdata.rb', line 78
#
# Computes the value of a property element.
#
# * An element with a truthy "itemscope" lookup yields a nested item hash
#   (via +build_item+).
# * Otherwise the lower-cased tag name selects the source of the value:
#   "content" for meta; "src" for audio/embed/iframe/img/source/track/video;
#   "href" for a/area/link; "data" for object; "value" (falling back to the
#   element text) for data/meter; "datetime" (falling back to the element
#   text) for time.
# * Any other element yields its text with whitespace runs collapsed to a
#   single space and the ends stripped.
#
# Attribute-only branches return whatever the attribute lookup returns,
# which may be nil when the attribute is absent.
#
# @param el [#[], #text] the property element; +name+ is used when available
# @return [Hash, String, nil] nested item, attribute value, or text
def self.property_value(el)
  return build_item(el) if el["itemscope"]

  # Elements that cannot report a tag name fall through to the text branch.
  tag_name = el.respond_to?(:name) ? el.name.to_s.downcase : ""

  case tag_name
  when "meta"
    el["content"]
  when "audio", "embed", "iframe", "img", "source", "track", "video"
    el["src"]
  when "a", "area", "link"
    el["href"]
  when "object"
    el["data"]
  when "data", "meter"
    el["value"] || el.text
  when "time"
    el["datetime"] || el.text
  else
    collapsed = el.text.to_s.gsub(/\s+/, " ")
    collapsed.strip
  end
end