Module: Scrapetor::Microdata

Defined in:
lib/scrapetor/microdata.rb

Overview

Microdata extractor (HTML5 itemscope / itemprop / itemtype).

Walks the DOM looking for itemscope elements and emits a nested hash structure of items + properties. The format mirrors what schema.org/docs/datamodel.html describes:

{
  "type"       => "https://schema.org/Product",  # from itemtype
  "id"         => "...",                          # from itemid
  "properties" => {
    "name"  => "Widget",
    "price" => "19.99",
    "offer" => { "type" => "https://schema.org/Offer", ... }
  }
}

Class Method Summary collapse

Class Method Details

.build_item(node) ⇒ Object



39
40
41
42
43
44
45
46
47
# File 'lib/scrapetor/microdata.rb', line 39

# Builds the hash representation of a single microdata item.
#
# The result always has a "properties" key (filled in by gather_props);
# "type" and "id" are present only when the node carries a truthy
# itemtype / itemid attribute.
#
# @param node [#[]] element carrying the item; also handed to gather_props
# @return [Hash] item hash with keys in the order "type", "id", "properties"
def self.build_item(node)
  result = {}

  # Copy the optional identifying attributes, skipping absent ones.
  { "type" => "itemtype", "id" => "itemid" }.each do |key, attribute|
    result[key] = node[attribute] if node[attribute]
  end

  collected = {}
  gather_props(node, collected)
  result["properties"] = collected
  result
end

.descendant_of_nested_itemscope?(el, scope) ⇒ Boolean

Returns:

  • (Boolean)


69
70
71
72
73
74
75
76
# File 'lib/scrapetor/microdata.rb', line 69

# Tells whether an itemscope element sits strictly between +el+ and +scope+.
#
# Climbs from el's parent towards the root and stops as soon as +scope+ is
# reached or the chain runs out; +el+ itself and +scope+ itself are never
# inspected.
#
# @param el [#parent] element whose ancestry is examined
# @param scope [Object] the enclosing item element that bounds the climb
# @return [Boolean] true when an intermediate ancestor has a truthy
#   "itemscope" attribute
def self.descendant_of_nested_itemscope?(el, scope)
  walker = el.parent
  loop do
    return false unless walker && walker != scope
    return true if walker.respond_to?(:[]) && walker["itemscope"]
    # A node without #parent ends the chain.
    return false unless walker.respond_to?(:parent)

    walker = walker.parent
  end
end

.extract(doc) ⇒ Object



20
21
22
23
24
25
26
27
28
# File 'lib/scrapetor/microdata.rb', line 20

# Extracts every top-level microdata item from a document.
#
# An itemscope element is skipped only when it is a *property* of an
# enclosing item, i.e. it has an itemscope ancestor AND a non-blank
# itemprop attribute; such items are emitted through the parent's
# properties by gather_props. A nested itemscope element without itemprop
# is not reachable that way, so it is emitted here as a top-level item
# (matching the microdata definition of a top-level item) instead of being
# silently dropped.
#
# @param doc [#css] document or fragment queried with the "[itemscope]" selector
# @return [Array<Hash>] one hash per top-level item, as produced by build_item
def self.extract(doc)
  items = []
  doc.css("[itemscope]").each do |node|
    # gather_props ignores itemprop values with no tokens, so a blank
    # itemprop does not make the node a property of its ancestor.
    names_a_property = !node["itemprop"].to_s.strip.empty?
    next if names_a_property && has_itemscope_ancestor?(node)

    items << build_item(node)
  end
  items
end

.gather_props(scope, props) ⇒ Object



49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# File 'lib/scrapetor/microdata.rb', line 49

# Collects the properties of the item rooted at +scope+ into +props+.
#
# Every [itemprop] element found under +scope+ contributes its value
# (see property_value) under each name listed in its itemprop attribute.
# A name seen once maps to the bare value; a name seen on several elements
# maps to an Array of the values in encounter order.
#
# @param scope [#css] element carrying the item
# @param props [Hash] accumulator, mutated in place
# @return [Object] whatever the iteration over scope.css returns; callers
#   in this file only use the mutated +props+
def self.gather_props(scope, props)
  scope.css("[itemprop]").each do |el|
    # Only direct descendants in microdata terms: an itemprop on a
    # descendant of a nested itemscope belongs to the nested item.
    next if descendant_of_nested_itemscope?(el, scope)

    # Property names are the whitespace-separated tokens with duplicates
    # removed (first occurrence wins); without `uniq`, itemprop="a a"
    # would record the same element's value twice.
    names = (el["itemprop"] || "").split(/\s+/).reject(&:empty?).uniq
    next if names.empty?

    value = property_value(el)
    names.each do |name|
      if props.key?(name)
        # Second and later occurrences promote the entry to an Array.
        props[name] = [props[name]] unless props[name].is_a?(Array)
        props[name] << value
      else
        props[name] = value
      end
    end
  end
end

.has_itemscope_ancestor?(node) ⇒ Boolean

Returns:

  • (Boolean)


30
31
32
33
34
35
36
37
# File 'lib/scrapetor/microdata.rb', line 30

# Tells whether any ancestor of +node+ carries a truthy "itemscope" attribute.
#
# The node itself is not inspected. The climb ends when a parent is falsy
# or when an ancestor does not respond to #parent.
#
# @param node [#parent] element whose ancestry is examined
# @return [Boolean]
def self.has_itemscope_ancestor?(node)
  candidate = node.parent
  loop do
    return false unless candidate
    return true if candidate.respond_to?(:[]) && candidate["itemscope"]
    # Objects without #parent terminate the chain.
    return false unless candidate.respond_to?(:parent)

    candidate = candidate.parent
  end
end

.property_value(el) ⇒ Object



78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# File 'lib/scrapetor/microdata.rb', line 78

# Computes the value of a single itemprop element.
#
# * An element with a truthy "itemscope" attribute yields a nested item
#   (see build_item).
# * Otherwise the lower-cased tag name picks the attribute that carries the
#   value: content (meta), src (media/embed tags), href (a, area, link),
#   data (object), value-or-text (data, meter), datetime-or-text (time).
# * Any other element yields its text with whitespace runs collapsed to a
#   single space and the ends stripped.
#
# @param el [#[], #text] the property element; #name is used when available
# @return [Hash, String, nil] nested item hash, or the attribute/text value
#   (nil when the selected attribute is absent and there is no text fallback)
def self.property_value(el)
  return build_item(el) if el["itemscope"]

  tag_name = el.respond_to?(:name) ? el.name.to_s.downcase : ""

  if tag_name == "meta"
    el["content"]
  elsif %w[audio embed iframe img source track video].include?(tag_name)
    el["src"]
  elsif %w[a area link].include?(tag_name)
    el["href"]
  elsif tag_name == "object"
    el["data"]
  elsif %w[data meter].include?(tag_name)
    # Fallback text is returned as-is, without whitespace normalisation.
    el["value"] || el.text
  elsif tag_name == "time"
    el["datetime"] || el.text
  else
    el.text.to_s.gsub(/\s+/, " ").strip
  end
end