Module: Scrapetor::StructuredData

Defined in:
lib/scrapetor/structured_data.rb

Overview

Extract structured-data signals every SEO/RAG pipeline needs: JSON-LD, OpenGraph, Twitter Cards, Schema.org microdata.

These are deterministic and fast — no DOM walk beyond `doc.css(…)`, which is delegated to the backing tokenizer.

Constant Summary collapse

JSON_LD_SELECTOR =
'script[type="application/ld+json"]'.freeze

Class Method Summary collapse

Class Method Details

.collect_meta(doc, prefix:) ⇒ Object



58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# File 'lib/scrapetor/structured_data.rb', line 58

# Collects the `content` of every `<meta>` element whose key starts with
# +prefix+, keyed by the remainder of the key after the prefix.
#
# The key is looked up in the `property` attribute (the OpenGraph
# convention) and in the `name` attribute (the Twitter Cards convention).
# Each attribute is tested against the prefix independently, `property`
# first, so a tag carrying both — for example
# `<meta property="og:image" name="twitter:image" content="...">` — is
# found for either prefix, and a non-matching `property` does not hide a
# matching `name`.
#
# @param doc [#css] document; `doc.css("meta")` must yield elements that
#   respond to `attr(name)` and return the attribute value or nil
# @param prefix [String] key prefix to match and strip, e.g. "og:"
# @return [Hash] prefix-stripped key => `content` value; when a key occurs
#   more than once, the first occurrence in document order wins
def self.collect_meta(doc, prefix:)
  found = {}
  doc.css("meta").each do |meta|
    candidates = [meta.attr("property"), meta.attr("name")]
    key = candidates.find { |candidate| candidate&.start_with?(prefix) }
    next if key.nil?

    content = meta.attr("content")
    next if content.nil?

    short_key = key[prefix.length..]
    # Keep the first value seen; later duplicates are ignored.
    found[short_key] = content unless found.key?(short_key)
  end
  found
end

.json_ld(doc) ⇒ Object



14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# File 'lib/scrapetor/structured_data.rb', line 14

# Gathers every JSON-LD item embedded in the document.
#
# Each element matched by JSON_LD_SELECTOR has its text parsed as JSON.
# Elements whose text is nil or blank, or that fail to parse, are skipped.
# The parsed value contributes to the result as follows:
# * a top-level Array contributes each of its elements;
# * a Hash whose "@graph" value is an Array contributes the graph members
#   (the wrapping Hash itself is not included);
# * any other value is appended unchanged.
#
# @param doc [#css] document queried with JSON_LD_SELECTOR
# @return [Array] parsed items, in document order
def self.json_ld(doc)
  items = []
  doc.css(JSON_LD_SELECTOR).each do |node|
    raw = node.text
    next if raw.nil?
    next if raw.strip.empty?

    begin
      document = JSON.parse(raw)
    rescue JSON::ParserError
      # Malformed JSON: ignore this script element and keep going.
      next
    end

    batch =
      if document.is_a?(Array)
        document
      elsif document.is_a?(Hash) && document["@graph"].is_a?(Array)
        document["@graph"]
      else
        [document]
      end
    items.concat(batch)
  end
  items
end

.opengraph(doc) ⇒ Object



35
36
37
# File 'lib/scrapetor/structured_data.rb', line 35

# Returns the document's OpenGraph metadata.
#
# Delegates to collect_meta with the "og:" prefix, so the result is keyed
# by the part of each meta key that follows "og:".
#
# @param doc [#css] document passed through to collect_meta
# @return [Hash] whatever collect_meta returns for the "og:" prefix
def self.opengraph(doc)
  og_prefix = "og:"
  collect_meta(doc, prefix: og_prefix)
end

.schema_org(doc, type: nil) ⇒ Object



43
44
45
46
47
48
49
50
51
52
53
54
55
56
# File 'lib/scrapetor/structured_data.rb', line 43

# Returns the document's JSON-LD items, optionally restricted to one
# "@type".
#
# With no +type+, the full json_ld(doc) list is returned untouched.
# Otherwise only Hash items are considered, and an item is kept when its
# "@type" is a String equal to +type+ or an Array that includes it.
#
# @param doc [#css] document passed through to json_ld
# @param type [#to_s, nil] type name to keep (compared as a String), or
#   nil for no filtering
# @return [Array] the matching items, in their original order
def self.schema_org(doc, type: nil)
  entries = json_ld(doc)
  return entries if type.nil?

  wanted = type.to_s
  entries.select do |entry|
    # Non-Hash items cannot carry an "@type".
    next false unless entry.is_a?(Hash)

    declared = entry["@type"]
    if declared.is_a?(String)
      declared == wanted
    elsif declared.is_a?(Array)
      declared.include?(wanted)
    else
      false
    end
  end
end

.twitter_card(doc) ⇒ Object



39
40
41
# File 'lib/scrapetor/structured_data.rb', line 39

# Returns the document's Twitter Card metadata.
#
# Delegates to collect_meta with the "twitter:" prefix, so the result is
# keyed by the part of each meta key that follows "twitter:".
#
# @param doc [#css] document passed through to collect_meta
# @return [Hash] whatever collect_meta returns for the "twitter:" prefix
def self.twitter_card(doc)
  card_prefix = "twitter:"
  collect_meta(doc, prefix: card_prefix)
end