Module: Scrapetor::PageType

Defined in:
lib/scrapetor/page_type.rb

Overview

Heuristic page-type detection.

Returns one of:

:product_page, :product_listing, :article, :search_results,
:forum_thread, :profile, :documentation, :unknown

The heuristic prefers strong signals (JSON-LD @type, OpenGraph og:type) and falls back to structural heuristics (repeated card patterns, byline + body, search bar + result list).

Constant Summary collapse

PRODUCT_OG_TYPES =
%w[product product.item og:product].freeze
ARTICLE_OG_TYPES =
%w[article news.article].freeze
PROFILE_OG_TYPES =
%w[profile person og:profile].freeze

Class Method Summary collapse

Class Method Details

.detect(doc) ⇒ Object



18
19
20
21
22
23
# File 'lib/scrapetor/page_type.rb', line 18

# Classifies a document by trying each detector in order of signal strength
# and returning the first truthy answer.
#
# Order: JSON-LD structured data, then OpenGraph metadata, then structural
# heuristics. A detector that returns nil (or false) hands over to the next.
#
# @param doc the parsed document; it is passed unchanged to each detector
# @return [Symbol] the detected page type, or :unknown when no detector matched
def self.detect(doc)
  page_type = from_structured_data(doc)
  page_type ||= from_opengraph(doc)
  page_type ||= from_structure(doc)
  page_type || :unknown
end

.from_opengraph(doc) ⇒ Object

—– OpenGraph signals —–



43
44
45
46
47
48
49
50
# File 'lib/scrapetor/page_type.rb', line 43

# Maps the document's OpenGraph "type" value onto a page type.
#
# The value is stringified and lower-cased, then compared by substring
# against each constant list in a fixed order: product, article, profile.
# The first list with a matching entry decides the result.
#
# @param doc an object whose #opengraph result can be indexed with "type"
# @return [Symbol, nil] :product_page, :article or :profile; nil if nothing matched
def self.from_opengraph(doc)
  og_type = (doc.opengraph["type"] || "").to_s.downcase

  # Hash literals preserve insertion order, so products are checked first.
  candidates = {
    product_page: PRODUCT_OG_TYPES,
    article: ARTICLE_OG_TYPES,
    profile: PROFILE_OG_TYPES
  }

  candidates.each do |page_type, needles|
    return page_type if needles.any? { |needle| og_type.include?(needle) }
  end

  nil
end

.from_structure(doc) ⇒ Object

—– structural fallback —–



54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# File 'lib/scrapetor/page_type.rb', line 54

# Structural fallback: guesses the page type from CSS-selectable markup.
#
# Checks run in a fixed order and the first hit wins: search results,
# product listing, article, profile, forum thread, documentation.
#
# @param doc an object responding to #css(selector) with a collection;
#   elements matched by "article" must respond to #text
# @return [Symbol, nil] the guessed page type, or nil when no rule fired
def self.from_structure(doc)
  # Search results: any one of a search input, a search form, or an element
  # whose class contains "search-result" is enough.
  # NOTE(review): this fires on any page that merely has a search box, and
  # it runs before every other rule — confirm that a result list should not
  # also be required.
  if doc.css('input[type="search"], form[role="search"], [class*="search-result"]').any?
    return :search_results
  end

  # Repeated cards = listing. Each selector is queried separately, so a node
  # matching several of them is de-duplicated before counting.
  grid_candidates = %w[
    .product-card .product-tile .product-item .listing-item
    [class*="product-grid"] [class*="card"] [class*="tile"]
  ].flat_map { |sel| doc.css(sel).to_a }.uniq
  return :product_listing if grid_candidates.size >= 6

  # Article: an <article> element exists and EITHER its first instance holds
  # at least 200 whitespace-separated words OR the document has a byline.
  # NOTE(review): the condition is a disjunction; confirm whether both
  # signals were meant to be required.
  articles = doc.css("article")
  if articles.any?
    text = articles.first.text.to_s
    word_count = text.scan(/\S+/).size
    has_byline = doc.css(".byline, .author, [rel='author'], [itemprop='author']").any?
    return :article if word_count >= 200 || has_byline
  end

  # Profile: an avatar or profile header together with a bio/about section
  if doc.css('[class*="avatar"], [class*="profile-header"]').any? &&
     doc.css('[class*="bio"], [class*="about"]').any?
    return :profile
  end

  # Forum thread: at least two thread/topic/post-message elements
  if doc.css('.thread, .topic, [class*="post-message"]').size >= 2
    return :forum_thread
  end

  # Documentation: at least three code blocks and at least three headings
  if doc.css("pre code").size >= 3 && doc.css("h1, h2, h3").size >= 3
    return :documentation
  end

  nil
end

.from_structured_data(doc) ⇒ Object

—– strong signals: JSON-LD —–



27
28
29
30
31
32
33
34
35
36
37
38
39
# File 'lib/scrapetor/page_type.rb', line 27

# Derives a page type from the "@type" values of the document's JSON-LD items.
#
# Only Hash items are inspected. Each "@type" may be a single value or a
# list; values are stringified and nil entries are dropped. The rules below
# are evaluated top to bottom and the first match decides the result.
#
# @param doc an object whose #json_ld returns an enumerable of parsed items
# @return [Symbol, nil] the detected page type; nil when there are no types
#   or none of them is recognised
def self.from_structured_data(doc)
  types = doc.json_ld.each_with_object([]) do |item, collected|
    next unless item.is_a?(Hash)

    Array(item["@type"]).each do |type|
      collected << type.to_s unless type.nil?
    end
  end
  return nil if types.empty?

  declares = ->(*names) { (types & names).any? }

  if declares.call("ItemList") && declares.call("Product", "Offer")
    :product_listing
  elsif declares.call("Product")
    :product_page
  elsif declares.call("NewsArticle", "Article", "BlogPosting")
    :article
  elsif declares.call("SearchResultsPage")
    :search_results
  elsif declares.call("Person", "ProfilePage")
    :profile
  elsif declares.call("DiscussionForumPosting")
    :forum_thread
  elsif declares.call("TechArticle")
    :documentation
  end
end