Module: Scrapetor::PageType
- Defined in:
- lib/scrapetor/page_type.rb
Overview
Heuristic page-type detection.
Returns one of:
:product_page, :product_listing, :article, :search_results,
:forum_thread, :profile, :documentation, :unknown
The heuristic prefers strong signals (JSON-LD @type, OpenGraph og:type) and falls back to structural heuristics (repeated card patterns, byline + body, search bar + result list).
Constant Summary collapse
- PRODUCT_OG_TYPES =
%w[product product.item og:product].freeze
- ARTICLE_OG_TYPES =
%w[article news.article].freeze
- PROFILE_OG_TYPES =
%w[profile person og:profile].freeze
Class Method Summary collapse
- .detect(doc) ⇒ Object
-
.from_opengraph(doc) ⇒ Object
OpenGraph signals: infers the page type from the OpenGraph "type" value.
-
.from_structure(doc) ⇒ Object
Structural fallback: infers the page type from DOM patterns matched with CSS selectors.
-
.from_structured_data(doc) ⇒ Object
Strong signals: infers the page type from JSON-LD @type values.
Class Method Details
.detect(doc) ⇒ Object
18 19 20 21 22 23 |
# File 'lib/scrapetor/page_type.rb', line 18
#
# Classifies a document by trying each detector in order of signal strength
# and returning the first non-nil verdict.
#
# @param doc [Object] the document handed unchanged to every detector
# @return [Symbol] the detected page type, or :unknown when no detector matches
def self.detect(doc)
  # `||=` short-circuits, so a weaker detector only runs when every stronger
  # one came back empty.
  verdict = from_structured_data(doc)
  verdict ||= from_opengraph(doc)
  verdict ||= from_structure(doc)
  verdict || :unknown
end
.from_opengraph(doc) ⇒ Object
OpenGraph signals: infers the page type from the OpenGraph "type" value.
43 44 45 46 47 48 49 50 |
# File 'lib/scrapetor/page_type.rb', line 43
#
# Maps the document's OpenGraph "type" value onto a page type.
#
# The value is lower-cased and matched by substring against each list of
# known type names; product is tried first, then article, then profile.
#
# @param doc [#opengraph] document whose +opengraph+ result is indexed by "type"
# @return [Symbol, nil] :product_page, :article or :profile; nil when nothing matches
def self.from_opengraph(doc)
  og_type = (doc.opengraph["type"] || "").to_s.downcase

  # Hash literals keep insertion order, which preserves the match priority.
  {
    product_page: PRODUCT_OG_TYPES,
    article: ARTICLE_OG_TYPES,
    profile: PROFILE_OG_TYPES
  }.each do |page_type, needles|
    return page_type if needles.any? { |needle| og_type.include?(needle) }
  end

  nil
end
.from_structure(doc) ⇒ Object
Structural fallback: infers the page type from DOM patterns matched with CSS selectors.
54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 |
# File 'lib/scrapetor/page_type.rb', line 54
#
# Structural fallback: guesses the page type from DOM patterns.
#
# Checks run in a fixed order and the first match wins: search results,
# product listing, article, profile, forum thread, documentation.
#
# @param doc [#css] document queried with CSS selectors; every query result
#   is used through +any?+, +size+, +to_a+ or +first+
# @return [Symbol, nil] the detected page type, or nil when no pattern matches
def self.from_structure(doc)
  # Search results: a search input, a search form, or any element whose class
  # contains "search-result". One match of any of these is enough.
  # NOTE(review): a bare search box therefore classifies the page as search
  # results before any later check runs - confirm this is intended.
  search_hits = doc.css('input[type="search"], form[role="search"], [class*="search-result"]')
  return :search_results if search_hits.any?

  # Repeated cards = listing. Each selector is queried separately and the
  # results are de-duplicated so a node matching several selectors counts once.
  card_selectors = [
    ".product-card", ".product-tile", ".product-item", ".listing-item",
    '[class*="product-grid"]', '[class*="card"]', '[class*="tile"]'
  ]
  grid_candidates = card_selectors.flat_map { |sel| doc.css(sel).to_a }.uniq
  return :product_listing if grid_candidates.size >= 6

  # Article: an <article> element whose text has at least 200
  # whitespace-separated words, or any byline/author marker in the document.
  # NOTE(review): the original comment said "a byline AND a long body" while
  # the condition uses ||; the operator is kept as written - confirm intent.
  articles = doc.css("article")
  if articles.any?
    word_count = articles.first.text.to_s.scan(/\S+/).size
    has_byline = doc.css(".byline, .author, [rel='author'], [itemprop='author']").any?
    return :article if word_count >= 200 || has_byline
  end

  # Profile: an avatar/profile-header element together with a bio/about element.
  if doc.css('[class*="avatar"], [class*="profile-header"]').any? &&
     doc.css('[class*="bio"], [class*="about"]').any?
    return :profile
  end

  # Forum thread: at least two thread/topic/post-message elements.
  return :forum_thread if doc.css('.thread, .topic, [class*="post-message"]').size >= 2

  # Documentation: at least three code blocks and at least three headings.
  return :documentation if doc.css("pre code").size >= 3 && doc.css("h1, h2, h3").size >= 3

  nil
end
.from_structured_data(doc) ⇒ Object
Strong signals: infers the page type from JSON-LD @type values.
27 28 29 30 31 32 33 34 35 36 37 38 39 |
# File 'lib/scrapetor/page_type.rb', line 27
#
# Strong signals: classifies the page from the "@type" values of its JSON-LD
# items.
#
# Only Hash items contribute; "@type" may be a single value or an array.
# Type names are compared after stripping a schema.org IRI prefix
# ("http://schema.org/", "https://schema.org/") or the "schema:" compact
# prefix, so "https://schema.org/Product" is treated like "Product".
#
# @param doc [#json_ld] document whose +json_ld+ result is enumerated with +flat_map+
# @return [Symbol, nil] the detected page type, or nil when no known type is present
def self.from_structured_data(doc)
  raw_types = doc.json_ld.flat_map do |item|
    item.is_a?(Hash) ? Array(item["@type"]) : []
  end
  types = raw_types.compact.map do |type|
    type.to_s.sub(%r{\A(?:https?://schema\.org/|schema:)}, "")
  end
  return nil if types.empty?

  # An ItemList only counts as a product listing when it comes with product
  # data; this is checked before the single-product rule so a listing is not
  # reported as a product page.
  return :product_listing if types.include?("ItemList") &&
                             (types.include?("Product") || types.include?("Offer"))
  return :product_page    if types.include?("Product")
  return :article         if (types & %w[NewsArticle Article BlogPosting]).any?
  return :search_results  if types.include?("SearchResultsPage")
  return :profile         if (types & %w[Person ProfilePage]).any?
  return :forum_thread    if types.include?("DiscussionForumPosting")
  return :documentation   if types.include?("TechArticle")

  nil
end