Class: Coelacanth::Extractor::HeuristicProbe

Inherits:
Object
  • Object
show all
Defined in:
lib/coelacanth/extractor/heuristic_probe.rb

Overview

Scores DOM nodes based on simple heuristics to locate the primary article body.

Defined Under Namespace

Classes: Result

Constant Summary collapse

# CSS selector naming the element types passed to +doc.css+ when collecting
# candidate nodes.
BLOCK_SELECTOR = "article, main, section, div".freeze

# Numeric weight per tag name. Built on a Hash with default 0, so any tag
# name not listed here looks up as 0 rather than nil.
TAG_WEIGHTS = Hash.new(0).update(
  "article" => 80,
  "main" => 60,
  "section" => 30,
  "div" => 10
).freeze

# Tokens treated as negative signals.
# NOTE(review): presumably matched against class/id attributes by the scoring
# helper — confirm there; the usage is not visible from this listing.
NEGATIVE_TOKENS = %w[nav footer header sidebar related share menu].freeze

# Tokens treated as positive signals (same caveat as NEGATIVE_TOKENS).
POSITIVE_TOKENS = %w[content article body post entry text].freeze

Instance Method Summary collapse

Instance Method Details

#call(doc:, url: nil) ⇒ Object



31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# File 'lib/coelacanth/extractor/heuristic_probe.rb', line 31

# Selects the highest-scoring candidate block in +doc+ and wraps it in a
# Result tagged +:heuristic+.
#
# @param doc [#css] parsed document; queried with BLOCK_SELECTOR and handed
#   to the metadata helpers
# @param url [Object, nil] accepted for interface compatibility; not read by
#   this method
# @return [Result, nil] nil when no node produces a candidate, or when the
#   best candidate scores below +minimum_score+
def call(doc:, url: nil)
  scored = doc.css(BLOCK_SELECTOR).map { |node| score_candidate(node) }.compact
  top = scored.max_by { |entry| entry[:score] }

  # max_by yields nil for an empty list, so one guard covers both the
  # "no candidates" and the "best candidate too weak" exits.
  return if top.nil? || top[:score] < minimum_score

  Result.new(
    title: title_from_meta(doc),
    node: expand(top[:node]),
    published_at: published_at_from_meta(doc),
    # NOTE(review): this passes the whole document as the byline. It looks
    # like a helper name was lost in front of "(doc)" (compare the sibling
    # *_from_meta calls) — confirm against the real source file.
    byline: (doc),
    source_tag: :heuristic,
    confidence: confidence(top[:score])
  )
end