Class: Coelacanth::Extractor::WeakMlProbe

Inherits:
Object
  • Object
show all
Defined in:
lib/coelacanth/extractor/weak_ml_probe.rb

Overview

Lightweight probabilistic scorer that emulates a learned classifier using heuristics.

Defined Under Namespace

Classes: Result

Constant Summary collapse

# CSS selector listing the element types that #call scores as candidate blocks.
BLOCK_SELECTOR = %w[article main section div].join(", ").freeze
# Weight contributed by each known token. Positive weights are listed first,
# negative ones second; insertion order is kept as-is.
# NOTE(review): presumably matched against tokens from a node's class/id
# attributes -- the consumer is not visible here, confirm against #evaluate.
TOKEN_WEIGHTS = {
  # Tokens with a positive weight.
  "content" => 1.1, "article" => 1.0, "body" => 0.9, "post" => 0.8,
  "entry" => 0.75, "text" => 0.6, "story" => 0.6, "blog" => 0.5
}.merge(
  # Tokens with a negative weight.
  "share" => -1.0, "nav" => -1.3, "footer" => -1.2, "header" => -1.1,
  "related" => -0.8
).freeze
# Coefficient for each named feature. A negative coefficient means a larger
# feature value lowers the result.
# NOTE(review): presumably combined as a weighted sum and squashed into a
# probability by the scorer -- that code is not visible here, confirm.
FEATURE_WEIGHTS = {
  bias:                -1.2,
  text_length:         0.002,
  link_density:        -2.6,
  punctuation_density: 1.8,
  depth:               -0.12,
  token_score:         1.6
}.freeze

Instance Method Summary collapse

Instance Method Details

#call(doc:, url: nil) ⇒ Object



47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# File 'lib/coelacanth/extractor/weak_ml_probe.rb', line 47

# Scores every node matched by BLOCK_SELECTOR and wraps the highest-scoring
# one in a Result.
#
# @param doc [#css] document to probe; only needs to respond to +css+
#   (presumably a Nokogiri document -- confirm against callers)
# @param url [String, nil] accepted but not read by this method
# @return [Result, nil] nil when no node yields a candidate or when the best
#   candidate's probability is below 0.45
def call(doc:, url: nil)
  scored = doc.css(BLOCK_SELECTOR).map { |block| evaluate(block) }
  scored.compact! # evaluate may return nil for a node; drop those entries

  winner = scored.max_by { |entry| entry[:probability] }
  return if winner.nil?
  return if winner[:probability] < 0.45

  title = title_from_meta(doc)
  node = winner[:node]
  published_at = published_at_from_meta(doc)
  # Reported confidence is capped at 0.9 (and floored at 0.0).
  confidence = winner[:probability].clamp(0.0, 0.9)

  Result.new(
    title: title,
    node: node,
    published_at: published_at,
    # NOTE(review): the whole document is passed as the byline; this looks
    # like a missing helper call -- confirm the intended extractor.
    byline: doc,
    source_tag: :ml,
    confidence: confidence
  )
end