Class: SourceMonitor::Scrapers::Readability

Inherits:
Base
  • Object
show all
Defined in:
lib/source_monitor/scrapers/readability.rb

Constant Summary collapse

DEFAULT_ACCEPT =
"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
FETCHER_CLASS =
SourceMonitor::Scrapers::Fetchers::HttpFetcher
PARSER_CLASS =
SourceMonitor::Scrapers::Parsers::ReadabilityParser

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from Base

adapter_name, call, #initialize

Constructor Details

This class inherits a constructor from SourceMonitor::Scrapers::Base

Class Method Details

.default_settings ⇒ Object



15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# File 'lib/source_monitor/scrapers/readability.rb', line 15

# Builds the default settings hash for this scraper.
#
# A new Hash is assembled on every call from three sections:
#   :http        - request headers, timeout, open_timeout and proxy
#   :selectors   - :content and :title entries, both nil by default
#   :readability - parser flags and length thresholds
#
# @return [Hash] nested settings keyed by :http, :selectors and :readability
def self.default_settings
  request_headers = {
    "Accept" => DEFAULT_ACCEPT,
    "User-Agent" => SourceMonitor::HTTP::DEFAULT_USER_AGENT
  }

  http_defaults = {
    headers: request_headers,
    timeout: SourceMonitor::HTTP::DEFAULT_TIMEOUT,
    open_timeout: SourceMonitor::HTTP::DEFAULT_OPEN_TIMEOUT,
    proxy: nil
  }

  selector_defaults = { content: nil, title: nil }

  readability_defaults = {
    remove_unlikely_candidates: true,
    clean_conditionally: true,
    retry_length: 250,
    min_text_length: 25
  }

  {
    http: http_defaults,
    selectors: selector_defaults,
    readability: readability_defaults
  }
end

Instance Method Details

#call ⇒ Object



39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/source_monitor/scrapers/readability.rb', line 39

# Fetches the preferred URL and parses the response body into a Result.
#
# Steps, in order:
#   1. Resolve the URL via +preferred_url+; a blank URL short-circuits with a
#      "missing_url" failure result.
#   2. Fetch through +fetcher+ with settings[:http]; a fetch result whose
#      status is :failed is handed to +build_fetch_failure+.
#   3. Parse the body (coerced with +to_s+) through +parser+ with
#      settings[:selectors] and settings[:readability]; a parser result whose
#      status is :failed is handed to +build_parser_failure+.
#   4. Otherwise build a Result carrying the parser status, the fetched body,
#      the parsed content and metadata about the run.
#
# Any StandardError raised along the way is converted into a failure result
# whose code is the exception class name and whose message is the exception
# message, so this method does not propagate StandardError to the caller.
#
# @return [Object] a Result on success, or whatever the failure helpers return
def call
  url = preferred_url
  return failure_result("missing_url", "No URL available for scraping", url:) if url.blank?

  fetch_result = fetcher.fetch(url:, settings: settings[:http])
  return build_fetch_failure(fetch_result, url) if fetch_result.status == :failed

  parser_result = parser.parse(
    html: fetch_result.body.to_s,
    selectors: settings[:selectors],
    readability: settings[:readability]
  )
  return build_parser_failure(parser_result, fetch_result, url) if parser_result.status == :failed

  Result.new(
    status: parser_result.status,
    html: fetch_result.body,
    content: parser_result.content,
    # NOTE(review): this was previously a bare parenthesized keyword list,
    # which is not valid Ruby. A Hash literal is the closest valid form;
    # confirm whether a dedicated metadata-builder helper was intended here.
    metadata: { fetch_result:, parser_result:, url: }
  )
rescue StandardError => error
  # +url+ is nil here if +preferred_url+ itself raised.
  failure_result(error.class.name, error.message, url:)
end