Class: SourceMonitor::Scrapers::Readability
- Defined in:
- lib/source_monitor/scrapers/readability.rb
Constant Summary collapse
- DEFAULT_ACCEPT =
"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
- FETCHER_CLASS =
SourceMonitor::Scrapers::Fetchers::HttpFetcher
- PARSER_CLASS =
SourceMonitor::Scrapers::Parsers::ReadabilityParser
Class Method Summary collapse
Instance Method Summary collapse
Methods inherited from Base
adapter_name, call, #initialize
Constructor Details
This class inherits a constructor from SourceMonitor::Scrapers::Base
Class Method Details
.default_settings ⇒ Object
15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
# File 'lib/source_monitor/scrapers/readability.rb', line 15
#
# Builds a fresh settings hash with three sections: :http (request headers,
# timeouts, proxy), :selectors (content/title, both nil) and :readability
# (parser tuning values).
#
# @return [Hash] a newly allocated hash on every call
def self.default_settings
  http_defaults = {
    headers: {
      "Accept" => DEFAULT_ACCEPT,
      "User-Agent" => SourceMonitor::HTTP::DEFAULT_USER_AGENT
    },
    timeout: SourceMonitor::HTTP::DEFAULT_TIMEOUT,
    open_timeout: SourceMonitor::HTTP::DEFAULT_OPEN_TIMEOUT,
    proxy: nil
  }

  selector_defaults = { content: nil, title: nil }

  readability_defaults = {
    remove_unlikely_candidates: true,
    clean_conditionally: true,
    retry_length: 250,
    min_text_length: 25
  }

  {
    http: http_defaults,
    selectors: selector_defaults,
    readability: readability_defaults
  }
end
Instance Method Details
#call ⇒ Object
39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 |
# File 'lib/source_monitor/scrapers/readability.rb', line 39
#
# Fetches the preferred URL, parses the fetched body and wraps the outcome in
# a Result.
#
# A failure result is returned instead when the URL is blank, when the fetch
# or the parse step reports status :failed, or when any StandardError is
# raised along the way.
#
# @return [Object] a Result on success; otherwise whatever failure_result,
#   build_fetch_failure or build_parser_failure return
def call
  url = preferred_url
  return failure_result("missing_url", "No URL available for scraping", url:) if url.blank?

  fetch_result = fetcher.fetch(url:, settings: settings[:http])
  return build_fetch_failure(fetch_result, url) if fetch_result.status == :failed

  parser_result = parser.parse(
    html: fetch_result.body.to_s,
    selectors: settings[:selectors],
    readability: settings[:readability]
  )
  return build_parser_failure(parser_result, fetch_result, url) if parser_result.status == :failed

  Result.new(
    status: parser_result.status,
    html: fetch_result.body,
    content: parser_result.content,
    # NOTE(review): the helper name was missing from this listing; `build_metadata`
    # is an assumption — confirm against the class's private helpers.
    metadata: build_metadata(fetch_result:, parser_result:, url:)
  )
rescue StandardError => error
  # `url` is still nil here if preferred_url itself raised.
  failure_result(error.class.name, error.message, url: url)
end