Class: SourceMonitor::Items::ItemCreator::ContentExtractor
- Inherits:
-
Object
- Object
- SourceMonitor::Items::ItemCreator::ContentExtractor
- Defined in:
- lib/source_monitor/items/item_creator/content_extractor.rb
Instance Attribute Summary collapse
-
#source ⇒ Object
readonly
Returns the value of attribute source.
Instance Method Summary collapse
- #build_feed_content_metadata(result:, raw_content:, processed_content:) ⇒ Object
- #deep_copy(value) ⇒ Object
- #default_feed_readability_options ⇒ Object
- #feed_content_parser_class ⇒ Object
- #html_fragment?(value) ⇒ Boolean
-
#initialize(source:) ⇒ ContentExtractor
constructor
A new instance of ContentExtractor.
- #process_feed_content(raw_content, title:) ⇒ Object
- #should_process_feed_content?(raw_content) ⇒ Boolean
- #wrap_content_for_readability(content, title:) ⇒ Object
Constructor Details
#initialize(source:) ⇒ ContentExtractor
Returns a new instance of ContentExtractor.
11 12 13 |
# File 'lib/source_monitor/items/item_creator/content_extractor.rb', line 11 def initialize(source:) @source = source end |
Instance Attribute Details
#source ⇒ Object (readonly)
Returns the value of attribute source.
9 10 11 |
# File 'lib/source_monitor/items/item_creator/content_extractor.rb', line 9 def source @source end |
Instance Method Details
#build_feed_content_metadata(result:, raw_content:, processed_content:) ⇒ Object
72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 |
# File 'lib/source_monitor/items/item_creator/content_extractor.rb', line 72 def build_feed_content_metadata(result:, raw_content:, processed_content:) metadata = { "strategy" => result.strategy&.to_s, "status" => result.status&.to_s, "applied" => result.content.present?, "changed" => processed_content != raw_content } if result.metadata && result.metadata[:readability_text_length] metadata["readability_text_length"] = result.metadata[:readability_text_length] end metadata["title"] = result.title if result.title.present? metadata.compact end |
#deep_copy(value) ⇒ Object
92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 |
# File 'lib/source_monitor/items/item_creator/content_extractor.rb', line 92 def deep_copy(value) if value.respond_to?(:deep_dup) return value.deep_dup end case value when Hash value.each_with_object(value.class.new) do |(key, nested), copy| copy[key] = deep_copy(nested) end when Array value.map { |element| deep_copy(element) } else value.dup end rescue TypeError value end |
#default_feed_readability_options ⇒ Object
65 66 67 68 69 70 |
# File 'lib/source_monitor/items/item_creator/content_extractor.rb', line 65 def default_feed_readability_options default = SourceMonitor::Scrapers::Readability.default_settings[:readability] return {} unless default deep_copy(default) end |
#feed_content_parser_class ⇒ Object
45 46 47 |
# File 'lib/source_monitor/items/item_creator/content_extractor.rb', line 45 def feed_content_parser_class SourceMonitor::Scrapers::Parsers::ReadabilityParser end |
#html_fragment?(value) ⇒ Boolean
88 89 90 |
# File 'lib/source_monitor/items/item_creator/content_extractor.rb', line 88 def html_fragment?(value) value.to_s.match?(/<\s*\w+/) end |
#process_feed_content(raw_content, title:) ⇒ Object
15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 |
# File 'lib/source_monitor/items/item_creator/content_extractor.rb', line 15 def process_feed_content(raw_content, title:) return [ raw_content, nil ] unless should_process_feed_content?(raw_content) parser = feed_content_parser_class.new html = wrap_content_for_readability(raw_content, title: title) result = parser.parse(html: html, readability: default_feed_readability_options) processed_content = result.content.presence || raw_content metadata = build_feed_content_metadata(result: result, raw_content: raw_content, processed_content: processed_content) [ processed_content, metadata.presence ] rescue StandardError => error metadata = { "status" => "failed", "strategy" => "readability", "applied" => false, "changed" => false, "error_class" => error.class.name, "error_message" => error.message } [ raw_content, metadata ] end |
#should_process_feed_content?(raw_content) ⇒ Boolean
38 39 40 41 42 43 |
# File 'lib/source_monitor/items/item_creator/content_extractor.rb', line 38 def should_process_feed_content?(raw_content) source.respond_to?(:feed_content_readability_enabled?) && source.feed_content_readability_enabled? && raw_content.present? && html_fragment?(raw_content) end |
#wrap_content_for_readability(content, title:) ⇒ Object
49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 |
# File 'lib/source_monitor/items/item_creator/content_extractor.rb', line 49 def wrap_content_for_readability(content, title:) safe_title = title.present? ? CGI.escapeHTML(title) : "Feed Entry" <<~HTML <!DOCTYPE html> <html> <head> <meta charset="utf-8"> <title>#{safe_title}</title> </head> <body> #{content} </body> </html> HTML end |