Class: SourceMonitor::Items::ItemCreator::ContentExtractor

Inherits:
Object
  • Object
show all
Defined in:
lib/source_monitor/items/item_creator/content_extractor.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(source:) ⇒ ContentExtractor

Returns a new instance of ContentExtractor.



11
12
13
# File 'lib/source_monitor/items/item_creator/content_extractor.rb', line 11

# Builds an extractor bound to a single source.
#
# @param source [Object] stored in @source and exposed through #source
def initialize(source:)
  @source = source
end

Instance Attribute Details

#source ⇒ Object (readonly)

Returns the value of attribute source.



9
10
11
# File 'lib/source_monitor/items/item_creator/content_extractor.rb', line 9

# @return [Object] the source this extractor was constructed with
def source
  @source
end

Instance Method Details

#build_feed_content_metadata(result:, raw_content:, processed_content:) ⇒ Object



72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# File 'lib/source_monitor/items/item_creator/content_extractor.rb', line 72

# Summarizes the outcome of a readability parse as a string-keyed hash.
#
# @param result [Object] parse result; read for strategy, status, content, title and metadata
# @param raw_content [Object] feed content before processing
# @param processed_content [Object] content selected after processing
# @return [Hash{String => Object}] summary with nil-valued entries removed
def build_feed_content_metadata(result:, raw_content:, processed_content:)
  metadata = {
    "strategy" => result.strategy&.to_s,
    "status" => result.status&.to_s,
    "applied" => result.content.present?,
    "changed" => processed_content != raw_content
  }

  # NOTE(review): the attribute name `metadata` was lost in the extracted listing and is
  # reconstructed here — confirm against the parser result class.
  text_length = result.metadata && result.metadata[:readability_text_length]
  metadata["readability_text_length"] = text_length if text_length

  metadata["title"] = result.title if result.title.present?
  # Drop nil values (e.g. a missing strategy or status) so only known facts are recorded.
  metadata.compact
end

#deep_copy(value) ⇒ Object



92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# File 'lib/source_monitor/items/item_creator/content_extractor.rb', line 92

# Produces an independent copy of +value+, recursing into hashes and arrays.
#
# @param value [Object] structure to duplicate
# @return [Object] the copy, or +value+ itself when duplicating raises TypeError
def deep_copy(value)
  # Prefer the object's own deep_dup when one is available.
  return value.deep_dup if value.respond_to?(:deep_dup)

  if value.is_a?(Hash)
    # Build into a fresh instance of the same hash class.
    duplicate = value.class.new
    value.each { |key, nested| duplicate[key] = deep_copy(nested) }
    duplicate
  elsif value.is_a?(Array)
    value.map { |element| deep_copy(element) }
  else
    value.dup
  end
rescue TypeError
  # Objects that refuse to be duplicated are returned unchanged.
  value
end

#default_feed_readability_options ⇒ Object



65
66
67
68
69
70
# File 'lib/source_monitor/items/item_creator/content_extractor.rb', line 65

# Readability options passed to the feed content parser.
#
# @return [Hash] a deep copy of the :readability entry of the Readability scraper's
#   default settings, or an empty hash when that entry is missing
def default_feed_readability_options
  readability_defaults = SourceMonitor::Scrapers::Readability.default_settings[:readability]

  # Copy so callers cannot mutate the shared default settings.
  readability_defaults ? deep_copy(readability_defaults) : {}
end

#feed_content_parser_class ⇒ Object



45
46
47
# File 'lib/source_monitor/items/item_creator/content_extractor.rb', line 45

# Parser class that #process_feed_content instantiates to parse feed content.
#
# @return [Class] SourceMonitor::Scrapers::Parsers::ReadabilityParser
def feed_content_parser_class
  SourceMonitor::Scrapers::Parsers::ReadabilityParser
end

#html_fragment?(value) ⇒ Boolean

Returns:

  • (Boolean)


88
89
90
# File 'lib/source_monitor/items/item_creator/content_extractor.rb', line 88

# Heuristic check for markup: true when the string form of +value+ contains a "<",
# optional whitespace, and then at least one word character.
#
# @param value [Object] candidate content; converted with #to_s before matching
# @return [Boolean]
def html_fragment?(value)
  text = value.to_s
  /<\s*\w+/.match?(text)
end

#process_feed_content(raw_content, title:) ⇒ Object



15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# File 'lib/source_monitor/items/item_creator/content_extractor.rb', line 15

# Runs feed content through the readability parser when #should_process_feed_content? allows it.
#
# @param raw_content [Object] content taken from the feed entry
# @param title [Object] entry title, used when wrapping the content in an HTML document
# @return [Array] two elements: the content to use, then a metadata hash or nil.
#   The pair is [raw_content, nil] when processing is skipped, and
#   [raw_content, failure metadata] when any StandardError is raised while processing.
def process_feed_content(raw_content, title:)
  return [ raw_content, nil ] unless should_process_feed_content?(raw_content)

  parser = feed_content_parser_class.new
  html = wrap_content_for_readability(raw_content, title: title)
  result = parser.parse(html: html, readability: default_feed_readability_options)

  # Fall back to the untouched feed content when the parser produced nothing usable.
  processed_content = result.content.presence || raw_content
  metadata = build_feed_content_metadata(result: result, raw_content: raw_content, processed_content: processed_content)

  [ processed_content, metadata.presence ]
rescue StandardError => error
  # Processing is best-effort: keep the raw content and record what went wrong.
  metadata = {
    "status" => "failed",
    "strategy" => "readability",
    "applied" => false,
    "changed" => false,
    "error_class" => error.class.name,
    "error_message" => error.message
  }
  [ raw_content, metadata ]
end

#should_process_feed_content?(raw_content) ⇒ Boolean

Returns:

  • (Boolean)


38
39
40
41
42
43
# File 'lib/source_monitor/items/item_creator/content_extractor.rb', line 38

# Decides whether feed content should be run through readability processing.
#
# Truthy only when the source exposes and enables feed_content_readability_enabled?,
# the content is present, and the content passes #html_fragment?.
#
# @param raw_content [Object] content taken from the feed entry
# @return [Boolean, nil] result of the short-circuited checks
def should_process_feed_content?(raw_content)
  # Sources that do not define the flag are treated as not opted in.
  readability_enabled =
    source.respond_to?(:feed_content_readability_enabled?) && source.feed_content_readability_enabled?

  readability_enabled && raw_content.present? && html_fragment?(raw_content)
end

#wrap_content_for_readability(content, title:) ⇒ Object



49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# File 'lib/source_monitor/items/item_creator/content_extractor.rb', line 49

# Wraps a content fragment in a minimal HTML document for the readability parser.
#
# @param content [Object] fragment interpolated into the body as-is (not escaped)
# @param title [Object] entry title; HTML-escaped, or "Feed Entry" when not present
# @return [String] the complete HTML document
def wrap_content_for_readability(content, title:)
  page_title = "Feed Entry"
  page_title = CGI.escapeHTML(title) if title.present?

  <<~DOCUMENT
    <!DOCTYPE html>
    <html>
      <head>
        <meta charset="utf-8">
        <title>#{page_title}</title>
      </head>
      <body>
        #{content}
      </body>
    </html>
  DOCUMENT
end