Class: SourceMonitor::Items::ItemCreator::EntryParser

Inherits:
Object
  • Object
show all
Includes:
MediaExtraction
Defined in:
lib/source_monitor/items/item_creator/entry_parser.rb,
lib/source_monitor/items/item_creator/entry_parser/media_extraction.rb

Defined Under Namespace

Modules: MediaExtraction

Constant Summary collapse

# Entry readers tried, in this order, by #extract_content.
CONTENT_METHODS = %i[content content_encoded summary].freeze
# Entry readers tried, in this order, by #extract_timestamp.
TIMESTAMP_METHODS = %i[published updated].freeze
# Delimiters (runs of commas/semicolons) used by #split_keywords.
KEYWORD_SEPARATORS = /[,;]+/.freeze
# Top-level key under which #extract_metadata nests the normalized entry hash.
METADATA_ROOT_KEY = "feedjira_entry".freeze
# Joiner placed between title, url and content by #generate_fingerprint.
FINGERPRINT_SEPARATOR = "\u0000".freeze

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods included from MediaExtraction

#extract_enclosures, #extract_media_content, #extract_media_thumbnail_url

Constructor Details

#initialize(source:, entry:, content_extractor:) ⇒ EntryParser

Returns a new instance of EntryParser.



18
19
20
21
22
# File 'lib/source_monitor/items/item_creator/entry_parser.rb', line 18

# Stores the three collaborators on the new instance.
#
# @param source [Object] kept in @source
# @param entry [Object] kept in @entry
# @param content_extractor [Object] kept in @content_extractor
def initialize(source:, entry:, content_extractor:)
  @source, @entry, @content_extractor = source, entry, content_extractor
end

Instance Attribute Details

#entry ⇒ Object (readonly)

Returns the value of attribute entry.



16
17
18
# File 'lib/source_monitor/items/item_creator/entry_parser.rb', line 16

# Reader for the feed entry passed to the constructor.
#
# @return [Object] the value held in @entry
def entry
  @entry
end

#source ⇒ Object (readonly)

Returns the value of attribute source.



16
17
18
# File 'lib/source_monitor/items/item_creator/entry_parser.rb', line 16

# Reader for the source passed to the constructor.
#
# @return [Object] the value held in @source
def source
  @source
end

Instance Method Details

#atom_entry? ⇒ Boolean

Returns:

  • (Boolean)


282
283
284
# File 'lib/source_monitor/items/item_creator/entry_parser.rb', line 282

# Tells whether the wrapped entry is a Feedjira Atom entry.
#
# @return [Boolean, nil] nil when Feedjira::Parser::AtomEntry is not defined,
#   otherwise whether +entry+ is an instance of it
def atom_entry?
  return unless defined?(Feedjira::Parser::AtomEntry)

  entry.is_a?(Feedjira::Parser::AtomEntry)
end

#extract_author ⇒ Object



133
134
135
# File 'lib/source_monitor/items/item_creator/entry_parser.rb', line 133

# Reads the entry's primary author, if the entry exposes one.
#
# @return [Object, nil] +entry.author+ passed through #string_or_nil, or nil
#   when the entry has no +author+ reader
def extract_author
  return unless entry.respond_to?(:author)

  string_or_nil(entry.author)
end

#extract_authors ⇒ Object



137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
# File 'lib/source_monitor/items/item_creator/entry_parser.rb', line 137

# Gathers every author value the entry exposes: +rss_authors+, +dc_creators+
# (or +dc_creator+), +author_nodes+, JSON Feed authors, and finally the
# primary author from #extract_author.
#
# @return [Array] author values with nils removed and duplicates collapsed,
#   in the order they were discovered
def extract_authors
  values = []

  if entry.respond_to?(:rss_authors)
    values.concat(Array(entry.rss_authors).map { |value| string_or_nil(value) })
  end

  if entry.respond_to?(:dc_creators)
    values.concat(Array(entry.dc_creators).map { |value| string_or_nil(value) })
  elsif entry.respond_to?(:dc_creator)
    values << string_or_nil(entry.dc_creator)
  end

  if entry.respond_to?(:author_nodes)
    values.concat(
      Array(entry.author_nodes).map do |node|
        # Only call readers the node actually has: a node exposing just
        # +email+ must not blow up on a missing +name+.
        %i[name email uri].lazy
          .select { |reader| node.respond_to?(reader) }
          .map { |reader| string_or_nil(node.public_send(reader)) }
          .find(&:itself)
      end
    )
  end

  if json_entry? && entry.respond_to?(:json) && entry.json
    json = entry.json

    # Feed data is untrusted: skip author entries that are not objects
    # instead of raising on nil or string values.
    json_authors = Array(json["authors"]).select { |author| author.is_a?(Hash) }
    values.concat(json_authors.map { |author| string_or_nil(author["name"]) })

    single_author = json["author"]
    values << string_or_nil(single_author["name"]) if single_author.is_a?(Hash)
  end

  primary_author = extract_author
  values << primary_author if primary_author.present?

  values.compact.uniq
end

#extract_categories ⇒ Object



174
175
176
177
178
179
180
181
182
# File 'lib/source_monitor/items/item_creator/entry_parser.rb', line 174

# Merges the entry's +categories+, +tags+ and (for JSON entries) the "tags"
# key of +entry.json+ into one list.
#
# @return [Array] the merged values after #sanitize_string_array
def extract_categories
  collected = %i[categories tags].flat_map do |reader|
    entry.respond_to?(reader) ? Array(entry.public_send(reader)) : []
  end

  collected.concat(Array(entry.json["tags"])) if json_entry? && entry.respond_to?(:json) && entry.json

  sanitize_string_array(collected)
end

#extract_comments_count ⇒ Object



221
222
223
224
225
226
# File 'lib/source_monitor/items/item_creator/entry_parser.rb', line 221

# Reads a comment count from +slash_comments_raw+, falling back to
# +comments_count+ when the first is missing or falsy.
#
# @return [Object] the chosen raw value passed through #safe_integer
def extract_comments_count
  count = entry.respond_to?(:slash_comments_raw) ? entry.slash_comments_raw : nil
  count = entry.comments_count if entry.respond_to?(:comments_count) && !count
  safe_integer(count)
end

#extract_comments_url ⇒ Object



217
218
219
# File 'lib/source_monitor/items/item_creator/entry_parser.rb', line 217

# Reads the entry's +comments+ value, if the entry exposes one.
#
# @return [Object, nil] +entry.comments+ passed through #string_or_nil, or
#   nil when the entry has no +comments+ reader
def extract_comments_url
  return unless entry.respond_to?(:comments)

  string_or_nil(entry.comments)
end

#extract_content ⇒ Object



109
110
111
112
113
114
115
116
117
# File 'lib/source_monitor/items/item_creator/entry_parser.rb', line 109

# Returns the first present value among the readers listed in
# CONTENT_METHODS, trying them in order and stopping at the first hit.
#
# @return [Object, nil] the first present value after #string_or_nil, or nil
def extract_content
  CONTENT_METHODS.lazy
    .select { |reader| entry.respond_to?(reader) }
    .map { |reader| string_or_nil(entry.public_send(reader)) }
    .find(&:present?)
end


#extract_copyright ⇒ Object



211
212
213
214
215
# File 'lib/source_monitor/items/item_creator/entry_parser.rb', line 211

# Reads the copyright notice from +entry.copyright+ when that reader exists;
# otherwise, for JSON entries, from the "copyright" key of +entry.json+.
#
# @return [Object, nil] the value after #string_or_nil, or nil when neither
#   source applies
def extract_copyright
  if entry.respond_to?(:copyright)
    string_or_nil(entry.copyright)
  elsif json_entry? && entry.respond_to?(:json) && entry.json
    string_or_nil(entry.json["copyright"])
  end
end

#extract_guid ⇒ Object



64
65
66
67
68
69
70
71
72
73
74
75
76
77
# File 'lib/source_monitor/items/item_creator/entry_parser.rb', line 64

# Picks a GUID for the entry: +entry_id+ when present, otherwise +id+ unless
# it merely repeats the entry URL.
#
# @return [Object, nil] the chosen identifier, or nil when none qualifies
def extract_guid
  if entry.respond_to?(:entry_id)
    guid = string_or_nil(entry.entry_id)
    return guid if guid.present?
  end

  return nil unless entry.respond_to?(:id)

  candidate = string_or_nil(entry.id)
  return nil if candidate.blank?

  link = extract_url
  # An id identical to the URL is not treated as a distinct GUID.
  if link.present? && candidate == link
    nil
  else
    candidate
  end
end

#extract_keywords ⇒ Object



198
199
200
201
202
203
# File 'lib/source_monitor/items/item_creator/entry_parser.rb', line 198

# Collects keywords from +media_keywords_raw+ and +itunes_keywords_raw+,
# splitting each with #split_keywords.
#
# @return [Array] the combined keywords after #sanitize_string_array
def extract_keywords
  collected = %i[media_keywords_raw itunes_keywords_raw].flat_map do |reader|
    entry.respond_to?(reader) ? split_keywords(entry.public_send(reader)) : []
  end

  sanitize_string_array(collected)
end

#extract_language ⇒ Object



205
206
207
208
209
# File 'lib/source_monitor/items/item_creator/entry_parser.rb', line 205

# Reads the language from +entry.language+ when that reader exists;
# otherwise, for JSON entries, from the "language" key of +entry.json+.
#
# @return [Object, nil] the value after #string_or_nil, or nil when neither
#   source applies
def extract_language
  if entry.respond_to?(:language)
    string_or_nil(entry.language)
  elsif json_entry? && entry.respond_to?(:json) && entry.json
    string_or_nil(entry.json["language"])
  end
end

#extract_metadata ⇒ Object



228
229
230
231
232
233
234
235
# File 'lib/source_monitor/items/item_creator/entry_parser.rb', line 228

# Builds the metadata hash for the entry by normalizing +entry.to_h+ and
# nesting it under METADATA_ROOT_KEY.
#
# @return [Hash] +{ METADATA_ROOT_KEY => normalized }+, or an empty hash when
#   the entry has no +to_h+ or the normalized result is blank
def extract_metadata
  return {} unless entry.respond_to?(:to_h)

  normalized = normalize_metadata(entry.to_h)
  return {} if normalized.blank?

  { METADATA_ROOT_KEY => normalized }
end

#extract_summary ⇒ Object



103
104
105
106
107
# File 'lib/source_monitor/items/item_creator/entry_parser.rb', line 103

# Reads the entry's +summary+, if the entry exposes one.
#
# @return [Object, nil] +entry.summary+ passed through #string_or_nil, or nil
#   when the entry has no +summary+ reader
def extract_summary
  string_or_nil(entry.summary) if entry.respond_to?(:summary)
end

#extract_tags ⇒ Object



184
185
186
187
188
189
190
191
192
193
194
195
196
# File 'lib/source_monitor/items/item_creator/entry_parser.rb', line 184

# Collects tags from +entry.tags+ and, for JSON entries, the "tags" key of
# +entry.json+. When nothing was found and the entry has +categories+, the
# result of #extract_categories is used instead.
#
# @return [Array] the chosen values after #sanitize_string_array
def extract_tags
  found = []
  found += Array(entry.tags) if entry.respond_to?(:tags)
  found += Array(entry.json["tags"]) if json_entry? && entry.respond_to?(:json) && entry.json

  found = extract_categories if found.empty? && entry.respond_to?(:categories)

  sanitize_string_array(found)
end

#extract_timestamp ⇒ Object



119
120
121
122
123
124
125
126
127
# File 'lib/source_monitor/items/item_creator/entry_parser.rb', line 119

# Returns the first present value among the readers listed in
# TIMESTAMP_METHODS, trying them in order and stopping at the first hit.
#
# @return [Object, nil] the first present reader value, or nil
def extract_timestamp
  TIMESTAMP_METHODS.lazy
    .select { |reader| entry.respond_to?(reader) }
    .map { |reader| entry.public_send(reader) }
    .find(&:present?)
end

#extract_updated_timestamp ⇒ Object



129
130
131
# File 'lib/source_monitor/items/item_creator/entry_parser.rb', line 129

# Reads the entry's +updated+ value when it exists and is present.
#
# @return [Object, nil] +entry.updated+, or nil when the reader is missing or
#   its value is not present
def extract_updated_timestamp
  return unless entry.respond_to?(:updated)
  return unless entry.updated.present?

  entry.updated
end

#extract_url ⇒ Object



79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# File 'lib/source_monitor/items/item_creator/entry_parser.rb', line 79

# Resolves the entry's URL by trying, in order: +entry.url+; the +href+ of a
# link node whose +rel+ is absent or "alternate" (else the first link node);
# and the first present item of +entry.links+.
#
# @return [Object, nil] the first present candidate, or nil
def extract_url
  if entry.respond_to?(:url)
    direct = string_or_nil(entry.url)
    return direct if direct.present?
  end

  if entry.respond_to?(:link_nodes)
    preferred = Array(entry.link_nodes).find do |node|
      relation = string_or_nil(node&.rel)&.downcase
      relation.nil? || relation == "alternate"
    end
    # No rel-less/alternate node: fall back to whichever node comes first.
    chosen = preferred || Array(entry.link_nodes).first
    target = string_or_nil(chosen&.href)
    return target if target.present?
  end

  return nil unless entry.respond_to?(:links)

  Array(entry.links).map { |link| string_or_nil(link) }.find(&:present?)
end

#generate_fingerprint(title, url, content) ⇒ Object



237
238
239
240
241
242
243
244
245
# File 'lib/source_monitor/items/item_creator/entry_parser.rb', line 237

# Computes a SHA-256 hex digest over title, url and content, each converted
# with +to_s+ and joined by FINGERPRINT_SEPARATOR.
#
# @param title [Object]
# @param url [Object]
# @param content [Object]
# @return [String] hexadecimal digest
def generate_fingerprint(title, url, content)
  payload = [title, url, content].map(&:to_s).join(FINGERPRINT_SEPARATOR)
  Digest::SHA256.hexdigest(payload)
end

#json_entry? ⇒ Boolean

Returns:

  • (Boolean)


278
279
280
# File 'lib/source_monitor/items/item_creator/entry_parser.rb', line 278

# Tells whether the wrapped entry is a Feedjira JSON Feed item.
#
# @return [Boolean, nil] nil when Feedjira::Parser::JSONFeedItem is not
#   defined, otherwise whether +entry+ is an instance of it
def json_entry?
  return unless defined?(Feedjira::Parser::JSONFeedItem)

  entry.is_a?(Feedjira::Parser::JSONFeedItem)
end

#normalize_metadata(value) ⇒ Object



286
287
288
289
290
# File 'lib/source_monitor/items/item_creator/entry_parser.rb', line 286

# Round-trips a value through JSON so the result contains only JSON-native
# types (e.g. symbol keys become strings).
#
# @param value [Object] the structure to normalize
# @return [Object] the parsed JSON structure, or an empty hash when the value
#   cannot be generated or parsed as JSON
def normalize_metadata(value)
  JSON.parse(JSON.generate(value))
rescue JSON::GeneratorError, JSON::ParserError, TypeError
  {}
end

#parse ⇒ Object



24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/source_monitor/items/item_creator/entry_parser.rb', line 24

# Extracts every supported attribute from the entry and returns them as one
# hash, dropping keys whose value is nil.
#
# The raw content is passed through the content extractor, which returns the
# processed content plus a second value; when that second value is present it
# is merged into the metadata under "feed_content_processing". The
# fingerprint is computed from the title, URL and processed content.
#
# NOTE(review): the local names +content_processing_metadata+ and +metadata+
# were reconstructed — confirm against the original source file.
#
# @return [Hash{Symbol => Object}] item attributes with nil values removed
def parse
  url = extract_url
  title = string_or_nil(entry.title) if entry.respond_to?(:title)
  raw_content = extract_content
  content, content_processing_metadata = @content_extractor.process_feed_content(raw_content, title: title)
  fingerprint = generate_fingerprint(title, url, content)
  published_at = extract_timestamp
  updated_at_source = extract_updated_timestamp

  metadata = extract_metadata
  if content_processing_metadata.present?
    metadata = metadata.merge("feed_content_processing" => content_processing_metadata)
  end

  {
    guid: extract_guid,
    title: title,
    url: url,
    canonical_url: url,
    author: extract_author,
    authors: extract_authors,
    summary: extract_summary,
    content: content,
    published_at: published_at,
    updated_at_source: updated_at_source,
    categories: extract_categories,
    tags: extract_tags,
    keywords: extract_keywords,
    enclosures: extract_enclosures,
    media_thumbnail_url: extract_media_thumbnail_url,
    media_content: extract_media_content,
    language: extract_language,
    copyright: extract_copyright,
    comments_url: extract_comments_url,
    comments_count: extract_comments_count,
    metadata: metadata,
    content_fingerprint: fingerprint
  }.compact
end

#safe_integer(value) ⇒ Object



266
267
268
269
270
271
272
273
274
275
276
# File 'lib/source_monitor/items/item_creator/entry_parser.rb', line 266

# Converts a loosely-typed value to an Integer without raising.
#
# @param value [Object, nil] an Integer, or anything whose +to_s+ may hold a
#   base-10 integer
# @return [Integer, nil] the value itself when already an Integer; the parsed
#   base-10 integer when the stripped string form is one; otherwise nil
def safe_integer(value)
  return if value.nil?
  return value if value.is_a?(Integer)

  # exception: false yields nil for empty or non-numeric text, so no rescue
  # (and no separate blank check) is needed.
  Integer(value.to_s.strip, 10, exception: false)
end

#sanitize_string_array(values) ⇒ Object



253
254
255
# File 'lib/source_monitor/items/item_creator/entry_parser.rb', line 253

# Normalizes a list of values: each element goes through #string_or_nil, then
# nils and duplicates are removed.
#
# @param values [Object] a single value, an array, or nil (coerced via Array())
# @return [Array] cleaned values in their original order
def sanitize_string_array(values)
  cleaned = Array(values).map { |item| string_or_nil(item) }
  cleaned.compact!
  cleaned.uniq!
  cleaned
end

#split_keywords(value) ⇒ Object



257
258
259
260
261
262
263
264
# File 'lib/source_monitor/items/item_creator/entry_parser.rb', line 257

# Splits a raw keyword string on KEYWORD_SEPARATORS and drops blank pieces.
#
# @param value [Object, nil] raw keyword text
# @return [Array<String>] stripped, non-blank keywords; empty when the input
#   is nil or blank
def split_keywords(value)
  return [] if value.nil?

  raw = string_or_nil(value)
  return [] if raw.blank?

  raw.split(KEYWORD_SEPARATORS).each_with_object([]) do |piece, keywords|
    keyword = piece.strip.presence
    keywords << keyword unless keyword.nil?
  end
end

#string_or_nil(value) ⇒ Object



247
248
249
250
251
# File 'lib/source_monitor/items/item_creator/entry_parser.rb', line 247

# Cleans up String input and leaves everything else untouched.
#
# @param value [Object]
# @return [Object, nil] for a String, its stripped form or nil when nothing
#   present remains; any non-String value is returned unchanged
def string_or_nil(value)
  value.is_a?(String) ? value.strip.presence : value
end