Class: AlExtPosts::ExternalPostsGenerator

Inherits:

Jekyll::Generator

Object
Jekyll::Generator
AlExtPosts::ExternalPostsGenerator

show all

Defined in: lib/al_ext_posts.rb

Instance Method Summary

#create_document(site, source_name, url, content, src = {}) ⇒ Object
#fetch_content_from_url(url) ⇒ Object
#fetch_from_rss(site, src) ⇒ Object
#fetch_from_urls(site, src) ⇒ Object
#generate(site) ⇒ Object
#metadata_for_post(src, post) ⇒ Object
#metadata_value(post, key) ⇒ Object
#parse_published_date(published_date) ⇒ Object
#process_entries(site, src, entries) ⇒ Object

Instance Method Details

#create_document(site, source_name, url, content, src = {}) ⇒ `Object`

# File 'lib/al_ext_posts.rb', line 49

# Builds a Jekyll::Document for one external post and appends it to the
# site's 'posts' collection.
#
# The file slug is derived from the post title. When the title is missing or
# contains no character matched by /\w/, the slug falls back to the slugified
# source name joined with the last segment of the URL.
#
# @param site [Object] must respond to #in_source_dir and #collections
# @param source_name [String] name of the external source; stored as 'external_source'
# @param url [String] link to the original post; stored as 'redirect'
# @param content [Hash] reads :title, :content, :summary and :published
# @param src [Hash] optional 'categories' / 'tags' arrays, copied onto the
#   document only when they are non-empty Arrays
# @return [Object] the result of `docs << doc` on the posts collection
def create_document(site, source_name, url, content, src = {})
  # An entry may carry no title at all; treat nil like an empty title
  # instead of raising NoMethodError on nil.
  title = content[:title].to_s

  slug =
    if title.match?(/\w/)
      title.downcase.strip.gsub(' ', '-').gsub(/[^\w-]/, '')
    else
      # No usable word characters in the title: build the slug from the
      # source name and the last URL segment instead.
      "#{source_name.downcase.strip.gsub(' ', '-').gsub(/[^\w-]/, '')}-#{url.split('/').last}"
    end

  path = site.in_source_dir("_posts/#{slug}.md")
  doc = Jekyll::Document.new(
    path, { :site => site, :collection => site.collections['posts'] }
  )
  doc.data['external_source'] = source_name
  doc.data['title'] = content[:title]
  doc.data['feed_content'] = content[:content]
  doc.data['description'] = content[:summary]
  doc.data['date'] = content[:published]
  doc.data['redirect'] = url

  # Apply default categories and tags from the source configuration.
  %w[categories tags].each do |key|
    values = src[key]
    doc.data[key] = values if values.is_a?(Array) && !values.empty?
  end

  doc.content = content[:content]
  site.collections['posts'].docs << doc
end

#fetch_content_from_url(url) ⇒ `Object`

# File 'lib/al_ext_posts.rb', line 120

# Downloads a page and extracts the pieces used to build an external post.
#
# @param url [String] address of the page to fetch
# @return [Hash] with keys :title (String, '' when the page has no
#   <title>), :content (text of all <p> elements concatenated) and
#   :summary (the first description meta tag found, or nil)
def fetch_content_from_url(url)
  html = HTTParty.get(url).body
  parsed_html = Nokogiri::HTML(html)

  # `&.` does not short-circuit the rest of the chain, so a page without a
  # <title> would call `strip` on nil; `to_s` turns that case into ''.
  title = parsed_html.at('head title')&.text.to_s.strip

  # Try the plain description first, then the two OpenGraph spellings.
  description = parsed_html.at('head meta[name="description"]')&.attr('content')
  description ||= parsed_html.at('head meta[name="og:description"]')&.attr('content')
  description ||= parsed_html.at('head meta[property="og:description"]')&.attr('content')

  body_content = parsed_html.search('p').map(&:text).join

  {
    title: title,
    content: body_content,
    summary: description
    # Note: the published date is added by the caller (fetch_from_urls).
  }
end

#fetch_from_rss(site, src) ⇒ `Object`

# File 'lib/al_ext_posts.rb', line 25

# Downloads the feed at src['rss_url'], parses it with Feedjira and hands
# the entries to process_entries.
#
# Returns nil without processing anything when the response body is nil or
# when parsing raises a StandardError (the error is printed to stdout).
#
# @param site [Object] passed through to process_entries
# @param src [Hash] source configuration; reads 'rss_url'
# @return [Object, nil] whatever process_entries returns, or nil
def fetch_from_rss(site, src)
  feed_url = src['rss_url']
  xml = HTTParty.get(feed_url).body
  return if xml.nil?

  # Only the parse step is guarded; download and processing errors propagate.
  feed =
    begin
      Feedjira.parse(xml)
    rescue StandardError => e
      puts "Error parsing RSS feed from #{feed_url} - #{e.message}"
      return
    end

  process_entries(site, src, feed.entries)
end

#fetch_from_urls(site, src) ⇒ `Object`

# File 'lib/al_ext_posts.rb', line 83

# Creates one document per entry listed under src['posts'].
#
# For each entry the page at entry['url'] is fetched, the parsed
# entry['published_date'] is stored under :published, and the result is
# passed to create_document together with the per-post metadata.
#
# @param site [Object] passed through to create_document
# @param src [Hash] source configuration; reads 'posts' and 'name'
# @return [Object] the value of `src['posts'].each`
def fetch_from_urls(site, src)
  src['posts'].each do |entry|
    post_url = entry['url']
    puts "...fetching #{post_url}"

    page = fetch_content_from_url(post_url)
    page[:published] = parse_published_date(entry['published_date'])

    create_document(site, src['name'], post_url, page, metadata_for_post(src, entry))
  end
end

#generate(site) ⇒ `Object`

# File 'lib/al_ext_posts.rb', line 12

# Entry point: walks site.config['external_sources'] and fetches posts for
# each configured source.
#
# A source with 'rss_url' is fetched as a feed; otherwise a source with
# 'posts' is fetched URL by URL; a source with neither is only announced.
# Does nothing when 'external_sources' is nil.
#
# @param site [Object] must respond to #config
# @return [Object, nil] the value of the `each` call, or nil when no
#   sources are configured
def generate(site)
  sources = site.config['external_sources']
  return if sources.nil?

  sources.each do |source|
    puts "Fetching external posts from #{source['name']}:"

    if source['rss_url']
      fetch_from_rss(site, source)
    elsif source['posts']
      fetch_from_urls(site, source)
    end
  end
end

#metadata_for_post(src, post) ⇒ `Object`

# File 'lib/al_ext_posts.rb', line 92

# Returns a shallow copy of the source configuration in which 'categories'
# and 'tags' are replaced by the post's own values when the post supplies
# them.
#
# A post value is ignored when it is nil/false, or when it responds to
# #empty? and is empty.
#
# @param src [Hash] source configuration; duplicated, not modified
# @param post [Object] a post entry, read through metadata_value
# @return [Hash] the merged metadata
def metadata_for_post(src, post)
  %w[categories tags].each_with_object(src.dup) do |field, merged|
    override = metadata_value(post, field)
    next unless override
    next if override.respond_to?(:empty?) && override.empty?

    merged[field] = override
  end
end

#metadata_value(post, key) ⇒ `Object`

# File 'lib/al_ext_posts.rb', line 101

# Reads a metadata field from a post entry, whichever shape the entry has.
#
# Entries that respond to #key? are indexed with the string key first and
# then with its symbol form; other entries are asked via a public method of
# the same name when they respond to it.
#
# @param post [Object] the post entry
# @param key [String] field name, e.g. 'categories' or 'tags'
# @return [Object, nil] the value found, or nil when the entry offers none
def metadata_value(post, key)
  return post[key] || post[key.to_sym] if post.respond_to?(:key?)

  post.public_send(key) if post.respond_to?(key)
end

#parse_published_date(published_date) ⇒ `Object`

# File 'lib/al_ext_posts.rb', line 109

# Normalizes a configured publication date to a UTC Time.
#
# @param published_date [String, Date, Time] Strings are parsed with
#   Time.parse; Dates are converted with Date#to_time; Times (e.g. a YAML
#   timestamp that includes a time of day) are accepted as they are
# @return [Time] the corresponding time in UTC
# @raise [RuntimeError] when the value is of any other type, including nil
def parse_published_date(published_date)
  case published_date
  when String
    Time.parse(published_date).utc
  when Date
    published_date.to_time.utc
  when Time
    # getutc returns a new Time; Time#utc would convert the caller's object in place.
    published_date.getutc
  else
    raise "Invalid date format for #{published_date}"
  end
end

#process_entries(site, src, entries) ⇒ `Object`

# File 'lib/al_ext_posts.rb', line 37

# Creates one document per feed entry.
#
# Each entry's title, content, summary and published values are packed into
# the content hash that create_document expects, together with the per-entry
# metadata.
#
# @param site [Object] passed through to create_document
# @param src [Hash] source configuration; reads 'name'
# @param entries [#each] entries responding to #url, #title, #content,
#   #summary and #published
# @return [Object] the value of `entries.each`
def process_entries(site, src, entries)
  entries.each do |entry|
    link = entry.url
    puts "...fetching #{link}"

    payload = {
      title: entry.title,
      content: entry.content,
      summary: entry.summary,
      published: entry.published
    }
    create_document(site, src['name'], link, payload, metadata_for_post(src, entry))
  end
end

Class: AlExtPosts::ExternalPostsGenerator

Instance Method Summary collapse

Instance Method Details

#create_document(site, source_name, url, content, src = {}) ⇒ Object

#fetch_content_from_url(url) ⇒ Object

#fetch_from_rss(site, src) ⇒ Object

#fetch_from_urls(site, src) ⇒ Object

#generate(site) ⇒ Object

#metadata_for_post(src, post) ⇒ Object

#metadata_value(post, key) ⇒ Object

#parse_published_date(published_date) ⇒ Object

#process_entries(site, src, entries) ⇒ Object

#create_document(site, source_name, url, content, src = {}) ⇒ `Object`

#fetch_content_from_url(url) ⇒ `Object`

#fetch_from_rss(site, src) ⇒ `Object`

#fetch_from_urls(site, src) ⇒ `Object`

#generate(site) ⇒ `Object`

#metadata_for_post(src, post) ⇒ `Object`

#metadata_value(post, key) ⇒ `Object`

#parse_published_date(published_date) ⇒ `Object`

#process_entries(site, src, entries) ⇒ `Object`