Class: Legion::Data::Extract::Handlers::Html

Inherits:
Base
  • Object
show all
Defined in:
lib/legion/data/extract/handlers/html.rb

Class Method Summary collapse

Methods inherited from Base

available?, for_type, inherited, register, supported_types

Methods included from Logging::Helper

#handle_exception

Class Method Details

.extensions ⇒ Object



9
# File 'lib/legion/data/extract/handlers/html.rb', line 9

# Lists the file extensions declared by this handler.
#
# @return [Array<String>] +.html+ and +.htm+, as a new array on every call
def self.extensions
  %w[.html .htm]
end

.extract(source) ⇒ Object



12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# File 'lib/legion/data/extract/handlers/html.rb', line 12

# Pulls whitespace-normalized text and the +<title>+ out of an HTML document.
#
# Nokogiri is required lazily inside the method, so a missing gem is reported
# through the returned hash rather than raised at load time.
#
# @param source [#read, #to_s] an object responding to +read+ (its result is
#   parsed), or any other value whose +to_s+ is used as a file path
# @return [Hash] on success +{ text: String, metadata: { title: String or nil } }+;
#   when nokogiri cannot be loaded +{ text: nil, error: :gem_not_installed, gem: gem_name }+;
#   on any other StandardError +{ text: nil, error: <exception message> }+
def self.extract(source)
  require 'nokogiri'

  html = if source.respond_to?(:read)
           source.read
         else
           File.read(source.to_s)
         end
  document = ::Nokogiri::HTML(html)

  # Strip elements whose content should not appear in the extracted text
  document.css('script, style, noscript').each { |node| node.remove }

  title_node = document.at_css('title')
  page_title = title_node&.text&.strip
  # Collapse every whitespace run to a single space
  flattened = document.text.gsub(/\s+/, ' ').strip

  { text: flattened, metadata: { title: page_title } }
rescue LoadError => load_error
  # LoadError is not a StandardError, so it needs its own rescue clause
  handle_exception(load_error, level: :warn, handled: true, operation: :extract_html, gem: gem_name)
  { text: nil, error: :gem_not_installed, gem: gem_name }
rescue StandardError => failure
  handle_exception(failure, level: :warn, handled: true, operation: :extract_html)
  { text: nil, error: failure.message }
end

.gem_name ⇒ Object



10
# File 'lib/legion/data/extract/handlers/html.rb', line 10

# Names the gem this handler relies on.
#
# @return [String] always +'nokogiri'+
def self.gem_name
  'nokogiri'
end

.type ⇒ Object



8
# File 'lib/legion/data/extract/handlers/html.rb', line 8

# Identifies the content type this handler declares.
#
# @return [Symbol] always +:html+
def self.type
  :html
end