Class: Legion::Data::Extract::Handlers::Html
- Defined in:
- lib/legion/data/extract/handlers/html.rb
Class Method Summary
Methods inherited from Base
available?, for_type, inherited, register, supported_types
Methods included from Logging::Helper
Class Method Details
.extensions ⇒ Object
9 |
# File 'lib/legion/data/extract/handlers/html.rb', line 9 def self.extensions = %w[.html .htm] |
.extract(source) ⇒ Object
12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 |
# File 'lib/legion/data/extract/handlers/html.rb', line 12 def self.extract(source) require 'nokogiri' content = source.respond_to?(:read) ? source.read : File.read(source.to_s) doc = ::Nokogiri::HTML(content) # Remove script and style elements doc.css('script, style, noscript').each(&:remove) title = doc.at_css('title')&.text&.strip text = doc.text.gsub(/\s+/, ' ').strip { text: text, metadata: { title: title } } rescue LoadError => e handle_exception(e, level: :warn, handled: true, operation: :extract_html, gem: gem_name) { text: nil, error: :gem_not_installed, gem: gem_name } rescue StandardError => e handle_exception(e, level: :warn, handled: true, operation: :extract_html) { text: nil, error: e.message } end |
.gem_name ⇒ Object
10 |
# File 'lib/legion/data/extract/handlers/html.rb', line 10 def self.gem_name = 'nokogiri' |
.type ⇒ Object
8 |
# File 'lib/legion/data/extract/handlers/html.rb', line 8 def self.type = :html |