Class: Html2Text
- Inherits: Object
- Inheritance chain: Object → Html2Text
- Includes:
- BasicLogging, TagMunging
- Defined in:
- lib/html2crtext.rb
Overview
in a PRAWN-generated PDF-file
Constant Summary
Constants included from BasicLogging
BasicLogging::DEBUG, BasicLogging::ERROR, BasicLogging::FATAL, BasicLogging::INFO, BasicLogging::Levels, BasicLogging::UNKNOWN, BasicLogging::WARN
Instance Attribute Summary
Attributes included from BasicLogging
Instance Method Summary collapse
-
#handle(tag = @html) ⇒ Object
TODO: verify children in the node-list.
-
#initialize(html) ⇒ Html2Text
constructor
Returns a new instance of Html2Text.
- #text ⇒ Object
Methods included from TagMunging
#all_body_cells, #all_body_rows, #all_row_cells, #new_tag, #remove_empty_rows, #remove_empty_sub_tags, #right_align_numbers, #tag_by_content
Methods included from BasicLogging
is_muted?, #log, mute, #set_level, #set_target
Constructor Details
#initialize(html) ⇒ Html2Text
Returns a new instance of Html2Text.
29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 |
# File 'lib/html2crtext.rb', line 29
#
# Normalizes the given HTML markup and parses it into a Nokogiri fragment.
#
# The markup is processed on a private copy, so the caller's string is never
# modified and frozen strings are accepted.
#
# @param html [String] the HTML markup to convert
def initialize(html)
  # Remove line-breaks (non-bang gsub returns a new string, leaving the
  # argument untouched).
  markup = html.gsub("\n", ' ')

  # Turn the literal <br> spellings below into newlines.
  # NOTE(review): the whitespace collapse further down matches "\n" as well,
  # so these newlines end up as single spaces again before parsing. Confirm
  # whether that is intended before reordering the steps.
  ['<BR>', '<BR/>', '<br>', '<br/>', '<br />'].each do |line_break|
    markup.gsub!(line_break, "\n")
  end

  # Try to remove html-comments. The lazy quantifier stops at the first
  # "-->"; a greedy one would swallow everything up to the last comment end.
  markup.gsub!(/<!--.*?-->/m, '')

  # Collapse every run of whitespace into one space.
  markup.gsub!(/\s+/, ' ')

  @html = Nokogiri::HTML::fragment(markup)
  define_tag_handlers if @html
  @out_text = ''
end
Instance Method Details
#handle(tag = @html) ⇒ Object
TODO: verify children in the node-list
59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 |
# File 'lib/html2crtext.rb', line 59
#
# Converts a node into text.
#
# When the node responds to +children+, each child is passed to the handler
# registered in @@tag_handlers under the child's lower-cased name, falling
# back to the handler registered under 'any'; the handler results are
# concatenated (nil/false results count as an empty string). Otherwise the
# node's own +to_s+ is used.
#
# TODO: verify children in the node-list
#
# @param tag [#name, #to_s] node to convert; defaults to the parsed fragment
# @return [String] the collected text with every literal "=09" replaced by a
#   space
def handle(tag = @html)
  # Interpolation instead of `literal << name`: appending to a string literal
  # fails once string literals are frozen.
  debug("handle tag #{tag.name}")

  # "=09" is presumably a quoted-printable encoded tab - TODO confirm.
  return tag.to_s.gsub('=09', ' ') unless tag.respond_to?(:children)

  tcontent = +'' # explicitly mutable buffer
  tag.children.each do |child|
    debug("\tchild tag #{child.name}")
    handler = @@tag_handlers[child.name.downcase] || @@tag_handlers['any']
    tcontent << (handler.call(child) || '')
  end
  tcontent.gsub('=09', ' ')
end
#text ⇒ Object
48 49 50 51 52 53 54 55 56 |
# File 'lib/html2crtext.rb', line 48
#
# Converts the parsed HTML to text, stores the result in @out_text and
# returns it. The conversion is re-run on every call.
#
# @return [String] the output of #handle with empty lines removed and runs of
#   tabs or spaces reduced to a single tab or space
def text
  @out_text = handle(@html)
                .gsub(/^$\n/, '') # remove empty lines (whitespace-only lines are kept)
                .squeeze("\t ")   # replace repeated tabs / repeated spaces by one
end