Class: Html2Text

Inherits:
Object
  • Object
show all
Includes:
BasicLogging, TagMunging
Defined in:
lib/html2crtext.rb

Overview

Converts HTML into plain text for use in a PRAWN-generated PDF-file

Constant Summary

Constants included from BasicLogging

BasicLogging::DEBUG, BasicLogging::ERROR, BasicLogging::FATAL, BasicLogging::INFO, BasicLogging::Levels, BasicLogging::UNKNOWN, BasicLogging::WARN

Instance Attribute Summary

Attributes included from BasicLogging

#log_level, #target

Instance Method Summary collapse

Methods included from TagMunging

#all_body_cells, #all_body_rows, #all_row_cells, #new_tag, #remove_empty_rows, #remove_empty_sub_tags, #right_align_numbers, #tag_by_content

Methods included from BasicLogging

is_muted?, #log, mute, #set_level, #set_target

Constructor Details

#initialize(html) ⇒ Html2Text

Returns a new instance of Html2Text.



29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# File 'lib/html2crtext.rb', line 29

# Normalises the raw markup and parses it into a Nokogiri fragment.
#
# The argument is never modified: every substitution below is
# non-destructive, so frozen strings and strings the caller keeps using
# are safe.
#
# @param html [#to_s] the HTML source to convert (nil is treated as '')
def initialize(html)
  # Strip HTML comments first. The lazy `.*?` stops at the first `-->`
  # (a greedy match would swallow everything up to the last one) and /m
  # lets `.` cross line breaks so multi-line comments go as well.
  markup = html.to_s.gsub(/<!--.*?-->/m, '')
  # Line breaks and other whitespace in the source carry no meaning:
  # collapse every run (newlines included) to a single blank.
  markup = markup.gsub(/\s+/, ' ')
  # Only now turn <br> into a real newline. This has to happen after the
  # whitespace collapse above, which would otherwise turn the newline
  # straight back into a blank. The pattern accepts any letter case and
  # the <br>, <br/> and <br /> spellings.
  markup = markup.gsub(%r{<br\s*/?>}i, "\n")
  @html = Nokogiri::HTML::fragment(markup)
  define_tag_handlers if @html
  @out_text = ''
end

Instance Method Details

#handle(tag = @html) ⇒ Object

TODO: verify children in the node-list



59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# File 'lib/html2crtext.rb', line 59

# Runs every child of +tag+ through its registered tag handler and returns
# the concatenated results.
#
# TODO: verify children in the node-list
#
# @param tag the node to convert; defaults to the fragment held in @html
# @return [String] the handler output of all children, or +tag.to_s+ when
#   +tag+ does not respond to +children+, with every "=09" replaced by a
#   blank
def handle(tag = @html)
  debug('handle tag ' + tag.name)
  rendered =
    if tag.respond_to?(:children)
      tag.children.each_with_object(''.dup) do |child, buffer|
        debug("\tchild tag " + child.name)
        # Handlers are looked up by lower-cased tag name; names without a
        # handler of their own go to the catch-all 'any' entry.
        handler = @@tag_handlers[child.name.downcase] || @@tag_handlers['any']
        piece = handler.call(child)
        # A handler may return nil/false; that contributes nothing.
        buffer << (piece || '')
      end
    else
      tag.to_s
    end
  # NOTE(review): "=09" looks like a quoted-printable encoded tab - confirm.
  rendered.gsub("=09", " ")
end

#text ⇒ Object



48
49
50
51
52
53
54
55
56
# File 'lib/html2crtext.rb', line 48

# Converts the parsed HTML held in @html to text and tidies the result.
#
# @return [String] the converted text, which is also kept in @out_text
def text
  converted = handle(@html)
  # Drop lines that contain nothing at all.
  without_blank_lines = converted.gsub(/^$\n/, '')
  # Reduce every run of tabs to one tab, then every run of blanks to one
  # blank.
  @out_text = without_blank_lines.gsub(/\t+/, "\t").gsub(/\ +/, ' ')
end