Class: Rsssf::PageConverter

Inherits:

Object

Object
Rsssf::PageConverter

show all

Defined in:: lib/rsssf/convert/errata.rb,
lib/rsssf/convert/convert.rb,
lib/rsssf/convert/html_to_txt.rb,
lib/rsssf/convert/html_entities.rb,
lib/rsssf/convert/html_to_txt/replace_hr.rb,
lib/rsssf/convert/html_to_txt/remove_emails.rb,
lib/rsssf/convert/html_to_txt/replace_a_href.rb,
lib/rsssf/convert/html_to_txt/replace_a_name.rb,
lib/rsssf/convert/html_to_txt/replace_heading.rb,
lib/rsssf/convert/html_to_txt/beautify_anchors.rb

Constant Summary collapse

## Flat list of (character, named html entity) pairs -
##   read two at a time (character first, entity second) e.g. via each_slice(2);
##   frozen - the table is a lookup constant and must not get changed at runtime
ENTITIES =

%w[
À   &Agrave;
Á   &Aacute;
Â   &Acirc;
Ã   &Atilde;
Ä   &Auml;
Å   &Aring;

à   &agrave;
á   &aacute;
â   &acirc;
ã   &atilde;
ä   &auml;
å   &aring;
Æ   &AElig;
æ   &aelig;
ß   &szlig;
Ç   &Ccedil;
ç   &ccedil;
È   &Egrave;
É   &Eacute;
Ê   &Ecirc;
Ë   &Euml;
è   &egrave;
é   &eacute;
ê   &ecirc;
ë   &euml;

ð   &eth;

Ì   &Igrave;
Í   &Iacute;
Î   &Icirc;
Ï   &Iuml;
ì   &igrave;
í   &iacute;
î   &icirc;
ï   &iuml;
Ñ   &Ntilde;
ñ   &ntilde;
Ò   &Ograve;
Ó   &Oacute;
Ô   &Ocirc;
Õ   &Otilde;
Ö   &Ouml;
ò   &ograve;
ó   &oacute;
ô   &ocirc;
õ   &otilde;
ö   &ouml;
Ø   &Oslash;
ø   &oslash;
Ù   &Ugrave;
Ú   &Uacute;
Û   &Ucirc;
Ü   &Uuml;
ù   &ugrave;
ú   &uacute;
û   &ucirc;
ü   &uuml;
Ý   &Yacute;
ý   &yacute;
ÿ   &yuml;

<    &lt;
>    &gt;
&    &amp;
©    &copy;
®    &reg;

].freeze

## Plain text stand-in for a horizontal rule (hr) -
##   a line of =-=-= with two newlines before and after;
##   frozen - shared constant, must not get changed in place
HR_LINE_ASCII =

"\n\n=-=-=-=-=-=-=-=-=-=-=-=-=-=-=\n\n".freeze

## Matches an email address in round brackets e.g. (name@example.com.br)
##   plus any whitespace (incl. newlines) in front of the opening bracket;
##   the part before the @ starts with a letter followed by letters, digits or underscores,
##   the part after the @ is letters-only labels separated by dots;
##   flags: i (ignore case), m (dot matches newline), x (extended - whitespace in pattern ignored)
EMAIL_RE =

%r{ \s*
\(
 [a-z][a-z0-9_]+
   @[a-z]+(\.[a-z]+)+
 \)
}imx

A_HREF_RE = Examples of anchors matched: <A href=“www.rsssf.org/”>Rec.Sport.Soccer Statistics Foundation</A>, <A href="http://www.rsssfbrasil.com">RSSSF Brazil</A> and Daniel Dalence (<A href="mailto:danielballack@terra.com.br">danielballack@terra.com.br</A>); an "empty" anchor without href is matched too, e.g. <a>Primer Descenso – First Relegation</a>

%r{<A 
    (?: 
       \s+ HREF [ ]* = 
          (?<href>[^>]+?)
    )?
  >
    (?<title>.+?)
  <\/A>
}imx

A_NAME_OLD_RE = Note: the content (title) is matched non-greedily so that tags inside the content are matched too.

%r{<A [ ]+ NAME [ ]* =
     (?<name>[^>]+?) 
  >
     (?<title>.+?) 
  </A>
}imx

## Matches the opening tag of a named anchor only e.g. <a name="semi"> -
##   the value after name= (incl. any quotes) is captured as name;
##   no closing </a> is required by this pattern
A_NAME_RE =

%r{<A [ ]+ NAME [ ]* =
     (?<name>[^>]+?) 
  >
}imx

HEADING_RE = Note: one pattern covers h1, h2, h3, h4, h5 and h6; the closing tag uses a backreference (\k<level>, i.e. like \1) to the level of the opening tag. Leading and trailing whitespace (incl. newlines) is part of the match. The content inside is matched non-greedily so that tags inside the content are matched too.

%r{ \s*
  <H(?<level>[1-6])>
    (?<title> .+?)
  </H\k<level>>
  \s*
}imx

BOLD_OR_UNDERLINE_LINE_HEADING_RE = Note: the heading MUST be on a single line (see make_heading for more), e.g. "<h#{tag}>#{text}</h#{tag}>"

%r{^
       [ ]*
  <H (?<tag> [BU]) >
    (?<title> .+?)
  </H \k<tag> >
      [ ]*
       $
}ix

Class Method Summary collapse

.convert(html, url:) ⇒ Object

convenience helper.
.convert_html_entities(html, url: nil) ⇒ Object
.errata_html(html) ⇒ Object
.errata_html_entities(html) ⇒ Object
.log(msg) ⇒ Object

more helpers.

Instance Method Summary collapse

#beautify_anchors(html) ⇒ Object
#convert(html, url:) ⇒ Object

add anchor: options or such lets you toggle adding anchors (§premier etc.) - why? why not?.
#convert_html_entities(html, url: nil) ⇒ Object
#errata_html(html) ⇒ Object
#errata_html_entities(html) ⇒ Object
#errata_txt(txt) ⇒ Object
#html_to_txt(html, url:) ⇒ Object
#log(msg) ⇒ Object
#remove_emails(html) ⇒ Object
#replace_a_href(html) ⇒ Object
#replace_a_name(html) ⇒ Object
#replace_a_name_old(html) ⇒ Object
#replace_heading(html) ⇒ Object
#replace_hr(html) ⇒ Object
#squish(str) ⇒ Object

Class Method Details

.convert(html, url:) ⇒ `Object`

convenience helper

# File 'lib/rsssf/convert/convert.rb', line 6

## Convenience helper - converts html using a "shared" built-in converter.
##
##  The converter is created on first use and kept in a class-level instance variable
##  (NOT a @@class variable - those are shared with all subclasses,
##   thus, a subclass and its parent would end up re-using each other's converter).
##
##  html - page source; passed along as is to #convert
##  url: - passed along as is to #convert
##
##  returns whatever #convert returns
def self.convert( html, url: )
  @converter ||= new   ## use a "shared" built-in converter (one per class)
  @converter.convert( html, url: url )
end

.convert_html_entities(html, url: nil) ⇒ `Object`

# File 'lib/rsssf/convert/html_entities.rb', line 81

## Replaces html entities with their (unicode) characters.
##
##  Order of the passes:
##    1) german umlauts and sharp s
##    2) known misspelt entities (see errata_html_entities)
##    3) all (character, entity) pairs in ENTITIES
##    4) decimal entities e.g. &#233;  (with &#307; mapped to the two letters ij)
##    5) report (puts & log) anything still looking like an entity; the text itself is kept as is
##
##  html - page source
##  url: - optional; only added to the warning message
##
##  returns the converted string
def self.convert_html_entities( html, url: nil )
  ## 1) umlauts & sharp s go first
  [['&auml;',  'ä'],
   ['&ouml;',  'ö'],
   ['&uuml;',  'ü'],
   ['&Auml;',  'Ä'],
   ['&Ouml;',  'Ö'],
   ['&Uuml;',  'Ü'],
   ['&szlig;', 'ß']].each do |entity, char|
    html = html.gsub( entity, char )
  end

  ## 2) typos in entity names
  html = errata_html_entities( html )

  ## 3) named entities from the table (character first, entity second)
  ENTITIES.each_slice(2) { |char, entity| html = html.gsub( entity, char ) }

  ## 4) decimal entities (mapping 1:1 to unicode)
  ##      exception: &#307; as used in Van D&#307;k  =>  Van Dijk
  html = html.gsub( /&#(\d+);/ ) do |entity|
    entity == '&#307;' ? 'ij' : [Regexp.last_match(1).to_i].pack( "U" )
  end

  ## 5) check for more entities (limited to 10 chars between & and ;)
  ##    known false positives get skipped:
  ##      [M&A; moved from pool B] - where M&A is name of club
  ##      in ital03.html:
  ##        [Eugenio Corini 22pen&36pen; Christian Vieri 69]
  ##        Francesco Totti 31, Vincenzo Montella 49&68; Antonio Di Natale 11]
  tolerated = ['&A;', '&36pen;', '&68;']

  html = html.gsub( /&[^; ]{1,10};/ ) do |entity|
    unless tolerated.include?( entity )
      warning = "found unencoded html entity #{entity}"
      warning += " in >#{url}<"   if url

      puts "*** WARN - #{warning}"
      log( warning )   ## log too (see .log)
    end

    entity   ## pass through as is (1:1)
  end

  html
end

.errata_html(html) ⇒ `Object`

# File 'lib/rsssf/convert/errata.rb', line 10

## Auto-fixes known typos / errors in the html source
##   (kind of PRE-processing, see errata_txt for POST-processing).
##
##  html - page source
##
##  returns the fixed-up string
def self.errata_html( html )
   ###  check - rename to errata_pre/post - why? why not?


    ## quick fix - rm </ADDRESS>
    ##   tablesb/braz94.html
    html = html.gsub( '</ADDRESS>', ''  )

   ## quick fix   </a  => </a>
   ##  <a href="#play6">Gold League (Calle 6)</a
   ##  <a href="#zpl">PBZ Premier League 2025/26</a
   ##  <a href="#lig1">Championnat National Ligue 1</a
   ##
   ##  note - the word boundary (\b) after the a is a MUST;
   ##    otherwise other closing tags starting with a get broken too
   ##    e.g. </abbr> => </A>bbr>  or  </address> => </A>ddress>
   html = html.gsub( /<\/A\b
                          (?! [ ]*>)     ## negative lookahead
                      /ix, '</A>' )

   ## quick fix  </br>  => <br>
   html = html.gsub( /<\/BR>/i, '<BR>' )



  ## quick fix - change typo <H1></H2>
  ##  tables/58full.html
  html = html.gsub( '<H1>Quarterfinals</H2>', '<H2>Quarterfinals</H2>' ) 

  ## quick fix - change typo <M>,<N> to <B>
  ##   tables/54full.html
  html = html.gsub( '<M>MEX</B>', '<B>MEX</B>' ) 
  ##   tables/58full.html 
  html = html.gsub( '<N>CZE</B>', '<B>CZE</B>' ) 


  ## quick fix -
  ##   tablesb/braz88.html 
  html = html.gsub( '<</TITLE>', '</TITLE>' )

  
  ## quick fix
  ##   hr (horizontal rule) via img
  ##   in  tables/30full.html and others
  ##   
  ## <IMG SRC="xshadow.gif.pagespeed.ic.AbdeNVcmzw.png" ALT="-----------">
  ##   look  for
  ## <IMG   ALT="---">
  html = html.gsub(  /<IMG
                            [^>]+?
                           ALT="-{3,}"
                         >/ixm, '<HR>' )


  html
end

.errata_html_entities(html) ⇒ `Object`

# File 'lib/rsssf/convert/errata.rb', line 69

## Auto-fixes known typos in html entity names
##   by replacing the misspelt entity with the intended character.
##
##  html - page source
##
##  returns the fixed-up string
def self.errata_html_entities( html )
  typos = {
    "&oulm;"    => 'ö',   ## typo for &ouml;
    "&uml;"     => 'ü',   ## typo for &uuml; - why? why not?
    "&slig;"    => "ß",   ## typo for &szlig;
    "&aaacute;" => "á",   ## typo for &aacute;
    "&nitlde;"  => "ñ",   ## typo for &ntilde;
  }

  ## apply one after the other (in the order listed)
  typos.reduce( html ) { |fixed, (typo, char)| fixed.gsub( typo, char ) }
end

.log(msg) ⇒ `Object`

more helpers

# File 'lib/rsssf/convert/convert.rb', line 58

## Appends msg plus a newline to ./logs.txt (opened in append mode as utf-8).
##     use ./errors.txt - why? why not?
##
##  msg - the text to append
def self.log( msg )
  File.open( './logs.txt', 'a:utf-8' ) { |io| io.write( msg ); io.write( "\n" ) }
end

Instance Method Details

#beautify_anchors(html) ⇒ `Object`

# File 'lib/rsssf/convert/html_to_txt/beautify_anchors.rb', line 5

## Moves converted named anchors e.g. ‹§2fin› to the end of the line they belong to.
##
##  Three passes (each one prints a debug line per replacement):
##    1) anchor followed by a heading  =>  heading and anchor merged into one line
##    2) anchor at the start of a line followed by text  =>  text first, anchor last
##    3) anchor inside a heading, right after the marker  =>  heading text first, anchor last
##
##  html - text with anchors already converted to ‹§name›
##
##  returns the beautified string
def beautify_anchors( html )
  ## make newlines visible for debugging
  show_newlines = ->( str ) { str.gsub( "\n", '$$' ) }

  ## pass 1
  ##  ‹§2fin›
  ##
  ## == Semifinals
  ##       =>
  ##  == Semifinals  ‹§2fin›
  html = html.gsub( /\s*
                          (?<name>‹§
                                    [^›]+?
                                 ›)
                      \s*
                          (?<heading>={2,}
                              [^=\n]+? 
                          )
                       \n
                       \s*/ixm ) do |hit|
           md = Regexp.last_match   ## note: save first; show_newlines runs another gsub

           puts "   mergeing anchor (a name) with heading into one line - >#{show_newlines.( hit )}<" 

           "\n\n#{md[:heading]}  #{md[:name]}\n\n"
    end

  ## pass 2
  ##  ‹§argsquad›Argentine Squad Full Info
  ##  ‹§eng›ENGLAND
  ##       =>
  ##  Argentine Squad Full Info  ‹§argsquad›
  ##  ENGLAND  ‹§eng›
  html = html.gsub( /\n
                          (?<name>‹§
                                    [^›]+?
                                 ›)
                      [ ]*
                          (?<text>[^\n]+? 
                          )
                       \n
                       /ixm ) do |hit|
           md = Regexp.last_match

           puts "   move anchor (a name) starting line with text to end - >#{show_newlines.( hit )}<"

           "\n#{md[:text]}  #{md[:name]}\n"
    end

  ## pass 3
  ##   ==== ‹§gra›Group A
  ##     =>
  ##   ==== Group A  ‹§gra›
  html.gsub( /\n
                          (?<heading_marker>
                               ={2,})
                               [ ]*
                          (?<name>‹§
                                    [^›]+?
                                 ›)
                             [ ]*
                          (?<heading_text>[^\n]+? 
                          )
                       \n
                       /ixm ) do |hit|
           md = Regexp.last_match

           puts "   move anchor (a name) in heading to end - >#{show_newlines.( hit )}<"

           "\n#{md[:heading_marker]} #{md[:heading_text]}  #{md[:name]}\n"
    end
end

#convert(html, url:) ⇒ `Object`

add anchor: options or such

lets you toggle adding anchors (§premier etc.) - why? why not?

# File 'lib/rsssf/convert/convert.rb', line 18

## Converts the html source of a page to text.
##
##  Before handing over to html_to_txt:
##    - windows line endings (\r\n - carriage return + line feed) become \n
##    - tabs become three spaces
##    - html entities get converted (see convert_html_entities)
##    - "smart" quotes become standard single and double quotes e.g.
##          D´Alessandro   =>  D'Alessandro
##          81´ and 88'    =>  81' and 88'
##    - the fancy (unicode) dash – becomes a plain dash/hyphen
##
##  html - page source
##  url: - passed along to convert_html_entities and html_to_txt
##
##  returns whatever html_to_txt returns (as listed here - a [txt, edits] pair)
def convert( html, url: )
  ### todo/fix: first check if html is all ascii-7bit

  normalized = html.gsub( "\r\n", "\n" )
                   .gsub( "\t", '   ' )

  decoded = convert_html_entities( normalized, url: url )

  plain = decoded.gsub( /[´’‘]/, "'" )
                 .gsub( /[“”]/,  '"' )
                 .gsub( '–', '-' )

  html_to_txt( plain, url: url )
end

#convert_html_entities(html, url: nil) ⇒ `Object`

# File 'lib/rsssf/convert/html_entities.rb', line 143

def convert_html_entities( html, url: nil ) self.class.convert_html_entities( html, url: url ); end

#errata_html(html) ⇒ `Object`

# File 'lib/rsssf/convert/errata.rb', line 66

def errata_html( html ) self.class.errata_html( html ); end

#errata_html_entities(html) ⇒ `Object`

# File 'lib/rsssf/convert/errata.rb', line 79

def errata_html_entities( html ) self.class.errata_html_entities( html ); end

#errata_txt(txt) ⇒ `Object`

# File 'lib/rsssf/convert/errata.rb', line 83

## Auto-fixes known typos / errors in the converted text
##   (kind-of POST-processing, see errata_html for PRE-processing).
##
##  txt - converted (plain) text
##
##  returns the fixed-up string
def errata_txt( txt )
  ## 1) squish spaces (to single)                - tables/82full.html
  ## 2) add (missing) closing bracket (])        - tables/70q.html
  txt.gsub( 'Second  phase', 'Second phase' )
     .gsub( /^South America Group 10 \[Brazil$/, 'South America Group 10 [Brazil]' )
end

#html_to_txt(html, url:) ⇒ `Object`

# File 'lib/rsssf/convert/html_to_txt.rb', line 17

## Turns the html source of a page into plain text.
##
##  Steps (in order):
##    - cut off everything before the body (or up to the head) and after the closing body / html
##    - remove the (first) title and meta
##    - apply errata_html and make_heading
##    - remove cite; replace hr, br, a name, a href, p and headings
##    - remove i, u, b and strong (keeping the content); mark pre with html comments
##    - remove emails; beautify anchors
##    - warn about (and log) html tags still left
##    - strip trailing whitespace line-by-line and apply errata_txt
##
##  html - page source
##  url: - only used in the warning / log message about unprocessed html tags
##
##  returns a two-element array [txt, edits] -
##    edits holds what make_heading reports back (make_heading is defined outside of this listing)
def html_to_txt( html, url: )

###
#   todo: check if any tags (still) present??


  ## cut off everything before body
  ##   
  ## note - might incl. attributes e.g.
  ## <body bgcolor="yellow">

  ## record / track (important) edits - sub(stitutions) etc.
  edits = []


  html = html.sub( /.+?
                      <BODY [^>]*? >
                      \s*
                   /xim, 
                   '' )
                  
  ## special case i)   no <body> - cut-off head if present
  ## cut off everything up to (and including) the closing </head>
  ##   used in braz93.html, braz98.html
  html = html.sub( /.+?
                     <\/HEAD>
                       \s*
                   /xim, '' )    

  ## special case ii) no <body>, no </head> 
  ## cut off everything up to (and including) <head/>
  ##   used in braz93.html, braz98.html
  html = html.sub( /.+?
                     <HEAD\/>
                       \s*
                   /xim, '' )    




  ## cut off everything after body (closing)
  html = html.sub( /<\/BODY>.*/im, '' )
  
  ## special case
  ## cut off everything after </html> (closing)
  ##   used in braz93.html, braz98.html
  html = html.sub( /<\/HTML>.*/im, '' )




  ## quick fix
  ## <title>World Cup 1950 qualifications</title>
  ## <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-2">
  
  ## remove title and meta (note: sub - first occurrence only)
  html = html.sub( /<TITLE>.*?<\/TITLE>/i, '' )
  html = html.sub( /<META .*?>/i, '' )


  ## (auto-)fix known typos / errors
  ##  todo - pass in/along filename/url too - why? why not?
  html = errata_html( html )



  ##
  ##   change ^<b><a name ...></a></b>$  to  <hb>  - heading "bold" - might be h5
  ##      or  ^<u><a name ...></a></u>$  to  <hu>  - heading "underscore" - might be h6
  ##   note - make_heading returns the changed html plus a list of edits
  html, more_edits = make_heading( html )
  edits += more_edits 






  ## remove cite (keep the content)
  html = html.gsub( /<CITE>([^<]+)<\/CITE>/im ) do |_|
    puts " remove cite >#{$1}<"
    "#{$1}"
  end


  html = replace_hr( html )




  ## replace break (br)
  ## note: do NOT use m/multiline for now - why? why not??
  html = html.gsub( /<BR>\s*/i ) do |match|    ## note: include (swallow) "extra" newline
    match = match.gsub( "\n", '$$' )  ## make newlines visible for debugging
    puts " replace break (br) - >#{match}<"
    "\n"
  end



  
  html = replace_a_name( html )

  html = replace_a_href( html )

  ## quickfix  remove trailing </a> left possibly by a_name
  html = html.gsub( /<\/A>/i, '' )



  ## replace paragraph (p)
  html = html.gsub( /\s*<P>\s*/im ) do |match|    ## note: include (swallow) "extra" newline
    match = match.gsub( "\n", '$$' )  ## make newlines visible for debugging
    puts " replace paragraph (p) - >#{match}<"
    "\n\n"
  end
  html = html.gsub( /<\/P>/i, '' )  ## replace paragraph (p) closing w/ nothing for now



 
  html = replace_heading( html )


  ## remove i(talics) (keep the content)
  ##    use non-greedy match as default? e.g. .*? - why? why not?
  ##    or greedy w/  ([^<]+)
  html = html.gsub( /<I>(.*?)<\/I>/im ) do |_|
    puts " remove italic (i) >#{$1}<"
    "#{$1}"
  end

  ## remove u(nderline) (keep the content)
  html = html.gsub( /<U>(.*?)<\/U>/im ) do |_|
    puts " remove underline (u) >#{$1}<"
    "#{$1}"
  end

  ## remove b   - note: might include anchors (thus, call after anchors)
  ###   use non-greedy match as default? e.g. .*? - why? why not?
  ## was - "**#{$1}**"
  html = html.gsub( /<B>(.*?)<\/B>/im ) do |_|
    puts " remove bold (b) >#{$1}<"
    "#{$1}"  
  end

  ## <strong></strong>  (keep the content)
  html = html.gsub( /<STRONG>(.*?)<\/STRONG>/im ) do |_|
    puts " remove strong (strong) >#{$1}<"
    "#{$1}"  
  end



  ## replace preformatted (pre)
  html = html.gsub( /<PRE>|<\/PRE>/i ) do |match|
    puts " replace preformatted (pre)"

      ## note - replace preformatted blocks
      ##           with comments 
      ##  was:
      ##  ''  # replace w/ nothing for now (keep surrounding newlines)
      
     if match.downcase == '<pre>'
         '<!-- start pre -->'
     else
         '<!-- end pre -->'
     end
  end


=begin
  puts
  puts
  puts "html:"
  puts html[0..2000]
  puts "-- snip --"
  puts html[-1000..-1]   ## print last thousand chars
=end


  html = remove_emails( html )


  html = beautify_anchors( html )


  ## check for html tags
  ##  left
  ##  use scan instead of gsub - why? why not?
  ##  note - the result of gsub is NOT used (html stays as is);
  ##           gsub is only used to visit every tag
  html.gsub( /<
                \/?
                [A-Z]+ [^>]*
              > 
             /xim ) do |match|

          ## menu and list tags (without attributes) are known / accepted - no warning
          if ['<menu>', '<ul>', '<li>',
               '</menu>', '</ul>', '</li>'].include?(match.downcase) 
               ## do nothing
          else 
                  msg = "found unprocessed html tag #{match} in >#{url}<"
                  puts "*** WARN - #{msg}"
                  log( msg )  ## log too (see log)
          end 
          match
    end


  ##
  ## todo/fix
  ##    move up-front - kind of preprocessing (not post) - why? why not?

  ## cleanup whitespaces
  ##   todo/fix:  convert newline in space first
  ##                and then collapse spaces etc.!!!
  txt = String.new
  html.each_line do |line|
     line = line.gsub( "\t", '  ' ) # replace all tabs w/ two spaces for now
     line = line.rstrip             # remove trailing whitespace (incl. newline/formfeed)

     txt << line
     txt << "\n"
  end

  txt = errata_txt( txt )

  [txt, edits]
end

#log(msg) ⇒ `Object`

# File 'lib/rsssf/convert/convert.rb', line 66

def log( msg ) self.class.log( msg ); end

#remove_emails(html) ⇒ `Object`

# File 'lib/rsssf/convert/html_to_txt/remove_emails.rb', line 13

## Removes emails from the text - in two passes:
##
##   1) converted ("blinded") mailto anchors inside () e.g.
##        (‹mailto›)
##      plus all leading whitespace (incl. newline) - why? why not?
##
##   2) "regular" emails matching EMAIL_RE e.g.
##        Thanks to Marcelo Leme de Arruda (___@___.__.br),
##         Ricardo FF Pontes (___@____.com),
##        Santiago Reis (____@____.com.br)
##      (prints a debug line per email removed)
##
##  html - text with anchors already converted
##
##  returns the string without the emails
def remove_emails( html )
  without_blinded = html.gsub( /\s*
                      \(‹mailto›\)
                     /xm, '' )

  without_blinded.gsub( EMAIL_RE ) do |address|
    puts "removing  email >#{address}<"
    ''
  end
end

#replace_a_href(html) ⇒ `Object`

# File 'lib/rsssf/convert/html_to_txt/replace_a_href.rb', line 29

## Replaces anchors (a href) with a text marker in ‹›.
##
##   no href at all       =>  ‹title›
##   href="mailto:..."    =>  ‹mailto›                  (email gets blanked)
##   href="#name"         =>  ‹title, see §name›        (in-page ref)
##   href="http(s):..."   =>  ‹title›                   (external page ref)
##   title starting with Rec.Sport.Soccer  =>  ‹title›  (custom exclude)
##   anything else        =>  ‹title, see page href›    (with ending .htm|.html stripped)
##
##  note: heading 4 includes anchor (thus, let anchors go first)
##  note: <a \newline href is used for authors email - thus incl. support for newline as space
##
##  html - page source
##
##  returns the string with all anchors replaced
def replace_a_href( html )
  html.gsub( A_HREF_RE ) do
    md     = Regexp.last_match
    target = md['href'] && md['href'].gsub( /["']/, '' ).strip   ## remove ("" or '')
    label  = md['title'].strip

    if target.nil?
      ## report error - <a>hello</a> is useless
      puts " replace anchor w/ missing (!!) href (a) >#{label}<"
      next "‹#{squish(label)}›"
    end

    if target.start_with?( 'mailto:' )
      ## e.g. danielballack@terra.com.br with href mailto:danielballack@terra.com.br
      puts " blank mailto  -  anchor (a) href >#{target}, >#{label}<"
      next '‹mailto›'   ## delete/remove email
    end

    puts " replace anchor (a) href >#{target}, >#{label}<"

    ## convert href to xref
    xref = if target.start_with?( '#' )
             ", see §#{target[1..-1]}"
           elsif target.start_with?( /https?:/ ) ||
                 label.start_with?( 'Rec.Sport.Soccer' )
             ''     ## skip - keep empty - why? why not? (or add url domain?)
           else
             ", see page #{target.sub( /\.html?$/,'')}"
           end

    "‹#{squish(label)}#{xref}›"
  end
end

#replace_a_name(html) ⇒ `Object`

# File 'lib/rsssf/convert/html_to_txt/replace_a_name.rb', line 56

## Replaces (named) anchor opening tags with the marker ‹§name›.
##
##  note - allows <a name=""> without closing </a> e.g.
##    <a name="semi"><H2>Semifinals</H2>
##  only the opening tag gets replaced; everything after it stays as is
##
##  todo - report WARN if title incl. tags
##    add a name inside heading !!!
##    do NOT add heading inside a name !!!
##
##  html - page source
##
##  returns the string with all named anchors replaced
def replace_a_name( html )
  html.gsub( A_NAME_RE ) do |tag|
    anchor = Regexp.last_match[:name].gsub( /["']/, '' ).strip   ## remove ("" or '')

    ## note: newlines made visible ($$) for debugging
    puts " replace anchor (a) name >#{anchor}<    -    >#{tag.gsub( "\n", '$$' )}<"

    "‹§#{anchor}›"
  end
end

#replace_a_name_old(html) ⇒ `Object`

# File 'lib/rsssf/convert/html_to_txt/replace_a_name.rb', line 33

## Replaces (named) anchors incl. content and closing </a>
##   with the content followed by the marker ‹§name› e.g.
##
##     <a name="semi">Semifinals</a>   =>   Semifinals  ‹§semi›
##
##  note - MUST match with A_NAME_OLD_RE (the pattern with the title capture);
##           A_NAME_RE has no title capture - asking for m[:title] raises an IndexError
##
##  todo - report WARN if title incl. tags
##    assumes text only for now - why? why not?
##  add a name inside heading !!!
##  do NOT add heading inside a name !!!
##
##  html - page source
##
##  returns the string with all named anchors replaced
def replace_a_name_old( html )
  html.gsub( A_NAME_OLD_RE ) do |match|   ## note: use .+? non-greedy match
    m = Regexp.last_match   ## note: "save" captures first; last match gets replaced by gsub (next regex call)
    name  = m[:name].gsub( /["']/, '' ).strip   ## remove ("" or '')
    title = m[:title].strip
    match = match.gsub( "\n", '$$' )  ## make newlines visible for debugging
    puts " replace anchor (a) name >#{name}<, >#{title}<    -    >#{match}<"

    "#{title}  ‹§#{name}›"   ## note - use two spaces min (between title & name)
  end
end

#replace_heading(html) ⇒ `Object`

# File 'lib/rsssf/convert/html_to_txt/replace_heading.rb', line 34

## Replaces html headings with text headings marked by equal signs (=).
##
##   pass 1) <h1>..<h6> (see HEADING_RE)  =>  one = per level;
##             always with two newlines before and after
##   pass 2) single-line <hb> / <hu> (see BOLD_OR_UNDERLINE_LINE_HEADING_RE)
##             =>  level 5 for b(old) and level 6 for u(nderline) for now;
##             NO newlines get added before and after
##             (maybe later change to custom  ==_ or ==* or such ???)
##
##  html - page source
##
##  returns the string with all headings replaced;
##    raises ArgumentError if the tag in pass 2 is neither b nor u
def replace_heading( html )
  with_headings = html.gsub( HEADING_RE ) do
    md    = Regexp.last_match
    depth = md[:level].to_i(10)
    text  = md[:title]

    puts " replace heading #{depth} (h#{depth}) >#{text}<"

    "\n\n#{'='*depth} #{text}\n\n"
  end

  with_headings.gsub( BOLD_OR_UNDERLINE_LINE_HEADING_RE ) do
    md   = Regexp.last_match
    kind = md[:tag].downcase
    text = md[:title]

    depth = case kind
            when 'b' then 5
            when 'u' then 6
            else raise ArgumentError, "b(old)|u(underscore) tag expected; got #{kind}"
            end

    puts " replace #{kind}-heading #{depth} (h#{depth}) >#{text}<"

    "#{'='*depth} #{text}"
  end
end

#replace_hr(html) ⇒ `Object`

# File 'lib/rsssf/convert/html_to_txt/replace_hr.rb', line 9

## Replaces horizontal rules (<hr>) incl. all whitespace around with HR_LINE_ASCII.
##   (check what hr to use  - . - . - or =-=-=-= or something distinct?)
##
##  html - page source
##
##  returns the string with all horizontal rules replaced
def replace_hr( html )
  html.gsub( /\s*<HR>\s*/im ) do |rule|
    ## note: newlines made visible ($$) for debugging
    puts " replace horizontal rule (hr) - >#{rule.gsub( "\n", '$$' )}<"
    HR_LINE_ASCII
  end
end

#squish(str) ⇒ `Object`

# File 'lib/rsssf/convert/html_to_txt.rb', line 9

## Squishes every run of white space (space, \r, \t, \n) to one space.
##   note - leading and trailing white space is NOT stripped (gets squished to one space too)
##
##  str - the text to squish
##
##  returns the squished string
def squish( str )
  white_space = /[ \r\t\n]+/
  str.gsub( white_space, ' ' )
end

Class: Rsssf::PageConverter

Constant Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.convert(html, url:) ⇒ Object

.convert_html_entities(html, url: nil) ⇒ Object

.errata_html(html) ⇒ Object

.errata_html_entities(html) ⇒ Object

.log(msg) ⇒ Object

Instance Method Details

#beautify_anchors(html) ⇒ Object

#convert(html, url:) ⇒ Object

#convert_html_entities(html, url: nil) ⇒ Object

#errata_html(html) ⇒ Object

#errata_html_entities(html) ⇒ Object

#errata_txt(txt) ⇒ Object

#html_to_txt(html, url:) ⇒ Object

#log(msg) ⇒ Object

#remove_emails(html) ⇒ Object

#replace_a_href(html) ⇒ Object

#replace_a_name(html) ⇒ Object

#replace_a_name_old(html) ⇒ Object

#replace_heading(html) ⇒ Object

#replace_hr(html) ⇒ Object

#squish(str) ⇒ Object