Class: Rsssf::Prep

Inherits:
Object
  • Object
show all
Defined in:
lib/rsssf/prepare/convert.rb,
lib/rsssf/prepare/download.rb,
lib/rsssf/prepare/convert-meta.rb,
lib/rsssf/prepare/convert-links.rb,
lib/rsssf/prepare/convert-navlines.rb,
lib/rsssf/prepare/convert-postproc.rb

Overview

todo: find a better name e.g. BatchPrep or ??

Constant Summary collapse

TITLE_RE =
%r{
    <TITLE>(?<text>.*?)</TITLE>
}ixm
ABOUT_META_RE =
%r{
    ## (i) author(s) info
   \b authors? [ ]* :
    \s+
      (?<author> .+?)    ## note - non-greedy (may incl. newline break!!)
    \s+
    ## (ii) followed by date
    \b last [ ]+ updated [ ]*:
      \s*
      (?<date> \d{1,2} [ ]+              ## day
                [a-z]{3,10} [ ]+         ## month
                \d{4} \b)                ## year
}ixm
%r{  ‹(?<title> [^›]+?)
       , [ ] see [ ] page [ ]
      (?<pageref> [^›]+?)
    ›
}ix
START_WITH_ABOUT_RE =

note - start_with anchored w/ \A to start of string

%r{  \A
    [ \n]*   ## trailing spaces or blank lines
    ={2,} [ ]* About [ ]+ this [ ]+ document
     .*?
}ix
START_WITH_CUSTOM_RE =

remove “custom” sections by title

e.g.   === Index of groups
%r{  \A
    [ \n]*   ## trailing spaces or blank lines
    ={2,}
        [ ]*
          (?<title>
             Index [ ] of [ ] groups
           )
       [ ]*
     $
}ix
START_WITH_NAV_RE =

todo - fix

remove all menu, ul,li, tags etc.
 before nav check
 see https://rsssf.github.io/tables/2014q.html
    as an example!!!
%r{  \A
   [ \n]*    ## trailing spaces or blank lines
   ‹.+?›    ##  link  (exclude named anchor - why? why not? §)
}ix

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.convert_pages(pages, outdir:) ⇒ Object

convenience helper



7
8
9
10
# File 'lib/rsssf/prepare/convert.rb', line 7

## Convenience class-level helper - lazily creates one "shared"
##  prep instance and delegates the conversion to it.
def self.convert_pages( pages, outdir: )
   (@@prep ||= new).convert_pages( pages, outdir: outdir )
end

.download_pages(pages, force:) ⇒ Object

convenience helper



7
8
9
10
# File 'lib/rsssf/prepare/download.rb', line 7

## Convenience class-level helper - lazily creates one "shared"
##  prep instance and delegates the download to it.
def self.download_pages( pages, force: )
   (@@prep ||= new).download_pages( pages, force: force )
end

Instance Method Details



63
64
65
66
67
68
69
70
71
72
73
# File 'lib/rsssf/prepare/convert-links.rb', line 63

## Scans txt for page links via LINK_APAGE_RE and returns the matches
## with every pageref (second capture) expanded via expand_pageref.
##
##  note - basename is (currently) unused here; kept for interface symmetry
##         with the other collect/proc helpers - TODO confirm
def collect_links( txt, basename:, dirname: )

  links = txt.scan( LINK_APAGE_RE )

  ## expand pagerefs in place
  ##  (was a `map` whose return value got thrown away and only
  ##   "worked" via in-place mutation of the sub-arrays - use each)
  links.each do |link|
    link[1] = expand_pageref( link[1], dirname: dirname )
  end

  links
end

#convert_pages(pages, outdir:) ⇒ Object



13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# File 'lib/rsssf/prepare/convert.rb', line 13

## Converts every page config (hash with 'page' key) from (cached) html
##  to plain txt and writes out up to four files per page under outdir:
##    <dirname>/<basename>.txt        - converted text w/ comment header
##    <dirname>/<basename>.edits.txt  - recorded edits (only if any)
##    <dirname>/<basename>.links.txt  - collected links (only if any)
##    <dirname>/<basename>.about.txt  - removed about section (only if any)
##
##  note - reads html via Webcache.read; presumably the pages got
##         downloaded/cached before (see download_pages) - TODO confirm
def convert_pages( pages, outdir: )
  pages.each_with_index do |config,i|
    puts
    puts "==> [#{i+1}/#{pages.size}] converting #{config.pretty_inspect}..."

    page     = config['page']
    url      = "https://rsssf.org/#{page}"

    ## read the (cached) html page - no (re)download here
    html     = Webcache.read( url )


    ## record all edits (erratas) applied to this page
    edits = []

    txt, more_edits = PageConverter.convert( html, url: url )
    edits += more_edits


    ## e.g. page "tables/oost74.html" => basename "oost74", dirname "tables"
    basename = File.basename( page, File.extname( page ))
    dirname  = File.dirname( page )


    ##
    ##  post-process .txt page
    ##   (removes nav/custom/about sections; collects links & about text)

    txt, more_edits, links, about = postproc_page( txt, basename: basename,
                                                        dirname: dirname )
    edits += more_edits



    ## page title from the html <TITLE> tag (fallback 'n/a')
    title  =  find_title( html ) || 'n/a'

    ## authors & last-updated date parsed from the about section (if any)
    authors, updated = about ? find_author_n_date( about ) : [nil,nil]

 header_props = <<EOS
     title:   #{title}
     source:  #{url}
EOS

   if authors && updated
      ##  assume plural (authors:) if "and" or comma (,) present
      header_props +=  if /\band\b|,/i.match( authors )
                         "     authors: #{authors}\n"
                       else
                         "     author:  #{authors}\n"
                       end
      header_props +=    "     updated: #{updated}"
   end


  ## wrap the props in an html comment block
  header = <<EOS
  <!--
#{header_props}
    -->
EOS


     ## note - (auto-) add (comment) header to written out txt!!!
     write_text( "#{outdir}/#{dirname}/#{basename}.txt", header+txt )

     ## todo/check - delete edits file if no edits - why? why not?
     if edits.size > 0
        write_text( "#{outdir}/#{dirname}/#{basename}.edits.txt", edits.join("\n") )
     end

     ## todo/check - delete links file if no links - why? why not?
     if links.size > 0
         buf = links.map do |link|
                              title   = link[0]
                              pageref = link[1]
                             "#{'%-30s' % pageref}  :  #{title}"
                        end.join( "\n")

         write_text( "#{outdir}/#{dirname}/#{basename}.links.txt", buf )
     end

     ## todo/check - delete about file if no about - why? why not?
     if about
        write_text( "#{outdir}/#{dirname}/#{basename}.about.txt", about )
     end

  end
end

#download_pages(pages, force:) ⇒ Object



13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# File 'lib/rsssf/prepare/download.rb', line 13

## Downloads (and caches) every page listed in pages - an array of
##  config hashes with 'page' and optional 'encoding' columns.
## Pass force: true to always (re)download even on a cache hit.
def download_pages( pages, force: )
  pages.each_with_index do |config,i|

## todo / double check fix read_csv upstream
##    if   empty column has comment it is "" empty string otherwise
##                it is nil!!!  ??

    encoding = config['encoding']
    encoding = 'windows-1252'   if encoding.nil? || encoding.empty?

    page     = config['page']
    url      = "https://rsssf.org/#{page}"


## check if not in cache
##   note - use force: true  to always (force) download
    ## was `force == false` - avoid explicit boolean comparison;
    ##  nil force now also counts as "not forced"
    if Webcache.cached?( url ) && !force
        puts "   CACHE HIT - #{url}"
    else
        puts "==> [#{i+1}/#{pages.size}] download #{config.pretty_inspect}..."
        ## note - return value (html) was assigned to an unused local; dropped
        Rsssf.download_page( url, encoding: encoding )
    end
  end
end

#expand_pageref(pageref, dirname:) ⇒ Object

["1973/74", "oost74"],

["1975/76", "oost76"],
["list of final tables", "oosthist"],
["list of champions", "oostchamp"],
["list of cup finals", "oostcuphist"],
["list of super cup finals", "oostsupcuphist"],
["list of foundation dates", "oostfound"]]


28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# File 'lib/rsssf/prepare/convert-links.rb', line 28

## Expands a pageref (as found in a ‹…, see page …› link) into a
##  path relative to the site root.
##
##  - "oost74"               => "<dirname>/oost74"
##  - "../tablesw/worldcup"  => "tablesw/worldcup"
##  - anything else (./, /, http(s):, …) raises ArgumentError
##
## Pre-processing (fix upstream - why? why not?):
##  drops the .html extension and turns the (first) anchor marker
##  '#' into '§', e.g. 2023uefanl.html#lga => 2023uefanl§lga
##  (check - only really one # allowed in url path???)
def expand_pageref( pageref, dirname: )
  pageref = pageref.sub( %r{ \.html\b }ix, '' ).sub( '#', '§' )

  case pageref
  when /\A[a-z0-9][a-z0-9§-]*\z/
    ## assume relative page in "local" dir
    ##  note - was /^…$/; use \A…\z so the whole string must match
    ##         (^/$ only anchor per-line)
    "#{dirname}/#{pageref}"
  when %r{\A\.\./}
    ## e.g. ../tablesw/worldcup
    pageref.sub( "../", '' )
  when %r{\A\./}
    raise ArgumentError, "found (unsupported) ./ pageref >#{pageref}<"
  when %r{\A/}
    raise ArgumentError, "found (unsupported) / pageref >#{pageref}<"
  when /\Ahttps?:/i
    raise ArgumentError, "found (unsupported) https?: pageref >#{pageref}<"
  else
    raise ArgumentError, "found (unsupported) pageref >#{pageref}<"
  end
end

#find_author_n_date(txt) ⇒ Object

change name to authors_n_updated or such - why? why not?



59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# File 'lib/rsssf/prepare/convert-meta.rb', line 59

## Parses the "about" section text for author(s) and last-updated date
##  (via ABOUT_META_RE) and returns [authors, updated] with squished
##  whitespace - or [nil, nil] if the pattern is absent.
##
##  todo/check - report error or raise exception instead of [nil,nil]??
def find_author_n_date( txt )
  m = ABOUT_META_RE.match( txt )
  return [nil, nil]  unless m

  ## squish-style whitespace cleanup (strip + collapse runs to one space)
  squish = ->(str) { str.strip.gsub( /\s+/, ' ' ) }

  authors = squish.call( m[:author] )
  ## prettify commas - always single space after comma (no space before)
  authors = authors.gsub( /[ ]*,[ ]*/, ', ' )

  updated = squish.call( m[:date] )

  [authors, updated]
end

#find_title(html) ⇒ Object



21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# File 'lib/rsssf/prepare/convert-meta.rb', line 21

## Extracts the page title from the html <TITLE> tag; returns the
##  cleaned-up text or nil when no title tag is present.
def find_title( html )
  m = TITLE_RE.match( html )
  return nil  unless m

  text = m[:text].strip

  ## note - convert html entities
  ##  e.g. Brazil 2000 - Copa Jo&atilde;o Havelange
  text = PageConverter.convert_html_entities( text )

  ##  add autofix known typos/erratas here!!!
  ## note - title quick typo fix (in brazil) remove stray <
  ##   e.g. <TITLE>Brazil 1988<</TITLE>
  text.delete( '<' )
end

#postproc_page(txt, basename:, dirname:) ⇒ Object



51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# File 'lib/rsssf/prepare/convert-postproc.rb', line 51

## Post-processes a converted txt page:
##   1) splits on horizontal rules and drops nav, custom-title and
##      about sections (collecting links along the way)
##   2) removes <!-- start/end pre --> comments
##   3) strips leading/trailing nav(igation) lines per section
##
## Returns [txt, edits, links, about]:
##   txt   - the cleaned-up page text
##   edits - recorded edit descriptions (empty array if none)
##   links - collected [title, pageref] pairs
##   about - removed about-section text, or nil if none found
def postproc_page( txt, basename:, dirname: )

  ### record edits in its own txt file
  edits = []
  links = []
  about = nil


###
##  step 1
##   split by horizontal rules (hrs)
##       and remove navigations sections
##             starting with links e.g.
## ‹Bundesliga, see §bund›

   sects = txt.split( /^=-=-=-=-=-=-=-=-=-=-=-=-=-=-=$/ )




   ## keep only sections that are neither nav, custom-title nor about
   sects = sects.select do |sect|
             if START_WITH_NAV_RE.match?( sect )
                links += collect_links( sect, basename: basename,
                                              dirname: dirname )

              edit = String.new
               edit += "-- removing nav(igation) section:"
               edit += sect

               puts edit

               edits << edit   ## record edit

               false           ## remove section
             elsif m=START_WITH_CUSTOM_RE.match( sect )
                links += collect_links( sect, basename: basename,
                                              dirname: dirname )

                edit = String.new
                edit += "-- removing custom section with title >#{m[:title]}<:"
                edit += sect

                puts edit

                edits << edit   ## record edit

                false           ## remove section

             elsif START_WITH_ABOUT_RE.match?( sect )
                ## note - do NOT collect links in about section!!!

               about = sect
               false           ## remove (about) section
             else
                links += collect_links( sect, basename: basename,
                                              dirname: dirname )
               true            ## keep section
             end
           end

   ## sects.each_with_index do |sect,i|
   ##  puts "==> #{i+1}/#{sects.size}"
   ##  pp sect
   ## end
   ##  puts "  #{sects.size} sect(s)"


   ## note - replace hr with blank line
   txt = sects.join( "\n\n" )


   ###
   ## remove pre comments
   txt = txt.gsub( "<!-- start pre -->\n", '' )
   txt = txt.gsub( "<!-- end pre -->\n", '' )



    ## try to remove leading and trailing nav(igation) lines
    txt, more_edits = proc_navlines_by_sections( txt )
    edits += more_edits

   ## note - return (new) txt AND recorded edits (& erratas)
   ##        return edits as array or joined (single) string - why? why not?
   ##   note - return empty array if no edits!!
   [txt, edits, links, about]
end

#proc_navlines_by_sections(txt) ⇒ Object



75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# File 'lib/rsssf/prepare/convert-navlines.rb', line 75

## Removes leftover nav html elements (<MENU>/<UL>/<LI> tags) from txt,
##  then splits the text into sections on ==-style headings and strips
##  leading nav link lines per section via strip_navlines (plus trailing
##  nav lines in the very last section, by processing it reversed).
##
## Returns [new_txt, edits] - edits is an array of edit descriptions
##  (empty if nothing got removed).
def proc_navlines_by_sections( txt )

   edits = []

   ###
   ##  remove  remaining nav html elements
   ##  <MENU></MENU>
   ##   <UL></UL>
   ##   <LI></LI>

      tags = []
     txt = txt.gsub( %r{   <MENU> | </MENU>
                         | <UL>   | </UL>
                         | <LI>   | </LI>
                       }ix ) do |match|
         tags << match
           ''
     end

     if tags.size > 0
                edit = String.new
                edit += "-- removed #{tags.size} remaining nav html element(s):\n"
                edit += tags.join( ' ')

                puts edit

                edits << edit   ## record edit
      end


    ## split on lines that look like a section heading (==  Title)
    ##  note - lookahead keeps the heading line with its section
    sections = txt.split( %r{^
                               (?= [ ]* ={2,} [ ]*
                                    [\p{L}0-9]  ## one letter or digit required
                               )
                            }ix
                        )


     newsections = []
     sections.each_with_index do |sect,sectno|
          newlines, navlines = strip_navlines( sect.lines, heading: true )

          if navlines.size > 0
             edit = String.new
             edit += "-- removing #{navlines.size} leading nav line(s) in section #{sectno+1}:\n"
             edit += navlines.join
             puts edit

             edits << edit
          end


          ## special check for last section
          if sectno+1 == sections.size
              ## reverse lines
              ##  and remove trailing navlines until hitting body
              ##   note - set heading to false
              newlines, navlines = strip_navlines( newlines.reverse, heading: false )
              newlines = newlines.reverse
              navlines = navlines.reverse

              if navlines.size > 0
                edit = String.new
                edit += "-- removing #{navlines.size} trailing nav line(s) in last section #{sectno+1}:\n"
                edit += navlines.join
                puts edit

                edits << edit
              end
          end

          newsections << newlines.join
     end # each section

    [newsections.join, edits]
end

#strip_navlines(lines, heading: true) ⇒ Object



28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# File 'lib/rsssf/prepare/convert-navlines.rb', line 28

## Splits an array of lines (e.g. txt.lines!!!) into kept lines and
##  removed leading nav(igation) lines.
##
## A nav line is one that - after deleting all ‹…› links - contains
##  nothing but spaces, pipes (|) and dots (.). Once a real body line
##  is hit, every following line is kept as-is.
##
## heading: true treats the very first line as a heading (kept) when
##  it starts with '==' (only optional for first section).
##
## Returns [kept_lines, removed_nav_lines].
def strip_navlines( lines, heading: true )
  kept    = []
  removed = []
  in_body = false   ## hit/seen body yet?

  lines.each_with_index do |line, idx|
    ## optional leading heading line is always kept
    if heading && idx.zero? && line.lstrip.start_with?( '==' )
      kept << line
      next
    end

    ## once in the body - keep everything; blank lines are kept too
    if in_body || (!in_body && line.strip.empty?)
      kept << line
      next
    end

    ## still before the body - drop ‹…› links and inspect the leftover
    leftover = line.strip.gsub( /‹.+?›/, '' )
    if leftover.match?( /\A[ |.]*\z/i )
      ## only separators left over => pure nav line; eat up & record
      removed << line
    else
      in_body = true   ## first real content line - body starts here
      kept << line
    end
  end

  [kept, removed]
end