Class: Rsssf::Prep

Inherits:
Object
  • Object
show all
Defined in:
lib/rsssf/prepare/convert.rb,
lib/rsssf/prepare/download.rb,
lib/rsssf/prepare/convert-meta.rb,
lib/rsssf/prepare/convert-links.rb,
lib/rsssf/prepare/convert-navlines.rb,
lib/rsssf/prepare/convert-postproc.rb

Overview

todo: find a better name e.g. BatchPrep or ??

Constant Summary collapse

TITLE_RE =
%r{
    <TITLE>(?<text>.*?)</TITLE>
}ixm
ABOUT_META_RE =
%r{
    ## (i) author(s) info
   \b authors? [ ]* :
    \s+
      (?<author> .+?)    ## note - non-greedy (may incl. newline break!!)
    \s+
    ## (ii) followed by date
    \b last [ ]+ updated [ ]*:
      \s*
      (?<date> \d{1,2} [ ]+              ## day
                [a-z]{3,10} [ ]+         ## month
                \d{4} \b)                ## year
}ixm
%r{  ‹(?<title> [^›]+?)
       , [ ] see [ ] page [ ]
      (?<pageref> [^›]+?)
    ›
}ix
START_WITH_ABOUT_RE =

note - start_with anchored w/ \A to start of string

%r{  \A
    [ \n]*   ## trailing spaces or blank lines
    ={2,} [ ]* About [ ]+ this [ ]+ document
     .*?
}ix
START_WITH_CUSTOM_RE =

remove “custom” sections by title

e.g.   === Index of groups
%r{  \A
    [ \n]*   ## trailing spaces or blank lines
    ={2,}
        [ ]*
          (?<title>
             Index [ ] of [ ] groups
           )
       [ ]*
     $
}ix
START_WITH_NAV_RE =

todo - fix

remove all menu, ul,li, tags etc.
 before nav check
 see https://rsssf.github.io/tables/2014q.html
    as an example!!!
%r{  \A
   [ \n]*    ## trailing spaces or blank lines
   ‹.+?›    ##  link  (exclude named anchor - why? why not? §)
}ix

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.convert_pages(pages, outdir:) ⇒ Object

convenience helper



7
8
9
10
# File 'lib/rsssf/prepare/convert.rb', line 7

## Convenience class-level helper - lazily creates one "shared"
##  prep instance and delegates the conversion to it.
def self.convert_pages( pages, outdir: )
   (@@prep ||= new).convert_pages( pages, outdir: outdir )
end

.download_pages(pages, force:) ⇒ Object

convenience helper



7
8
9
10
# File 'lib/rsssf/prepare/download.rb', line 7

## Convenience class-level helper - lazily creates one "shared"
##  prep instance and delegates the download to it.
def self.download_pages( pages, force: )
   (@@prep ||= new).download_pages( pages, force: force )
end

Instance Method Details



63
64
65
66
67
68
69
70
71
72
73
# File 'lib/rsssf/prepare/convert-links.rb', line 63

## Scans txt for page links via LINK_APAGE_RE and returns the matches
## with every pageref (second capture) expanded via expand_pageref.
##
##  note - basename is (currently) unused here; kept for interface symmetry
##         with the other collect/proc helpers - TODO confirm
def collect_links( txt, basename:, dirname: )

  links = txt.scan( LINK_APAGE_RE )

  ## expand pagerefs in place
  ##  (was a `map` whose return value got thrown away and only
  ##   "worked" via in-place mutation of the sub-arrays - use each)
  links.each do |link|
    link[1] = expand_pageref( link[1], dirname: dirname )
  end

  links
end

#convert_pages(pages, outdir:) ⇒ Object



13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# File 'lib/rsssf/prepare/convert.rb', line 13

## Converts every page config (hash with 'page' key) from (cached) html
##  to plain txt and writes out up to four files per page under outdir:
##    <dirname>/<basename>.txt        - converted text w/ comment header
##    <dirname>/<basename>.edits.txt  - recorded edits (only if any)
##    <dirname>/<basename>.links.txt  - collected links (only if any)
##    <dirname>/<basename>.about.txt  - removed about section (only if any)
##
##  note - reads html via Webcache.read; presumably the pages got
##         downloaded/cached before (see download_pages) - TODO confirm
def convert_pages( pages, outdir: )
  pages.each_with_index do |config,i|
    puts
    puts "==> [#{i+1}/#{pages.size}] converting #{config.pretty_inspect}..."

    page     = config['page']
    url      = "https://rsssf.org/#{page}"

    ## read the (cached) html page - no (re)download here
    html     = Webcache.read( url )


    ## record all edits (erratas) applied to this page
    edits = []

    txt, more_edits = PageConverter.convert( html, url: url )
    edits += more_edits


    ## e.g. page "tables/oost74.html" => basename "oost74", dirname "tables"
    basename = File.basename( page, File.extname( page ))
    dirname  = File.dirname( page )


    ##
    ##  post-process .txt page
    ##   (removes nav/custom/about sections; collects links & about text)

    txt, more_edits, links, about = postproc_page( txt, basename: basename,
                                                        dirname: dirname )
    edits += more_edits



    ## page title from the html <TITLE> tag (fallback 'n/a')
    title  =  find_title( html ) || 'n/a'

    ## authors & last-updated date parsed from the about section (if any)
    authors, updated = about ? find_author_n_date( about ) : [nil,nil]

 header_props = <<EOS
     title:   #{title}
     source:  #{url}
EOS

   if authors && updated
      ##  assume plural (authors:) if "and" or comma (,) present
      header_props +=  if /\band\b|,/i.match( authors )
                         "     authors: #{authors}\n"
                       else
                         "     author:  #{authors}\n"
                       end
      header_props +=    "     updated: #{updated}"
   end


  ## wrap the props in an html comment block
  header = <<EOS
  <!--
#{header_props}
    -->
EOS


     ## note - (auto-) add (comment) header to written out txt!!!
     write_text( "#{outdir}/#{dirname}/#{basename}.txt", header+txt )

     ## todo/check - delete edits file if no edits - why? why not?
     if edits.size > 0
        write_text( "#{outdir}/#{dirname}/#{basename}.edits.txt", edits.join("\n") )
     end

     ## todo/check - delete links file if no links - why? why not?
     if links.size > 0
         buf = links.map do |link|
                              title   = link[0]
                              pageref = link[1]
                             "#{'%-30s' % pageref}  :  #{title}"
                        end.join( "\n")

         write_text( "#{outdir}/#{dirname}/#{basename}.links.txt", buf )
     end

     ## todo/check - delete about file if no about - why? why not?
     if about
        write_text( "#{outdir}/#{dirname}/#{basename}.about.txt", about )
     end

  end
end

#download_pages(pages, force:) ⇒ Object



13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# File 'lib/rsssf/prepare/download.rb', line 13

## Downloads (and caches) every page listed in pages - an array of
##  config hashes with 'page' and optional 'encoding' columns.
## Pass force: true to always (re)download even on a cache hit.
def download_pages( pages, force: )
  pages.each_with_index do |config,i|

## todo / double check fix read_csv upstream
##    if   empty column has comment it is "" empty string otherwise
##                it is nil!!!  ??

    encoding = config['encoding']
    encoding = 'windows-1252'   if encoding.nil? || encoding.empty?

    page     = config['page']
    url      = "https://rsssf.org/#{page}"


## check if not in cache
##   note - use force: true  to always (force) download
    ## was `force == false` - avoid explicit boolean comparison;
    ##  nil force now also counts as "not forced"
    if Webcache.cached?( url ) && !force
        puts "   CACHE HIT - #{url}"
    else
        puts "==> [#{i+1}/#{pages.size}] download #{config.pretty_inspect}..."
        ## note - return value (html) was assigned to an unused local; dropped
        Rsssf.download_page( url, encoding: encoding )
    end
  end
end

#expand_pageref(pageref, dirname:) ⇒ Object

["1973/74", "oost74"],

["1975/76", "oost76"],
["list of final tables", "oosthist"],
["list of champions", "oostchamp"],
["list of cup finals", "oostcuphist"],
["list of super cup finals", "oostsupcuphist"],
["list of foundation dates", "oostfound"]]


28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# File 'lib/rsssf/prepare/convert-links.rb', line 28

## Expands a pageref (as found in a ‹…, see page …› link) into a
##  path relative to the site root.
##
##  - "oost74"               => "<dirname>/oost74"
##  - "../tablesw/worldcup"  => "tablesw/worldcup"
##  - anything else (./, /, http(s):, …) raises ArgumentError
##
## Pre-processing (fix upstream - why? why not?):
##  drops the .html extension and turns the (first) anchor marker
##  '#' into '§', e.g. 2023uefanl.html#lga => 2023uefanl§lga
##  (check - only really one # allowed in url path???)
def expand_pageref( pageref, dirname: )
  pageref = pageref.sub( %r{ \.html\b }ix, '' ).sub( '#', '§' )

  case pageref
  when /\A[a-z0-9][a-z0-9§-]*\z/
    ## assume relative page in "local" dir
    ##  note - was /^…$/; use \A…\z so the whole string must match
    ##         (^/$ only anchor per-line)
    "#{dirname}/#{pageref}"
  when %r{\A\.\./}
    ## e.g. ../tablesw/worldcup
    pageref.sub( "../", '' )
  when %r{\A\./}
    raise ArgumentError, "found (unsupported) ./ pageref >#{pageref}<"
  when %r{\A/}
    raise ArgumentError, "found (unsupported) / pageref >#{pageref}<"
  when /\Ahttps?:/i
    raise ArgumentError, "found (unsupported) https?: pageref >#{pageref}<"
  else
    raise ArgumentError, "found (unsupported) pageref >#{pageref}<"
  end
end

#find_author_n_date(txt) ⇒ Object

change name to authors_n_updated or such - why? why not?



59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# File 'lib/rsssf/prepare/convert-meta.rb', line 59

## Parses the "about" section text for author(s) and last-updated date
##  (via ABOUT_META_RE) and returns [authors, updated] with squished
##  whitespace - or [nil, nil] if the pattern is absent.
##
##  todo/check - report error or raise exception instead of [nil,nil]??
def find_author_n_date( txt )
  m = ABOUT_META_RE.match( txt )
  return [nil, nil]  unless m

  ## squish-style whitespace cleanup (strip + collapse runs to one space)
  squish = ->(str) { str.strip.gsub( /\s+/, ' ' ) }

  authors = squish.call( m[:author] )
  ## prettify commas - always single space after comma (no space before)
  authors = authors.gsub( /[ ]*,[ ]*/, ', ' )

  updated = squish.call( m[:date] )

  [authors, updated]
end

#find_title(html) ⇒ Object



21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# File 'lib/rsssf/prepare/convert-meta.rb', line 21

## Extracts the page title from the html <TITLE> tag; returns the
##  cleaned-up text or nil when no title tag is present.
def find_title( html )
  m = TITLE_RE.match( html )
  return nil  unless m

  text = m[:text].strip

  ## note - convert html entities
  ##  e.g. Brazil 2000 - Copa Jo&atilde;o Havelange
  text = PageConverter.convert_html_entities( text )

  ##  add autofix known typos/erratas here!!!
  ## note - title quick typo fix (in brazil) remove stray <
  ##   e.g. <TITLE>Brazil 1988<</TITLE>
  text.delete( '<' )
end

#postproc_page(txt, basename:, dirname:) ⇒ Object



51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# File 'lib/rsssf/prepare/convert-postproc.rb', line 51

## Post-processes a converted txt page:
##   1) splits on horizontal rules and drops nav, custom-title and
##      about sections (collecting links along the way)
##   2) removes <!-- start/end pre --> comments
##   3) strips leading/trailing nav(igation) lines per section
##
## Returns [txt, edits, links, about]:
##   txt   - the cleaned-up page text
##   edits - recorded edit descriptions (empty array if none)
##   links - collected [title, pageref] pairs
##   about - removed about-section text, or nil if none found
def postproc_page( txt, basename:, dirname: )

  ### record edits in its own txt file
  edits = []
  links = []
  about = nil


###
##  step 1
##   split by horizontal rules (hrs)
##       and remove navigations sections
##             starting with links e.g.
## ‹Bundesliga, see §bund›

   sects = txt.split( /^=-=-=-=-=-=-=-=-=-=-=-=-=-=-=$/ )




   ## keep only sections that are neither nav, custom-title nor about
   sects = sects.select do |sect|
             if START_WITH_NAV_RE.match?( sect )
                links += collect_links( sect, basename: basename,
                                              dirname: dirname )

              edit = String.new
               edit += "-- removing nav(igation) section:"
               edit += sect

               puts edit

               edits << edit   ## record edit

               false           ## remove section
             elsif m=START_WITH_CUSTOM_RE.match( sect )
                links += collect_links( sect, basename: basename,
                                              dirname: dirname )

                edit = String.new
                edit += "-- removing custom section with title >#{m[:title]}<:"
                edit += sect

                puts edit

                edits << edit   ## record edit

                false           ## remove section

             elsif START_WITH_ABOUT_RE.match?( sect )
                ## note - do NOT collect links in about section!!!

               about = sect
               false           ## remove (about) section
             else
                links += collect_links( sect, basename: basename,
                                              dirname: dirname )
               true            ## keep section
             end
           end

   ## sects.each_with_index do |sect,i|
   ##  puts "==> #{i+1}/#{sects.size}"
   ##  pp sect
   ## end
   ##  puts "  #{sects.size} sect(s)"


   ## note - replace hr with blank line
   txt = sects.join( "\n\n" )


   ###
   ## remove pre comments
   txt = txt.gsub( "<!-- start pre -->\n", '' )
   txt = txt.gsub( "<!-- end pre -->\n", '' )



    ## try to remove leading and trailing nav(igation) lines
    txt, more_edits = proc_navlines_by_sections( txt )
    edits += more_edits

   ## note - return (new) txt AND recorded edits (& erratas)
   ##        return edits as array or joined (single) string - why? why not?
   ##   note - return empty array if no edits!!
   [txt, edits, links, about]
end

#proc_navlines_by_sections(txt) ⇒ Object



75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# File 'lib/rsssf/prepare/convert-navlines.rb', line 75

## Removes leftover nav html elements (<MENU>/<UL>/<LI> tags) from txt,
##  then splits the text into sections on ==-style headings and strips
##  leading nav link lines per section via strip_navlines (plus trailing
##  nav lines in the very last section, by processing it reversed).
##
## Returns [new_txt, edits] - edits is an array of edit descriptions
##  (empty if nothing got removed).
def proc_navlines_by_sections( txt )

   edits = []

   ###
   ##  remove  remaining nav html elements
   ##  <MENU></MENU>
   ##   <UL></UL>
   ##   <LI></LI>

      tags = []
     txt = txt.gsub( %r{   <MENU> | </MENU>
                         | <UL>   | </UL>
                         | <LI>   | </LI>
                       }ix ) do |match|
         tags << match
           ''
     end

     if tags.size > 0
                edit = String.new
                edit += "-- removed #{tags.size} remaining nav html element(s):\n"
                edit += tags.join( ' ')

                puts edit

                edits << edit   ## record edit
      end


    ## split on lines that look like a section heading (==  Title)
    ##  note - lookahead keeps the heading line with its section
    sections = txt.split( %r{^
                               (?= [ ]* ={2,} [ ]*
                                    [\p{L}0-9]  ## one letter or digit required
                               )
                            }ix
                        )


     newsections = []
     sections.each_with_index do |sect,sectno|
          newlines, navlines = strip_navlines( sect.lines, heading: true )

          if navlines.size > 0
             edit = String.new
             edit += "-- removing #{navlines.size} leading nav line(s) in section #{sectno+1}:\n"
             edit += navlines.join
             puts edit

             edits << edit
          end


          ## special check for last section
          if sectno+1 == sections.size
              ## reverse lines
              ##  and remove trailing navlines until hitting body
              ##   note - set heading to false
              newlines, navlines = strip_navlines( newlines.reverse, heading: false )
              newlines = newlines.reverse
              navlines = navlines.reverse

              if navlines.size > 0
                edit = String.new
                edit += "-- removing #{navlines.size} trailing nav line(s) in last section #{sectno+1}:\n"
                edit += navlines.join
                puts edit

                edits << edit
              end
          end

          newsections << newlines.join
     end # each section

    [newsections.join, edits]
end

#strip_navlines(lines, heading: true) ⇒ Object



28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# File 'lib/rsssf/prepare/convert-navlines.rb', line 28

## Splits an array of lines (e.g. txt.lines!!!) into kept lines and
##  removed leading nav(igation) lines.
##
## A nav line is one that - after deleting all ‹…› links - contains
##  nothing but spaces, pipes (|) and dots (.). Once a real body line
##  is hit, every following line is kept as-is.
##
## heading: true treats the very first line as a heading (kept) when
##  it starts with '==' (only optional for first section).
##
## Returns [kept_lines, removed_nav_lines].
def strip_navlines( lines, heading: true )
  kept    = []
  removed = []
  in_body = false   ## hit/seen body yet?

  lines.each_with_index do |line, idx|
    ## optional leading heading line is always kept
    if heading && idx.zero? && line.lstrip.start_with?( '==' )
      kept << line
      next
    end

    ## once in the body - keep everything; blank lines are kept too
    if in_body || (!in_body && line.strip.empty?)
      kept << line
      next
    end

    ## still before the body - drop ‹…› links and inspect the leftover
    leftover = line.strip.gsub( /‹.+?›/, '' )
    if leftover.match?( /\A[ |.]*\z/i )
      ## only separators left over => pure nav line; eat up & record
      removed << line
    else
      in_body = true   ## first real content line - body starts here
      kept << line
    end
  end

  [kept, removed]
end