Class: Rsssf::Prep
- Inherits:
-
Object
- Object
- Rsssf::Prep
- Defined in:
- lib/rsssf/prepare/convert.rb,
lib/rsssf/prepare/download.rb,
lib/rsssf/prepare/convert-meta.rb,
lib/rsssf/prepare/convert-links.rb,
lib/rsssf/prepare/convert-navlines.rb,
lib/rsssf/prepare/convert-postproc.rb
Overview
todo: find a better name e.g. BatchPrep or ??
Constant Summary collapse
- TITLE_RE =
%r{ <TITLE>(?<text>.*?)</TITLE> }ixm- ABOUT_META_RE =
%r{ ## (i) author(s) info \b authors? [ ]* : \s+ (?<author> .+?) ## note - non-greedy (may incl. newline break!!) \s+ ## (ii) followed by date \b last [ ]+ updated [ ]*: \s* (?<date> \d{1,2} [ ]+ ## day [a-z]{3,10} [ ]+ ## month \d{4} \b) ## year }ixm- LINK_APAGE_RE =
see page 2006f
see page ../tablesw/worldcup› e.g. ‹League C, see page 2023uefanl§lgc› ‹League A, see page 2023uefanl.html#lga› todo/fix - fix upstream ?? (e.g. remove. html and replace #=>§) %r{ ‹(?<title> [^›]+?) , [ ] see [ ] page [ ] (?<pageref> [^›]+?) › }ix- START_WITH_ABOUT_RE =
note - start_with anchored w/ A to start of string
%r{ \A [ \n]* ## trailing spaces or blank lines ={2,} [ ]* About [ ]+ this [ ]+ document .*? }ix- START_WITH_CUSTOM_RE =
remove “custom” sections by title
e.g. === Index of groups %r{ \A [ \n]* ## trailing spaces or blank lines ={2,} [ ]* (?<title> Index [ ] of [ ] groups ) [ ]* $ }ix- START_WITH_NAV_RE =
todo - fix
remove all menu, ul,li, tags etc. before nav check see https://rsssf.github.io/tables/2014q.html as an example!!! %r{ \A [ \n]* ## trailing spaces or blank lines ‹.+?› ## link (exlude named anchor - why? why not? §) }ix
Class Method Summary collapse
-
.convert_pages(pages, outdir:) ⇒ Object
convenience helper.
-
.download_pages(pages, force:) ⇒ Object
convenience helper.
Instance Method Summary collapse
- #collect_links(txt, basename:, dirname:) ⇒ Object
- #convert_pages(pages, outdir:) ⇒ Object
- #download_pages(pages, force:) ⇒ Object
-
#expand_pageref(pageref, dirname:) ⇒ Object
[“1973/74”, “oost74”], [“1975/76”, “oost76”], [“list of final tables”, “oosthist”], [“list of champions”, “oostchamp”], [“list of cup finals”, “oostcuphist”], [“list of super cup finals”, “oostsupcuphist”], [“list of foundation dates”, “oostfound”]].
-
#find_author_n_date(txt) ⇒ Object
change name to authors_n_updated or such - why? why not?
- #find_title(html) ⇒ Object
- #postproc_page(txt, basename:, dirname:) ⇒ Object
- #proc_navlines_by_sections(txt) ⇒ Object
- #strip_navlines(lines, heading: true) ⇒ Object
Class Method Details
.convert_pages(pages, outdir:) ⇒ Object
convenience helper
7 8 9 10 |
# File 'lib/rsssf/prepare/convert.rb', line 7 def self.convert_pages( pages, outdir: ) @@prep ||= new ## use a "shared" built-in prep @@prep.convert_pages( pages, outdir: outdir ) end |
.download_pages(pages, force:) ⇒ Object
convenience helper
7 8 9 10 |
# File 'lib/rsssf/prepare/download.rb', line 7 def self.download_pages( pages, force: ) @@prep ||= new ## use a "shared" built-in prep @@prep.download_pages( pages, force: force ) end |
Instance Method Details
#collect_links(txt, basename:, dirname:) ⇒ Object
63 64 65 66 67 68 69 70 71 72 73 |
# File 'lib/rsssf/prepare/convert-links.rb', line 63 def collect_links( txt, basename:, dirname: ) links = txt.scan( LINK_APAGE_RE ) links.map do |link| link[1] = ( link[1], dirname: dirname ) link end links end |
#convert_pages(pages, outdir:) ⇒ Object
13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
# File 'lib/rsssf/prepare/convert.rb', line 13 def convert_pages( pages, outdir: ) pages.each_with_index do |config,i| puts puts "==> [#{i+1}/#{pages.size}] converting #{config.pretty_inspect}..." page = config['page'] url = "https://rsssf.org/#{page}" html = Webcache.read( url ) edits = [] txt, more_edits = PageConverter.convert( html, url: url ) edits += more_edits basename = File.basename( page, File.extname( page )) dirname = File.dirname( page ) ## ## post-process .txt page txt, more_edits, links, about = postproc_page( txt, basename: basename, dirname: dirname ) edits += more_edits title = find_title( html ) || 'n/a' , updated = about ? ( about ) : [nil,nil] header_props = <<EOS title: #{title} source: #{url} EOS if && updated ## assume plural if and or command (,) header_props += if /\band\b|,/i.match( ) " authors: #{}\n" else " author: #{}\n" end header_props += " updated: #{updated}" end header = <<EOS <!-- #{header_props} --> EOS ## note - (auto-) add (comment) header to written out txt!!! write_text( "#{outdir}/#{dirname}/#{basename}.txt", header+txt ) ## todo/check - delete edits file if no edits - why? why not? if edits.size > 0 write_text( "#{outdir}/#{dirname}/#{basename}.edits.txt", edits.join("\n") ) end ## todo/check - delete links file if no links - why? why not? if links.size > 0 buf = links.map do |link| title = link[0] pageref = link[1] "#{'%-30s' % pageref} : #{title}" end.join( "\n") write_text( "#{outdir}/#{dirname}/#{basename}.links.txt", buf ) end ## todo/check - delete about file if no about - why? why not? if about write_text( "#{outdir}/#{dirname}/#{basename}.about.txt", about ) end end end |
#download_pages(pages, force:) ⇒ Object
13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 |
# File 'lib/rsssf/prepare/download.rb', line 13 def download_pages( pages, force: ) pages.each_with_index do |config,i| ## todo / double check fix read_csv upstream ## if empty column has comment it is "" empty string otherwise ## it is nil!!! ?? encoding = config['encoding'] encoding = 'windows-1252' if encoding.nil? || encoding.empty? page = config['page'] url = "https://rsssf.org/#{page}" ## check if not in cache ## note - use force == true to always (force) download if Webcache.cached?( url ) && force == false puts " CACHE HIT - #{url}" else puts "==> [#{i+1}/#{pages.size}] download #{config.pretty_inspect}..." html = Rsssf.download_page( url, encoding: encoding ) end end end |
#expand_pageref(pageref, dirname:) ⇒ Object
["1973/74", "oost74"],
["1975/76", "oost76"],
["list of final tables", "oosthist"],
["list of champions", "oostchamp"],
["list of cup finals", "oostcuphist"],
["list of super cup finals", "oostsupcuphist"],
["list of foundation dates", "oostfound"]]
28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 |
# File 'lib/rsssf/prepare/convert-links.rb', line 28 def ( pageref, dirname: ) ## ## note - pre-proces ## 2023uefanl.html#lga ## stkitts2025.html#pres ## ## remove .html ## and optional anchor ## ## fix - upstream - why? why not? pageref = pageref.sub( %r{ \.html\b }ix, '' ) ## check - only really one # allowed in url path??? pageref = pageref.sub( '#', '§' ) if /^[a-z0-9][a-z0-9§-]*$/.match?( pageref ) ## assume relative page in "local" dir "#{dirname}/#{pageref}" elsif pageref.start_with?( '../') ## ../tablesw/worldcup pageref.sub( "../", '' ) elsif pageref.start_with?( './' ) raise ArgumentError, "found (unsupported) ./ pageref >#{pageref}<" elsif pageref.start_with?( '/' ) raise ArgumentError, "found (unsupported) / pageref >#{pageref}<" elsif pageref.start_with?( %r{^https?:}i ) raise ArgumentError, "found (unsupported) https?: pageref >#{pageref}<" else raise ArgumentError, "found (unsupported) pageref >#{pageref}<" end end |
#find_author_n_date(txt) ⇒ Object
change name to authors_n_updated or such - why? why not?
59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 |
# File 'lib/rsssf/prepare/convert-meta.rb', line 59 def ( txt ) ## ## fix/todo: move authors n last updated ## whitespace cleanup - why? why not?? if m=ABOUT_META_RE.match( txt ) = m[:author].strip.gsub(/\s+/, ' ' ) # cleanup whitespace; squish-style = .gsub( /[ ]*,[ ]*/, ', ' ) # prettify commas - always single space after comma (no space before) updated = m[:date].strip.gsub(/\s+/, ' ' ) [, updated] else ## report error or raise exception?? ## return nil for now [nil,nil] ## or return (single) nil ?? end end |
#find_title(html) ⇒ Object
21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 |
# File 'lib/rsssf/prepare/convert-meta.rb', line 21 def find_title( html ) if m=TITLE_RE.match( html ) text = m[:text].strip ## note - convert html entities ## e.g. Brazil 2000 - Copa João Havelange text = PageConverter.convert_html_entities( text ) ## add autofix known typos/erratas here!!! ## note - title quick typo fix (in brazil) remove < ## e.g. <TITLE>Brazil 1988<</TITLE> text = text.gsub( '<', '' ) text else nil end end |
#postproc_page(txt, basename:, dirname:) ⇒ Object
51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 |
# File 'lib/rsssf/prepare/convert-postproc.rb', line 51 def postproc_page( txt, basename:, dirname: ) ### record edits in its own txt file edits = [] links = [] about = nil ### ## step 1 ## split by horizontal rules (hrs) ## and remove navigations sections ## starting with links e.g. ## ‹Bundesliga, see §bund› sects = txt.split( /^=-=-=-=-=-=-=-=-=-=-=-=-=-=-=$/ ) sects = sects.select do |sect| if START_WITH_NAV_RE.match?( sect ) links += collect_links( sect, basename: basename, dirname: dirname ) edit = String.new edit += "-- removing nav(igation) section:" edit += sect puts edit edits << edit ## record edit false ## remove section elsif m=START_WITH_CUSTOM_RE.match( sect ) links += collect_links( sect, basename: basename, dirname: dirname ) edit = String.new edit += "-- removing custom section with title >#{m[:title]}<:" edit += sect puts edit edits << edit ## record edit false ## remove section elsif START_WITH_ABOUT_RE.match?( sect ) ## note - do NOT collect links in about section!!! about = sect false ## remove (about) section else links += collect_links( sect, basename: basename, dirname: dirname ) true ## keep section end end ## sects.each_with_index do |sect,i| ## puts "==> #{i+1}/#{sects.size}" ## pp sect ## end ## puts " #{sects.size} sect(s)" ## note - replace hr with blank line txt = sects.join( "\n\n" ) ### ## remove pre comments txt = txt.gsub( "<!-- start pre -->\n", '' ) txt = txt.gsub( "<!-- end pre -->\n", '' ) ## try to remove leading and trailing nav(igation) lines txt, more_edits = proc_navlines_by_sections( txt ) edits += more_edits ## note - return (new) txt AND recorded edits (& erratas) ## return edits as array or joined (single) string - why? why not? ## note - return empty array if no edits!! [txt, edits, links, about] end |
#proc_navlines_by_sections(txt) ⇒ Object
75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 |
# File 'lib/rsssf/prepare/convert-navlines.rb', line 75 def proc_navlines_by_sections( txt ) edits = [] ### ## remove remaing nav html elements ## <MENU></MENU> ## <UL></UL> ## <LI></LI> = [] txt = txt.gsub( %r{ <MENU> | </MENU> | <UL> | </UL> | <LI> | </LI> }ix ) do |match| << match '' end if .size > 0 edit = String.new edit += "-- removed #{.size} remaining nav html element(s):\n" edit += .join( ' ') puts edit edits << edit ## record edit end sections = txt.split( %r{^ (?= [ ]* ={2,} [ ]* [\p{L}0-9] ## one letter or digit required ) }ix ) newsections = [] sections.each_with_index do |sect,sectno| newlines, navlines = strip_navlines( sect.lines, heading: true ) if navlines.size > 0 edit = String.new edit += "-- removing #{navlines.size} leading nav line(s) in section #{sectno+1}:\n" edit += navlines.join puts edit edits << edit end ## special check for last section if sectno+1 == sections.size ## reverse lines ## and remove trailing navlines until hitting body ## note - set heading to false newlines, navlines = strip_navlines( newlines.reverse, heading: false ) newlines = newlines.reverse navlines = navlines.reverse if navlines.size > 0 edit = String.new edit += "-- removing #{navlines.size} trailing nav line(s) in last section #{sectno+1}:\n" edit += navlines.join puts edit edits << edit end end newsections << newlines.join end # each section [newsections.join, edits] end |
#strip_navlines(lines, heading: true) ⇒ Object
28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 |
# File 'lib/rsssf/prepare/convert-navlines.rb', line 28 def strip_navlines( lines, heading: true ) ## note - expects an array of lines (e.g. txt.lines!!!) newlines = [] navlines = [] body = false ## hit/seen body? lines.each_with_index do |line,lineno| ## check for optional leading heading line ## note - first line is heading ## (only optional for first section) if heading && lineno == 0 && line.lstrip.start_with?( '==' ) newlines << line next end ## possibly remove leading nav link lines if !body if line.strip.empty? newlines << line next end ## remove leading nav link lines only newline = line.strip.gsub( /‹.+?›/, '' ) ## check what's left over? ## if only space or pipe (|) or dot (.) than remove if newline.match?( %r{\A [ |.]* \z}ix ) ## puts " removing nav line #{line}" navlines << line ## eat-up; record edit else body = true newlines << line end else newlines << line end end # each line [newlines,navlines] end |