Class: Rsssf::Fmtfix

Inherits:
Object
  • Object
show all
Defined in:
lib/rsssf/fmtfix/dates.rb,
lib/rsssf/fmtfix/goals.rb,
lib/rsssf/fmtfix/score.rb,
lib/rsssf/fmtfix/errata.rb,
lib/rsssf/fmtfix/fmtfix.rb,
lib/rsssf/fmtfix/rounds.rb,
lib/rsssf/fmtfix/tables.rb,
lib/rsssf/fmtfix/headers.rb,
lib/rsssf/fmtfix/outline.rb,
lib/rsssf/fmtfix/topscorers.rb,
lib/rsssf/fmtfix/fmtfix-base.rb,
lib/rsssf/fmtfix/dates_helpers.rb,
lib/rsssf/fmtfix/patch_headings.rb

Overview

todo: find a better name e.g. Format or Fixer or ??

Constant Summary collapse

MONTH_LINES =

note - (re)use the same date regex style & capture names

from football.txt tokenizer
parse_names( <<TXT )
January    Jan
February   Feb
March      Mar
April      Apr
May
June       Jun
July       Jul
August     Aug
September  Sept  Sep
October    Oct
November   Nov
December   Dec
TXT
MONTH_NAMES =
build_names( MONTH_LINES )
MONTH_MAP =

pp MONTH_NAMES

build_map( MONTH_LINES, downcase: true )
DAY_LINES =
parse_names( <<TXT )
Monday                   Mon  Mo
Tuesday            Tues  Tue  Tu
Wednesday                Wed  We
Thursday    Thurs  Thur  Thu  Th
Friday                   Fri  Fr
Saturday                 Sat  Sa
Sunday                   Sun  Su
TXT
DAY_NAMES =
build_names( DAY_LINES )
DAY_MAP =

pp DAY_NAMES

build_map( DAY_LINES, downcase: true )
DATE_I_RE =

e.g. Aug 9

   Fri Aug 9
   Fri  Aug 9
   Fri, Aug 9
   Fri, Aug 9 2024
   Fri, Aug 9, 2024
        Aug 9, 2024
        Aug 9, 2024
note - eat-up optional comma after DAY_NAMES!!

add an optional "around" / "ca." qualifier for dates that are not known exactly, e.g.

around Mar 29
ca. Nov 1

Jan 25/87    - support two-digit year
 Jan 28/87

extra/bonus -   allows (double) space typo for month day e.g
                      Aug  9
%r{
(?<date>
  \b
     ## optional around qualifier
     ((?<around>   around
                 | ca?\.)
                  [ ]
     )?
     ## optional day name
     ((?<day_name>#{DAY_NAMES})
           (?: ,?[ ]+)
     )?
     (?<month_name>#{MONTH_NAMES})
            [ ]{1,2}   ## note - allow (double) space typo
     (?<day>\d{1,2})
          \b
     ## optional year
     (   (?:      ,? [ ]       ## note - comma optional with single space required for now
                (?<year>\d{4})        ## optional year 2025 (yyyy)
            |     /
                (?<yy>\d{2})
          )
            \b
     )?
)}ix
DATE_IB_RE =

date i - alt style with weekday at the end (used in arg2026.txt) e.g.

Mar 23, Mon
Mar 25, Wed
Apr 1, Wed
May 3, Sun
Jul 26, Sun
%r{
(?<date>
  \b
     (?<month_name>#{MONTH_NAMES})
            [ ]{1,2}   ## note - allow (double) space typo
     (?<day>\d{1,2})
          , [ ]?
      (?<day_name>#{DAY_NAMES})
     \b
)}ix
DATE_II_RE =
%r{
(?<date>
  \b
     ## optional day name
     ((?<day_name>#{DAY_NAMES})
           (?: ,?[ ]+)
     )?
     (?<day>\d{1,2})
         [ ]
     (?<month_name>#{MONTH_NAMES})
          \b
     ## optional year
     (?:  [ ]
        (?<year>\d{4})        ## optional year 2025 (yyyy)
        \b
     )?
)}ix
DATE_LEGS_RE =
%r{
(?<date_legs>
 \b
     (?<month_name1>#{MONTH_NAMES})
          [ ]
     (?<day1>\d{1,2})
       (?:
             , [ ]{0,5}
           | [ ]{1,5} and [ ]{1,5}
           | [ ]{0,5} & [ ]{0,5}
        )
    (?:     ## note - make 2nd month_name optiona
        (?<month_name2>#{MONTH_NAMES})
          [ ]
      )?
     (?<day2>\d{1,2})
      \b
    ## optional two-digit year
     (?:    /
          (?<yy2>\d{2})
            \b
     )?
)}ix
DATE_LIST_RE =

merge date_list and date_legs ??

 or keep date_legs always with two dates by definition??
and date_list starting w/ three or more dates?

 May 2,3,4
 Feb 28, Mar 1,2
%r{
(?<date_list>
 \b
     (?<month_name1>#{MONTH_NAMES})
          [ ]
     (?<day1>\d{1,2})
     (?:  [,;] [ ]{0,5}  )


      (?:  ## note - make 2nd month_name optiona
        (?<month_name2>#{MONTH_NAMES})
          [ ]
      )?
     (?<day2>\d{1,2})
       (?: [,;] [ ]{0,5}  )


      (?:     ## note - make 3rd month_name optiona
        (?<month_name3>#{MONTH_NAMES})
          [ ]
      )?
     (?<day3>\d{1,2})
     \b


     ### optional fourth date
     (?:
         [,;] [ ]{0,5}
         (?:   ## note - make 4th month_name optiona
            (?<month_name4>#{MONTH_NAMES})
            [ ]
         )?
       (?<day4>\d{1,2})
         \b
     )?
)}ix
DATE_RANGE_RE =
%r{
(?<date_range>
 \b
     (?<month_name1>#{MONTH_NAMES})
          [ ]
     (?<day1>\d{1,2})
            [ ]? - [ ]?
    (?:   ## optional month
       (?<month_name2>#{MONTH_NAMES})
           [ ]
    )?
      (?<day2>\d{1,2})
     \b
)}ix
## Canonical three-letter weekday names used when formatting dates.
##   Indexed by weekday number 1 (Mon) .. 7 (Sun) - the same 1-based numbering
##   that build_map produces; slot 0 is a nil placeholder.
##   note - frozen; the table is a shared lookup and must not be mutated at runtime
FMT_DAY_NAMES =
[
    nil,   ##  or use '!ERROR!' - why? why not?
    'Mon',  # 1
    'Tue',  # 2
    'Wed',  # 3
    'Thu',  # 4
    'Fri',  # 5
    'Sat',  # 6
    'Sun',  # 7
].freeze
## Canonical three-letter month names used when formatting dates.
##   Indexed by month number 1 (Jan) .. 12 (Dec) - the same 1-based numbering
##   that build_map produces; slot 0 is a nil placeholder.
##   note - frozen; the table is a shared lookup and must not be mutated at runtime
FMT_MONTH_NAMES =
[
    nil,    ## or use '!ERROR!' - why? why not?
    'Jan',  # 1
    'Feb',  # 2
    'Mar',  # 3
    'Apr',  # 4
    'May',  # 5
    'Jun',  # 6
    'Jul',  # 7
    'Aug',  # 8
    'Sep',  # 9
    'Oct',  # 10
    'Nov',  # 11
    'Dec',  # 12
].freeze
GOALS_ =
%q{
      [^:\[\]\n]*?
         \b
         \d{1,3}  '?  ## incl. minute
      [^\[\]\n]*?
}
ERRATAS =
{
## in austria
##       avoid confusion with  /DD is year!!!
##          maybe make it a switch to turn on
    '[Nov 13/14]' => '[Nov 13,14]',
    '[Mar 25/26]' => '[Mar 25,26]',
    '[Aug 12/13]' => '[Aug 12,13]',



## "classic" typos
    ## month
    '[Niv 8]' => '[Nov 8]',
    '[Mov 7]'  => '[Nov 7]',
    '[Mov 26]' => '[Nov 26]',
    ## double brackets
    '[Apr 15]]'                    => '[Apr 15]',
    "[[36' Hansen, 58' Glasner]"  => "[36' Hansen, 58' Glasner]",
    ### more
    '  att; '  => ' att: '   ##  e.g. Wembley; att; 11,689

}
ROUND_PAT_BASE =

e.g. round 1, round 2, etc.

    matchday 1
    week 1
 note - add optional   Matchday 1 of 2 or such
    keep why? why not?

matchweek used by premierleague.com
week used in mls/usa (no matchdays/rounds)
  note - matchweek might start on tuesday (e.g. tue to mon)
              or check if always 7day week?

note - use 1-9 regex (cannot start with 0) - why? why not?

make week 01 or round 01 or matchday 01 possible?
%q{
         (   Round
           | Matchday
           | Matchweek
           | Week )   [ ]{1,2}  [1-9][0-9]*

        (?:    ## note - add optional   Matchday 1 of 2 or such
               [ ] of [ ] [1-9][0-9]*
        )?
}
ROUND_NAMES_EN =

add more pattern via config

todo/fix - check if .txt is empty
               do NOT add ( || will match everything!!)

rename names_misc to names_more - why? why not?

read_patterns( "#{Rsssf.config_dir}/rounds_en.txt" )
ROUND_NAMES_ES =
read_patterns( "#{Rsssf.config_dir}/rounds_es.txt" )
ROUND_NAMES_MISC =
read_patterns( "#{Rsssf.config_dir}/rounds_misc.txt" )
## Combined round alternation (regex source, extended mode):
##   the built-in base pattern followed by all configured round names.
##
## note - the patterns get collected into ONE flat list first and joined once;
##   an empty names list then contributes nothing. Concatenating the joined
##   lists with ' | ' would leave an empty alternative (' |  | ') behind for
##   an empty list - and an empty alternative matches everything!!
ROUND_PAT =
[ ROUND_PAT_BASE,
  *ROUND_NAMES_EN,
  *ROUND_NAMES_ES,
  *ROUND_NAMES_MISC,
].join( ' | ' )
TABLE_HEADER_RE =
%r{
      ############
      ## negative & positive lookaheads

##         (?!
##               .* [ ]{2,}       ## no (inline)  double (or more) spaces allowed
##         )

      (?:
      ## (i)  can only start with non-zero number
      ##      or alpha
      ##
      ##  A.  or
      ##  1.  or
      ##   mixed with dot  1A. yes/no?, A1. yes/no?,  1B1. ?
      ##   1.K    - 1.Klasse

      ##
      ##  note - \b(oundary) - to always get complete tokens (alphanum) tokens
      ##            note - \b includes [a-z0-9_] PLUS underscore (_)
      ##                          check if underscore is \b
      ##                              e.g.   09_  or _09 or  match \b[0-9]\b  ???
      ##   use our own asserts?
      ##      BNUM (boundary number) e.g. [^0-9]
      ##      BALPHA (boundary alpha) e.g. [^a-z]
      ##      BALNUM (boundary alphanum) e.g. [^a-z0-9]
      ##    classic is   [^a-z0-9_]

         (?<header>
          (?=
              .*  \p{L}+    ## must incl. alpha character - not only numbers!!
          )

  ## note
  ##   order matters
  ##   move specific first!!

            \b
             (?:     [0-9]+\p{L}  [0-9\p{L}]* \b    ## (ii) mixed alphanum (starting w/ num)
                |    [0-9]+  \b  \.?  (?! \d)    ## (i)  num
                |  \p{L}+[0-9]  [0-9\p{L}]* \b     ## (iiii) mixed alphanum (starting w/ alpha)
                |  \p{L}+  \b    \.?              ## (iii) alpha
             )
             (?:
                ## " (i-iiii) connector options  (a) single space
                ##                                   -- exclude numbers on numbers (FIX)
                ##                               (b) dash (-) or slash (/)
                ##                                  --  must be alpha(.?)-alpha
                ##                                        incl.  K.-H.  with trailing dot
                ##                              add ampersand (&) too
                ##                                    w/ leading & trailing opt space?
                ##                                                incl.  K.&H., K. & H.
               (?:   [ ]?
                   | (?<! \d)  -   ## add negative lookbehind&ahead (no numbers please)
                     (?! \d)
                   |  /
                )
                    ## repeat (i-iiii) see above
                       ## todo - do NOT allow numbers followed by numbers
                \b
                (?:  [0-9]+ \b    (?! [ ] \d)     ## (i) num - no more ordinals - why? why not?
                  |  [0-9]+\p{L}  [0-9\p{L}]* \b     ## (ii) mixed alphanum (starting w/ num)
                                                  ##     group 1a 1FC?? - why? why not?
                  |  \p{L}+  \b   \.?              ## (iii) alpha
                  |  \p{L}+[0-9]  [0-9\p{L}]* \b   ## (iiii) mixed alphanum (starting w/ alpha)
               )
            )*
            (?:
                 [ ]
                \(  [^:()\[\]]+?  \)
            )?
        )  ## end-of-capture header
   )
   :?    ## optional colon (:) e.g. final table:
}ix
TABLE_RE =
%r{

         ### optional table header
          (?:
             ### negative lookahead
             ##    MUST NOT match  standing line e.g.  10  3  4
             ##      or         table heading (see below)
             ##      or   -----  (old style structured heading left overs)
                    (?! ^[ ]* (?:   [^\n]+?  [ ]+ \d{1,3} [ ]+ \d{1,3} [ ]+ \d{1,3}
                                |   (?: GP | M | Team ) [ ]
                                |  -{3,}
                              )
                     )

             ## (i) table header
             ##
             ## fix - make header match more strict!!!
             ##   e.g. do NOT match ---  or more than three spaces or such
             ## exlcude in header
             ##   NB:
             ##    [*]
             ##    [1]
             ## exclude heading === e.g.
             ##    ==== USL Premier Development
                 ^
                [ ]*


              ## exclude comma (,) - why? why not?
              ##   and numbers  - unless group 1
              ## e.g. Kaczor 78 - Dreßel 19, Steinkogler 50,
              ## B'schweig  2-1 Schalke    (Handschuh 38, Popivoda 55 - Fischer 82)
              ##  M'gladbach 2-1 1. FC Köln (Jensen 6, Wittkamp 35 - D.Müller 78)
              ##   Kraft 3, E.Kremers 38)
              ##  Schalke     4-0 Tasmania    (Klose 2, 78, Herrmann 40, Kreuz 82)
              ##
              ## allow name such as
              ##    USL - 1ST DIVISION (2nd Division)


             (?<header>  [^=*:,0-9\[\]\n]+?
                          ([ ] \d{1,2} \b)?   ## optional number only at the end e.g. group 1
                     )
                  :?  ## optional colon (:) e.g. final table:
                  ## cut-off everything separated by more than three spaces
                  ##   e.g. might be "inline" table heading (follow table header name)
                  ##  e.g. Group 1                  M     W     T     L    GF    GA    DIF   PTS
                  (?: [ ]{4,} (?: GP | M |Team ) [ ]  [^\n]+? )?
              [ ]*
             ## note - allow optional blank line - why? why not?
             (?:  \n ^[ ]* )?
             \n
          )?


      #### optional  table heading line
      (?:  ^(?:
          #{table_heading_( 'GP  W   L   D  GF  GA  PTS?' )}
        | #{table_heading_( 'GP  W   L   T  GF  GA  PTS?' )}
        | #{table_heading_( 'GP  W   T   L  GF  GA  PTS?' )}
        | #{table_heading_( 'GP  W   D   L  GF  GA  PTS?' )}
        ##  SW  sudden death win, SL sudden death lose
        | #{table_heading_( 'GP  W   L  SW  GF  GA  PTS?' )}
        | #{table_heading_( 'GP  W SW  SL   L   GF  GA  PTS?' )}
        | #{table_heading_( 'GP  W SOW SOL  L  GF  GA PTS?'   )}
        ##  mx/spanish
        | #{table_heading_( 'M   W   T   L  GF  GC  DIF  PTS' )}
        | #{table_heading_( 'M   W   T   L  GF  GA PTS AVGE' )}
        | #{table_heading_( 'Team  M  W  T  L  GF-GA  PTS')}
        | #{table_heading_( 'Team   M  W  T  L  GF-GA  PTS EP  TP')}
        )
       ## note - allow optional blank line - why? why not?
          (?: \n ^[ ]* )?
            \n
      )?


  ## MUST be followed by a table (standing) line
  ## e.g.  1.FC Cincinnati    34  20  9  5  57-39  69
  ##
  ##   note - allow "run-on" e.g. LB14 on first number
  ## Hudson Valley Quickstrike LB14  12   0   2   40   9   38
  ## Hudson Valley Quickstrike LB12  11   1   0   26   9   33
  ##
  ##    17    11     5     1    40    16    +24    38
  ##  or
  ###  + 1.DC United                       32 17  6/ 3  6 65-43 57

         ^
         (?:
               [^\n]+?
                 (?:
                    (?:

                      \d{1,3}
                 [ ]+ \d{1,3}  ## win
 (?: [ ]+ | [ ]* / [ ]* ) \d{1,3}  ## draw
                 [ ]+ \d{1,3}  ## lose
                 [ ]+ \d{1,3}  (?:  [ ]* [:-] [ ]*
                                  | [ ]+ )  \d{1,3}
                 [ ]+ [+-]? \d{1,3} \b  # might be diff or point allow +/-!!
                   )
                 )
               [^\n]*?
          )
         \n

         ## eat-up the rest
         .*?   ## non-greedy - match everything (incl. newline!) until
                 (?:   \n (?= \n)    ## break on blank line (\n\n) or end-of-string/file
                          | \z
                 )

}ixm
OPT_REF =

lets you check for an optional ref e.g. ‹§fin›

%q{
   (?: [ ]*
     ‹ (?<ref> §[^›]+?) ›
   )?
}
HEADER_ROUND_RE =

note - allow optional colon e.g.

Playoff:
Round 21:
%r{\A
        [ ]*
         (?<round> #{ROUND_PAT})
              :?   ## note - allow optional colon (:)  e.g. Playoff:
            #{OPT_REF}
         [ ]*
\z}ix
HEADER_DATE_RE =
%r{\A
      [ ]*
      \[  #{date_(DATE_I_RE, DATE_IB_RE,
                  DATE_II_RE,
                  DATE_RANGE_RE,
                  DATE_LIST_RE, DATE_LEGS_RE,
                  )}
      \]
      [ ]*
\z}ix
HEADER_DATE_II_RE =

alternate date header (no brackets incl. year)

 Aug 7 1999
 Sep 4 1999
Oct 23 1999
Nov 20 1999
Apr 1 2000
%r{\A
      [ ]*
         #{date_(DATE_I_RE, DATE_II_RE)}
      [ ]*
\z}ix
CITY_ =
Sep 16, Berchtold 26, Glasner 54, Kuljic 60

— note - exclude numbers in follow-up text!!!

use a shared pattern for city-like text !!
   maybe allow more and make more specific later

exclude comma (,) - why? why not?
 split in CITY_ and CITY_PLUS_ or such?
 or find a better name ??

allow number if:

Happel-Stadion, Wien, att: 9,200
Happel-Stadion, Wien; att: 7000
Innsbruck; att: 6700
Wörthersee-Stadion, Klagenfurt; att: 30,000
Wörthersee Stadion, Klagenfurt; att: 20,500
Hayward, Calif.; att: 5.528   -- note: dot (.) NOT comma (,)

Apr 30, 28 Black Arena, Klagenfurt; att: 30,000

Wörthersee Stadion, known as 28 Black Arena for sponsorship reasons

Ernst-Happel-Stadion, Wien; att: 20100; ref: Hofmann
%q{   (?<city>  (?:   [^0-9:;\[\]]+?
                  | .+?
                      [ ] att: [ ] [0-9,.]+
                      (?: [;,] [ ] ref: [ ] .+?  ## w/ optional ref:
                      )?
               )
     )
}
HEADER_DATE_N_CITY_RE =
Jun 3, Ferrol
Apr 2, Wembley

-or-

Sat May 17 - at Millennium Stadium, Cardiff
Sun May 25 - at Millennium Stadium, Cardiff
%r{\A
      [ ]*
      \[  #{date_(DATE_I_RE,
                  DATE_II_RE)}
           (?:       , [ ]*
               | [ ] - [ ] at [ ]
            )
           #{CITY_}
      \]
      [ ]*
\z}ix
HEADER_DATE_ALT_RE =

alternate date header with brackets (in oost02.txt)

[31-08]  change to _ 31/08 _
[07-09]
[07-09]
[30-05, Thaur]
%r{\A
      [ ]*
      \[  (?<date>
             (?<day> \d{1,2}) - (?<month> \d{1,2})
          )
          (?:
              , [ ]*
              #{CITY_}
          )?
      \]
      [ ]*
\z}ix
HEADER_ROUND_N_DATE_RE =
%r{\A
        [ ]*
         (?<round> #{ROUND_PAT})
         [ ]+
        \[
           #{date_(DATE_I_RE, DATE_IB_RE, DATE_II_RE,
                   DATE_RANGE_RE,
                   DATE_LIST_RE, DATE_LEGS_RE)}
        \]
        #{OPT_REF}
        [ ]*
\z}ix
HEADER_ROUND_N_DATE_N_CITY_RE =

Final [May 1, Klagenfurt]

%r{\A
        [ ]*
         (?<round> #{ROUND_PAT})
         [ ]+
        \[  #{date_(DATE_I_RE, DATE_II_RE)}
             , [ ]*
           #{CITY_}
        \]
        [ ]*
\z}ix
HEADER_ROUND_N_CITY_RE =

Final [in Völs] Final [in Kundl]

%r{\A
        [ ]*
         (?<round> #{ROUND_PAT})
         [ ]+
        \[in [ ]+ #{CITY_}
        \]
        [ ]*
\z}ix
HEADER_ROUND_N_CITY_N_DATE_RE =

reverse

Final [Graz, May 12]

Super Cup Final [Graz, Jul 6] Final [London, Feb 27]

%r{\A
        [ ]*
         (?<round> #{ROUND_PAT})
         [ ]+
        \[ #{CITY_}
             , [ ]*
            #{date_(DATE_I_RE, DATE_II_RE)}
        \]
        [ ]*
\z}ix
HX_RE =
%r{          ## negative lookahead
         ##   do NOT match  =-=
         ##   do NOT match  ===========  (without any heading text!!)
         ##     e.g.
         ##       Fall season
         ##       ===========

        (?! ^[ ]* (?:    =-=
                     |  ={1,} [ ]* $
                   )
         )

         ^
        [ ]*

      (?<marker> ={1,6})
         [ ]*
      (?<text> .+?)
         #{OPT_REF}
         [ ]*
$}x
TOPSCORERS_RE =

e.g.

topscorer, topscorers
top scorer, top scorers
scorer, scorers
%r{^     [ ]*
    (?<header>
       (?: top [ ]?)?  ## note - optional top
          scorers?      ## singular or plural
     )
      (?: [ ]* :)?    ## note - optional colon
       [ ]*
      \n{0,2}       ## note - optional leading blank line!!

    .*?             ## non-greedy - match everything until
  (?:   \n (?= \n)    ## blank line (\n\n) or end-of-string/file
      | \z
  )
}ixm

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.build_map(lines, downcase: false) ⇒ Object



43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# File 'lib/rsssf/fmtfix/dates_helpers.rb', line 43

## Builds a lookup map from every name to the (1-based) number of the line
## it appears on, e.g.
##   {"january" => 1,  "jan" => 1,
##    "february" => 2, "feb" => 2,
##    "march" => 3,    "mar" => 3, ...
##
##   lines     - array of lines; every line is an array of names
##   downcase: - if true, the names get downcased before being used as keys
##
## returns the map (hash) of name => number
def self.build_map( lines, downcase: false )
  lookup = {}
  ## note: numbering starts with 1 (and NOT zero-based, that is, 0)
  lines.each.with_index( 1 ) do |names, num|
    names.each do |name|
      key = downcase ? name.downcase : name
      lookup[ key ] = num
    end
  end
  lookup
end

.build_names(lines) ⇒ Object



36
37
38
39
40
# File 'lib/rsssf/fmtfix/dates_helpers.rb', line 36

## Joins all names of all lines into one regex alternation string, e.g.
##   January|Jan|February|Feb|March|Mar|April|Apr|May|June|Jun|...
##
##   lines - array of lines; every line is an array of names
def self.build_names( lines )
  alternatives = lines.map do |names|
    names.join( '|' )
  end
  alternatives.join( '|' )
end

.date_(*re) ⇒ Object

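helper for inlining date regexes into a bigger regex – builds the union of the passed-in regexes and returns its source wrapped in a non-capturing group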

Raises:

  • (ArgumentError)


51
52
53
54
55
56
# File 'lib/rsssf/fmtfix/headers.rb', line 51

## Helper for inlining date regexes into a bigger regex (via interpolation).
##
##   re - one or more regexes; combined with Regexp.union
##
## returns the source (string) of the union wrapped in a non-capturing group
## raises ArgumentError if no regex gets passed in
def self.date_( *re )
      ## fix - the guard only rejects an empty list, so say "at least one"
      raise ArgumentError, "at least one date regex expected, got #{re}"  if re.empty?

      ## (auto-)wrap in non-capturing group - keeps the alternation contained
      "(?: #{Regexp.union( *re ).source})"
end

.fmtfix(txt, heading_patches: nil) ⇒ Object

convenience helper



42
43
44
45
# File 'lib/rsssf/fmtfix/fmtfix.rb', line 42

## Convenience helper - runs fmtfix on txt using a "shared" built-in instance.
##   the instance gets created on first use and is kept in @@fmtfix
##   heading_patches: - passed through as is to the instance method
def self.fmtfix( txt, heading_patches: nil )
      shared = (@@fmtfix ||= new)
      shared.fmtfix( txt, heading_patches: heading_patches )
end

.fmtfix_pages(pages, outdir:, path:, heading_patches: nil) ⇒ Object

convenience helper



8
9
10
11
12
13
# File 'lib/rsssf/fmtfix/fmtfix.rb', line 8

## Convenience helper - runs fmtfix_pages using a "shared" built-in instance.
##   the instance gets created on first use and is kept in @@fmtfix
##   all arguments are passed through as is to the instance method
def self.fmtfix_pages( pages, outdir:, path:, heading_patches: nil )
      shared  = (@@fmtfix ||= new)
      options = { outdir:          outdir,
                  path:            path,
                  heading_patches: heading_patches }
      shared.fmtfix_pages( pages, **options )
end

.mkheading_regex(str) ⇒ Object



12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# File 'lib/rsssf/fmtfix/patch_headings.rb', line 12

## Converts a heading patch line (mini pattern language) into regex source
## (string) for use in an extended-mode (x) regex:
##
##    spaces        =>  [ ]     -- literal space
##    $$            =>  \s+     -- whitespace - note - make sure \s incl. newline!!
##    [..] classes  =>  kept as is (incl. optional quantifier & surrounding spaces)
##    ~             =>  [ ]?    -- optional space
##    .  (  )       =>  \.  \(  \)   -- escaped
##    $SEASON$      =>  \d{4}/(?:\d{2}|\d{4})   -- builtin
def self.mkheading_regex( str )
  token_rx = %r{
                             (?<charclass> [ ]* \[ [^\[\]]+ \] [*?+]? [ ]*)
                          |  (?<newline>   [ ]* \$\$ [ ]*)
                          |  (?<spaces>    [ ]+)
                        }x

  ## step 1 - whitespace handling (character classes pass through untouched)
  pattern = str.strip.gsub( token_rx ) do |token|
    hit = Regexp.last_match
    if hit[:newline]
      ' \s+ '
    elsif hit[:spaces]
      ' [ ] '
    else
      token    ## character class - keep as is
    end
  end

  ## step 2 - optional space & escapes; order as listed
  [ ['~', ' [ ]? '],
    ['.', '\.'],
    ['(', '\('],
    [')', '\)'],
  ].each do |char, replacement|
    pattern = pattern.gsub( char, replacement )
  end

  ## step 3 (last) - builtins; must run after the escapes above,
  ##                 the replacement itself contains ( )
  pattern.gsub( '$SEASON$', '\d{4}/(?:\d{2}|\d{4})' )
end

.parse_heading_patches(txt) ⇒ Object



50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# File 'lib/rsssf/fmtfix/patch_headings.rb', line 50

## Parses heading patch definitions, e.g.
##
##    = Title
##    pattern line 1
##    pattern line 2
##
## Every heading line (= Title, == Title ==, etc.) starts a new entry; every
## following line gets converted via mkheading_regex and wrapped in %r{^ $}ix.
## Blank lines and lines starting with # get skipped; __END__ stops parsing.
##
## returns a hash of title => array of regexes
## raises ArgumentError if a pattern line shows up before the first heading
def self.parse_heading_patches( txt )
   heading_rx = %r{ ^
              [ ]* =+ [ ]*
                (?<text> .+?)
              (?: [ ]* =+ )?
               [ ]*
                $
              }x

   patches = {}
   current = nil    ## regex list of the most recent heading

   txt.each_line do |line|
      line = line.strip
      next  if line.empty? || line.start_with?('#')
      break if line == '__END__'

      if (m = heading_rx.match( line ))
          current = patches[m[:text]] = []
      else
          ## fix - a pattern without a heading has no list to go to;
          ##        report it instead of crashing with NoMethodError on nil
          if current.nil?
             raise ArgumentError, "heading pattern >#{line}< found before any heading (= Title)"
          end

          re = mkheading_regex( line )
          ## note - wrap in %r{^$}ix
          current << %r{^ #{re} $}ix
      end
   end

   patches
end

.parse_names(txt) ⇒ Object



9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# File 'lib/rsssf/fmtfix/dates_helpers.rb', line 9

## Parses a text block of names (one entry per line, names/aliases separated
## by spaces or tabs) into an array of lines (with names).
##
##   - blank lines and lines starting with # get skipped
##   - inline (until end-of-line) comments get stripped too
##       e.g. Janvier  Janv  Jan  ## check janv in use??
##       =>   Janvier  Janv  Jan
##
## todo/fix -- add check for duplicates
def self.parse_names( txt )
  txt.each_line.each_with_object( [] ) do |raw, entries|
    entry = raw.strip
    next if entry.empty? || entry.start_with?( '#' )

    names = entry.sub( /#.*/, '' ).strip
    entries << names.split( /[ \t]+/ )
  end
end

.read_heading_patches(path) ⇒ Object



83
# File 'lib/rsssf/fmtfix/patch_headings.rb', line 83

## Reads the text at path (via read_text) and parses it with parse_heading_patches.
def self.read_heading_patches( path )
  txt = read_text( path )
  parse_heading_patches( txt )
end

.table_heading_(line) ⇒ Object



6
7
8
9
10
11
12
13
14
15
16
# File 'lib/rsssf/fmtfix/tables.rb', line 6

## Turns a table heading line into regex source (for an extended-mode regex)
## that allows any number of spaces between the columns, e.g.
##
##   M   W  T  L  GF  GA  PTS  AVGE
##    =>
##   (?: [ ]+ M [ ]+ W [ ]+ T [ ]+ L [ ]+ GF [ ]+ GA [ ]+ PTS [ ]+ AVGE [ ]*)
def self.table_heading_( line )
   ## every run of spaces between two columns becomes [ ]+
   columns = line.strip.gsub( /[ ]+/, ' [ ]+ ' )

   "(?: [ ]+ #{columns} [ ]*)"
end

Instance Method Details

#_build_date(m) ⇒ Object

“internal” date helpers



305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
# File 'lib/rsssf/fmtfix/dates.rb', line 305

## Builds a date hash from the named captures of a date match.
##
##   m - MatchData or hash (with symbol keys) of captures;
##        reads :year, :yy, :month, :month_name, :day, :day_name, :around
##
## returns a hash with (only) the parts present:
##   :y, :yy, :m, :d, :wday (all numbers) and around: true
def _build_date( m )
  ## quick fix for undefined group name reference
  m = m.named_captures.transform_keys(&:to_sym)  if m.is_a?(MatchData)

  {}.tap do |date|
    date[:y]  = m[:year].to_i(10)   if m[:year]
    ## check - use y too for two-digit year or keep separate - why? why not?
    date[:yy] = m[:yy].to_i(10)     if m[:yy]     ## two digit year (e.g. 25 or 78 etc.)

    ## numeric month first; a month name (if present) wins
    ## note - allow any/upcase JULY/JUL etc. thus ALWAYS downcase for lookup
    date[:m]  = m[:month].to_i(10)                   if m[:month]
    date[:m]  = MONTH_MAP[ m[:month_name].downcase ] if m[:month_name]

    date[:d]    = m[:day].to_i(10)                   if m[:day]
    date[:wday] = DAY_MAP[ m[:day_name].downcase ]   if m[:day_name]

    date[:around] = true     if m[:around]
  end
end

#_build_date_legs(m) ⇒ Object



325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
# File 'lib/rsssf/fmtfix/dates.rb', line 325

## Builds the two dates of a two-legged match from the named captures.
##
##   m - MatchData or hash (with symbol keys) of captures;
##        reads :month_name1, :day1, :month_name2, :day2, :yy2
##
## returns { date1: {m:, d:}, date2: {d:, plus m: and yy: if present} }
def _build_date_legs( m )
  ## quick fix for undefined group name reference
  m = m.named_captures.transform_keys(&:to_sym)  if m.is_a?(MatchData)

  ## note - allow any/upcase JULY/JUL etc. thus ALWAYS downcase for lookup
  first = { m: MONTH_MAP[ m[:month_name1].downcase ],
            d: m[:day1].to_i(10) }

  second = {}
  second[:m]  = MONTH_MAP[ m[:month_name2].downcase ]  if m[:month_name2]
  second[:d]  = m[:day2].to_i(10)
  second[:yy] = m[:yy2].to_i(10)   if m[:yy2]   ## two digit year (e.g. 25 or 78 etc.)

  { date1: first, date2: second }
end

#_build_date_list(m) ⇒ Object



348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
# File 'lib/rsssf/fmtfix/dates.rb', line 348

## Builds the three (or four) dates of a date list from the named captures.
##
##   m - MatchData or hash (with symbol keys) of captures;
##        reads :month_name1..4 and :day1..4
##
## returns { date1:, date2:, date3: } plus date4: if :day4 is present;
##   every date has d: and - if a month name was captured - m:
##   (the month name of the first date is required)
def _build_date_list( m )
  ## quick fix for undefined group name reference
  m = m.named_captures.transform_keys(&:to_sym)  if m.is_a?(MatchData)

  (1..4).each_with_object( {} ) do |no, list|
    day = m[:"day#{no}"]
    next if no == 4 && !day     ## fourth date is optional

    month_name = m[:"month_name#{no}"]

    date = {}
    ## note - allow any/upcase JULY/JUL etc. thus ALWAYS downcase for lookup
    date[:m] = MONTH_MAP[ month_name.downcase ]   if no == 1 || month_name
    date[:d] = day.to_i(10)

    list[:"date#{no}"] = date
  end
end

#_build_date_range(m) ⇒ Object



381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
# File 'lib/rsssf/fmtfix/dates.rb', line 381

## Builds the start and end date of a date range from the named captures.
##
##   m - MatchData or hash (with symbol keys) of captures;
##        reads :month_name1, :day1, :month_name2, :day2
##
## returns { date1: {m:, d:}, date2: {d:, plus m: if present} }
def _build_date_range( m )
  ## quick fix for undefined group name reference
  m = m.named_captures.transform_keys(&:to_sym)  if m.is_a?(MatchData)

  ## note - allow any/upcase JULY/JUL etc. thus ALWAYS downcase for lookup
  from = { m: MONTH_MAP[ m[:month_name1].downcase ],
           d: m[:day1].to_i(10) }

  to = {}
  to[:m] = MONTH_MAP[ m[:month_name2].downcase ]   if m[:month_name2]
  to[:d] = m[:day2].to_i(10)

  { date1: from, date2: to }
end

#_fmt_date(date, format: nil) ⇒ Object

use format: ‘numeric’ for 23/7 or 23/7/2010 etc.



430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
# File 'lib/rsssf/fmtfix/dates.rb', line 430

## Formats a date hash (see _build_date) as a string.
##
##   date    - hash with :d, :m and optional :y, :yy, :wday, :around
##   format: - 'numeric' (any case) for  23/7 or 23/7/2010 or 23/7/98;
##             anything else for  Fri Feb 7 2025  style
##
## note - in the default style a two-digit year gets expanded to four digits:
##          0..29  => 2000..2029  and  30..99 => 1930..1999
def _fmt_date( date, format: nil )
    out = String.new

    if format && format.downcase == 'numeric'
      out << "#{date[:d]}/#{date[:m]}"

      if date[:y]       ## (optional) four-digit year e.g. 2010
        out << "/#{date[:y]}"
      elsif date[:yy]   ## (optional) two-digit year  e.g. 98
        out << ("/%02d" % date[:yy])    ## note - make sure 0,1,2 become 00, 01, 02
      end

      return out
    end

    ## default - use Fri Feb 7 2025
    ## check for "canonical" convention for around/ca. date or such
    out << "c. "   if date[:around]

    out << "#{FMT_DAY_NAMES[date[:wday]]} "  if date[:wday]
    out << "#{FMT_MONTH_NAMES[date[:m]]} "
    out << "#{date[:d]}"

    if date[:y]
      out << " #{date[:y]}"
    elsif date[:yy]
      century = date[:yy] < 30 ? '20' : '19'
      ## note - make sure 0,1,2 become 00, 01, 02
      out << (" #{century}%02d" % date[:yy])
    end

    out
end

#_fmt_date_legs(legs, format: nil) ⇒ Object

takes format: (e.g. ‘numeric’ for 23/7 or 23/7/2010 etc.) like _fmt_date, but the option is not used by this method (yet)



470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
# File 'lib/rsssf/fmtfix/dates.rb', line 470

## Formats the two dates of a two-legged match (see _build_date_legs), e.g.
##    May 30 & Jun 2   or   May 3 & 10   or   May 30 & Jun 2 1987
##
##   legs    - hash with :date1 and :date2
##   format: - accepted, but not used by this method
##
## note - a two-digit year (of the second date) gets expanded to four digits:
##          0..29  => 2000..2029  and  30..99 => 1930..1999
def _fmt_date_legs( legs, format: nil )
    first, second = legs[:date1], legs[:date2]

    out = String.new
    out << "#{FMT_MONTH_NAMES[first[:m]]} "
    out << "#{first[:d]}"
    out << " & "
    out << "#{FMT_MONTH_NAMES[second[:m]]} "  if second[:m]
    out << "#{second[:d]}"

    yy = second[:yy]
    if yy
      century = yy < 30 ? '20' : '19'
      ## note - make sure 0,1,2 become 00, 01, 02
      out << (" #{century}%02d" % yy)
    end

    out
end

#_fmt_date_list(list, format: nil) ⇒ Object

takes format: (e.g. ‘numeric’ for 23/7 or 23/7/2010 etc.) like _fmt_date, but the option is not used by this method (yet)



493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
# File 'lib/rsssf/fmtfix/dates.rb', line 493

## Formats a list of three or four dates (see _build_date_list), e.g.
##    May 2,3,4   or   Feb 28; Mar 1,2
##
##   list    - hash with :date1, :date2, :date3 and optional :date4
##   format: - accepted, but not used by this method
##
## note - a follow-up date with a month gets separated by "; ",
##        a follow-up date without a month by "," (no space)
def _fmt_date_list( list, format: nil )
    out = String.new

    out << "#{FMT_MONTH_NAMES[list[:date1][:m]]} "
    out << "#{list[:date1][:d]}"

    [:date2, :date3, :date4].each do |key|
      date = list[key]
      next if key == :date4 && !date    ## fourth date is optional

      out << if date[:m]   ## add extra space if month present
               "; #{FMT_MONTH_NAMES[date[:m]]} "
             else
               ","
             end
      out << "#{date[:d]}"
    end

    out
end

#_fmt_date_range(range, format: nil) ⇒ Object

takes format: (e.g. ‘numeric’ for 23/7 or 23/7/2010 etc.) like _fmt_date, but the option is not used by this method (yet)



527
528
529
530
531
532
533
534
535
536
537
# File 'lib/rsssf/fmtfix/dates.rb', line 527

def _fmt_date_range( range, format: nil )   ### use format: 'numeric' for  23/7 or 23/7/2010 etc.
    ## Formats a date range as "<month> <day>-[<month> ]<day>".
    ##
    ##   range  - hash-like with :date1 and :date2; every entry is read
    ##            via [:m] (key looked up in FMT_MONTH_NAMES) and [:d]
    ##   format - accepted, but not read by this method body
    ##
    ## Returns the formatted string.
    from  = range[:date1]
    upto  = range[:date2]

    out = String.new
    out << "#{FMT_MONTH_NAMES[from[:m]]} #{from[:d]}-"

    ## note - the month of the end date is only printed if present
    out << "#{FMT_MONTH_NAMES[upto[:m]]} "  if upto[:m]
    out << "#{upto[:d]}"

    out
end

#_norm_date(m, format: nil) ⇒ Object

note - line-by-line processing / matching



245
246
247
248
249
250
251
252
253
254
255
256
257
258
# File 'lib/rsssf/fmtfix/headers.rb', line 245

def _norm_date( m, format: nil )
  ## Builds and formats the date found in a (header) match.
  ##
  ##   m      - MatchData or a hash with symbol keys
  ##   format - passed through to the _fmt_date* helper
  ##
  ## Dispatches on the first truthy capture in this order:
  ##   :date_list, :date_legs, :date_range - otherwise a single :date is assumed.
  ## Returns whatever the selected _fmt_date* helper returns.

  ## quick fix for undefined group name reference
  ##   (MatchData#[] raises for unknown group names, a hash lookup returns nil)
  caps = m.is_a?(MatchData) ? m.named_captures.transform_keys(&:to_sym) : m

  return _fmt_date_list(  _build_date_list( caps ),  format: format )  if caps[:date_list]
  return _fmt_date_legs(  _build_date_legs( caps ),  format: format )  if caps[:date_legs]
  return _fmt_date_range( _build_date_range( caps ), format: format )  if caps[:date_range]

  ## assume caps[:date]
  _fmt_date( _build_date( caps ), format: format )
end

#_patch_heading(txt, rxs, title) ⇒ Object



89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# File 'lib/rsssf/fmtfix/patch_headings.rb', line 89

def _patch_heading( txt, rxs, title )
   ## Turns the first text match of a heading pattern into a "== ..." heading line.
   ##
   ##   txt   - page text
   ##   rxs   - list of patterns, tried in order (first match only per pattern)
   ##   title - replacement title  -or-
   ##           '*' to keep the matched text as heading (generic replace)
   ##
   ## Returns the patched text.
   keep_original = (title == '*')    ## use original title / do NOT replace/normalize

   rxs.each do |pattern|
     replaced = false
     txt = txt.sub( pattern ) do |hit|
               puts "  found heading match >#{hit}< replace with >== #{title}<"
               replaced = true

               if keep_original
                  ##  note - autoremove (optional) trailing colon (:) or dot (.)
                  "== #{hit.sub( /[.:]$/, '').rstrip}\n"
               else
                  "== #{title}\n"
               end
             end

     ## note - break on first match, but only if NOT generic replace
     ##          (generic replace keeps going with the remaining patterns)
     break  if replaced && !keep_original
   end

   txt
end

#_scan_outline(txt) ⇒ Object



171
# File 'lib/rsssf/fmtfix/outline.rb', line 171

def _scan_outline( txt )
  ## Returns all heading matches (HX_RE) found in txt,
  ##   exactly as reported by String#scan (empty array if none).
  txt.scan( HX_RE )
end

#autofix(txt) ⇒ Object



9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# File 'lib/rsssf/fmtfix/fmtfix-base.rb', line 9

def autofix( txt )
  ## Runs all text-level fixes over a page, in this order:
  ##   1) whitespace clean-up (tabs, windows newlines, unicode spaces)
  ##   2) handle_tables, handle_topscorers, handle_errata_txt  (whole text)
  ##   3) handle_header                                        (line-by-line)
  ##   4) handle_score, handle_goals                           (whole text)
  ##
  ##   txt - page text
  ##
  ## Returns the fixed text.

  ##
  ## make sure no tabs (expand to two spaces)
  txt = txt.gsub( "\t", '  ' )
  txt = txt.gsub( "\r\n", "\n" )  ## unify newline

  ## fix unicode spaces - note: spelled out as code points (NOT as literal
  ##   characters) because the literal is invisible and cannot be told apart
  ##   from a plain space in an editor:
  ##     U+00A0          no-break space
  ##     U+2000..U+200A  en quad .. hair space
  ##     U+202F          narrow no-break space
  ##     U+205F          medium mathematical space
  ##     U+3000          ideographic space
  txt = txt.gsub( /[\u00A0\u2000-\u200A\u202F\u205F\u3000]/, ' ' )


  txt = handle_tables( txt )     ## e.g. final/halfway table (aka standings)
  txt = handle_topscorers( txt )

  txt = handle_errata_txt( txt )


  #####
  ## line-by-line processing / matching
  ##   note - each_line keeps the newline; handle_header gets the line
  ##            with trailing whitespace removed and returns nil if no match
  ##            otherwise the reformatted (new) line incl. newline !!!
  txt = txt.each_line.map { |line| handle_header( line.rstrip ) || line }.join


  txt = handle_score( txt )
  txt = handle_goals( txt )


  ###
  ## todo
  ##   fix subs in lineup  in oost00.txt
  # Salzburg: Safar - Szewczyk (97./Lipcsei) - Winklhofer, C.Jank - Laessig,
  #        Hütter (71./Meyssen) - Nikolic, Aufhauser, Kitzbichler - Struber,
  #        Polster (56./Sabitzer)

  txt
end

#autofix_outline(txt, title:) ⇒ Object



36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
# File 'lib/rsssf/fmtfix/outline.rb', line 36

def autofix_outline( txt, title: )
     ## Normalizes the heading outline of a page:
     ##   1) collect all headings (HX_RE); return txt untouched if there are none
     ##   2) if the first heading sits on the top-most used level, is the only
     ##      heading on that level and its text equals the page title
     ##      - remove that heading line (otherwise log/warn why not)
     ##   3) renumber the used levels without gaps and rewrite every
     ##      heading marker (rewritten headings start at level 2)
     ##
     ##   txt   - page text
     ##   title - page title
     ##
     ## Returns the rewritten text.
     ##   note - dumps debug info and terminates via exit 1
     ##          if a heading level has no mapping!!!

     headings = txt.scan( HX_RE )

     ### note - shortcircuit if no headings found!!!
     return txt    if headings.empty?


     ## counts/usage of h1,h2,h3,h4,h5,h6 - note: index 0 is unused (nil)
     counts = [nil,0,0,0,0,0,0]
     headings.each { |marker,_| counts[ marker.size ] += 1 }

     ## flatten levels; only record levels with heading counts (ascending)
     levels = (1...counts.size).select { |level| counts[level] > 0 }


     #####
     ### special case for first heading
     ##    check if heading is matching title AND the only one in top level
     top_marker, top_header = headings[0]
     top_level = top_marker.size

     if top_level != levels[0]
          ## warn/log   - top heading NOT top!!
          msg = "top heading #{top_level} not top (#{levels[0]}) " +
                "in page with title #{title}"
          log( msg )
     elsif counts[top_level] != 1
          ## warn/log   - more than one top level heading!!!
          ##   maybe add headers in the future - why? why not?
          msg = "more than one (#{counts[top_level]}) top heading #{top_level} found " +
                "in page with title #{title}"
          log( msg )
     elsif top_header != title
          ## warn/log  - heading top NOT matching page title
          msg = "first top heading NOT matching page title  #{top_header} <=> #{title}"
          puts "!! WARN #{msg}"
          log( msg )
     else
          ##  same as title - pop (that is, remove) the top level
          counts[top_level] = 0   ## update/reset counter
          levels.shift            ## remove first level (inline op)!!!

          ###  note - MUST escape header for regex e.g. [Bra..] or 1.
          ###    e.g.  V COPA BRASIL - 1979 [Brazilian Championship]
          ###  note - Regexp.escape backslash-escapes spaces too,
          ###           thus, spaces in the header survive the x (extended) flag
          top_re = %r{
                            ^
                           [ ]* #{top_marker}
                           [ ]* #{Regexp.escape(top_header)}
                              .*?
                           $    ## or use \n - why? why not?
                         }x

          ## remove line in txt too (note - the newline itself is kept)
          txt = txt.sub( top_re ) do |match|
                   puts "   removing top heading matching title  -- >#{match}<"
                   ''
                end
     end


     ## map used levels (from) to gap-less levels 1,2,3.. (to)
     mapping = levels.each_with_index.map { |level,i| [level, i+1] }.to_h


     # rewrite headings
     txt.gsub( HX_RE ) do
         m = Regexp.last_match
         old_marker = m[:marker]
         old_level  = old_marker.size
         new_level  = mapping[old_level]

         unless new_level
            puts "!! no heading #{old_level} mapping found in page >#{title}<:"
            puts "match:"
            pp m
            puts "counts:"
            pp counts
            puts "levels:"
            pp levels
            puts "mapping:"
            pp mapping
            exit 1
         end

         ## note -  always start at level 2 (page title like in wikipedia is level 1)
         ##                  thus, new_level+1
         ##
         ##  maybe in the future use track trailing marker too
         ##   and rebuild heading/header instead of gsub
         if old_level - new_level + 1 > 0
            ## note - replaces every occurrence of the old marker in the matched
            ##          heading - leading (and possibly trailing) marker too
            m[0].gsub( old_marker, ('=' * new_level) + '=' )
         else
            m[0]
         end
     end
end

#build_outline(txt) ⇒ Object



173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
# File 'lib/rsssf/fmtfix/outline.rb', line 173

def build_outline( txt )
     ## Builds a plain-text summary of the heading outline of a page:
     ##   - one line with the used heading levels (h1..h6 or - if unused)
     ##   - one line with the heading count per level (or - if zero)
     ##   - one line per heading (level, marker, text)
     ##   - the anchors (aka a name) found in the text, if any
     ##
     ##   txt - page text
     ##
     ## Returns the summary as string.

     headings = txt.scan( HX_RE )

     counts = [nil,0,0,0,0,0,0]  ## note - index 0 is nil
                                 ##  index 1 (h1) is 0 etc.
     headings.each { |marker,_| counts[ marker.size ] +=1 }

     used   = (1..6).map { |level| counts[level]==0 ? '-' : "h#{level}" }
     totals = (1..6).map { |level| counts[level]==0 ? '-' : counts[level] }

     out = String.new
     out << "  outline:" << " " << used.join( '/' ) << "\n"
     out << "           " << totals.join( '/' ) << "\n"

     headings.each do |marker,text|
        out << ("    (%d) %-6s" % [marker.size, marker]) << "  " << text << "\n"
     end


     ## count anchors (aka a name)
     anchors = txt.scan( /‹§  [^›]+  ›/x )

     unless anchors.empty?
       out << "\n"
       out << "  aname #{anchors.size}: " << anchors.join( ',' ) << "\n"
     end

     out
end

#fmtfix(txt, heading_patches: nil) ⇒ Object



49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# File 'lib/rsssf/fmtfix/fmtfix.rb', line 49

def fmtfix( txt,  heading_patches: nil )
        ## Runs the complete format fix pipeline for one page.
        ##
        ##   txt             - page text (the title is taken from its meta data
        ##                      via Page.parse_meta; 'n/a' is used if missing)
        ##   heading_patches - optional patches handed to patch_headings;
        ##                      only applied if the page has no headings at all
        ##
        ## Returns the fixed text.

        ### note - step 1
        ##      autofix-outline
        ##  and patch headings/outline if empty
        ##        with at_headings.txt, de_headings.txt etc.
        title = Page.parse_meta( txt )[:title] || 'n/a'
        fixed = autofix_outline( txt, title: title )

        ## check if any headings / outline - patch only if there are none
        if heading_patches && _scan_outline( fixed ).size == 0
           fixed = patch_headings( fixed, heading_patches )
        end

        ## note - disabled for now:
        ##   add (quick) outline inside  <!-- source: ...  [auto-add here] -->
        ##   e.g.
        ##     <!--
        ##        source: https://rsssf.org/tableso/oost98.html
        ##      -->
        ##
        ##   outline = build_outline( newtxt )
        ##   newtxt = newtxt.sub( %r{^[ ]*<!--
        ##                  [ \n]*
        ##                    (source: .+?)
        ##                   [ \n]*
        ##                 -->
        ##              }ix,
        ##          "<!--\n  \\1\n\n#{outline} -->" )

        autofix( fixed )
end

#fmtfix_pages(pages, outdir:, path:, heading_patches: nil) ⇒ Object



15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# File 'lib/rsssf/fmtfix/fmtfix.rb', line 15

def fmtfix_pages( pages, outdir:,
                         path:,         ## (lookup search) path (array expected!!!)
                         heading_patches: nil )
     ## Runs fmtfix over a list of pages and writes the results to outdir.
     ##
     ##   pages           - list of page configs; config['page'] holds the page name/path
     ##   outdir          - output directory for the fixed <basename>.txt files
     ##   path            - (lookup search) path passed on to find_file!
     ##   heading_patches - passed on to fmtfix
     ##
     ## For every page the text version (<dirname>/<basename>.txt) gets
     ##   looked up, read, fixed and written to <outdir>/<basename>.txt.

     total = pages.size

     pages.each_with_index do |config,i|

            puts "==> #{i+1}/#{total} #{config.pretty_inspect}..."

            page = config['page']

            ## note - swap the page extension (whatever it is) for .txt
            stem = File.basename( page, File.extname( page ) )

            source = find_file!( "#{File.dirname( page )}/#{stem}.txt", path: path )
            fixed  = fmtfix( read_text( source ), heading_patches: heading_patches )

            write_text( File.join( outdir, "#{stem}.txt" ), fixed )
     end
end

#handle_errata_txt(txt) ⇒ Object



33
34
35
36
37
38
39
# File 'lib/rsssf/fmtfix/errata.rb', line 33

def handle_errata_txt( txt )
   ## Applies all known errata to the text - every (errata, replace) pair
   ##   of ERRATAS is run through gsub, one after the other.
   ##
   ## Returns the corrected text.
   ERRATAS.reduce( txt ) do |fixed, (errata, replace)|
      fixed.gsub( errata, replace )
   end
end

#handle_goals(txt, opts: {}) ⇒ Object



38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
# File 'lib/rsssf/fmtfix/goals.rb', line 38

def handle_goals( txt, opts: {} )
   ## Rewrites goal lines from square brackets [] to parentheses ().
   ##
   ##   txt  - page text
   ##   opts - set opts[:goals] to print every single-line goal match (debug)
   ##
   ## Steps (in order):
   ##   1) [pen] / [og] markers
   ##   2) goal blocks spanning two or three lines
   ##   3) single goal line following a line with a score
   ##      (plus an optional second goal line)
   ##
   ## Returns the rewritten text.


   ##
   ##  quick fix - change [pen] to (pen) and
   ##                     [og] to (og)
   ##   e.g. [Parkin 57 [og] - Nogan 47]
   ##        [McIndoe 11 [pen] Green 20, Blundell 90 - Robinson 74]
   txt = txt.gsub( '[pen]', '(pen)').gsub( '[og]', '(og)')



=begin
   ##   [15' Barisic, 80' Gilewicz; 10' (og) Barisic]
   ##  try (simple) goal line
   ##   note keep leading spaces / indent

##  note - first line must include a score!!
###      change to named captures!! - use \k<> !!!
   txt = txt.gsub( %r{^
                        (  .+?
                             \d{1,2}-\d{1,2}
                           .*?
                          \n
                         )
                     ([ ]*)
                       \[
                        ( .*?
                           \b\d{1,3}'  ## incl. minute
                          .*?
                        )
                      \]
                     [ ]*
                    $}ix,
                    '\1\2(\3)' )
=end



  ##  try (simple double) goal line
  ##   note keep leading spaces / indent
  ## [21' Dospel, 42' and 64' Mayrleb, 51' Datoru, 72' Sobczak; 25' and
  ## 90' B.Akwuegbu]
  ##  -or-
  ###  [Jose Manuel Jurado 12, Diego Forlán 40, 63,
  ##   "Simao" Pedro Fonseca 90]
  ##  [Rubén Suárez 10; Abdoulay Konko 12, 63, Alvaro Negredo 27,
  ##   "Renato" Dirnei Florencio 87]
  ##  -or- (three lines)
  ##  ["Edmilson" Gomes de Moraes 40, Marco Perez 68,
  ##   Ander Herrera 82; Fernando Fernandez 1, 27,
  ##   Juan Miguel Jimenez "Juanmi" 6, 28, Quincy Owusu-abeyie 35]
  ##  -or-
  ##  [Jose Manuel Casado 16,Emiliano Armenteros 20,
  ##   Jorge Andujar Moreno "Coke" 60; Jose Javier Barkero 14pen,
  ##   Jose Antonio Culebras 90+].
  ##    note - remove optional trailing dot
   multi_line_re = %r{^
                     ([ ]*)
                       \[
                        (            #{GOALS_}
                              \n     #{GOALS_}
                             (?:
                                 \n  #{GOALS_}
                             )?
                        )
                      \]
                      \.?  ## optional trailing dot
                      [ ]*
                    $}ix

   txt = txt.gsub( multi_line_re, '\1(\2)' )


   ## note - match for single line goes last !!
   ##
   ###    [Fernando Llorente 47]
   ##   [Sebastián Fernández 44; Aritz Aduriz 9, Joaquín Sanchez 71, 75]
   ##  try (simple) goal line with number only!!!
   ##   note keep leading spaces / indent
   ##
   ## Fluminense     3-0   0-2  São Caetano
   ##    [Magno Alves 70', 88', Roni 75']
   ##    [Daniel 15', Magrão 46'p]
   ##
   ## fix/fix/fix - merge with rule above!!!
   ##                make minute optional!!!
   single_line_re = %r{^
                      (?<match>  .+?
                                \d{1,2}-\d{1,2}
                                .*?
                         )
                          \n
                     (?<indent1>  [ ]*)
                       \[
                        (?<goals1> #{GOALS_})
                      \]
                     [ ]*
                (?:   ## check for second goal line following
                      ##   used in br for aggregate matches
                    \n
                    (?<indent2> [ ]*)
                       \[
                        (?<goals2> #{GOALS_})
                      \]
                     [ ]*
                )?
                    $}ix

   txt.gsub( single_line_re ) do |hit|
       m = Regexp.last_match

       if opts[:goals]
          puts "  match:"
          puts hit
       end

       ## note - keep the (score) line as is; swap brackets on goal line(s) only
       first  = "#{m[:match]}\n#{m[:indent1]}(#{m[:goals1]})"
       second = if m[:indent2] && m[:goals2]
                   "\n#{m[:indent2]}(#{m[:goals2]})"
                else
                   ''
                end
       first + second
   end
end

#handle_header(line) ⇒ Object



261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
# File 'lib/rsssf/fmtfix/headers.rb', line 261

def handle_header( line )
      ## Reformats a (round and/or date) header line.
      ##
      ##   line - single line of text
      ##
      ## The header patterns get tried in a fixed order; the first match wins.
      ##
      ## note - returns    newline (matched header line reformatted, incl. "\n")
      ##                    or nil (if no match!!)

      ## note - the first five patterns get matched against the line
      ##          with trailing whitespace removed only,
      ##        the remaining patterns against the line
      ##          with leading AND trailing whitespace removed
      rtrimmed = line.rstrip   ## expect chomp of newline "upstream" - why? why not?
      trimmed  = rtrimmed.strip


      if (m = HEADER_ROUND_RE.match( rtrimmed ))
         return "#{m[:round]} ▪\n"
      end

      if (m = HEADER_DATE_RE.match( rtrimmed ))
         ## e.g. [Nov 20]
         ## e.g. [April 1]
         return "_ #{_norm_date( m )} _\n"
      end

      if (m = HEADER_DATE_N_CITY_RE.match( rtrimmed ))
         ## e.g. [Jun 3, Ferrol]
         ## e.g. [Apr 2, Wembley]
         ##   [Sat May 17 - at Millennium Stadium, Cardiff]
         ##   [Sun May 25 - at Millennium Stadium, Cardiff]
         date = _norm_date( m )

         ##  note - check for special case
         ##     [Dec 10, replay]
         ##  change to  ▪ Replay ▪   _ Dec 10 _
         if m[:city] == 'replay'
            return "▪ Replay ▪  _ #{date} _\n"
         else
            return "_ #{date} _ @ #{m[:city]}\n"
         end
      end

      if (m = HEADER_DATE_II_RE.match( rtrimmed ))
         ##  note - no enclosing brackets []!!!
         ## e.g. Nov 20 1999  or Nov 20, 1999
         ##      Apr 1 2000   or Apr 1, 2000
         return "_ #{_norm_date( m )} _\n"
      end

      if (m = HEADER_DATE_ALT_RE.match( rtrimmed ))
         ## e.g. [07-09]
         ##      [30-05, Thaur]
         ## note - format: 'numeric' is (currently) NOT passed along
         out = String.new
         out << "_ #{_norm_date( m )} _"
         out << " @ #{m[:city]}"    if m[:city]
         out << "\n"
         return out
      end

      if (m = HEADER_ROUND_N_DATE_RE.match( trimmed ))
         return "#{m[:round]}#{_norm_date( m )}\n"
      end

      if (m = HEADER_ROUND_N_DATE_N_CITY_RE.match( trimmed ))
         date = _norm_date( m )
         return "#{m[:round]}#{date} @ #{m[:city]}\n"
      end

      if (m = HEADER_ROUND_N_CITY_RE.match( trimmed ))
         return "#{m[:round]} ▪  @ #{m[:city]}\n"
      end

      if (m = HEADER_ROUND_N_CITY_N_DATE_RE.match( trimmed ))
         ## note - reverse (rotate) date & city
         date = _norm_date( m )
         return "#{m[:round]}#{date} @ #{m[:city]}\n"
      end

      nil
end

#handle_score(txt) ⇒ Object



5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# File 'lib/rsssf/fmtfix/score.rb', line 5

def handle_score( txt )
    ## Rewrites score notes (extra time, penalties) from square brackets []
    ##   to parentheses () and unifies the penalty notation to "<score> pen".
    ##
    ##   txt - page text
    ##
    ## The rules get applied to the whole text, one after the other,
    ##   in the order listed. Returns the rewritten text.
    rules = [
      ## fix typos - move to errata
      [ 'paet, 3-4 pen]', '[aet, 3-4 pen]' ],

      ###  [aet]   => (aet)        -- after extra time
      ##   [asdet] => (asdet)      -- after sudden death extra time
      [ '[aet]',   '(aet)' ],
      [ '[asdet]', '(asdet)' ],

      ## [aet, 2-3 pen] => (aet, 2-3 pen)
      ##  [aet, 9-10 pen]
      ## [aet, 2-3pen]
      ## [aet, 7-6pen]
      [ %r{
             \[
                aet[,;.] [ ]?
                 (\d{1,2}-\d{1,2}) [ ]? pen
             \]
        }ix,
        '(aet, \1 pen)' ],

      ###  [aet, pen 4-3]
      ##   [aet, pen 2-4]
      ###    =>
      ##      (aet, 2-4 pen)
      [ %r{
             \[
                aet[,;.] [ ]?
                 pen [ ] (\d{1,2}-\d{1,2})
             \]
        }ix,
        '(aet, \1 pen)' ],

      ##   [5-4 pen]
      ##   [3-4 pen]
      ##   [1-3 pen], [1-3pen]
      [ %r{
             \[
               (\d{1,2}-\d{1,2}) [ ]? pen
             \]
        }ix,
        '(\1 pen)' ],

      ##   [Pen 4-1], [Pen 4-5], [Pen 1-3]
      ##      =>
      ##    (4-1 pen)
      [ %r{
             \[
               pen [ ] (\d{1,2}-\d{1,2})
             \]
        }ix,
        '(\1 pen)' ],

      ##  [5-3 PK], [6-5 PK]
      ##      =>
      ##   (6-5 pen)
      [ %r{
             \[
                (\d{1,2}-\d{1,2}) [ ] PK
             \]
        }ix,
        '(\1 pen)' ],
    ]

    ##
    ## check special case usage - uniques?
    ##    [8-7 pen(no extra time)]
    ##    [Pen 2-4 (1-3?)]

    rules.reduce( txt ) do |fixed, (pattern, replacement)|
       fixed.gsub( pattern, replacement )
    end
end

#handle_tables(txt, tables: []) ⇒ Object



282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
# File 'lib/rsssf/fmtfix/tables.rb', line 282

def handle_tables( txt, tables: [] )
   ## Collapses every table block (TABLE_RE) into a one-line comment marker.
   ##
   ##   txt    - page text
   ##   tables - list that collects the removed table blocks;
   ##            the marker id is the (1-based) position in this list
   ##
   ## Returns the text with the table blocks replaced by
   ##    <!-- $table<id>$ - <header> -->   or
   ##    <!-- $table<id>$ -->              (if the block has no header)
   txt.gsub( TABLE_RE ) do |block|
      header = Regexp.last_match[:header]   ## note - header might be missing
                                            ##   table starting w/ blank line

      puts "  proc table >#{header}< block:"
      puts ">>> (begin)"
      puts block
      puts "<<< (end)"

      ## note - do NOT remove everything; replace with "collapsed" marker
      ##    (or put in comment block later with command line option/switch!!)
      tables << block
      marker = "$table#{tables.size}$"

      if header
         "<!-- #{marker} - #{header} -->\n"
      else
         "<!-- #{marker} -->\n"
      end
   end
end

#handle_topscorers(txt, topscorers: [], opts: {}) ⇒ Object



29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# File 'lib/rsssf/fmtfix/topscorers.rb', line 29

def handle_topscorers( txt, topscorers: [], opts: {} )
   ## Collapses every topscorers block (TOPSCORERS_RE) into a comment marker.
   ##
   ##   txt        - page text
   ##   topscorers - list that collects the removed blocks;
   ##                the marker id is the (1-based) position in this list
   ##   opts       - set opts[:topscorers] to print every matched block (debug)
   ##
   ## Returns the text with the blocks replaced by
   ##    <!-- $topscorers<id>$ -->   (followed by a blank line)
   txt.gsub( TOPSCORERS_RE ) do |block|
      if opts[:topscorers]
         puts "  proc topscorers block:"
         puts block
      end

      ## note - do NOT remove everything; replace with "collapsed" marker
      ##    (or put in comment block later with command line option/switch!!)
      topscorers << block
      "<!-- $topscorers#{topscorers.size}$ -->\n\n"
   end
end

#patch_headings(txt, patches) ⇒ Object



116
117
118
119
120
121
122
# File 'lib/rsssf/fmtfix/patch_headings.rb', line 116

def patch_headings( txt, patches )
     ## Applies all heading patches to the text, one after the other.
     ##
     ##   txt     - page text
     ##   patches - (title, patterns) pairs; every pair is handed
     ##             to _patch_heading( txt, patterns, title )
     ##
     ## Returns the patched text.
     patches.reduce( txt ) do |patched, (title, rxs)|
        _patch_heading( patched, rxs, title )
     end
end