Class: SportDb::Lexer

Inherits:
Object
  • Object
show all
Defined in:
lib/sportdb/parser.rb,
lib/sportdb/parser/lexer.rb,
lib/sportdb/parser/token.rb,
lib/sportdb/parser/token-geo.rb,
lib/sportdb/parser/token-date.rb,
lib/sportdb/parser/token-note.rb,
lib/sportdb/parser/token-prop.rb,
lib/sportdb/parser/token-text.rb,
lib/sportdb/parser/token-time.rb,
lib/sportdb/parser/lexer_token.rb,
lib/sportdb/parser/token-goals.rb,
lib/sportdb/parser/token-group.rb,
lib/sportdb/parser/token-round.rb,
lib/sportdb/parser/token-score.rb,
lib/sportdb/parser/lexer-on_top.rb,
lib/sportdb/parser/token-status.rb,
lib/sportdb/parser/lexer-on_goal.rb,
lib/sportdb/parser/lexer-prep_doc.rb,
lib/sportdb/parser/lexer-tokenize.rb,
lib/sportdb/parser/lexer-prep_line.rb,
lib/sportdb/parser/token-prop_name.rb,
lib/sportdb/parser/token-score_full.rb,
lib/sportdb/parser/token-score_legs.rb,
lib/sportdb/parser/token-date--names.rb,
lib/sportdb/parser/lexer-on_group_def.rb,
lib/sportdb/parser/lexer-on_prop_misc.rb,
lib/sportdb/parser/lexer-on_round_def.rb,
lib/sportdb/parser/token-score_fuller.rb,
lib/sportdb/parser/token-date--helpers.rb,
lib/sportdb/parser/token-date_duration.rb,
lib/sportdb/parser/token-status_inline.rb,
lib/sportdb/parser/lexer-on_prop_lineup.rb,
lib/sportdb/parser/token-goals--helpers.rb,
lib/sportdb/parser/token-score--helpers.rb,
lib/sportdb/parser/lexer-on_prop_penalties.rb

Defined Under Namespace

Classes: Context, Token

Constant Summary collapse

ANY_RE =

general catch-all (RECOMMENDED (ALWAYS) use as last entry in union)

to avoid advance of pos match!!!
%r{
     (?<any> .)
}ix
SPACES_RE =
%r{
     ## a run of two or more blanks - tried first
     (?<spaces> \x20{2,} )
   |
     ## otherwise a lone blank
     (?<space>  \x20 )
}ix
ATTENDANCE_RE =

add att(endance) e.g. att: 18000

A v B 2-1  att: 18000
%r{
    (?<attendance>
     \b
        (?: attendance|att )
            : [ ]*
         (?<value>
              [1-9]
              (?: _? \d+ )*
         )
     \b
)}ix
TEAM_HOME_RE =

home/away/neutral - (h), (a), (n)

add support for h/a/n
   with (?-i \b [han] \b) lower-case and \b boundary - why? why not?
%r{  (?<team_home> \(h\) )}ix
TEAM_AWAY_RE =
%r{
   ## away marker - the letter a in parentheses; any case (see i flag)
   (?<team_away> [(] a [)] )
}ix
TEAM_NEUTRAL_RE =
%r{
   ## neutral marker - the letter n in parentheses; any case (see i flag)
   (?<team_neutral> [(] n [)] )
}ix
VS_RE =

note - only match case-sensitive (lower-case letters)!!! note - longer match first, e.g. vs before v etc.

%r{
    (?<vs>
       (?<=[ ])	# positive lookBEHIND for space
       (?-i:
           vs\.?|v
       )
       (?=[ ])   # positive lookAHEAD for space
    )
}ix
RE =

“top-level” regex used for:

- date_header
- match_header & match_line_more
- match_line
Regexp.union(
                    ## note - order matters; entries are listed in the order they get tried
                    ##   (see the "must go before/after" notes below)
                    SPACES_RE,
                    STATUS_RE,   ## match status e.g. [cancelled], etc.

                    INLINE_WO_RE,    ## (inline) match status - w/o (walkover)
                    INLINE_NP_RE,    ## (inline) match status - n/p (not played)
                    INLINE_BYE_RE,   ## (inline) match status - bye (advance to next round)
                    INLINE_ABD_RE,   ## (inline) match status - abd/abd. (abandoned)
                    INLINE_SUSP_RE,  ## (inline) match status - susp/susp.  (suspended)
                    INLINE_PPD_RE,   ## (inline) match status - ppd/ppd. or pstp/pstp. or postp/postp. or p-p (postponed)
                    INLINE_VOID_RE,  ## (inline) match status - x-x (voided)
                    INLINE_AWD_RE,   ## (inline) match status - awd/awd. (awarded)
                    INLINE_CANC_RE,  ## (inline) match status - canc/canc. (cancelled/canceled)


                    TEAM_HOME_RE,     ## (H)
                    TEAM_AWAY_RE,     ## (A)
                    TEAM_NEUTRAL_RE,  ## (N)

                    NOTE_RE,  ### fix - change to INLINE_NOTE !!!
                    DATE_LEGS_RE,  # note - must go before date!!!
                    DATE_RE,  ## note - date must go before time (e.g. 12.12. vs 12.12)
                    TIME_RE,

                    ATTENDANCE_RE,   # note - allow att: for now inline in matches too - why? why not?

                    SCORE_FULL_1ST_RE, # note - MUST go before SCORE_LEGS_RE!!
                 ##   e.g. 2-2, 5-1 pen.
                    SCORE_LEGS_RE,
                    SCORE_FULL_RE,
                    SCORE_FULLER_RE,
                    SCORE_FULLER_MORE_RE,
                    SCORE_AWD_RE,   #  (inline) score awarded e.g. 3-0 awd or 0-1 awd. etc.
                    SCORE_ABD_RE,   #  (inline) score abandoned e.g. 2-1 abd.
                    SCORE_RE,   ## note basic score e.g. 1-1 must go after SCORE_FULL_RE!!!

                    VS_RE,

                   TEXT_RE,

              %r{ (?<sym> [,@()-] ) }x,   ## todo - check if "standalone" comma (,) in use?
                   ANY_RE,    ## catch-all - kept as the last entry
)
START_WITH_ORD =

ord (for ordinal number)

e.g. (51) or (1) etc.  - limit digits of number - why? why not???
%r{
   \A
    [ ]*    ## ignore leading spaces (if any)
(?<ord>
  \(
   (?<value>\d+)
  \)
)}ix
START_WITH_YEAR =

e.g. 1930, 1986, 2002, 2010, 2022, 2026

     note - only YYYY
note - look out for clubs like  1860 München (de) !!!
                                1899 Hoffenheim (de)
                                1896 Löwenherz  (ch - a.k.a. FC Winterthur ??)
                any others starting with YYYY ?!
note - YEAR requires TWO (trailing) spaces !!!!! e.g.
   1930    Uruguay             4-2 Argentina
   1934    Italy               2-1 Czechoslovakia   (AET)
   2022    Argentina           3-3 France           (AET, 4-2 pen)

 do NOT match (iso date!!) -  2020-11-12
                              2020/11/12
                              2020.11.12 etc.
%r{
   \A
       [ ]*    ## ignore leading spaces (if any)
     (?<year>
        \d{4}
     )
     ## positive lookahead
       (?= [ ]{2} |   ## min. TWO spaces!!! or
           [ ]@ |   ##   space with geo marker or
           [ ]* \z  ##    year (date) header (end-of-line/string)
        )
}x
HEADING_RE =
%r{
  \A
  \x20*                          ## leading blanks (if any) are skipped
  (?<heading_marker> ={1,6} )    ## one to six equal signs open the heading
  \x20*
  (?<heading> [^=]+? )           ## lazy - title text (anything but =), may start with a digit
                                 ##   e.g. 1st round   or   Group A - 1
  \x20*                          ## trailing blanks stay outside of the capture
  =*                             ## closing markers are optional (any count)
  \x20*
  \z
}ix
GEO_TEXT_RE =
%r{
    ## text token for geo lines - captured as <text>
    ## must start with alpha (allow unicode letters!!), with number+letters or with '+letters
    (?<text>
          ## positive lookbehind - space, comma, ›, > or square bracket (or beginning of line)
           ##  (MUST be fixed number of chars - no quantifier e.g. +? etc.)
            (?<= [ ,›>\[\]]|^)
            (?:
                # opt 1 - start with alpha
                 \p{L}+    ## all unicode letters (e.g. [a-z])
                   |
                # opt 2 - start with num!! -
                     \d+  # number first - letters MUST follow (see below)
                      ## MAY be followed by (optional space) !
                      ## MUST be followed by a to z!!!!
                      [ ]?   ## make space optional too  - why? why not?
                             ##  yes - eg. 1st, 2nd, 5th etc.
                       \p{L}+
                  |
                ## opt 3 - add another weirdo case
                ##   e.g.   's Gravenwezel-Schilde
                ##   add more letters (or sequences here - why? why not?)
                    '\p{L}+
               )

               ##
               ## todo/check - find a different "more intuitive" regex/rule if possible?
               ##    for single spaces only (and _/ MUST not be surrounded by spaces)

              (?:
                  (?:
                    [ ]?   # only single (inline) space allowed - double spaces are breaks!!!
                    (?:
                       \p{L} | \d  | [.&'°]
                        |
                       (?: (?<! [ ])  ## no space allowed before (but possible after)
                            [-]
                       )
                         |
                       (?: (?<! [ ])  ## no spaces allowed around these characters
                           [_/]
                          (?! [ ])
                       )
                    )+
                  )
                  |
              ## for now allow auto-add optional
              ##   parenthesis enclosed closed text
              ##   e.g. Dublin (Dalymount Park)
              ##        Bucuresti (23 August)
              ##        Paris (Parc des Princes)
              ##        Ost-Berlin (Walter-Ulbricht)
              ##        Athinai (OAKA - Maroussi)
              ##
              ##   or   Valencia (Spain) or Solna
              (?:
                    [ ]
                    \(
                        [^()\[\],;:›<>]+    ## todo - add more special chars
                                            ##   maybe list only allowed ones??
                                            ##   make pattern more strict - why? why not?
                    \)
              )
          )*


              ## must NOT end with space or dash(-)
              ##  todo/fix - possible in regex here
              ##     only end in alphanum a-z0-9 (not dot or & ???)

            ## add lookahead/lookbehind
           ##    must be space!!!
           ##   (or comma, ›, >, square bracket or  start/end of string)
           ##   kind of \b !!!
            ## POSITIVE lookahead
            (?=[ ,›>\[\]]|$)

   )
}ix
GEO_END_RE =
%r{
   ## a comma captured as <geo_end> - only if a known prop key
   ##   (followed by a colon) comes right after, e.g.
   ##      , att: 18000     or   , referee: Name
   (?<geo_end>
        ,
    )
    ## POSITIVE lookahead for props
    ##   todo/fix - use generic [a-z]+ - why? why not?
    (?=
        [ ]*  ## optional spaces
         (?:     attendance|att
              |  referees?|refs?    ## fix - was referee?s (never matched singular referee)
          )
         :
    )
}ix
GEO_RE =
Regexp.union([
  SPACES_RE,
  GEO_END_RE,                          ## listed before the plain comma symbol below
  GEO_TEXT_RE,
  %r{  (?<sym> [,›>\[] ) }x,           ## single-char symbols
  ANY_RE,                              ## catch-all - last entry
])
DATE_I_RE =

e.g. Fri Aug 9

    Fri  Aug 9
   Fri, Aug 9
   Fri, Aug 9 2024
   Fri, Aug 9, 2024
        Aug 9, 2024
        Aug 9, 2024
note - eat-up optional comma after DAY_NAMES!!

   note - Fri Aug/9  no longer supported!!!
%r{
(?<date>
  \b
     ## optional day name
     ((?<day_name>#{DAY_NAMES})
           (?: ,?[ ]+)
     )?
     (?<month_name>#{MONTH_NAMES})
          [ ]
     (?<day>\d{1,2})
          \b
     ## optional year
     (      ,? [ ]       ## note - comma optional with single space required for now
            (?<year>\d{4})        ## optional year 2025 (yyyy)
              \b
     )?
)}ix
DATE_LEGS_I_RE =

todo/fix - add (opt) day_name later

add (opt) year later

e.g. Aug 9 & Aug 10 note - allow shortcut e.g. Aug 9 & 10

%r{
(?<date_legs>
 \b
     (?<month_name1>#{MONTH_NAMES})
          [ ]
     (?<day1>\d{1,2})
    [ ] & [ ]
     (?:
        (?<month_name2>#{MONTH_NAMES})
          [ ]
      )?  ## note - make 2nd month_name optional
     (?<day2>\d{1,2})
  \b
)}ix
DATE_II_RE =

e.g. 3 June or 10 June

 note - allow more spaces between  DAY_NAMES and DAY e.g.
  Sun  1 Mar
  Wed  4 Mar
  Sat 14 Mar
  Sat 11 Apr
  Sat 11 Apr 2021
  Sat 11 Apr 21

  Sat, 11 Apr
 note - eat-up optional comma after DAY_NAMES!!

note - Sat 14 Mar 17:30
        check two-digit year (with NEGATIVE lookahead for time!!!)
%r{
(?<date>
  \b
     ## optional day name
     ((?<day_name>#{DAY_NAMES})
           (?: ,?[ ]+)
     )?
     (?<day>\d{1,2})
         [ ]
     (?<month_name>#{MONTH_NAMES})
          \b
     ## optional year
     (  [ ]
        (?:
           (?<year>\d{4})        ## optional year 2025 (yyyy)
               |
            (?:
               (?<yy>\d{2})           ## optional year 25 (yy)
                ## check NEGATIVE lookahead
               (?! :|[:h]\d{2})
            )
        )
        \b
     )?
)}ix
DATE_III_A_RE =

e.g. iso-date - 2011-08-25

note - allow/support ("shortcuts") e.g 2011-8-25  or 2011-8-3 / 2011-08-03 etc.
%r{
(?<date>
  \b
   (?<year>\d{4})
       -
   (?<month>\d{1,2})
       -
   (?<day>\d{1,2})
  \b
)}ix
DATE_III_B_RE =

starting w/ day/month/year e.g. 25-08-2011

%r{
(?<date>
  \b
     ## optional day name
     ((?<day_name>#{DAY_NAMES})
          (?: ,?[ ]+)
     )?
   (?<day>\d{1,2})
       -
   (?<month>\d{1,2})
       -
   (?<year>\d{4})
  \b
)}ix
DATE_IIII_RE =

allow (short)“european” style 8.8.

note - assume day/month!!!
%r{
(?<date>
  \b
     ## optional day name
     ((?<day_name>#{DAY_NAMES})
           (?: ,?[ ]+)
     )?
   (?<day>\d{1,2})
       \.
   (?<month>\d{1,2})
       \.
   (?: (?:
          (?<year>\d{4})        ## optional year 2025 (yyyy)
              |
          (?<yy>\d{2})           ## optional year 25 (yy)
       )
        \b
   )?
)
}ix
DATE_IIIII_RE =

04/03/2026 or 4/3/2026

04/03/26   or 4/3/26
04/03      or 4/3
%r{
(?<date>
  \b
     ## optional day name
     ((?<day_name>#{DAY_NAMES})
          (?: ,?[ ]+)
     )?
   (?<day>\d{1,2})
       /
   (?<month>\d{1,2})
    \b
   (?:
        /
       (?:
          (?<year>\d{4})         ## optional year 2025 (yyyy)
              |
          (?<yy>\d{2})           ## optional year 25 (yy)
       )
      \b
   )?
)
}ix
DATE_RE =

map tables

note: order matters; first come-first matched/served
Regexp.union(
   DATE_I_RE,
   DATE_II_RE,
   DATE_III_A_RE,    ## e.g. 1973-08-14
   DATE_III_B_RE,    ## e.g. 14-08-1973
   DATE_IIII_RE,    ## e.g. 8.8. or 13.8.79 or 14.08.1973   (day.month - see DATE_IIII_RE)
   DATE_IIIII_RE,   ## e.g.  14/08/1973   (day/month - see DATE_IIIII_RE)
)
DATE_LEGS_RE =

todo - add more format style here; change to Regexp.union later!!!

DATE_LEGS_I_RE
NOTE_RE =

fix - use (?<text>) - text capture for inner text!!

use (?<note> for complete match as a convention!! )
%r{
\[ 
  (?<note>
     [^\[\]\#]*?    ## note - non-greedy/lazy operator
                    ##    exclude comments inside note block - why? why not?
  )
\]
}xi
NOTA_BENE_RE =

check for “literal” (multi-line) note blocks

 eg.  nb:  or note:          
 space required after colon - why? why not?

note - use \A (instead of ^) - \A strictly matches the start of the string.
%r{   \A
    [ ]*  ## ignore leading spaces (if any)
 (?: nb | note) [ ]* : [ ]+   
  (?<nota_bene>
       .+?  ## use non-greedy 
   )
    [ ]*  ## ignore trailing spaces (if any) 
   \z
}xi
PROP_KEY_WORD_ =

(i) starting w/ letters

         note - incl./allows digits (0-9)
e.g. a1, a2000, etc.

note - added back optional trailing dot (.) for abbrev. word !!!

%r{
       \p{L}
         [\p{L}\d]*
         \.?
}ix
PROP_KEY_NUM_ =

note - incl. optional dot or numsign e.g. 1. or 1°

%r{
           \d+
           [.°]?
}ix
PROP_KEY_NUMALPHA_ =

e.g. 1A, 1FC etc.

note - no trailing dot (.) for now - check if any cases exist in real world
%r{
         \d+
         \p{L}
          [\p{L}\d]*
}ix
START_WITH_PROP_KEY_RE =
%r{
  \A         ## note - MUST start line; leading spaces optional (eat-up)
(?<prop_key>
    [ ]*     ##  optional leading spaces
  (?<key>
      (?:
          ## (i) starting w/ letters
            #{PROP_KEY_WORD_}

          ## (ii) starting w/ number
          ##  e.g. 1fc, 1a,
          | #{PROP_KEY_NUMALPHA_}
          ##      followed by (optional dot) and
          ##                  optional space
          ##      MUST be followed by letter (a to z)!!!!
          ##   eg. 1[ fc], 1.[ fc], 1.[fc],  etc.
          | #{PROP_KEY_NUM_}   (?= [ ]? \p{L})
      )
      (?:
          ## connectors  - note - no dot (.), must match with abbrev word or num!!
           (?: ## (i)   single space or WITHOUT surrounding spaces!! - slash (/), dash (-)
               ##     e.g. do NOT match   one - two     or one / two
               ##                        only one-two   or  one/two

                 [ /-]

               ## (ii)     surrounded by leading or trailing optional space
               ##            c & a, etc.
               ##            d'ivoire, d' ivoire
               ##            borussia 'gladbach etc.
               ##              exclude space ' space - why? why not? (or ignore for now)
               ##
               ##    check for quotes  ('') - not really supported here
               ##              e.g. leading or trailing ' will NOT match

                |  [ ]? & [ ]?
                |  [ ]? '
                |  ' [ ]?

               #### (iii)
               ##   note - special "hack"  to connect WITHOUT space
               ##     for   Union 1.FC  and SKN St.Pölten or St.Pölten
               ##       connects      1.FC      => NUM+WORD
               ##                     1°Mayo    => NUM+WORD
               ##                     St.Pölten => ABBREV+WORD
               ##
               ## note - match WITHOUT (space) connector
               ##                  1.FC  (Union 1.FC Stein)
               ##               [WORD: "Union"], [NUM: "1."], [WORD: "FC"]
               ##                  St.Pölten (SKN St.Pölten)
               ##                [WORD: "SKN"], [ABBREV: "St."], [WORD: "Pölten"]
               |   (?<=  [.°] )
                   (?=  \p{L})
           )
            (?:
                  #{PROP_KEY_NUMALPHA_}
               |  #{PROP_KEY_NUM_}
               |  #{PROP_KEY_WORD_}
              )
      )*
     )       ## close <key> capture
   [ ]*?     ## slurp trailing spaces
    :

                ## positive lookahead (must be followed by space!!)
                ##     or allow end-of-line too
   (?= [ ]+|$)
  )  ## close <prop_key> capture
}ix
INLINE_CAPTAIN =
[c]

or [C] for marking player as captain

support [y ] too - or require Y - why? why not?
%r{ (?<inline_captain>
    \[ [cC] \]
)}x
INLINE_YELLOW =
%r{
  (?<inline_yellow>
      \[
        (?: y | Y )                          ## card marker - lower or upper case
        (?:                                  ## minute part is optional
            [ ]+
            (?<minute> \d{1,3} ) '?          ## minute with optional ' marker
            (?: \+ (?<offset> \d{1,2} ) '? )?   ## optional offset e.g. +2
        )?
      \]
  )
}x
INLINE_RED =
%r{
  (?<inline_red>
      \[
        (?: r | R )                          ## card marker - lower or upper case
        (?:                                  ## minute part is optional
            [ ]+
            (?<minute> \d{1,3} ) '?          ## minute with optional ' marker
            (?: \+ (?<offset> \d{1,2} ) '? )?   ## optional offset e.g. +2
        )?
      \]
  )
}x
INLINE_YELLOW_RED =
%r{
  (?<inline_yellow_red>
      \[
        (?: y/r | Y/R )                      ## all lower or all upper case only (no mix)
        (?:                                  ## minute part is optional
            [ ]+
            (?<minute> \d{1,3} ) '?          ## minute with optional ' marker
            (?: \+ (?<offset> \d{1,2} ) '? )?   ## optional offset e.g. +2
        )?
      \]
  )
}x
PROP_KEY_INLINE_RE =

simple prop key for inline use e.g.

Coach:  or Trainer:  or ...  add more here later
%r{
   \b
(?<prop_key>    ## note: use prop_key (NOT prop_key_inline or such)
  (?<key>
      \p{L}+
  )
   ## note - NO spaces allowed for key for now!!!
    :
   ## positive lookahead (must be followed by space!!)
   (?=[ ]+)
  )
}ix
PROP_NUM_RE =

note allow underscore inline e.g.

5_000

discuss/check - allow space inline (e.g. 5 000) - why? why not?

%r{
 \b
  (?<num>
      (?<value> [0-9]+
                 (?: _ [0-9]+)*
      )
  )
 \b
}x
ENCLOSED_NAME_RE =

todo/fix - allow more chars in enclosed name - why? why not?

                   e.g.  (') - Cote D'Ivore etc.
change to PAREN_NAME or PARENTHESIS or such - why? why not?
%r{
        (?<enclosed_name>
           \(
          (?<name>
              \p{L}+
              (?:
                 [ ]
                   \p{L}+
              )*
          )
            \)
        )
}ix
TEXT_RE =
%r{
    ## must start with alpha (allow unicode letters!!)
    (?<text>
           ## positive lookbehind
           ##  (MUST be fixed number of chars - no quantifier e.g. +? etc.)
            (?<=[ ,;@|\[\]]
                 |^
            )
            (?:
                # opt 1 - start with alpha
                 \p{L}+    ## all unicode letters (e.g. [a-z])
                   |

                # opt 2 - start with num!! - allow special case (e.g. 1. FC)
                     \d+  # check for num lookahead (MUST be space or dot)
                      ## MUST be followed by (optional dot) and
                      ##                      required space !!!
                      ## MUST be follow by a to z!!!!
                      [.°]?     ## optional dot (.) or degree(°) - todo - add number sign too!!
                      [ ]?   ## make space optional too  - why? why not?
                             ##  yes - eg. 1st, 2nd, 5th etc.
                       \p{L}+
                  |
                ## opt 3 - add another weirdo case
                ##   e.g.   's Gravenwezel-Schilde
                    '[s] [ ] \p{L}+
               )


              (?:(?:  (?:[ ]   # only single spaces allowed inline!!!
                          ## note - exclude (v[ ]/vs[ ]/vs.[ ])
                          ##    AND switch to case-sensitive (via -i!!!)
                        (?! (?-i: (?:  ## note - (big) V not matching for versus!!!
                                      vs\.?|v|

                                      n/p|N/P|
                                      w/o|W/O|
                                      abd\.?|ABD|
                                      aban\.?|ABAN|
                                      susp\.?|SUSP|
                                      ppd\.?|PPD|
                                      pst\.?|PST|
                                      po?stp\.?|PO?STP|P-P|
                                      x-x|X-X|
                                      awd\.?|AWD|
                                      canc\.?|CANC ) [ ]
                                        |
                                  (?: bye|BYE ) (?:[ ]|$))
                          )
                      )
                      |
                     [/-]   ## must NOT be surrounded by spaces
                  )?
                (?:
                  \p{L}
                     |
                  (?:   ## note - restrict [.&'] to single char usage (no doubled e.g. && etc.)
                    \. (?! \.)  ## allow single points only (now two or more etc.)
                     |
                    & (?! &)
                     |
                    ' (?! ')
                   )
                     |
                 (?:
                   \d+
                   (?!
                     [0-9h'+] |    ## protected break on 12h / 12' / 1-1
                                    ##  check usege for 3+4 - possible? where ? why?
                     (?:[.:-]\d)     ## protected/exclude/break on 12.03 / 12:03 / 12-12
                                      ##  BUT allow Park21-Arena for example e.g. 21-A :-)
                    )
                    [°]?  ## followed by optional ord
                   ## negative lookahead for numbers
                   ##   note - include digits itself!!!
                   ##   note - remove / (slash) e.g. allows UDI'19/Beter Bed
                 )
               )
              )*  ## must NOT end with space or dash(-)
              ##  todo/fix - possible in regex here
              ##     only end in alphanum a-z0-9 (not dot or & ???)


            ## allow optional at the end
            ##  tag or year
            ##   make it and in the future - why? why not?
            ##
            ## change - fix
            ##   do NOT use (A) for amateur
            ##   use A or A. with NO ()!!!
            ## (A) -    allow with predined  alpha only for now
            ##          e.g. (A) - amateur a team or b?
            ###  same for U21 or U9 etc
            ##        use with NO ()!!! - why? why not?
            ##      or U21 U9 etc.   - why? why not?
            ##       or etc.
            ## (1879-1893) or allow years e.g. (1879-1893)
            ###
            ##    add allow country code three to five letters for now
            ##       change to generic 1 to 5 - why? why not?
            ##     e.g. (A), (I),
            ##          (AUT)
            ##          (TRNC)   five? for UEFA code for northern cyprus
            ##     change to 1 to 4 - why? why not?
            ##   check - fix possible for upper case only here
            ##                     inline for this group only?
            (?:
               [ ]
               \(
                  \d{4}-\d{4}
               \)
            )?
             (?:
                ######
                # check for country code (cc)
                #       e.g. (AUT) or ,AUT or AUT
                (?:
               [ ]   ## note - do NOT allow more than one space!!! - why? why not?
                   \(
                       ## note - auto-exclude reserved (aet)  from SCORE_FULLER_MORE!!!
                       ##     plus golden goal (gg)/sudden death (sd), silver goal (sg)
                       ##    (ht), (ft)
                       (?! (?: aet | agget | asdet | asget | ht | ft )
                             \)
                       )
                     (?:
                       [A-Z]{1,5}
                     )
                  \)
                )
                  |
                (?:
                    [ ]*[,›>][ ]*
                        [A-Z]{1,5}
                     \b
                )
             )?
            ## add lookahead/lookbehind
           ##    must be space!!!
           ##   (or comma or  start/end of string)
           ##   kind of \b !!!
            ## positive lookahead
            (?=[ ,;@|\[\]]
                 |$
            )
   )
}ix
TIME_RE =
%r{
        ## time token e.g. 18:30 or 18h30 - captured as <time>
        ##   with optional (hard-coded) timezone and
        ##   optional local time in parentheses e.g. 18:30 UTC+2 (13:30 BRT)
        \b
    (?<time>
             (?<hour>\d{1,2})
                   [:h]
              (?<minute>\d{2})

                 #### optional (inline) timezone
                 ##    note - non-utc timezone MUST be hard-coded (added) here!!!
                 ##     avoids eating-up team names (separated by one space)
                 ##            e.g.  18:30 MEX v MEX
                 (?:
                    [ ]  ## require space - why? why not
                     (?<timezone>
                        (?:
                          ## GMT   - Greenwich Mean Time
                          ## BST   - British Summer Time
                          ## CES?T - Central European (Summer) Time
                          ## EES?T - Eastern European (Summer) Time
                          ##
                          (?: GMT|BST|CES?T|EES?T)
                               (?: /
                                   UTC  (?: [+-]\d{1,4} | ±0)
                               )?
                          )
                          |
                          (?:
                             UTC  (?: [+-]\d{1,4} | ±0)
                          )
                     )
                 )?
        )
      \b

####
###  note - local time is now INLINE and MUST follow time
###    note - sits outside of the <time> capture
       (?:
           [ ]+   ## todo/check - make space optional - why? why not?
           \(
        (?<time_local>
                (?<local_hour>\d{1,2})
                   [:h]    ### todo/fix - MUST match style in time above!!!
                           ###   use capture with backref!!!!
                (?<local_minute>\d{2})

                ####
                ## optional "local" timezone name eg. BRT or CEST etc.
                (?:
                    [ ] ## require space - why? why not
                   (?<local_timezone>
                      (?:  [A-Z]{3,4}
                           (?: /
                                   UTC (?: [+-]\d{1,4} | ±0)
                           )?
                      )
                      |
                      (?:     ## e.g. 0 or 00 or 0000
                          UTC   (?: [+-]\d{1,4} | ±0)
                      )
                  )
               )?  # note - make timezone  optional!!!
          )
      \)
       )?
}ix
START_GOAL_LINE_RE =

note - assume lines starting with opening ( are goal lines!!!!

note - use \A (instead of ^) - \A strictly matches the start of the string.

 note -  check for negative lookahead
               to exclude ord (numbers) e.g.  (1), (42), etc.!!!

todo/fix -- exclude (a), (h), (n)  - TEAM_AWAY, TEAM_HOME, TEAM_NEUTRAL tokens!!
%r{
                    \A
                       [ ]*    ## ignore leading spaces (if any)
                      \(

                      # check NEGATIVE lookahead
                      (?!
                            ##  exclude (a), (h), (n)
                            ##    TEAM_AWAY, TEAM_HOME, TEAM_NEUTRAL
                            (?: a|h|n )
                            \)
                       )

}xi
START_GOAL_LINE_COMPAT_RE =
%r{
  \A
  [ ]*                         ## leading blanks (if any) are skipped
  \(

  ## (i) NEGATIVE lookahead - a score (e.g. 1-1) right after the paren rules the line out
  (?! [ ]* \b \d-\d \b )

  ## (ii) POSITIVE lookahead - a minute must follow the paren
  ##        e.g. 45  or  45'  or  90+2'
  (?= [ ]*
      \d{1,3} '?
      (?: \+ \d{1,2} '? )?
  )
}xi
START_GOAL_LINE_ALT_RE =

check for goal line (alternate syntax)

(1-0 Player, 1-1 Player, ...)
must start-off OR yes, include score

note - allow "centered" style e.g.
       (    Player 44' (p)  1-0
                            1-1 Player 64'   )
%r{
    \A
       [ ]*    ## ignore leading spaces (if any)
     \(

     # check POSITIVE lookahead
      (?=  .*?         ## note - non-greedy
               \b \d-\d \b    ## score e.g. 0-1
        )
}xi
GOAL_NONE_RE =

e.g. (-; Metzger)

%r{ (?<goals_none>
       -[ ]*;
   )
}x
GOAL_SEP_ALT_RE =
%r{
  (?<goal_sep_alt>
      (?<= \x20 )          ## positive lookbehind - blank required right before
      [-]
      (?= \x20 | \z )      ## positive lookahead - blank (or end of string) required right after
  )
}x
GOAL_COUNT_RE =

e.g. (2)

 (2/p), (2/pen.), (3/2p), (3/ 2 pen.)
-or-  (2,1pen), (3, 2 pens)

 (p), (pen.) (2 pen.), (2p)
 (og), (o.g.),
  (2og), (2 o.g.), (2ogs)
%r{
   (?<goal_count>
      \(
        (?:
          ## opt penalties
            (?<pen>
              (?:  (?<pen_value> \d{1,2}) [ ]? )?
                 (?:pens|pen\.?|p)
           )
            |
          ## opt own goals (og)
            (?<og>
             (?: (?<og_value> \d{1,2}) [ ]? )?
                (?:ogs?|o\.g\.|o)
            )
            |
          ## opt fallback - classic count/number
          (?:  (?<value> [1-9])
                ## check for option penalties
                (?<pen>
                     [,/] [ ]*
                     (?: (?<pen_value> \d{1,2}) [ ]? )?
                     (?:pens|pen\.?|p)
                )?
           )
         )
      \)
)}ix
MINUTE_RE =

note - inline \b check in MINUTE_RE excludes

    85pen  or 90+4pen or 38p
      (possible and NOT excluded in GOAL_MINUTE_RE  !!!)

minute with optional stoppage (offset)
%r{
     (?<minute>
               \b
             (?<value>\d{1,3})      ## constrain numbers to 0 to 999!!!
                \b
                '?    ## optional minute marker

                (?: \+ (?<value2>\d{1,2})
                       \b
                      '?    ## optional minute marker
                 )?
      )
}ix
GOAL_MINUTE_NA_RE =

keep separate? or add simply inside GOAL_MINUTE_RE - why? why not?

fix-fix-fix - move into GOAL_MINUTE_RE !!!
%r{
     (?<goal_minute_na>

       # positive lookbehind
       (?<=[ ,;])

       (?<value> \?{1,2})
            '?    ## optional minute marker
     ## note - add goal minute qualifiers here inline!!!
        (?:
            (?: [ ]? (?<og>   (?: \((?:og|o\.g\.|o)\))   ## allow (og)
                                   |
                              (?: (?:og|o\.g\.|o))      ## allow plain og
                      )
            )
            |
            (?: [ ]? (?<pen>  (?: \((?:pen\.?|p)\))   ## allow ()
                                   |
                              (?: (?:pen\.?|p))
                      )
            )
            |
            ## add experimental header qualifier
            (?: [ ]? (?<hdr> \( (?:hdr\.?|h ) \) | (?: hdr\.?|h ) ))
            |
            ## add experimental free kick qualifier
            (?: [ ]? (?<fk> \( (?:fk\.?|f ) \) | (?: fk\.?|f) ))
        )?

     ## note - check positive lookahead
     (?=[ ,;)]|$)
   )
}ix
GOAL_MINUTE_RE =

goal types (pen.) or (pen) or (p.) or (p) (o.g.) or (og)

 todo/check - keep case-insensitive
                 or allow OG or P or PEN or
                 only lower case - why? why not?

add (gg) for golden goal - why? why not?
add (sg) for silver goal - why? why not??
%r{
     (?<goal_minute>
               \b
             (?<value>\d{1,3})      ## constrain numbers to 0 to 999!!!
                '?    ## optional minute marker

                 (?: \+ (?<value2>\d{1,2})
                      '?    ## optional minute marker
                 )?

        ## note - add goal minute qualifiers here inline!!!
        (?:
            (?: [ ]? (?<og>   (?: \((?:og|o\.g\.|o)\))   ## allow (og)
                                   |
                              (?: (?:og|o\.g\.|o))      ## allow plain og
                      )
            )
            |
            (?: [ ]? (?<pen>  (?: \((?:pen\.?|p)\))   ## allow ()
                                   |
                              (?: (?:pen\.?|p))
                      )
            )
            |
            ## add experimental header qualifier
            (?: [ ]? (?<hdr> \( (?:hdr\.?|h ) \) | (?: hdr\.?|h ) ))
            |
            ## add experimental free kick qualifier
            (?: [ ]? (?<fk> \( (?:fk\.?|f ) \) | (?: fk\.?|f) ))
        )?

        ##  add experimental seconds
        ##    e.g. (95 secs) or (95sec) etc.
        (?: [ ]*  \(
                      (?<secs>\d{1,3})
                         [ ]?secs?
                   \)
        )?
     )

     ## note - check positive lookahead
     (?=[ ,;)]|$)
}ix
GOAL_TYPE_RE =
%r{
     (?<goal_type>
               \(
                 (?:
                      (?<og>  og|o\.g\.|o )
                         |
                      (?<pen> pen\.?|p )
                         |
                     ## add experimental header qualifier
                      (?<hdr>  hdr\.?|h )
                         |
                     ## add experimental free kick qualifier
                       (?<fk>  fk\.?|f )
                  )
                \)
)}xi
START_WITH_GROUP_DEF_LINE_RE =

check for start of group def line e.g.

     Group A  | ...
     Group 1  : ....
     Group A2 | ....
note - use \A (instead of ^) - \A strictly matches the start of the string.
%r{
   \A
   [ ]*  ## ignore leading spaces (if any)
   (?<group_def>
       Group
        [ ]
        [a-z0-9]+   ## todo/check - allow dot (.) too e.g. 1.A etc.- why? why not?
   )
   ###   positive lookahead MUST be : OR |
   (?= [ ]*
       [:|]
       [ ])  ## note: requires space for now after [:|] - keep - why? why not?
}ix
ROUND_OUTLINE_I_RE =

note - use \A (instead of ^) - \A strictly matches the start of the string.

todo - add support for trailing markers e.g.
  ▪ Round 1 ▪▪▪▪▪▪▪▪
  :: Round 1 ::::::::::::

check - allow without space (like in heading =Heading 1=) - why? why not?
  ▪Round 1▪▪▪▪▪▪▪▪
  ::Round 1::::::::::::
%r{   \A
    [ ]*  ## ignore leading spaces (if any)
  (?<round_marker>
        [▪]{1,3}     ## BLACK SMALL SQUARE e.g. ▪,▪▪,▪▪▪
   )
    [ ]+
     (?<round_outline>
        ## must start with letter - why? why not?
        ###   1st round
        ##  allow numbers e.g. Group A - 1
        ##
        ##  note - CANNOT incl. :| !!!
        ##   used for markers for defs/definitions
        [^:|]+?   ## use non-greedy
     )
     (?:
        [ ]+
        [▪]+
     )?
     [ ]*  ## ignore trailing spaces (if any)
   \z
}xi
ROUND_OUTLINE_II_RE =
%r{   \A
    [ ]*  ## ignore leading spaces (if any)
  (?<round_marker>
         ::{1,3}     ## e.g. ::,:::,::::
   )
    [ ]+
     (?<round_outline>
        ## must start with letter - why? why not?
        ###   1st round
        ##  allow numbers e.g. Group A - 1
        ##
        ##  note - CANNOT incl. :| !!!
        ##   used for markers for defs/definitions
        [^:|]+?   ## use non-greedy
     )
     (?:
        [ ]+
        ::+
     )?
    [ ]*  ## ignore trailing spaces (if any)
   \z
}xi
ROUND_OUTLINE_RE =
Regexp.union(  ROUND_OUTLINE_I_RE,
   ROUND_OUTLINE_II_RE,
)
ROUND_DEF_OUTLINE_RE =

note - for def(initions) only one level support

that is, no round outline additions possible (e.g ▪▪ 1st leg etc.)
%r{   \A
     [ ]*  ## ignore leading spaces (if any)
    (?: [▪]  ## BLACK SMALL SQUARE
         |
        :: )
     [ ]+
      (?<round_outline>
         [^:|]+?   ## use non-greedy
      )
     [ ]*  ## ignore trailing spaces (if any)
    ###   possitive lookahead MUST be : OR |
     (?= [:|]
         [ ])  ## note: requires space for now after [:|] - keep - why? why not?
}ix
SCORE_AWD_RE =

note - keep AWD w/o dot - why? why not?

%r{
            (?<score_awd>
 \b
  (?<score1>\d{1,2}) - (?<score2>\d{1,2})
    [ ]?
      (?-i: awd\.? | AWD )
  ## POSITIVE lookahead - requires space
  (?= [ ])
)}ix
SCORE_ABD_RE =

add support for score abandoned (inline style)

2-1 abd.   or 2-1 ABD
%r{
            (?<score_abd>
 \b
  (?<score1>\d{1,2}) - (?<score2>\d{1,2})
    [ ]?
     (?-i: abd\.? | ABD )
  ## POSITIVE lookahead - requires space
  (?= [ ])
)}ix
SCORE_RE =

2-1

note - was SCORE__FT__RE
         changed to "generic" SCORE_RE
              and
           (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
    changed
           (?<score1>\d{1,2}) - (?<score2>\d{1,2})
              to
           pattern match not necessarily the full-time (ft) scoreline!!!
  - pattern also used for goal seq(uence) e.g. 1-0 Kane, 1-1 Johnson
%r{
            (?<score>
 \b
  (?<score1>\d{1,2}) - (?<score2>\d{1,2})
 \b
)}ix
POSTPONED =
%Q{ (?<postponed> postponed  | pst\\.? | po?stp\\.?  | ppd\\.? ) }
CANCELED =

add can/can. - why? why not?

%Q{ (?<canceled>  cancell?ed | canc\\.? ) }
WALKOVER =

add o/w too - why? why not?

%Q{ (?<walkover>  walkover   | w/o  | wo ) }
AWARDED =
%Q{ (?<awarded>   awarded    | awd\\.? ) }
SUSPENDED =
%Q{ (?<suspended> suspended  | susp\\.? ) }
ABANDONED =
%Q{ (?<abandoned> abandoned  | aban\\.?  | abd\\.? ) }
ANNULLED =
%Q{ (?<annulled>  annulled ) }
VOIDED =

note - alternative (name) to annulled

%Q{ (?<voided>    voided     | void ) }
REPLAY =
%Q{ (?<replay>    replay     | repl\\.? ) }
STATUS_RE =

note - status_note incl. complete text incl. <status> (not normalized)

<status> gets normalized e.g. ppd => postponed etc.
%r{
            \[
      (?:
#############################################
### opt 1 - allow long forms with note/comment for some stati
##                    e.g. [postponed due to tropical storm "Hanna"]
##                         [suspended at 84' by storm; result stood]
#########################
           (?: (?<status_note>
                  (?<status>
               ####################
               ## pre-match (not played)
                    #{POSTPONED}
                           |
                    #{CANCELED}
                           |
                    #{WALKOVER}
                           |
               ######################
               ## pre/post match
                     #{AWARDED}
                            |
               ########################
               ## post match - (partially) played
                    #{SUSPENDED}
                            |
                    #{ABANDONED}
                            |
                    #{ANNULLED}
                            |
                    #{VOIDED} ### note - alternative to annulled
              )     ## end-of-<status>
                  [ :;,-]+     ## leading spaces (or separators)
                  [^\]]+?      ## note - add non-greedy match
              ) ## end-of-<status-note>
              [ ]*  ## eat-up optional trailing spaces
            )
            |
########################################
## opt 2 - short form only (no note/comments) e.g. [postponed], [Canceled], etc.
####################################
            (?<status>
         ####################
         ## pre-match (not played)
               #{POSTPONED}
                 |
               #{CANCELED}
                 |
               #{WALKOVER}
                 |
         ######################
         ## pre/post match
               #{AWARDED}
                 |
         ########################
         ## post match - (partially) played
               #{SUSPENDED}
                 |
               #{ABANDONED}
                 |
               #{ANNULLED}
                 |
               #{VOIDED}   ### note - alternative to annulled
                 |
               #{REPLAY}       ### todo/fix - keep replay - why? why not?
                                  ###   prefer replay in round e.g.
                                  ##       ▪ Round 17, Replay
                                  ##       ▪ Semi-finals, Replays
            )
      )
    \]
}ix
GOAL_RE =
Regexp.union(
    SPACES_RE,
    GOAL_NONE_RE,
    GOAL_MINUTE_RE,
    GOAL_MINUTE_NA_RE,
    GOAL_COUNT_RE,
    PROP_NAME_RE,    ## note - (re)use prop name for now for (player) name
    GOAL_SEP_ALT_RE,   ##  note - add dash (-) with (required) spaces
    /  (?<sym> [;,)])  /x
    ## todo/fix - add ANY_RE !!!!
)
GOAL_ALT_RE =
Regexp.union(
    SPACES_RE,
    SCORE_RE,        ## e.g.  1-0, 0-1, etc.
    GOAL_MINUTE_RE,
    GOAL_TYPE_RE,
    PROP_NAME_RE,    ## note - (re)use prop name for now for (player) name
    /  (?<sym> [,)])  /x    ## note - no semicolon (;)
    ## todo/fix - add ANY_RE !!!!
)
GOAL_COMPAT_RE =
Regexp.union(
    SPACES_RE,
    SCORE_RE,        ## e.g.  1-0, 0-1, etc.
    MINUTE_RE,          ## note - matches minute e.g.  92, 7, 7' 7+3, etc.
    GOAL_TYPE_RE,
    PROP_NAME_RE,    ## note - (re)use prop name for now for (player) name
    /  (?<sym> [,)])  /x    ## note - no semicolon (;)
    ## todo/fix - add ANY_RE !!!!
)
HTML_COMMENT_RE =
%r{  <!--
     .*?   ## note - use non-greedy/lazy *? match
  -->
}xm
PREPROC_NOTA_BENE_RE =

check for “literal” (multi-line) note blocks

eg.  nb:  or note:
space required after colon - why? why not?
%r{
         ^
    [ ]* (?: nb | note) [ ]* : [ ]+
       .+?  ## non-greedy

    ## positive lookahead
    ##    note - must end with blank line or end-of-file/document
      (?=          \n[ ]*\n
                 | \z
        )
}xim
PREPROC_BLOCK_RE =

note - [] block may NOT incl. square brackets

   what about comments (e.g. #)?
todo/check - rename to NOTE_BLOCK or TEXT_BLOCK or ???
%r{  \[
                      [^\[\]\#]*?  ## note - use non-greedy/lazy *? match
                  \]
}xm
PROP_NAME_WORD_ =
%r{
       \p{L}+
         \.?     ## optional dot
}ix
PROP_NAME_RE =

name different from text (**does NOT allow number in name/text**)

different from PROP_KEY too
%r{
                 (?<prop_name>
                      \b
                   (?<name>
                        #{PROP_NAME_WORD_}

                          ## connectors
                          (?:
                             ## (i) space - only one single space allowed inline!!!
                              (?:
                               ### check if negative lookbehind is redudant!!
                               ##    next char is \p{L} and NOT space
                               ##    thus double space not possible!!
                                (?<! [ ])             ## use negative lookbehind
                                  [ ]
                                (?=  \p{L}|['"]\p{L})      ## use lookahead
                              )
                              ## (ii) support (inline) quoted name e.g. "Rodri" or such
                                 | (?:
                                     (?<=[ ])   ## use positive lookbehind
                                     " \p{L}+ "
                                      ## require space here too - why? why not?
                                   )
                              ## (iii) dash (-)
                              | (?:
                                ## use  POSITIVE lookBEHIND
                                ## note - allow leading dot (.) e.g. K.-H.Förster
                                ##                short for          Karl-Heinz Förster
                                ##
                                ##    change to negative lookBEHIND   [ '"-]
                                ##      \p{L}\. | \p{L} - not MUST be fixed size
                                 (?<=
                                         [\p{L}.]
                                      )
                                 [-]   ## must be surrounded by letters
                                       ## e.g. One-Two NOT
                                       ##      One- Two or One - Two or One -Two etc.
                                (?= \p{L})      ## use lookahead
                              )
                                 |
                              (?:  ## flex rule for quote - allow any
                                    ##  only check for double quotes e.g. cannot follow other ' for now - why? why not?
                                    ##        allows  rodrigez 'rodri' for example
                                (?<!')  ## use negative lookbehind
                                   '
                              )
                            |   ## standard case with letter(s) and optional dot
                              #{PROP_NAME_WORD_}
                          )*
                    )
                ## add lookahead - must be non-alphanum
                ##    add colon (:) too - why? why not?
                  (?= [ ,;\]\)]|$)
)}ix
P_EN =

english helpers (penalty, extra time, …)

 note - p must go last (shortest match)
   pso = penalty shootout
- note - remove PSO for now (may add later back) - why? why not?

todo/fix/clean-up - keep it simple -  remove optional trailing dot (.)
                     from pen., p., agg. etc. - why? why not?
                      always use (simply) pen, p, agg
                    (also) remove  a.e.t. / a.e.t option - why? why not?

UPDATE mar/2026:  added pens too - keep - why? why not?
                   (4-3 pens)
(4-3 Pens)  -- keep mixed Pens/Pen. too - why? why not?
(4-3 Pen.)
'(?-i: PEN | P |' +
'[Pp]ens | [Pp]en\.? | p\.? )'
ET_EN =

fix - change ET_EN to AET_EN!!! - why? why not?

check - allow Aet too - why? why not?
           or A.e.t ??
'(?-i: AET | ' +
'aet | a\.e\.t\.? )'
AETGG_EN =

after (golden goal/sudden death) extra time - add more options/styles - why? why not?

'(?-i: AET/GG | AGGET | ASDET | ' +
'aet/gg | a\.e\.t\.?/g\.g\.? | agget | asdet )'
AETSG_EN =

after (silver goal) extra time

'(?-i: AET/SG | ASGET | ' +
'aet/sg | a\.e\.t\.?/s\.g\.? | asget  )'
AGG_EN =

agg/agg. or AGG

'(?-i: AGG | agg\.? )'
SCORE_P =

fix - change SCORE_P to SCORE_FULL_P

             SCORE_ET to SCORE_FULL_ET

 (re)use SCORE_P, SCORE_ET for score only part!!!

fix/fix/fix - rename to SCORE_P_ SCORE_ET_
 mark internals with TRAILING underscore (leading NOT possible!)
%Q<  (?<p1>\\d{1,2}) - (?<p2>\\d{1,2})
        [ ]? #{P_EN}
>
SCORE_ET =
%Q<  (?<et1>\\d{1,2}) - (?<et2>\\d{1,2})
        [ ]? #{ET_EN}
>
SCORE_LOOKAHEAD =
'(?= [ ,\]] | $)'
SCORE__ET_GG_SG__RE =

after extra-time with golden goal/sudden death & silver goal rule

      note - golden goal & silver goal EXCLUDE penalties!!!

4-3 a.e.t/g.g.
4-3 aet/gg
4-3agget   -or-   4-3 asdet
2-1 aet/sg
 -or-
 4-3 aet/gg (3-3, 2-1)
%r{
    (?<score_full>
       \b
       (?<et1>\d{1,2}) - (?<et2>\d{1,2})
                      [ ]? (?:
                               (?<aetgg> #{AETGG_EN})
                                  |
                               (?<aetsg> #{AETSG_EN})
                            )
       ### note:
       ## add optional full-time, half-time score
         (?:
             [ ]+
             \(
                [ ]*
               (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
                  [ ]*
                (?:
                   , [ ]*
                   (?: (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
                     [ ]*
                  )?
               )? # note: make half time (HT) score optional for now
             \)
         )?
        #{SCORE_LOOKAHEAD}
)}ix
SCORE__P_ET__RE =

note: allow SPECIAL cases WITHOUT full time scores (just a.e.t or pen. + a.e.t.)

3-4 pen. 2-2 a.e.t.
3-4 pen.   2-2 a.e.t.
         2-2 a.e.t.
%r{
(?<score_full>
   \b
    (?: #{SCORE_P} [ ]+
     )?             ## note: make penalty (P) score optional for now
    #{SCORE_ET}
    #{SCORE_LOOKAHEAD}
)}ix
SCORE__ET_P__RE =

note: allow SPECIAL cases WITHOUT full time scores

AND with pen in last position!
  2-2 a.e.t., 3-4 pen.
  2-2 a.e.t.  3-4 pen.  ## or without comma separator - why? why not?
%r{
(?<score_full>
   \b
    #{SCORE_ET}
       (?: [ ]*,[ ]* | [ ]+ )
    #{SCORE_P}
    #{SCORE_LOOKAHEAD}
)}ix
SCORE__FT_P__RE =

special case (i) - full time with penalties

2-2, 3-4 pen.
%r{
(?<score_full>
   \b
    (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
        [ ]*,[ ]*    ## note - comma required!!!
    #{SCORE_P}
    #{SCORE_LOOKAHEAD}
)}ix
SCORE__FT_HT_P__RE =

special case (ii) - full time & half-time with penalties

2-2 (1-1), 3-4 pen.
%r{
(?<score_full>
   \b
    (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
        [ ]*
         \(
             (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
         \)
        [ ]*,[ ]*    ## note - comma required!!!
    #{SCORE_P}
    #{SCORE_LOOKAHEAD}
)}ix
SCORE__P__RE =

note: allow SPECIAL with penalty only

3-4 pen.  or 3-4p etc.
%r{
        (?<score_full>
  \b
    #{SCORE_P}
    #{SCORE_LOOKAHEAD}
)}ix
SCORE__P_ET_FT_HT_V2__RE =

support short all-in-one e.g.

e.g.      3-4 pen. 2-2 a.e.t. ( 1-1, 1-1 ) becomes
 3-4 pen. (2-2, 1-1, 1-1)
%r{
          (?<score_full>
   \b
    #{SCORE_P} [ ]+
       \(
       [ ]*
   (?<et1>\d{1,2}) - (?<et2>\d{1,2})
       [ ]*, [ ]*
   (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
       [ ]*, [ ]*
   (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
       [ ]*
    \)
   #{SCORE_LOOKAHEAD}
)}ix
SCORE__ET_FT_HT_P__RE =

e.g. 2-2 a.e.t. (1-1, 1-0), 5-1 pen.

%r{
          (?<score_full>
   \b
   #{SCORE_ET} [ ]+
       \(
       [ ]*
  (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
       [ ]*
    (?:
         , [ ]*
        (?: (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
            [ ]*
        )?
    )?              # note: make half time (HT) score optional for now
  \)
   (?: [ ]*,[ ]* | [ ]+)
   #{SCORE_P}
   #{SCORE_LOOKAHEAD}
)}ix
SCORE__P_ET_FT_HT__RE =

e.g. 3-4 pen. 2-2 a.e.t. (1-1, 1-1) or

3-4p 2-2aet (1-1, )     or
3-4 pen.  2-2 a.e.t. (1-1)       or
         2-2 a.e.t. (1-1, 1-1)  or
         2-2 a.e.t. (1-1, )     or
         2-2 a.e.t. (1-1)
%r{
          (?<score_full>
   \b
   (?:
      #{SCORE_P} [ ]+
    )?            ## note - make penalty (P) score optional for now
   #{SCORE_ET} [ ]+
       \(
       [ ]*
  (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
       [ ]*
    (?:
         , [ ]*
        (?: (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
            [ ]*
        )?
    )?              # note: make half time (HT) score optional for now
  \)
 #{SCORE_LOOKAHEAD}
)}ix
SCORE__P_FT_HT__RE =

special case for case WITHOUT extra time!!

same as above (but WITHOUT extra time and pen required)
%r{
         (?<score_full>
            \b
           #{SCORE_P} [ ]+
    \(
    [ ]*
  (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
    [ ]*
 (?:
      , [ ]*
     (?: (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
         [ ]*
     )?
 )?              # note: make half time (HT) score optional for now
   \)
#{SCORE_LOOKAHEAD}
)}ix
SCORE__FT_HT__RE =

e.g. 2-1 (1-1)

%r{
            (?<score_full>
 \b
 (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
      [ ]+ \( [ ]*
   (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
      [ ]* \)
#{SCORE_LOOKAHEAD}
)}ix
SCORE_FULL_1ST_RE =

note 2-2, 5-1 pen. must get priority (get before SCORE_LEGS!!!)

                       break out
note - no need for Regexp.union for now (only single regex!)
SCORE__FT_P__RE
SCORE_FULL_RE =
Regexp.union(
  SCORE__ET_GG_SG__RE,       # e.g. 3-1 aet/gg
  SCORE__P_ET_FT_HT_V2__RE,  # e.g. 5-1 pen. (2-2, 1-1, 1-0)
  SCORE__ET_FT_HT_P__RE,    # e.g. 2-2 a.e.t. (1-1, 1-0), 5-1 pen.
  SCORE__P_ET_FT_HT__RE,    # e.g. 5-1 pen. 2-2 a.e.t. (1-1, 1-0)
  SCORE__P_FT_HT__RE,     # e.g. 5-1 pen. (1-1)
  SCORE__ET_P__RE,        # e.g. 2-2 a.e.t., 5-1 pen.
  SCORE__FT_HT_P__RE,     # e.g. 2-2 (1-1), 5-1 pen.
  SCORE__P_ET__RE,        # e.g.  5-1 pen. 2-2 a.e.t.  or  2-2 a.e.t. (w/o pen)
  SCORE__P__RE,           # e.g. 5-1 pen.
  SCORE__FT_HT__RE,        # e.g. 1-1 (1-0)
  ##  note - keep basic score as its own token!!!!
  ##   that is, SCORE & SCORE_MORE
  ### SCORE__FT__RE,           # e.g. 1-1  -- note - must go last!!!
)
SCORE_LEGS_RE =

win on away goals

aet
%r{
(?<score_legs>
   \b   
    (?<leg1_ft1>\d{1,2}) - (?<leg1_ft2>\d{1,2})
       (?: [ ]+ |  [ ]*,[ ]*)   # separate by spaces OR comma
    (?:
        ## opt 1 - after extra-time (et) score
            (?<leg2_et1>\d{1,2}) - (?<leg2_et2>\d{1,2})
               [ ]? #{ET_EN}   ## a.e.t./aet
                ### note - might end in dot (.) not alpha
                ###  thus, wordboundary NOT working
               #{SCORE_LOOKAHEAD}   
          |
        ## opt 2 - full-time (ft)  
        (?<leg2_ft1>\d{1,2}) - (?<leg2_ft2>\d{1,2})
            \b 
    )                
    (?:   ## check optional aggregate e.g. (agg 4-4)
        [ ]+
         \(
             agg [ ]
              (?<agg1>\d{1,2}) - (?<agg2>\d{1,2}) 
              
             ### add win options 
             (?:
                 ## opt 1 - on away goals
                (?<away> [ ]*,[ ]*
                         (?:win [ ])? on [ ] away [ ] goals?
                 )
                   |
                 ## opt 2 - on penalties  
                (?:
                   [ ]*,[ ]*
                   (?:win [ ])?
                    (?<leg2_p1>\d{1,2}) - (?<leg2_p2>\d{1,2})
                    [ ] on [ ] pens
                )
             )?
         \)
    )?
)}ix
MONTH_LINES =
parse_names( <<TXT )
January    Jan
February   Feb
March      Mar
April      Apr
May
June       Jun
July       Jul
August     Aug
September  Sept  Sep
October    Oct
November   Nov
December   Dec
TXT
MONTH_NAMES =
build_names( MONTH_LINES )
MONTH_MAP =

pp MONTH_NAMES

build_map( MONTH_LINES, downcase: true )
DAY_LINES =
parse_names( <<TXT )
Monday                   Mon  Mo
Tuesday            Tues  Tue  Tu
Wednesday                Wed  We
Thursday    Thurs  Thur  Thu  Th
Friday                   Fri  Fr
Saturday                 Sat  Sa
Sunday                   Sun  Su
TXT
DAY_NAMES =
build_names( DAY_LINES )
DAY_MAP =

pp DAY_NAMES

build_map( DAY_LINES, downcase: true )
GROUP_DEF_RE =

note - add comma (,) as optional separator

Regexp.union(  SPACES_RE,
   TEXT_RE,
   / (?<sym> [:|,] )  /x,
   ANY_RE,
)
PROP_CARDS_RE =

note - no inline keys possible

todo/fix - use custom (limited) prop basics too
Regexp.union(
   SPACES_RE,
   MINUTE_RE,
   PROP_NAME_RE,
   /  (?<sym>  [;,-]) /x
   ## todo/fix - add ANY_RE here too!!!
)
PROP_ATTENDANCE_RE =
Regexp.union(
   SPACES_RE,
   ENCLOSED_NAME_RE,       # e.g. (sold out) etc.  why? why not?
   PROP_NUM_RE,                 # e.g. 28 000 or 28_000  (NOT 28,000 is not valid!!!)
   ## todo/fix - add ANY_RE here too!!!
)
PROP_REFEREE_RE =
Regexp.union(
   SPACES_RE,
   ENCLOSED_NAME_RE,       # e.g. (sold out) etc.  why? why not?
   PROP_NUM_RE,                 # e.g. 28 000 or 28_000  (NOT 28,000 is not valid!!!)
   PROP_KEY_INLINE_RE,
   PROP_NAME_RE,
   /  (?<sym>  [;,]) /x
   ## todo/fix - add ANY_RE here too!!!
)
ROUND_DEF_RE =

note - add comma (,) as optional separator

Regexp.union(  SPACES_RE,
   DURATION_RE,  # note - duration MUST match before date
   DATE_RE,  ## note - date must go before time (e.g. 12.12. vs 12.12)
   / (?<sym> [:|,] ) /x,
   ANY_RE
)
SCORE_FULLER_AGG =
_mk_score_fuller_agg( win: false )
SCORE_FULLER_AGG_WIN =
_mk_score_fuller_agg( win: true )
SCORE_FULLER_P =
_mk_score_fuller_p( win: false )
SCORE_FULLER_P_WIN =
_mk_score_fuller_p( win: true )
SCORE_FULLER_AWAY_WIN =
%Q<
     (?:
      (?<away>
        ############
        ## opt 1)  with win
        (?:
            (?: win [ ] )?
            (?: (?<away1>\\d{1,2}) - (?<away2>\\d{1,2}) [ ] )?
             on [ ] away [ ] goals?     # goal or goals
        )
        |        
        #####
        ## opt 2)  "classic" (post)
        (?:
           (?: (?<away1>\\d{1,2}) - (?<away2>\\d{1,2}) [ ] )?
              [ ]* away  
        )
        |
        #####
        ## opt 3) up-front (pre)
        (?:
              away 
           (?:  [ ]
                (?<away1>\\d{1,2}) - (?<away2>\\d{1,2})
           )?   
        )
     ))                   
>
SCORE_FULLER_HT_OPT =
%Q<
  (?:   HT [ ]
      (?: (?<ht1>\\d{1,2}) - (?<ht2>\\d{1,2})) 
      [ ]*,[ ]*
  )?  ## note - make optional
>
SCORE_FULLER_FT_OPT =
%Q<
  (?:   FT [ ]
      (?: (?<ft1>\\d{1,2}) - (?<ft2>\\d{1,2})) 
      [ ]*,[ ]*
  )?  ## note - make optional
>
SCORE_FULLER__HT =

4-4 (HT 2-1)

       or
Team A  4-1  Team B  (HT 2-1)
%Q<
             \\(  HT [ ]
                  (?<ht1>\\d{1,2}) - (?<ht2>\\d{1,2}) 
             \\)
>
SCORE_FULLER__HT_FT__RE =
%r{
(?<score_fuller>
   \b   
    (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
    [ ]+
     #{SCORE_FULLER__HT}
)}ix
SCORE_FULLER_MORE__HT_FT__RE =
%r{
(?<score_fuller_more>
     #{SCORE_FULLER__HT}
)}ix
SCORE_FULLER__ET =
%Q<
             \\(
                #{SCORE_FULLER_HT_OPT} 
                #{SCORE_FULLER_FT_OPT} 
                (?:
                  (?<aetgg> #{AETGG_EN})
                   |
                  (?<aetsg> #{AETSG_EN}) 
                   |
                  (?<aet> #{ET_EN})
                 )
             \\)
>
SCORE_FULLER__ET__RE =
%r{
(?<score_fuller>
   \b   
    (?<et1>\d{1,2}) - (?<et2>\d{1,2})
    [ ]+
     #{SCORE_FULLER__ET}
)}ix
SCORE_FULLER_MORE__ET__RE =
%r{
(?<score_fuller_more>
     #{SCORE_FULLER__ET}
)}ix
SCORE_FULLER__ET_P =

4-4 (aet, win 3-5 on pens)

4-4 (aet, 3-5 on pens)
4-4 (aet, 3-5 pen)
4-4 (a.e.t., 3-5 pen.)
   or
Team A  4-4  Team B  (aet, win 3-5 on pens) 
Team A  4-4  Team B  (aet, 3-5 on pens)
Team A  4-4  Team B  (aet, 3-5 pen)
Team A  4-4  Team B  (a.e.t., 3-5 pen.)
%Q<
             \\(
                #{SCORE_FULLER_HT_OPT} 
                #{SCORE_FULLER_FT_OPT} 
                (?<aet> #{ET_EN})
                 [ ]*,[ ]*
                 #{SCORE_FULLER_P_WIN}
             \\)
>
SCORE_FULLER__ET_P__RE =
%r{
(?<score_fuller>
   \b   
    (?<et1>\d{1,2}) - (?<et2>\d{1,2})
    [ ]+
     #{SCORE_FULLER__ET_P}
)}ix
SCORE_FULLER_MORE__ET_P__RE =
%r{
(?<score_fuller_more>
     #{SCORE_FULLER__ET_P}
)}ix
SCORE_FULLER__FT_P =

4-4 (win 3-5 on pens)

  4-4 (3-5 pen)
  4-4 (3-5p)
    or
Team A  4-4  Team B (win 3-5 on pens)
Team A  4-4  Team B (3-5 pen)
Team A  4-4  Team B (3-5p)
%Q<
             \\(
                  #{SCORE_FULLER_HT_OPT} 
                  #{SCORE_FULLER_P_WIN}
             \\)
>
SCORE_FULLER__FT_P__RE =
%r{
(?<score_fuller>
   \b   
    ## full-time score (core score)
    (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
    [ ]+
     ## (re)use the shared enclosed part - same as in SCORE_FULLER_MORE__FT_P__RE
     ##   and in line with all other SCORE_FULLER__*__RE definitions;
     ##   incl. the optional half-time prefix e.g.
     ##      4-4 (win 3-5 on pens)
     ##      4-4 (HT 2-1, 3-5 pen)
     #{SCORE_FULLER__FT_P}
)}ix
SCORE_FULLER_MORE__FT_P__RE =
%r{
(?<score_fuller_more>
     #{SCORE_FULLER__FT_P}
)}ix
SCORE_FULLER__FT_AGG =

3-2 (win 4-5 on aggregate)

3-2 (4-5 on aggregate)
3-2 (4-5 on agg)
3-2 (4-5 agg)
3-2 (4-5 agg.)
  or  
3-2 (agg 4-5)
%Q<
             \\(
                 #{SCORE_FULLER_HT_OPT} 
                 #{SCORE_FULLER_AGG_WIN}
             \\)
>
SCORE_FULLER__FT_AGG__RE =
%r{
(?<score_fuller>
   \b   
    (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
    [ ]+
     #{SCORE_FULLER__FT_AGG}
)}ix
SCORE_FULLER_MORE__FT_AGG__RE =
%r{
(?<score_fuller_more>
     #{SCORE_FULLER__FT_AGG}
)}ix
SCORE_FULLER__FT_AGG_AWAY =

ft + agg + away

2-1 (3-3 on aggregate, win on away goals)
2-1 (3-3 on aggregate, win 2-1 on away goals)
%Q<
             \\(
                #{SCORE_FULLER_HT_OPT} 
                #{SCORE_FULLER_AGG}
                   [ ]*,[ ]*
                 #{SCORE_FULLER_AWAY_WIN}
             \\)
>
SCORE_FULLER__FT_AGG_AWAY__RE =
%r{
(?<score_fuller>
   \b   
    (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
    [ ]+
     #{SCORE_FULLER__FT_AGG_AWAY}
)}ix
SCORE_FULLER_MORE__FT_AGG_AWAY__RE =
%r{
(?<score_fuller_more>
     #{SCORE_FULLER__FT_AGG_AWAY}
)}ix
SCORE_FULLER__ET_AGG_P =

2-1 (aet, 3-3 on aggregate, win 5-2 on pens)

2-1 (aet, 3-3 agg, 5-2 pen.)
%Q<
             \\(
                #{SCORE_FULLER_HT_OPT} 
                #{SCORE_FULLER_FT_OPT} 
                (?<aet> #{ET_EN})
                    [ ]*,[ ]*
                    #{SCORE_FULLER_AGG}  
                    [ ]*,[ ]*
                    #{SCORE_FULLER_P_WIN}                     
             \\)
>
SCORE_FULLER__ET_AGG_P__RE =
%r{
(?<score_fuller>
   \b   
    (?<et1>\d{1,2}) - (?<et2>\d{1,2})
    [ ]+
     #{SCORE_FULLER__ET_AGG_P}
)}ix
SCORE_FULLER_MORE__ET_AGG_P__RE =
%r{
(?<score_fuller_more>
     #{SCORE_FULLER__ET_AGG_P}
)}ix
SCORE_FULLER_RE =

map tables

note: order matters - first come-first matched/served
Regexp.union(
SCORE_FULLER__HT_FT__RE,       ## e.g.  3-2 (HT 2-1)
SCORE_FULLER__ET_P__RE,        ## e.g.  2-2 (aet, win 5-3 on pens)
SCORE_FULLER__ET__RE,          ## e.g.  2-3 (aet)
SCORE_FULLER__FT_P__RE,        ## e.g.  2-2 (win 5-3 on pens)
SCORE_FULLER__FT_AGG__RE,      ## e.g.  2-3 (win 5-4 on aggregate)
SCORE_FULLER__FT_AGG_AWAY__RE, ## e.g.  2-1 (3-3 on aggregate, win 2-1 on away goals)
SCORE_FULLER__ET_AGG_P__RE,    ## e.g.  2-1 (aet, 3-3 on aggregate, win 5-2 on pens)
)
SCORE_FULLER_MORE__HT__RE =

add support for “stand-alone” (HT) and (FT) - keep why? why not?

%r{
(?<score_fuller_more>
    \( (?<ht> ht ) \)
)}ix
SCORE_FULLER_MORE__FT__RE =
%r{
(?<score_fuller_more>
     \( (?<ft> ft ) \)  
)}ix
SCORE_FULLER_MORE__FT_ET__RE =

add special for fuller_more

(aet 4-3)   -  core score is ft, and fuller more incl. et!!!
%r{
(?<score_fuller_more>
      \(#{ET_EN}
           [ ]
       (?<et1>\d{1,2}) - (?<et2>\d{1,2})
      \) 
)}ix
SCORE_FULLER_MORE__HT_FT__CLASSIC_RE =

note - simply (1-1) !!!!!

note - special attention needed for placement in processing order!!!
  make sure it is the last (or one of the last) match(es)
%r{
(?<score_fuller_more>
     \(  
          (?<ht1>\d{1,2}) - (?<ht2>\d{1,2}) 
     \)
)}ix
SCORE_FULLER_MORE_RE =
Regexp.union(
  SCORE_FULLER_MORE__FT__RE,          ## e.g. (ft)
  SCORE_FULLER_MORE__HT__RE,          ## e.g. (ht)
  SCORE_FULLER_MORE__HT_FT__RE,       ## e.g. (HT 2-1)
  SCORE_FULLER_MORE__ET_P__RE,        ## e.g. (aet, win 5-3 on pens)
  SCORE_FULLER_MORE__ET__RE,          ## e.g. (aet)
  SCORE_FULLER_MORE__FT_ET__RE,       ## e.g. (aet 3-2) - (SPECIAL) incl. after extra-time score!!
  SCORE_FULLER_MORE__FT_P__RE,        ## e.g. (win 5-3 on pens)
  SCORE_FULLER_MORE__FT_AGG__RE,      ## e.g. (win 5-4 on aggregate)
  SCORE_FULLER_MORE__FT_AGG_AWAY__RE, ## e.g. (3-3 on aggregate, win 2-1 on away goals)
  SCORE_FULLER_MORE__ET_AGG_P__RE,    ## e.g. (aet, 3-3 on aggregate, win 5-2 on pens)

  SCORE_FULLER_MORE__HT_FT__CLASSIC_RE,   ## e.g. (2-1)  half-time !!!!
)
DURATION_I_RE =
%r{
(?<duration>
    \b
  (?:
   ## optional day name
   ((?<day_name1>#{DAY_NAMES})
      [ ]
   )?
   (?<month_name1>#{MONTH_NAMES})
      [ ]
   (?<day1>\d{1,2})
   ## optional year
   (  ,?   # optional comma
      [ ]
      (?<year1>\d{4})
   )?

   ## support + and -  (add .. or such - why??)
   [ ]* - [ ]*

   ## optional day name
   ((?<day_name2>#{DAY_NAMES})
      [ ]
   )?
   (?<month_name2>#{MONTH_NAMES})
      [ ]
   (?<day2>\d{1,2})
   ## optional year
   (  ,?   # optional comma
      [ ]
      (?<year2>\d{4})
   )?
  )
   \b
)}ix
DURATION_II_RE =

variant ii

add support for shorthand
   August 16-18, 2011
   September 13-15, 2011
    October 18-20, 2011
    March 6-8 2012
    March 6-8

 - add support for August 16+17 or such (and check 16+18)
     use <op> to check if day2 is a plus or range or such - why? why not?
%r{
(?<duration>
    \b
   (?:
       (?<month_name1>#{MONTH_NAMES})
           [ ]
        (?<day1>\d{1,2})
             -
        (?<day2>\d{1,2})
          (?:
            ,?     ## optional comma
            [ ]
            (?<year1>\d{4})
          )?     ## optional year
   )
   \b
)}ix
DURATION_RE =

map tables

note: order matters; first come-first matched/served
Regexp.union(
   DURATION_I_RE,
   DURATION_II_RE,
)
INLINE_WO_RE =

add support for WO or W-0 too - why? why not?

%r{
    (?<inline_wo>
        \b (?: w/o | W/O ) \b
)}x
INLINE_BYE_RE =

note - NOT case insensitive

%r{
   (?<inline_bye>
       \b (?: bye | BYE ) \b
)}x
INLINE_NP_RE =

A n/p B (note - basically an inline short form of A v B [cancelled] )

N/P
%r{
    (?<inline_np>
        \b (?: n/p | N/P ) \b
)}x
INLINE_ABD_RE =

abd/abd. or aban/aban. [abandoned]

ABD/ABAN
%r{
    (?<inline_abd>
        \b (?: abd\.? |
               aban\.? |
               ABD | ABAN
           )
   ## POSITIVE lookahead - requires space
          (?= [ ])
)}x
INLINE_SUSP_RE =

susp/susp. [suspended]

SUSP
%r{
    (?<inline_susp>
        \b (?: susp\.? |
                SUSP )
   ## POSITIVE lookahead - requires space
          (?= [ ])
)}x
INLINE_PPD_RE =

ppd/ppd. or pst/pst. or pstp/pstp. or postp/postp. [postponed]

PPD/PSTP/POSTP/P-P
 todo/check - add/allow p-p too - why? why not?
%r{
    (?<inline_ppd>
        \b (?: ppd\.? |
               pst\.? |
               po?stp\.? |
               PPD | PST | PO?STP | P-P
            )
   ## POSITIVE lookahead - requires space
          (?= [ ])
)}x
INLINE_VOID_RE =

void via x-x X-X

todo/check - only allow X-X - why? why not?
# Inline void marker: matches exactly `x-x` or `X-X` starting at a word
# boundary. A space MUST follow; it is checked via lookahead and is not
# part of the match. No /i flag. Named capture: inline_void.
%r{
      (?<inline_void>
          \b (?: x-x |
                 X-X
             )
        ## POSITIVE lookahead - requires space
           (?= [ ])
)}x
INLINE_AWD_RE =

awd/awd. [awarded]

AWD
note - recommendation is to always include score
         thus, use/prefer SCORE_AWD e.g. 0-3 awd
# Inline awarded marker (without score): matches `awd`, `awd.` or upper-case
# `AWD` starting at a word boundary. A space MUST follow; it is checked via
# lookahead and is not part of the match. No /i flag.
# Named capture: inline_awd.
%r{
    (?<inline_awd>
        \b (?: awd\.? | AWD )
   ## POSITIVE lookahead - requires space
          (?= [ ])
)}x
INLINE_CANC_RE =

canc/canc. [cancelled]

CANC
# Inline cancelled marker: matches `canc`, `canc.` or upper-case `CANC`
# starting at a word boundary. A space MUST follow; it is checked via
# lookahead and is not part of the match. No /i flag.
# Named capture: inline_canc.
%r{
    (?<inline_canc>
        \b (?: canc\.?  | CANC )
   ## POSITIVE lookahead - requires space
          (?= [ ])
)}x
PROP_LINEUP_RE =
# Token patterns for the lineup prop mode, combined into one regex.
# Alternatives are tried in the listed order; the last entry captures a
# single punctuation character ( ; , ( ) [ ] - ) as sym.
# Note - there is no catch-all (ANY_RE) entry yet (see todo below).
Regexp.union(
   SPACES_RE,
   MINUTE_RE,   ## e.g.  44 or 44' or 45+1 or 45+1' etc.

   INLINE_CAPTAIN,  ## e.g. [c]
   INLINE_YELLOW,   ## e.g. [Y] or [Y 44] or [Y 44'] or [Y 45+1']
   INLINE_YELLOW_RED,  ## e.g. [Y/R] or [Y/R 78]
   INLINE_RED,         ## e.g. [R] or [R 42] or [R 42']

   PROP_KEY_INLINE_RE,
   PROP_NAME_RE,
   /  (?<sym>  [;,()\[\]-]) /x
   ## todo/fix - add ANY_RE here too!!!
)
PROP_PENALTIES_RE =
# Token patterns for the penalties prop mode, combined into one regex.
# Alternatives are tried in the listed order; the last entry captures a
# single semicolon or comma as sym.
# Note - there is no catch-all (ANY_RE) entry yet (see todo below).
Regexp.union(
   SPACES_RE,
   SCORE_RE,               # e.g. 1-1 etc.
   ENCLOSED_NAME_RE,       # e.g. (save), (post), etc.
   PROP_NAME_RE,
    /  (?<sym>  [;,]) /x    ## add [] too - why? why not?
   ## todo/fix - add ANY_RE here too!!!
)

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(txt, debug: false) ⇒ Lexer

Returns a new instance of Lexer.

Raises:

  • (ArgumentError)


44
45
46
47
48
49
# File 'lib/sportdb/parser/lexer.rb', line 44

## Set up a lexer for the passed-in text.
##   txt   - text to tokenize; MUST be a String
##   debug - debug flag; kept in @debug
## Raises ArgumentError if txt is not a String.
def initialize( txt, debug: false )
  unless txt.is_a?( String )
    raise ArgumentError, "text as string expected for lexer; got #{txt.class.name}"
  end

  @txt   = txt
  @debug = debug
end

Class Method Details

._build_date(m) ⇒ Object

“internal” date helpers



6
7
8
9
10
11
12
13
14
15
16
17
18
19
# File 'lib/sportdb/parser/token-date--helpers.rb', line 6

## Build a hash of date components from the captures of a date match.
## Only captured components get added; possible keys are
##   :y, :yy (two-digit year, kept separate), :m, :d and :wday.
def self._build_date( m )
  year, yy   = m[:year], m[:yy]
  month, day = m[:month], m[:day]
  month_name = m[:month_name]
  day_name   = m[:day_name]

  parts = {}
  parts[:y]  = year.to_i(10)    if year
  parts[:yy] = yy.to_i(10)      if yy       ## e.g. 25 or 78 etc.
  parts[:m]  = month.to_i(10)   if month
  ## note - names may be in any case (JULY/Jul/jul); always downcase for lookup;
  ##          a month name overrides a numeric month
  parts[:m]  = MONTH_MAP[ month_name.downcase ]   if month_name
  parts[:d]  = day.to_i(10)     if day
  parts[:wday] = DAY_MAP[ day_name.downcase ]     if day_name
  parts
end

._build_date_legs(m) ⇒ Object



21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# File 'lib/sportdb/parser/token-date--helpers.rb', line 21

## Build the two dates (legs) from the captures of a date-legs match.
## Returns { date1: {m:, d:}, date2: {m:, d:} }; the month of the
## second date is only present if a second month name was captured.
def self._build_date_legs( m )
  ## note - names may be in any case (JULY/Jul/jul); always downcase for lookup
  ## first date - month name and day are used without nil check
  first = { m: MONTH_MAP[ m[:month_name1].downcase ],
            d: m[:day1].to_i(10) }

  ## second date - month name is optional
  second = {}
  second[:m] = MONTH_MAP[ m[:month_name2].downcase ]   if m[:month_name2]
  second[:d] = m[:day2].to_i(10)

  { date1: first, date2: second }
end

._build_duration(m) ⇒ Object



39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# File 'lib/sportdb/parser/token-date--helpers.rb', line 39

## Build the start and end date of a duration from the match captures.
## Returns { start: {...}, end: {...} }; each side holds only the captured
## components out of :y, :m, :d and :wday.
def self._build_duration( m )
  ## todo/check/fix - if end: works for kwargs!!!!!
  ## note - captures for start use the suffix 1, for end the suffix 2
  { start: '1', end: '2' }.each_with_object( {} ) do |(side, no), duration|
    year       = m[:"year#{no}"]
    month_name = m[:"month_name#{no}"]
    day        = m[:"day#{no}"]
    day_name   = m[:"day_name#{no}"]

    date = {}
    date[:y]    = year.to_i(10)                     if year
    date[:m]    = MONTH_MAP[ month_name.downcase ]  if month_name
    date[:d]    = day.to_i(10)                      if day
    date[:wday] = DAY_MAP[ day_name.downcase ]      if day_name

    duration[side] = date
  end
end

._build_goal_count(m) ⇒ Object



50
51
52
53
54
55
56
# File 'lib/sportdb/parser/token-goals--helpers.rb', line 50

## Build a goal count hash from the match captures.
## Possible keys are :count, :og and :pen.
def self._build_goal_count( m )
  tally = {}
  tally[:count] = m[:value].to_i(10)   if m[:value]

  ## og/pen are flags; the number defaults to 1 if no value was captured
  if m[:og]
    tally[:og] = m[:og_value] ? m[:og_value].to_i(10) : 1
  end
  if m[:pen]
    tally[:pen] = m[:pen_value] ? m[:pen_value].to_i(10) : 1
  end

  tally
end

._build_goal_minute(m) ⇒ Object



6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
# File 'lib/sportdb/parser/token-goals--helpers.rb', line 6

## Build a goal minute hash from the match captures.
## :m is always set; :offset, :og, :pen, :freekick, :header and :secs
## get added only if captured.
def self._build_goal_minute( m )
  minute = { m: m[:value].to_i(10) }    ## always required

  ## stoppage/injury time (offset)
  minute[:offset] = m[:value2].to_i(10)   if m[:value2]

  ## goal type flags - hash key => capture name
  { og: :og, pen: :pen, freekick: :fk, header: :hdr }.each do |key, group|
    minute[key] = true   if m[group]
  end

  minute[:secs] = m[:secs].to_i(10)   if m[:secs]
  minute
end

._build_goal_minute_na(m) ⇒ Object



24
25
26
27
28
29
30
31
32
33
34
35
# File 'lib/sportdb/parser/token-goals--helpers.rb', line 24

## Build a goal minute hash for a goal with no minute available.
## :m is always the placeholder '?'; goal type flags get added if captured.
def self._build_goal_minute_na( m )
  minute = { m: '?' }    ##  or use nil or 999 or -1 or ???

  ## goal type flags - hash key => capture name
  { og: :og, pen: :pen, freekick: :fk, header: :hdr }.each do |key, group|
    minute[key] = true   if m[group]
  end

  minute
end

._build_goal_type(m) ⇒ Object



58
59
60
61
62
63
64
65
# File 'lib/sportdb/parser/token-goals--helpers.rb', line 58

## Build a hash of goal type flags (:og, :pen, :freekick, :header);
## a flag is set to true only if its capture is present.
def self._build_goal_type( m )
  ## hash key => capture name
  flags = { og: :og, pen: :pen, freekick: :fk, header: :hdr }
  flags.each_with_object( {} ) do |(key, group), goal|
    goal[key] = true   if m[group]
  end
end

._build_minute(m) ⇒ Object



39
40
41
42
43
44
45
46
47
# File 'lib/sportdb/parser/token-goals--helpers.rb', line 39

## Build a minute hash from the match captures;
## :m is always set, :offset only if a second value was captured.
def self._build_minute( m )
  offset = m[:value2]    ## stoppage/injury time (offset)

  result = { m: m[:value].to_i(10) }    ## always required
  result[:offset] = offset.to_i(10)   if offset
  result
end

._build_score(m) ⇒ Object



5
6
7
8
9
10
11
12
13
14
15
16
17
18
# File 'lib/sportdb/parser/token-score--helpers.rb', line 5

## Build a "generic" score from the score1/score2 captures.
##  note - the score might be full-time (ft), after extra-time (aet)
##         or even undecided/unknown; thus, a plain array e.g. [1,2]
##         gets returned and NOT a hash (table) e.g. { ft: [1,2] } !!!
def self._build_score( m )
  %i[score1 score2].map { |group| m[group].to_i(10) }
end

._build_score_abd(m) ⇒ Object

score abandoned (abd/abd.)



30
31
32
33
34
35
36
37
# File 'lib/sportdb/parser/token-score--helpers.rb', line 30

## Build the score of an abandoned (abd/abd.) match.
##  note - returns a "generic" score array for now (no abd flag added);
##         the status is carried by its own token e.g. SCORE_ABD
def self._build_score_abd( m )
  %i[score1 score2].map { |group| m[group].to_i(10) }
end

._build_score_awd(m) ⇒ Object

score awarded (awd/awd.)



20
21
22
23
24
25
26
27
28
# File 'lib/sportdb/parser/token-score--helpers.rb', line 20

## Build the score of an awarded (awd/awd.) match.
##  note - returns a "generic" score array for now (no awarded flag added)
##         to match  A 3-0 B [awarded] etc.;
##         the status is carried by its own token e.g. SCORE_AWD
def self._build_score_awd( m )
  %i[score1 score2].map { |group| m[group].to_i(10) }
end

._build_score_full(m) ⇒ Object



40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# File 'lib/sportdb/parser/token-score--helpers.rb', line 40

## Build a full score hash from the match captures.
## A score pair (:p, :et, :ft, :ht) gets added only if BOTH of its
## captures (e.g. ft1 and ft2) are present; plus golden/silver goal flags.
def self._build_score_full( m )
  score = {}

  %i[p et ft ht].each do |key|
    one = m[:"#{key}1"]
    two = one && m[:"#{key}2"]
    score[key] = [one.to_i(10), two.to_i(10)]   if two
  end

  score[:golden] = true   if m[:aetgg]   ## golden goal (gg)/sudden death (sd)
  score[:silver] = true   if m[:aetsg]   ## silver goal (sg)

  score
end

._build_score_fuller(m) ⇒ Object



58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# File 'lib/sportdb/parser/token-score--helpers.rb', line 58

## Build a "fuller" score hash from the match captures.
## A score pair (:p, :et, :ft, :ht, :agg) gets added only if BOTH of its
## captures are present; :away is either the away score pair or
## (as fallback) true; plus golden/silver goal flags.
def self._build_score_fuller( m )
  score = {}

  %i[p et ft ht agg].each do |key|
    one = m[:"#{key}1"]
    two = one && m[:"#{key}2"]
    score[key] = [one.to_i(10), two.to_i(10)]   if two
  end

  away1 = m[:away1]
  away2 = away1 && m[:away2]
  if away2
    score[:away] = [away1.to_i(10), away2.to_i(10)]
  elsif m[:away]    ## fallback if no away score; check away flag
    score[:away] = true
  end

  score[:golden] = true   if m[:aetgg]   ## golden goal (gg)/sudden death (sd)
  score[:silver] = true   if m[:aetsg]   ## silver goal (sg)

  score
end

._build_score_fuller_more(m) ⇒ Object



86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# File 'lib/sportdb/parser/token-score--helpers.rb', line 86

## Build a "fuller" score hash for the SCORE + SCORE_FULLER_MORE combo.
##  note -  the after extra-time (aet) or full-time (ft)
##           score may be present in the (leading) SCORE!!!
## Same as for the "fuller" score plus a :score flag ('et', 'ft' or 'ht')
## telling how to read the "dangling" (generic) score.
def self._build_score_fuller_more( m )
  score = {}

  %i[p et ft ht agg].each do |key|
    one = m[:"#{key}1"]
    two = one && m[:"#{key}2"]
    score[key] = [one.to_i(10), two.to_i(10)]   if two
  end

  away1 = m[:away1]
  away2 = away1 && m[:away2]
  if away2
    score[:away] = [away1.to_i(10), away2.to_i(10)]
  elsif m[:away]    ## fallback if no away score; check away flag
    score[:away] = true
  end

  score[:golden] = true   if m[:aetgg]   ## golden goal (gg)/sudden death (sd)
  score[:silver] = true   if m[:aetsg]   ## silver goal (sg)

  ## flag for the "dangling" (generic) score - later checks win
  kind = nil
  kind = 'et'   if m[:aet] || m[:aetgg] || m[:aetsg]
  kind = 'ft'   if m[:ft]
  kind = 'ht'   if m[:ht]
  score[:score] = kind   if kind

  score
end

._build_score_legs(m) ⇒ Object



123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# File 'lib/sportdb/parser/token-score--helpers.rb', line 123

## Build the scores of a two-legged tie from the match captures.
## Returns a hash with the string keys 'leg1' and 'leg2' (score hashes)
## plus the optional symbol keys :agg and :away on the "top-level".
def self._build_score_legs( m )
  pair = ->(one, two) { [m[one].to_i(10), m[two].to_i(10)] }

  ### leg1 (score) - full-time captures are used without nil check
  leg1 = { ft: pair.call( :leg1_ft1, :leg1_ft2 ) }

  ### leg2 (score) - every pair is optional
  leg2 = {}
  %i[ft et p].each do |key|
    one = :"leg2_#{key}1"
    two = :"leg2_#{key}2"
    leg2[key] = pair.call( one, two )   if m[one] && m[two]
  end

  legs = { 'leg1' => leg1, 'leg2' => leg2 }

  ## check for (opt) aggregate - keep on "top-level"
  legs[:agg]  = pair.call( :agg1, :agg2 )   if m[:agg1] && m[:agg2]
  legs[:away] = true                        if m[:away]

  legs
end

._build_status(m) ⇒ Object



100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# File 'lib/sportdb/parser/token-status.rb', line 100

## Build a status hash from the match captures.
## :status holds the norm(alized) status text; :status_note gets
## added only if captured (e.g. awarded; originally 2-0).
def self._build_status( m )
  ## capture name => norm(alized) status text; first present capture wins
  ##   note - annulled and voided both map to 'annulled'
  names = { postponed: 'postponed',
            canceled:  'canceled',
            walkover:  'walkover',
            awarded:   'awarded',
            suspended: 'suspended',
            abandoned: 'abandoned',
            annulled:  'annulled',
            voided:    'annulled',
            replay:    'replay' }

  hit = names.find { |group, _text| m[group] }

  status = {}
  ## fallback on "generic" status (shouldn't happen)
  status[:status] = hit ? hit.last : m[:status]

  ## includes note? e.g.  awarded; originally 2-0
  status[:status_note] = m[:status_note]   if m[:status_note]

  status
end

._build_time(m) ⇒ Object



96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# File 'lib/sportdb/parser/token-time.rb', line 96

## Build a time hash from the match captures.
##   unifies e.g. 12.40 or 12h40 to hour/minute numbers
##   (no time-only type in ruby)
## Returns { time: { h:, m:, timezone: } } plus (if a local time was
## captured e.g. 18:30 (19:30 BST)) time_local: { h:, m:, timezone: };
## the timezone keys are present only if captured.
## Raises ArgumentError if the hour is not in 0..23 or the minute not
## in 0..59 - for the time AND for the local time.
def self._build_time( m )
  data = { time: {} }

  hour   = m[:hour].to_i(10)    ## allow 08/07/etc.
  minute = m[:minute].to_i(10)

  ##   check if 24:00 possible? or only 0:00 (23:59)
  unless hour.between?( 0, 23 ) && minute.between?( 0, 59 )
    raise ArgumentError, "parse error - time >#{m[:time]}< out-of-range"
  end

  data[:time][:h] = hour
  data[:time][:m] = minute
  data[:time][:timezone] = m[:timezone]    if m[:timezone]

  ## check if local time present e.g.
  ##    18:30 (19:30)
  ##    18:30 (19:30 BST)  etc.
  if m[:time_local]
    data[:time_local] = {}

    local_hour   = m[:local_hour].to_i(10)    ## allow 08/07/etc.
    local_minute = m[:local_minute].to_i(10)

    ## fix - range check the LOCAL hour/minute
    ##   (was checking hour/minute of the time a second time by mistake)
    unless local_hour.between?( 0, 23 ) && local_minute.between?( 0, 59 )
      raise ArgumentError, "parse error - local time >#{m[:time_local]}< out-of-range"
    end

    data[:time_local][:h] = local_hour
    data[:time_local][:m] = local_minute
    data[:time_local][:timezone] = m[:local_timezone]    if m[:local_timezone]
  end

  data
end

._mk_score_fuller_agg(win:) ⇒ Object

regex score helpers

note - MUST double escape \d e.g. \\d!!!   if not "simple" string (e.g. '' but %Q<>)


24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# File 'lib/sportdb/parser/token-score_fuller.rb', line 24

## Returns the text of a regex fragment (as a String) for an aggregate score
## with the named captures agg1 and agg2. Three alternatives get built:
##   opt 1)  <agg1>-<agg2> on agg / on aggregate
##              with an optional leading "win " if win is true
##   opt 2)  <agg1>-<agg2> followed by (optional spaces and) AGG_EN
##   opt 3)  agg <agg1>-<agg2>
##  note - \d is double escaped (\\d) because %Q<> is NOT a "simple" string
##  note - AGG_EN gets interpolated (defined elsewhere; not shown here)
def self._mk_score_fuller_agg( win: )    ## with optional win - true|false
   %Q<
                 (?:
                    ############
                    ## opt 1)  with win
                    (?:
                       #{ win ? '(?: win [ ] )?' : '' }   
                        (?<agg1>\\d{1,2}) - (?<agg2>\\d{1,2})
                          [ ] on [ ] agg (?: regate )?  
                    )
                    |        
                    #####
                    ## opt 2)  "classic" (post)
                    (?:
                       (?<agg1>\\d{1,2}) - (?<agg2>\\d{1,2})
                          [ ]*
                        #{AGG_EN}   
                    )
                    |
                    #####
                    ## opt 3) agg up-front (pre)
                    (?:
                         agg [ ]
                       (?<agg1>\\d{1,2}) - (?<agg2>\\d{1,2})   
                    )
                 )
    >
end

._mk_score_fuller_p(win:) ⇒ Object

with optional win - true|false



53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# File 'lib/sportdb/parser/token-score_fuller.rb', line 53

## Returns the text of a regex fragment (as a String) for a penalties score
## with the named captures p1 and p2. Three alternatives get built:
##   opt 1)  <p1>-<p2> on pens
##              with an optional leading "win " if win is true
##   opt 2)  <p1>-<p2> followed by (optional spaces and) P_EN
##   opt 3)  pen <p1>-<p2>  or  p <p1>-<p2>
##  note - \d is double escaped (\\d) because %Q<> is NOT a "simple" string
##  note - P_EN gets interpolated (defined elsewhere; not shown here)
def self._mk_score_fuller_p( win: )    ## with optional win - true|false
   %Q<
                 (?:
                    ############
                    ## opt 1)  with win
                    (?:
                        #{ win ? '(?: win [ ] )?' : '' }
                        (?<p1>\\d{1,2}) - (?<p2>\\d{1,2})
                          [ ] on [ ] pens
                    )
                    |        
                    #####
                    ## opt 2)  "classic" (post)
                    (?:
                       (?<p1>\\d{1,2}) - (?<p2>\\d{1,2})
                          [ ]*
                        #{P_EN}   
                    )
                    |
                    #####
                    ## opt 3) up-front (pre)
                    (?:
                         (?: pen|p) [ ]
                       (?<p1>\\d{1,2}) - (?<p2>\\d{1,2})   
                    )
                 )                   
    >
end

._parse_date(str) ⇒ Object



111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# File 'lib/sportdb/parser/token-date--helpers.rb', line 111

## Match str (leading/trailing spaces stripped) against DATE_RE.
## Returns the hash table of captured date components if the match
## covers the complete string; otherwise nil.
def self._parse_date( str )
  m = DATE_RE.match( str.strip )

  ## nil on no match or
  ##   on a match NOT anchored to start and end-of-string
  ##   (report, error somehow??)
  return nil   unless m && m.pre_match == '' && m.post_match == ''

  _build_date( m )
end

._parse_goal_count(str) ⇒ Object



95
96
97
98
99
100
101
102
103
104
105
106
107
# File 'lib/sportdb/parser/token-goals--helpers.rb', line 95

## Match str (leading/trailing spaces stripped) against GOAL_COUNT_RE.
## Returns the goal count hash if the match covers the complete string;
## otherwise nil.
def self._parse_goal_count( str )
  m = GOAL_COUNT_RE.match( str.strip )

  ## nil on no match or
  ##   on a match NOT anchored to start and end-of-string
  ##   (report, error somehow??)
  return nil   unless m && m.pre_match == '' && m.post_match == ''

  _build_goal_count( m )
end

._parse_goal_minute(str) ⇒ Object

parse helpers



81
82
83
84
85
86
87
88
89
90
91
92
93
# File 'lib/sportdb/parser/token-goals--helpers.rb', line 81

## Match str (leading/trailing spaces stripped) against GOAL_MINUTE_RE.
## Returns the goal minute hash if the match covers the complete string;
## otherwise nil.
def self._parse_goal_minute( str )
  m = GOAL_MINUTE_RE.match( str.strip )

  ## nil on no match or
  ##   on a match NOT anchored to start and end-of-string
  ##   (report, error somehow??)
  return nil   unless m && m.pre_match == '' && m.post_match == ''

  _build_goal_minute( m )
end

._parse_score_full(str) ⇒ Object

add parser helpers



167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
# File 'lib/sportdb/parser/token-score--helpers.rb', line 167

## Match str (leading/trailing spaces stripped) against the union of
## SCORE_FULL_1ST_RE and SCORE_FULL_RE (tried in that order).
## Returns the full score hash if the match covers the complete string;
## otherwise nil.
def self._parse_score_full( str )
  m = Regexp.union(
            SCORE_FULL_1ST_RE,
            SCORE_FULL_RE ).match( str.strip )

  ## nil on no match or
  ##   on a match NOT anchored to start and end-of-string
  ##   (report, error somehow??)
  return nil   unless m && m.pre_match == '' && m.post_match == ''

  ## fix - removed left-over debug output (pp m)
  _build_score_full( m )
end

._parse_team(str) ⇒ Object

helper for testing regex match for team names



205
206
207
208
209
210
211
212
213
214
215
216
217
# File 'lib/sportdb/parser/token-text.rb', line 205

## Helper for testing the regex match for team names.
## Matches str (leading/trailing spaces stripped) against TEXT_RE and
## returns the MatchData if the match covers the complete string;
## otherwise nil.
def self._parse_team( str )
  m = TEXT_RE.match( str.strip )

  ## nil on no match or
  ##   on a match NOT anchored to start and end-of-string
  ##   (report, error somehow??)
  return nil   unless m && m.pre_match == '' && m.post_match == ''

  m
end

.build_map(lines, downcase: false) ⇒ Object



39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# File 'lib/sportdb/parser/token-date--names.rb', line 39

## Build a lookup map that maps every word to the index (line no) plus 1 e.g.
##  {"january" => 1,  "jan" => 1,
##   "february" => 2, "feb" => 2,
##   "march" => 3,    "mar" => 3, ...
## Pass in downcase: true to downcase the words (keys).
def self.build_map( lines, downcase: false )
  lines.each_with_index.each_with_object( {} ) do |(names, idx), map|
    no = idx + 1    ## note: start mapping with 1 (and NOT zero-based, that is, 0)
    names.each do |name|
      key = downcase ? name.downcase : name
      map[ key ] = no
    end
  end
end

.build_names(lines) ⇒ Object



32
33
34
35
36
# File 'lib/sportdb/parser/token-date--names.rb', line 32

## Join all words of all lines together into a single string
## (separated by |) e.g.
##   January|Jan|February|Feb|March|Mar|April|Apr|May|June|Jun|...
def self.build_names( lines )
  alternatives = lines.map { |words| words.join( '|' ) }
  alternatives.join( '|' )
end

.parse_date(str, start: nil) ⇒ Object

note: parse_date - returns Date object

_parse_date (with underscore) - return  hash of "parsed" regex match data!!


72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# File 'lib/sportdb/parser/token-date--helpers.rb', line 72

## Parse str into a Date object.
##  note - _parse_date (with underscore) returns the hash of
##           "parsed" date components used here
## A two-digit year gets expanded (00 to 30 => 2000 to 2030 and
## 31 to 99 => 1931 to 1999). If there is no year at all it gets
## calculated from the start date: the same year for dates on or after
## the start month/day; otherwise the year after.
## Raises ArgumentError if str cannot get parsed or if the year is
## missing and no start date was passed along.
def self.parse_date( str, start: nil )
  parts = _parse_date( str )
  raise ArgumentError, "unexpected date format; cannot parse >#{str}<"   unless parts

  month = parts[:m]
  day   = parts[:d]
  year  = parts[:y]
  yy    = parts[:yy]

  ## support two digit shortcut for year
  if year.nil? && yy
    year = yy <= 30 ? 2000+yy : 1900+yy
  end

  if year.nil?     ## try to calculate year
    raise ArgumentError, "year required in date >#{str}< or pass along start date"   if start.nil?

    on_or_after_start = month > start.month ||
                        (month == start.month && day >= start.day)
    ## assume same year as start_at event (e.g. 2013 for 2013/14 season)
    ##   or year+1 (e.g. 2014 for 2013/14 season)
    year = on_or_after_start ? start.year : start.year+1
  end

  Date.new( year, month, day )
end

.parse_names(txt) ⇒ Object



5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
# File 'lib/sportdb/parser/token-date--names.rb', line 5

## Parse txt into an array of lines (with words).
## Blank lines and comment lines (starting with #) get skipped;
## inline (until end-of-line) comments get stripped too
##   e.g. Janvier  Janv  Jan  ## check janv in use??
##   =>   Janvier  Janv  Jan
## Words are separated by space(s) or tab(s).
def self.parse_names( txt )
  txt.each_line.each_with_object( [] ) do |raw, lines|
    text = raw.strip
    next if text.empty? || text.start_with?( '#' )

    ## todo/fix -- add check for duplicates
    lines << text.sub( /#.*/, '' ).strip.split( /[ \t]+/ )
  end
end

Instance Method Details

#_build_date(m) ⇒ Object



59
# File 'lib/sportdb/parser/token-date--helpers.rb', line 59

## instance-level shortcut; passes m on to the class-level _build_date
def _build_date( m )
  self.class._build_date( m )
end

#_build_date_legs(m) ⇒ Object



60
# File 'lib/sportdb/parser/token-date--helpers.rb', line 60

## instance-level shortcut; passes m on to the class-level _build_date_legs
def _build_date_legs( m )
  self.class._build_date_legs( m )
end

#_build_duration(m) ⇒ Object



61
# File 'lib/sportdb/parser/token-date--helpers.rb', line 61

## instance-level shortcut; passes m on to the class-level _build_duration
def _build_duration( m )
  self.class._build_duration( m )
end

#_build_goal_count(m) ⇒ Object



71
# File 'lib/sportdb/parser/token-goals--helpers.rb', line 71

## instance-level shortcut; passes m on to the class-level _build_goal_count
def _build_goal_count( m )
  self.class._build_goal_count( m )
end

#_build_goal_minute(m) ⇒ Object



68
# File 'lib/sportdb/parser/token-goals--helpers.rb', line 68

## instance-level shortcut; passes m on to the class-level _build_goal_minute
def _build_goal_minute( m )
  self.class._build_goal_minute( m )
end

#_build_goal_minute_na(m) ⇒ Object



69
# File 'lib/sportdb/parser/token-goals--helpers.rb', line 69

## instance-level shortcut; passes m on to the class-level _build_goal_minute_na
def _build_goal_minute_na( m )
  self.class._build_goal_minute_na( m )
end

#_build_goal_type(m) ⇒ Object



72
# File 'lib/sportdb/parser/token-goals--helpers.rb', line 72

## instance-level shortcut; passes m on to the class-level _build_goal_type
def _build_goal_type( m )
  self.class._build_goal_type( m )
end

#_build_minute(m) ⇒ Object



70
# File 'lib/sportdb/parser/token-goals--helpers.rb', line 70

## instance-level shortcut; passes m on to the class-level _build_minute
def _build_minute( m )
  self.class._build_minute( m )
end

#_build_score(m) ⇒ Object



153
# File 'lib/sportdb/parser/token-score--helpers.rb', line 153

## instance-level shortcut; passes m on to the class-level _build_score
def _build_score( m )
  self.class._build_score( m )
end

#_build_score_abd(m) ⇒ Object



155
# File 'lib/sportdb/parser/token-score--helpers.rb', line 155

## instance-level shortcut; passes m on to the class-level _build_score_abd
def _build_score_abd( m )
  self.class._build_score_abd( m )
end

#_build_score_awd(m) ⇒ Object



154
# File 'lib/sportdb/parser/token-score--helpers.rb', line 154

## instance-level shortcut; passes m on to the class-level _build_score_awd
def _build_score_awd( m )
  self.class._build_score_awd( m )
end

#_build_score_full(m) ⇒ Object



156
# File 'lib/sportdb/parser/token-score--helpers.rb', line 156

## instance-level shortcut; passes m on to the class-level _build_score_full
def _build_score_full( m )
  self.class._build_score_full( m )
end

#_build_score_fuller(m) ⇒ Object



157
# File 'lib/sportdb/parser/token-score--helpers.rb', line 157

## instance-level shortcut; passes m on to the class-level _build_score_fuller
def _build_score_fuller( m )
  self.class._build_score_fuller( m )
end

#_build_score_fuller_more(m) ⇒ Object



158
# File 'lib/sportdb/parser/token-score--helpers.rb', line 158

## instance-level shortcut; passes m on to the class-level _build_score_fuller_more
def _build_score_fuller_more( m )
  self.class._build_score_fuller_more( m )
end

#_build_score_legs(m) ⇒ Object



159
# File 'lib/sportdb/parser/token-score--helpers.rb', line 159

## instance-level shortcut; passes m on to the class-level _build_score_legs
def _build_score_legs( m )
  self.class._build_score_legs( m )
end

#_build_status(m) ⇒ Object



121
# File 'lib/sportdb/parser/token-status.rb', line 121

## instance-level shortcut; passes m on to the class-level _build_status
def _build_status( m )
  self.class._build_status( m )
end

#_build_time(m) ⇒ Object



139
# File 'lib/sportdb/parser/token-time.rb', line 139

## instance-level shortcut; passes m on to the class-level _build_time
def _build_time( m )
  self.class._build_time( m )
end

#_info(*args) ⇒ Object



32
33
34
35
# File 'lib/sportdb/parser/lexer.rb', line 32

## Print an info message to stdout: the "[INFO] Lexer -- " prefix
## followed by every argument (via puts, that is, one per line).
## Returns the arguments (array).
def _info( *args )
  print "[INFO] Lexer -- "
  ## fix - print the single arg (was printing the complete args array
  ##         once per arg, that is, every arg got printed args.size times)
  args.each { |arg| puts arg }
end

#_on_goal(m, ctx:) ⇒ Object



19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# File 'lib/sportdb/parser/lexer-on_goal.rb', line 19

## Turn a match (MatchData) in goal mode into a token.
## Returns a Token or nil (for skipped spaces and for unexpected matches,
## the later get reported via ctx.warn_on_else).
## A closing ) switches the regex (@re) back to RE, that is, leaves goal mode.
def _on_goal( m, ctx: )
  case
  when m[:space] || m[:spaces]
    nil    ## skip space(s)
  when m[:goals_none]    ## note - eats-up semicolon!! e.g. -; or - ;
    ## note - use the literal text (was: [:GOALS_NONE,"<|GOALS_NONE|>"])
    Token.new( :GOALS_NONE, m[:goals_none],
               lineno: ctx.lineno, offset: m.offset(:goals_none))
  when m[:goal_sep_alt]  ## e.g. dash (-) WITH leading & trailing space required
    Token.new( :GOAL_SEP_ALT, m[:goal_sep_alt],
               lineno: ctx.lineno, offset: m.offset(:goal_sep_alt))
  when m[:prop_name]     ## note - change prop_name to player
    Token.new( :PLAYER, m[:name],
               lineno: ctx.lineno, offset: m.offset(:name))
  when m[:goal_minute]
    Token.new( :GOAL_MINUTE, m[:goal_minute],
               lineno: ctx.lineno, offset: m.offset(:goal_minute),
               value: _build_goal_minute( m ))
  when m[:goal_minute_na]
    ## note -  (re)use GOAL_MINUTE token; no extra GOAL_MINUTE_NA or such - why? why not?
    ##          make sure to handle the minute placeholder upstream!!!
    Token.new( :GOAL_MINUTE, m[:goal_minute_na],
               lineno: ctx.lineno, offset: m.offset(:goal_minute_na),
               value: _build_goal_minute_na( m ))
  when m[:goal_count]
    Token.new( :GOAL_COUNT, m[:goal_count],
               lineno: ctx.lineno, offset: m.offset(:goal_count),
               value: _build_goal_count( m ))
  when m[:sym]
    if m[:sym] == ')'    ## leave goal mode!!
      _trace( "LEAVE GOAL_RE MODE" )
      @re = RE
      ##  note - returns a (virtual) GOALS_END token and not a literal )
      Token.virtual( :GOALS_END, lineno: ctx.lineno  )
    else
      Token.literal( m[:sym], lineno: ctx.lineno, offset: m.offset(:sym))
    end
  else
    ctx.warn_on_else( m, mode: 'GOAL' )
    nil
  end
end

#_on_goal_alt(m, ctx:) ⇒ Object



82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# File 'lib/sportdb/parser/lexer-on_goal.rb', line 82

## Turn a match (MatchData) in goal alt(ernate) mode into a token.
## Returns a Token or nil (for skipped spaces and for unexpected matches,
## the later get reported via ctx.warn_on_else).
## A closing ) switches the regex (@re) back to RE, that is, leaves goal mode.
def _on_goal_alt( m, ctx: )
  case
  when m[:space] || m[:spaces]
    nil    ## skip space(s)
  when m[:prop_name]    ## note - change prop_name to player
    Token.new( :PLAYER, m[:name],
               lineno: ctx.lineno, offset: m.offset(:name))
  when m[:goal_minute]
    Token.new( :GOAL_MINUTE, m[:goal_minute],
               lineno: ctx.lineno, offset: m.offset(:goal_minute),
               value: _build_goal_minute( m ))
  when m[:goal_type]
    Token.new( :GOAL_TYPE, m[:goal_type],
               lineno: ctx.lineno, offset: m.offset(:goal_type),
               value: _build_goal_type( m ))
  when m[:score]
    Token.new( :SCORE, m[:score],
               lineno: ctx.lineno, offset: m.offset(:score),
               value: _build_score( m ))
  when m[:sym]
    if m[:sym] == ')'    ## leave goal mode!!
      _trace( "LEAVE GOAL_ALT_RE MODE" )
      @re = RE
      ##  note - returns a (virtual) GOALS_END token and not a literal )
      Token.virtual( :GOALS_END, lineno: ctx.lineno  )
    else
      Token.literal( m[:sym], lineno: ctx.lineno, offset: m.offset(:sym))
    end
  else
    ctx.warn_on_else( m, mode: 'GOAL_ALT' )
    nil
  end
end

#_on_goal_compat(m, ctx:) ⇒ Object

note - m is MatchData object



132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
# File 'lib/sportdb/parser/lexer-on_goal.rb', line 132

##
##  Turn one match made in GOAL_COMPAT mode into a token.
##
##  m   - MatchData object; the named captures get probed in a fixed order
##          (space/spaces, prop_name, minute, goal_type, score, sym)
##  ctx - supplies the current lineno and gets told (via warn_on_else)
##          about matches that hit none of the captures above
##
##  returns a token or nil (nil for skipped space(s) and for unknown matches)
##
##  note - a closing ")" switches the regex in use (@re) back to RE
##           and gets reported as a virtual GOALS_END token (not as a literal)
def _on_goal_compat( m, ctx: )
  ## space(s) never produce a token
  return nil  if m[:space] || m[:spaces]

  ## note - prop_name match gets passed along as PLAYER (using the name capture)
  if m[:prop_name]
    return Token.new( :PLAYER, m[:name],
                      lineno: ctx.lineno,
                      offset: m.offset(:name) )
  end

  if m[:minute]
    return Token.new( :MINUTE, m[:minute],
                      lineno: ctx.lineno,
                      offset: m.offset(:minute),
                      value:  _build_minute( m ) )
  end

  if m[:goal_type]
    return Token.new( :GOAL_TYPE, m[:goal_type],
                      lineno: ctx.lineno,
                      offset: m.offset(:goal_type),
                      value:  _build_goal_type( m ) )
  end

  if m[:score]
    return Token.new( :SCORE, m[:score],
                      lineno: ctx.lineno,
                      offset: m.offset(:score),
                      value:  _build_score( m ) )
  end

  sym = m[:sym]
  unless sym
    ## nothing known matched - report and emit no token
    ctx.warn_on_else( m, mode: 'GOAL_COMPAT' )
    return nil
  end

  ## all symbols except the closing paren get passed along as is
  unless sym == ')'
    return Token.literal( sym, lineno: ctx.lineno, offset: m.offset(:sym) )
  end

  ## ")" - leave goal mode!!
  _trace( "LEAVE GOAL_COMPAT_RE MODE" )
  @re = RE
  Token.virtual( :GOALS_END, lineno: ctx.lineno )
end

#_on_group_def(m, ctx:) ⇒ Object

note - m is MatchData object



14
15
16
17
18
19
20
21
22
23
24
25
26
27
# File 'lib/sportdb/parser/lexer-on_group_def.rb', line 14

##
##  Turn one match made in GROUP_DEF mode into a token.
##
##  m   - MatchData object; probed for spaces/space, text and sym (in that order)
##  ctx - supplies the current lineno and gets told (via warn_on_else)
##          about matches that hit none of the captures above
##
##  returns a token or nil (nil for skipped spaces and for unknown matches)
def _on_group_def( m, ctx: )
  ## spaces never produce a token
  return nil  if m[:spaces] || m[:space]

  ## note - text in a group definition gets passed along as TEAM
  if m[:text]
    return Token.new( :TEAM, m[:text],
                      lineno: ctx.lineno,
                      offset: m.offset(:text) )
  end

  ## symbols get passed along as is
  if m[:sym]
    return Token.literal( m[:sym], lineno: ctx.lineno, offset: m.offset(:sym) )
  end

  ## nothing known matched - report and emit no token
  ctx.warn_on_else( m, mode: 'GROUP_DEF' )
  nil
end

#_on_prop_attendance(m, ctx:) ⇒ Object

note - m is MatchData object



45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# File 'lib/sportdb/parser/lexer-on_prop_misc.rb', line 45

##
##  Turn one match made in PROP_ATTENDANCE mode into a token.
##
##  m   - MatchData object; probed for space/spaces, enclosed_name and num
##          (in that order)
##  ctx - supplies the current lineno and gets told (via warn_on_else)
##          about matches that hit none of the captures above
##
##  returns a token or nil (nil for skipped space(s) and for unknown matches)
def _on_prop_attendance( m, ctx: )
  ## space(s) never produce a token
  return nil  if m[:space] || m[:spaces]

  ## note - enclosed name is reserved for (future) use e.g. sold out or such;
  ##          the token text is the name capture
  if m[:enclosed_name]
    return Token.new( :ENCLOSED_NAME, m[:name],
                      lineno: ctx.lineno,
                      offset: m.offset(:name) )
  end

  ## number - token value is the value capture converted to a (base 10) integer
  if m[:num]
    return Token.new( :PROP_NUM, m[:num],
                      lineno: ctx.lineno,
                      offset: m.offset(:num),
                      value:  m[:value].to_i(10) )
  end

  ## nothing known matched - report and emit no token
  ctx.warn_on_else( m, mode: 'PROP_ATTENDANCE' )
  nil
end

#_on_prop_cards(m, ctx:) ⇒ Object

note - m is MatchData object



17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# File 'lib/sportdb/parser/lexer-on_prop_misc.rb', line 17

##
##  Turn one match made in PROP_CARDS mode into a token.
##
##  m   - MatchData object; probed for space/spaces, prop_name, minute and sym
##          (in that order)
##  ctx - supplies the current lineno and gets told (via warn_on_else)
##          about matches that hit none of the captures above
##
##  returns a token or nil (nil for skipped space(s) and for unknown matches)
def _on_prop_cards( m, ctx: )
  ## space(s) never produce a token
  return nil  if m[:space] || m[:spaces]

  ## note - token text is the name capture,
  ##          but the offset is the one of the (complete) prop_name match
  if m[:prop_name]
    return Token.new( :PROP_NAME, m[:name],
                      lineno: ctx.lineno,
                      offset: m.offset(:prop_name) )
  end

  if m[:minute]
    return Token.new( :MINUTE, m[:minute],
                      lineno: ctx.lineno,
                      offset: m.offset(:minute),
                      value:  _build_minute( m ) )
  end

  ## symbols get passed along as is
  if m[:sym]
    return Token.literal( m[:sym], lineno: ctx.lineno, offset: m.offset(:sym) )
  end

  ## nothing known matched - report and emit no token
  ctx.warn_on_else( m, mode: 'PROP_CARDS' )
  nil
end

#_on_prop_lineup(m, ctx:) ⇒ Object

note - m is MatchData object



22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# File 'lib/sportdb/parser/lexer-on_prop_lineup.rb', line 22

##
##  Turn one match made in PROP_LINEUP mode into a token.
##
##  m   - MatchData object; the named captures get probed in a fixed order
##          (space/spaces, prop_key, inline_captain, inline_yellow, inline_red,
##           inline_yellow_red, prop_name, minute, sym)
##  ctx - supplies the current lineno and gets told (via warn_on_else)
##          about matches that hit none of the captures above
##
##  returns a token or nil
##     (nil for skipped space(s), for unsupported inline prop keys
##       and for unknown matches)
def _on_prop_lineup( m, ctx: )
  ## space(s) never produce a token
  return nil  if m[:space] || m[:spaces]

  ## inline prop keys - supported for now are coach/trainer (add manager?)
  if m[:prop_key]
    case m[:key].downcase
    when 'coach', 'trainer'
      ## use PROP_COACH or COACH_KEY or such - why? why not?
      return Token.new( :COACH, m[:key],
                        lineno: ctx.lineno,
                        offset: m.offset(:key) )
    else
      ## todo - report error for unknown (inline) prop key in lineup
      return nil
    end
  end

  if m[:inline_captain]
    return Token.new( :INLINE_CAPTAIN, m[:inline_captain],
                      lineno: ctx.lineno,
                      offset: m.offset(:inline_captain) )
  end

  ## inline cards - all three share the same token value (card hash);
  ##   minute (:m) and offset get added only if the capture is present
  ##   note - token type is the upcased capture name e.g. :INLINE_YELLOW
  %i[inline_yellow inline_red inline_yellow_red].each do |name|
    next  unless m[name]

    card = {}
    card[:m]      = m[:minute].to_i(10)  if m[:minute]
    card[:offset] = m[:offset].to_i(10)  if m[:offset]
    return Token.new( name.upcase, m[name],
                      lineno: ctx.lineno,
                      offset: m.offset(name),
                      value:  card )
  end

  ## note - token text is the name capture,
  ##          but the offset is the one of the (complete) prop_name match
  if m[:prop_name]
    return Token.new( :PROP_NAME, m[:name],
                      lineno: ctx.lineno,
                      offset: m.offset(:prop_name) )
  end

  if m[:minute]
    return Token.new( :MINUTE, m[:minute],
                      lineno: ctx.lineno,
                      offset: m.offset(:minute),
                      value:  _build_minute( m ) )
  end

  ## symbols get passed along as is
  if m[:sym]
    return Token.literal( m[:sym], lineno: ctx.lineno, offset: m.offset(:sym) )
  end

  ## nothing known matched - report and emit no token
  ctx.warn_on_else( m, mode: 'PROP_LINEUP' )
  nil
end

#_on_prop_penalties(m, ctx:) ⇒ Object

note - m is MatchData object



16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# File 'lib/sportdb/parser/lexer-on_prop_penalties.rb', line 16

##
##  Turn one match made in PROP_PENALTIES mode into a token.
##
##  m   - MatchData object; probed for space/spaces, prop_name, enclosed_name,
##          score and sym (in that order)
##  ctx - supplies the current lineno and gets told (via warn_on_else)
##          about matches that hit none of the captures above
##
##  returns a token or nil (nil for skipped space(s) and for unknown matches)
def _on_prop_penalties( m, ctx: )      ## note - m is MatchData object
         if m[:space] || m[:spaces]
              nil    ## skip space(s)
         elsif m[:prop_name]    ## note - change prop_name to player
              ## note - token text is the name capture,
              ##          but the offset is the one of the (complete) prop_name match
              Token.new(:PROP_NAME, m[:name],
                               lineno: ctx.lineno, offset: m.offset(:prop_name))
         elsif m[:enclosed_name]
              ## use HOLD,SAVE,POST or such keys - why? why not?
             Token.new(:ENCLOSED_NAME, m[:name],
                             lineno: ctx.lineno, offset: m.offset(:name))
         elsif m[:score]
             Token.new( :SCORE, m[:score],
                              lineno: ctx.lineno, offset: m.offset(:score),
                              value: _build_score( m ))
         elsif m[:sym]
              ## symbols get passed along as is
              Token.literal( m[:sym], lineno: ctx.lineno, offset: m.offset(:sym))
         else
            ## fix - mode name was 'PROP_PENALTIES ' (with stray trailing space);
            ##         keep it in line with the mode names of the other handlers
            ctx.warn_on_else( m, mode: 'PROP_PENALTIES' )
            nil
         end
end

#_on_prop_referee(m, ctx:) ⇒ Object

note - m is MatchData object



75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# File 'lib/sportdb/parser/lexer-on_prop_misc.rb', line 75

##
##  Turn one match made in PROP_REFEREE mode into a token.
##
##  m   - MatchData object; the named captures get probed in a fixed order
##          (space/spaces, prop_key, prop_name, num, enclosed_name, sym)
##  ctx - supplies the current lineno and gets told (via warn_on_else)
##          about matches that hit none of the captures above
##
##  returns a token or nil
##     (nil for skipped space(s), for unsupported inline prop keys
##       and for unknown matches)
def _on_prop_referee( m, ctx: )
  ## space(s) never produce a token
  return nil  if m[:space] || m[:spaces]

  ## inline prop keys - supported for now are att/attn/attendance
  if m[:prop_key]
    case m[:key].downcase
    when 'att', 'attn', 'attendance'
      ## use ATTENDANCE_PROP or ATTENDANCE_KEY or such - why? why not?
      return Token.new( :ATTENDANCE, m[:key],
                        lineno: ctx.lineno,
                        offset: m.offset(:key) )
    else
      ## todo - report error for unknown (inline) prop key
      return nil
    end
  end

  ## note - token text is the name capture,
  ##          but the offset is the one of the (complete) prop_name match
  if m[:prop_name]
    return Token.new( :PROP_NAME, m[:name],
                      lineno: ctx.lineno,
                      offset: m.offset(:prop_name) )
  end

  ## number - token value is the value capture converted to a (base 10) integer
  if m[:num]
    return Token.new( :PROP_NUM, m[:num],
                      lineno: ctx.lineno,
                      offset: m.offset(:num),
                      value:  m[:value].to_i(10) )
  end

  ## enclosed name - token text is the name capture
  if m[:enclosed_name]
    return Token.new( :ENCLOSED_NAME, m[:name],
                      lineno: ctx.lineno,
                      offset: m.offset(:name) )
  end

  ## symbols get passed along as is
  if m[:sym]
    return Token.literal( m[:sym], lineno: ctx.lineno, offset: m.offset(:sym) )
  end

  ## nothing known matched - report and emit no token
  ctx.warn_on_else( m, mode: 'PROP_REFEREE' )
  nil
end

#_on_round_def(m, ctx:) ⇒ Object

note - m is MatchData object



15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# File 'lib/sportdb/parser/lexer-on_round_def.rb', line 15

##
##  Turn one match made in ROUND_DEF mode into a token.
##
##  m   - MatchData object; probed for spaces/space, date, duration and sym
##          (in that order)
##  ctx - supplies the current lineno and gets told (via warn_on_else)
##          about matches that hit none of the captures above
##
##  returns a token or nil (nil for skipped spaces and for unknown matches)
def _on_round_def( m, ctx: )
  ## spaces never produce a token
  return nil  if m[:spaces] || m[:space]

  if m[:date]
    return Token.new( :DATE, m[:date],
                      lineno: ctx.lineno,
                      offset: m.offset(:date),
                      value:  _build_date( m ) )
  end

  if m[:duration]
    return Token.new( :DURATION, m[:duration],
                      lineno: ctx.lineno,
                      offset: m.offset(:duration),
                      value:  _build_duration( m ) )
  end

  ## symbols get passed along as is
  if m[:sym]
    return Token.literal( m[:sym], lineno: ctx.lineno, offset: m.offset(:sym) )
  end

  ## nothing known matched - report and emit no token
  ctx.warn_on_else( m, mode: 'ROUND_DEF' )
  nil
end

#_on_top(m, ctx:) ⇒ Object

note - m is MatchData object



5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# File 'lib/sportdb/parser/lexer-on_top.rb', line 5

##
##  Turn one top-level match into a token.
##
##  m   - MatchData object; the named captures get probed one-by-one
##          in the order below - the first capture set wins
##  ctx - supplies the current lineno and gets told (via warn_on_else)
##          about matches that hit none of the known captures
##
##  returns a token or nil (nil for skipped space(s) and for unknown matches)
##
##  note - two symbols switch the regex in use (@re):
##           "@"  enters geo mode   (GEO_RE)  and resets @geo_count
##           "("  enters goal mode  (GOAL_RE) and gets replaced
##                  by a virtual INLINE_GOALS token
def _on_top( m, ctx: )
  ## space(s) never produce a token
  return nil  if m[:space] || m[:spaces]

  ##  note - top-level (for now always) assumes TEAM for TEXT match!!
  ##           fix/fix/fix change TEXT_RE/:text to  TEAM_RE/:team !!!
  if m[:text]
    return Token.new( :TEAM, m[:text],
                      lineno: ctx.lineno, offset: m.offset(:text) )
  end

  ## home/away/neutral team markers
  if m[:team_home]
    return Token.new( :TEAM_HOME, m[:team_home],
                      lineno: ctx.lineno, offset: m.offset(:team_home) )
  end
  if m[:team_away]
    return Token.new( :TEAM_AWAY, m[:team_away],
                      lineno: ctx.lineno, offset: m.offset(:team_away) )
  end
  if m[:team_neutral]
    return Token.new( :TEAM_NEUTRAL, m[:team_neutral],
                      lineno: ctx.lineno, offset: m.offset(:team_neutral) )
  end

  ## (match) status e.g. cancelled, awarded, etc.
  ##  inline:  w/o - walkover
  ##           n/p - not played
  ##           bye
  ##           abd/abd. - abandoned
  ##           void
  ##           susp/susp. - suspended
  ##           ppd/ppd. or postp/postp. - postponed
  ##           awd/awd. - awarded
  ##           canc/canc. - cancelled/canceled
  if m[:inline_wo]
    return Token.new( :INLINE_WO, m[:inline_wo],
                      lineno: ctx.lineno, offset: m.offset(:inline_wo) )
  end
  if m[:inline_np]
    return Token.new( :INLINE_NP, m[:inline_np],
                      lineno: ctx.lineno, offset: m.offset(:inline_np) )
  end
  if m[:inline_bye]
    return Token.new( :INLINE_BYE, m[:inline_bye],
                      lineno: ctx.lineno, offset: m.offset(:inline_bye) )
  end
  if m[:inline_abd]
    return Token.new( :INLINE_ABD, m[:inline_abd],
                      lineno: ctx.lineno, offset: m.offset(:inline_abd) )
  end
  if m[:inline_void]
    return Token.new( :INLINE_VOID, m[:inline_void],
                      lineno: ctx.lineno, offset: m.offset(:inline_void) )
  end
  if m[:inline_susp]
    return Token.new( :INLINE_SUSP, m[:inline_susp],
                      lineno: ctx.lineno, offset: m.offset(:inline_susp) )
  end
  if m[:inline_ppd]
    return Token.new( :INLINE_PPD, m[:inline_ppd],
                      lineno: ctx.lineno, offset: m.offset(:inline_ppd) )
  end
  if m[:inline_awd]
    return Token.new( :INLINE_AWD, m[:inline_awd],
                      lineno: ctx.lineno, offset: m.offset(:inline_awd) )
  end
  if m[:inline_canc]
    return Token.new( :INLINE_CANC, m[:inline_canc],
                      lineno: ctx.lineno, offset: m.offset(:inline_canc) )
  end
  if m[:status]
    return Token.new( :STATUS, m[:status],
                      lineno: ctx.lineno, offset: m.offset(:status),
                      value:  _build_status( m ) )
  end

  ###  todo/check:
  ##      use value hash for note - why? why not?
  if m[:note]
    return Token.new( :NOTE, m[:note],
                      lineno: ctx.lineno, offset: m.offset(:note) )
  end

  ## note - for token id use INLINE_ATTENDANCE  (ATTENDANCE in use for prop!!!)
  ##   token value is a hash with the value capture as integer
  ##     (underscores e.g. 18_000 get removed first)
  if m[:attendance]
    att = { value: m[:value].gsub( '_', '' ).to_i(10) }
    return Token.new( :INLINE_ATTENDANCE, m[:attendance],
                      lineno: ctx.lineno, offset: m.offset(:attendance),
                      value:  att )
  end

  ## time & dates
  if m[:time]
    return Token.new( :TIME, m[:time],
                      lineno: ctx.lineno, offset: m.offset(:time),
                      value:  _build_time( m ) )
  end
  if m[:date]
    return Token.new( :DATE, m[:date],
                      lineno: ctx.lineno, offset: m.offset(:date),
                      value:  _build_date( m ) )
  end
  if m[:date_legs]
    return Token.new( :DATE_LEGS, m[:date_legs],
                      lineno: ctx.lineno, offset: m.offset(:date_legs),
                      value:  _build_date_legs( m ) )
  end

  ## scores
  if m[:score_legs]
    return Token.new( :SCORE_LEGS, m[:score_legs],
                      lineno: ctx.lineno, offset: m.offset(:score_legs),
                      value:  _build_score_legs( m ) )
  end
  if m[:score_full]
    return Token.new( :SCORE_FULL, m[:score_full],
                      lineno: ctx.lineno, offset: m.offset(:score_full),
                      value:  _build_score_full( m ) )
  end
  if m[:score_fuller]
    return Token.new( :SCORE_FULLER, m[:score_fuller],
                      lineno: ctx.lineno, offset: m.offset(:score_fuller),
                      value:  _build_score_fuller( m ) )
  end
  if m[:score_fuller_more]
    return Token.new( :SCORE_FULLER_MORE, m[:score_fuller_more],
                      lineno: ctx.lineno, offset: m.offset(:score_fuller_more),
                      value:  _build_score_fuller_more( m ) )
  end
  if m[:score]
    return Token.new( :SCORE, m[:score],
                      lineno: ctx.lineno, offset: m.offset(:score),
                      value:  _build_score( m ) )
  end
  if m[:score_awd]
    return Token.new( :SCORE_AWD, m[:score_awd],
                      lineno: ctx.lineno, offset: m.offset(:score_awd),
                      value:  _build_score_awd( m ) )
  end
  if m[:score_abd]
    return Token.new( :SCORE_ABD, m[:score_abd],
                      lineno: ctx.lineno, offset: m.offset(:score_abd),
                      value:  _build_score_abd( m ) )
  end

  if m[:vs]
    return Token.new( :VS, m[:vs],
                      lineno: ctx.lineno, offset: m.offset(:vs) )
  end

  sym = m[:sym]
  unless sym
    ## nothing known matched - report and emit no token
    ctx.warn_on_else( m )
    return nil
  end

  ## return symbols "inline" as is - why? why not?
  if sym == '@'       ##  enter geo mode
    _trace( 'ENTER GEO_RE MODE' )
    @re        = GEO_RE
    @geo_count = 0
    Token.literal( sym, lineno: ctx.lineno, offset: m.offset(:sym) )
  elsif sym == '('    ## enter goal scorer mode on "free-floating" open parenthesis!!!
    _trace( 'ENTER GOAL_RE MODE' )
    @re = GOAL_RE
    ## note - eat-up ( for now; do NOT pass along as token
    ##       pass along "virtual" INLINE GOALS - why? why not?
    Token.virtual( :INLINE_GOALS, lineno: ctx.lineno )
  else
    Token.literal( sym, lineno: ctx.lineno, offset: m.offset(:sym) )
  end
end

#_prep_doc(txt) ⇒ Object



45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
# File 'lib/sportdb/parser/lexer-prep_doc.rb', line 45

##
##  Preprocess the complete document text (before tokenizing line-by-line).
##
##  steps (in order):
##    1) normalize unicode to composed chars (nfc)
##         e.g. e (101) + combining acute (769) => é (233)
##    2) "universal" newlines - turn windows-style cr+lf (\r\n) into lf (\n)
##    3) replace html-style comments <!-- --> (HTML_COMMENT_RE) with two spaces
##    4) join multi-line note/nota bene blocks (PREPROC_NOTA_BENE_RE)
##         into a single line by removing the newlines
##    5) join multi-line blocks (PREPROC_BLOCK_RE)
##         into a single line by removing the newlines
##
##  note - empty lines and leading spaces (indent) are NOT touched here
##
##  note - removing multi-line comments / joining multi-line blocks
##           will mess-up lineno tracking!!!
##         todo/fix - keep lineno intact - why? why not?
##
##  returns the preprocessed text (new string)
def _prep_doc( txt )
  ## shared by step 4 & 5 - single-line matches get passed through as is
  join_lines = lambda do |label, chunk|
    if chunk.include?( "\n" )
      _trace( label, chunk )
      chunk.delete( "\n" )
    else
      chunk
    end
  end

  doc = txt.unicode_normalize( :nfc ).gsub( "\r\n", "\n" )

  ## quick hack for now - comments (incl. multi-line) get replaced by two spaces
  doc = doc.gsub( HTML_COMMENT_RE ) do |comment|
    _trace( 'preproc html comment:', comment )
    '  '
  end

  doc = doc.gsub( PREPROC_NOTA_BENE_RE ) do |chunk|
    join_lines.call( 'preproc (multi-line) note/nota bene block:', chunk )
  end

  doc.gsub( PREPROC_BLOCK_RE ) do |chunk|
    join_lines.call( 'preproc (multi-line) block:', chunk )
  end
end

#_prep_line(line) ⇒ Object

auto-fix checks line-by-line



8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# File 'lib/sportdb/parser/lexer-prep_line.rb', line 8

##
##  Auto-fix checks line-by-line.
##
##  Replaces a handful of "look-alike" characters with the ascii version
##   and reports every single replacement via _warn:
##
##    - tab (\t)                              =>  two spaces
##    - U+00A0 (160)  non-breaking space      =>  space
##    - U+2013 (8211) en dash,
##      U+2212 (8722) minus sign              =>  ascii dash (-)
##    - smart single quotes                   =>  ascii quote (')
##    - smart double quotes                   =>  ascii double quote (")
##
##  todo/add - report errors (not only warnings) - why? why not?
##  todo     - add more unsmart quotes
##
##  returns the fixed line (new string)
def _prep_line( line )
  ## every entry => [pattern, ascii replacement, warning message builder]
  ##   note - fixes run in this order;
  ##          the line quoted in the warning is the line as seen
  ##          by the fix in question (that is, with all fixes before applied)
  fixes = [
    ["\t", '  ',
     ->( _ch, src ) { "auto-fix; replacing tab (\\t) with two spaces in line #{src.inspect}" }],
    ["\u00A0", ' ',
     ->( ch, src ) { "auto-fix; replacing non-breaking unicode space (#{ch}/#{ch.ord}) w/ ascii space ( /#{" ".ord}) in line #{src.inspect}" }],
    [/[–−]/, '-',
     ->( ch, src ) { "auto-fix; replacing unicode dash (#{ch}/#{ch.ord}) w/ ascii dash (-/#{"-".ord}) in line #{src.inspect}" }],
    [/[‘’]/, "'",
     ->( ch, src ) { "auto-fix; replacing unicode (smart) quote (#{ch}/#{ch.ord}) w/ ascii quote ('/#{"'".ord}) in line #{src.inspect}" }],
    [/[“”]/, '"',
     ->( ch, src ) { %Q{auto-fix; replacing unicode (smart) double quote (#{ch}/#{ch.ord}) w/ ascii double quote ("/#{'"'.ord}) in line #{src.inspect}} }],
  ]

  fixes.reduce( line ) do |src, (pattern, ascii, describe)|
    src.gsub( pattern ) do |ch|
      _warn( describe.call( ch, src ) )   ## one warning per replaced char
      ascii
    end
  end
end

#_tokenize_line(line, lineno) ⇒ Object



72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
# File 'lib/sportdb/parser/lexer-tokenize.rb', line 72

# Tokenizes a single (already prepped) line into a list of tokens.
#
# The lexer is mode-based: @re holds the regex (mode) currently in use
# and is kept between calls, so that multi-line constructs (goal lines,
# props continued with a trailing separator) carry over to the next line.
# At top-level (@re == RE) the start of the line is checked once for
# line-starting constructs (ord, year, group def, prop key, round def /
# round outline, goal line) that may emit a first token and switch mode.
# The rest of the line is then matched token-by-token with the active regex.
#
# Skipped text (no match at the current position, or an unmatched tail)
# is recorded as an error message, appended to the log via log() and
# printed as a warning.
#
# @param line   [String]  the line text (no trailing newline)
# @param lineno [Integer] line number, passed along into every token
# @return [Array] two-element array [tokens, errors] - the tokens of
#   the line and the parse error messages (strings) collected for it
def _tokenize_line( line, lineno )
  tokens = []
  errors = []   ## keep a list of errors - why? why not?


  pos = 0        ## note - usually same as offset[1] aka offset[end] after match
  ## track last offset (begin/end) - to report error on no match
  ##   or no match in end of string
  offset = [0,0]
  m = nil

  ## track number of geo text seen
  ##    (use for - do NOT break on two spaces if no geo text seen yet!!)
  @geo_count = 0

  ####
  ## quick hack - keep re state/mode between tokenize calls!!!
  @re  ||= RE     ## note - switch between RE & INSIDE_RE


  if @re == RE  ## top-level
    ### check for modes once (per line) here to speed-up parsing
    ###   for now goals only possible for start of line!!
    ###        fix - remove optional [] - why? why not?

    ####
    ## note - ord e.g. (45) for match number can only start a (match) line
    ##                "inline" use NOT possible
    ## note -  ord (for ordinal number!!!) e.g match number (1), (42), etc.
    if (m = START_WITH_ORD.match(line))
       ## note -  strip enclosing () and convert to integer
       tokens << Token.new(:ORD, m[:ord],
                                lineno: lineno, offset: m.offset(:ord),
                                value: m[:value].to_i(10)  )

       offset = m.offset(0)
       pos    = offset[1]      ## update pos
    elsif (m = START_WITH_YEAR.match(line))
       tokens << Token.new(:YEAR, m[:year],
                                 lineno: lineno, offset: m.offset(:year),
                                 value:  m[:year].to_i(10) )

       offset = m.offset(0)
       pos    = offset[1]    ## update pos

    elsif (m = START_WITH_GROUP_DEF_LINE_RE.match( line ))
      _trace( "ENTER GROUP_DEF_RE MODE" )
      @re = GROUP_DEF_RE

      tokens << Token.new( :GROUP_DEF, m[:group_def],
                               lineno: lineno, offset: m.offset(:group_def) )


      offset = m.offset(0)
      pos = offset[1]    ## update pos

    elsif (m = START_WITH_PROP_KEY_RE.match( line ))
      ##  start with prop key (match will switch into prop mode!!!)
      ##   - fix - remove leading spaces in regex (upstream) - why? why not?
      ##
      ###  switch into new mode
      ##  switch context  to PROP_RE
        _trace("ENTER PROP_RE MODE" )
        key = m[:key]


        ### todo/fix - add prop yellow/red cards too - why? why not?
        ##  todo/fix - separate sent off and red card
        ##     sent-off - incl. red card, yellow/red card and the era before red cards!!
        if ['sent off'].include?( key.downcase)
          @re = PROP_CARDS_RE    ## use CARDS_RE ???
          tokens << Token.new(:PROP_SENTOFF, m[:key],
                                   lineno: lineno, offset: m.offset(:key))
        elsif ['red cards'].include?( key.downcase )
          @re = PROP_CARDS_RE    ## use CARDS_RE ???
          tokens << Token.new(:PROP_REDCARDS, m[:key],
                                   lineno: lineno, offset: m.offset(:key))
        elsif ['yellow cards'].include?( key.downcase )
          @re = PROP_CARDS_RE
          tokens << Token.new(:PROP_YELLOWCARDS, m[:key],
                                   lineno: lineno, offset: m.offset(:key))
        elsif ['ref', 'referee',
               'refs', 'referees'   ## note - allow/support assistant refs
              ].include?( key.downcase )
          @re = PROP_REFEREE_RE
          tokens << Token.new(:PROP_REFEREE, m[:key],
                                   lineno: lineno, offset: m.offset(:key))
        elsif ['att', 'attn', 'attendance'].include?( key.downcase )
          @re = PROP_ATTENDANCE_RE
          tokens << Token.new(:PROP_ATTENDANCE, m[:key],
                                   lineno: lineno, offset: m.offset(:key))

     #   elsif ['goals'].include?( key.downcase )
     #     @re = PROP_GOAL_RE
     #     tokens << [:PROP_GOALS, m[:key]]

        elsif ['penalties',
               'penalty shootout',
               'penalty shoot-out',
               'penalty kicks'].include?( key.downcase )
          @re = PROP_PENALTIES_RE
          tokens << Token.new(:PROP_PENALTIES, m[:key],
                                  lineno: lineno, offset: m.offset(:key))
        else   ## assume (team) line-up
          @re = PROP_LINEUP_RE
          ## fix-fix-fix - rename to PROP_LINEUP !!
          tokens << Token.new(:PROP, m[:key],
                                 lineno: lineno, offset: m.offset(:key))
        end

        offset = m.offset(0)
        pos    = offset[1]     ## update pos
    ###
    ### todo/fix
    ###   rename to START_WITH_ROUND_DEF_OUTLINE_RE !!!!
    elsif (m = ROUND_DEF_OUTLINE_RE.match( line ))
      _trace( "ENTER ROUND_DEF_RE MODE" )
      @re = ROUND_DEF_RE

      ## note - return ROUND_DEF NOT  ROUND_OUTLINE token
      ##   fix - add leading ▪ too!!
      tokens << Token.new( :ROUND_DEF, m[:round_outline],
                            lineno: lineno, offset: m.offset(:round_outline))

      offset = m.offset(0)
      pos    = offset[1]    ## update pos
    elsif (m = ROUND_OUTLINE_RE.match( line ))
      _trace( "ROUND_OUTLINE" )
      ## note - derive round level from no of (leading) markers
      ##             e.g. ▪/:: is 1, ▪▪/::: is 2, ▪▪▪/:::: is 3, etc.
      ##       note  - ascii-style starts with double ::, thus, autodecrement by one!
      round_level = m[:round_marker].size
      round_level -= 1  if m[:round_marker].start_with?( '::' )

      tokens << Token.new( :ROUND_OUTLINE, m[:round_outline],
                           lineno: lineno, offset: m.offset(:round_outline),
                           value: { outline: m[:round_outline],
                                    level: round_level})

      ## note - eats-up line for now (change later to only eat-up marker e.g. »|>>)
      offset = m.offset(0)
      pos    = offset[1]       ## update pos
    elsif (m = START_GOAL_LINE_RE.match( line ))   ## line starting with ( - assume
      ##  switch context to GOAL_RE (goalline(s))
      ####
      ##  note - check for alternate goal line styles / formats
      if START_GOAL_LINE_COMPAT_RE.match(line )
        ## "legacy" style starting with minute e.g.
        ##  (6 Puskás 0-1, 9 Czibor 0-2, 11 Morlock 1-2, 18 Rahn 2-2,
        ##    84 Rahn 3-2)
        @re = GOAL_COMPAT_RE
        _trace( "ENTER GOAL_COMPAT_RE MODE" )

        tokens << Token.virtual( :GOALS_COMPAT, lineno: lineno )
      elsif START_GOAL_LINE_ALT_RE.match( line )
        ##  goals with scores e.g.
        ##    (1-0 Franck Ribéry, 2-0 Ivica Olić, 2-1 Wayne Rooney)
        ##         -or-
        ##      (Dion Beljo  1-0
        ##                   1-1  Andreas Gruber
        ##   Matthias Seidl  2-1)
        @re = GOAL_ALT_RE
        _trace( "ENTER GOAL_ALT_RE MODE" )

        tokens << Token.virtual( :GOALS_ALT, lineno: lineno )
      else
        ## "standard" / default style
        @re = GOAL_RE
        _trace( "ENTER GOAL_RE MODE" )

        tokens << Token.virtual( :GOALS, lineno: lineno )
      end

      ## note - eat-up ( for now
      ##   pass along "virtual" GOALS or GOALS_ALT token
      ##      (see INLINE_GOALS for the starting goal line inline)
      ##
      ## fix-fix-fix
      ##  keep offset at [0,0] - why? why not?
      ##    do NOT eat-up
      ##   or better
      ##    add tokens << Token.literal( '(', lineno: lineno, offset: ...) !!!
      offset = m.offset(0)
      pos    = offset[1]      ## update pos
    end
  end



  old_pos = -1   ## allows to backtrack to old pos (used in geo)




  ctx = Context.new( self,
                     line:   line,
                     lineno: lineno,
                     errors: errors )


  while m = @re.match( line, pos )
    # if debug?
    #  pp m
    #  puts "pos: #{pos}"
    # end
    offset = m.offset(0)
    ctx.offset = offset



    if offset[0] != pos
      ## match NOT starting at start/begin position!!!
      ##  report parse error!!!
      msg =  "parse error (tokenize) - skipping >#{line[pos..(offset[0]-1)]}< in line #{lineno}@#{offset[0]},#{offset[1]} >#{line}<"
      errors << msg

      log( msg )
      puts "!! WARN - #{msg}"
    end


    ##
    ## todo/fix - also check if possible
    ##   if no match but not yet end off string!!!!
    ##    report skipped text run too!!!

    old_pos = pos
    pos     = offset[1]

#    pp offset  if debug?

    ##
    ## note: racc requires pairs e.g. [:TOKEN, VAL]
    ##         for VAL use "text" or ["text", { opts }]  array



  t = if    @re == ROUND_DEF_RE      then   _on_round_def( m, ctx: ctx )
      elsif @re == GROUP_DEF_RE      then   _on_group_def( m, ctx: ctx )
      elsif @re == GEO_RE
           ### note - possibly end inline geo on [ (and others?? in the future
           ## note: break on double spaces e.g.
           ## e.g. Jul/16 @ Arena Auf Schalke, Gelsenkirchen  Serbia 0-1 England
           if m[:spaces]
                 ### note - do NOT break out
                 ##           if not text seen yet!!!
                 if @geo_count > 0
                    ## get out-off geo mode and backtrack (w/ next)
                    ##
                    ## todo/fix
                    ##   add virtual geo_end token!!!
                    _trace( "LEAVE GEO_RE MODE, BACK TO TOP_LEVEL/RE" )
                    @re = RE
                    pos = old_pos
                    next   ## backtrack (resume new loop step)
                 else
                     nil   ## skip spaces
                 end
           elsif m[:space]
               nil    ## skip (single) space
           elsif m[:text]
               @geo_count += 1
                ## keep pos - why? why not?
               Token.new(:GEO, m[:text],
                                lineno: lineno, offset: m.offset(:text))
           elsif m[:geo_end]   ## "hacky" special comma; always ends geo mode!!!
                 ## get out-off geo mode and backtrack (w/ next)
                    ## todo/fix
                    ##   add (semi-) virtual geo_end token!!!
                 _trace( "LEAVE GEO_RE MODE, BACK TO TOP_LEVEL/RE" )
                 @re = RE
                 pos = old_pos
                 next   ## backtrack (resume new loop step)
           elsif m[:sym]
              case m[:sym]
                ## note - reset geo_count to 0 (avoids break on two spaces)
                ##                     if separator seen!!
              when ',' then @geo_count = 0
                            Token.literal( m[:sym], lineno: lineno, offset: m.offset(:sym))
              when '›' then @geo_count = 0;
                            Token.literal( ',', lineno: lineno, offset: m.offset(:sym))
                                ## note - treat geo sep › (unicode) like comma for now!!!
              when '>' then @geo_count = 0;
                            Token.literal( ',', lineno: lineno, offset: m.offset(:sym))
                               ## note - treat geo sep > (ascii) like comma for now!!!
              when '[' then
                    ##
                    ## todo/fix
                    ##   add virtual geo_end token!!!
                 ## get out-off geo mode and backtrack (w/ next)
                 _trace( "LEAVE GEO_RE MODE, BACK TO TOP_LEVEL/RE" )
                 @re = RE
                 pos = old_pos
                 next   ## backtrack (resume new loop step)
              else
                 Token.literal( m[:sym], lineno: lineno, offset: m.offset(:sym))
              end
           else
             ctx.warn_on_else( m, mode: 'GEO' )
             nil
           end
      elsif @re == PROP_CARDS_RE       then  _on_prop_cards( m, ctx: ctx )
      elsif @re == PROP_LINEUP_RE      then  _on_prop_lineup( m, ctx: ctx )
      elsif @re == PROP_ATTENDANCE_RE  then  _on_prop_attendance( m, ctx: ctx )
      elsif @re == PROP_REFEREE_RE     then  _on_prop_referee( m, ctx: ctx )
      elsif @re == PROP_PENALTIES_RE   then  _on_prop_penalties( m, ctx: ctx )
      elsif @re == GOAL_COMPAT_RE      then  _on_goal_compat( m, ctx: ctx )
      elsif @re == GOAL_ALT_RE         then  _on_goal_alt( m, ctx: ctx )
      elsif @re == GOAL_RE             then  _on_goal( m, ctx: ctx )
      ###################################################
      ## assume TOP_LEVEL (a.k.a. RE) machinery
      else
          _on_top( m, ctx: ctx )
      end


    tokens << t    if t

#    if debug?
#      print ">"
#      print "*" * pos
#      puts "#{line[pos..-1]}<"
#    end
  end

  ## check if no match in end of string
  if offset[1] != line.size
    msg =  "parse error (tokenize) - skipping >#{line[offset[1]..-1]}< in line #{lineno}@#{offset[1]},#{line.size} >#{line}<"
    errors << msg

    log( msg )
    puts "!! WARN - #{msg}"
  end


  # if @re == GOAL_RE   ### ALWAYS switch back to top level mode
  #   puts "  LEAVE GOAL_RE MODE, BACK TO TOP_LEVEL/RE"  if debug?
  #   @re = RE
  # end

   if @re == GEO_RE   ### ALWAYS switch back to top level mode
     _trace( "LEAVE GEO_RE MODE, BACK TO TOP_LEVEL/RE" )
     @re = RE
   end

   ### ALWAYS switch back to top level mode
   @re = RE  if @re == GROUP_DEF_RE ||
                @re == ROUND_DEF_RE

   ##
   ## if in prop mode continue if   last token is [,-]
   ##        otherwise change back to "standard" mode
   if @re == PROP_LINEUP_RE     ||
      @re == PROP_CARDS_RE      ||
      @re == PROP_PENALTIES_RE  ||
      @re == PROP_ATTENDANCE_RE ||
      @re == PROP_REFEREE_RE
     ## note - use safe navigation (&.); a prop continuation line
     ##          may yield no tokens at all (tokens[-1] is nil)
     ##          - treat like "no trailing separator" and end prop mode
     if [',', '-', ';'].include?( tokens[-1]&.type )
        ## continue/stay in PROP_RE mode
        ##  todo/check - auto-add PROP_CONT token or such
        ##                to help parser with possible NEWLINE
        ##                  conflicts  - why? why not?
     else
        ## switch back to top-level mode!!
        _trace( "LEAVE PROP_RE MODE, BACK TO TOP_LEVEL/RE" )
        @re = RE
        ## note - auto-add PROP_END (<PROP_END>)
        tokens << Token.virtual(:PROP_END, lineno: lineno)
     end
   end


  [tokens,errors]
end

#_trace(*args) ⇒ Object



20
21
22
23
24
25
# File 'lib/sportdb/parser/lexer.rb', line 20

# Prints debug trace output to stdout - only if debug? is on.
#
# Output is prefixed with "[DEBUG] Lexer -- "; every argument
# is printed (via puts) exactly once.
#
# @param args [Array<Object>] message(s) to print
def _trace( *args )
  if debug?
    print "[DEBUG] Lexer -- "
    ## note - puts the single arg (NOT the args array), otherwise
    ##          all args get printed once per arg (that is, n x n times)
    args.each { |arg| puts arg }
  end
end

#_warn(*args) ⇒ Object



27
28
29
30
# File 'lib/sportdb/parser/lexer.rb', line 27

# Prints warning output to stdout (always, independent of debug mode).
#
# Output is prefixed with "!! [WARN] Lexer -- "; every argument
# is printed (via puts) exactly once.
#
# @param args [Array<Object>] message(s) to print
def _warn( *args )
  print "!! [WARN] Lexer -- "
  ## note - puts the single arg (NOT the args array), otherwise
  ##          all args get printed once per arg (that is, n x n times)
  args.each { |arg| puts arg }
end

#debug? ⇒ Boolean

Returns:

  • (Boolean)


38
# File 'lib/sportdb/parser/lexer.rb', line 38

# Tells whether debug output is switched on.
#
# @return [Boolean] true only if @debug equals true;
#   false for anything else (incl. nil or other truthy values)
def debug?
  @debug == true
end

#log(msg) ⇒ Object



7
8
9
10
11
12
13
14
15
16
17
# File 'lib/sportdb/parser/lexer.rb', line 7

# Appends a message (plus a trailing newline) to the
# log file ./logs.txt (opened in append mode, utf-8 encoded).
#
# @param msg [String] the message to write
def log( msg )
   ## open by hand and make sure the file gets closed again
   ##   (even if a write fails)
   logfile = File.open( './logs.txt', 'a:utf-8' )
   begin
     logfile.write( msg )
     logfile.write( "\n" )
   ensure
     logfile.close
   end
end

#tokenize_with_errors ⇒ Object



54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
# File 'lib/sportdb/parser/lexer.rb', line 54

# Tokenizes the complete text (@txt) line-by-line.
#
# Steps:
#   1) prep the text (via _prep_doc) and walk it line-by-line;
#      full-line comments are skipped, end-of-line comments get stripped,
#      an __END__ marker line cuts-off the input;
#      blank lines, headings and nota bene lines get handled here
#      (and always reset the mode to top-level);
#      all other lines get passed on to _tokenize_line
#   2) transform the tokens of every line using simple patterns
#      (e.g. DATE + TIME => DATETIME) to help along the LA1 parser
#   3) flatten the tokens and auto-add a newline token after
#      every line (unless the line is a BLANK)
#
# @return [Array] two-element array [tokens, errors] - the flat list
#   of tokens and the parse error messages (strings) collected
def tokenize_with_errors

    tokens_by_line = []   ## note: add tokens line-by-line (flatten later)
    errors         = []   ## keep a list of errors - why? why not?


    txt = _prep_doc( @txt )



    ####
    ## quick hack - keep re state/mode between tokenize calls!!!
    @re  ||= RE     ## note - switch between RE & INSIDE_RE

    lineno = 0
    txt.each_line do |line|
        lineno += 1

        ## todo - "inlined virtual/collapsed/folded newlines"
        ##   check for "↵" !!!
        ##   and add to lineno


        ## note - KEEP leading spaces for indent
        ##         use rstrip (NOT left/leading & right/trailing strip) only!!
        ## note -   remove/strip trailing newline (and optional spaces)!!!
        ##          trailing whitespace may incl. \n or \r\n!!!
        line = line.rstrip


        ###  skip comments
        ##      todo/check - change to blank line
        ##                     to keep lineno (closer to original) - why? why not?
        next  if line.match?(/\A  [ ]* ## optional leading space(s)
                                   \#
                                    /x )

        ##  strip (inline) end-of-line comments (from line)
        ##    check/discuss: make - inline comment require trailing space
        ##                      e.g.   #1 vs # 1   - why? why not?
        line = line.sub( /   [ ]*      ## (eat-up) optional leading space(s)
                              \#{1,}.*?
                             \z
                            /x, '' )


        ####
        #  support __END__ marker to cut-off input
        break if line.match?( /\A [ ]*   ## optional leading space(s)
                                   __END__
                                 \z
                               /x )



        ## auto-fixes line-by-line (e.g. check for tabs, smart quotes, etc.)
        line = _prep_line( line )


        _trace( "line #{lineno}: >#{line}<" )


        ######
        ### special case for empty line (aka BLANK)
        if line.empty?
           ## note - blank always resets parser mode to std/top-level!!!
           @re = RE
           tokens_by_line << [Token.virtual(:BLANK, lineno: lineno)]
        elsif (m = HEADING_RE.match(line))
           ## note - heading always resets parser mode to std/top-level!!!
           @re = RE
           _trace( 'HEADING' )
           ## note - derive heading level from no of (leading) markers
           ##             e.g. = is 1, == is 2, === is 3, etc.
           heading_level = m[:heading_marker].size
           tokens_by_line << [Token.new(:"H#{heading_level}", m[:heading], lineno: lineno)]
        elsif (m = NOTA_BENE_RE.match(line))
           ## note - nota bene always resets parser mode to std/top-level!!!
           @re = RE
           tokens_by_line << [Token.new(:NOTA_BENE, m[:nota_bene], lineno: lineno)]
        else

          more_tokens, more_errors = _tokenize_line( line, lineno )

          tokens_by_line << more_tokens
          errors.concat( more_errors )
        end
    end # each line




    tokens_by_line = tokens_by_line.map do |tokens|

        #################
        ##    transform tokens (using simple patterns)
        ##      to help along the (racc look ahead 1 - LA1) parser
        nodes = []

        buf = Tokens.new( tokens )
        ## pp buf


        loop do
          break if buf.eos?

          if buf.match?( :DATE, :TIME )   ## merge DATE TIME into DATETIME
               date = buf.next
               time = buf.next
               ## puts "DATETIME:"
               ## pp date, time

               ##  note:  time value is { time: {} } or
               ##                       { time: {}, time_local {} }
               ##  note - NO trailing comma after the text expression;
               ##           a trailing comma continues the statement and
               ##           turns text into an array [text, value]!!!
               text  = date.text + ' ' + time.text   ## concat string of two tokens
               value = { date: date.value }.merge( time.value )

               nodes << Token.new(:DATETIME, text,
                                      lineno: date.lineno,
                                      offset: [date.offset[0],
                                               time.offset[1]],
                                      value: value )
          ### support  date time with comma too - why? why not?
          elsif buf.match?( :DATE, ',', :TIME )
               date = buf.next
               _    = buf.next  ## ignore comma
               time = buf.next
               ## puts "DATETIME:"
               ## pp date, time
               text  = date.text + ', ' + time.text  ## concat string of two tokens
               value =  { date: date.value }.merge( time.value )

               nodes << Token.new(:DATETIME, text,
                                      lineno: date.lineno,
                                      offset: [date.offset[0],
                                               time.offset[1]],
                                     value: value )
          elsif buf.match?( :GOAL_MINUTE, ',', :GOAL_MINUTE )
             ## note - only advance by two tokens!
             ##     allows more :GOAL_MINUTE sequences!! e.g. 12,13,14 etc!!!
             ##
             ## help parser with comma shift/reduce conflict
             ##   change ',' to GOAL_MINUTE_SEP !!!
             nodes << buf.next   ## pass through goal_minute
             comma = buf.next  ## eat-up goal_minute_sep a.k.a. comma (,)
                           ##   and replace with dedicated sep(arator)
             nodes << Token.new( :GOAL_MINUTE_SEP,
                                      comma.text,
                                      lineno: comma.lineno,
                                      offset: comma.offset,
                                      value:  comma.value)
          elsif buf.match?( ',', :INLINE_ATTENDANCE )
             ## note  - allow optional comma before inline attendance
             ## help parser with comma shift/reduce conflict
             ##   change ',' to INLINE_ATTENDANCE_SEP !!!
             comma = buf.next  ## eat-up inline_attendance_sep a.k.a. comma (,)
                           ##   and replace with dedicated sep(arator)
             nodes << Token.new(:INLINE_ATTENDANCE_SEP,
                                    comma.text,
                                    lineno: comma.lineno,
                                    offset: comma.offset,
                                    value:  comma.value)
             nodes << buf.next   ## pass through inline_attendance
          else
             ## pass through
             nodes << buf.next
          end
        end  # loop
        nodes
    end  # map tokens_by_line


    ## puts "tokens_by_line:"
    ## pp tokens_by_line


    ## flatten tokens
    tokens = []
    tokens_by_line.each do |tok_line|

        ## if debug?
        ##   pp tok_line
        ## end

         ## note - skip lines without any tokens
         ##          (nothing to add and no first token
         ##           to get the lineno for the newline from)
         next  if tok_line.empty?

         tokens.concat( tok_line )

         ## auto-add newlines  (unless BLANK!!)
         unless tok_line[0].type == :BLANK
            ## note - reuse lineno from first token in line
            ##                  use last - why? why not?
            tokens  << Token.newline( lineno: tok_line[0].lineno )
         end
    end

    [tokens,errors]

end