Class: SportDb::Lexer

Inherits:
Object
  • Object
show all
Defined in:
lib/sportdb/parser/lexer.rb,
lib/sportdb/parser/token.rb,
lib/sportdb/parser/lexer_tty.rb,
lib/sportdb/parser/token-geo.rb,
lib/sportdb/parser/token-date.rb,
lib/sportdb/parser/token-note.rb,
lib/sportdb/parser/token-prop.rb,
lib/sportdb/parser/token-text.rb,
lib/sportdb/parser/token-time.rb,
lib/sportdb/parser/token-goals.rb,
lib/sportdb/parser/token-group.rb,
lib/sportdb/parser/token-round.rb,
lib/sportdb/parser/token-score.rb,
lib/sportdb/parser/token-table.rb,
lib/sportdb/parser/token-status.rb,
lib/sportdb/parser/token_helpers.rb,
lib/sportdb/parser/token-prop_name.rb,
lib/sportdb/parser/token-score_legs.rb,
lib/sportdb/parser/token-score_fuller.rb,
lib/sportdb/parser/token-date_duration.rb

Constant Summary collapse

# Matches one HTML-style comment: the opening <!-- up to the first following -->.
#
# The m flag lets . match newlines too, so a comment may span several lines;
# the lazy quantifier stops at the first closing marker.
HTML_COMMENT_RE =
%r{  <!--
     .*?   ## note - use non-greedy/lazy *? match
  --> 
}xm
PREPROC_BLOCK_RE =

note - [] block may NOT incl. square brackets

   what about comments (e.g. #)?                       
todo/check - rename to NOTE_BLOCK or TEXT_BLOCK or ???
%r{  \[
                      [^\[\]\#]*?  ## note - use non-greedy/lazy *? match
                  \]
}xm
PREPROC_NOTA_BENE_RE =

check for “literal” (multi-line) note blocks

e.g.  nb:  or note:
space required after the colon - why? why not?
%r{
         ^  
    [ ]* (?: nb | note) [ ]* : [ ]+
       .+?  ## non-greedy 
   
    ## positive lookahead
    ##    note - must end with blank line or end-of-file/document 
    ##   note - do NOT eat-up trailing hrule (---)  
      (?=      (?: \n [ ]* -{3,} [ ]*)? 
                   \n[ ]*\n
               | \z 
        )   
}xim
LINE_CONTINUATION_RE =

replace “escaped” newline with non-newline char e.g. ‘↵’

%r{
   \\[ ]* \n
}x
MAGIC_COMMENT_RE =

check for magic comments

e.g  # teletype: true    or TELETYPE: TRUE 
        tty/teletype
%r{  \A
   [ ]*    ## optional leading spaces
  \#+      ##  note - allow ##,###, etc. too 
   [ ]*    ## optional spaces
     (?<magic_comment_key> tty | teletype )
   [ ]*    ## optional spaces
      :       
   [ ]*    ## optional spaces
      (?<magic_comment_value> true | false )
   [ ]*    ## optional trailing spaces
  \z
}ix
# Basic tokens; one alternative per named capture:
#
#   vs     - versus marker: vs, vs. , v or VS  (matched case-sensitively, even
#            though the pattern as a whole is case-insensitive);
#            must have a space directly before and directly after
#   spaces - a run of two or more spaces
#   space  - a single space
#   sym    - one of the symbol characters  , ; / @ | ( ) [ ] -
BASICS_RE =
%r{
    (?<vs>
       (?<=[ ])	# positive lookbehind for space
       (?-i: 
           vs\.?|v|VS 
       )        # note - only match case sensitive (downcased letters)!!!
                # note -  bigger match first e.g. vs than v etc.
       (?=[ ])   # positive lookahead for space
    )
       |
    (?<spaces> [ ]{2,}) |
    (?<space>  [ ])
        |
    (?<sym> [,;/@|()\[\]-] )   ### note: add parantheses too e.g () - why? why not?
}ix
ATTENDANCE_RE =

add att(endance) e.g. att: 18000

A v B 2-1  att: 18000
%r{
    (?<attendance>
     \b
        att: [ ]*
         (?<value>
              [1-9]
              (?: _? \d+ )*
         )
     \b
)}ix
INLINE_WO_RE =

add support for WO or W-0 too - why? why not?

%r{
    (?<inline_wo>
        \b (?: w/o | W/O ) \b
)}x
INLINE_BYE_RE =

note - NOT case insensitive

%r{ 
   (?<inline_bye>
       \b (?: bye | BYE ) \b
)}x
INLINE_NP_RE =

A n/p B (note - basically an inline short form of A v B [cancelled] )

N/P
%r{
    (?<inline_np>
        \b (?: n/p | N/P ) \b
)}x
INLINE_ABD_RE =

abd/abd. or aban/aban. [abandoned]

ABD/ABAN
%r{
    (?<inline_abd>
        \b (?: abd\.? |
               aban\.? |
               ABD | ABAN
           ) 
   ## POSITIVE lookahead - requires space
          (?= [ ])
)}x
INLINE_SUSP_RE =

susp/susp. [suspended]

SUSP
%r{
    (?<inline_susp>
        \b (?: susp\.? |
                SUSP ) 
   ## POSITIVE lookahead - requires space
          (?= [ ])
)}x
INLINE_PPD_RE =

ppd/ppd. or pst/pst. or pstp/pstp. or postp/postp. [postponed]

PPD/PSTP/POSTP/P-P              
 todo/check - add/allow p-p too - why? why not?
%r{
    (?<inline_ppd>
        \b (?: ppd\.? |
               pst\.? |
               po?stp\.? |
               PPD | PST | PO?STP | P-P
            ) 
   ## POSITIVE lookahead - requires space
          (?= [ ])
)}x
INLINE_VOID_RE =

void via x-x X-X

todo/check - only allow X-X - why? why not?
%r{
      (?<inline_void>
          \b (?: x-x |
                 X-X 
             )
        ## POSITIVE lookahead - requires space
           (?= [ ])
)}x
INLINE_AWD_RE =

awd/awd. [awarded]

AWD
note - recommendation is to always include the score
         thus, use/prefer SCORE_AWD e.g. 0-3 awd
%r{
    (?<inline_awd>
        \b (?: awd\.? | AWD ) 
   ## POSITIVE lookahead - requires space
          (?= [ ])
)}x
INLINE_CANC_RE =

canc/canc. [cancelled]

CANC
%r{
    (?<inline_canc>
        \b (?: canc\.?  | CANC ) 
   ## POSITIVE lookahead - requires space
          (?= [ ])
)}x
TEAM_HOME_RE =

home/away/neutral - (h), (a), (n)

add support for h/a/n
   with (?-i: \b [han] \b) lower-case and \b boundary - why? why not?
%r{  (?<team_home> \(h\) )}xi
# Away team marker: the literal (a) - matched case-insensitively, so (A) too.
# The complete marker incl. parentheses is captured as team_away.
TEAM_AWAY_RE =
%r{  (?<team_away> \(a\)  )}xi
# Neutral ground marker: the literal (n) - matched case-insensitively, so (N) too.
# The complete marker incl. parentheses is captured as team_neutral.
TEAM_NEUTRAL_RE =
%r{  (?<team_neutral> \(n\) )}xi
# Master token pattern: a single alternation built from the patterns below.
#
# note - order matters (earlier entries are tried first), e.g.
#          - DATE_LEGS_RE must go before DATE_RE
#          - DATE_RE must go before TIME_RE (e.g. 12.12. vs 12.12)
#          - SCORE_RE (basic score e.g. 1-1) must go after SCORE_FULL_RE
#          - SCORE_TEAM_NUM_RE must go after TEXT_RE
#          - ANY_RE goes last
RE =
Regexp.union([
  ## match status e.g. [cancelled], etc.
  STATUS_RE,

  ## (inline) match status
  INLINE_WO_RE,     # w/o
  INLINE_NP_RE,     # n/p (not played)
  INLINE_BYE_RE,    # bye (advance to next round)
  INLINE_ABD_RE,    # abd/abd. (abandoned)
  INLINE_SUSP_RE,   # susp/susp. (suspended)
  INLINE_PPD_RE,    # ppd/ppd. or pstp/pstp. or postp/postp. or p-p (postponed)
  INLINE_VOID_RE,   # x-x (voided)
  INLINE_AWD_RE,    # awd/awd. (awarded)
  INLINE_CANC_RE,   # canc/canc. (cancelled/canceled)

  ## home/away/neutral markers
  TEAM_HOME_RE,     # (H)
  TEAM_AWAY_RE,     # (A)
  TEAM_NEUTRAL_RE,  # (N)

  NOTE_RE,          # fix - change to INLINE_NOTE !!!

  ## dates & times
  DATE_LEGS_RE,     # note - must go before date!!!
  DATE_RE,          # note - date must go before time (e.g. 12.12. vs 12.12)
  TIME_RE,

  ## note - allow att: for now inline in matches too - why? why not?
  ATTENDANCE_RE,

  ## scores
  SCORE_LEGS_RE,
  SCORE_FULL_RE,
  SCORE_FULLER_RE,
  SCORE_FULLER_MORE_RE,
  SCORE_AWD_RE,     # (inline) score awarded e.g. 3-0 awd or 0-1 awd. etc.
  SCORE_ABD_RE,     # (inline) score abandoned e.g. 2-1 abd.
  SCORE_RE,         # note - basic score e.g. 1-1 must go after SCORE_FULL_RE!!!

  ## note - "experimental" "split" scores for now
  SCORE_TEAM_RE,      # e.g. (2) 1  for "split" scores
  SCORE_TEAM_PEN_RE,  # e.g. 1 (2)

  BASICS_RE,
  TEXT_RE,

  ## note - score_team_num (e.g. 0 or 10 etc.) MUST BE after TEXT;
  ##          only match if nothing else matches (except ANY)
  SCORE_TEAM_NUM_RE,  # e.g. 0 or 1 or 9 or 11 etc. (<100)

  ANY_RE,
])
START_WITH_ORD =

ord (for ordinal number)

e.g. (51) or (1) etc.  - limit digits of number - why? why not???
%r{
   \A  
    [ ]*    ## ignore leading spaces (if any)
(?<ord>
  \(  
   (?<value>\d+) 
  \)
)}ix
START_WITH_YEAR =

e.g. 1930, 1986, 2002, 2010, 2022, 2026

     note - only YYYY
note - look out for clubs like  1860 München (de) !!!
                                1899 Hoffenheim (de)
                                1896 Löwenherz  (ch - a.k.a. FC Winterthur ??)
                any others starting with YYYY ?!
note - YEAR requires TWO (trailing) spaces !!!!! e.g. 
   1930    Uruguay             4-2 Argentina
   1934    Italy               2-1 Czechoslovakia   (AET)
   2022    Argentina           3-3 France           (AET, 4-2 pen)

 do NOT match (iso date!!) -  2020-11-12
                              2020/11/12
                              2020.11.12 etc.
%r{
   \A
       [ ]*    ## ignore leading spaces (if any)
     (?<year>
        \d{4}
     )
     ## positive lookahead 
       (?= [ ]{2} |   ## min. TWO spaces or 
           [ ]@ |   ##   space with geo marker or
           [ ]* \z  ##    year (date) header (end-of-line/string)
        )   
}x
# Matches a complete heading line (anchored with \A and \z):
#
#   - optional leading spaces
#   - one to six = markers, captured as heading_marker
#   - the heading text, captured as heading; may contain any character except =
#     (lazy match, so trailing spaces are not part of the capture)
#   - optional trailing = markers and spaces
HEADING_RE =
%r{   \A
    [ ]*  ## ignore leading spaces (if any)
  (?<heading_marker> ={1,6} ) 
    [ ]*
     (?<heading>
        ## must start with letter - why? why not?
        ###   1st round
        ##  allow numbers e.g. Group A - 1 
        [^=]+?   ## use non-greedy 
     )
    [ ]*  ## ignore trailing spaces (if any)
     (?: =* )  ## allow any trailing heading markers
    [ ]*  ## ignore trailing spaces (if any)
  \z
}ix
# Matches a complete horizontal rule line (anchored with \A and \z):
# three or more dashes, with optional spaces before and after.
HRULER_RE =
%r{
                 \A
                           [ ]*  ## ignore leading spaces (if any)
                    -{3,}  ## must be at least three dashes!!!
                           [ ]*  ## ignore trailing spaces (if any)                   
                 \z
}ix
IS_TTY_LINE_RE =

experimental teletype mode

only space, A-Z and 0-9 allowed
%r{  \A  
     ## note - use NEGATIVE lookahead to exclude blank lines
       (?! [ ]*\z)

        [A-Z0-9 ]+
    \z
}x
# Whitespace tokens for teletype mode:
#   spaces - a run of two or more spaces
#   space  - a single space
TTY_SPACES_RE =
%r{ (?<spaces> [ ]{2,}) |
  (?<space>  [ ])
}x
# Number token for teletype mode: a run of one or more digits delimited by
# word boundaries on both sides; the digits are captured as num.
TTY_NUM_RE =
%r{   \b  (?<num> \d+ ) \b 
}x
TTY_TEXT_RE =

note - TEXT for now allows A, 1A, A1, A1A, A1 B1 C1,

                            A1AA1 2B22 3C33
- single space only for concat
   text segments MUST NOT be all numbers e.g. 1, 11, etc.
%r{   \b (?<text>                         
         (?:
            [A-Z]  ## MUST start with letter  
              |
             [0-9]+[A-Z]   ## or numbers followed by letter 
           )
           [0-9A-Z]*
           (?:
               ### allow move segements separated
               ##     by single space
                [ ]
               (?: 
                   [A-Z]  ## MUST start with letter  
                    |
                   [0-9]+[A-Z]   ## or numbers followed by letter 
                )
               [0-9A-Z]*
           )*
        )
        \b   
}x
# Combined teletype-mode token pattern; the alternatives are tried in the
# order listed.
#
#  fix - add ANY_RE (as last entry)
TTY_RE =
Regexp.union([
  TTY_SPACES_RE,
  TTY_TEXT_RE,
  TTY_NUM_RE,
])
GEO_TEXT_RE =
%r{
    ## must start with alpha (allow unicode letters!!)
    (?<text>
          ## positive lookbehind -  for now space (or beginning of line - for testing) only
           ##  (MUST be fixed number of chars - no quantifier e.g. +? etc.)
            (?<= [ ,›>\[\]]|^)
            (?:
                # opt 1 - start with alpha
                 \p{L}+    ## all unicode letters (e.g. [a-z])
                   |
                # opt 2 - start with num!! - 
                     \d+  # check for num lookahead (MUST be space or dot)
                      ## MAY be followed by (optional space) !
                      ## MUST be follow by a to z!!!!
                      [ ]?   ## make space optional too  - why? why not?
                             ##  yes - eg. 1st, 2nd, 5th etc.
                       \p{L}+
                  |
                ## opt 3 - add another weirdo case
                ##   e.g.   's Gravenwezel-Schilde
                ##   add more letters (or sequences here - why? why not?)
                    '\p{L}+
               )

               ##
               ## todo/check - find a different "more intuitive" regex/rule if possible?
               ##    for single spaces only (and _/ MUST not be surround by spaces) 

              (?: 
                  (?:
                    [ ]?   # only single (inline) space allowed - double spaces are breaks!!!  
                    (?:
                       \p{L} | \d  | [.&'°]
                        |
                       (?: (?<! [ ])  ## no space allowed before (but possible after)
                            [-]
                       )
                         |
                       (?: (?<! [ ])  ## no spaces allowed around these characters
                           [_/]
                          (?! [ ])
                       )
                    )+
                  )
                  |
              ## for now allow auto-add optional
              ##   parenthesis enclosed closed text
              ##   e.g. Dublin (Dalymount Park)
              ##        Bucuresti (23 August)
              ##        Paris (Parc des Princes)
              ##        Ost-Berlin (Walter-Ulbricht)
              ##        Athinai (OAKA - Maroussi)
              ##
              ##   or   Valencia (Spain) or Solna   
              (?:
                    [ ]
                    \(
                        [^()\[\],;:›<>]+    ## todo - add more special chars
                                            ##   maybe list only allowed ones??
                                            ##   make pattern more strict - why? why not?
                    \)
              )
          )*


              ## must NOT end with space or dash(-)
              ##  todo/fix - possible in regex here
              ##     only end in alphanum a-z0-9 (not dot or & ???)

            ## add lookahead/lookbehind
           ##    must be space!!!
           ##   (or comma or  start/end of string)
           ##   kind of \b !!!
            ## POSITIVE lookahead
            (?=[ ,›>\[\]]|$)

   )
}ix
# Basic tokens inside a geo section:
#   spaces - a run of two or more spaces
#   space  - a single space
#   sym    - one of the symbol characters  , › > [
GEO_BASICS_RE =
%r{
    (?<spaces> [ ]{2,}) |
    (?<space>  [ ])
        |
    (?<sym> [,›>\[] )
}ix
# Matches the comma that closes a geo section, captured as geo_end.
#
# The comma only qualifies if it is followed - after optional spaces - by
# att: or ref: (case-insensitive); the lookahead does not consume the prop key.
GEO_END_RE =
%r{
   (?<geo_end>
        ,
    )
    ## POSITIVE lookahead for props
    (?=    
        [ ]*  ## optional spaces
         (?: att|ref)    ## todo/fix - use generic [a-z]+ - why? why not?
         :
    )
}ix
# Combined token pattern for geo sections; the alternatives are tried in the
# order listed, with the catch-all ANY_RE last.
GEO_RE =
Regexp.union([
  GEO_END_RE,
  GEO_BASICS_RE,
  GEO_TEXT_RE,
  ANY_RE,
])
# Month name table - one month per line: the full name first, followed by the
# accepted short forms (May has none).
#
# The heredoc text is handed to parse_names (defined elsewhere);
# presumably that splits every line into its names - TODO confirm.
MONTH_LINES =
parse_names( <<TXT )
January    Jan
February   Feb
March      Mar
April      Apr
May
June       Jun
July       Jul
August     Aug
September  Sept  Sep
October    Oct
November   Nov
December   Dec
TXT
# Built from MONTH_LINES via build_names (defined elsewhere).
# The result gets interpolated into the date patterns as the body of the
# month_name captures.
MONTH_NAMES =
build_names( MONTH_LINES )
MONTH_MAP =

pp MONTH_NAMES

build_map( MONTH_LINES, downcase: true )
# Day (of week) name table - one day per line: the full name first, followed
# by the accepted short forms.
#
# The heredoc text is handed to parse_names (defined elsewhere);
# presumably that splits every line into its names - TODO confirm.
DAY_LINES =
parse_names( <<TXT )
Monday                   Mon  Mo
Tuesday            Tues  Tue  Tu
Wednesday                Wed  We
Thursday    Thurs  Thur  Thu  Th
Friday                   Fri  Fr
Saturday                 Sat  Sa
Sunday                   Sun  Su
TXT
# Built from DAY_LINES via build_names (defined elsewhere).
# The result gets interpolated into the date patterns as the body of the
# day_name captures.
DAY_NAMES =
build_names( DAY_LINES )
DAY_MAP =

pp DAY_NAMES

build_map( DAY_LINES, downcase: true )
DATE_I_RE =

e.g. Fri Aug 9

    Fri  Aug 9
   Fri, Aug 9
   Fri, Aug 9 2024
   Fri, Aug 9, 2024
        Aug 9, 2024
        Aug 9, 2024
note - eat-up optional comma after DAY_NAMES!!

   note - Fri Aug/9  no longer supported!!!
%r{
(?<date>
  \b
     ## optional day name
     ((?<day_name>#{DAY_NAMES})
           (?: ,?[ ]+)
     )?
     (?<month_name>#{MONTH_NAMES})
          [ ] 
     (?<day>\d{1,2})
          \b
     ## optional year
     (      ,? [ ]       ## note - comma optinal with single space required for now
            (?<year>\d{4})        ## optional year 2025 (yyyy)      
              \b
     )?
)}ix
DATE_LEGS_I_RE =

todo/fix - add (opt) day_name later

add (opt) year later

e.g. Aug 9 & Aug 10
note - allow shortcut e.g. Aug 9 & 10

%r{
(?<date_legs>
 \b
     (?<month_name1>#{MONTH_NAMES})
          [ ] 
     (?<day1>\d{1,2})
    [ ] & [ ]
     (?:
        (?<month_name2>#{MONTH_NAMES})
          [ ] 
      )?  ## note - make 2nd month_name optional 
     (?<day2>\d{1,2})
  \b
)}ix
DATE_II_RE =

e.g. 3 June or 10 June

 note - allow more spaces between  DAY_NAMES and DAY e.g.
  Sun  1 Mar        
  Wed  4 Mar        
  Sat 14 Mar   
  Sat 11 Apr 
  Sat 11 Apr 2021
  Sat 11 Apr 21

  Sat, 11 Apr
 note - eat-up optional comma after DAY_NAMES!!

note - Sat 14 Mar 17:30
        check two-digit year (with NEGATIVE lookahead for time!!!)
%r{
(?<date>
  \b
     ## optional day name
     ((?<day_name>#{DAY_NAMES})
           (?: ,?[ ]+)
     )?
     (?<day>\d{1,2})
         [ ]
     (?<month_name>#{MONTH_NAMES})
          \b
     ## optional year
     (  [ ]
        (?: 
           (?<year>\d{4})        ## optional year 2025 (yyyy)
               |
            (?:
               (?<yy>\d{2})           ## optional year 25 (yy)
                ## check NEGATIVE lookahead
               (?! :|[:h]\d{2})
            )
        )
        \b   
     )?
)}ix
DATE_III_A_RE =

e.g. iso-date - 2011-08-25

note - allow/support ("shortcuts") e.g 2011-8-25  or 2011-8-3 / 2011-08-03 etc.
%r{
(?<date>
  \b
   (?<year>\d{4})
       -
   (?<month>\d{1,2})
       -
   (?<day>\d{1,2})
  \b
)}ix
DATE_III_B_RE =

starting w/ day/month/year e.g. 25-08-2011

%r{
(?<date>
  \b
     ## optional day name
     ((?<day_name>#{DAY_NAMES})
          (?: ,?[ ]+)
     )?
   (?<day>\d{1,2})
       -
   (?<month>\d{1,2})
       -
   (?<year>\d{4})
  \b
)}ix
DATE_IIII_RE =

allow (short)“european” style 8.8.

note - assume day/month!!!
%r{
(?<date>
  \b
     ## optional day name
     ((?<day_name>#{DAY_NAMES})
           (?: ,?[ ]+) 
     )?
   (?<day>\d{1,2})
       \.
   (?<month>\d{1,2})
       \.
   (?: (?: 
          (?<year>\d{4})        ## optional year 2025 (yyyy)
              |
          (?<yy>\d{2})           ## optional year 25 (yy)
       )
        \b
   )?
)
}ix
DATE_IIIII_RE =

04/03/2026 or 4/3/2026

04/03/26   or 4/3/26
04/03      or 4/3
%r{
(?<date>
  \b
     ## optional day name
     ((?<day_name>#{DAY_NAMES})
          (?: ,?[ ]+)
     )?
   (?<day>\d{1,2})
       /
   (?<month>\d{1,2})
    \b
   (?:  
        /
       (?: 
          (?<year>\d{4})         ## optional year 2025 (yyyy)
              |
          (?<yy>\d{2})           ## optional year 25 (yy)
       )
      \b
   )?
)
}ix
DATE_RE =

map tables

note: order matters; first come-first matched/served
Regexp.union(
   DATE_I_RE,
   DATE_II_RE,
   DATE_III_A_RE,    ## e.g. 1973-08-14
   DATE_III_B_RE,
   DATE_IIII_RE,    ## e.g. 8.8. or 8.13.79 or 08.14.1973 
   DATE_IIIII_RE,   ## e.g.  08/14/1973
)
DATE_LEGS_RE =

todo - add more format style here; change to Regexp.union later!!!

DATE_LEGS_I_RE
NOTE_RE =

fix - use (?<text>) - text capture for inner text!!

use (?<note> for complete match as a convention!! )
%r{
\[ 
  (?<note>
     [^\[\]\#]*?    ## note - non-greedy/lazy operator
                    ##    exclude comments inside note block - why? why not?
  )
\]
}xi
NOTA_BENE_RE =

check for “literal” (multi-line) note blocks

 e.g.  nb:  or note:
 space required after the colon - why? why not?

note - use \A (instead of ^) - \A strictly matches the start of the string.
%r{   \A
    [ ]*  ## ignore leading spaces (if any)
 (?: nb | note) [ ]* : [ ]+   
  (?<nota_bene>
       .+?  ## use non-greedy 
   )
    [ ]*  ## ignore trailing spaces (if any) 
   \z
}xi
PROP_KEY_RE =

todo/fix/fix

change ^ to \A
  change name to START_WITH_PROP_KEY_RE !!!
%r{ 
   ^     # note - MUST start line; leading spaces optional (eat-up)
   [ ]*  
(?<prop_key>
  (?<key>
      (?:\p{L}+
          |
          \d+  # check for num lookahead (MUST be space or dot)
       ## MUST be followed by (optional dot) and
       ##                      required space !!!
       ## MUST be follow by a to z!!!!
        \.?     ## optional dot
        [ ]?   ## make space optional too  - why? why not?
            ##  yes - eg. 1st, 2nd, 5th etc.
        \p{L}+
       )
       [\d\p{L}'/° -]*?   ## allow almost anyting 
                         ## fix - add negative lookahead 
                         ##         no space and dash etc.
                         ##    only allowed "inline" not at the end
                         ## must end with latter or digit!
  )
   [ ]*?     # slurp trailing spaces
    :
   (?=[ ]+)  ## possitive lookahead (must be followed by space!!)
  )
}ix
INLINE_CAPTAIN =
[c]

or [C] for marking player as captain

support [y ] too - or require Y - why? why not?
%r{ (?<inline_captain>
    \[ [cC] \]
)}x
# Inline yellow card marker in square brackets, captured as inline_yellow,
#   e.g. [y] or [Y] or [Y 44] or [Y 44'] or [Y 45+1']
#
# Optional, after one or more spaces:
#   minute - one to three digits, followed by an optional ' marker
#   offset - one or two digits after a + (e.g. stoppage time), followed by
#            an optional ' marker
#
# note - only y and Y are accepted (pattern is NOT case-insensitive otherwise)
INLINE_YELLOW =
%r{ (?<inline_yellow>
     \[ [yY]
         ## optional minute
         (?: [ ]+
           (?<minute> \d{1,3})
              '?
           (?:
              \+
              (?<offset>\d{1,2})
               '?
           )? 
         )? 
     \]
)}x
# Inline red card marker in square brackets, captured as inline_red,
#   e.g. [r] or [R] or [R 42] or [R 42']
#
# Optional, after one or more spaces:
#   minute - one to three digits, followed by an optional ' marker
#   offset - one or two digits after a + , followed by an optional ' marker
#
# note - only r and R are accepted (pattern is NOT case-insensitive otherwise)
INLINE_RED =
%r{ (?<inline_red>
     \[ [rR] 
         ## optional minute
         (?: [ ]+
           (?<minute> \d{1,3})
              '?
           (?:
              \+
              (?<offset>\d{1,2})
               '?
           )? 
         )? 
     \]
)}x
# Inline yellow/red card marker in square brackets, captured as
# inline_yellow_red,
#   e.g. [y/r] or [Y/R] or [Y/R 78]
#
# Optional, after one or more spaces:
#   minute - one to three digits, followed by an optional ' marker
#   offset - one or two digits after a + , followed by an optional ' marker
#
# note - only the spellings y/r and Y/R are accepted (no mixed case)
INLINE_YELLOW_RED =
%r{ (?<inline_yellow_red>
     \[ (?:y/r |
           Y/R  ) 
         ## optional minute
         (?: [ ]+
           (?<minute> \d{1,3})
              '?
           (?:
              \+
              (?<offset>\d{1,2})
               '?
           )? 
         )? 
     \]
)}x
PROP_KEY_INLINE_RE =

simple prop key for inline use e.g.

Coach:  or Trainer:  or ...  add more here later
%r{ 
   \b  
(?<prop_key>    ## note: use prop_key (NOT prop_key_inline or such)
  (?<key>
      \p{L}+
  )
   ## note - NO spaces allowed for key for now!!! 
    :
   (?=[ ]+)  ## possitive lookahead (must be followed by space!!)
  )
}ix
# Matches a positive integer with optional single underscores as digit
# group separators, e.g. 28000 or 28_000.
#
# Captures:
#   num   - the complete match
#   value - the digits incl. underscores (same span as num)
#
# Rules:
#   - must start with 1-9 (no leading zero)
#   - an underscore must be followed by at least one digit
#     (no doubled or trailing underscores)
#   - must be delimited by word boundaries on both sides
#
# note - the digits are matched as  [1-9][0-9]* (?: _[0-9]+ )*  and NOT as
#        [1-9] (?: _?[0-9]+ )*  - both accept exactly the same strings, but the
#        latter can split a run of digits in exponentially many ways, so a
#        failing match (e.g. a long run of digits followed by a letter) may
#        backtrack catastrophically on rubies without regexp match memoization.
PROP_NUM_RE =
%r{
 \b
  (?<num>
        ## note - allow underscore inline e.g.
        ##  5_000
        ##  allow space inline (e.g. 5 000) - why? why not?
      (?<value> [1-9] [0-9]*
                (?: _
                    [0-9]+
                 )*
      )
  )
 \b
}ix
ENCLOSED_NAME_RE =

todo/fix - allow more chars in enclosed name - why? why not?

                   e.g.  (') - Côte d'Ivoire etc.
change to PAREN_NAME or PARENTHESIS or such - why? why not?
%r{ 
        (?<enclosed_name>  
           \( 
          (?<name>   
              \p{L}+
              (?:
                 [ ] 
                   \p{L}+ 
              )*
          )
            \)
        )
}ix
# Basic tokens inside prop lines:
#   spaces - a run of two or more spaces
#   space  - a single space
#   sym    - one of the symbol characters  ; , ( ) [ ] -
PROP_BASICS_RE =
%r{
    (?<spaces> [ ]{2,}) |
    (?<space>  [ ])
        |
    (?<sym>  
        [;,\(\)\[\]-] 
    )   
}ix
# Combined token pattern for prop lines; the alternatives are tried in the
# order listed.
#
#  todo/fix - add ANY_RE here too!!!
PROP_RE =
Regexp.union([
  ## e.g.  44 or 44' or 45+1 or 45+1' etc.
  MINUTE_RE,

  ## inline markers
  INLINE_CAPTAIN,      # e.g. [c]
  INLINE_YELLOW,       # e.g. [Y] or [Y 44] or [Y 44'] or [Y 45+1']
  INLINE_YELLOW_RED,   # e.g. [Y/R] or [Y/R 78]
  INLINE_RED,          # e.g. [R] or [R 42] or [R 42']

  PROP_KEY_INLINE_RE,
  PROP_NAME_RE,
  PROP_BASICS_RE,
])
PROP_CARDS_RE =

note - no inline keys possible

todo/fix - use custom (limited) prop basics too
Regexp.union(
   MINUTE_RE,
   PROP_NAME_RE,
   PROP_BASICS_RE, 
   ## todo/fix - add ANY_RE here too!!!
)
# Combined token pattern for penalties props; the alternatives are tried in
# the order listed.
#
#  todo/fix - add ANY_RE here too!!!
PROP_PENALTIES_RE =
Regexp.union([
  SCORE_RE,           # e.g. 1-1 etc.
  ENCLOSED_NAME_RE,   # e.g. (save), (post), etc.
  PROP_NAME_RE,
  PROP_BASICS_RE,
])
# Combined token pattern for referee props; the alternatives are tried in the
# order listed.
#
#  todo/fix - add ANY_RE here too!!!
PROP_REFEREE_RE =
Regexp.union([
  ENCLOSED_NAME_RE,     # e.g. (sold out) etc.  why? why not?
  PROP_NUM_RE,          # e.g. 28000 or 28_000  (note - 28,000 is NOT valid!!!)
  PROP_KEY_INLINE_RE,
  PROP_NAME_RE,
  PROP_BASICS_RE,
])
# Combined token pattern for attendance props; the alternatives are tried in
# the order listed.
#
#  todo/fix - add ANY_RE here too!!!
PROP_ATTENDANCE_RE =
Regexp.union([
  ENCLOSED_NAME_RE,   # e.g. (sold out) etc.  why? why not?
  PROP_NUM_RE,        # e.g. 28000 or 28_000  (note - 28,000 is NOT valid!!!)
  PROP_BASICS_RE,
])
ANY_RE =

general catch-all (RECOMMENDED (ALWAYS) use as last entry in union)

to avoid advance of pos match!!!
%r{
     (?<any> .)
}ix
TEXT_RE =
%r{
    ## must start with alpha (allow unicode letters!!)
    (?<text>
           ## positive lookbehind
           ##  (MUST be fixed number of chars - no quantifier e.g. +? etc.)
            (?<=[ ,;@|\[\]]
                 |^
            )
            (?:
                # opt 1 - start with alpha
                 \p{L}+    ## all unicode letters (e.g. [a-z])
                   |

                # opt 2 - start with num!! - allow special case (e.g. 1. FC)
                     \d+  # check for num lookahead (MUST be space or dot)
                      ## MUST be followed by (optional dot) and
                      ##                      required space !!!
                      ## MUST be follow by a to z!!!!
                      [.°]?     ## optional dot (.) or degree(°) - todo - add number sign too!! 
                      [ ]?   ## make space optional too  - why? why not?
                             ##  yes - eg. 1st, 2nd, 5th etc.
                       \p{L}+
                  |
                ## opt 3 - add another weirdo case
                ##   e.g.   's Gravenwezel-Schilde
                    '[s] [ ] \p{L}+
               )


              (?:(?:  (?:[ ]   # only single spaces allowed inline!!!
                          ## note - exclude (v[ ]/vs[ ]/vs.[ ])
                          ##    AND switch to case-sensitive (via -i!!!)
                        (?! (?-i: (?:  ## note - (big) V not matching for versus!!!
                                      vs\.?|v|VS|   
                                         
                                      n/p|N/P|  
                                      w/o|W/O| 
                                      abd\.?|ABD|
                                      aban\.?|ABAN|
                                      susp\.?|SUSP|
                                      ppd\.?|PPD|
                                      pst\.?|PST|
                                      po?stp\.?|PO?STP|P-P|
                                      x-x|X-X|
                                      awd\.?|AWD|
                                      canc\.?|CANC ) [ ] 
                                        |
                                  (?: bye|BYE ) (?:[ ]|$))
                          )    
                      )
                      |     
                     [/-]   ## must NOT be surrounded by spaces 
                  )?
                (?:
                  \p{L} 
                     |
                  (?:   ## note - restrict [.&'] to single char usage (no doubled e.g. && etc.)
                    \. (?! \.)  ## allow single points only (now two or more etc.)
                     | 
                    & (?! &)
                     |
                    ' (?! ')
                   )
                     |
                 (?:
                   \d+
                   (?!
                     [0-9h'+] |    ## protected break on 12h / 12' / 1-1
                                    ##  check usege for 3+4 - possible? where ? why?     
                     (?:[.:-]\d)     ## protected/exclude/break on 12.03 / 12:03 / 12-12
                                      ##  BUT allow Park21-Arena for example e.g. 21-A :-)
                    )
                    [°]?  ## followed by optional ord                 
                   ## negative lookahead for numbers
                   ##   note - include digits itself!!!
                   ##   note - remove / (slash) e.g. allows UDI'19/Beter Bed
                 )
               )
              )*  ## must NOT end with space or dash(-)
              ##  todo/fix - possible in regex here
              ##     only end in alphanum a-z0-9 (not dot or & ???)


            ## allow optional at the end
            ##  tag or year
            ##   make it and in the future - why? why not?
            ##
            ## change - fix
            ##   do NOT use (A) for amateur
            ##   use A or A. with NO ()!!!
            ## (A) -    allow with predined  alpha only for now
            ##          e.g. (A) - amateur a team or b?
            ###  same for U21 or U9 etc
            ##        use with NO ()!!! - why? why not?
            ##      or U21 U9 etc.   - why? why not?
            ##       or etc.
            ## (1879-1893) or allow years e.g. (1879-1893)
            ###
            ##    add allow country code three to five letters for now
            ##       change to generic 1 to 5 - why? why not?
            ##     e.g. (A), (I),
            ##          (AUT)
            ##          (TRNC)   five? for UEFA code for northern cyprus
            ##     change to 1 to 4 - why? why not?
            ##   check - fix possible for upper case only here
            ##                     inline for this group only?
            (?:
               [ ]
               \(
                  \d{4}-\d{4}
               \)
            )?
             (?:
                ######
                # check for country code (cc)
                #       e.g. (AUT) or ,AUT or AUT
                (?:
               [ ]   ## note - do NOT allow more than one space!!! - why? why not?
                   \( 
                       ## note - auto-exclude reserved (aet)  from SCORE_FULLER_MORE!!!
                       ##     plus golden goal (gg)/sudden death (sd), silver goal (sg)
                       ##    (ht), (ft)  
                       (?! (?: aet | agget | asdet | asget | ht | ft )
                             \)
                       )    
                     (?:
                       [A-Z]{1,5}   
                     )
                  \)
                )
                  |
                (?:
                    [ ]*[,›>][ ]*
                        [A-Z]{1,5}
                     \b
                )
             )?
            ## add lookahead/lookbehind
           ##    must be space!!!
           ##   (or comma or  start/end of string)
           ##   kind of \b !!!
            ## positive lookahead
            (?=[ ,;@|\[\]]
                 |$
            )
   )
}ix
TIME_RE =
%r{
        \b
    (?<time>  
             (?<hour>\d{1,2})
                   [:h] 
              (?<minute>\d{2})
                 
                 #### optional (inline) timezone
                 ##    note - non-utc timezone MUST be hard-coded (added) here!!!
                 ##     avoids eating-up team names (separated by one space)
                 ##            e.g.  18:30 MEX v MEX 
                 (?:
                    [ ]  ## require space - why? why not
                     (?<timezone>
                        (?: 
                          ## GMT   - Greenwich Mean Time
                          ## BST   - British Summer Time
                          ## CES?T - Central European (Summer) Time
                          ## EES?T - Eastern European (Summer) Time
                          ##
                          (?: GMT|BST|CES?T|EES?T) 
                               (?: /
                                   UTC  (?: [+-]\d{1,4} | ±0)
                               )?
                          )
                          |
                          (?:
                             UTC  (?: [+-]\d{1,4} | ±0)
                          )
                     )
                 )?
        )          
      \b  

####
###  note - local time is now INLINE and MUST follow time
       (?:     
           [ ]+   ## todo/check - make space optional - why? why not?
           \(
        (?<time_local>   
                (?<local_hour>\d{1,2})
                   [:h]    ### todo/fix - MUST match style in time above!!!
                (?<local_minute>\d{2})
                
                ####
                ## optional "local" timezone name eg. BRT or CEST etc.
                (?:
                    [ ] ## require space - why? why not
                   (?<local_timezone>
                      (?:  [A-Z]{3,4}
                           (?: /
                                   UTC (?: [+-]\d{1,4} | ±0)
                           )? 
                      )
                      |    
                      (?:     ## e.g. 0 or 00 or 0000
                          UTC   (?: [+-]\d{1,4} | ±0)
                      )   
                  )
               )?  # note - make timezone  optional!!!
          )
      \)       
       )?
}ix
GOAL_BASICS_RE =
%r{
    ## basic (filler) tokens inside a goal line
    (?<spaces> [ ]{2,} )     ## run of two or more spaces
  |
    (?<space> [ ] )          ## exactly one space
  |
    (?<sym> [;,)] )          ## separator or closing symbol
                             ##   add (-) dash too - why? why not?
}ix
START_GOAL_LINE_RE =

note - assume lines starting with opening ( are goal lines!!!!

note - use \A (instead of ^) - \A strictly matches the start of the string.

 note -  the negative lookahead (below) excludes (a), (h), (n)
               - TEAM_AWAY, TEAM_HOME, TEAM_NEUTRAL tokens!!

todo/check -- ord (numbers) e.g. (1), (42), etc. are NOT excluded by this pattern (yet)!!!
%r{
                    \A                        
                       [ ]*    ## ignore leading spaces (if any) 
                      \(

                      # check NEGATIVE lookahead
                      (?! 
                            ##  exclude (a), (h), (n)
                            ##    TEAM_AWAY, TEAM_HOME, TEAM_NEUTRAL
                            (?: a|h|n )  
                            \)  
                       )

}xi
START_GOAL_LINE_COMPAT_RE =
%r{
  \A [ ]* \(     ## opening paren at start of string - leading spaces (if any) ignored

  ## (i) NEGATIVE lookahead - must NOT continue with a score e.g. 1-1 etc.
  (?! [ ]* \b \d-\d \b)

  ## (ii) POSITIVE lookahead - must continue with a minute
  ##        optional minute marker after minute and after stoppage
  (?= [ ]*
      \d{1,3} '?
      (?: \+ \d{1,2} '? )?
  )
}xi
START_GOAL_LINE_ALT_RE =

check for goal line (alternate syntax)

(1-0 Player, 1-1 Player, ...)       
must start off with OR (at least) include a score

note - allow "centered" style e.g. 
       (    Player 44' (p)  1-0  
                            1-1 Player 64'   )
%r{
    \A
       [ ]*    ## ignore leading spaces (if any) 
     \(  
     
     # check POSITIVE lookahead                                    
      (?=  .*?         ## note - non-greedy 
               \b \d-\d \b    ## score e.g. 0-1 
        )	 	
}xi
GOAL_NONE_RE =

e.g. (-; Metzger)

%r{ (?<goals_none>
       -[ ]*;
   )
}x
GOAL_SEP_ALT_RE =
%r{
  (?<goal_sep_alt>
      (?<=[ ])       ## positive lookbehind - space required before the dash
        -
      (?=[ ]|\z)     ## positive lookahead - space (or end-of-string) required after
  )
}x
GOAL_COUNT_RE =

e.g. (2)

 (2/p), (2/pen.), (3/2p), (3/ 2 pen.) 
-or-  (2,1pen), (3, 2 pens)

 (p), (pen.) (2 pen.), (2p)               
 (og), (o.g.), 
  (2og), (2 o.g.), (2ogs)
%r{
   (?<goal_count>
      \(
        (?:
          ## opt penalties
            (?<pen>
              (?:  (?<pen_value> \d{1,2}) [ ]? )?
                 (?:pens|pen\.?|p)
           )
            |
          ## opt own goals (og)
            (?<og>
             (?: (?<og_value> \d{1,2}) [ ]? )?
                (?:ogs?|o\.g\.|o) 
            )          
            |
          ## opt fallback - classic count/number
          (?:  (?<value> [1-9])
                ## check for option penalties
                (?<pen>
                     [,/] [ ]*
                     (?: (?<pen_value> \d{1,2}) [ ]? )?
                     (?:pens|pen\.?|p)
                )?
           )
         )  
      \)
)}ix
MINUTE_NA_RE =

minute variant for N/A not/available

  todo/check - find a better syntax - why? why not?

note  "??".to_i(10) returns 0 or
      "__".to_i(10) returns 0
quick hack - assume 0 for n/a for now
%r{
   (?<minute>
      (?<=[ (])	 # positive lookbehind for space or opening 
        (?<value> \?{2} | _{2} )
        '   ## must have minute marker!!!!
    )
}ix
MINUTE_RE =

note - inline \b check in MINUTE_RE excludes

    85pen  or 90+4pen or 38p  (possible and NOT excluded in GOAL_MINUTE_RE  !!!)

minute with optional stoppage
%r{
     (?<minute>
               \b
             (?<value>\d{1,3})      ## constrain numbers to 0 to 999!!!
                \b
                '?    ## optional minute marker
                
                (?: \+ (?<value2>\d{1,2}) 
                       \b   
                      '?    ## optional minute marker
                 )?
                      
      )
}ix
GOAL_MINUTE_RE =

goal types (pen.) or (pen) or (p.) or (p) (o.g.) or (og)

 todo/check - keep case-insensitive 
                 or allow OG or P or PEN or
                 only lower case - why? why not?

add (gg) for golden goal - why? why not?
add (sg) for silver goal - why? why not??
%r{
     (?<goal_minute>
               \b
             (?<value>\d{1,3})      ## constrain numbers to 0 to 999!!!
                '?    ## optional minute marker
                
                 (?: \+ (?<value2>\d{1,2})
                      '?    ## optional minute marker
                 )?          
                   
        ## note - add goal minute qualifiers here inline!!! 
        (?:
            (?: [ ]? (?<og>   (?: \((?:og|o\.g\.|o)\))   ## allow (og)
                                   |
                              (?: (?:og|o\.g\.|o))      ## allow plain og
                      )
            )
            |
            (?: [ ]? (?<pen>  (?: \((?:pen\.?|p)\))   ## allow ()
                                   |
                              (?: (?:pen\.?|p))
                      )    
            )
            |
            ## add experimental header qualifier
            (?: [ ]? (?<hdr> \( (?:hdr\.?|h ) \) | (?: hdr\.?|h ) ))
            |
            ## add experimental free kick qualifier
            (?: [ ]? (?<fk> \( (?:fk\.?|f ) \) | (?: fk\.?|f) ))
        )?

        ##  add experimental seconds
        ##    e.g. (95 secs) or (95sec) etc. 
        (?: [ ]*  \(
                      (?<secs>\d{1,3})
                         [ ]?secs?
                   \) 
        )?
     )

     ## note - check positive lookahead 
     (?=[ ,;)]|$)   
}ix
GOAL_RE =
Regexp.union(
    GOAL_BASICS_RE,
    GOAL_NONE_RE,
    GOAL_MINUTE_RE,
    GOAL_COUNT_RE,
   ## MINUTE_NA_RE,   ## note - add/allow not/available (n/a,na) minutes hack for now
   ## GOAL_OG_RE, GOAL_PEN_RE,
   ## SCORE_RE,  ## add back in v2 (level 3) or such!!
    PROP_NAME_RE,    ## note - (re)use prop name for now for (player) name
    GOAL_SEP_ALT_RE,
    ## todo/fix - add ANY_RE !!!!
)
GOAL_TYPE_RE =
%r{
  ## stand-alone goal type qualifier in parentheses
  (?<goal_type>
     \(
        (?:
            (?<og>  og|o\.g\.|o )      ## e.g. (og), (o.g.), (o)
          |
            (?<pen> pen\.?|p )         ## e.g. (pen), (pen.), (p)
          |
            (?<hdr>  hdr\.?|h )        ## add experimental header qualifier
          |
            (?<fk>  fk\.?|f )          ## add experimental free kick qualifier
        )
     \)
  )
}xi
GOAL_ALT_RE =
Regexp.union(
    GOAL_BASICS_RE,
    SCORE_RE,        ## e.g.  1-0, 0-1, etc.
    GOAL_MINUTE_RE,
    GOAL_TYPE_RE,
    PROP_NAME_RE,    ## note - (re)use prop name for now for (player) name
    ## todo/fix - add ANY_RE !!!!
)
GOAL_COMPAT_RE =
Regexp.union(
    GOAL_BASICS_RE,
    SCORE_RE,        ## e.g.  1-0, 0-1, etc.
    MINUTE_RE,          ## note - matches minute e.g.  92, 7, 7' 7+3, 46+, etc.
    GOAL_TYPE_RE,
    PROP_NAME_RE,    ## note - (re)use prop name for now for (player) name
    ## todo/fix - add ANY_RE !!!!
)
GROUP_DEF_LINE_RE =

check for start of group def line e.g.

     Group A  | ...
     Group 1  : ....
     Group A2 | ....
note - use \A (instead of ^) - \A strictly matches the start of the string.
%r{  \A
   [ ]*  ## ignore leading spaces (if any)
   (?<group_def>
       Group
        [ ]
        [a-z0-9]+   ## todo/check - allow dot (.) too e.g. 1.A etc.- why? why not?         
   )
   ###   possitive lookahead MUST be : OR | 
   (?= [ ]*
       [:|] 
       [ ])  ## note: requires space for now after [:|] - keep - why? why not?	
}ix
GROUP_DEF_BASICS_RE =
%r{
    ## basic (filler) tokens inside a group def(inition) line
    (?<spaces> [ ]{2,} )    ## run of two or more spaces
  |
    (?<space> [ ] )         ## exactly one space
  |
    (?<sym> [:|,] )         ## note - comma (,) is an optional separator too
}ix
GROUP_DEF_RE =
Regexp.union(  GROUP_DEF_BASICS_RE, 
   TEXT_RE,
   ANY_RE,
)
ROUND_OUTLINE_I_RE =

note - use \A (instead of ^) - \A strictly matches the start of the string.

todo - add support for trailing markers e.g.
  ▪ Round 1 ▪▪▪▪▪▪▪▪
  :: Round 1 ::::::::::::

check - allow without space (like in heading =Heading 1=) - why? why not?
  ▪Round 1▪▪▪▪▪▪▪▪
  ::Round 1::::::::::::
%r{   \A
    [ ]*  ## ignore leading spaces (if any)
  (?<round_marker>
        [▪]{1,3}     ## BLACK SMALL SQUARE e.g. ▪,▪▪,▪▪▪
   )     
    [ ]+
     (?<round_outline>
        ## must start with letter - why? why not?
        ###   1st round
        ##  allow numbers e.g. Group A - 1 
        ##   
        ##  note - CANNOT incl. :| !!!
        ##   used for markers for defs/definitions
        [^:|]+?   ## use non-greedy 
     )
     (?:
        [ ]+   
        [▪]+
     )?
     [ ]*  ## ignore trailing spaces (if any) 
   \z
}xi
ROUND_OUTLINE_II_RE =
%r{
  \A [ ]*                       ## leading spaces (if any) are ignored
  (?<round_marker> ::{1,3} )    ## two to four colons e.g. ::,:::,::::
  [ ]+
  (?<round_outline>
      ## must start with letter - why? why not?
      ##   e.g. 1st round  or  Group A - 1  (numbers allowed)
      ##
      ## note - CANNOT incl. colon or pipe !!!
      ##   used for markers for defs/definitions
      [^:|]+?                   ## non-greedy - keeps trailing spaces/markers out
  )
  (?: [ ]+ ::+ )?               ## optional trailing markers
  [ ]* \z                       ## trailing spaces (if any) are ignored
}xi
ROUND_OUTLINE_RE =
Regexp.union(  ROUND_OUTLINE_I_RE,
   ROUND_OUTLINE_II_RE,
)
ROUND_DEF_OUTLINE_RE =

note - for def(initions) only one level support

that is, no round outline additions possible (e.g ▪▪ 1st leg etc.)
%r{   \A
     [ ]*  ## ignore leading spaces (if any)
    (?: [▪]  ## BLACK SMALL SQUARE
         |
        :: )      
     [ ]+
      (?<round_outline>
         [^:|]+?   ## use non-greedy 
      )
     [ ]*  ## ignore trailing spaces (if any) 
    ###   possitive lookahead MUST be : OR | 
     (?= [:|] 
         [ ])  ## note: requires space for now after [:|] - keep - why? why not?	
}ix
ROUND_DEF_BASICS_RE =
%r{
    ## basic (filler) tokens inside a round def(inition) line
    (?<spaces> [ ]{2,} )    ## run of two or more spaces
  |
    (?<space> [ ] )         ## exactly one space
  |
    (?<sym> [:|,] )         ## note - comma (,) is an optional separator too
}ix
ROUND_DEF_RE =
Regexp.union(  ROUND_DEF_BASICS_RE, 
   DURATION_RE,  # note - duration MUST match before date
   DATE_RE,  ## note - date must go before time (e.g. 12.12. vs 12.12)
   ANY_RE,
)
P_EN =

english helpers (penalty, extra time, …)

 note - p must go last (shortest match)
   pso = penalty shootout
- note - remove PSO for now (may add later back) - why? why not? 

todo/fix/clean-up - keep it simple -  remove optional trailing dot (.)
                     from pen., p., agg. etc. - why? why not?
                      always use (simply) pen, p, agg 
                    (also) remove  a.e.t. / a.e.t option - why? why not?

UPDATE mar/2026:  added pens too - keep - why? why not?
                   (4-3 pens)
(4-3 Pens)  -- keep mixed Pens/Pen. too - why? why not?
(4-3 Pen.)
'(?-i: PEN | P |' +
'[Pp]ens | [Pp]en\.? | p\.? )'
ET_EN =

fix - change ET_EN to AET_EN!!! - why? why not?

check - allow Aet too - why? why not?
           or A.e.t ??
'(?-i: AET | ' +
'aet | a\.e\.t\.? )'
AETGG_EN =

after (golden goal/sudden death) extra time - add more options/styles - why? why not?

'(?-i: AET/GG | AGGET | ASDET | ' +
'aet/gg | a\.e\.t\.?/g\.g\.? | agget | asdet )'
AETSG_EN =

after (silver goal) extra time

'(?-i: AET/SG | ASGET | ' +
'aet/sg | a\.e\.t\.?/s\.g\.? | asget  )'
AGG_EN =

agg/agg. or AGG

'(?-i: AGG | agg\.? )'
SCORE_P =

fix - change SCORE_P to SCORE_FULL_P

            SCORE_ET to SCORE_FULL_ET

(re)use SCORE_P, SCORE_ET for score only part!!!
%Q<  (?<p1>\\d{1,2}) - (?<p2>\\d{1,2})
        [ ]? #{P_EN}
>
SCORE_ET =
%Q<  (?<et1>\\d{1,2}) - (?<et2>\\d{1,2})
        [ ]? #{ET_EN}
>
SCORE_LOOKAHEAD =
'(?= [ ,\]] | $)'
SCORE__ET_GG_SG__RE =

after extra-time with golden goal/sudden death & silver goal rule

      note - golden goal & silver goal EXCLUDE penalties!!!

4-3 a.e.t/g.g.
4-3 aet/gg
4-3agget   -or-   4-3 asdet
2-1 aet/sg
 -or-
 4-3 aet/gg (3-3, 2-1)
%r{
    (?<score_full>
       \b
       (?<et1>\d{1,2}) - (?<et2>\d{1,2})
                      [ ]? (?:
                               (?<aetgg> #{AETGG_EN})
                                  |
                               (?<aetsg> #{AETSG_EN})
                            )
       ### note:
       ## add optional full-time, half-time score
         (?:
             [ ]+
             \(
                [ ]*
               (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
                  [ ]*
                (?:
                   , [ ]*
                   (?: (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
                     [ ]*
                  )?
               )? # note: make half time (HT) score optional for now
             \)
         )?                     
        #{SCORE_LOOKAHEAD}
)}ix
SCORE__P_ET__RE =

note: allow SPECIAL cases WITHOUT full time scores (just a.e.t or pen. + a.e.t.)

3-4 pen. 2-2 a.e.t.
3-4 pen.   2-2 a.e.t.
         2-2 a.e.t.
%r{
(?<score_full>
   \b
    (?: #{SCORE_P} [ ]+ 
     )?             ## note: make penalty (P) score optional for now
    #{SCORE_ET}
    #{SCORE_LOOKAHEAD}
)}ix
SCORE__ET_P__RE =

note: allow SPECIAL cases WITHOUT full time scores

AND with pen in last position!
  2-2 a.e.t., 3-4 pen. 
  2-2 a.e.t.  3-4 pen.  ## or without comma separator - why? why not?
%r{
(?<score_full>
   \b
    #{SCORE_ET}  
       (?: [ ]*,[ ]* | [ ]+ )
    #{SCORE_P}  
    #{SCORE_LOOKAHEAD}
)}ix
SCORE__FT_P__RE =

special case (i) - full time with penalties

2-2, 3-4 pen.
%r{
(?<score_full>
   \b
    (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})  
        [ ]*,[ ]*    ## note - comma required!!! 
    #{SCORE_P}  
    #{SCORE_LOOKAHEAD}
)}ix
SCORE__FT_HT_P__RE =

special case (ii) - full time & half-time with penalties

2-2 (1-1), 3-4 pen.
%r{
(?<score_full>
   \b
    (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
        [ ]*
         \(
             (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
         \)
        [ ]*,[ ]*    ## note - comma required!!! 
    #{SCORE_P}  
    #{SCORE_LOOKAHEAD}
)}ix
SCORE__P__RE =

note: allow SPECIAL with penalty only

3-4 pen.  or 3-4p etc.
%r{
        (?<score_full>
  \b
    #{SCORE_P}  
    #{SCORE_LOOKAHEAD}
)}ix
SCORE__P_ET_FT_HT_V2__RE =

support short all-in-one e.g.

e.g.      3-4 pen. 2-2 a.e.t. ( 1-1, 1-1 ) becomes
 3-4 pen. (2-2, 1-1, 1-1)
%r{
          (?<score_full>
   \b
    #{SCORE_P} [ ]+       
       \(
       [ ]*
   (?<et1>\d{1,2}) - (?<et2>\d{1,2})
       [ ]*, [ ]*
   (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
       [ ]*, [ ]*
   (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
       [ ]*
    \)
   #{SCORE_LOOKAHEAD}
)}ix
SCORE__ET_FT_HT_P__RE =

e.g. 2-2 a.e.t. (1-1, 1-0), 5-1 pen.

%r{
          (?<score_full>
   \b
   #{SCORE_ET} [ ]+
       \(
       [ ]*
  (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
       [ ]*
    (?:
         , [ ]*
        (?: (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
            [ ]*
        )?
    )?              # note: make half time (HT) score optional for now
  \)
   (?: [ ]*,[ ]* | [ ]+)
   #{SCORE_P}
   #{SCORE_LOOKAHEAD}
)}ix
SCORE__P_ET_FT_HT__RE =

e.g. 3-4 pen. 2-2 a.e.t. (1-1, 1-1) or

3-4p 2-2aet (1-1, )     or
3-4 pen.  2-2 a.e.t. (1-1)       or
         2-2 a.e.t. (1-1, 1-1)  or
         2-2 a.e.t. (1-1, )     or
         2-2 a.e.t. (1-1)
%r{
          (?<score_full>
   \b
   (?:
      #{SCORE_P} [ ]+
    )?            ## note - make penalty (P) score optional for now
   #{SCORE_ET} [ ]+
       \(
       [ ]*
  (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
       [ ]*
    (?:
         , [ ]*
        (?: (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
            [ ]*
        )?
    )?              # note: make half time (HT) score optional for now
  \)
 #{SCORE_LOOKAHEAD}
)}ix
SCORE__P_FT_HT__RE =

special case for case WITHOUT extra time!!

same as above (but WITHOUT extra time and pen required)
%r{
         (?<score_full>
            \b
           #{SCORE_P} [ ]+
    \(
    [ ]*
  (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
    [ ]*
 (?:
      , [ ]*
     (?: (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
         [ ]*
     )?
 )?              # note: make half time (HT) score optional for now
   \)
#{SCORE_LOOKAHEAD}
)}ix
SCORE__FT_HT__RE =

e.g. 2-1 (1-1)

%r{
            (?<score_full>
 \b
 (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
      [ ]+ \( [ ]*
   (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
      [ ]* \)
#{SCORE_LOOKAHEAD}
)}ix
SCORE_FULL_RE =

map tables

note: order matters; first come-first matched/served
Regexp.union(
  SCORE__ET_GG_SG__RE,       # e.g. 3-1 aet/gg  
  SCORE__P_ET_FT_HT_V2__RE,  # e.g. 5-1 pen. (2-2, 1-1, 1-0)  
  SCORE__ET_FT_HT_P__RE,    # e.g. 2-2 a.e.t. (1-1, 1-0), 5-1 pen. 
  SCORE__P_ET_FT_HT__RE,    # e.g. 5-1 pen. 2-2 a.e.t. (1-1, 1-0)
  SCORE__P_FT_HT__RE,     # e.g. 5-1 pen. (1-1)
  SCORE__ET_P__RE,        # e.g. 2-2 a.e.t., 5-1 pen.
  SCORE__FT_P__RE,        # e.g. 2-2, 5-1 pen.
  SCORE__FT_HT_P__RE,     # e.g. 2-2 (1-1), 5-1 pen.
  SCORE__P_ET__RE,        # e.g.  5-1 pen. 2-2 a.e.t.  or  2-2 a.e.t. (w/o pen)
  SCORE__P__RE,           # e.g. 5-1 pen.
  SCORE__FT_HT__RE,        # e.g. 1-1 (1-0)
  ##  note - keep basic score as its own token!!!!
  ##   that is, SCORE & SCORE_MORE
  ### SCORE__FT__RE,           # e.g. 1-1  -- note - must go last!!!
)
SCORE_AWD_RE =

note - keep AWD w/o dot - why? why not?

%r{
            (?<score_awd>
 \b
  (?<score1>\d{1,2}) - (?<score2>\d{1,2})
    [ ]?
      (?-i: awd\.? | AWD )
  ## POSITIVE lookahead - requires space
  (?= [ ])
)}ix
SCORE_ABD_RE =

add support for score abandoned (inline style)

2-1 abd.   or 2-1 ABD
%r{
            (?<score_abd>
 \b
  (?<score1>\d{1,2}) - (?<score2>\d{1,2})
    [ ]?
     (?-i: abd\.? | ABD )
  ## POSITIVE lookahead - requires space
  (?= [ ])
)}ix
SCORE_RE =

2-1

note - was SCORE__FT__RE
         changed to "generic" SCORE_RE
              and
           (?<ft1>\d{1,2}) - (?<ft2>\d{1,2}) 
    changed
           (?<score1>\d{1,2}) - (?<score2>\d{1,2}) 
              to 
           pattern match not necessarily the full-time (ft) scoreline!!!
  - pattern also used for goal seq(uence) e.g. 1-0 Kane, 1-1 Johnson
%r{
            (?<score>
 \b
  (?<score1>\d{1,2}) - (?<score2>\d{1,2})
 \b
)}ix
SCORE_TEAM_RE =

add support for “split” score

note - for now (2) 1  is REQUIRED
%r{
    (?<score_team>
         \(
            (?<score_i> \d{1,2}) 
         \)
         [ ]*   ## note - space optional- why? why not?
            (?<score_ii> \d{1,2})
        \b 
    )
}ix
SCORE_TEAM_PEN_RE =

“penalty”-style (4) is assumed penalty score

note - for now 1 (4) is REQUIRED
%r{
    (?<score_team_pen>
         \b
            (?<score_i> \d{1,2})
         \b
           [ ]*  ## note - space optional- why? why not?  
         \(
            (?<score_pen> \d{1,2}) 
         \)
    )
}ix
SCORE_TEAM_NUM_RE =

note - score_team_num (<100) e.g. 0, 1, .., 10, 11, .. 99

   use a different name - why? why not? 
note - must be surrounded by space
%r{
    ## positive lookbehind
     (?<= [ ])

      (?<score_team_num> \d{1,2} )

     ## positive lookahead
     (?= [ ]|\z)
}x
TABLE_HEADING_I_RE =
%r{
  \A [ ]*              ## leading spaces (if any) are ignored
  (?<table_heading>
     \b
       P(?:ld)? [ ]+   ## P or Pld
       W        [ ]+
       D        [ ]+
       L        [ ]+
       Gls      [ ]+
       Pts
     \b
  )
  [ ]* \z              ## trailing spaces (if any) are ignored
}xi
TABLE_DIVIDER_RE =
%r{
  \A [ ]*     ## leading spaces (if any) are ignored
  (?<table_divider>
       ## solid style - three dashes minimum (---)
       --- [-]*
     |
       ## spaced style - three dashes minimum (- - -)
       ##   todo/check - restrict spaces to 2 or 3 or such - why? why not?
       - [ ]+ - [ ]+ -
       (?: [ ]+ -)*
  )
  [ ]* \z     ## trailing spaces (if any) are ignored
}xi
TABLE_NOTE_RE =
%r{
  \A [ ]*                  ## leading spaces (if any) are ignored
     -\. [ ]*              ## note marker - dash followed by dot
  (?<table_note> .+? )     ## note text - non-greedy (keeps trailing spaces out)
  [ ]* \z                  ## trailing spaces (if any) are ignored
}xi
TABLE_I_RE =
%r{
  ## table row (numbers part) in column order  Pld W D L GF-GA Pts
  (?<table>
     \b
       \d{1,2} [ ]+                     ## Pld
       \d{1,2} [ ]+                     ## W
       \d{1,2} [ ]+                     ## D
       \d{1,2} [ ]+                     ## L
       \d{1,3} - [ ]* \d{1,3} [ ]+      ## GF-GA - spaces allowed after dash
       \d{1,3}                          ## Pts
     \b
  )
}xi
TABLE_II_RE =

Pld Pts W D L GF-GA | d d d d d d-d

  1. ARG^ 3 6 3 0 0 10-4

  2. CHI 3 4 2 0 1 5-3

  3. FRA 3 2 1 0 2 4-3

  4. MEX 3 0 0 0 3 4-13

%r{
(?<table>\b 
     \d{1,2} [ ]+                        # Pld
     \d{1,3} [ ]+                        # Pts   
     \d{1,2} [ ]+                        # W
     \d{1,2} [ ]+                        # D
     \d{1,2} [ ]+                        # L
     (?: \d{1,3} - [ ]* \d{1,3})   # GF-GA
      \b 
)}xi
TABLE_RE =

possible start lines for a table

excludes NOTE
and RULER (e.g. --- or) or such in the future
Regexp.union(
    TABLE_HEADING_I_RE,
    TABLE_I_RE,
    TABLE_II_RE,
)
TABLE_MORE_RE =

all possible continuation for a table

excludes HEADING
Regexp.union(
    TABLE_NOTE_RE,
    TABLE_DIVIDER_RE,
    TABLE_I_RE,
    TABLE_II_RE,
)
POSTPONED =
%Q{ (?<postponed> postponed  | pst\\.? | po?stp\\.?  | ppd\\.? ) }
CANCELED =

add can/can. - why? why not?

%Q{ (?<canceled>  cancell?ed | canc\\.? ) }
WALKOVER =

add o/w too - why? why not?

%Q{ (?<walkover>  walkover   | w/o  | wo ) }
AWARDED =
%Q{ (?<awarded>   awarded    | awd\\.? ) }
SUSPENDED =
%Q{ (?<suspended> suspended  | susp\\.? ) }
ABANDONED =
%Q{ (?<abandoned> abandoned  | aban\\.?  | abd\\.? ) }
ANNULLED =
%Q{ (?<annulled>  annulled ) }
VOIDED =

note - alternative (name) to annulled

%Q{ (?<voided>    voided     | void ) }
REPLAY =
%Q{ (?<replay>    replay     | repl\\.? ) }
STATUS_RE =

note - status_note incl. complete text incl. <status> (not normalized)

<status> gets normalized e.g. ppd => postponed etc.
%r{
            \[
      (?:    
#############################################  
### opt 1 - allow long forms with note/comment for some stati
##                    e.g. [postponed due to tropical storm "Hanna"]
##                         [suspended at 84' by storm; result stood]
#########################
           (?: (?<status_note>
                  (?<status>
               ####################
               ## pre-match (not played)
                    #{POSTPONED}
                           |
                    #{CANCELED}       
                           |
                    #{WALKOVER}        
                           |
               ######################   
               ## pre/post match
                     #{AWARDED}
                            |
               ########################
               ## post match - (partially) played
                    #{SUSPENDED} 
                            |   
                    #{ABANDONED}
                            |
                    #{ANNULLED}
                            |
                    #{VOIDED} ### note - alternative to annulled
              )     ## end-of-<status>
                  [ :;,-]+     ## leading spaces (or separators) 
                  [^\]]+?      ## note - add non-greedy match 
              ) ## end-of-<status-note>   
              [ ]*  ## eat-up optional trailing spaces
            )
            |       
########################################
## opt 2 - short form only (no note/comments) e.g. [postponed], [Canceled], etc.
####################################     
            (?<status>
         ####################
         ## pre-match (not played)
               #{POSTPONED}
                 |
               #{CANCELED}
                 |
               #{WALKOVER}         
                 |
         ######################   
         ## pre/post match
               #{AWARDED}
                 |
         ########################
         ## post match - (partially) played
               #{SUSPENDED}                                        
                 |
               #{ABANDONED}
                 |
               #{ANNULLED}
                 |
               #{VOIDED}   ### note - alternative to annulled
                 |
               #{REPLAY}       ### todo/fix - keep replay - why? why not?
                                  ###   prefer replay in round e.g. 
                                  ##       ▪ Round 17, Replay
                                  ##       ▪ Semi-finals, Replays
            )
      )
    \]
}ix
PROP_NAME_RE =

name different from text (does NOT allow number in name/text)

%r{
                 (?<prop_name> 
                      \b
                   (?<name>
                      \p{L}+       
                        \.?    ## optional dot
                          (?:
                             ## rule for space; only one single space allowed inline!!!
                              (?:
                                (?<![ ])  ## use negative lookbehind                             
                                  [ ] 
                                (?=\p{L}|['"])      ## use lookahead        
                              )
                              ## support (inline) quoted name e.g. "Rodri" or such
                                  |
                                  (?:
                                     (?<=[ ])  ## use positive lookbehind                             
                                     " \p{L}+ " 
                                      ## require space here too - why? why not?
                                   )                      
                                  |   
                             (?:
                                (?<=    ## \p{L}\. | \p{L}
                                        [\p{L}.] 
                                     )  ## use  POSITIVE lookbehind
                                 [-]   ## must be surrounded by letters
                                       ## note - allow leading dot (.) e.g. K.-H.Förster 
                                       ##                short for          Karl-Heinz Förster
                                       ##
                                       ## e.g. One-Two NOT
                                       ##      One- Two or One - Two or One -Two etc.
                                (?=\p{L})      ## use lookahead        
                              )
                                 |   
                              (?:  ## flex rule for quote - allow any
                                    ##  only check for double quotes e.g. cannot follow other ' for now - why? why not?
                                    ##        allows  rodrigez 'rodri' for example
                                (?<!')  ## use negative lookbehind                             
                                   '         
                              )      
                                 |   ## standard case with letter(s) and optinal dot
                              (?: \p{L}+
                                    \.?  ## optional dot
                              )
                          )*
                    )
               ## add lookahead - must be non-alphanum 
                  (?=[ ,;\]\)]|$)
                  )
}ix
SCORE_LEGS_RE =

win on away goals

aet
%r{
(?<score_legs>
   \b   
    (?<leg1_ft1>\d{1,2}) - (?<leg1_ft2>\d{1,2})
       (?: [ ]+ |  [ ]*,[ ]*)   # separate by spaces OR comma
    (?:
        ## opt 1 - after extra-time (et) score
            (?<leg2_et1>\d{1,2}) - (?<leg2_et2>\d{1,2})
               [ ]? #{ET_EN}   ## a.e.t./aet
                ### note - might end in dot (.) not alpha
                ###  thus, wordboundary NOT working
               #{SCORE_LOOKAHEAD}   
          |
        ## opt 2 - full-time (ft)  
        (?<leg2_ft1>\d{1,2}) - (?<leg2_ft2>\d{1,2})
            \b 
    )                
    (?:   ## check optional aggregate e.g. (agg 4-4)
        [ ]+
         \(
             agg [ ]
              (?<agg1>\d{1,2}) - (?<agg2>\d{1,2}) 
              
             ### add win options 
             (?:
                 ## opt 1 - on away goals
                (?<away> [ ]*,[ ]*
                         (?:win [ ])? on [ ] away [ ] goals?
                 )
                   |
                 ## opt 2 - on penalties  
                (?:
                   [ ]*,[ ]*
                   (?:win [ ])?
                    (?<leg2_p1>\d{1,2}) - (?<leg2_p2>\d{1,2})
                    [ ] on [ ] pens
                )
             )?
         \)
    )?
)}ix
SCORE_FULLER_AGG =
_mk_score_fuller_agg( win: false )
SCORE_FULLER_AGG_WIN =
_mk_score_fuller_agg( win: true )
SCORE_FULLER_P =
_mk_score_fuller_p( win: false )
SCORE_FULLER_P_WIN =
_mk_score_fuller_p( win: true )
SCORE_FULLER_AWAY_WIN =
%Q<
     (?:
      (?<away>
        ############
        ## opt 1)  with win
        (?:
            (?: win [ ] )?
            (?: (?<away1>\\d{1,2}) - (?<away2>\\d{1,2}) [ ] )?
             on [ ] away [ ] goals?     # goal or goals
        )
        |        
        #####
        ## opt 2)  "classic" (post)
        (?:
           (?: (?<away1>\\d{1,2}) - (?<away2>\\d{1,2}) [ ] )?
              [ ]* away  
        )
        |
        #####
        ## opt 3) up-front (pre)
        (?:
              away 
           (?:  [ ]
                (?<away1>\\d{1,2}) - (?<away2>\\d{1,2})
           )?   
        )
     ))                   
>
SCORE_FULLER_HT_OPT =
%Q<
  (?:   HT [ ]
      (?: (?<ht1>\\d{1,2}) - (?<ht2>\\d{1,2})) 
      [ ]*,[ ]*
  )?  ## note - make optional
>
SCORE_FULLER_FT_OPT =
%Q<
  (?:   FT [ ]
      (?: (?<ft1>\\d{1,2}) - (?<ft2>\\d{1,2})) 
      [ ]*,[ ]*
  )?  ## note - make optional
>
SCORE_FULLER__HT =

4-4 (HT 2-1)

       or
Team A  4-1  Team B  (HT 2-1)
%Q<
             \\(  HT [ ]
                  (?<ht1>\\d{1,2}) - (?<ht2>\\d{1,2}) 
             \\)
>
SCORE_FULLER__HT_FT__RE =
%r{
(?<score_fuller>
   \b   
    (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
    [ ]+
     #{SCORE_FULLER__HT}
)}ix
SCORE_FULLER_MORE__HT_FT__RE =
%r{
(?<score_fuller_more>
     #{SCORE_FULLER__HT}
)}ix
SCORE_FULLER__ET =
%Q<
             \\(
                #{SCORE_FULLER_HT_OPT} 
                #{SCORE_FULLER_FT_OPT} 
                (?:
                  (?<aetgg> #{AETGG_EN})
                   |
                  (?<aetsg> #{AETSG_EN}) 
                   |
                  (?<aet> #{ET_EN})
                 )
             \\)
>
SCORE_FULLER__ET__RE =
%r{
(?<score_fuller>
   \b   
    (?<et1>\d{1,2}) - (?<et2>\d{1,2})
    [ ]+
     #{SCORE_FULLER__ET}
)}ix
SCORE_FULLER_MORE__ET__RE =
%r{
(?<score_fuller_more>
     #{SCORE_FULLER__ET}
)}ix
SCORE_FULLER__ET_P =

4-4 (aet, win 3-5 on pens)

4-4 (aet, 3-5 on pens)
4-4 (aet, 3-5 pen)
4-4 (a.e.t., 3-5 pen.)
   or
Team A  4-4  Team B  (aet, win 3-5 on pens) 
Team A  4-4  Team B  (aet, 3-5 on pens)
Team A  4-4  Team B  (aet, 3-5 pen)
Team A  4-4  Team B  (a.e.t., 3-5 pen.)
%Q<
             \\(
                #{SCORE_FULLER_HT_OPT} 
                #{SCORE_FULLER_FT_OPT} 
                (?<aet> #{ET_EN})
                 [ ]*,[ ]*
                 #{SCORE_FULLER_P_WIN}
             \\)
>
SCORE_FULLER__ET_P__RE =
%r{
(?<score_fuller>
   \b   
    (?<et1>\d{1,2}) - (?<et2>\d{1,2})
    [ ]+
     #{SCORE_FULLER__ET_P}
)}ix
SCORE_FULLER_MORE__ET_P__RE =
%r{
(?<score_fuller_more>
     #{SCORE_FULLER__ET_P}
)}ix
## Regex source fragment for "penalties after full time" in parentheses:
##   the HT_OPT fragment followed by the penalty shoot-out fragment
##   SCORE_FULLER_P_WIN.
SCORE_FULLER__FT_P =

4-4 (win 3-5 on pens)

  4-4 (3-5 pen)
  4-4 (3-5p)
    or
Team A  4-4  Team B (win 3-5 on pens)
Team A  4-4  Team B (3-5 pen)
Team A  4-4  Team B (3-5p)
%Q<
             \\(
                  #{SCORE_FULLER_HT_OPT} 
                  #{SCORE_FULLER_P_WIN}
             \\)
>
## Full-time score (ft1/ft2) followed by the parenthesised penalty
##   shoot-out score, e.g. 2-2 (win 5-3 on pens); captured as score_fuller.
##
## NOTE(review): unlike the sibling *__RE constants this one spells out
##   \( SCORE_FULLER_P_WIN \) inline instead of interpolating
##   SCORE_FULLER__FT_P - thus, the optional HT prefix of that fragment is
##   NOT accepted here. Confirm that this is intended.
SCORE_FULLER__FT_P__RE =
%r{
(?<score_fuller>
   \b   
    (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
    [ ]+
     \(
         #{SCORE_FULLER_P_WIN}
     \)
)}ix
## "More" variant - only the parenthesised penalties part (built from
##   SCORE_FULLER__FT_P), e.g. (win 5-3 on pens); captured as
##   score_fuller_more.
SCORE_FULLER_MORE__FT_P__RE =
%r{
(?<score_fuller_more>
     #{SCORE_FULLER__FT_P}
)}ix
## Regex source fragment for an aggregate score in parentheses:
##   the HT_OPT fragment followed by SCORE_FULLER_AGG_WIN.
SCORE_FULLER__FT_AGG =

3-2 (win 4-5 on aggregate)

3-2 (4-5 on aggregate)
3-2 (4-5 on agg)
3-2 (4-5 agg)
3-2 (4-5 agg.)
  or  
3-2 (agg 4-5)
%Q<
             \\(
                 #{SCORE_FULLER_HT_OPT} 
                 #{SCORE_FULLER_AGG_WIN}
             \\)
>
## Full-time score (ft1/ft2) followed by the parenthesised aggregate
##   fragment, e.g. 2-3 (win 5-4 on aggregate); captured as score_fuller.
SCORE_FULLER__FT_AGG__RE =
%r{
(?<score_fuller>
   \b   
    (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
    [ ]+
     #{SCORE_FULLER__FT_AGG}
)}ix
## "More" variant - only the parenthesised aggregate part,
##   e.g. (win 5-4 on aggregate); captured as score_fuller_more.
SCORE_FULLER_MORE__FT_AGG__RE =
%r{
(?<score_fuller_more>
     #{SCORE_FULLER__FT_AGG}
)}ix
## Regex source fragment for "aggregate + away goals" in parentheses:
##   the HT_OPT fragment, SCORE_FULLER_AGG, a comma, then
##   SCORE_FULLER_AWAY_WIN.
SCORE_FULLER__FT_AGG_AWAY =

ft + agg + away

2-1 (3-3 on aggregate, win on away goals)
2-1 (3-3 on aggregate, win 2-1 on away goals)
%Q<
             \\(
                #{SCORE_FULLER_HT_OPT} 
                #{SCORE_FULLER_AGG}
                   [ ]*,[ ]*
                 #{SCORE_FULLER_AWAY_WIN}
             \\)
>
## Full-time score (ft1/ft2) followed by the "aggregate + away goals"
##   fragment, e.g. 2-1 (3-3 on aggregate, win 2-1 on away goals);
##   captured as score_fuller.
SCORE_FULLER__FT_AGG_AWAY__RE =
%r{
(?<score_fuller>
   \b   
    (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
    [ ]+
     #{SCORE_FULLER__FT_AGG_AWAY}
)}ix
## "More" variant - only the parenthesised "aggregate + away goals" part;
##   captured as score_fuller_more.
SCORE_FULLER_MORE__FT_AGG_AWAY__RE =
%r{
(?<score_fuller_more>
     #{SCORE_FULLER__FT_AGG_AWAY}
)}ix
## Regex source fragment for "extra time + aggregate + penalties" in
##   parentheses: HT_OPT / FT_OPT fragments, the aet marker (ET_EN), then
##   comma-separated SCORE_FULLER_AGG and SCORE_FULLER_P_WIN.
SCORE_FULLER__ET_AGG_P =

2-1 (aet, 3-3 on aggregate, win 5-2 on pens)

2-1 (aet, 3-3 agg, 5-2 pen.)
%Q<
             \\(
                #{SCORE_FULLER_HT_OPT} 
                #{SCORE_FULLER_FT_OPT} 
                (?<aet> #{ET_EN})
                    [ ]*,[ ]*
                    #{SCORE_FULLER_AGG}  
                    [ ]*,[ ]*
                    #{SCORE_FULLER_P_WIN}                     
             \\)
>
## After-extra-time score (et1/et2) followed by the
##   "aet + aggregate + penalties" fragment,
##   e.g. 2-1 (aet, 3-3 on aggregate, win 5-2 on pens);
##   captured as score_fuller.
SCORE_FULLER__ET_AGG_P__RE =
%r{
(?<score_fuller>
   \b   
    (?<et1>\d{1,2}) - (?<et2>\d{1,2})
    [ ]+
     #{SCORE_FULLER__ET_AGG_P}
)}ix
## "More" variant - only the parenthesised
##   "aet + aggregate + penalties" part; captured as score_fuller_more.
SCORE_FULLER_MORE__ET_AGG_P__RE =
%r{
(?<score_fuller_more>
     #{SCORE_FULLER__ET_AGG_P}
)}ix
## Union of all "score + parenthesised details" regexes.
##   The alternatives are tried in the order listed.
SCORE_FULLER_RE =

map tables

note: order matters - first come-first matched/served
Regexp.union(
SCORE_FULLER__HT_FT__RE,       ## e.g.  3-2 (HT 2-1)
SCORE_FULLER__ET_P__RE,        ## e.g.  2-2 (aet, win 5-3 on pens)
SCORE_FULLER__ET__RE,          ## e.g.  2-3 (aet)
SCORE_FULLER__FT_P__RE,        ## e.g.  2-2 (win 5-3 on pens)
SCORE_FULLER__FT_AGG__RE,      ## e.g.  2-3 (win 5-4 on aggregate)
SCORE_FULLER__FT_AGG_AWAY__RE, ## e.g.  2-1 (3-3 on aggregate, win 2-1 on away goals)
SCORE_FULLER__ET_AGG_P__RE,    ## e.g.  2-1 (aet, 3-3 on aggregate, win 5-2 on pens)
)
## Stand-alone half-time marker (ht) in parentheses, without any score;
##   sets the ht capture inside score_fuller_more (case-insensitive).
SCORE_FULLER_MORE__HT__RE =

add support for “stand-alone” (HT) and (FT) - keep why? why not?

%r{
(?<score_fuller_more>
    \( (?<ht> ht ) \)
)}ix
## Stand-alone full-time marker (ft) in parentheses, without any score;
##   sets the ft capture inside score_fuller_more (case-insensitive).
SCORE_FULLER_MORE__FT__RE =
%r{
(?<score_fuller_more>
     \( (?<ft> ft ) \)  
)}ix
## Special "more" variant that carries the after-extra-time score itself:
##   the aet marker (ET_EN), one space, then et1/et2 - e.g. (aet 4-3).
SCORE_FULLER_MORE__FT_ET__RE =

add special for fuller_more

(aet 4-3)   -  core score is ft, and fuller more incl. et!!!
%r{
(?<score_fuller_more>
      \(#{ET_EN}
           [ ]
       (?<et1>\d{1,2}) - (?<et2>\d{1,2})
      \) 
)}ix
## "Classic" bare score in parentheses, e.g. (1-1), read as the half-time
##   score (ht1/ht2).
##   The pattern is very generic - it is listed last in
##   SCORE_FULLER_MORE_RE so that the more specific variants are tried first.
SCORE_FULLER_MORE__HT_FT__CLASSIC_RE =

note - simply (1-1) !!!!!

note - special attention needed for placemenent in processing error!!!
  make sure it is the last (or on of the last) match(es)
%r{
(?<score_fuller_more>
     \(  
          (?<ht1>\d{1,2}) - (?<ht2>\d{1,2}) 
     \)
)}ix
## Union of all parenthesised "more" regexes (the part that may follow a
##   plain score). Alternatives are tried in the order listed; the bare
##   (2-1) form comes last.
SCORE_FULLER_MORE_RE =
Regexp.union(
  SCORE_FULLER_MORE__FT__RE,          ## e.g. (ft)
  SCORE_FULLER_MORE__HT__RE,          ## e.g. (ht)
  SCORE_FULLER_MORE__HT_FT__RE,       ## e.g. (HT 2-1)
  SCORE_FULLER_MORE__ET_P__RE,        ## e.g. (aet, win 5-3 on pens)
  SCORE_FULLER_MORE__ET__RE,          ## e.g. (aet)
  SCORE_FULLER_MORE__FT_ET__RE,       ## e.g. (aet 3-2) - (SPECIAL) incl. after extra-time score!!
  SCORE_FULLER_MORE__FT_P__RE,        ## e.g. (win 5-3 on pens)
  SCORE_FULLER_MORE__FT_AGG__RE,      ## e.g. (win 5-4 on aggregate)
  SCORE_FULLER_MORE__FT_AGG_AWAY__RE, ## e.g. (3-3 on aggregate, win 2-1 on away goals)
  SCORE_FULLER_MORE__ET_AGG_P__RE,    ## e.g. (aet, 3-3 on aggregate, win 5-2 on pens)

  SCORE_FULLER_MORE__HT_FT__CLASSIC_RE,   ## e.g. (2-1)  half-time !!!!
)
## Date range written as two complete dates joined by a dash:
##     [day name] month name, day [[,] year]  -  [day name] month name, day [[,] year]
##   Parts of the first date are captured with suffix 1 (day_name1,
##   month_name1, day1, year1), of the second with suffix 2.
##   Day/month names come from DAY_NAMES / MONTH_NAMES (defined elsewhere).
DURATION_I_RE =
%r{
(?<duration>
    \b
  (?:
   ## optional day name
   ((?<day_name1>#{DAY_NAMES})
      [ ]
   )?
   (?<month_name1>#{MONTH_NAMES})
      [ ] 
   (?<day1>\d{1,2})
   ## optional year
   (  ,?   # optional comma
      [ ]
      (?<year1>\d{4})
   )?

   ## support + and -  (add .. or such - why??)
   [ ]* - [ ]*

   ## optional day name
   ((?<day_name2>#{DAY_NAMES})
      [ ]
   )?
   (?<month_name2>#{MONTH_NAMES})
      [ ] 
   (?<day2>\d{1,2})
   ## optional year
   (  ,?   # optional comma
      [ ]
      (?<year2>\d{4})
   )?
  )
   \b
)}ix
## Shorthand date range within one month: month name, day1-day2 and an
##   optional year. Captures month_name1, day1, day2 and year1 only -
##   there is NO month_name2 / year2 in this variant.
DURATION_II_RE =

variant ii

add support for shorthand
   August 16-18, 2011     
   September 13-15, 2011
    October 18-20, 2011
    March 6-8 2012
    March 6-8

 - add support for August 16+17 or such (and check 16+18)
     use <op> to check if day2 is a plus or range or such - why? why not?
%r{
(?<duration>
    \b
   (?:
       (?<month_name1>#{MONTH_NAMES})
           [ ]
        (?<day1>\d{1,2})
             -
        (?<day2>\d{1,2})
          (?:
            ,?     ## optional comma
            [ ]
            (?<year1>\d{4})
          )?     ## optional year   
   )
   \b
)}ix
## Union of the two date range variants; the two-full-dates form
##   (DURATION_I_RE) is tried before the shorthand (DURATION_II_RE).
DURATION_RE =

map tables

note: order matters; first come-first matched/served
Regexp.union(
   DURATION_I_RE,
   DURATION_II_RE,
)

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(lines, debug: false) ⇒ Lexer

Returns a new instance of Lexer.

Raises:

  • (ArgumentError)


34
35
36
37
38
39
# File 'lib/sportdb/parser/lexer.rb', line 34

## Set up a lexer for the passed-in text.
##
##   lines - the complete text (MUST be a String)
##   debug - debug flag (default: false)
##
##   raises ArgumentError if lines is not a String
def initialize( lines, debug: false )
  unless lines.is_a?(String)
    raise ArgumentError, "(string) text expected for lexer; got #{lines.class.name}"
  end

  @txt   = lines
  @debug = debug
end

Class Method Details

._build_date(m) ⇒ Object

“internal” date helpers



305
306
307
308
309
310
311
312
313
314
315
316
317
318
# File 'lib/sportdb/parser/token-date.rb', line 305

## Turn a date match into a hash of numeric date parts.
##   Only groups that matched produce a key (:y, :yy, :m, :d, :wday).
##
##   m - match, accessed via m[:year], m[:yy], m[:month], m[:month_name],
##         m[:day] and m[:day_name]
def self._build_date( m )
  parts = {}

  year = m[:year]
  parts[:y] = year.to_i(10)   if year

  ## two-digit year (e.g. 25 or 78) is kept apart from :y
  yy = m[:yy]
  parts[:yy] = yy.to_i(10)   if yy

  ## month by name wins over a numeric month;
  ##   names may be in any case (JULY/Jul) - thus, ALWAYS downcase for lookup
  month, month_name = m[:month], m[:month_name]
  if month_name
    parts[:m] = MONTH_MAP[ month_name.downcase ]
  elsif month
    parts[:m] = month.to_i(10)
  end

  day = m[:day]
  parts[:d] = day.to_i(10)   if day

  day_name = m[:day_name]
  parts[:wday] = DAY_MAP[ day_name.downcase ]   if day_name

  parts
end

._build_date_legs(m) ⇒ Object



321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
# File 'lib/sportdb/parser/token-date.rb', line 321

## Build the two dates (legs) from a match with numbered groups.
##
##   m - match with :month_name1, :day1, :day2 and (optional) :month_name2
##
##   returns { date1: { m:, d: }, date2: { [m:,] d: } }
##     note - month names may be in any case, thus ALWAYS downcase for lookup
def self._build_date_legs( m )
  first = { m: MONTH_MAP[ m[:month_name1].downcase ],
            d: m[:day1].to_i(10) }

  ## the month of the second leg is optional
  second = {}
  month_name2 = m[:month_name2]
  second[:m] = MONTH_MAP[ month_name2.downcase ]   if month_name2
  second[:d] = m[:day2].to_i(10)

  { date1: first, date2: second }
end

._build_duration(m) ⇒ Object



169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
# File 'lib/sportdb/parser/token-date_duration.rb', line 169

## Build the start/end date parts of a duration (date range) match.
##
##   m - match with the groups year1, month_name1, day1, day_name1 (start)
##         and year2, month_name2, day2, day_name2 (end)
##
##   returns { start: {...}, end: {...} } - only matched groups produce
##     a key (:y, :m, :d, :wday)
def self._build_duration( m )
  legs = { start: 1, end: 2 }.map do |leg, no|
    parts = {}

    year = m[:"year#{no}"]
    parts[:y] = year.to_i(10)   if year

    month_name = m[:"month_name#{no}"]
    parts[:m] = MONTH_MAP[ month_name.downcase ]   if month_name

    day = m[:"day#{no}"]
    parts[:d] = day.to_i(10)   if day

    day_name = m[:"day_name#{no}"]
    parts[:wday] = DAY_MAP[ day_name.downcase ]   if day_name

    [leg, parts]
  end

  legs.to_h
end

._build_goal_count(m) ⇒ Object



436
437
438
439
440
441
442
# File 'lib/sportdb/parser/token-goals.rb', line 436

## Build a goal count hash from a match.
##
##   m - match with :value plus the flags :og / :pen and their
##         (optional) numbers :og_value / :pen_value
##
##   returns hash with :count, :og, :pen (only for groups present);
##     a flag without a number counts as 1
def self._build_goal_count( m )
  tally = {}
  tally[:count] = m[:value].to_i(10)   if m[:value]

  if m[:og]     ## check flag
    og_value = m[:og_value]
    tally[:og] = og_value ? og_value.to_i(10) : 1
  end

  if m[:pen]    ## check flag
    pen_value = m[:pen_value]
    tally[:pen] = pen_value ? pen_value.to_i(10) : 1
  end

  tally
end

._build_goal_minute(m) ⇒ Object



389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
# File 'lib/sportdb/parser/token-goals.rb', line 389

## Build a goal minute hash from a match.
##
##   m - match with :value (always required), the optional stoppage time
##         :value2, the flags :og, :pen, :fk, :hdr and optional :secs
##
##   returns hash with :m plus (if present) :offset, :og, :pen,
##     :freekick, :header, :secs
def self._build_goal_minute( m )
  goal = { m: m[:value].to_i(10) }

  ## stoppage/injury time (offset)
  extra = m[:value2]
  goal[:offset] = extra.to_i(10)   if extra

  ## flags - result key => match group
  { og: :og, pen: :pen, freekick: :fk, header: :hdr }.each do |key, group|
    goal[key] = true   if m[group]
  end

  secs = m[:secs]
  goal[:secs] = secs.to_i(10)   if secs

  goal
end

._build_goal_type(m) ⇒ Object



448
449
450
451
452
453
454
455
# File 'lib/sportdb/parser/token-goals.rb', line 448

## Build a goal type hash from a match - one true entry per flag present.
##
##   m - match with the flags :og, :pen, :fk, :hdr
##
##   returns hash with any of :og, :pen, :freekick, :header
def self._build_goal_type( m )
  ## result key => match group
  flags = { og: :og, pen: :pen, freekick: :fk, header: :hdr }

  flags.each_with_object( {} ) do |(key, group), goal|
    goal[key] = true   if m[group]
  end
end

._build_minute(m) ⇒ Object



409
410
411
412
413
414
415
416
417
# File 'lib/sportdb/parser/token-goals.rb', line 409

## Build a minute hash from a match.
##
##   m - match with :value (always required) and optional :value2
##
##   returns { m: } plus :offset (stoppage/injury time) if present
def self._build_minute( m )
  minute = { m: m[:value].to_i(10) }

  extra = m[:value2]
  minute[:offset] = extra.to_i(10)   if extra

  minute
end

._build_score_team(m) ⇒ Object



412
413
414
415
416
417
418
419
420
421
422
# File 'lib/sportdb/parser/token-score.rb', line 412

## Build a "generic" score pair from a match.
##   The score might be full-time (ft), after extra-time (aet) or even
##   undecided/unknown - thus, the neutral groups score_i/score_ii.
##
##   returns { score: [score_i, score_ii] } (as integers)
def self._build_score_team( m )
  pair = %i[score_i score_ii].map { |group| m[group].to_i(10) }
  { score: pair }
end

._build_score_team_num(m) ⇒ Object



435
436
437
438
439
# File 'lib/sportdb/parser/token-score.rb', line 435

## Build a single-number score from a match.
##
##   returns { score: n } with n the :score_team_num group as integer
def self._build_score_team_num( m )
  { score: m[:score_team_num].to_i(10) }
end

._build_score_team_pen(m) ⇒ Object



426
427
428
429
430
431
# File 'lib/sportdb/parser/token-score.rb', line 426

## Build a score pair from the groups score_i and score_pen.
##
##   returns { score: [score_i, score_pen] } (as integers)
def self._build_score_team_pen( m )
  pair = %i[score_i score_pen].map { |group| m[group].to_i(10) }
  { score: pair }
end

._build_status(m) ⇒ Object



100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# File 'lib/sportdb/parser/token-status.rb', line 100

## Build a status hash with a normalized status text.
##
##   m - match with one group per status (postponed, canceled, walkover,
##         awarded, suspended, abandoned, annulled, voided, replay),
##         the "generic" :status and the optional :status_note
##
##   returns { status: } plus :status_note if present
def self._build_status( m )
  ## group => normalized text; checked top to bottom
  ##   note - voided gets reported as annulled
  labels = [
    [:postponed, 'postponed'],
    [:canceled,  'canceled'],
    [:walkover,  'walkover'],
    [:awarded,   'awarded'],
    [:suspended, 'suspended'],
    [:abandoned, 'abandoned'],
    [:annulled,  'annulled'],
    [:voided,    'annulled'],
    [:replay,    'replay'],
  ]
  _group, label = labels.find { |group, _text| m[group] }

  ## fallback on "generic" status (shouldn't happen)
  status = { status: label || m[:status] }

  ## includes note? e.g.  awarded; originally 2-0
  note = m[:status_note]
  status[:status_note] = note   if note

  status
end

._build_time(m) ⇒ Object



95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# File 'lib/sportdb/parser/token-time.rb', line 95

## Build a time hash from a match - unified to numeric hour/minute
##   (e.g. 12.40 or 12h40 => h: 12, m: 40).
##
##   m - match with :hour, :minute, :time and optional :timezone; for the
##         optional local time e.g. 18:30 (19:30 BST) the groups
##         :time_local, :local_hour, :local_minute, :local_timezone
##
##   returns { time: { h:, m: [, timezone:] } }
##             plus  time_local: { h:, m: [, timezone:] }  if present
##
##   raises ArgumentError if the time OR the local time is out-of-range
##     (hour 0-23, minute 0-59)
def self._build_time( m )
  data = { time: {} }

  hour   = m[:hour].to_i(10)    ## allow 08/07/etc.
  minute = m[:minute].to_i(10)

  ##   check if 24:00 possible? or only 0:00 (23:59)
  unless hour.between?( 0, 23 ) && minute.between?( 0, 59 )
    raise ArgumentError, "parse error - time >#{m[:time]}< out-of-range"
  end

  data[:time][:h] = hour
  data[:time][:m] = minute
  data[:time][:timezone] = m[:timezone]    if m[:timezone]

  ## check if local time present e.g.
  ##    18:30 (19:30)
  ##    18:30 (19:30 BST)  etc.
  if m[:time_local]
    local_hour   = m[:local_hour].to_i(10)    ## allow 08/07/etc.
    local_minute = m[:local_minute].to_i(10)

    ## note - validate the LOCAL values here
    ##          (and not the already checked hour/minute again)
    unless local_hour.between?( 0, 23 ) && local_minute.between?( 0, 59 )
      raise ArgumentError, "parse error - local time >#{m[:time_local]}< out-of-range"
    end

    data[:time_local] = { h: local_hour, m: local_minute }
    data[:time_local][:timezone] = m[:local_timezone]    if m[:local_timezone]
  end

  data
end

._mk_score_fuller_agg(win:) ⇒ Object

regex score helpers

note - backslashes MUST be doubled (e.g. write \\d for \d) when the fragment is built with an interpolating string literal (%Q<>) instead of a "simple" single-quoted string ('')


24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# File 'lib/sportdb/parser/token-score_fuller.rb', line 24

## Build the regex SOURCE (a string, not a Regexp) for an aggregate score;
##   captures agg1/agg2. Three spellings get accepted:
##     1)  [win ] 4-5 on agg / on aggregate
##     2)  4-5 followed by AGG_EN (defined elsewhere)
##     3)  agg 4-5
##
##   win - if true, opt 1) allows an optional leading "win "
##
##   note - \d is written as \\d because %Q<> processes backslash escapes;
##          the text is presumably meant for interpolation into a
##          %r{}x literal (extended mode - whitespace and ## comments
##          inside the string are ignored there) - confirm at call sites
def self._mk_score_fuller_agg( win: )    ## with optional win - true|false
   %Q<
                 (?:
                    ############
                    ## opt 1)  with win
                    (?:
                       #{ win ? '(?: win [ ] )?' : '' }   
                        (?<agg1>\\d{1,2}) - (?<agg2>\\d{1,2})
                          [ ] on [ ] agg (?: regate )?  
                    )
                    |        
                    #####
                    ## opt 2)  "classic" (post)
                    (?:
                       (?<agg1>\\d{1,2}) - (?<agg2>\\d{1,2})
                          [ ]*
                        #{AGG_EN}   
                    )
                    |
                    #####
                    ## opt 3) agg up-front (pre)
                    (?:
                         agg [ ]
                       (?<agg1>\\d{1,2}) - (?<agg2>\\d{1,2})   
                    )
                 )
    >
end

._mk_score_fuller_p(win:) ⇒ Object

with optional win - true|false



53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# File 'lib/sportdb/parser/token-score_fuller.rb', line 53

## Build the regex SOURCE (a string, not a Regexp) for a penalty
##   shoot-out score; captures p1/p2. Three spellings get accepted:
##     1)  [win ] 5-3 on pens
##     2)  5-3 followed by P_EN (defined elsewhere)
##     3)  pen 5-3  or  p 5-3
##
##   win - if true, opt 1) allows an optional leading "win "
##
##   note - \d is written as \\d because %Q<> processes backslash escapes;
##          the text is presumably meant for interpolation into a
##          %r{}x literal (extended mode) - confirm at call sites
def self._mk_score_fuller_p( win: )    ## with optional win - true|false
   %Q<
                 (?:
                    ############
                    ## opt 1)  with win
                    (?:
                        #{ win ? '(?: win [ ] )?' : '' }
                        (?<p1>\\d{1,2}) - (?<p2>\\d{1,2})
                          [ ] on [ ] pens
                    )
                    |        
                    #####
                    ## opt 2)  "classic" (post)
                    (?:
                       (?<p1>\\d{1,2}) - (?<p2>\\d{1,2})
                          [ ]*
                        #{P_EN}   
                    )
                    |
                    #####
                    ## opt 3) up-front (pre)
                    (?:
                         (?: pen|p) [ ]
                       (?<p1>\\d{1,2}) - (?<p2>\\d{1,2})   
                    )
                 )                   
    >
end

._parse_date(str) ⇒ Object



38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# File 'lib/sportdb/parser/token_helpers.rb', line 38

## Parse a complete date text into a hash of numeric date parts.
##
##   str - date text; leading/trailing spaces get stripped
##
##   returns hash with (if present) :y, :yy, :m, :d, :wday  or
##     nil if DATE_RE does not match OR does not span the complete text
def self._parse_date( str )
  m = DATE_RE.match( str.strip )
  return nil   unless m    ## no match

  ####  todo/fix/check:
  ###   wrapping with \A \z is NOT working with union - check later - why?
  ###   thus, hand-coded anchor check via pre_match/post_match
  ##   note - match BUT not anchored to start and end-of-string!!!
  ##            report, error somehow??
  return nil   unless m.pre_match.empty? && m.post_match.empty?

  parts = {}

  year = m[:year]
  parts[:y] = year.to_i(10)   if year

  ## check - use y too for two-digit year or keep separate - why? why not?
  yy = m[:yy]     ## two digit year (e.g. 25 or 78 etc.)
  parts[:yy] = yy.to_i(10)   if yy

  ## month by name wins over a numeric month;
  ##   names may be in any case (JULY/JUL) - thus, ALWAYS downcase for lookup
  month, month_name = m[:month], m[:month_name]
  if month_name
    parts[:m] = MONTH_MAP[ month_name.downcase ]
  elsif month
    parts[:m] = month.to_i(10)
  end

  day = m[:day]
  parts[:d] = day.to_i(10)   if day

  day_name = m[:day_name]
  parts[:wday] = DAY_MAP[ day_name.downcase ]   if day_name

  parts
end

._parse_goal_count(str) ⇒ Object



422
423
424
425
426
427
428
429
430
431
432
433
434
# File 'lib/sportdb/parser/token-goals.rb', line 422

## Parse a complete goal count text.
##
##   str - text; leading/trailing spaces get stripped
##
##   returns the result of _build_goal_count  or
##     nil if GOAL_COUNT_RE does not match OR does not span the complete text
def self._parse_goal_count( str )
  m = GOAL_COUNT_RE.match( str.strip )
  return nil   unless m    ## no match

  ## note - match BUT not anchored to start and end-of-string!!!
  ##          report, error somehow??
  return nil   unless m.pre_match.empty? && m.post_match.empty?

  _build_goal_count( m )
end

._parse_goal_minute(str) ⇒ Object



374
375
376
377
378
379
380
381
382
383
384
385
386
# File 'lib/sportdb/parser/token-goals.rb', line 374

## Parse a complete goal minute text.
##
##   str - text; leading/trailing spaces get stripped
##
##   returns the result of _build_goal_minute  or
##     nil if GOAL_MINUTE_RE does not match OR does not span the complete text
def self._parse_goal_minute( str )
  m = GOAL_MINUTE_RE.match( str.strip )
  return nil   unless m    ## no match

  ## note - match BUT not anchored to start and end-of-string!!!
  ##          report, error somehow??
  return nil   unless m.pre_match.empty? && m.post_match.empty?

  _build_goal_minute( m )
end

._parse_score_full(str) ⇒ Object



69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# File 'lib/sportdb/parser/token_helpers.rb', line 69

## Parse a complete (full) score text into a hash of score pairs.
##
##   str - score text; leading/trailing spaces get stripped
##           (same as in the other _parse_* helpers)
##
##   returns hash with (if both numbers are present) :p, :et, :ft, :ht -
##     each a pair of integers  or
##     nil if SCORE_FULL_RE does not match OR does not span the complete text
def self._parse_score_full( str )
    ## note - strip - leading/trailing spaces
    m = SCORE_FULL_RE.match( str.strip )

    if m && m.pre_match == '' && m.post_match == ''
       score = {}
       score[:p]  = [m[:p1].to_i,m[:p2].to_i]     if m[:p1] && m[:p2]
       score[:et] = [m[:et1].to_i,m[:et2].to_i]   if m[:et1] && m[:et2]
       score[:ft] = [m[:ft1].to_i,m[:ft2].to_i]   if m[:ft1] && m[:ft2]
       score[:ht] = [m[:ht1].to_i,m[:ht2].to_i]   if m[:ht1] && m[:ht2]
       ## score[:agg] = [m[:agg1].to_i,m[:agg2].to_i]   if m[:agg1] && m[:agg2]
       score
    else
       ## no match at all  or
       ##   match BUT not anchored to start and end-of-string!!!
       ##     report, error somehow??
       nil
    end
end

._parse_team(str) ⇒ Object



23
24
25
26
27
28
29
30
31
32
33
34
35
# File 'lib/sportdb/parser/token_helpers.rb', line 23

## Check if the complete text is a team (text) name.
##
##   str - text; leading/trailing spaces get stripped
##
##   returns the match itself  or
##     nil if TEXT_RE does not match OR does not span the complete text
def self._parse_team( str )
  m = TEXT_RE.match( str.strip )
  return nil   unless m    ## no match

  ## note - match BUT not anchored to start and end-of-string!!!
  ##          report, error somehow??
  return nil   unless m.pre_match.empty? && m.post_match.empty?

  m
end

.build_map(lines, downcase: false) ⇒ Object



40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# File 'lib/sportdb/parser/token-date.rb', line 40

## Build a lookup map from every word to its line number (starting with 1
##   and NOT zero-based), e.g.
##     {"january" => 1,  "jan" => 1,
##      "february" => 2, "feb" => 2,
##      "march" => 3,    "mar" => 3, ...
##
##   lines    - array of lines, each an array of words
##   downcase - if true, use the downcased word for the key
def self.build_map( lines, downcase: false )
  map = {}
  lines.each_with_index do |names, idx|
    names.each do |name|
      key = downcase ? name.downcase : name
      map[ key ] = idx + 1
    end
  end
  map
end

.build_names(lines) ⇒ Object



33
34
35
36
37
# File 'lib/sportdb/parser/token-date.rb', line 33

## Join all words of all lines into a single alternation string e.g.
##   January|Jan|February|Feb|March|Mar|April|Apr|May|June|Jun|...
##
##   lines - array of lines, each an array of words
def self.build_names( lines )
  per_line = lines.each_with_object( [] ) do |words, acc|
    acc << words.join( '|' )
  end
  per_line.join( '|' )
end

.parse_date(str, start:) ⇒ Object

“top-level” date parser helper



344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
# File 'lib/sportdb/parser/token-date.rb', line 344

## "Top-level" date parser helper - turn a date text into a Date; if the
##   text has no (four-digit) year, the year gets calculated from the
##   start date.
##
##   str   - date text; matched (NOT anchored) against DATE_RE
##   start - start date (responds to year/month/day) e.g. of the season
##
##   returns a Date
##
##   note - on no match prints an error message and exits (status 1)
def self.parse_date( str, start: )
    if m=DATE_RE.match( str )

      year    = m[:year].to_i(10)  if m[:year]
      ## note - numeric months are possible too (same as in _build_date);
      ##          month by name wins; without this the year calculation
      ##          below fails on a nil month
      month   = m[:month].to_i(10)  if m[:month]
      month   = MONTH_MAP[ m[:month_name].downcase ]   if m[:month_name]
      day     = m[:day].to_i(10)   if m[:day]

      if year.nil?   ## try to calculate year
        year =  if  month > start.month ||
                   (month == start.month && day >= start.day)
                  # assume same year as start_at event (e.g. 2013 for 2013/14 season)
                  start.year
                else
                  # assume year+1 as start_at event (e.g. 2014 for 2013/14 season)
                  start.year+1
                end
      end
      Date.new( year,month,day )
    else
      puts "!! ERROR - unexpected date format; cannot parse >#{str}<"
      exit 1
    end
end

.parse_names(txt) ⇒ Object



6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# File 'lib/sportdb/parser/token-date.rb', line 6

## Parse a text with names (words) into an array of lines,
##   each an array of words.
##
##   - blank lines and comment lines (starting with #) get skipped
##   - inline (until end-of-line) comments get stripped too
##       e.g. Janvier  Janv  Jan  ## check janv in use??
##       =>   Janvier  Janv  Jan
##   - words are separated by spaces or tabs
##
##   todo/fix -- add check for duplicates
def self.parse_names( txt )
  txt.each_line
     .map    { |raw| raw.strip }
     .reject { |row| row.empty? || row.start_with?( '#' ) }
     .map    { |row| row.sub( /#.*/, '' ).strip.split( /[ \t]+/ ) }
end

Instance Method Details

#_build_date(m) ⇒ Object



319
# File 'lib/sportdb/parser/token-date.rb', line 319

## instance-level shortcut - forwards to the class method of the same name
def _build_date( m )
  self.class._build_date( m )
end

#_build_date_legs(m) ⇒ Object



337
# File 'lib/sportdb/parser/token-date.rb', line 337

## instance-level shortcut - forwards to the class method of the same name
def _build_date_legs( m )
  self.class._build_date_legs( m )
end

#_build_duration(m) ⇒ Object



185
# File 'lib/sportdb/parser/token-date_duration.rb', line 185

## instance-level shortcut - forwards to the class method of the same name
def _build_duration(m)
  self.class._build_duration( m )
end

#_build_goal_count(m) ⇒ Object



443
# File 'lib/sportdb/parser/token-goals.rb', line 443

## instance-level shortcut - forwards to the class method of the same name
def _build_goal_count( m )
  self.class._build_goal_count( m )
end

#_build_goal_minute(m) ⇒ Object



406
# File 'lib/sportdb/parser/token-goals.rb', line 406

## instance-level shortcut - forwards to the class method of the same name
def _build_goal_minute( m )
  self.class._build_goal_minute( m )
end

#_build_goal_type(m) ⇒ Object



456
# File 'lib/sportdb/parser/token-goals.rb', line 456

## instance-level shortcut - forwards to the class method of the same name
def _build_goal_type( m )
  self.class._build_goal_type( m )
end

#_build_minute(m) ⇒ Object



418
# File 'lib/sportdb/parser/token-goals.rb', line 418

## instance-level shortcut - forwards to the class method of the same name
def _build_minute( m )
  self.class._build_minute( m )
end

#_build_score_team(m) ⇒ Object



423
# File 'lib/sportdb/parser/token-score.rb', line 423

## instance-level shortcut - forwards to the class method of the same name
def _build_score_team( m )
  self.class._build_score_team( m )
end

#_build_score_team_num(m) ⇒ Object



440
# File 'lib/sportdb/parser/token-score.rb', line 440

## instance-level shortcut - forwards to the class method of the same name
def _build_score_team_num( m )
  self.class._build_score_team_num( m )
end

#_build_score_team_pen(m) ⇒ Object



432
# File 'lib/sportdb/parser/token-score.rb', line 432

## instance-level shortcut - forwards to the class method of the same name
def _build_score_team_pen( m )
  self.class._build_score_team_pen( m )
end

#_build_status(m) ⇒ Object



121
# File 'lib/sportdb/parser/token-status.rb', line 121

## instance-level shortcut - forwards to the class method of the same name
def _build_status( m )
  self.class._build_status( m )
end

#_build_time(m) ⇒ Object



138
# File 'lib/sportdb/parser/token-time.rb', line 138

## instance-level shortcut - forwards to the class method of the same name
def _build_time(m)
  self.class._build_time(m)
end

#_tokenize_line(line) ⇒ Object



551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
# File 'lib/sportdb/parser/lexer.rb', line 551

def _tokenize_line( line )
  tokens = []
  errors = []   ## keep a list of errors - why? why not?


  pos = 0
  ## track last offsets - to report error on no match
  ##   or no match in end of string
  offsets = [0,0]
  m = nil

  ## track number of geo text seen
  ##    (use for - do NOT break on two spaces if no geo text seen yet!!)
  geo_count = 0

  ####
  ## quick hack - keep re state/mode between tokenize calls!!!
  @re  ||= RE     ## note - switch between RE & INSIDE_RE


  if @re == RE  ## top-level
    ### check for modes once (per line) here to speed-up parsing
    ###   for now goals only possible for start of line!!
    ###        fix - remove optional [] - why? why not?
    
    ####
    ## note - ord e.g. (45) for match number can only start a (match) line
    ##                "inline" use NOT possible
    ## note -  ord (for ordinal number!!!) e.g match number (1), (42), etc.
    if (m = START_WITH_ORD.match(line))
       ## note -  strip enclosing () and convert to integer
       tokens << [:ORD, [m[:ord], { value: m[:value].to_i(10) } ]]

       offsets = [m.begin(0), m.end(0)]
       pos = offsets[1]    ## update pos
    elsif (m = START_WITH_YEAR.match(line))
       ## note -  strip enclosing () and convert to integer
       tokens << [:YEAR, m[:year].to_i(10)]

       offsets = [m.begin(0), m.end(0)]
       pos = offsets[1]    ## update pos

    ###
    ##  todo/fix - rename to START_GROUP_DEF_LINE_RE !!!!   
    elsif (m = GROUP_DEF_LINE_RE.match( line ))
      puts "  ENTER GROUP_DEF_RE MODE"   if debug?
      @re = GROUP_DEF_RE   

      tokens << [:GROUP_DEF, m[:group_def]]

      offsets = [m.begin(0), m.end(0)]
      pos = offsets[1]    ## update pos

    ###  todo/fix - rename to PROP_KEY_RE to START_WITH_PROP_KEY_RE !!!  
    elsif (m = PROP_KEY_RE.match( line ))
      ##  start with prop key (match will switch into prop mode!!!)
      ##   - fix - remove leading spaces in regex (upstream) - why? why not?
      ##
      ###  switch into new mode
      ##  switch context  to PROP_RE
        puts "  ENTER PROP_RE MODE"   if debug?
        key = m[:key]


        ### todo/fix - add prop yellow/red cards too - why? why not?
        ##  todo/fix - separate sent off and red card
        ##     sent-off - incl. red card, yellow/red card and the era before red cards!!
        if ['sent off'].include?( key.downcase) 
          @re = PROP_CARDS_RE    ## use CARDS_RE ???
          tokens << [:PROP_SENTOFF, m[:key]]   
        elsif ['red cards'].include?( key.downcase ) 
          @re = PROP_CARDS_RE    ## use CARDS_RE ???
          tokens << [:PROP_REDCARDS, m[:key]]
        elsif ['yellow cards'].include?( key.downcase )
          @re = PROP_CARDS_RE  
          tokens << [:PROP_YELLOWCARDS, m[:key]]
        elsif ['ref', 'referee', 
               'refs', 'referees'   ## note - allow/support assistant refs
              ].include?( key.downcase )
          @re = PROP_REFEREE_RE     
          tokens << [:PROP_REFEREE, m[:key]]
        elsif ['att', 'attn', 'attendance'].include?( key.downcase )
          @re = PROP_ATTENDANCE_RE
          tokens << [:PROP_ATTENDANCE, m[:key]]         
  
     #   elsif ['goals'].include?( key.downcase )
     #     @re = PROP_GOAL_RE
     #     tokens << [:PROP_GOALS, m[:key]]
         
        elsif ['penalties', 
               'penalty shootout',
               'penalty shoot-out',
               'penalty kicks'].include?( key.downcase )
          @re = PROP_PENALTIES_RE
          tokens << [:PROP_PENALTIES, m[:key]]
        else   ## assume (team) line-up
          @re = PROP_RE           ## use LINEUP_RE ???
          tokens << [:PROP, m[:key]]
        end

        offsets = [m.begin(0), m.end(0)]
        pos = offsets[1]    ## update pos
    ###
    ### todo/fix
    ###   rename to START_WITH_ROUND_DEF_OUTLINE_RE !!!!    
    elsif (m = ROUND_DEF_OUTLINE_RE.match( line ))
      puts "   ENTER ROUND_DEF_RE MODE"  if debug?
      @re = ROUND_DEF_RE   

      ## note - return ROUND_DEF NOT  ROUND_OUTLINE token
      tokens << [:ROUND_DEF, m[:round_outline]]

      offsets = [m.begin(0), m.end(0)]
      pos = offsets[1]    ## update pos
    elsif (m = ROUND_OUTLINE_RE.match( line ))
      puts "   ROUND_OUTLINE"  if debug?
      ## note - derive round level from no of (leading) markers
      ##             e.g. ▪/:: is 1, ▪▪/::: is 2, ▪▪▪/:::: is 3, etc.
      ##       note  - ascii-style starts with double ::, thus, autodecrement by one!
      round_level = m[:round_marker].size
      round_level -= 1  if m[:round_marker].start_with?( '::' ) 

      tokens << [:ROUND_OUTLINE, [m[:round_outline], 
                      { outline: m[:round_outline] , 
                        level: round_level}]]

      ## note - eats-up line for now (change later to only eat-up marker e.g. »|>>)
      offsets = [m.begin(0), m.end(0)]
      pos = offsets[1]    ## update pos
    elsif (m = START_GOAL_LINE_RE.match( line ))   ## line starting with ( - assume
      ##  switch context to GOAL_RE (goalline(s))
      ####
      ##  note - check for alternate goal line styles / formats    
      if START_GOAL_LINE_COMPAT_RE.match(line ) 
        ## "legacy" style starting with minute e.g. 
        ##  (6 Puskás 0-1, 9 Czibor 0-2, 11 Morlock 1-2, 18 Rahn 2-2,
        ##    84 Rahn 3-2)
        @re = GOAL_COMPAT_RE
        puts "  ENTER GOAL_COMPAT_RE MODE"   if debug?

        tokens << [:GOALS_COMPAT, "<|GOALS_COMPAT|>"]
      elsif START_GOAL_LINE_ALT_RE.match( line )
        ##  goals with scores e.g. 
        ##    (1-0 Franck Ribéry, 2-0 Ivica Olić, 2-1 Wayne Rooney)
        ##         -or-
        ##      (Dion Beljo  1-0 
        ##                   1-1  Andreas Gruber 
        ##   Matthias Seidl  2-1)   
        @re = GOAL_ALT_RE
        puts "  ENTER GOAL_ALT_RE MODE"   if debug?

        tokens << [:GOALS_ALT, "<|GOALS_ALT|>"]
      else
        ## "standard" / default style
        @re = GOAL_RE
        puts "  ENTER GOAL_RE MODE"   if debug?

        tokens << [:GOALS, "<|GOALS|>"]
      end

      ## note - eat-up ( for now
      ##   pass along "virtual" GOALS or GOALS_ALT token 
      ##      (see INLINE_GOALS for the starting goal line inline)     
      offsets = [m.begin(0), m.end(0)]
      pos = offsets[1]    ## update pos      
    end
  end



  old_pos = -1   ## allows to backtrack to old pos (used in geo)

  while m = @re.match( line, pos )
    # if debug?
    #  pp m
    #  puts "pos: #{pos}"
    # end
    offsets = [m.begin(0), m.end(0)]

    if offsets[0] != pos
      ## match NOT starting at start/begin position!!!
      ##  report parse error!!!
      msg =  "!! WARN - parse error (tokenize) - skipping >#{line[pos..(offsets[0]-1)]}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
      puts msg

      errors << "parse error (tokenize) - skipping >#{line[pos..(offsets[0]-1)]}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
      log( msg )
    end


    ##
    ## todo/fix - also check if possible
    ##   if no match but not yet end of string!!!!
    ##    report skipped text run too!!!

    old_pos = pos
    pos     = offsets[1]

#    pp offsets   if debug?

    ##
    ## note: racc requires pairs e.g. [:TOKEN, VAL]
    ##         for VAL use "text" or ["text", { opts }]  array


  t = if @re == ROUND_DEF_RE 
           if m[:spaces] || m[:space] 
               nil    ## skip spaces
           elsif m[:date]
            [:DATE, [m[:date], _build_date( m )]]
          elsif m[:duration]
            [:DURATION, [m[:duration], _build_duration( m )]] 
          elsif m[:sym]
              sym = m[:sym]
              case sym
              when '|' then  [:'|']
              when ':' then  [:':']
              when ',' then  [:',']
              else
                puts "!!! TOKENIZE ERROR (sym) - ignore sym >#{sym}<"
                nil  ## ignore others (e.g. brackets [])
              end
           elsif m[:any]
              ## todo/check log error
               msg = "parse error (tokenize round_def) - skipping any match>#{m[:any]}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
               puts "!! WARN - #{msg}"
  
               errors << msg
               log( "!! WARN - #{msg}" )
       
               nil   
            else
              ## report error/raise exception
               puts "!!! TOKENIZE ERROR - no match found"
               nil 
            end
      elsif @re == GROUP_DEF_RE
           if m[:spaces] || m[:space] 
               nil    ## skip spaces
           elsif m[:text]
               [:TEAM, m[:text]]  
           elsif m[:sym]
              sym = m[:sym]
              case sym
              when '|' then  [:'|']
              when ':' then  [:':']
              when ',' then  [:',']
              else
                puts "!!! TOKENIZE ERROR (sym) - ignore sym >#{sym}<"
                nil  ## ignore others (e.g. brackets [])
              end
           elsif m[:any]
              ## todo/check log error
               msg = "parse error (tokenize group_def) - skipping any match>#{m[:any]}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
               puts "!! WARN - #{msg}"
  
               errors << msg
               log( "!! WARN - #{msg}" )
       
               nil   
            else
              ## report error/raise exception
               puts "!!! TOKENIZE ERROR - no match found"
               nil 
            end
       elsif @re == GEO_RE
           ### note - possibly end inline geo on [ (and others?? in the future
           ## note: break on double spaces e.g.
           ## e.g. Jul/16 @ Arena Auf Schalke, Gelsenkirchen  Serbia 0-1 England    
           if m[:spaces]
                 ### note - do NOT break out 
                 ##           if not text seen yet!!!
                 if geo_count > 0
                    ## get out of geo mode and backtrack (w/ next)
                    puts "  LEAVE GEO_RE MODE, BACK TO TOP_LEVEL/RE"  if debug?
                    @re = RE
                    pos = old_pos
                    next   ## backtrack (resume new loop step)
                 else 
                     nil   ## skip spaces
                 end                
           elsif m[:space] 
               nil    ## skip (single) space
           elsif m[:text]
               geo_count += 1
               [:GEO, m[:text]]   ## keep pos - why? why not?
           elsif m[:geo_end]   ## "hacky" special comma; always ends geo mode!!!
                 ## get out of geo mode and backtrack (w/ next)
                 puts "  LEAVE GEO_RE MODE, BACK TO TOP_LEVEL/RE"  if debug?
                 @re = RE
                 pos = old_pos
                 next   ## backtrack (resume new loop step)                 
           elsif m[:sym]
              sym = m[:sym]
              ## return symbols "inline" as is - why? why not?
              ## (?<sym>[;,@|\[\]-])
              case sym
                ## note - reset geo_count to 0 (avoids break on two spaces)
                ##                     if separator seen!!
              when ',' then geo_count = 0; [:',']
              when '' then geo_count = 0; [:',']  ## note - treat geo sep › (unicode) like comma for now!!!
              when '>' then geo_count = 0; [:',']  ## note - treat geo sep > (ascii) like comma for now!!!
              when '[' then
                 ## get out of geo mode and backtrack (w/ next)
                 puts "  LEAVE GEO_RE MODE, BACK TO TOP_LEVEL/RE"  if debug?
                 @re = RE
                 pos = old_pos
                 next   ## backtrack (resume new loop step)                 
            else
              puts "!!! TOKENIZE ERROR (sym) - ignore sym >#{sym}<"
              nil  ## ignore others (e.g. brackets [])
            end
          elsif m[:any]
             ## todo/check log error
             msg = "parse error (tokenize geo) - skipping any match>#{m[:any]}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
             puts "!! WARN - #{msg}"
  
             errors << msg
             log( "!! WARN - #{msg}" )
       
             nil   
          else
            ## report error/raise exception
             puts "!!! TOKENIZE ERROR - no match found"
             nil 
          end
      elsif @re == PROP_CARDS_RE 
        if m[:space] || m[:spaces]
              nil    ## skip space(s)
         elsif m[:prop_name]
              [:PROP_NAME, m[:name]]
         elsif m[:minute]
              minute = {}
              minute[:m]      = m[:value].to_i(10)
              minute[:offset] = m[:value2].to_i(10)   if m[:value2]
             ## note - for debugging keep (pass along) "literal" minute
             [:MINUTE, [m[:minute], minute]]
         elsif m[:sym]
            sym = m[:sym]
            case sym
            when ',' then [:',']
            when ';' then [:';']
            when '-' then [:'-']
            else
              nil  ## ignore others (e.g. brackets [])
            end
         else
            ## report error
             puts "!!! TOKENIZE ERROR (PROP_CARDS_RE) - no match found"
             nil 
         end    
      elsif @re == PROP_RE   ### todo/fix - change to LINEUP_RE !!!!
         if m[:space] || m[:spaces]
              nil    ## skip space(s)
         elsif m[:prop_key]   ## check for inline prop keys
              key = m[:key]   
              ##  supported for now coach/trainer (add manager?)
              if ['coach', 
                  'trainer'].include?( key.downcase )
                [:COACH, m[:key]]   ## use COACH_KEY or such - why? why not?
              else
                ## report error - for unknown (inline) prop key in lineup
                nil
              end
         elsif m[:inline_captain]
              [:INLINE_CAPTAIN, m[:inline_captain]]
         elsif m[:inline_yellow]
              card = {}
              card[:m]      = m[:minute].to_i(10)  if m[:minute]
              card[:offset] = m[:offset].to_i(10)  if m[:offset]
              [:INLINE_YELLOW, [m[:inline_yellow], card]]       
         elsif m[:inline_red]
              card = {}
              card[:m]      = m[:minute].to_i(10)  if m[:minute]
              card[:offset] = m[:offset].to_i(10)  if m[:offset]
              [:INLINE_RED, [m[:inline_red], card]]       
         elsif m[:inline_yellow_red]
              card = {}
              card[:m]      = m[:minute].to_i(10)  if m[:minute]
              card[:offset] = m[:offset].to_i(10)  if m[:offset]
              [:INLINE_YELLOW_RED, [m[:inline_yellow_red], card]]       
         elsif m[:prop_name]
              [:PROP_NAME, m[:name]]
         elsif m[:minute]
              minute = {}
              minute[:m]      = m[:value].to_i(10)
              minute[:offset] = m[:value2].to_i(10)   if m[:value2]
             [:MINUTE, [m[:minute], minute]]
         elsif m[:sym]
            sym = m[:sym]
            ## return symbols "inline" as is - why? why not?
            ## (?<sym>[;,@|\[\]-])
 
            case sym
            when ',' then [:',']
            when ';' then [:';']
            when '[' then [:'[']
            when ']' then [:']']
            when '(' then [:'(']
            when ')' then [:')']
            when '-' then [:'-']
            else
              nil  ## ignore others (e.g. brackets [])
            end
         else
            ## report error
             puts "!!! TOKENIZE ERROR (PROP_RE) - no match found"
             nil 
         end
      elsif @re == PROP_ATTENDANCE_RE
         if m[:space] || m[:spaces]
              nil    ## skip space(s)
         elsif m[:enclosed_name]
              ## reserved for use for sold out or such (in the future) - why? why not?
             [:ENCLOSED_NAME, m[:name]]
         elsif m[:num]
             [:PROP_NUM, [m[:num], { value: m[:value].to_i(10) } ]]
=begin             
         elsif m[:sym]
            sym = m[:sym]
            case sym
            when ',' then [:',']
            when ';' then [:';']
            # when '[' then [:'[']
            # when ']' then [:']']
            else
              nil  ## ignore others (e.g. brackets [])
            end
=end
         else
            ## report error
            puts "!!! TOKENIZE ERROR (PROP_ATTENDANCE_RE) - no match found"
            nil 
         end
      elsif @re == PROP_REFEREE_RE
         if m[:space] || m[:spaces]
              nil    ## skip space(s)
         elsif m[:prop_key]   ## check for inline prop keys
              key = m[:key]   
              ##  supported for now att/attn/attendance
              if ['att', 'attn', 'attendance' ].include?( key.downcase )
                [:ATTENDANCE, m[:key]]   ## use COACH_KEY or such - why? why not?
              else
                ## report error - for unknown (inline) prop key in lineup
                nil
              end
         elsif m[:prop_name]    ## note - change prop_name to player
             [:PROP_NAME, m[:name]]    ### use PLAYER for token - why? why not?
         elsif m[:num]
             [:PROP_NUM, [m[:num], { value: m[:value].to_i(10) } ]]
         elsif m[:enclosed_name]
              ## use HOLD,SAVE,POST or such keys - why? why not?
             [:ENCLOSED_NAME, m[:name]]
         elsif m[:sym]
            sym = m[:sym]
            case sym
            when ',' then [:',']
            when ';' then [:';']
 #           when '[' then [:'[']
 #           when ']' then [:']']
            else
              nil  ## ignore others (e.g. brackets [])
            end
         else
            ## report error
            puts "!!! TOKENIZE ERROR (PROP_REFEREE_RE) - no match found"
            nil 
         end       
      elsif @re == PROP_PENALTIES_RE
        if m[:space] || m[:spaces]
              nil    ## skip space(s)
         elsif m[:prop_name]    ## note - change prop_name to player
             [:PROP_NAME, m[:name]]    ### use PLAYER for token - why? why not?
         elsif m[:enclosed_name]
              ## use HOLD,SAVE,POST or such keys - why? why not?
             [:ENCLOSED_NAME, m[:name]]
         elsif m[:score]
              score = {}
              ## must always have ft for now e.g. 1-1 or such
              ###  change to (generic) score from ft -
              ##     might be score a.e.t. or such - why? why not?
              score[:score] = [m[:score1].to_i(10),
                               m[:score2].to_i(10)]  
              [:SCORE, [m[:score], score]]
         elsif m[:sym]
            sym = m[:sym]
            case sym
            when ',' then [:',']
            when ';' then [:';']
            when '[' then [:'[']
            when ']' then [:']']
            else
              nil  ## ignore others (e.g. brackets [])
            end
         else
            ## report error
            puts "!!! TOKENIZE ERROR (PROP_PENALTIES_RE) - no match found"
            nil 
         end
      elsif @re == GOAL_COMPAT_RE 
         if m[:space] || m[:spaces]
              nil    ## skip space(s)
         elsif m[:prop_name]    ## note - change prop_name to player
             [:PLAYER, m[:name]] 
         elsif m[:minute]
              minute = _build_minute( m )
             [:MINUTE, [m[:minute], minute]]
         elsif m[:goal_type]
              goal_type = _build_goal_type( m )
             [:GOAL_TYPE, [m[:goal_type], goal_type]]
         elsif m[:score]
            score = {}
             ##  note - score is "generic"
            ##      might be full-time (ft) or
            ##         after extra-time (aet) or such
            ##         or even undecided/unknown
            ##    thus, use score1/score2 and NOT ft1/ft2
            score[:score] = [m[:score1].to_i(10),
                             m[:score2].to_i(10)]  
            ## note - for debugging keep (pass along) "literal" score
            [:SCORE, [m[:score], score]]
         elsif m[:sym]
            sym = m[:sym]
            ## return symbols "inline" as is - why? why not?
            ## (?<sym>[;,@|\[\]-])
 
            case sym
            when ',' then [:',']
            when ')'  ## leave goal mode!!
                puts "  LEAVE GOAL_COMPAT_RE MODE"   if debug?
                @re = RE
                ##  note - use/return GOAL_END token   - change to GOAL_END_PAREN(THESIS)
                ##                                or GOAL_PAREN_CLOSE/END ???
                [:GOALS_END, '<|GOALS_END|>']
            else
              nil  ## ignore others (e.g. brackets [])
            end
         else
            ## report error
            puts "!!! TOKENIZE ERROR (GOAL_COMPAT_RE) - no match found"
            nil 
         end
      elsif @re == GOAL_ALT_RE 
         if m[:space] || m[:spaces]
              nil    ## skip space(s)
         elsif m[:prop_name]    ## note - change prop_name to player
             [:PLAYER, m[:name]] 
         elsif m[:goal_minute]
              minute = _build_goal_minute( m )
             [:GOAL_MINUTE, [m[:goal_minute], minute]]
         elsif m[:goal_type]
              goal_type = _build_goal_type( m )
             [:GOAL_TYPE, [m[:goal_type], goal_type]]
         elsif m[:score]
            score = {}
             ##  note - score is "generic"
            ##      might be full-time (ft) or
            ##         after extra-time (aet) or such
            ##         or even undecided/unknown
            ##    thus, use score1/score2 and NOT ft1/ft2
            score[:score] = [m[:score1].to_i(10),
                             m[:score2].to_i(10)]  
            ## note - for debugging keep (pass along) "literal" score
            [:SCORE, [m[:score], score]]
         elsif m[:sym]
            sym = m[:sym]
            ## return symbols "inline" as is - why? why not?
            ## (?<sym>[;,@|\[\]-])
 
            case sym
            when ',' then [:',']
            when ')'  ## leave goal mode!!
                puts "  LEAVE GOAL_ALT_RE MODE"   if debug?
                @re = RE
                ##  note - use/return GOAL_END token   - change to GOAL_END_PAREN(THESIS)
                ##                                or GOAL_PAREN_CLOSE/END ???
                [:GOALS_END, '<|GOALS_END|>']
            else
              nil  ## ignore others (e.g. brackets [])
            end
         else
            ## report error
            puts "!!! TOKENIZE ERROR (GOAL_ALT_RE) - no match found"
            nil 
         end
      elsif @re == GOAL_RE 
         if m[:space] || m[:spaces]
              nil    ## skip space(s)
         elsif m[:goals_none]    ## note - eats-up semicolon!! e.g. -; or - ;
             [:GOALS_NONE, "<|GOALS_NONE|>"]
         elsif m[:goal_sep_alt]
             [:GOAL_SEP_ALT, "<|GOAL_SEP_ALT|>" ]   ## e.g. dash (-) WITH leading & trailing space required    
         elsif m[:prop_name]    ## note - change prop_name to player
             [:PLAYER, m[:name]] 
         elsif m[:goal_minute]
              minute = _build_goal_minute( m )
             [:GOAL_MINUTE, [m[:goal_minute], minute]]
         elsif m[:goal_count]
              count = _build_goal_count( m ) 
              [:GOAL_COUNT, [m[:goal_count], count]]
         elsif m[:sym]
            sym = m[:sym]
            ## return symbols "inline" as is - why? why not?
            ## (?<sym>[;,@|\[\]-])
 
            case sym
            when ',' then [:',']
            when ';' then [:';']
            # when '[' then [:'[']
            # when ']' then [:']']
            when ')'  ## leave goal mode!!
                puts "  LEAVE GOAL_RE MODE"   if debug?
                @re = RE
                ##  note - use/return GOAL_END token   - change to GOAL_END_PAREN(THESIS)
                ##                                or GOAL_PAREN_CLOSE/END ???
                [:GOALS_END, '<|GOALS_END|>']
            else
              nil  ## ignore others (e.g. brackets [])
            end
         else
            ## report error
            puts "!!! TOKENIZE ERROR (GOAL_RE) - no match found"
            nil 
         end
      ###################################################
      ## assume TOP_LEVEL (a.k.a. RE) machinery
      else  
        if m[:space] || m[:spaces]
           nil   ## skip space(s)
        elsif m[:text]
          ##  note - top-level (for now always) assumes TEAM for TEXT match!!
          [:TEAM, m[:text]]   ## keep pos - why? why not?
        elsif m[:status]   ## (match) status e.g. cancelled, awarded, etc.
            [:STATUS, [m[:status], _build_status( m ) ]]
        elsif m[:inline_wo]   ## w/o - walkover  (match status)
            [:INLINE_WO, m[:inline_wo]]
        elsif m[:inline_np]   ## n/p - not played (match status)
            [:INLINE_NP, m[:inline_np]]         
        elsif m[:inline_bye]  ## bye  (match status)
            [:INLINE_BYE, m[:inline_bye]]
        elsif m[:inline_abd]  ## abd/abd. - abandoned (match status)
            [:INLINE_ABD, m[:inline_abd]]
        elsif m[:inline_void]  ## abd/abd. - abandoned (match status)
            [:INLINE_VOID, m[:inline_void]]
        elsif m[:inline_susp]  ## susp/susp. - suspended (match status)
            [:INLINE_SUSP, m[:inline_susp]]
        elsif m[:inline_ppd]  ## ppd/ppd. or postp/postp. - postponed (match status)
            [:INLINE_PPD, m[:inline_ppd]]
        elsif m[:inline_awd]  ## awd/awd. - awarded (match status)
            [:INLINE_AWD, m[:inline_awd]]
        elsif m[:inline_canc]  ## canc/canc. - cancelled/canceled (match status)
            [:INLINE_CANC, m[:inline_canc]]

        elsif m[:team_home]
            [:TEAM_HOME, m[:team_home]]
        elsif m[:team_away]
            [:TEAM_AWAY, m[:team_away]]
        elsif m[:team_neutral]
            [:TEAM_NEUTRAL, m[:team_neutral]]

        elsif m[:attendance]
             att = {} 
             att[:value] = m[:value].gsub( '_', '' ).to_i(10)
             ## note - for token id use INLINE_ATTENDANCE  (ATTENDANCE in use for prop!!!) 
            [:INLINE_ATTENDANCE, [m[:attendance], att ]]
        elsif m[:note]
            ###  todo/check:
            ##      use value hash - why? why not? or simplify to:
            ## [:NOTE, [m[:note], {note: m[:note] } ]]
             [:NOTE, m[:note]] 
        elsif m[:time]
            [:TIME, [m[:time], _build_time(m)]]
        elsif m[:date]
            [:DATE, [m[:date], _build_date(m)]]
        elsif m[:date_legs]
            [:DATE_LEGS, [m[:date_legs], _build_date_legs(m)]] 
        elsif m[:score_team]
            [:SCORE_TEAM, [m[:score_team], _build_score_team(m)]] 
        elsif m[:score_team_pen]
            [:SCORE_TEAM_PEN, [m[:score_team_pen], _build_score_team_pen(m)]] 
        elsif m[:score_team_num]
            [:SCORE_TEAM_NUM, [m[:score_team_num], _build_score_team_num(m)]]
          elsif m[:score_legs]
              legs = {}
              
              ### leg1
              score = {}
              score[:ft] = [m[:leg1_ft1].to_i(10),
                            m[:leg1_ft2].to_i(10)] 
              legs['leg1'] = score
              
              ### leg2
              score = {}
              score[:ft] = [m[:leg2_ft1].to_i(10),
                            m[:leg2_ft2].to_i(10)]  if m[:leg2_ft1] && m[:leg2_ft2]
              score[:et] = [m[:leg2_et1].to_i(10),
                            m[:leg2_et2].to_i(10)]  if m[:leg2_et1] && m[:leg2_et2]
              score[:p]  = [m[:leg2_p1].to_i(10),
                            m[:leg2_p2].to_i(10)]  if m[:leg2_p1] && m[:leg2_p2]
              legs['leg2'] = score
              
              ## check for (opt) aggregate - keep on "top-level"
              legs[:agg] = [m[:agg1].to_i(10),
                            m[:agg2].to_i(10)]  if m[:agg1] && m[:agg2]
              legs[:away] = true  if m[:away]  
              
              ## note - for debugging keep (pass along) "literal" score
              [:SCORE_LEGS, [m[:score_legs], legs]]
        elsif m[:score_full]
              score = {}
              score[:p] = [m[:p1].to_i(10),
                           m[:p2].to_i(10)]  if m[:p1] && m[:p2]
              score[:et] = [m[:et1].to_i(10),
                            m[:et2].to_i(10)]  if m[:et1] && m[:et2]
              score[:ft] = [m[:ft1].to_i(10),
                            m[:ft2].to_i(10)]  if m[:ft1] && m[:ft2]
              score[:ht] = [m[:ht1].to_i(10),
                            m[:ht2].to_i(10)]  if m[:ht1] && m[:ht2]
              score[:agg] = [m[:agg1].to_i(10),
                             m[:agg2].to_i(10)]  if m[:agg1] && m[:agg2]

              if m[:away1] && m[:away2]               
                 score[:away] = [m[:away1].to_i(10),
                                 m[:away2].to_i(10)]
              elsif m[:away]    ## fallback if no away score; check away flag
                 score[:away] = true
              end  

              ## add golden/silver flags
              score[:golden] = true   if m[:aetgg]  ## golden goal (gg)/sudden death (sd)
              score[:silver] = true   if m[:aetsg]  ## silver goal (sg)

            ## note - for debugging keep (pass along) "literal" score
            [:SCORE_FULL, [m[:score_full], score]]
        elsif m[:score_fuller]
              score = {}
              score[:p] = [m[:p1].to_i(10),
                           m[:p2].to_i(10)]  if m[:p1] && m[:p2]
              score[:et] = [m[:et1].to_i(10),
                            m[:et2].to_i(10)]  if m[:et1] && m[:et2]
              score[:ft] = [m[:ft1].to_i(10),
                            m[:ft2].to_i(10)]  if m[:ft1] && m[:ft2]
              score[:ht] = [m[:ht1].to_i(10),
                            m[:ht2].to_i(10)]  if m[:ht1] && m[:ht2]
              score[:agg] = [m[:agg1].to_i(10),
                             m[:agg2].to_i(10)]  if m[:agg1] && m[:agg2]
              if m[:away1] && m[:away2]               
                 score[:away] = [m[:away1].to_i(10),
                                 m[:away2].to_i(10)]
              elsif m[:away]    ## fallback if no away score; check away flag
                 score[:away] = true
              end  

              ## add aet flag true/false
              # score[:aet] = true   if m[:aet] || m[:aetgg] || m[:aetsg]
              
              ## add golden/silver flags
              score[:golden] = true   if m[:aetgg]  ## golden goal (gg)/sudden death (sd)
              score[:silver] = true   if m[:aetsg]  ## silver goal (sg)

            ## note - for debugging keep (pass along) "literal" score
            [:SCORE_FULLER, [m[:score_fuller], score]]
        elsif m[:score_fuller_more]
               ##    SCORE + SCORE_FULLER_MORE
               ## note -  after extra-time (aet) or full-time (ft) 
               ##           score may be present in SCORE!!! 
              score = {}
              score[:p] = [m[:p1].to_i(10),
                           m[:p2].to_i(10)]  if m[:p1] && m[:p2]
              score[:et] = [m[:et1].to_i(10),
                            m[:et2].to_i(10)]  if m[:et1] && m[:et2]
              score[:ft] = [m[:ft1].to_i(10),
                            m[:ft2].to_i(10)]  if m[:ft1] && m[:ft2]
              score[:ht] = [m[:ht1].to_i(10),
                            m[:ht2].to_i(10)]  if m[:ht1] && m[:ht2]
              score[:agg] = [m[:agg1].to_i(10),
                             m[:agg2].to_i(10)]  if m[:agg1] && m[:agg2]
              if m[:away1] && m[:away2]               
                 score[:away] = [m[:away1].to_i(10),
                                 m[:away2].to_i(10)]
              elsif m[:away]    ## fallback if no away score; check away flag
                 score[:away] = true
              end  

              ## add flag in score for et/ft/ht
              score[:score] = 'et'   if m[:aet] || m[:aetgg] || m[:aetsg]
              score[:score] = 'ft'   if m[:ft]
              score[:score] = 'ht'   if m[:ht]

              ## add golden/silver flags
              score[:golden] = true   if m[:aetgg]  ## golden goal (gg)/sudden death (sd)
              score[:silver] = true   if m[:aetsg]  ## silver goal (sg)

            ## note - for debugging keep (pass along) "literal" score
            [:SCORE_FULLER_MORE, [m[:score_fuller_more], score]]
        elsif m[:score]
            score = {}
             ##  note - score is "generic"
            ##      might be full-time (ft) or
            ##         after extra-time (aet) or such
            ##         or even undecided/unknown
            ##    thus, use score1/score2 and NOT ft1/ft2
            score[:score] = [m[:score1].to_i(10),
                             m[:score2].to_i(10)]  
         ## note - for debugging keep (pass along) "literal" score
          [:SCORE, [m[:score], score]]
        elsif m[:score_awd]   ## score awarded (awd/awd.)
            score = {}
            ### note - use "generic" score for now
            ##         to match  A 3-0 B [awarded] etc.
            score[:score] = [m[:score1].to_i(10),
                             m[:score2].to_i(10)]  
            ## add score[:awarded] = true ???
            ##    or only use match status to avoid duplicate?
            [:SCORE_AWD, [m[:score_awd], score]]
        elsif m[:score_abd]   ## score abandonded (abd/abd.)
            score = {}
            ### note - use "generic" score for now
            score[:score] = [m[:score1].to_i(10),
                             m[:score2].to_i(10)]  
            ## add score[:awarded] = true ???
            ##    or only use match status to avoid duplicate?
            [:SCORE_ABD, [m[:score_abd], score]]
      elsif m[:minute]
              minute = {}
              minute[:m]      = m[:value].to_i(10)
              minute[:offset] = m[:value2].to_i(10)   if m[:value2]
             ## note - for debugging keep (pass along) "literal" minute
             [:MINUTE, [m[:minute], minute]]
        elsif m[:vs]
           [:VS, m[:vs]]
        elsif m[:sym]
          sym = m[:sym]
          ## return symbols "inline" as is - why? why not?
          ## (?<sym>[;,@|\[\]-])
 
          case sym
          when '@'    ##  enter geo mode
            puts "  ENTER GEO_RE MODE"  if debug?
            @re = GEO_RE
            geo_count = 0
            [:'@']
          when ',' then [:',']
          when ';' then [:';']
          when '/' then [:'/']
          when '|' then [:'|']
          when '[' then [:'[']
          when ']' then [:']']
          when '-' then [:'-']    
          when '('    ## enter goal scorer mode on "free-floating" open paranthesis!!!
             puts "  ENTER GOAL_RE MODE"   if debug?
             @re = GOAL_RE
              ## note - eat-up ( for now; do NOT pass along as token
              ##       pass along "virutal" INLINE GOALS - why? why not?
              [:INLINE_GOALS, "<|INLINE_GOALS|>"]
          when ')' then [:')']
          else
            puts "!!! TOKENIZE ERROR (sym) - ignore sym >#{sym}<"
            nil  ## ignore others (e.g. brackets [])
          end
        elsif m[:any]
           ## todo/check log error
           msg = "parse error (tokenize) - skipping any match>#{m[:any]}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
           puts "!! WARN - #{msg}"

           errors << msg
           log( "!! WARN - #{msg}" )
     
           nil   
        else
          ## report error
           puts "!!! TOKENIZE ERROR - no match found"
           nil 
        end
      end


    tokens << t    if t

#    if debug?
#      print ">"
#      print "*" * pos
#      puts "#{line[pos..-1]}<"
#    end
  end

  ## check if no match in end of string
  if offsets[1] != line.size
    msg =  "!! WARN - parse error - skipping >#{line[offsets[1]..-1]}< @#{offsets[1]},#{line.size} in line >#{line}<"
    puts msg
    log( msg )

    errors << "parse error (tokenize) - skipping >#{line[offsets[1]..-1]}< @#{offsets[1]},#{line.size} in line >#{line}<"
  end


  # if @re == GOAL_RE   ### ALWAYS switch back to top level mode
  #   puts "  LEAVE GOAL_RE MODE, BACK TO TOP_LEVEL/RE"  if debug?
  #   @re = RE 
  # end
 
   if @re == GEO_RE   ### ALWAYS switch back to top level mode
     puts "  LEAVE GEO_RE MODE, BACK TO TOP_LEVEL/RE"  if debug?
     @re = RE 
   end
 
   @re = RE  if @re == GROUP_DEF_RE   ### ALWAYS switch back to top level mode
   @re = RE  if @re == ROUND_DEF_RE

   ##
   ## if in prop mode continue if   last token is [,-]
   ##        otherwise change back to "standard" mode
   if @re == PROP_RE            || @re == PROP_CARDS_RE ||
      @re == PROP_PENALTIES_RE ||
      @re == PROP_ATTENDANCE_RE || @re == PROP_REFEREE_RE
     if [:',', :'-', :';'].include?( tokens[-1][0] )
        ## continue/stay in PROP_RE mode
        ##  todo/check - auto-add PROP_CONT token or such
        ##                to help parser with possible NEWLINE
        ##                  conflicts  - why? why not?
     else
        ## switch back to top-level mode!!
        puts "  LEAVE PROP_RE MODE, BACK TO TOP_LEVEL/RE"  if debug?
        @re = RE 
        ## note - auto-add PROP_END (<PROP_END>)
        tokens << [:PROP_END, "<|PROP_END|>"]    
     end
   end

  
  [tokens,errors]
end

#_tokenize_tty_line(line) ⇒ Object



59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# File 'lib/sportdb/parser/lexer_tty.rb', line 59

def _tokenize_tty_line( line )
   ##
   ## Tokenize one teletype (tty) line using TTY_RE.
   ##
   ##  - space matches (m[:spaces] / m[:space]) are skipped
   ##  - m[:text]  becomes  [:TTY_TEXT, <string>]
   ##  - m[:num]   becomes  [:TTY_NUM,  <integer>]
   ##
   ## Anything that TTY_RE steps over (a gap before a match or an
   ##  unmatched tail) is reported via puts & log but otherwise ignored.
   ##
   ## returns the array of tokens (no errors list)
   text   = line.strip
   result = []

   cursor     = 0   ## where the next match is expected to start
   last_end   = 0   ## end offset of the most recent match (0 if none)

   loop do
     m = TTY_RE.match( text, cursor )
     break if m.nil?

     match_start = m.begin(0)
     last_end    = m.end(0)

     ## match does NOT start where the previous one ended - report the gap
     unless match_start == cursor
       msg =  "!! WARN - tokenize (tty) error - skipping >#{text[cursor..(match_start-1)]}< @#{match_start},#{last_end} in line >#{text}<"
       puts msg
       log( msg )
     end

     cursor = last_end

     if m[:spaces] || m[:space]
       ## skip spaces - no token
     elsif m[:text]
       result << [:TTY_TEXT, m[:text]]
     elsif m[:num]
       result << [:TTY_NUM, m[:num].to_i(10)]
     else
       ## report error/raise exception
       puts "!!! TTY TOKENIZE ERROR - no match found"
     end
   end

   ## report unmatched rest (if last match did not reach end of line)
   unless last_end == text.size
     msg =  "!! WARN - tokenize (tty) error - skipping >#{text[last_end..-1]}< @#{last_end},#{text.size} in line >#{text}<"
     puts msg
     log( msg )
   end

   result
end

#debug? ⇒ Boolean

Returns:

  • (Boolean)


32
# File 'lib/sportdb/parser/lexer.rb', line 32

## Debug flag check - true only if @debug is exactly true
##   (nil or any other value counts as off).
def debug?
  @debug == true
end

#is_group?(text) ⇒ Boolean

todo/fix - use LangHelper or such

 e.g.     class Lexer
              include LangHelper
          end

merge back Lang into Lexer - why? why not?

keep “old” access to checking for group, round & friends

for now for compatibility

Returns:

  • (Boolean)


26
# File 'lib/sportdb/parser/lexer.rb', line 26

## Compatibility wrapper - hands text to Lang.is_group?
##   and returns its result unchanged.
def is_group?( text )
  Lang.is_group?( text )
end

#is_round?(text) ⇒ Boolean

Returns:

  • (Boolean)


27
# File 'lib/sportdb/parser/lexer.rb', line 27

## Compatibility wrapper - hands text to Lang.is_round?
##   and returns its result unchanged.
def is_round?( text )
  Lang.is_round?( text )
end

#log(msg) ⇒ Object



7
8
9
10
11
12
13
14
# File 'lib/sportdb/parser/lexer.rb', line 7

## Append msg plus a trailing newline to ./logs.txt
##   (file is opened in append mode with utf-8 encoding;
##    path is relative to the current working directory).
##     use ./errors.txt - why? why not?
def log( msg )
   File.open( './logs.txt', 'a:utf-8' ) { |io|
     io.write( msg )
     io.write( "\n" )
   }
end

#tokenize_with_errors ⇒ Object



107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
# File 'lib/sportdb/parser/lexer.rb', line 107

def tokenize_with_errors
  ##
  ## Tokenize the complete text (@txt) and return [tokens, errors].
  ##
  ## Steps:
  ##   1) preprocess the text
  ##        - cr+lf to lf, html comments replaced by two spaces,
  ##          multi-line nota bene & [] blocks joined into a single line
  ##   2) tokenize line-by-line
  ##        - lexer mode is kept in @re (also between tokenize calls)
  ##   3) transform the tokens of every line
  ##        - merge / replace some token sequences to help the parser
  ##   4) flatten all lines into one token stream
  ##        - adds a NEWLINE token after every line except BLANK
  ##
  ## returns
  ##   tokens - flat array of tokens
  ##   errors - array of error messages collected from _tokenize_line

  ####
  ##   flags / modes
  @teletype = false     # use magic comment - tty/teletype: true

  tokens_by_line = []   ## note: add tokens line-by-line (flatten later)
  errors         = []   ## keep a list of errors - why? why not?

  ##  preprocess automagically - why? why not?
  ##   strip lines with comments and empty lines striped / removed
  ##      keep empty lines? why? why not?
  ##      keep leading spaces (indent) - why?
  ##
  ##  note - KEEP empty lines (get turned into BLANK token!!!!)

  ##  "universal" newlines
  ##    replace all windows-style  cr+lf (\r\n) to lf (\n) only
  txt = @txt.gsub( "\r\n", "\n" )

  ###
  ## quick hack for now
  ##   remove  html-style comments <!-- -->
  ##           (incl. multi-line)  with two spaces
  ##       will mess-up lineno tracking!!!
  ##    fix later to have function lineno & colno!!!
  ##
  ##  note - MUST continue with txt (NOT @txt);
  ##           otherwise the newline normalization above gets lost
  txt = txt.gsub( HTML_COMMENT_RE ) do |m|
          puts " [debug] preproc html comment:"
          puts m
          '  '
        end

  ## --- disabled (was =begin/=end) ------------------------------------
  ##
  ##  todo/fix - add a command line switch/option for auto-format fixes !!!
  ##   quick hack - remove later
  ##    auto-convert "old" legacy round markers (»)
  ##   txt = txt.gsub( %r{^ [ ]*
  ##                          »
  ##                        (?= [ ]+)  ## require one trailing space for now!!
  ##                        }ix ) do |_|
  ##                     puts "!! WARN - auto-fix format; replacing old (alternate/legacy) round marker (»)"
  ##                        '▪'
  ##                    end
  ##
  ##  16.00 => 16:00
  ##     todo/check - use space for positive lookbehind & ahead
  ##                      (instead of \b) - why? why not?
  ##  note - check for/exclude 12.12.  date in match
  ##             use negative lookahead
  ##   check for 12.12.94
  ##      use   positive lookbehind   !!!
  ##               must be space, comma or begin-of-line [ ,]|^
  ##    or use negative lookbehind
  ##               must NOT be dot
  ##   txt = txt.gsub(  %r{
  ##                        ## check NEGATIVE lookbehind
  ##                         (?<! [.])  ## do NOT match 12.94 in 12.12.94
  ##                          \b
  ##                        (?<h>\d{1,2})
  ##                           \.
  ##                        (?<m>\d{2})
  ##                          \b
  ##                        (?! [.] )   ## do NOT match 12.12.
  ##                        }ix ) do |_|
  ##                           m = $~   ## is $LAST_MATCH_DATA
  ##                        puts "!! WARN - auto-fix format; replacing old (alternate/legacy) time format #{m[0]}"
  ##                           "#{m[:h]}:#{m[:m]}"   ## '\1:\2'
  ##                        end
  ## --- end disabled ---------------------------------------------------

  ###
  ## add more "native" multi-line comment-styles
  ##  e.g.    #[[ ... ]]  or  #<<< .. >>> or #<< .. >>
  ##                 or such - why? why not?

  txt = txt.gsub( PREPROC_NOTA_BENE_RE ) do |m|
          if m.include?( "\n" )   ## check for newlines (\n) and replace
            puts " [debug] preproc (multi-line) note/nota bene block:"
            puts m
            ## todo/check: replace with two spaces insead of ↵ - why? why not?
            ## NOTE(review): replacement is the empty string here although
            ##   the comments mention ↵ / ⏎ - confirm intended character
            m.gsub( "\n", '' )
          else
            m
          end
        end

  ##
  ## e.g. used in (multi-line) TableNote
  ##  1.SOUTH KOREA   6  5  1  0 22- 1 16  [0-0]
  ##  2.LEBANON       6  3  1  2 11- 8 10  [0-2, 0-0]
  ##  3.Turkmenistan  6  3  0  3  8-11  9  [3-1]
  ##  4.Sri Lanka     6  0  0  6  2-23  0  [0-1]
  ##  -.North Korea   [withdrew after playing 5 matches due to safety concerns in
  ##                   connection with the Covid-19 pandemic; all results annulled]
  ##
  ##  note - no longer used for now
  ##     enclose multi-line notes in []
  ##         removes need for line continuation for now

  ##
  ##   txt = txt.gsub( LINE_CONTINUATION_RE ) do |_|
  ##            puts " [debug] preproc line continuation"
  ##              ## todo/check: replace with two spaces insead of ↵ - why? why not?
  ##               '↵'
  ##         end

  #####
  ## (another) quick hack for now
  ##   turn multi-line note blocks into
  ##             single-line note blocks
  ##             by changing newline (\n) to ⏎ (unicode U+23CE)
  ##              or why not  to ___ ?
  ##
  ##  unicode options for return/arrows:
  ##   -  ↵ (U+21B5): Downwards Arrow With Corner Leftwards.
  ##                This is the most common "carriage return" symbol.
  ##   -  ⏎ (U+23CE): Return Symbol.
  ##               Specifically designated as the keyboard's "Return" key symbol,
  ##                often used in user interfaces.

  txt = txt.gsub( PREPROC_BLOCK_RE ) do |m|
          if m.include?( "\n" )   ## check for newlines (\n) and replace
            puts " [debug] preproc (multi-line) block:"
            puts m
            ## todo/check: replace with two spaces insead of ↵ - why? why not?
            ## NOTE(review): same empty-string replacement as above - confirm
            m.gsub( "\n", '' )
          else
            m
          end
        end

  ####
  ## quick hack - keep re state/mode between tokenize calls!!!
  @re  ||= RE     ## note - switch between RE & INSIDE_RE

  txt.each_line do |line|
    ## line = line.rstrip   ## note - MUST remove/strip trailing newline (spaces optional)!!!
    line = line.strip   ## note - strip leading AND trailing whitespaces
                        ## note - trailing whitespace may incl. \n or \r\n!!!

    ##
    ###
    ##  check for magic comments
    ##     e.g  # teletype: true    or TELETYPE: TRUE
    ##             tty/teletype

    if line.start_with?('#')   ###  skip comments (& check magic comments!!)

      if (m = MAGIC_COMMENT_RE.match(line))
        magic_comment_key   = m[:magic_comment_key].downcase
        magic_comment_value = m[:magic_comment_value].downcase

        ##   turn on teletype mode
        ## e.g.  tty: true  or teletype: true
        if ['tty', 'teletype'].include?( magic_comment_key ) &&
           ['true'].include?( magic_comment_value )
          puts " magic comment - turn on teletype (tty) mode"
          @teletype = true
        end
      end

      next
    end

    line = line.sub( /#.*/, '' ).strip   ###  cut-off end-of line comments too

    ####
    #  support __END__ marker to cut-off input
    break if line.strip == '__END__'

    ##
    ##  first check for tabs
    ##    add error/warn
    ##    for auto-fix - replace tabs with two spaces

    line = line.gsub( "\t" ) do |_|
             ## report error here
             ## todo/add error here
             puts "!! WARN - auto-fix; replacing tab (\\t) with two spaces in line #{line.inspect}"
             "  "   ## replace with two spaces
           end

    ## U+00A0 (160)  -- non-breaking space (unicode)
    line = line.gsub( "\u00A0" ) do |uni|
             ## report error here
             ## todo/add error here
             puts "!! WARN - auto-fix; replacing non-breaking unicode space (#{uni}/#{uni.ord}) w/ ascii space ( /#{" ".ord}) in line #{line.inspect}"
             " "   ## replace with space
           end

    ###
    ## todo/fix - print unicode numbers for [–−]
    ##                different candidates to differentiate and document!!!
    ##   – => U+2013 (8211)     -- En Dash     (unicode)
    ##   − => U+2212 (8722)     -- Minus Sign  (unicode)
    line = line.gsub( /[–−]/ ) do |uni|
             ## report error here
             ## todo/add error here
             puts "!! WARN - auto-fix; replacing unicode dash (#{uni}/#{uni.ord}) w/ ascii dash (-/#{"-".ord}) in line #{line.inspect}"
             '-'   ## replace with ascii dash (-)
           end

    puts "line: >#{line}<"    if debug?

    ######
    ### special case for empty line (aka BLANK)
    if line.empty?
      ## note - blank always resets parser mode to std/top-level!!!
      @re = RE
      tokens_by_line << [[:BLANK, '<|BLANK|>']]
    elsif (m = HEADING_RE.match(line))
      ## note - heading always resets parser mode to std/top-level!!!
      @re = RE
      puts "   HEADING"  if debug?
      ## note - derive heading level from no of (leading) markers
      ##             e.g. = is 1, == is 2, === is 3, etc.
      heading_level = m[:heading_marker].size
      tokens_by_line << [[:"H#{heading_level}", m[:heading]]]
    elsif (m = NOTA_BENE_RE.match(line))
      ## note - nota bene always resets parser mode to std/top-level!!!
      @re = RE
      tokens_by_line << [[:NOTA_BENE, m[:nota_bene]]]
    elsif @re == RE && (m = TABLE_RE.match(line))
      @re = TABLE_MORE_RE  ## switch into table mode
      if m[:table_heading]
        tokens_by_line << [[:TABLE_HEADING, m[:table_heading]]]
      else  ## assume table (line) e.g. m[:table]
        tokens_by_line << [[:TABLE_LINE, line]]
      end
    elsif @re == TABLE_MORE_RE
      ### todo/fix - check if no match and report/add error!!
      ##        for now (ummatched) line gets auto-added as table line!!!
      ##
      ##   note - MUST be followed by blank line (or nota bene/heading)
      ##            to switch back into to top-level!!!!
      ##
      ##   note - m may be nil (no match); guard the lookups
      ##            so the line falls through to TABLE_LINE
      m = TABLE_MORE_RE.match(line)
      if m && m[:table_note]
        tokens_by_line << [[:TABLE_NOTE, m[:table_note]]]
      elsif m && m[:table_divider]
        tokens_by_line << [[:TABLE_DIVIDER, m[:table_divider]]]
      else  ## assume table (line) e.g. m[:table]
        tokens_by_line << [[:TABLE_LINE, line]]
      end
    elsif @re != TABLE_MORE_RE &&  (m = HRULER_RE.match(line))
      ## note - hruler (---)
      ##          will only match if NOT in table mode!!!
      ##   otherwise
      ##      hruler always resets parser mode to std/top-level!!!
      @re = RE
      tokens_by_line << [[:HRULER, '<|HRULER|>']]
    elsif @teletype && (@re == RE && IS_TTY_LINE_RE.match(line))
      ## try experimental TELETYPE (TTY) mode!!!
      ##    note - turn on via magic comment e.g.  tty/teletype: true
      ###
      ###    move inside _tokenize_line - why? why not?

      tokens_by_line << _tokenize_tty_line( line )

      ##   note - dates such as
      ##         APR 11 or 11 APR   will trigger TELETYPE
      ###    ## check letter
    else
      more_tokens, more_errors = _tokenize_line( line )

      tokens_by_line  << more_tokens
      errors          += more_errors
    end
  end # each line


  tokens_by_line = tokens_by_line.map do |tokens|
    #################
    ##    transform tokens (using simple patterns)
    ##      to help along the (racc look ahead 1 - LA1) parser
    nodes = []

    buf = Tokens.new( tokens )
    ## pp buf

    loop do
      break if buf.eos?

      if buf.match?( :DATE, :TIME )   ## merge DATE TIME into DATETIME
        date = buf.next[1]
        time = buf.next[1]
        ## puts "DATETIME:"
        ## pp date, time
        ##  note:  time value is { time: {} } or
        ##                       { time: {}, time_local {} }
        val =  [date[0] + ' ' + time[0],  ## concat string of two tokens
                 { date: date[1] }.merge( time[1] )
               ]
        nodes << [:DATETIME, val]
      ### support  date time with comma too - why? why not?
      elsif buf.match?( :DATE, :',', :TIME )
        date = buf.next[1]
        _    = buf.next  ## ignore comma
        time = buf.next[1]
        ## puts "DATETIME:"
        ## pp date, time
        val =  [date[0] + ', ' + time[0],  ## concat string of two tokens
                 { date: date[1] }.merge( time[1] )
               ]
        nodes << [:DATETIME, val]
      elsif buf.match?( :TEAM, :SCORE_TEAM )
        ## merge TEAM SCORE_TEAM into TEAMALT
        ##     (use TEAMENTRY or TEAMRESULT - why? why not?)
        team       = buf.next[1]
        score_team = buf.next[1]
        val =  [team + ' ' + score_team[0],  ## concat string of two tokens
                 { team: team }.merge( score_team[1] )
               ]
        nodes << [:TEAMALT, val]
      elsif buf.match?( :TEAM, :SCORE_TEAM_PEN )
        team           = buf.next[1]
        score_team_pen = buf.next[1]
        val =  [team + ' ' + score_team_pen[0],  ## concat string of two tokens
                 { team: team }.merge( score_team_pen[1] )
               ]
        nodes << [:TEAMALT_PEN, val]
      elsif buf.match?( :TEAM, :SCORE_TEAM_NUM )
        team           = buf.next[1]
        score_team_num = buf.next[1]
        val =  [team + ' ' + score_team_num[0],  ## concat string of two tokens
                 { team: team }.merge( score_team_num[1] )
               ]
        nodes << [:TEAMALT_NUM, val]
      elsif buf.match?( :GOAL_MINUTE, :',', :GOAL_MINUTE )
        ## note - only advance by two tokens!
        ##     allows more :GOAL_MINUTE sequences!! e.g. 12,13,14 etc!!!
        ##
        ## help parser with comma shift/reduce conflict
        ##   change ',' to GOAL_MINUTE_SEP !!!
        nodes << buf.next   ## pass through goal_minute
        _ = buf.next  ## eat-up goal_minute_sep a.k.a. comma (,)
                      ##   and replace with dedicated sep(arator)
        nodes << [:GOAL_MINUTE_SEP,"<|GOAL_MINUTE_SEP|>"]
      elsif buf.match?( :',', :INLINE_ATTENDANCE )
        ## note  - allow optional comma before inline attendance
        ## help parser with comma shift/reduce conflict
        ##   change ',' to INLINE_ATTENDANCE_SEP !!!
        nodes << [:INLINE_ATTENDANCE_SEP, "<|INLINE_ATTENDANCE_SEP|>"]
        _ = buf.next  ## eat-up inline_attendance_sep a.k.a. comma (,)
                      ##   and replace with dedicated sep(arator)
        nodes << buf.next   ## pass through inline_attendance
      else
        ## pass through
        nodes << buf.next
      end
    end  # loop
    nodes
  end  # map tokens_by_line


  ## flatten tokens
  tokens = []
  tokens_by_line.each do |tok|

    if debug?
      pp tok
    end

    ###############
    ##   "hacky" (automagic) line merges (remove newline)
    ## if line start with @  - check if incl. teams

    ###
    ### quick merge lines hack
    ##    if line starts with geo-marker token @
    ##            check if line incl. TEAM
    ##           if yes, leave alone
    ##            otherwise  merge line into previous line!!
    ##       - todo/fix - handle in possibly in grammar!!!
    ##        for now match_line CAN start with @ London
    ##                 resulting in parser conflict(s)!!!
    ##    e.g.
    ##       England v Scotland
    ##          @ London
    ##          =>
    ##        England v Scotland @ London
    ##

    ##
    ##  note/todo - if INDENT / SPACES get added
    ##                adjust here
    ##   tok[0][0] == :INDENT  (or :SPACES) &&
    ##   tok[1][0] == :'@'

    if tok[0] && tok[0][0] == :'@'
      team =  tok.find { |t| t[0] == :TEAM }
      if team
        ## do nothing - keep as is (assume match_line starting w/ @)
      else
        ## no team(s) found in line
        ##    remove last token (that is, NEWLINE)
        ##   note - possibly is blank ?!  keep blank
        ##   note - tokens is still empty if this is the very first line;
        ##            nothing to merge into then
        last = tokens[-1]
        tokens.pop  if last && last[0] == :NEWLINE
      end
    end

    tokens  += tok
    ## auto-add newlines  (unless BLANK!!)
    tokens  << [:NEWLINE, "\n"]   unless tok[0] && tok[0][0] == :BLANK
  end

  [tokens,errors]
end