Class: SportDb::Lexer

Inherits:

Object

Object
SportDb::Lexer

show all

Defined in: lib/sportdb/parser/lexer.rb,
lib/sportdb/parser/token.rb,
lib/sportdb/parser/lexer_tty.rb,
lib/sportdb/parser/token-geo.rb,
lib/sportdb/parser/token-date.rb,
lib/sportdb/parser/token-note.rb,
lib/sportdb/parser/token-prop.rb,
lib/sportdb/parser/token-text.rb,
lib/sportdb/parser/token-time.rb,
lib/sportdb/parser/token-goals.rb,
lib/sportdb/parser/token-group.rb,
lib/sportdb/parser/token-round.rb,
lib/sportdb/parser/token-score.rb,
lib/sportdb/parser/token-table.rb,
lib/sportdb/parser/token-status.rb,
lib/sportdb/parser/token_helpers.rb,
lib/sportdb/parser/token-prop_name.rb,
lib/sportdb/parser/token-score_legs.rb,
lib/sportdb/parser/token-score_fuller.rb,
lib/sportdb/parser/token-date_duration.rb

Constant Summary collapse

HTML_COMMENT_RE =

%r{  <!--
     .*?   ## note - use non-greedy/lazy *? match
  --> 
}xm

PREPROC_BLOCK_RE = note - a [] block may NOT include (nested) square brackets or a comment marker (#). todo/check - rename to NOTE_BLOCK or TEXT_BLOCK or ???

%r{  \[
                      [^\[\]\#]*?  ## note - use non-greedy/lazy *? match
                  \]
}xm

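PREPROC_NOTA_BENE_RE = check for “literal” (multi-line) note blocks, e.g. nb: or note: - note - at least one space is required after the colon (:) - why? why not? The block ends at a blank line or at end-of-file; a trailing hrule (---) is not eaten up.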

%r{
         ^  
    [ ]* (?: nb | note) [ ]* : [ ]+
       .+?  ## non-greedy 
   
    ## positive lookahead
    ##    note - must end with blank line or end-of-file/document 
    ##   note - do NOT eat-up trailing hrule (---)  
      (?=      (?: \n [ ]* -{3,} [ ]*)? 
                   \n[ ]*\n
               | \z 
        )   
}xim

LINE_CONTINUATION_RE = matches an “escaped” newline (backslash, optional spaces, newline) so that it can be replaced with a non-newline char, e.g. ‘↵’

%r{
   \\[ ]* \n
}x

MAGIC_COMMENT_RE = check for magic comments, e.g. `# teletype: true` or `# TELETYPE: TRUE` - the key is tty or teletype, the value is true or false (case-insensitive)

%r{  \A
   [ ]*    ## optional leading spaces
  \#+      ##  note - allow ##,###, etc. too 
   [ ]*    ## optional spaces
     (?<magic_comment_key> tty | teletype )
   [ ]*    ## optional spaces
      :       
   [ ]*    ## optional spaces
      (?<magic_comment_value> true | false )
   [ ]*    ## optional trailing spaces
  \z
}ix

BASICS_RE =

%r{
    (?<vs>
       (?<=[ ])	# positive lookbehind for space
       (?-i: 
           vs\.?|v|VS 
       )        # note - only match case sensitive (downcased letters)!!!
                # note -  bigger match first e.g. vs than v etc.
       (?=[ ])   # positive lookahead for space
    )
       |
    (?<spaces> [ ]{2,}) |
    (?<space>  [ ])
        |
    (?<sym> [,;/@|()\[\]-] )   ### note: add parantheses too e.g () - why? why not?
}ix

ATTENDANCE_RE = add att(endance), e.g. `att: 18000` - may also be used inline in a match line, e.g. `A v B 2-1 att: 18000`

%r{
    (?<attendance>
     \b
        att: [ ]*
         (?<value>
              [1-9]
              (?: _? \d+ )*
         )
     \b
)}ix

INLINE_WO_RE = add support for WO or W-0 too - why? why not?

%r{
    (?<inline_wo>
        \b (?: w/o | W/O ) \b
)}x

INLINE_BYE_RE = note - NOT case insensitive; matches bye or BYE only (e.g. NOT Bye)

%r{ 
   (?<inline_bye>
       \b (?: bye | BYE ) \b
)}x

INLINE_NP_RE = n/p or N/P (not played), e.g. A n/p B (note - basically an inline short form of A v B [cancelled])

%r{
    (?<inline_np>
        \b (?: n/p | N/P ) \b
)}x

INLINE_ABD_RE = abd/abd. or aban/aban. [abandoned] ABD/ABAN

%r{
    (?<inline_abd>
        \b (?: abd\.? |
               aban\.? |
               ABD | ABAN
           ) 
   ## POSITIVE lookahead - requires space
          (?= [ ])
)}x

INLINE_SUSP_RE = susp/susp. [suspended] SUSP

%r{
    (?<inline_susp>
        \b (?: susp\.? |
                SUSP ) 
   ## POSITIVE lookahead - requires space
          (?= [ ])
)}x

INLINE_PPD_RE = ppd/ppd. or pst/pst. or pstp/pstp. or postp/postp. [postponed]; upper-case forms are PPD/PST/PSTP/POSTP/P-P. todo/check - add/allow lower-case p-p too - why? why not?

%r{
    (?<inline_ppd>
        \b (?: ppd\.? |
               pst\.? |
               po?stp\.? |
               PPD | PST | PO?STP | P-P
            ) 
   ## POSITIVE lookahead - requires space
          (?= [ ])
)}x

INLINE_VOID_RE = void via x-x X-X todo/check - only allow X-X - why? why not?

%r{
      (?<inline_void>
          \b (?: x-x |
                 X-X 
             )
        ## POSITIVE lookahead - requires space
           (?= [ ])
)}x

INLINE_AWD_RE = awd/awd. [awarded] or AWD. note - the recommendation is to always include the score; thus, use/prefer SCORE_AWD, e.g. 0-3 awd

%r{
    (?<inline_awd>
        \b (?: awd\.? | AWD ) 
   ## POSITIVE lookahead - requires space
          (?= [ ])
)}x

INLINE_CANC_RE = canc/canc. [cancelled] CANC

%r{
    (?<inline_canc>
        \b (?: canc\.?  | CANC ) 
   ## POSITIVE lookahead - requires space
          (?= [ ])
)}x

TEAM_HOME_RE = home/away/neutral - (h), (a), (n). todo/check - add support for bare h/a/n with (?-i: \b [han] \b), i.e. lower-case only and with \b word boundary - why? why not?

%r{  (?<team_home> \(h\) )}xi

TEAM_AWAY_RE =

%r{  (?<team_away> \(a\)  )}xi

TEAM_NEUTRAL_RE =

%r{  (?<team_neutral> \(n\) )}xi

RE =

## main token pattern - union of all alternatives listed below
##   note - order matters (see the "must go before/after" notes on the entries)
Regexp.union(
                    STATUS_RE,   ## match status e.g. [cancelled], etc.

                    INLINE_WO_RE,    ## (inline) match status - w/o (walkover)
                    INLINE_NP_RE,    ## (inline) match status - n/p (not played)
                    INLINE_BYE_RE,   ## (inline) match status - bye (advance to next round)
                    INLINE_ABD_RE,   ## (inline) match status - abd/abd. or aban/aban. (abandoned)
                    INLINE_SUSP_RE,  ## (inline) match status - susp/susp.  (suspended)
                    INLINE_PPD_RE,   ## (inline) match status - ppd/ppd. or pst/pst. or pstp/pstp. or postp/postp. or P-P (postponed)
                    INLINE_VOID_RE,  ## (inline) match status - x-x (voided) 
                    INLINE_AWD_RE,   ## (inline) match status - awd/awd. (awarded)
                    INLINE_CANC_RE,  ## (inline) match status - canc/canc. (cancelled/canceled)
                   

                    TEAM_HOME_RE,     ## (H)
                    TEAM_AWAY_RE,     ## (A)
                    TEAM_NEUTRAL_RE,  ## (N)

                    NOTE_RE,  ### fix - change to INLINE_NOTE !!!
                    DATE_LEGS_RE,  # note - must go before date!!!
                    DATE_RE,  ## note - date must go before time (e.g. 12.12. vs 12.12)
                     TIME_RE,
                    ATTENDANCE_RE,   # note - allow att: for now inline in matches too - why? why not? 
                    SCORE_LEGS_RE,
                    SCORE_FULL_RE, 
                    SCORE_FULLER_RE,
                    SCORE_FULLER_MORE_RE,
                    SCORE_AWD_RE,   #  (inline) score awarded e.g. 3-0 awd or 0-1 awd. etc.
                    SCORE_ABD_RE,   #  (inline) score abandoned e.g. 2-1 abd.
                    SCORE_RE,   ## note basic score e.g. 1-1 must go after SCORE_FULL_RE!!!
                    
                    ## note - add "experimental" "split" scores for now
                    SCORE_TEAM_RE,   ##  e.g. (2) 1  for "split" scores
                    SCORE_TEAM_PEN_RE,   ##  e.g. 1 (2)  
                    
                    BASICS_RE, 
                   TEXT_RE,
                     ## note - score_team_num (e.g. 0 or 10 etc.)
                     ##            MUST BE after TEXT 
                     ##              only match if nothing else matches (except ANY)
                    SCORE_TEAM_NUM_RE,   ## e.g. 0 or 1 or 9 or 11 etc. (<100)
                   ANY_RE,   ## catch-all - always last
)

START_WITH_ORD = ord (for ordinal number) e.g. (51) or (1) etc. - limit digits of number - why? why not???

%r{
   \A  
    [ ]*    ## ignore leading spaces (if any)
(?<ord>
  \(  
   (?<value>\d+) 
  \)
)}ix

START_WITH_YEAR = e.g. 1930, 1986, 2002, 2010, 2022, 2026. note - only YYYY. note - look out for clubs like 1860 München (de), 1899 Hoffenheim (de), 1896 Löwenherz (ch - a.k.a. FC Winterthur ??) - any others starting with YYYY ?! note - YEAR requires TWO (trailing) spaces !!!!! e.g. `1930  Uruguay 4-2 Argentina` / `1934  Italy 2-1 Czechoslovakia (AET)` / `2022  Argentina 3-3 France (AET, 4-2 pen)`; a year followed by a space and geo marker (@), or a year alone on the line (year header), matches too. do NOT match (iso date!!) - 2020-11-12, 2020/11/12, 2020.11.12 etc.

%r{
   \A
       [ ]*    ## ignore leading spaces (if any)
     (?<year>
        \d{4}
     )
     ## positive lookahead 
       (?= [ ]{2} |   ## min. TWO spaces or 
           [ ]@ |   ##   space with geo marker or
           [ ]* \z  ##    year (date) header (end-of-line/string)
        )   
}x

HEADING_RE =

%r{   \A
    [ ]*  ## ignore leading spaces (if any)
  (?<heading_marker> ={1,6} ) 
    [ ]*
     (?<heading>
        ## must start with letter - why? why not?
        ###   1st round
        ##  allow numbers e.g. Group A - 1 
        [^=]+?   ## use non-greedy 
     )
    [ ]*  ## ignore trailing spaces (if any)
     (?: =* )  ## allow any trailing heading markers
    [ ]*  ## ignore trailing spaces (if any)
  \z
}ix

HRULER_RE =

%r{
                 \A
                           [ ]*  ## ignore leading spaces (if any)
                    -{3,}  ## must be at least three dashes!!!
                           [ ]*  ## ignore trailing spaces (if any)                   
                 \z
}ix

IS_TTY_LINE_RE = experimental teletype mode - only space, A-Z and 0-9 allowed (blank lines are excluded)

%r{  \A  
     ## note - use NEGATIVE lookahead to exclude blank lines
       (?! [ ]*\z)

        [A-Z0-9 ]+
    \z
}x

TTY_SPACES_RE =

%r{ (?<spaces> [ ]{2,}) |
  (?<space>  [ ])
}x

TTY_NUM_RE =

%r{   \b  (?<num> \d+ ) \b 
}x

TTY_TEXT_RE = note - TEXT for now allows A, 1A, A1, A1A, A1 B1 C1, A1AA1 2B22 3C33 - text segments are concatenated by a single space only; a segment MUST NOT be all digits, e.g. 1, 11, etc.

%r{   \b (?<text>                         
         (?:
            [A-Z]  ## MUST start with letter  
              |
             [0-9]+[A-Z]   ## or numbers followed by letter 
           )
           [0-9A-Z]*
           (?:
               ### allow move segements separated
               ##     by single space
                [ ]
               (?: 
                   [A-Z]  ## MUST start with letter  
                    |
                   [0-9]+[A-Z]   ## or numbers followed by letter 
                )
               [0-9A-Z]*
           )*
        )
        \b   
}x

TTY_RE =

Regexp.union(
                TTY_SPACES_RE,
                TTY_TEXT_RE,
                TTY_NUM_RE,
                ##  fix add ANY_RE,  
)

GEO_TEXT_RE =

%r{
    ## must start with alpha (allow unicode letters!!)
    (?<text>
          ## positive lookbehind -  for now space (or beginning of line - for testing) only
           ##  (MUST be fixed number of chars - no quantifier e.g. +? etc.)
            (?<= [ ,›>\[\]]|^)
            (?:
                # opt 1 - start with alpha
                 \p{L}+    ## all unicode letters (e.g. [a-z])
                   |
                # opt 2 - start with num!! - 
                     \d+  # check for num lookahead (MUST be space or dot)
                      ## MAY be followed by (optional space) !
                      ## MUST be follow by a to z!!!!
                      [ ]?   ## make space optional too  - why? why not?
                             ##  yes - eg. 1st, 2nd, 5th etc.
                       \p{L}+
                  |
                ## opt 3 - add another weirdo case
                ##   e.g.   's Gravenwezel-Schilde
                ##   add more letters (or sequences here - why? why not?)
                    '\p{L}+
               )

               ##
               ## todo/check - find a different "more intuitive" regex/rule if possible?
               ##    for single spaces only (and _/ MUST not be surround by spaces) 

              (?: 
                  (?:
                    [ ]?   # only single (inline) space allowed - double spaces are breaks!!!  
                    (?:
                       \p{L} | \d  | [.&'°]
                        |
                       (?: (?<! [ ])  ## no space allowed before (but possible after)
                            [-]
                       )
                         |
                       (?: (?<! [ ])  ## no spaces allowed around these characters
                           [_/]
                          (?! [ ])
                       )
                    )+
                  )
                  |
              ## for now allow auto-add optional
              ##   parenthesis enclosed closed text
              ##   e.g. Dublin (Dalymount Park)
              ##        Bucuresti (23 August)
              ##        Paris (Parc des Princes)
              ##        Ost-Berlin (Walter-Ulbricht)
              ##        Athinai (OAKA - Maroussi)
              ##
              ##   or   Valencia (Spain) or Solna   
              (?:
                    [ ]
                    \(
                        [^()\[\],;:›<>]+    ## todo - add more special chars
                                            ##   maybe list only allowed ones??
                                            ##   make pattern more strict - why? why not?
                    \)
              )
          )*


              ## must NOT end with space or dash(-)
              ##  todo/fix - possible in regex here
              ##     only end in alphanum a-z0-9 (not dot or & ???)

            ## add lookahead/lookbehind
           ##    must be space!!!
           ##   (or comma or  start/end of string)
           ##   kind of \b !!!
            ## POSITIVE lookahead
            (?=[ ,›>\[\]]|$)

   )
}ix

GEO_BASICS_RE =

%r{
    (?<spaces> [ ]{2,}) |
    (?<space>  [ ])
        |
    (?<sym> [,›>\[] )
}ix

GEO_END_RE =

%r{
   (?<geo_end>
        ,
    )
    ## POSITIVE lookahead for props
    (?=    
        [ ]*  ## optional spaces
         (?: att|ref)    ## todo/fix - use generic [a-z]+ - why? why not?
         :
    )
}ix

GEO_RE =

Regexp.union(
                    GEO_END_RE,
                    GEO_BASICS_RE, 
                    GEO_TEXT_RE,
                    ANY_RE,
)

MONTH_LINES =

parse_names( <<TXT )
January    Jan
February   Feb
March      Mar
April      Apr
May
June       Jun
July       Jul
August     Aug
September  Sept  Sep
October    Oct
November   Nov
December   Dec
TXT

MONTH_NAMES =

build_names( MONTH_LINES )

MONTH_MAP = pp MONTH_NAMES

build_map( MONTH_LINES, downcase: true )

DAY_LINES =

parse_names( <<TXT )
Monday                   Mon  Mo
Tuesday            Tues  Tue  Tu
Wednesday                Wed  We
Thursday    Thurs  Thur  Thu  Th
Friday                   Fri  Fr
Saturday                 Sat  Sa
Sunday                   Sun  Su
TXT

DAY_NAMES =

build_names( DAY_LINES )

DAY_MAP = pp DAY_NAMES

build_map( DAY_LINES, downcase: true )

DATE_I_RE = e.g. `Fri Aug 9` / `Fri, Aug 9` / `Fri, Aug 9 2024` / `Fri, Aug 9, 2024` / `Aug 9, 2024` (one or more spaces allowed after the day name). note - eat-up optional comma after DAY_NAMES!! note - `Fri Aug/9` no longer supported!!!

%r{
(?<date>
  \b
     ## optional day name
     ((?<day_name>#{DAY_NAMES})
           (?: ,?[ ]+)
     )?
     (?<month_name>#{MONTH_NAMES})
          [ ] 
     (?<day>\d{1,2})
          \b
     ## optional year
     (      ,? [ ]       ## note - comma optinal with single space required for now
            (?<year>\d{4})        ## optional year 2025 (yyyy)      
              \b
     )?
)}ix

DATE_LEGS_I_RE = todo/fix - add (opt) day_name later; add (opt) year later. e.g. `Aug 9 & Aug 10`. note - allow shortcut (2nd month name is optional), e.g. `Aug 9 & 10`

%r{
(?<date_legs>
 \b
     (?<month_name1>#{MONTH_NAMES})
          [ ] 
     (?<day1>\d{1,2})
    [ ] & [ ]
     (?:
        (?<month_name2>#{MONTH_NAMES})
          [ ] 
      )?  ## note - make 2nd month_name optional 
     (?<day2>\d{1,2})
  \b
)}ix

DATE_II_RE = e.g. 3 June or 10 June. note - allow more spaces between DAY_NAMES and DAY, e.g. `Sun 1 Mar` / `Wed 4 Mar` / `Sat 14 Mar` / `Sat 11 Apr` / `Sat 11 Apr 2021` / `Sat 11 Apr 21` / `Sat, 11 Apr`. note - eat-up optional comma after DAY_NAMES!! note - for `Sat 14 Mar 17:30` the two-digit year is guarded by a NEGATIVE lookahead, so that the hour of a following time (17:30) is NOT taken for a year!!!

%r{
(?<date>
  \b
     ## optional day name
     ((?<day_name>#{DAY_NAMES})
           (?: ,?[ ]+)
     )?
     (?<day>\d{1,2})
         [ ]
     (?<month_name>#{MONTH_NAMES})
          \b
     ## optional year
     (  [ ]
        (?: 
           (?<year>\d{4})        ## optional year 2025 (yyyy)
               |
            (?:
               (?<yy>\d{2})           ## optional year 25 (yy)
                ## check NEGATIVE lookahead
               (?! :|[:h]\d{2})
            )
        )
        \b   
     )?
)}ix

DATE_III_A_RE = e.g. iso-date - 2011-08-25 note - allow/support ("shortcuts") e.g 2011-8-25 or 2011-8-3 / 2011-08-03 etc.

%r{
(?<date>
  \b
   (?<year>\d{4})
       -
   (?<month>\d{1,2})
       -
   (?<day>\d{1,2})
  \b
)}ix

DATE_III_B_RE = starting w/ day/month/year e.g. 25-08-2011

%r{
(?<date>
  \b
     ## optional day name
     ((?<day_name>#{DAY_NAMES})
          (?: ,?[ ]+)
     )?
   (?<day>\d{1,2})
       -
   (?<month>\d{1,2})
       -
   (?<year>\d{4})
  \b
)}ix

DATE_IIII_RE = allow (short) “european” style, e.g. 8.8. - with optional year (yyyy or yy) after the second dot. note - assume day/month!!!

%r{
(?<date>
  \b
     ## optional day name
     ((?<day_name>#{DAY_NAMES})
           (?: ,?[ ]+) 
     )?
   (?<day>\d{1,2})
       \.
   (?<month>\d{1,2})
       \.
   (?: (?: 
          (?<year>\d{4})        ## optional year 2025 (yyyy)
              |
          (?<yy>\d{2})           ## optional year 25 (yy)
       )
        \b
   )?
)
}ix

DATE_IIIII_RE = e.g. 04/03/2026 or 4/3/2026, 04/03/26 or 4/3/26, 04/03 or 4/3. note - captured as day/month(/year)

%r{
(?<date>
  \b
     ## optional day name
     ((?<day_name>#{DAY_NAMES})
          (?: ,?[ ]+)
     )?
   (?<day>\d{1,2})
       /
   (?<month>\d{1,2})
    \b
   (?:  
        /
       (?: 
          (?<year>\d{4})         ## optional year 2025 (yyyy)
              |
          (?<yy>\d{2})           ## optional year 25 (yy)
       )
      \b
   )?
)
}ix

DATE_RE = map tables note: order matters; first come-first matched/served

## union of all supported date formats - order matters; first come-first matched
##   note - DATE_III_B_RE, DATE_IIII_RE and DATE_IIIII_RE capture
##            the day first, then the month (and the optional year last)
Regexp.union(
   DATE_I_RE,        ## e.g. Fri, Aug 9 2024
   DATE_II_RE,       ## e.g. Sat 11 Apr 2021
   DATE_III_A_RE,    ## e.g. 1973-08-14
   DATE_III_B_RE,    ## e.g. 14-08-1973
   DATE_IIII_RE,    ## e.g. 8.8. or 13.8.79 or 14.08.1973 
   DATE_IIIII_RE,   ## e.g.  14/08/1973
)

DATE_LEGS_RE = todo - add more format style here; change to Regexp.union later!!!

DATE_LEGS_I_RE

NOTE_RE = todo/fix - use (?<text>) - text capture for the inner text!! and use (?<note>) for the complete match as a convention!!

%r{
\[ 
  (?<note>
     [^\[\]\#]*?    ## note - non-greedy/lazy operator
                    ##    exclude comments inside note block - why? why not?
  )
\]
}xi

NOTA_BENE_RE = check for “literal” (multi-line) note blocks, e.g. nb: or note: - note - at least one space is required after the colon (:) - why? why not? note - use \A (instead of ^) - \A strictly matches the start of the string.

%r{   \A
    [ ]*  ## ignore leading spaces (if any)
 (?: nb | note) [ ]* : [ ]+   
  (?<nota_bene>
       .+?  ## use non-greedy 
   )
    [ ]*  ## ignore trailing spaces (if any) 
   \z
}xi

PROP_KEY_RE = todo/fix - change ^ to \A; change name to START_WITH_PROP_KEY_RE !!!

%r{ 
   ^     # note - MUST start line; leading spaces optional (eat-up)
   [ ]*  
(?<prop_key>
  (?<key>
      (?:\p{L}+
          |
          \d+  # check for num lookahead (MUST be space or dot)
       ## MUST be followed by (optional dot) and
       ##                      required space !!!
       ## MUST be follow by a to z!!!!
        \.?     ## optional dot
        [ ]?   ## make space optional too  - why? why not?
            ##  yes - eg. 1st, 2nd, 5th etc.
        \p{L}+
       )
       [\d\p{L}'/° -]*?   ## allow almost anyting 
                         ## fix - add negative lookahead 
                         ##         no space and dash etc.
                         ##    only allowed "inline" not at the end
                         ## must end with latter or digit!
  )
   [ ]*?     # slurp trailing spaces
    :
   (?=[ ]+)  ## possitive lookahead (must be followed by space!!)
  )
}ix

INLINE_CAPTAIN = [c] or [C] for marking a player as captain. todo/check - support [y ] too - or require Y - why? why not?

%r{ (?<inline_captain>
    \[ [cC] \]
)}x

INLINE_YELLOW =

%r{ (?<inline_yellow>
     \[ [yY]
         ## optional minute
         (?: [ ]+
           (?<minute> \d{1,3})
              '?
           (?:
              \+
              (?<offset>\d{1,2})
               '?
           )? 
         )? 
     \]
)}x

INLINE_RED =

%r{ (?<inline_red>
     \[ [rR] 
         ## optional minute
         (?: [ ]+
           (?<minute> \d{1,3})
              '?
           (?:
              \+
              (?<offset>\d{1,2})
               '?
           )? 
         )? 
     \]
)}x

INLINE_YELLOW_RED =

%r{ (?<inline_yellow_red>
     \[ (?:y/r |
           Y/R  ) 
         ## optional minute
         (?: [ ]+
           (?<minute> \d{1,3})
              '?
           (?:
              \+
              (?<offset>\d{1,2})
               '?
           )? 
         )? 
     \]
)}x

PROP_KEY_INLINE_RE = simple prop key for inline use e.g. Coach: or Trainer: or ... add more here later

%r{ 
   \b  
(?<prop_key>    ## note: use prop_key (NOT prop_key_inline or such)
  (?<key>
      \p{L}+
  )
   ## note - NO spaces allowed for key for now!!! 
    :
   (?=[ ]+)  ## possitive lookahead (must be followed by space!!)
  )
}ix

PROP_NUM_RE =

%r{
 \b
  (?<num>
        ## note allow underscore inline or space e.g.
        ##  5_000
        ##  allow space inline (e.g. 5 000) - why? why not?
      (?<value> [1-9]
                (?: _? 
                    [0-9]+
                 )* 
      )
  )
 \b
}ix

ENCLOSED_NAME_RE = todo/fix - allow more chars in enclosed name - why? why not? e.g. apostrophe (') - Cote d'Ivoire etc. todo/check - change name to PAREN_NAME or PARENTHESIS or such - why? why not?

%r{ 
        (?<enclosed_name>  
           \( 
          (?<name>   
              \p{L}+
              (?:
                 [ ] 
                   \p{L}+ 
              )*
          )
            \)
        )
}ix

PROP_BASICS_RE =

%r{
    (?<spaces> [ ]{2,}) |
    (?<space>  [ ])
        |
    (?<sym>  
        [;,\(\)\[\]-] 
    )   
}ix

PROP_RE =

Regexp.union(
   MINUTE_RE,   ## e.g.  44 or 44' or 45+1 or 45+1' etc.

   INLINE_CAPTAIN,  ## e.g. [c]
   INLINE_YELLOW,   ## e.g. [Y] or [Y 44] or [Y 44'] or [Y 45+1']
   INLINE_YELLOW_RED,  ## e.g. [Y/R] or [Y/R 78]
   INLINE_RED,         ## e.g. [R] or [R 42] or [R 42']

   PROP_KEY_INLINE_RE,   
   PROP_NAME_RE,
   PROP_BASICS_RE, 
   ## todo/fix - add ANY_RE here too!!!
)

PROP_CARDS_RE = note - no inline keys possible. todo/fix - use custom (limited) prop basics too

Regexp.union(
   MINUTE_RE,
   PROP_NAME_RE,
   PROP_BASICS_RE, 
   ## todo/fix - add ANY_RE here too!!!
)

PROP_PENALTIES_RE =

Regexp.union(
   SCORE_RE,               # e.g. 1-1 etc.
   ENCLOSED_NAME_RE,       # e.g. (save), (post), etc.
   PROP_NAME_RE,
   PROP_BASICS_RE, 
   ## todo/fix - add ANY_RE here too!!!
)

PROP_REFEREE_RE =

## token pattern used for the referee prop (going by the constant name)
Regexp.union(
   ENCLOSED_NAME_RE,       # e.g. (sold out) etc.  why? why not?
   PROP_NUM_RE,                 # e.g. 28000 or 28_000  (note - 28 000 or 28,000 do NOT match as one number!!!)
   PROP_KEY_INLINE_RE,
   PROP_NAME_RE,
   PROP_BASICS_RE, 
   ## todo/fix - add ANY_RE here too!!!
)

PROP_ATTENDANCE_RE =

## token pattern used for the attendance prop (going by the constant name)
Regexp.union(
   ENCLOSED_NAME_RE,       # e.g. (sold out) etc.  why? why not?
   PROP_NUM_RE,                 # e.g. 28000 or 28_000  (note - 28 000 or 28,000 do NOT match as one number!!!)
   PROP_BASICS_RE, 
   ## todo/fix - add ANY_RE here too!!!
)

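ANY_RE = general catch-all - matches any single character. RECOMMENDED to (ALWAYS) use as the last entry in a union, so that every position yields a match (presumably to avoid the scan position advancing past unmatched input - TODO confirm)!!!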

%r{
     (?<any> .)
}ix

TEXT_RE =

%r{
    ## must start with alpha (allow unicode letters!!)
    (?<text>
           ## positive lookbehind
           ##  (MUST be fixed number of chars - no quantifier e.g. +? etc.)
            (?<=[ ,;@|\[\]]
                 |^
            )
            (?:
                # opt 1 - start with alpha
                 \p{L}+    ## all unicode letters (e.g. [a-z])
                   |

                # opt 2 - start with num!! - allow special case (e.g. 1. FC)
                     \d+  # check for num lookahead (MUST be space or dot)
                      ## MUST be followed by (optional dot) and
                      ##                      required space !!!
                      ## MUST be follow by a to z!!!!
                      [.°]?     ## optional dot (.) or degree(°) - todo - add number sign too!! 
                      [ ]?   ## make space optional too  - why? why not?
                             ##  yes - eg. 1st, 2nd, 5th etc.
                       \p{L}+
                  |
                ## opt 3 - add another weirdo case
                ##   e.g.   's Gravenwezel-Schilde
                    '[s] [ ] \p{L}+
               )


              (?:(?:  (?:[ ]   # only single spaces allowed inline!!!
                          ## note - exclude (v[ ]/vs[ ]/vs.[ ])
                          ##    AND switch to case-sensitive (via -i!!!)
                        (?! (?-i: (?:  ## note - (big) V not matching for versus!!!
                                      vs\.?|v|VS|   
                                         
                                      n/p|N/P|  
                                      w/o|W/O| 
                                      abd\.?|ABD|
                                      aban\.?|ABAN|
                                      susp\.?|SUSP|
                                      ppd\.?|PPD|
                                      pst\.?|PST|
                                      po?stp\.?|PO?STP|P-P|
                                      x-x|X-X|
                                      awd\.?|AWD|
                                      canc\.?|CANC ) [ ] 
                                        |
                                  (?: bye|BYE ) (?:[ ]|$))
                          )    
                      )
                      |     
                     [/-]   ## must NOT be surrounded by spaces 
                  )?
                (?:
                  \p{L} 
                     |
                  (?:   ## note - restrict [.&'] to single char usage (no doubled e.g. && etc.)
                    \. (?! \.)  ## allow single points only (now two or more etc.)
                     | 
                    & (?! &)
                     |
                    ' (?! ')
                   )
                     |
                 (?:
                   \d+
                   (?!
                     [0-9h'+] |    ## protected break on 12h / 12' / 1-1
                                    ##  check usege for 3+4 - possible? where ? why?     
                     (?:[.:-]\d)     ## protected/exclude/break on 12.03 / 12:03 / 12-12
                                      ##  BUT allow Park21-Arena for example e.g. 21-A :-)
                    )
                    [°]?  ## followed by optional ord                 
                   ## negative lookahead for numbers
                   ##   note - include digits itself!!!
                   ##   note - remove / (slash) e.g. allows UDI'19/Beter Bed
                 )
               )
              )*  ## must NOT end with space or dash(-)
              ##  todo/fix - possible in regex here
              ##     only end in alphanum a-z0-9 (not dot or & ???)


            ## allow optional at the end
            ##  tag or year
            ##   make it and in the future - why? why not?
            ##
            ## change - fix
            ##   do NOT use (A) for amateur
            ##   use A or A. with NO ()!!!
            ## (A) -    allow with predined  alpha only for now
            ##          e.g. (A) - amateur a team or b?
            ###  same for U21 or U9 etc
            ##        use with NO ()!!! - why? why not?
            ##      or U21 U9 etc.   - why? why not?
            ##       or etc.
            ## (1879-1893) or allow years e.g. (1879-1893)
            ###
            ##    add allow country code three to five letters for now
            ##       change to generic 1 to 5 - why? why not?
            ##     e.g. (A), (I),
            ##          (AUT)
            ##          (TRNC)   five? for UEFA code for northern cyprus
            ##     change to 1 to 4 - why? why not?
            ##   check - fix possible for upper case only here
            ##                     inline for this group only?
            (?:
               [ ]
               \(
                  \d{4}-\d{4}
               \)
            )?
             (?:
                ######
                # check for country code (cc)
                #       e.g. (AUT) or ,AUT or AUT
                (?:
               [ ]   ## note - do NOT allow more than one space!!! - why? why not?
                   \( 
                       ## note - auto-exclude reserved (aet)  from SCORE_FULLER_MORE!!!
                       ##     plus golden goal (gg)/sudden death (sd), silver goal (sg)
                       ##    (ht), (ft)  
                       (?! (?: aet | agget | asdet | asget | ht | ft )
                             \)
                       )    
                     (?:
                       [A-Z]{1,5}   
                     )
                  \)
                )
                  |
                (?:
                    [ ]*[,›>][ ]*
                        [A-Z]{1,5}
                     \b
                )
             )?
            ## add lookahead/lookbehind
           ##    must be space!!!
           ##   (or comma or  start/end of string)
           ##   kind of \b !!!
            ## positive lookahead
            (?=[ ,;@|\[\]]
                 |$
            )
   )
}ix

TIME_RE =

%r{
        \b
    (?<time>  
             (?<hour>\d{1,2})
                   [:h] 
              (?<minute>\d{2})
                 
                 #### optional (inline) timezone
                 ##    note - non-utc timezone MUST be hard-coded (added) here!!!
                 ##     avoids eating-up team names (separated by one space)
                 ##            e.g.  18:30 MEX v MEX 
                 (?:
                    [ ]  ## require space - why? why not
                     (?<timezone>
                        (?: 
                          ## GMT   - Greenwich Mean Time
                          ## BST   - British Summer Time
                          ## CES?T - Central European (Summer) Time
                          ## EES?T - Eastern European (Summer) Time
                          ##
                          (?: GMT|BST|CES?T|EES?T) 
                               (?: /
                                   UTC  (?: [+-]\d{1,4} | ±0)
                               )?
                          )
                          |
                          (?:
                             UTC  (?: [+-]\d{1,4} | ±0)
                          )
                     )
                 )?
        )          
      \b  

####
###  note - local time is now INLINE and MUST follow time
       (?:     
           [ ]+   ## todo/check - make space optional - why? why not?
           \(
        (?<time_local>   
                (?<local_hour>\d{1,2})
                   [:h]    ### todo/fix - MUST match style in time above!!!
                (?<local_minute>\d{2})
                
                ####
                ## optional "local" timezone name eg. BRT or CEST etc.
                (?:
                    [ ] ## require space - why? why not
                   (?<local_timezone>
                      (?:  [A-Z]{3,4}
                           (?: /
                                   UTC (?: [+-]\d{1,4} | ±0)
                           )? 
                      )
                      |    
                      (?:     ## e.g. 0 or 00 or 0000
                          UTC   (?: [+-]\d{1,4} | ±0)
                      )   
                  )
               )?  # note - make timezone  optional!!!
          )
      \)       
       )?
}ix

GOAL_BASICS_RE =

%r{
    (?<spaces> [ ]{2,}) |
    (?<space>  [ ])
        |
    (?<sym>  
        [;,)]   ##  add (-) dash too - why? why not?   
    )   
}ix

START_GOAL_LINE_RE = note - assume lines starting with opening ( are goal lines!!!! note - use \A (instead of ^) - \A strictly matches the start of the string. note - the NEGATIVE lookahead excludes (a), (h), (n) - TEAM_AWAY, TEAM_HOME, TEAM_NEUTRAL tokens. note - ord (numbers) e.g. (1), (42), etc. must be excluded too!!! - this is NOT done by the pattern itself (presumably checked separately, see START_WITH_ORD - TODO confirm)

%r{
                    \A                        
                       [ ]*    ## ignore leading spaces (if any) 
                      \(

                      # check NEGATIVE lookahead
                      (?! 
                            ##  exclude (a), (h), (n)
                            ##    TEAM_AWAY, TEAM_HOME, TEAM_NEUTRAL
                            (?: a|h|n )  
                            \)  
                       )

}xi

# Detects the start of a goal line in "compat" style: optional leading
# spaces, an opening parenthesis and then (checked via lookahead only,
# nothing after the parenthesis is consumed) a minute such as 45, 45' or 90+2'.
# A leading score such as 1-1 directly after the parenthesis is rejected.
#
# NOTE(review): the score exclusion only covers single-digit scores (\d-\d);
#   an opening like "(10-1" is not excluded and still passes the minute
#   lookahead - confirm whether that is intended.
START_GOAL_LINE_COMPAT_RE =

%r{
                   \A
                        [ ]*    ## ignore leading spaces (if any) 
                      \(  
                      
                      ## (i) check NEGATIVE lookahead
                      ##    exclude score e.g. 1-1 etc.        
                          (?! [ ]* \b \d-\d \b)

                      ## (ii) check POSITIVE lookahead                                    
                          (?= [ ]*
                               \d{1,3}
                                   '?    ## optional minute marker
                                  (?: \+
                                      \d{1,2}   
                                    '?    ## optional minute marker
                                  )?     
                            )    
}xi

START_GOAL_LINE_ALT_RE = check for goal line (alternate syntax) (1-0 Player, 1-1 Player, ...) must start-off OR yes, include score note - allow "centered" style e.g. ( Player 44' (p) 1-0 1-1 Player 64' )

%r{
    \A
       [ ]*    ## ignore leading spaces (if any) 
     \(  
     
     # check POSITIVE lookahead                                    
      (?=  .*?         ## note - non-greedy 
               \b \d-\d \b    ## score e.g. 0-1 
        )	 	
}xi

GOAL_NONE_RE = e.g. (-; Metzger)

%r{ (?<goals_none>
       -[ ]*;
   )
}x

GOAL_SEP_ALT_RE =

%r{
          (?<goal_sep_alt>
 (?<=[ ])   ## positive lookbehind - space required
 -
 (?=[ ]|\z)    ## positive lookahead - speace required
)}x

GOAL_COUNT_RE = e.g. (2) (2/p), (2/pen.), (3/2p), (3/ 2 pen.) -or- (2,1pen), (3, 2 pens) (p), (pen.) (2 pen.), (2p) (og), (o.g.), (2og), (2 o.g.), (2ogs)

%r{
   (?<goal_count>
      \(
        (?:
          ## opt penalties
            (?<pen>
              (?:  (?<pen_value> \d{1,2}) [ ]? )?
                 (?:pens|pen\.?|p)
           )
            |
          ## opt own goals (og)
            (?<og>
             (?: (?<og_value> \d{1,2}) [ ]? )?
                (?:ogs?|o\.g\.|o) 
            )          
            |
          ## opt fallback - classic count/number
          (?:  (?<value> [1-9])
                ## check for option penalties
                (?<pen>
                     [,/] [ ]*
                     (?: (?<pen_value> \d{1,2}) [ ]? )?
                     (?:pens|pen\.?|p)
                )?
           )
         )  
      \)
)}ix

MINUTE_NA_RE = minute variant for N/A not/available todo/check - find a better syntax - why? why not? note "??".to_i(10) returns 0 or "__".to_i(10) returns 0 quick hack - assume 0 for n/a for now

%r{
   (?<minute>
      (?<=[ (])	 # positive lookbehind for space or opening 
        (?<value> \?{2} | _{2} )
        '   ## must have minute marker!!!!
    )
}ix

MINUTE_RE = note - inline \b check in MINUTE_RE excludes 85pen or 90+4pen or 38p (possible and NOT excluded in GOAL_MINUTE_RE !!!) minute with optional stoppage

%r{
     (?<minute>
               \b
             (?<value>\d{1,3})      ## constrain numbers to 0 to 999!!!
                \b
                '?    ## optional minute marker
                
                (?: \+ (?<value2>\d{1,2}) 
                       \b   
                      '?    ## optional minute marker
                 )?
                      
      )
}ix

GOAL_MINUTE_RE = goal types (pen.) or (pen) or (p.) or (p) (o.g.) or (og) todo/check - keep case-insensitive or allow OG or P or PEN or only lower case - why? why not? add (gg) for golden goal - why? why not? add (sg) for silver goal - why? why not??

%r{
     (?<goal_minute>
               \b
             (?<value>\d{1,3})      ## constrain numbers to 0 to 999!!!
                '?    ## optional minute marker
                
                 (?: \+ (?<value2>\d{1,2})
                      '?    ## optional minute marker
                 )?          
                   
        ## note - add goal minute qualifiers here inline!!! 
        (?:
            (?: [ ]? (?<og>   (?: \((?:og|o\.g\.|o)\))   ## allow (og)
                                   |
                              (?: (?:og|o\.g\.|o))      ## allow plain og
                      )
            )
            |
            (?: [ ]? (?<pen>  (?: \((?:pen\.?|p)\))   ## allow ()
                                   |
                              (?: (?:pen\.?|p))
                      )    
            )
            |
            ## add experimental header qualifier
            (?: [ ]? (?<hdr> \( (?:hdr\.?|h ) \) | (?: hdr\.?|h ) ))
            |
            ## add experimental free kick qualifier
            (?: [ ]? (?<fk> \( (?:fk\.?|f ) \) | (?: fk\.?|f) ))
        )?

        ##  add experimental seconds
        ##    e.g. (95 secs) or (95sec) etc. 
        (?: [ ]*  \(
                      (?<secs>\d{1,3})
                         [ ]?secs?
                   \) 
        )?
     )

     ## note - check positive lookahead 
     (?=[ ,;)]|$)   
}ix

# Combined tokenizer pattern for goal lines.
# Note: alternatives are tried in the order listed.
GOAL_RE = Regexp.union([
  GOAL_BASICS_RE,    # space(s) and the separator symbols ; , )
  GOAL_NONE_RE,      # "no goals" marker (dash followed by semicolon)
  GOAL_MINUTE_RE,    # minute with optional stoppage time and inline qualifier
  GOAL_COUNT_RE,     # parenthesized count / penalty / own goal marker
  # MINUTE_NA_RE,    # note - add/allow not/available (n/a,na) minutes hack for now
  # GOAL_OG_RE, GOAL_PEN_RE,
  # SCORE_RE,        # add back in v2 (level 3) or such!!
  PROP_NAME_RE,      # note - (re)use prop name for now for (player) name
  GOAL_SEP_ALT_RE,   # stand-alone dash separator (space before required)
  # todo/fix - add ANY_RE !!!!
])

# Stand-alone goal type qualifier in parentheses (<goal_type>).
# Exactly one of the following named groups is set inside ( ):
#   <og>  - og, o.g. or o
#   <pen> - pen, pen. or p
#   <hdr> - hdr, hdr. or h   (experimental header qualifier)
#   <fk>  - fk, fk. or f     (experimental free kick qualifier)
# Matching is case-insensitive (i flag).
GOAL_TYPE_RE =

%r{
     (?<goal_type>
               \(
                 (?:
                      (?<og>  og|o\.g\.|o )  
                         |
                      (?<pen> pen\.?|p )  
                         |
                     ## add experimental header qualifier
                      (?<hdr>  hdr\.?|h )
                         |
                     ## add experimental free kick qualifier
                       (?<fk>  fk\.?|f )
                  )
                \)
)}xi

# Combined tokenizer pattern for goal lines in the alternate
# (score-sequence) syntax. Alternatives are tried in the order listed.
GOAL_ALT_RE = Regexp.union([
  GOAL_BASICS_RE,    # space(s) and the separator symbols ; , )
  SCORE_RE,          # e.g.  1-0, 0-1, etc.
  GOAL_MINUTE_RE,    # minute with optional stoppage time and inline qualifier
  GOAL_TYPE_RE,      # parenthesized qualifier, e.g. (og) or (pen.)
  PROP_NAME_RE,      # note - (re)use prop name for now for (player) name
  # todo/fix - add ANY_RE !!!!
])

# Combined tokenizer pattern for goal lines in "compat" style; same as
# GOAL_ALT_RE except that the plain MINUTE_RE replaces GOAL_MINUTE_RE.
# Alternatives are tried in the order listed.
GOAL_COMPAT_RE = Regexp.union([
  GOAL_BASICS_RE,    # space(s) and the separator symbols ; , )
  SCORE_RE,          # e.g.  1-0, 0-1, etc.
  MINUTE_RE,         # note - matches minute e.g.  92, 7, 7' 7+3, 46+, etc.
  GOAL_TYPE_RE,      # parenthesized qualifier, e.g. (og) or (pen.)
  PROP_NAME_RE,      # note - (re)use prop name for now for (player) name
  # todo/fix - add ANY_RE !!!!
])

GROUP_DEF_LINE_RE = check for start of group def line e.g. Group A | ... Group 1 : .... Group A2 | .... note - use \A (instead of ^) - \A strictly matches the start of the string.

%r{  \A
   [ ]*  ## ignore leading spaces (if any)
   (?<group_def>
       Group
        [ ]
        [a-z0-9]+   ## todo/check - allow dot (.) too e.g. 1.A etc.- why? why not?         
   )
   ###   possitive lookahead MUST be : OR | 
   (?= [ ]*
       [:|] 
       [ ])  ## note: requires space for now after [:|] - keep - why? why not?	
}ix

# Basic (filler) tokens inside a group definition line. Matches one of:
#   <spaces> - a run of two or more spaces
#   <space>  - exactly one space
#   <sym>    - one of the symbols : | ,
# Used as the first alternative in GROUP_DEF_RE.
GROUP_DEF_BASICS_RE =

%r{
    (?<spaces> [ ]{2,}) |
    (?<space>  [ ])
        |
    (?<sym> [:|,] )    ### note - add comma (,) as optional separator  
}ix

# Combined tokenizer pattern for the body of a group definition line.
# Alternatives are tried in the order listed.
GROUP_DEF_RE = Regexp.union([
  GROUP_DEF_BASICS_RE,   # space(s) and the symbols : | ,
  TEXT_RE,
  ANY_RE,
])

ROUND_OUTLINE_I_RE = note - use \A (instead of ^) - \A strictly matches the start of the string. todo - add support for trailing markers e.g. ▪ Round 1 ▪▪▪▪▪▪▪▪ :: Round 1 :::::::::::: check - allow without space (like in heading =Heading 1=) - why? why not? ▪Round 1▪▪▪▪▪▪▪▪ ::Round 1::::::::::::

%r{   \A
    [ ]*  ## ignore leading spaces (if any)
  (?<round_marker>
        [▪]{1,3}     ## BLACK SMALL SQUARE e.g. ▪,▪▪,▪▪▪
   )     
    [ ]+
     (?<round_outline>
        ## must start with letter - why? why not?
        ###   1st round
        ##  allow numbers e.g. Group A - 1 
        ##   
        ##  note - CANNOT incl. :| !!!
        ##   used for markers for defs/definitions
        [^:|]+?   ## use non-greedy 
     )
     (?:
        [ ]+   
        [▪]+
     )?
     [ ]*  ## ignore trailing spaces (if any) 
   \z
}xi

# Round outline line using colons as marker (the whole line must match):
#   optional leading spaces, <round_marker>, one or more spaces,
#   <round_outline> text, an optional trailing run of two or more colons
#   (preceded by spaces), optional trailing spaces.
#
# <round_marker> is written as ::{1,3}, that is, one colon followed by
# one to three more colons - two to four colons in total.
# <round_outline> may not contain : or | and is matched non-greedy, so
# trailing spaces / trailing colon markers are not part of it.
ROUND_OUTLINE_II_RE =

%r{   \A
    [ ]*  ## ignore leading spaces (if any)
  (?<round_marker>
         ::{1,3}     ## e.g. ::,:::,:::: 
   )     
    [ ]+
     (?<round_outline>
        ## must start with letter - why? why not?
        ###   1st round
        ##  allow numbers e.g. Group A - 1 
        ##   
        ##  note - CANNOT incl. :| !!!
        ##   used for markers for defs/definitions
        [^:|]+?   ## use non-greedy 
     )
     (?:
        [ ]+   
        ::+
     )?
    [ ]*  ## ignore trailing spaces (if any) 
   \z
}xi

# Round outline line in either marker style.
ROUND_OUTLINE_RE = Regexp.union([
  ROUND_OUTLINE_I_RE,    # black small square marker(s), e.g. ▪ Round 1
  ROUND_OUTLINE_II_RE,   # colon marker(s), e.g. :: Round 1
])

ROUND_DEF_OUTLINE_RE = note - for def(initions) only one level support that is, no round outline additions possible (e.g ▪▪ 1st leg etc.)

%r{   \A
     [ ]*  ## ignore leading spaces (if any)
    (?: [▪]  ## BLACK SMALL SQUARE
         |
        :: )      
     [ ]+
      (?<round_outline>
         [^:|]+?   ## use non-greedy 
      )
     [ ]*  ## ignore trailing spaces (if any) 
    ###   possitive lookahead MUST be : OR | 
     (?= [:|] 
         [ ])  ## note: requires space for now after [:|] - keep - why? why not?	
}ix

# Basic (filler) tokens inside a round definition line. Matches one of:
#   <spaces> - a run of two or more spaces
#   <space>  - exactly one space
#   <sym>    - one of the symbols : | ,
# Used as the first alternative in ROUND_DEF_RE.
ROUND_DEF_BASICS_RE =

%r{
    (?<spaces> [ ]{2,}) |
    (?<space>  [ ])
        |
    (?<sym> [:|,] )    ### note - add comma (,) as optional separator  
}ix

# Combined tokenizer pattern for the body of a round definition line.
# Alternatives are tried in the order listed - keep the order!
ROUND_DEF_RE = Regexp.union([
  ROUND_DEF_BASICS_RE,   # space(s) and the symbols : | ,
  DURATION_RE,           # note - duration MUST match before date
  DATE_RE,               # note - date must go before time (e.g. 12.12. vs 12.12)
  ANY_RE,
])

P_EN = english helpers (penalty, extra time, …) note - p must go last (shortest match) pso = penalty shootout - note - remove PSO for now (may add later back) - why? why not? todo/fix/clean-up - keep it simple - remove optional trailing dot (.) from pen., p., agg. etc. - why? why not? always use (simply) pen, p, agg (also) remove a.e.t. / a.e.t option - why? why not? UPDATE mar/2026: added pens too - keep - why? why not? (4-3 pens) (4-3 Pens) -- keep mixed Pens/Pen. too - why? why not? (4-3 Pen.)

'(?-i: PEN | P |' +
'[Pp]ens | [Pp]en\.? | p\.? )'

ET_EN = fix - change ET_EN to AET_EN!!! - why? why not? check - allow Aet too - why? why not? or A.e.t ??

'(?-i: AET | ' +
'aet | a\.e\.t\.? )'

AETGG_EN = after (golden goal/sudden death) extra time - add more options/styles - why? why not?

'(?-i: AET/GG | AGGET | ASDET | ' +
'aet/gg | a\.e\.t\.?/g\.g\.? | agget | asdet )'

AETSG_EN = after (silver goal) extra time

'(?-i: AET/SG | ASGET | ' +
'aet/sg | a\.e\.t\.?/s\.g\.? | asget  )'

AGG_EN = agg/agg. or AGG

'(?-i: AGG | agg\.? )'

SCORE_P = fix - change SCORE_P to SCORE_FULL_P SCORE_ET to SCORE_FULL_ET (re)use SCORE_P, SCORE_ET for score only part!!!

%Q<  (?<p1>\\d{1,2}) - (?<p2>\\d{1,2})
        [ ]? #{P_EN}
>

# Regex source fragment (a string, NOT a compiled regex) for an
# after-extra-time score, e.g. 2-2 a.e.t. or 2-2aet:
#   <et1> - <et2>, an optional single space, then the ET_EN marker.
# Interpolated into the SCORE__*__RE patterns, which are compiled with
# the x (extended) flag - whitespace inside the fragment is ignored there.
SCORE_ET =

%Q<  (?<et1>\\d{1,2}) - (?<et2>\\d{1,2})
        [ ]? #{ET_EN}
>

# Regex source fragment (a string): positive lookahead that closes the
# score patterns - the score must be followed by a space, a comma,
# a closing square bracket, or the end of line.
# Interpolated into regexes compiled with the x (extended) flag.
SCORE_LOOKAHEAD =

'(?= [ ,\]] | $)'

SCORE__ET_GG_SG__RE = after extra-time with golden goal/sudden death & silver goal rule note - golden goal & silver goal EXCLUDE penalties!!! 4-3 a.e.t/g.g. 4-3 aet/gg 4-3agget -or- 4-3 asdet 2-1 aet/sg -or- 4-3 aet/gg (3-3, 2-1)

%r{
    (?<score_full>
       \b
       (?<et1>\d{1,2}) - (?<et2>\d{1,2})
                      [ ]? (?:
                               (?<aetgg> #{AETGG_EN})
                                  |
                               (?<aetsg> #{AETSG_EN})
                            )
       ### note:
       ## add optional full-time, half-time score
         (?:
             [ ]+
             \(
                [ ]*
               (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
                  [ ]*
                (?:
                   , [ ]*
                   (?: (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
                     [ ]*
                  )?
               )? # note: make half time (HT) score optional for now
             \)
         )?                     
        #{SCORE_LOOKAHEAD}
)}ix

SCORE__P_ET__RE = note: allow SPECIAL cases WITHOUT full time scores (just a.e.t or pen. + a.e.t.) 3-4 pen. 2-2 a.e.t. 3-4 pen. 2-2 a.e.t. 2-2 a.e.t.

%r{
(?<score_full>
   \b
    (?: #{SCORE_P} [ ]+ 
     )?             ## note: make penalty (P) score optional for now
    #{SCORE_ET}
    #{SCORE_LOOKAHEAD}
)}ix

SCORE__ET_P__RE = note: allow SPECIAL cases WITHOUT full time scores AND with pen in last position! 2-2 a.e.t., 3-4 pen. 2-2 a.e.t. 3-4 pen. ## or without comma separator - why? why not?

%r{
(?<score_full>
   \b
    #{SCORE_ET}  
       (?: [ ]*,[ ]* | [ ]+ )
    #{SCORE_P}  
    #{SCORE_LOOKAHEAD}
)}ix

SCORE__FT_P__RE = special case (i) - full time with penalties 2-2, 3-4 pen.

%r{
(?<score_full>
   \b
    (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})  
        [ ]*,[ ]*    ## note - comma required!!! 
    #{SCORE_P}  
    #{SCORE_LOOKAHEAD}
)}ix

SCORE__FT_HT_P__RE = special case (ii) - full time & half-time with penalties 2-2 (1-1), 3-4 pen.

%r{
(?<score_full>
   \b
    (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
        [ ]*
         \(
             (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
         \)
        [ ]*,[ ]*    ## note - comma required!!! 
    #{SCORE_P}  
    #{SCORE_LOOKAHEAD}
)}ix

SCORE__P__RE = note: allow SPECIAL with penalty only 3-4 pen. or 3-4p etc.

%r{
        (?<score_full>
  \b
    #{SCORE_P}  
    #{SCORE_LOOKAHEAD}
)}ix

SCORE__P_ET_FT_HT_V2__RE = support short all-in-one e.g. 3-4 pen. 2-2 a.e.t. ( 1-1, 1-1 ) becomes 3-4 pen. (2-2, 1-1, 1-1)

%r{
          (?<score_full>
   \b
    #{SCORE_P} [ ]+       
       \(
       [ ]*
   (?<et1>\d{1,2}) - (?<et2>\d{1,2})
       [ ]*, [ ]*
   (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
       [ ]*, [ ]*
   (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
       [ ]*
    \)
   #{SCORE_LOOKAHEAD}
)}ix

SCORE__ET_FT_HT_P__RE = e.g. 2-2 a.e.t. (1-1, 1-0), 5-1 pen.

%r{
          (?<score_full>
   \b
   #{SCORE_ET} [ ]+
       \(
       [ ]*
  (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
       [ ]*
    (?:
         , [ ]*
        (?: (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
            [ ]*
        )?
    )?              # note: make half time (HT) score optional for now
  \)
   (?: [ ]*,[ ]* | [ ]+)
   #{SCORE_P}
   #{SCORE_LOOKAHEAD}
)}ix

SCORE__P_ET_FT_HT__RE = e.g. 3-4 pen. 2-2 a.e.t. (1-1, 1-1) or 3-4p 2-2aet (1-1, ) or 3-4 pen. 2-2 a.e.t. (1-1) or 2-2 a.e.t. (1-1, 1-1) or 2-2 a.e.t. (1-1, ) or 2-2 a.e.t. (1-1)

%r{
          (?<score_full>
   \b
   (?:
      #{SCORE_P} [ ]+
    )?            ## note - make penalty (P) score optional for now
   #{SCORE_ET} [ ]+
       \(
       [ ]*
  (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
       [ ]*
    (?:
         , [ ]*
        (?: (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
            [ ]*
        )?
    )?              # note: make half time (HT) score optional for now
  \)
 #{SCORE_LOOKAHEAD}
)}ix

SCORE__P_FT_HT__RE = special case for case WITHOUT extra time!! same as above (but WITHOUT extra time and pen required)

%r{
         (?<score_full>
            \b
           #{SCORE_P} [ ]+
    \(
    [ ]*
  (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
    [ ]*
 (?:
      , [ ]*
     (?: (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
         [ ]*
     )?
 )?              # note: make half time (HT) score optional for now
   \)
#{SCORE_LOOKAHEAD}
)}ix

SCORE__FT_HT__RE = e.g. 2-1 (1-1)

%r{
            (?<score_full>
 \b
 (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
      [ ]+ \( [ ]*
   (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
      [ ]* \)
#{SCORE_LOOKAHEAD}
)}ix

SCORE_FULL_RE = map tables note: order matters; first come-first matched/served

Regexp.union(
  SCORE__ET_GG_SG__RE,       # e.g. 3-1 aet/gg  
  SCORE__P_ET_FT_HT_V2__RE,  # e.g. 5-1 pen. (2-2, 1-1, 1-0)  
  SCORE__ET_FT_HT_P__RE,    # e.g. 2-2 a.e.t. (1-1, 1-0), 5-1 pen. 
  SCORE__P_ET_FT_HT__RE,    # e.g. 5-1 pen. 2-2 a.e.t. (1-1, 1-0)
  SCORE__P_FT_HT__RE,     # e.g. 5-1 pen. (1-1)
  SCORE__ET_P__RE,        # e.g. 2-2 a.e.t., 5-1 pen.
  SCORE__FT_P__RE,        # e.g. 2-2, 5-1 pen.
  SCORE__FT_HT_P__RE,     # e.g. 2-2 (1-1), 5-1 pen.
  SCORE__P_ET__RE,        # e.g.  5-1 pen. 2-2 a.e.t.  or  2-2 a.e.t. (w/o pen)
  SCORE__P__RE,           # e.g. 5-1 pen.
  SCORE__FT_HT__RE,        # e.g. 1-1 (1-0)
  ##  note - keep basic score as its own token!!!!
  ##   that is, SCORE & SCORE_MORE
  ### SCORE__FT__RE,           # e.g. 1-1  -- note - must go last!!!
)

SCORE_AWD_RE = note - keep AWD w/o dot - why? why not?

%r{
            (?<score_awd>
 \b
  (?<score1>\d{1,2}) - (?<score2>\d{1,2})
    [ ]?
      (?-i: awd\.? | AWD )
  ## POSITIVE lookahead - requires space
  (?= [ ])
)}ix

SCORE_ABD_RE = add support for score abandoned (inline style) 2-1 abd. or 2-1 ABD

%r{
            (?<score_abd>
 \b
  (?<score1>\d{1,2}) - (?<score2>\d{1,2})
    [ ]?
     (?-i: abd\.? | ABD )
  ## POSITIVE lookahead - requires space
  (?= [ ])
)}ix

SCORE_RE = 2-1 note - was SCORE__FT__RE changed to "generic" SCORE_RE and (?<ft1>\d{1,2}) - (?<ft2>\d{1,2}) changed (?<score1>\d{1,2}) - (?<score2>\d{1,2}) to pattern match not necessarily the full-time (ft) scoreline!!! - pattern also used for goal seq(uence) e.g. 1-0 Kane, 1-1 Johnson

%r{
            (?<score>
 \b
  (?<score1>\d{1,2}) - (?<score2>\d{1,2})
 \b
)}ix

SCORE_TEAM_RE = add support for “split” score note - for now (2) 1 is REQUIRED

%r{
    (?<score_team>
         \(
            (?<score_i> \d{1,2}) 
         \)
         [ ]*   ## note - space optional- why? why not?
            (?<score_ii> \d{1,2})
        \b 
    )
}ix

SCORE_TEAM_PEN_RE = “penalty”-style (4) is assumed penalty score note - for now 1 (4) is REQUIRED

%r{
    (?<score_team_pen>
         \b
            (?<score_i> \d{1,2})
         \b
           [ ]*  ## note - space optional- why? why not?  
         \(
            (?<score_pen> \d{1,2}) 
         \)
    )
}ix

SCORE_TEAM_NUM_RE = note - score_team_num (<100) e.g. 0, 1, .., 10, 11, .. 99 use a different name - why? why not? note - must be surrounded by space

%r{
    ## positive lookbehind
     (?<= [ ])

      (?<score_team_num> \d{1,2} )

     ## positive lookahead
     (?= [ ]|\z)
}x

# Standings table heading line (the whole line must match):
#   P (or Pld)  W  D  L  Gls  Pts
# with one or more spaces between the columns; leading and trailing
# spaces are ignored. Matching is case-insensitive (i flag).
TABLE_HEADING_I_RE =

%r{
    \A
     [ ]*  ## ignore leading spaces (if any)
    (?<table_heading>
      \b
       P(?:ld)?  [ ]+ 
        W        [ ]+
        D        [ ]+
        L        [ ]+
        Gls      [ ]+
        Pts
     \b
      )
     [ ]*  ## ignore trailing spaces (if any) 
     \z
}xi

# Table divider line (the whole line must match; leading and trailing
# spaces are ignored). Two styles are accepted:
#   - three or more consecutive dashes, e.g. --- or -------
#   - three or more dashes separated by spaces, e.g. - - -  or  - - - - -
TABLE_DIVIDER_RE =

%r{
      \A
  [ ]*  ## ignore leading spaces (if any)
      (?<table_divider>
          (?:  ---   ## note - require three dashes minimum (---)
                [-]* 
          )  
            |
          (?: - [ ]+ - [ ]+ -  ## note - require three dashes minimum (- - -)
                (?: [ ]+ -)*   ##   todo/check - restrict spaces to 2 or 3 or such - why? why not?
          )  
      )
  [ ]*  ## ignore trailing spaces (if any) 
      \z
}xi

# Table note line (the whole line must match): optional leading spaces,
# the marker "-." (dash plus dot), optional spaces, then the note text
# captured as <table_note>. The text is matched non-greedy, so trailing
# spaces are not part of the capture.
TABLE_NOTE_RE =

%r{
       \A
        [ ]*  ## ignore leading spaces (if any)
           -\.
           [ ]*
       (?<table_note>
            .+?   ## note - use non-greedy       
         )
        [ ]*  ## ignore trailing spaces (if any) 
        \z
}xi

# Standings table row (numbers part) in the column order
#   Pld  W  D  L  GF-GA  Pts
# captured as a whole in <table>; there are no per-column capture groups.
# Columns are separated by one or more spaces; spaces are allowed after
# the dash in GF-GA (e.g. 10- 4) but not before it.
TABLE_I_RE =

%r{
(?<table>\b 
     \d{1,2} [ ]+                        # Pld
     \d{1,2} [ ]+                        # W
     \d{1,2} [ ]+                        # D
     \d{1,2} [ ]+                        # L
     (?: \d{1,3} - [ ]* \d{1,3} [ ]+ )   # GF-GA
     \d{1,3}                             # Pts   
      \b 
)}xi

TABLE_II_RE = Pld Pts W D L GF-GA | d d d d d d-d ARG^ 3 6 3 0 0 10-4 CHI 3 4 2 0 1 5-3 FRA 3 2 1 0 2 4-3 MEX 3 0 0 0 3 4-13

%r{
(?<table>\b 
     \d{1,2} [ ]+                        # Pld
     \d{1,3} [ ]+                        # Pts   
     \d{1,2} [ ]+                        # W
     \d{1,2} [ ]+                        # D
     \d{1,2} [ ]+                        # L
     (?: \d{1,3} - [ ]* \d{1,3})   # GF-GA
      \b 
)}xi

TABLE_RE = possible start lines for a table excludes NOTE and RULER (e.g. --- or) or such in the future

Regexp.union(
    TABLE_HEADING_I_RE,
    TABLE_I_RE,
    TABLE_II_RE,
)

TABLE_MORE_RE = all possible continuation for a table excludes HEADING

Regexp.union(
    TABLE_NOTE_RE,
    TABLE_DIVIDER_RE,
    TABLE_I_RE,
    TABLE_II_RE,
)

# Regex source fragment (a string) for the "postponed" match status;
# interpolated into STATUS_RE (compiled with the i and x flags).
# Accepts: postponed, pst, postp, pstp, ppd - the short forms with an
# optional trailing dot.
POSTPONED =

%Q{ (?<postponed> postponed  | pst\\.? | po?stp\\.?  | ppd\\.? ) }

CANCELED = add can/can. - why? why not?

%Q{ (?<canceled>  cancell?ed | canc\\.? ) }

WALKOVER = add o/w too - why? why not?

%Q{ (?<walkover>  walkover   | w/o  | wo ) }

# Regex source fragment (a string) for the "awarded" match status;
# interpolated into STATUS_RE (compiled with the i and x flags).
# Accepts: awarded, awd (with optional trailing dot).
AWARDED =

%Q{ (?<awarded>   awarded    | awd\\.? ) }

# Regex source fragment (a string) for the "suspended" match status;
# interpolated into STATUS_RE (compiled with the i and x flags).
# Accepts: suspended, susp (with optional trailing dot).
SUSPENDED =

%Q{ (?<suspended> suspended  | susp\\.? ) }

# Regex source fragment (a string) for the "abandoned" match status;
# interpolated into STATUS_RE (compiled with the i and x flags).
# Accepts: abandoned, aban, abd (short forms with optional trailing dot).
ABANDONED =

%Q{ (?<abandoned> abandoned  | aban\\.?  | abd\\.? ) }

# Regex source fragment (a string) for the "annulled" match status;
# interpolated into STATUS_RE (compiled with the i and x flags).
# Accepts the full word only (no short form).
ANNULLED =

%Q{ (?<annulled>  annulled ) }

VOIDED = note - alternative (name) to annulled

%Q{ (?<voided>    voided     | void ) }

# Regex source fragment (a string) for the "replay" match status.
# Accepts: replay, repl (with optional trailing dot).
# Note: interpolated only into the short form (no note/comment) branch
# of STATUS_RE, e.g. [replay].
REPLAY =

%Q{ (?<replay>    replay     | repl\\.? ) }

STATUS_RE = note - status_note incl. complete text incl. <status> (not normalized) <status> gets normalized e.g. ppd => postponed etc.

%r{
            \[
      (?:    
#############################################  
### opt 1 - allow long forms with note/comment for some stati
##                    e.g. [postponed due to tropical storm "Hanna"]
##                         [suspended at 84' by storm; result stood]
#########################
           (?: (?<status_note>
                  (?<status>
               ####################
               ## pre-match (not played)
                    #{POSTPONED}
                           |
                    #{CANCELED}       
                           |
                    #{WALKOVER}        
                           |
               ######################   
               ## pre/post match
                     #{AWARDED}
                            |
               ########################
               ## post match - (partially) played
                    #{SUSPENDED} 
                            |   
                    #{ABANDONED}
                            |
                    #{ANNULLED}
                            |
                    #{VOIDED} ### note - alternative to annulled
              )     ## end-of-<status>
                  [ :;,-]+     ## leading spaces (or separators) 
                  [^\]]+?      ## note - add non-greedy match 
              ) ## end-of-<status-note>   
              [ ]*  ## eat-up optional trailing spaces
            )
            |       
########################################
## opt 2 - short form only (no note/comments) e.g. [postponed], [Canceled], etc.
####################################     
            (?<status>
         ####################
         ## pre-match (not played)
               #{POSTPONED}
                 |
               #{CANCELED}
                 |
               #{WALKOVER}         
                 |
         ######################   
         ## pre/post match
               #{AWARDED}
                 |
         ########################
         ## post match - (partially) played
               #{SUSPENDED}                                        
                 |
               #{ABANDONED}
                 |
               #{ANNULLED}
                 |
               #{VOIDED}   ### note - alternative to annulled
                 |
               #{REPLAY}       ### todo/fix - keep replay - why? why not?
                                  ###   prefer replay in round e.g. 
                                  ##       ▪ Round 17, Replay
                                  ##       ▪ Semi-finals, Replays
            )
      )
    \]
}ix

PROP_NAME_RE = name different from text (does NOT allow number in name/text)

%r{
                 (?<prop_name> 
                      \b
                   (?<name>
                      \p{L}+       
                        \.?    ## optional dot
                          (?:
                             ## rule for space; only one single space allowed inline!!!
                              (?:
                                (?<![ ])  ## use negative lookbehind                             
                                  [ ] 
                                (?=\p{L}|['"])      ## use lookahead        
                              )
                              ## support (inline) quoted name e.g. "Rodri" or such
                                  |
                                  (?:
                                     (?<=[ ])  ## use positive lookbehind                             
                                     " \p{L}+ " 
                                      ## require space here too - why? why not?
                                   )                      
                                  |   
                             (?:
                                (?<=    ## \p{L}\. | \p{L}
                                        [\p{L}.] 
                                     )  ## use  POSITIVE lookbehind
                                 [-]   ## must be surrounded by letters
                                       ## note - allow leading dot (.) e.g. K.-H.Förster 
                                       ##                short for          Karl-Heinz Förster
                                       ##
                                       ## e.g. One-Two NOT
                                       ##      One- Two or One - Two or One -Two etc.
                                (?=\p{L})      ## use lookahead        
                              )
                                 |   
                              (?:  ## flex rule for quote - allow any
                                    ##  only check for double quotes e.g. cannot follow other ' for now - why? why not?
                                    ##        allows  rodrigez 'rodri' for example
                                (?<!')  ## use negative lookbehind                             
                                   '         
                              )      
                                 |   ## standard case with letter(s) and optinal dot
                              (?: \p{L}+
                                    \.?  ## optional dot
                              )
                          )*
                    )
               ## add lookahead - must be non-alphanum 
                  (?=[ ,;\]\)]|$)
                  )
}ix

SCORE_LEGS_RE = win on away goals aet

%r{
(?<score_legs>
   \b   
    (?<leg1_ft1>\d{1,2}) - (?<leg1_ft2>\d{1,2})
       (?: [ ]+ |  [ ]*,[ ]*)   # separate by spaces OR comma
    (?:
        ## opt 1 - after extra-time (et) score
            (?<leg2_et1>\d{1,2}) - (?<leg2_et2>\d{1,2})
               [ ]? #{ET_EN}   ## a.e.t./aet
                ### note - might end in dot (.) not alpha
                ###  thus, wordboundary NOT working
               #{SCORE_LOOKAHEAD}   
          |
        ## opt 2 - full-time (ft)  
        (?<leg2_ft1>\d{1,2}) - (?<leg2_ft2>\d{1,2})
            \b 
    )                
    (?:   ## check optional aggregate e.g. (agg 4-4)
        [ ]+
         \(
             agg [ ]
              (?<agg1>\d{1,2}) - (?<agg2>\d{1,2}) 
              
             ### add win options 
             (?:
                 ## opt 1 - on away goals
                (?<away> [ ]*,[ ]*
                         (?:win [ ])? on [ ] away [ ] goals?
                 )
                   |
                 ## opt 2 - on penalties  
                (?:
                   [ ]*,[ ]*
                   (?:win [ ])?
                    (?<leg2_p1>\d{1,2}) - (?<leg2_p2>\d{1,2})
                    [ ] on [ ] pens
                )
             )?
         \)
    )?
)}ix

SCORE_FULLER_AGG =

_mk_score_fuller_agg( win: false )

SCORE_FULLER_AGG_WIN =

_mk_score_fuller_agg( win: true )

SCORE_FULLER_P =

_mk_score_fuller_p( win: false )

SCORE_FULLER_P_WIN =

_mk_score_fuller_p( win: true )

SCORE_FULLER_AWAY_WIN =

%Q<
     (?:
      (?<away>
        ############
        ## opt 1)  with win
        (?:
            (?: win [ ] )?
            (?: (?<away1>\\d{1,2}) - (?<away2>\\d{1,2}) [ ] )?
             on [ ] away [ ] goals?     # goal or goals
        )
        |        
        #####
        ## opt 2)  "classic" (post)
        (?:
           (?: (?<away1>\\d{1,2}) - (?<away2>\\d{1,2}) [ ] )?
              [ ]* away  
        )
        |
        #####
        ## opt 3) up-front (pre)
        (?:
              away 
           (?:  [ ]
                (?<away1>\\d{1,2}) - (?<away2>\\d{1,2})
           )?   
        )
     ))                   
>

# Regex source fragment (a string): OPTIONAL half-time part inside the
# parentheses of a "fuller" score, e.g. the "HT 2-1, " in (HT 2-1, aet).
# Captures <ht1> and <ht2>; the trailing comma is part of the fragment,
# so whatever follows must not add its own leading comma.
SCORE_FULLER_HT_OPT =

%Q<
  (?:   HT [ ]
      (?: (?<ht1>\\d{1,2}) - (?<ht2>\\d{1,2})) 
      [ ]*,[ ]*
  )?  ## note - make optional
>

# Regex source fragment (a string): OPTIONAL full-time part inside the
# parentheses of a "fuller" score, e.g. the "FT 1-1, " in (FT 1-1, aet).
# Captures <ft1> and <ft2>; the trailing comma is part of the fragment,
# so whatever follows must not add its own leading comma.
SCORE_FULLER_FT_OPT =

%Q<
  (?:   FT [ ]
      (?: (?<ft1>\\d{1,2}) - (?<ft2>\\d{1,2})) 
      [ ]*,[ ]*
  )?  ## note - make optional
>

SCORE_FULLER__HT = 4-4 (HT 2-1) or Team A 4-1 Team B (HT 2-1)

%Q<
             \\(  HT [ ]
                  (?<ht1>\\d{1,2}) - (?<ht2>\\d{1,2}) 
             \\)
>

SCORE_FULLER__HT_FT__RE =

%r{
(?<score_fuller>
   \b   
    (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
    [ ]+
     #{SCORE_FULLER__HT}
)}ix

SCORE_FULLER_MORE__HT_FT__RE =

%r{
(?<score_fuller_more>
     #{SCORE_FULLER__HT}
)}ix

# Regex source fragment (a string): the parenthesized extra-time part
# of a "fuller" score, e.g. (aet), (HT 1-0, aet) or (HT 1-0, FT 1-1, aet/gg).
# Inside the parentheses: optional HT part, optional FT part, then exactly
# one marker captured as <aetgg>, <aetsg> or <aet>. The longer golden/silver
# goal markers are listed before the plain a.e.t. marker.
# Shared by SCORE_FULLER__ET__RE and SCORE_FULLER_MORE__ET__RE.
SCORE_FULLER__ET =

%Q<
             \\(
                #{SCORE_FULLER_HT_OPT} 
                #{SCORE_FULLER_FT_OPT} 
                (?:
                  (?<aetgg> #{AETGG_EN})
                   |
                  (?<aetsg> #{AETSG_EN}) 
                   |
                  (?<aet> #{ET_EN})
                 )
             \\)
>

SCORE_FULLER__ET__RE =

%r{
(?<score_fuller>
   \b   
    (?<et1>\d{1,2}) - (?<et2>\d{1,2})
    [ ]+
     #{SCORE_FULLER__ET}
)}ix

SCORE_FULLER_MORE__ET__RE =

%r{
(?<score_fuller_more>
     #{SCORE_FULLER__ET}
)}ix

SCORE_FULLER__ET_P = 4-4 (aet, win 3-5 on pens) 4-4 (aet, 3-5 on pens) 4-4 (aet, 3-5 pen) 4-4 (a.e.t., 3-5 pen.) or Team A 4-4 Team B (aet, win 3-5 on pens) Team A 4-4 Team B (aet, 3-5 on pens) Team A 4-4 Team B (aet, 3-5 pen) Team A 4-4 Team B (a.e.t., 3-5 pen.)

%Q<
             \\(
                #{SCORE_FULLER_HT_OPT} 
                #{SCORE_FULLER_FT_OPT} 
                (?<aet> #{ET_EN})
                 [ ]*,[ ]*
                 #{SCORE_FULLER_P_WIN}
             \\)
>

SCORE_FULLER__ET_P__RE =

%r{
(?<score_fuller>
   \b   
    (?<et1>\d{1,2}) - (?<et2>\d{1,2})
    [ ]+
     #{SCORE_FULLER__ET_P}
)}ix

SCORE_FULLER_MORE__ET_P__RE =

%r{
(?<score_fuller_more>
     #{SCORE_FULLER__ET_P}
)}ix

SCORE_FULLER__FT_P = 4-4 (win 3-5 on pens) 4-4 (3-5 pen) 4-4 (3-5p) or Team A 4-4 Team B (win 3-5 on pens) Team A 4-4 Team B (3-5 pen) Team A 4-4 Team B (3-5p)

%Q<
             \\(
                  #{SCORE_FULLER_HT_OPT} 
                  #{SCORE_FULLER_P_WIN}
             \\)
>

# "Fuller" score: full-time score followed by a parenthesized penalty
# shootout result, e.g.
#     4-4 (win 3-5 on pens)
#     4-4 (3-5 pen)
#     4-4 (HT 2-1, 3-5 pen)
#
# Captures <ft1>/<ft2> for the leading score; the parenthesized part comes
# from the shared SCORE_FULLER__FT_P fragment (optional HT part plus the
# penalty part).
#
# note - the parenthesized part was hand-inlined before and left out the
#   optional HT part; now (re)uses SCORE_FULLER__FT_P, the same fragment
#   SCORE_FULLER_MORE__FT_P__RE is built from, so that the score and the
#   score "more" variants accept the very same parenthesized text.
#   The HT part is optional - inputs matched before still match.
SCORE_FULLER__FT_P__RE =

%r{
(?<score_fuller>
   \b   
    (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
    [ ]+
     #{SCORE_FULLER__FT_P}
)}ix

SCORE_FULLER_MORE__FT_P__RE =

%r{
(?<score_fuller_more>
     #{SCORE_FULLER__FT_P}
)}ix

SCORE_FULLER__FT_AGG = 3-2 (win 4-5 on aggregate) 3-2 (4-5 on aggregate) 3-2 (4-5 on agg) 3-2 (4-5 agg) 3-2 (4-5 agg.) or 3-2 (agg 4-5)

%Q<
             \\(
                 #{SCORE_FULLER_HT_OPT} 
                 #{SCORE_FULLER_AGG_WIN}
             \\)
>

SCORE_FULLER__FT_AGG__RE =

%r{
(?<score_fuller>
   \b   
    (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
    [ ]+
     #{SCORE_FULLER__FT_AGG}
)}ix

SCORE_FULLER_MORE__FT_AGG__RE =

%r{
(?<score_fuller_more>
     #{SCORE_FULLER__FT_AGG}
)}ix

SCORE_FULLER__FT_AGG_AWAY = ft + agg + away 2-1 (3-3 on aggregate, win on away goals) 2-1 (3-3 on aggregate, win 2-1 on away goals)

## Regex source fragment (a string, not yet a Regexp):
##   escaped "(" + optional HT part + SCORE_FULLER_AGG
##   + comma (optional spaces around it) + SCORE_FULLER_AWAY_WIN + escaped ")".
## Parentheses are double-escaped because this is a %Q string.
%Q<
             \\(
                #{SCORE_FULLER_HT_OPT} 
                #{SCORE_FULLER_AGG}
                   [ ]*,[ ]*
                 #{SCORE_FULLER_AWAY_WIN}
             \\)
>

## Core full-time score plus aggregate & away-goals annotation,
##   e.g. 2-1 (3-3 on aggregate, win on away goals)
## The leading N-N is captured as ft1/ft2, followed by one or more spaces
## and the parenthesised SCORE_FULLER__FT_AGG_AWAY fragment.
SCORE_FULLER__FT_AGG_AWAY__RE =

%r{
(?<score_fuller>
   \b   
    (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
    [ ]+
     #{SCORE_FULLER__FT_AGG_AWAY}
)}ix

## "More" variant: only the parenthesised SCORE_FULLER__FT_AGG_AWAY fragment
## (no leading core score), captured as score_fuller_more.
SCORE_FULLER_MORE__FT_AGG_AWAY__RE =

%r{
(?<score_fuller_more>
     #{SCORE_FULLER__FT_AGG_AWAY}
)}ix

SCORE_FULLER__ET_AGG_P = 2-1 (aet, 3-3 on aggregate, win 5-2 on pens) 2-1 (aet, 3-3 agg, 5-2 pen.)

## Regex source fragment (a string, not yet a Regexp):
##   escaped "(" + optional HT part + optional FT part + ET_EN (captured as aet)
##   + comma + SCORE_FULLER_AGG + comma + SCORE_FULLER_P_WIN + escaped ")".
## Both commas allow optional spaces on either side.
## Parentheses are double-escaped because this is a %Q string.
%Q<
             \\(
                #{SCORE_FULLER_HT_OPT} 
                #{SCORE_FULLER_FT_OPT} 
                (?<aet> #{ET_EN})
                    [ ]*,[ ]*
                    #{SCORE_FULLER_AGG}  
                    [ ]*,[ ]*
                    #{SCORE_FULLER_P_WIN}                     
             \\)
>

## Core score plus extra-time, aggregate & penalties annotation,
##   e.g. 2-1 (aet, 3-3 on aggregate, win 5-2 on pens)
## The leading N-N is captured as et1/et2, followed by one or more spaces
## and the parenthesised SCORE_FULLER__ET_AGG_P fragment.
SCORE_FULLER__ET_AGG_P__RE =

%r{
(?<score_fuller>
   \b   
    (?<et1>\d{1,2}) - (?<et2>\d{1,2})
    [ ]+
     #{SCORE_FULLER__ET_AGG_P}
)}ix

## "More" variant: only the parenthesised SCORE_FULLER__ET_AGG_P fragment
## (no leading core score), captured as score_fuller_more.
SCORE_FULLER_MORE__ET_AGG_P__RE =

%r{
(?<score_fuller_more>
     #{SCORE_FULLER__ET_AGG_P}
)}ix

SCORE_FULLER_RE = map tables note: order matters - first come-first matched/served

## Combine all "fuller" score patterns (core score + bracket annotation)
## into a single alternation.
##   note - Regexp.union keeps the given order; the first alternative that
##          matches at a position wins
Regexp.union(
SCORE_FULLER__HT_FT__RE,       ## e.g.  3-2 (HT 2-1)
SCORE_FULLER__ET_P__RE,        ## e.g.  2-2 (aet, win 5-3 on pens)
SCORE_FULLER__ET__RE,          ## e.g.  2-3 (aet)
SCORE_FULLER__FT_P__RE,        ## e.g.  2-2 (win 5-3 on pens)
SCORE_FULLER__FT_AGG__RE,      ## e.g.  2-3 (win 5-4 on aggregate)
SCORE_FULLER__FT_AGG_AWAY__RE, ## e.g.  2-1 (3-3 on aggregate, win 2-1 on away goals)
SCORE_FULLER__ET_AGG_P__RE,    ## e.g.  2-1 (aet, 3-3 on aggregate, win 5-2 on pens)
)

SCORE_FULLER_MORE__HT__RE = add support for “stand-alone” (HT) and (FT) - keep why? why not?

## Stand-alone half-time marker in brackets, e.g. (HT) or (ht);
## the marker text is captured as ht. Flags: i (ignore case), x (extended).
%r{
(?<score_fuller_more>
    \( (?<ht> ht ) \)
)}ix

## Stand-alone full-time marker in brackets, e.g. (FT) or (ft);
## the marker text is captured as ft. Flags: i (ignore case), x (extended).
SCORE_FULLER_MORE__FT__RE =

%r{
(?<score_fuller_more>
     \( (?<ft> ft ) \)  
)}ix

SCORE_FULLER_MORE__FT_ET__RE = add special for fuller_more (aet 4-3) - core score is ft, and fuller more incl. et!!!

## Bracket with the extra-time marker followed by exactly one space and the
## extra-time score, e.g. (aet 4-3); the score is captured as et1/et2.
##   note - the ET_EN marker itself is NOT captured in a named group here
%r{
(?<score_fuller_more>
      \(#{ET_EN}
           [ ]
       (?<et1>\d{1,2}) - (?<et2>\d{1,2})
      \) 
)}ix

SCORE_FULLER_MORE__HT_FT__CLASSIC_RE = note - simply (1-1) !!!!! note - special attention needed for placement in processing error!!! make sure it is the last (or one of the last) match(es)

## Bare score in brackets, e.g. (1-1); read as the half-time score and
## captured as ht1/ht2.
##   note - very generic pattern, keep it at (or near) the end of any
##          alternation that includes it
%r{
(?<score_fuller_more>
     \(  
          (?<ht1>\d{1,2}) - (?<ht2>\d{1,2}) 
     \)
)}ix

## Combine all stand-alone bracket annotations ("fuller more") into a single
## alternation.
##   note - Regexp.union keeps the given order; the first alternative that
##          matches at a position wins, thus the generic (N-N) pattern goes last
SCORE_FULLER_MORE_RE =

Regexp.union(
  SCORE_FULLER_MORE__FT__RE,          ## e.g. (ft)
  SCORE_FULLER_MORE__HT__RE,          ## e.g. (ht)
  SCORE_FULLER_MORE__HT_FT__RE,       ## e.g. (HT 2-1)
  SCORE_FULLER_MORE__ET_P__RE,        ## e.g. (aet, win 5-3 on pens)
  SCORE_FULLER_MORE__ET__RE,          ## e.g. (aet)
  SCORE_FULLER_MORE__FT_ET__RE,       ## e.g. (aet 3-2) - (SPECIAL) incl. after extra-time score!!
  SCORE_FULLER_MORE__FT_P__RE,        ## e.g. (win 5-3 on pens)
  SCORE_FULLER_MORE__FT_AGG__RE,      ## e.g. (win 5-4 on aggregate)
  SCORE_FULLER_MORE__FT_AGG_AWAY__RE, ## e.g. (3-3 on aggregate, win 2-1 on away goals)
  SCORE_FULLER_MORE__ET_AGG_P__RE,    ## e.g. (aet, 3-3 on aggregate, win 5-2 on pens)

  SCORE_FULLER_MORE__HT_FT__CLASSIC_RE,   ## e.g. (2-1)  half-time !!!!
)

## Duration variant i - two complete dates joined by a hyphen:
##    [day_name ]month_name day[[,] year] - [day_name ]month_name day[[,] year]
## Captures use suffix 1 for the start date and suffix 2 for the end date
## (day_name1, month_name1, day1, year1 / day_name2, month_name2, day2, year2).
##
## note - the plain ( ... )? groups carry no name; with named groups present
##        Ruby does not capture unnamed groups, so they only group.
## NOTE(review): the inline remark below mentions "+ and -" but the pattern
##   only matches a hyphen between the two dates.
DURATION_I_RE =

%r{
(?<duration>
    \b
  (?:
   ## optional day name
   ((?<day_name1>#{DAY_NAMES})
      [ ]
   )?
   (?<month_name1>#{MONTH_NAMES})
      [ ] 
   (?<day1>\d{1,2})
   ## optional year
   (  ,?   # optional comma
      [ ]
      (?<year1>\d{4})
   )?

   ## support + and -  (add .. or such - why??)
   [ ]* - [ ]*

   ## optional day name
   ((?<day_name2>#{DAY_NAMES})
      [ ]
   )?
   (?<month_name2>#{MONTH_NAMES})
      [ ] 
   (?<day2>\d{1,2})
   ## optional year
   (  ,?   # optional comma
      [ ]
      (?<year2>\d{4})
   )?
  )
   \b
)}ix

DURATION_II_RE = variant ii - add support for shorthand, e.g. August 16-18, 2011; September 13-15, 2011; October 18-20, 2011; March 6-8 2012; March 6-8. todo - add support for August 16+17 or such (and check 16+18); use <op> to check if day2 is a plus or range or such - why? why not?

## Duration variant ii - shorthand with one month and a day range:
##    month_name day1-day2[[,] year]     e.g. August 16-18, 2011
##   note - no spaces are allowed around the hyphen between day1 and day2
##   note - only month_name1 and year1 are captured (no ...2 counterparts)
%r{
(?<duration>
    \b
   (?:
       (?<month_name1>#{MONTH_NAMES})
           [ ]
        (?<day1>\d{1,2})
             -
        (?<day2>\d{1,2})
          (?:
            ,?     ## optional comma
            [ ]
            (?<year1>\d{4})
          )?     ## optional year   
   )
   \b
)}ix

DURATION_RE = map tables note: order matters; first come-first matched/served

## Combine both duration variants; the full two-date form (i) is tried
## before the day-range shorthand (ii).
Regexp.union(
   DURATION_I_RE,
   DURATION_II_RE,
)

Class Method Summary collapse

._build_date(m) ⇒ Object

“internal” date helpers.
._build_date_legs(m) ⇒ Object
._build_duration(m) ⇒ Object
._build_goal_count(m) ⇒ Object
._build_goal_minute(m) ⇒ Object
._build_goal_type(m) ⇒ Object
._build_minute(m) ⇒ Object
._build_score_team(m) ⇒ Object
._build_score_team_num(m) ⇒ Object
._build_score_team_pen(m) ⇒ Object
._build_status(m) ⇒ Object
._build_time(m) ⇒ Object
._mk_score_fuller_agg(win:) ⇒ Object

regex score helpers. note - MUST double escape \d e.g. \\d if not a "simple" string (e.g. '' but %Q<>).
._mk_score_fuller_p(win:) ⇒ Object

with optional win - true|false.
._parse_date(str) ⇒ Object
._parse_goal_count(str) ⇒ Object
._parse_goal_minute(str) ⇒ Object
._parse_score_full(str) ⇒ Object
._parse_team(str) ⇒ Object
.build_map(lines, downcase: false) ⇒ Object
.build_names(lines) ⇒ Object
.parse_date(str, start:) ⇒ Object

“top-level” add a date parser helper.
.parse_names(txt) ⇒ Object

Instance Method Summary collapse

#_build_date(m) ⇒ Object
#_build_date_legs(m) ⇒ Object
#_build_duration(m) ⇒ Object
#_build_goal_count(m) ⇒ Object
#_build_goal_minute(m) ⇒ Object
#_build_goal_type(m) ⇒ Object
#_build_minute(m) ⇒ Object
#_build_score_team(m) ⇒ Object
#_build_score_team_num(m) ⇒ Object
#_build_score_team_pen(m) ⇒ Object
#_build_status(m) ⇒ Object
#_build_time(m) ⇒ Object
#_tokenize_line(line) ⇒ Object
#_tokenize_tty_line(line) ⇒ Object
#debug? ⇒ Boolean
#initialize(lines, debug: false) ⇒ Lexer constructor

A new instance of Lexer.
#is_group?(text) ⇒ Boolean

todo/fix - use LangHelper or such e.g.
#is_round?(text) ⇒ Boolean
#log(msg) ⇒ Object
#tokenize_with_errors ⇒ Object

Constructor Details

#initialize(lines, debug: false) ⇒ `Lexer`

Returns a new instance of Lexer.

Raises:

(ArgumentError)

# File 'lib/sportdb/parser/lexer.rb', line 34

## Set up a lexer over the given source text.
##
##   lines - the complete text to tokenize; must be a String
##   debug - flag kept in @debug
##
## Raises ArgumentError if lines is not a String.
def initialize( lines, debug: false )
  unless String === lines
    raise ArgumentError, "(string) text expected for lexer; got #{lines.class.name}"
  end

  @debug = debug
  @txt   = lines
end

Class Method Details

._build_date(m) ⇒ `Object`

“internal” date helpers

# File 'lib/sportdb/parser/token-date.rb', line 305

## Turn the named captures of a date match into a plain hash.
##
##   m - match object (anything that answers [] for the capture names
##       :year, :yy, :month, :month_name, :day, :day_name)
##
## Returns a hash with only the parts that are present:
##   :y (four-digit year), :yy (two-digit year), :m, :d, :wday
def self._build_date( m )
  parts = {}

  if (year = m[:year])
    parts[:y] = year.to_i(10)
  end

  ## two-digit year (e.g. 25 or 78) is kept separate from :y
  if (yy = m[:yy])
    parts[:yy] = yy.to_i(10)
  end

  ## numeric month first; a month name (if present) takes precedence
  if (month = m[:month])
    parts[:m] = month.to_i(10)
  end
  if (month_name = m[:month_name])
    ## names may come in any case (JULY/Jul), lookup keys are lowercase
    parts[:m] = MONTH_MAP[ month_name.downcase ]
  end

  if (day = m[:day])
    parts[:d] = day.to_i(10)
  end
  if (day_name = m[:day_name])
    parts[:wday] = DAY_MAP[ day_name.downcase ]
  end

  parts
end

._build_date_legs(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-date.rb', line 321

## Build the two dates of a two-legged fixture from one match.
##
##   m - match object with captures :month_name1, :day1, :day2 and
##       (optionally) :month_name2
##
## Returns { date1: { m:, d: }, date2: { [m:,] d: } };
##   the month of the second leg is only set if :month_name2 is present.
def self._build_date_legs( m )
  ## note - month names may come in any case, lookup keys are lowercase
  first = {
    m: MONTH_MAP[ m[:month_name1].downcase ],
    d: m[:day1].to_i(10),
  }

  second = {}
  if (month_name = m[:month_name2])
    second[:m] = MONTH_MAP[ month_name.downcase ]
  end
  second[:d] = m[:day2].to_i(10)

  { date1: first, date2: second }
end

._build_duration(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-date_duration.rb', line 169

## Turn the named captures of a duration match into start/end date hashes.
##
##   m - match object with the (optional) captures year1/month_name1/day1/day_name1
##       for the start and year2/month_name2/day2/day_name2 for the end
##
## Returns { start: {...}, end: {...} } where each side holds only the
##   parts that are present (:y, :m, :d, :wday).
def self._build_duration( m )
  span = { start: {}, end: {} }

  ## capture names carry suffix 1 for the start and 2 for the end
  { start: 1, end: 2 }.each do |side, no|
    leg = span[side]

    year = m[:"year#{no}"]
    leg[:y] = year.to_i(10)  if year

    month_name = m[:"month_name#{no}"]
    leg[:m] = MONTH_MAP[ month_name.downcase ]  if month_name

    day = m[:"day#{no}"]
    leg[:d] = day.to_i(10)  if day

    day_name = m[:"day_name#{no}"]
    leg[:wday] = DAY_MAP[ day_name.downcase ]  if day_name
  end

  span
end

._build_goal_count(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-goals.rb', line 436

## Turn the captures of a goal count match into a hash.
##
##   m - match object with the (optional) captures :value, :og, :og_value,
##       :pen, :pen_value
##
## Returns a hash with :count (if :value is present) plus :og / :pen
##   (if the flag capture is present; the number defaults to 1 when no
##    explicit ..._value is given).
def self._build_goal_count( m )
  tally = {}

  if (total = m[:value])
    tally[:count] = total.to_i(10)
  end

  if m[:og]      ## own goal flag
    own = m[:og_value]
    tally[:og] = own ? own.to_i(10) : 1
  end

  if m[:pen]     ## penalty flag
    pens = m[:pen_value]
    tally[:pen] = pens ? pens.to_i(10) : 1
  end

  tally
end

._build_goal_minute(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-goals.rb', line 389

## Turn the captures of a goal minute match into a hash.
##
##   m - match object; :value is required, :value2, :og, :pen, :fk, :hdr
##       and :secs are optional
##
## Returns a hash with :m plus (if present) :offset, the flags
##   :og / :pen / :freekick / :header (always true when set) and :secs.
def self._build_goal_minute( m )
  info = { m: m[:value].to_i(10) }    ## minute is always required

  ## stoppage/injury time (offset)
  if (extra = m[:value2])
    info[:offset] = extra.to_i(10)
  end

  ## flag capture => key in the result
  { og: :og, pen: :pen, fk: :freekick, hdr: :header }.each do |capture, key|
    info[key] = true  if m[capture]
  end

  if (secs = m[:secs])
    info[:secs] = secs.to_i(10)
  end

  info
end

._build_goal_type(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-goals.rb', line 448

## Collect the goal type flags of a match.
##
##   m - match object with the (optional) captures :og, :pen, :fk, :hdr
##
## Returns a hash with :og / :pen / :freekick / :header set to true
##   for every flag capture that is present.
def self._build_goal_type( m )
  ## flag capture => key in the result
  flags = { og: :og, pen: :pen, fk: :freekick, hdr: :header }

  flags.each_with_object( {} ) do |(capture, key), kind|
    kind[key] = true  if m[capture]
  end
end

._build_minute(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-goals.rb', line 409

## Turn the captures of a minute match into a hash.
##
##   m - match object; :value is required, :value2 is optional
##
## Returns { m: } plus :offset (stoppage/injury time) if :value2 is present.
def self._build_minute( m )
  info = { m: m[:value].to_i(10) }    ## minute is always required

  extra = m[:value2]
  info[:offset] = extra.to_i(10)  if extra

  info
end

._build_score_team(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-score.rb', line 412

## Build a "generic" score pair from the captures :score_i and :score_ii.
##
##   note - the score might be full-time, after extra-time or undecided,
##          thus the neutral capture names score_i/score_ii
##
## Returns { score: [score_i, score_ii] } (both as integers).
def self._build_score_team( m )
  first  = m[:score_i].to_i(10)
  second = m[:score_ii].to_i(10)

  { score: [first, second] }
end

._build_score_team_num(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-score.rb', line 435

## Build a single-number score from the capture :score_team_num.
##
## Returns { score: n } with n as integer.
def self._build_score_team_num( m )
  { score: m[:score_team_num].to_i(10) }
end

._build_score_team_pen(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-score.rb', line 426

## Build a score pair from the captures :score_i and :score_pen.
##
## Returns { score: [score_i, score_pen] } (both as integers).
def self._build_score_team_pen( m )
  first = m[:score_i].to_i(10)
  pens  = m[:score_pen].to_i(10)

  { score: [first, pens] }
end

._build_status(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-status.rb', line 100

## Normalize the status captures of a match into a hash.
##
##   m - match object with one capture per known status (see table below),
##       the generic :status capture and the optional :status_note
##
## Returns { status: } plus :status_note if a note is present,
##   e.g. awarded; originally 2-0
def self._build_status( m )
  ## capture => normalized status text
  ##   note - probed top to bottom, first capture present wins;
  ##          annulled and voided both map to 'annulled'
  known = [
    [:postponed, 'postponed'],
    [:canceled,  'canceled'],
    [:walkover,  'walkover'],
    [:awarded,   'awarded'],
    [:suspended, 'suspended'],
    [:abandoned, 'abandoned'],
    [:annulled,  'annulled'],
    [:voided,    'annulled'],
    [:replay,    'replay'],
  ]
  hit = known.find { |capture, _text| m[capture] }

  info = {}
  ## fallback on "generic" status text (shouldn't happen)
  info[:status] = hit ? hit.last : m[:status]

  note = m[:status_note]
  info[:status_note] = note  if note

  info
end

._build_time(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-time.rb', line 95

## Turn the captures of a time match into a hash (hours/minutes as integers).
##
##   m - match object; :hour and :minute are required, :timezone is optional.
##       If :time_local is present, :local_hour and :local_minute are read
##       too (plus the optional :local_timezone),
##       e.g.  18:30 (19:30)   or   18:30 (19:30 BST)
##
## Returns { time: { h:, m: [, timezone:] } [, time_local: { h:, m: [, timezone:] }] }
##
## Raises ArgumentError if the time or the local time is out-of-range
##   (hour 0..23, minute 0..59).
##
##   note - the local time is checked against its own hour/minute
##          (the range check used to re-test the main time by mistake,
##           so an invalid local time such as 25:99 passed unnoticed)
def self._build_time( m )
  data = { time: {} }

  hour   = m[:hour].to_i(10)     ## allow 08/07/etc.
  minute = m[:minute].to_i(10)

  ## check if 24:00 possible? or only 0:00 (23:59)
  unless (0..23).cover?( hour ) && (0..59).cover?( minute )
    raise ArgumentError, "parse error - time >#{m[:time]}< out-of-range"
  end

  data[:time][:h] = hour
  data[:time][:m] = minute
  data[:time][:timezone] = m[:timezone]    if m[:timezone]

  if m[:time_local]
    data[:time_local] = {}

    local_hour   = m[:local_hour].to_i(10)    ## allow 08/07/etc.
    local_minute = m[:local_minute].to_i(10)

    unless (0..23).cover?( local_hour ) && (0..59).cover?( local_minute )
      raise ArgumentError, "parse error - local time >#{m[:time_local]}< out-of-range"
    end

    data[:time_local][:h] = local_hour
    data[:time_local][:m] = local_minute
    data[:time_local][:timezone] = m[:local_timezone]    if m[:local_timezone]
  end

  data
end

._mk_score_fuller_agg(win:) ⇒ `Object`

regex score helpers

note - MUST double escape \d e.g. \\d!!!   if not "simple" string (e.g. '' but %Q<>)

# File 'lib/sportdb/parser/token-score_fuller.rb', line 24

## Build the regex source (a string) for an aggregate score annotation.
##
##   win - if true, alternative 1 accepts an optional leading "win "
##
## Returns a %Q string with three alternatives, each capturing the
##   aggregate score as agg1/agg2:
##     1)  [win ]N-N on agg / N-N on aggregate
##     2)  N-N followed by optional spaces and the AGG_EN fragment
##     3)  agg N-N
##
##   note - \d is written as \\d because the text is a %Q string
##   note - the (?<name> ...) angle brackets are balanced and thus may nest
##          inside %Q< >
##   note - the ## lines are part of the returned text; presumably the
##          result is always embedded in an extended (x) regex - confirm
def self._mk_score_fuller_agg( win: )    ## with optional win - true|false
   %Q<
                 (?:
                    ############
                    ## opt 1)  with win
                    (?:
                       #{ win ? '(?: win [ ] )?' : '' }   
                        (?<agg1>\\d{1,2}) - (?<agg2>\\d{1,2})
                          [ ] on [ ] agg (?: regate )?  
                    )
                    |        
                    #####
                    ## opt 2)  "classic" (post)
                    (?:
                       (?<agg1>\\d{1,2}) - (?<agg2>\\d{1,2})
                          [ ]*
                        #{AGG_EN}   
                    )
                    |
                    #####
                    ## opt 3) agg up-front (pre)
                    (?:
                         agg [ ]
                       (?<agg1>\\d{1,2}) - (?<agg2>\\d{1,2})   
                    )
                 )
    >
end

._mk_score_fuller_p(win:) ⇒ `Object`

with optional win - true|false

# File 'lib/sportdb/parser/token-score_fuller.rb', line 53

## Build the regex source (a string) for a penalty shoot-out score annotation.
##
##   win - if true, alternative 1 accepts an optional leading "win "
##
## Returns a %Q string with three alternatives, each capturing the
##   penalty score as p1/p2:
##     1)  [win ]N-N on pens
##     2)  N-N followed by optional spaces and the P_EN fragment
##     3)  pen N-N  or  p N-N
##
##   note - \d is written as \\d because the text is a %Q string
##   note - the (?<name> ...) angle brackets are balanced and thus may nest
##          inside %Q< >
##   note - the ## lines are part of the returned text; presumably the
##          result is always embedded in an extended (x) regex - confirm
def self._mk_score_fuller_p( win: )    ## with optional win - true|false
   %Q<
                 (?:
                    ############
                    ## opt 1)  with win
                    (?:
                        #{ win ? '(?: win [ ] )?' : '' }
                        (?<p1>\\d{1,2}) - (?<p2>\\d{1,2})
                          [ ] on [ ] pens
                    )
                    |        
                    #####
                    ## opt 2)  "classic" (post)
                    (?:
                       (?<p1>\\d{1,2}) - (?<p2>\\d{1,2})
                          [ ]*
                        #{P_EN}   
                    )
                    |
                    #####
                    ## opt 3) up-front (pre)
                    (?:
                         (?: pen|p) [ ]
                       (?<p1>\\d{1,2}) - (?<p2>\\d{1,2})   
                    )
                 )                   
    >
end

._parse_date(str) ⇒ `Object`

# File 'lib/sportdb/parser/token_helpers.rb', line 38

## Parse a complete string as a date.
##
##   str - text to parse; leading/trailing spaces are stripped first
##
## Returns a hash with the captured parts (:y, :yy, :m, :d, :wday - only
##   those present) or nil if DATE_RE does not match or does not cover the
##   whole (stripped) string.
def self._parse_date( str )
  m = DATE_RE.match( str.strip )

  ## no match at all
  return nil  if m.nil?

  ## note - wrapping with \A \z is NOT working with the regex union,
  ##        thus check by hand that nothing is left before or after the match
  ##   todo - report the partial match as an error somehow?
  return nil  unless m.pre_match == '' && m.post_match == ''

  parts = {}
  if (year = m[:year])
    parts[:y] = year.to_i(10)
  end
  if (yy = m[:yy])                  ## two digit year (e.g. 25 or 78 etc.)
    parts[:yy] = yy.to_i(10)
  end
  if (month = m[:month])
    parts[:m] = month.to_i(10)
  end
  if (month_name = m[:month_name])
    ## names may come in any case (JULY/Jul), lookup keys are lowercase
    parts[:m] = MONTH_MAP[ month_name.downcase ]
  end
  if (day = m[:day])
    parts[:d] = day.to_i(10)
  end
  if (day_name = m[:day_name])
    parts[:wday] = DAY_MAP[ day_name.downcase ]
  end
  parts
end

._parse_goal_count(str) ⇒ `Object`

# File 'lib/sportdb/parser/token-goals.rb', line 422

## Parse a complete string as a goal count.
##
##   str - text to parse; leading/trailing spaces are stripped first
##
## Returns the hash built by _build_goal_count or nil if GOAL_COUNT_RE does
##   not match or does not cover the whole (stripped) string.
def self._parse_goal_count( str )
  m = GOAL_COUNT_RE.match( str.strip )

  ## note - a match that is NOT anchored to start and end-of-string is
  ##        treated like no match (todo - report as error somehow?)
  whole = m && m.pre_match == '' && m.post_match == ''

  whole ? _build_goal_count( m ) : nil
end

._parse_goal_minute(str) ⇒ `Object`

# File 'lib/sportdb/parser/token-goals.rb', line 374

## Parse a complete string as a goal minute.
##
##   str - text to parse; leading/trailing spaces are stripped first
##
## Returns the hash built by _build_goal_minute or nil if GOAL_MINUTE_RE does
##   not match or does not cover the whole (stripped) string.
def self._parse_goal_minute( str )
  m = GOAL_MINUTE_RE.match( str.strip )

  ## note - a match that is NOT anchored to start and end-of-string is
  ##        treated like no match (todo - report as error somehow?)
  whole = m && m.pre_match == '' && m.post_match == ''

  whole ? _build_goal_minute( m ) : nil
end

._parse_score_full(str) ⇒ `Object`

# File 'lib/sportdb/parser/token_helpers.rb', line 69

## Parse a complete string as a full score.
##
##   str - text to parse; leading/trailing spaces are stripped first
##         (same as in the other _parse_* helpers; the strip was announced
##          in the comment but missing, so padded input was rejected by
##          the anchoring check below)
##
## Returns a hash with the score pairs present - :p (penalties),
##   :et (extra-time), :ft (full-time), :ht (half-time) - or nil if
##   SCORE_FULL_RE does not match or does not cover the whole (stripped) string.
def self._parse_score_full( str )
  m = SCORE_FULL_RE.match( str.strip )

  ## note - a match that is NOT anchored to start and end-of-string is
  ##        treated like no match (todo - report as error somehow?)
  return nil  unless m && m.pre_match == '' && m.post_match == ''

  score = {}
  score[:p]  = [m[:p1].to_i,m[:p2].to_i]     if m[:p1] && m[:p2]
  score[:et] = [m[:et1].to_i,m[:et2].to_i]   if m[:et1] && m[:et2]
  score[:ft] = [m[:ft1].to_i,m[:ft2].to_i]   if m[:ft1] && m[:ft2]
  score[:ht] = [m[:ht1].to_i,m[:ht2].to_i]   if m[:ht1] && m[:ht2]
  ## score[:agg] = [m[:agg1].to_i,m[:agg2].to_i]   if m[:agg1] && m[:agg2]
  score
end

._parse_team(str) ⇒ `Object`

# File 'lib/sportdb/parser/token_helpers.rb', line 23

## Parse a complete string as a team name (text).
##
##   str - text to parse; leading/trailing spaces are stripped first
##
## Returns the match object or nil if TEXT_RE does not match or does not
##   cover the whole (stripped) string.
def self._parse_team( str )
  m = TEXT_RE.match( str.strip )

  ## no match at all
  return nil  if m.nil?

  ## note - a match that is NOT anchored to start and end-of-string is
  ##        treated like no match (todo - report as error somehow?)
  anchored = m.pre_match == '' && m.post_match == ''
  anchored ? m : nil
end

.build_map(lines, downcase: false) ⇒ `Object`

# File 'lib/sportdb/parser/token-date.rb', line 40

## Build a lookup table that maps every word to its line number.
##
##   lines    - array of lines, each an array of words
##   downcase - if true, the words are downcased before use as keys
##
## Returns a hash word => line number; counting starts with 1 (NOT zero), e.g.
##   {"january"  => 1, "jan" => 1,
##    "february" => 2, "feb" => 2,
##    "march"    => 3, "mar" => 3, ...}
##   note - if a word shows up more than once the last line wins
def self.build_map( lines, downcase: false )
  lines.each_with_index.each_with_object( {} ) do |(words, idx), lookup|
    words.each do |word|
      key = downcase ? word.downcase : word
      lookup[key] = idx + 1
    end
  end
end

.build_names(lines) ⇒ `Object`

# File 'lib/sportdb/parser/token-date.rb', line 33

## Join all words of all lines into one alternation string, e.g.
##   January|Jan|February|Feb|March|Mar|April|Apr|May|June|Jun|...
##
##   lines - array of lines, each an array of words
def self.build_names( lines )
  per_line = lines.map { |words| words.join( '|' ) }
  per_line.join( '|' )
end

.parse_date(str, start:) ⇒ `Object`

“top-level” add a date parser helper

# File 'lib/sportdb/parser/token-date.rb', line 344

## "top-level" date parser helper - find a date in str and return a Date.
##
##   str   - text with the date; the first DATE_RE match is used
##           (the match does not have to cover the whole string)
##   start - date used to work out a missing year; must answer
##           year, month and day (e.g. the start date of a season)
##
## Returns a Date. If the text has no year, the year of start is used for
##   dates on or after start's month/day and the following year otherwise
##   (e.g. 2013 or 2014 for a 2013/14 season).
##
##   note - on no match an error is printed and the process is terminated
##          via exit 1 (kept as is for backward compatibility)
##   note - numeric months are supported now too (same as in _build_date);
##          before only month names were looked up and a numeric month
##          left month at nil, which crashed further down
def self.parse_date( str, start: )
  m = DATE_RE.match( str )

  if m.nil?
    puts "!! ERROR - unexpected date format; cannot parse >#{str}<"
    exit 1
  end

  year  = m[:year].to_i(10)   if m[:year]
  month = m[:month].to_i(10)  if m[:month]
  ## names may come in any case (JULY/Jul), lookup keys are lowercase
  month = MONTH_MAP[ m[:month_name].downcase ]   if m[:month_name]
  day   = m[:day].to_i(10)    if m[:day]

  if year.nil?   ## try to calculate year
    same_year = month > start.month ||
                (month == start.month && day >= start.day)
    ## same year as start (e.g. 2013 for 2013/14 season)
    ##   or year+1 (e.g. 2014 for 2013/14 season)
    year = same_year ? start.year : start.year+1
  end

  Date.new( year,month,day )
end

.parse_names(txt) ⇒ `Object`

# File 'lib/sportdb/parser/token-date.rb', line 6

## Split a text with one list of words per line into an array of word arrays.
##
##   txt - text; blank lines and lines starting with # are skipped,
##         inline (until end-of-line) comments are cut off too
##           e.g. Janvier  Janv  Jan  ## check janv in use??
##           =>   Janvier  Janv  Jan
##
## Returns an array of lines, each an array of words
##   (words are separated by spaces or tabs).
##   todo/fix -- add check for duplicates
def self.parse_names( txt )
  txt.each_line
     .map    { |raw| raw.strip }
     .reject { |row| row.empty? || row.start_with?( '#' ) }
     .map    { |row| row.sub( /#.*/, '' ).strip.split( /[ \t]+/ ) }
end

Instance Method Details

#_build_date(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-date.rb', line 319

def _build_date( m ) self.class._build_date( m ); end

#_build_date_legs(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-date.rb', line 337

def _build_date_legs( m ) self.class._build_date_legs( m ); end

#_build_duration(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-date_duration.rb', line 185

def _build_duration(m) self.class._build_duration( m ); end

#_build_goal_count(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-goals.rb', line 443

def _build_goal_count( m ) self.class._build_goal_count( m ); end

#_build_goal_minute(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-goals.rb', line 406

def _build_goal_minute( m ) self.class._build_goal_minute( m ); end

#_build_goal_type(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-goals.rb', line 456

def _build_goal_type( m ) self.class._build_goal_type( m ); end

#_build_minute(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-goals.rb', line 418

def _build_minute( m ) self.class._build_minute( m ); end

#_build_score_team(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-score.rb', line 423

def _build_score_team( m ) self.class._build_score_team( m ); end

#_build_score_team_num(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-score.rb', line 440

def _build_score_team_num( m ) self.class._build_score_team_num( m ); end

#_build_score_team_pen(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-score.rb', line 432

def _build_score_team_pen( m ) self.class._build_score_team_pen( m ); end

#_build_status(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-status.rb', line 121

def _build_status( m ) self.class._build_status( m ); end

#_build_time(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-time.rb', line 138

def _build_time(m) self.class._build_time(m); end

#_tokenize_line(line) ⇒ `Object`

# File 'lib/sportdb/parser/lexer.rb', line 551

def _tokenize_line( line )
  tokens = []
  errors = []   ## keep a list of errors - why? why not?


  pos = 0
  ## track last offsets - to report error on no match
  ##   or no match in end of string
  offsets = [0,0]
  m = nil

  ## track number of geo text seen
  ##    (use for - do NOT break on two spaces if no geo text seen yet!!)
  geo_count = 0

  ####
  ## quick hack - keep re state/mode between tokenize calls!!!
  @re  ||= RE     ## note - switch between RE & INSIDE_RE


  if @re == RE  ## top-level
    ### check for modes once (per line) here to speed-up parsing
    ###   for now goals only possible for start of line!!
    ###        fix - remove optional [] - why? why not?
    
    ####
    ## note - ord e.g. (45) for match number can only start a (match) line
    ##                "inline" use NOT possible
    ## note -  ord (for ordinal number!!!) e.g match number (1), (42), etc.
    if (m = START_WITH_ORD.match(line))
       ## note -  strip enclosing () and convert to integer
       tokens << [:ORD, [m[:ord], { value: m[:value].to_i(10) } ]]

       offsets = [m.begin(0), m.end(0)]
       pos = offsets[1]    ## update pos
    elsif (m = START_WITH_YEAR.match(line))
       ## note -  strip enclosing () and convert to integer
       tokens << [:YEAR, m[:year].to_i(10)]

       offsets = [m.begin(0), m.end(0)]
       pos = offsets[1]    ## update pos

    ###
    ##  todo/fix - rename to START_GROUP_DEF_LINE_RE !!!!   
    elsif (m = GROUP_DEF_LINE_RE.match( line ))
      puts "  ENTER GROUP_DEF_RE MODE"   if debug?
      @re = GROUP_DEF_RE   

      tokens << [:GROUP_DEF, m[:group_def]]

      offsets = [m.begin(0), m.end(0)]
      pos = offsets[1]    ## update pos

    ###  todo/fix - rename to PROP_KEY_RE to START_WITH_PROP_KEY_RE !!!  
    elsif (m = PROP_KEY_RE.match( line ))
      ##  start with prop key (match will switch into prop mode!!!)
      ##   - fix - remove leading spaces in regex (upstream) - why? why not?
      ##
      ###  switch into new mode
      ##  switch context  to PROP_RE
        puts "  ENTER PROP_RE MODE"   if debug?
        key = m[:key]


        ### todo/fix - add prop yellow/red cards too - why? why not?
        ##  todo/fix - separate sent off and red card
        ##     sent-off - incl. red card, yellow/red card and the era before red cards!!
        if ['sent off'].include?( key.downcase) 
          @re = PROP_CARDS_RE    ## use CARDS_RE ???
          tokens << [:PROP_SENTOFF, m[:key]]   
        elsif ['red cards'].include?( key.downcase ) 
          @re = PROP_CARDS_RE    ## use CARDS_RE ???
          tokens << [:PROP_REDCARDS, m[:key]]
        elsif ['yellow cards'].include?( key.downcase )
          @re = PROP_CARDS_RE  
          tokens << [:PROP_YELLOWCARDS, m[:key]]
        elsif ['ref', 'referee', 
               'refs', 'referees'   ## note - allow/support assistant refs
              ].include?( key.downcase )
          @re = PROP_REFEREE_RE     
          tokens << [:PROP_REFEREE, m[:key]]
        elsif ['att', 'attn', 'attendance'].include?( key.downcase )
          @re = PROP_ATTENDANCE_RE
          tokens << [:PROP_ATTENDANCE, m[:key]]         
  
     #   elsif ['goals'].include?( key.downcase )
     #     @re = PROP_GOAL_RE
     #     tokens << [:PROP_GOALS, m[:key]]
         
        elsif ['penalties', 
               'penalty shootout',
               'penalty shoot-out',
               'penalty kicks'].include?( key.downcase )
          @re = PROP_PENALTIES_RE
          tokens << [:PROP_PENALTIES, m[:key]]
        else   ## assume (team) line-up
          @re = PROP_RE           ## use LINEUP_RE ???
          tokens << [:PROP, m[:key]]
        end

        offsets = [m.begin(0), m.end(0)]
        pos = offsets[1]    ## update pos
    ###
    ### todo/fix
    ###   rename to START_WITH_ROUND_DEF_OUTLINE_RE !!!!    
    elsif (m = ROUND_DEF_OUTLINE_RE.match( line ))
      puts "   ENTER ROUND_DEF_RE MODE"  if debug?
      @re = ROUND_DEF_RE   

      ## note - return ROUND_DEF NOT  ROUND_OUTLINE token
      tokens << [:ROUND_DEF, m[:round_outline]]

      offsets = [m.begin(0), m.end(0)]
      pos = offsets[1]    ## update pos
    elsif (m = ROUND_OUTLINE_RE.match( line ))
      puts "   ROUND_OUTLINE"  if debug?
      ## note - derive round level from no of (leading) markers
      ##             e.g. ▪/:: is 1, ▪▪/::: is 2, ▪▪▪/:::: is 3, etc.
      ##       note  - ascii-style starts with double ::, thus, autodecrement by one!
      round_level = m[:round_marker].size
      round_level -= 1  if m[:round_marker].start_with?( '::' ) 

      tokens << [:ROUND_OUTLINE, [m[:round_outline], 
                      { outline: m[:round_outline] , 
                        level: round_level}]]

      ## note - eats-up line for now (change later to only eat-up marker e.g. »|>>)
      offsets = [m.begin(0), m.end(0)]
      pos = offsets[1]    ## update pos
    elsif (m = START_GOAL_LINE_RE.match( line ))   ## line starting with ( - assume
      ##  switch context to GOAL_RE (goalline(s))
      ####
      ##  note - check for alternate goal line styles / formats    
      if START_GOAL_LINE_COMPAT_RE.match(line ) 
        ## "legacy" style starting with minute e.g. 
        ##  (6 Puskás 0-1, 9 Czibor 0-2, 11 Morlock 1-2, 18 Rahn 2-2,
        ##    84 Rahn 3-2)
        @re = GOAL_COMPAT_RE
        puts "  ENTER GOAL_COMPAT_RE MODE"   if debug?

        tokens << [:GOALS_COMPAT, "<|GOALS_COMPAT|>"]
      elsif START_GOAL_LINE_ALT_RE.match( line )
        ##  goals with scores e.g. 
        ##    (1-0 Franck Ribéry, 2-0 Ivica Olić, 2-1 Wayne Rooney)
        ##         -or-
        ##      (Dion Beljo  1-0 
        ##                   1-1  Andreas Gruber 
        ##   Matthias Seidl  2-1)   
        @re = GOAL_ALT_RE
        puts "  ENTER GOAL_ALT_RE MODE"   if debug?

        tokens << [:GOALS_ALT, "<|GOALS_ALT|>"]
      else
        ## "standard" / default style
        @re = GOAL_RE
        puts "  ENTER GOAL_RE MODE"   if debug?

        tokens << [:GOALS, "<|GOALS|>"]
      end

      ## note - eat-up ( for now
      ##   pass along "virtual" GOALS or GOALS_ALT token 
      ##      (see INLINE_GOALS for the starting goal line inline)     
      offsets = [m.begin(0), m.end(0)]
      pos = offsets[1]    ## update pos      
    end
  end



  old_pos = -1   ## allows to backtrack to old pos (used in geo)

  while m = @re.match( line, pos )
    # if debug?
    #  pp m
    #  puts "pos: #{pos}"
    # end
    offsets = [m.begin(0), m.end(0)]

    if offsets[0] != pos
      ## match NOT starting at start/begin position!!!
      ##  report parse error!!!
      msg =  "!! WARN - parse error (tokenize) - skipping >#{line[pos..(offsets[0]-1)]}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
      puts msg

      errors << "parse error (tokenize) - skipping >#{line[pos..(offsets[0]-1)]}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
      log( msg )
    end


    ##
    ## todo/fix - also check if possible
    ##   if no match but not yet end off string!!!!
    ##    report skipped text run too!!!

    old_pos = pos
    pos     = offsets[1]

#    pp offsets   if debug?

    ##
    ## note: racc requires pairs e.g. [:TOKEN, VAL]
    ##         for VAL use "text" or ["text", { opts }]  array


  t = if @re == ROUND_DEF_RE 
           if m[:spaces] || m[:space] 
               nil    ## skip spaces
           elsif m[:date]
            [:DATE, [m[:date], _build_date( m )]]
          elsif m[:duration]
            [:DURATION, [m[:duration], _build_duration( m )]] 
          elsif m[:sym]
              sym = m[:sym]
              case sym
              when '|' then  [:'|']
              when ':' then  [:':']
              when ',' then  [:',']
              else
                puts "!!! TOKENIZE ERROR (sym) - ignore sym >#{sym}<"
                nil  ## ignore others (e.g. brackets [])
              end
           elsif m[:any]
              ## todo/check log error
               msg = "parse error (tokenize round_def) - skipping any match>#{m[:any]}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
               puts "!! WARN - #{msg}"
  
               errors << msg
               log( "!! WARN - #{msg}" )
       
               nil   
            else
              ## report error/raise expection
               puts "!!! TOKENIZE ERROR - no match found"
               nil 
            end
      elsif @re == GROUP_DEF_RE
           if m[:spaces] || m[:space] 
               nil    ## skip spaces
           elsif m[:text]
               [:TEAM, m[:text]]  
           elsif m[:sym]
              sym = m[:sym]
              case sym
              when '|' then  [:'|']
              when ':' then  [:':']
              when ',' then  [:',']
              else
                puts "!!! TOKENIZE ERROR (sym) - ignore sym >#{sym}<"
                nil  ## ignore others (e.g. brackets [])
              end
           elsif m[:any]
              ## todo/check log error
               msg = "parse error (tokenize group_def) - skipping any match>#{m[:any]}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
               puts "!! WARN - #{msg}"
  
               errors << msg
               log( "!! WARN - #{msg}" )
       
               nil   
            else
              ## report error/raise expection
               puts "!!! TOKENIZE ERROR - no match found"
               nil 
            end
       elsif @re == GEO_RE
           ### note - possibly end inline geo on [ (and others?? in the future
           ## note: break on double spaces e.g.
           ## e.g. Jul/16 @ Arena Auf Schalke, Gelsenkirchen  Serbia 0-1 England    
           if m[:spaces]
                 ### note - do NOT break out 
                 ##           if not text seen yet!!!
                 if geo_count > 0
                    ## get out-off geo mode and backtrack (w/ next)
                    puts "  LEAVE GEO_RE MODE, BACK TO TOP_LEVEL/RE"  if debug?
                    @re = RE
                    pos = old_pos
                    next   ## backtrack (resume new loop step)
                 else 
                     nil   ## skip spaces
                 end                
           elsif m[:space] 
               nil    ## skip (single) space
           elsif m[:text]
               geo_count += 1
               [:GEO, m[:text]]   ## keep pos - why? why not?
           elsif m[:geo_end]   ## "hacky" special comma; always ends geo mode!!!
                 ## get out-off geo mode and backtrack (w/ next)
                 puts "  LEAVE GEO_RE MODE, BACK TO TOP_LEVEL/RE"  if debug?
                 @re = RE
                 pos = old_pos
                 next   ## backtrack (resume new loop step)                 
           elsif m[:sym]
              sym = m[:sym]
              ## return symbols "inline" as is - why? why not?
              ## (?<sym>[;,@|\[\]-])
              case sym
                ## note - reset geo_count to 0 (avoids break on two spaces)
                ##                     if separator seen!!
              when ',' then geo_count = 0; [:',']
              when '›' then geo_count = 0; [:',']  ## note - treat geo sep › (unicode) like comma for now!!!
              when '>' then geo_count = 0; [:',']  ## note - treat geo sep > (ascii) like comma for now!!!
              when '[' then
                 ## get out-off geo mode and backtrack (w/ next)
                 puts "  LEAVE GEO_RE MODE, BACK TO TOP_LEVEL/RE"  if debug?
                 @re = RE
                 pos = old_pos
                 next   ## backtrack (resume new loop step)                 
            else
              puts "!!! TOKENIZE ERROR (sym) - ignore sym >#{sym}<"
              nil  ## ignore others (e.g. brackets [])
            end
          elsif m[:any]
             ## todo/check log error
             msg = "parse error (tokenize geo) - skipping any match>#{m[:any]}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
             puts "!! WARN - #{msg}"
  
             errors << msg
             log( "!! WARN - #{msg}" )
       
             nil   
          else
            ## report error/raise expection
             puts "!!! TOKENIZE ERROR - no match found"
             nil 
          end
      elsif @re == PROP_CARDS_RE 
        if m[:space] || m[:spaces]
              nil    ## skip space(s)
         elsif m[:prop_name]
              [:PROP_NAME, m[:name]]
         elsif m[:minute]
              minute = {}
              minute[:m]      = m[:value].to_i(10)
              minute[:offset] = m[:value2].to_i(10)   if m[:value2]
             ## note - for debugging keep (pass along) "literal" minute
             [:MINUTE, [m[:minute], minute]]
         elsif m[:sym]
            sym = m[:sym]
            case sym
            when ',' then [:',']
            when ';' then [:';']
            when '-' then [:'-']
            else
              nil  ## ignore others (e.g. brackets [])
            end
         else
            ## report error
             puts "!!! TOKENIZE ERROR (PROP_CARDS_RE) - no match found"
             nil 
         end    
      elsif @re == PROP_RE   ### todo/fix - change to LINEUP_RE !!!!
         if m[:space] || m[:spaces]
              nil    ## skip space(s)
         elsif m[:prop_key]   ## check for inline prop keys
              key = m[:key]   
              ##  supported for now coach/trainer (add manager?)
              if ['coach', 
                  'trainer'].include?( key.downcase )
                [:COACH, m[:key]]   ## use COACH_KEY or such - why? why not?
              else
                ## report error - for unknown (inline) prop key in lineup
                nil
              end
         elsif m[:inline_captain]
              [:INLINE_CAPTAIN, m[:inline_captain]]
         elsif m[:inline_yellow]
              card = {}
              card[:m]      = m[:minute].to_i(10)  if m[:minute]
              card[:offset] = m[:offset].to_i(10)  if m[:offset]
              [:INLINE_YELLOW, [m[:inline_yellow], card]]       
         elsif m[:inline_red]
              card = {}
              card[:m]      = m[:minute].to_i(10)  if m[:minute]
              card[:offset] = m[:offset].to_i(10)  if m[:offset]
              [:INLINE_RED, [m[:inline_red], card]]       
         elsif m[:inline_yellow_red]
              card = {}
              card[:m]      = m[:minute].to_i(10)  if m[:minute]
              card[:offset] = m[:offset].to_i(10)  if m[:offset]
              [:INLINE_YELLOW_RED, [m[:inline_yellow_red], card]]       
         elsif m[:prop_name]
              [:PROP_NAME, m[:name]]
         elsif m[:minute]
              minute = {}
              minute[:m]      = m[:value].to_i(10)
              minute[:offset] = m[:value2].to_i(10)   if m[:value2]
             [:MINUTE, [m[:minute], minute]]
         elsif m[:sym]
            sym = m[:sym]
            ## return symbols "inline" as is - why? why not?
            ## (?<sym>[;,@|\[\]-])
 
            case sym
            when ',' then [:',']
            when ';' then [:';']
            when '[' then [:'[']
            when ']' then [:']']
            when '(' then [:'(']
            when ')' then [:')']
            when '-' then [:'-']
            else
              nil  ## ignore others (e.g. brackets [])
            end
         else
            ## report error
             puts "!!! TOKENIZE ERROR (PROP_RE) - no match found"
             nil 
         end
      elsif @re == PROP_ATTENDANCE_RE
         if m[:space] || m[:spaces]
              nil    ## skip space(s)
         elsif m[:enclosed_name]
              ## reserverd for use for sold out or such (in the future) - why? why not?
             [:ENCLOSED_NAME, m[:name]]
         elsif m[:num]
             [:PROP_NUM, [m[:num], { value: m[:value].to_i(10) } ]]
=begin             
         elsif m[:sym]
            sym = m[:sym]
            case sym
            when ',' then [:',']
            when ';' then [:';']
            # when '[' then [:'[']
            # when ']' then [:']']
            else
              nil  ## ignore others (e.g. brackets [])
            end
=end
         else
            ## report error
            puts "!!! TOKENIZE ERROR (PROP_ATTENDANCE_RE) - no match found"
            nil 
         end
      elsif @re == PROP_REFEREE_RE
         if m[:space] || m[:spaces]
              nil    ## skip space(s)
         elsif m[:prop_key]   ## check for inline prop keys
              key = m[:key]   
              ##  supported for now coach/trainer (add manager?)
              if ['att', 'attn', 'attendance' ].include?( key.downcase )
                [:ATTENDANCE, m[:key]]   ## use COACH_KEY or such - why? why not?
              else
                ## report error - for unknown (inline) prop key in lineup
                nil
              end
         elsif m[:prop_name]    ## note - change prop_name to player
             [:PROP_NAME, m[:name]]    ### use PLAYER for token - why? why not?
         elsif m[:num]
             [:PROP_NUM, [m[:num], { value: m[:value].to_i(10) } ]]
         elsif m[:enclosed_name]
              ## use HOLD,SAVE,POST or such keys - why? why not?
             [:ENCLOSED_NAME, m[:name]]
         elsif m[:sym]
            sym = m[:sym]
            case sym
            when ',' then [:',']
            when ';' then [:';']
 #           when '[' then [:'[']
 #           when ']' then [:']']
            else
              nil  ## ignore others (e.g. brackets [])
            end
         else
            ## report error
            puts "!!! TOKENIZE ERROR (PROP_REFEREE_RE) - no match found"
            nil 
         end       
      elsif @re == PROP_PENALTIES_RE
        if m[:space] || m[:spaces]
              nil    ## skip space(s)
         elsif m[:prop_name]    ## note - change prop_name to player
             [:PROP_NAME, m[:name]]    ### use PLAYER for token - why? why not?
         elsif m[:enclosed_name]
              ## use HOLD,SAVE,POST or such keys - why? why not?
             [:ENCLOSED_NAME, m[:name]]
         elsif m[:score]
              score = {}
              ## must always have ft for now e.g. 1-1 or such
              ###  change to (generic) score from ft -
              ##     might be score a.e.t. or such - why? why not?
              score[:score] = [m[:score1].to_i(10),
                               m[:score2].to_i(10)]  
              [:SCORE, [m[:score], score]]
         elsif m[:sym]
            sym = m[:sym]
            case sym
            when ',' then [:',']
            when ';' then [:';']
            when '[' then [:'[']
            when ']' then [:']']
            else
              nil  ## ignore others (e.g. brackets [])
            end
         else
            ## report error
            puts "!!! TOKENIZE ERROR (PROP_PENALTIES_RE) - no match found"
            nil 
         end
      elsif @re == GOAL_COMPAT_RE 
         if m[:space] || m[:spaces]
              nil    ## skip space(s)
         elsif m[:prop_name]    ## note - change prop_name to player
             [:PLAYER, m[:name]] 
         elsif m[:minute]
              minute = _build_minute( m )
             [:MINUTE, [m[:minute], minute]]
         elsif m[:goal_type]
              goal_type = _build_goal_type( m )
             [:GOAL_TYPE, [m[:goal_type], goal_type]]
         elsif m[:score]
            score = {}
             ##  note - score is "generic"
            ##      might be full-time (ft) or
            ##         after extra-time (aet) or such
            ##         or even undecided/unknown
            ##    thus, use score1/score2 and NOT ft1/ft2
            score[:score] = [m[:score1].to_i(10),
                             m[:score2].to_i(10)]  
            ## note - for debugging keep (pass along) "literal" score
            [:SCORE, [m[:score], score]]
         elsif m[:sym]
            sym = m[:sym]
            ## return symbols "inline" as is - why? why not?
            ## (?<sym>[;,@|\[\]-])
 
            case sym
            when ',' then [:',']
            when ')'  ## leave goal mode!!
                puts "  LEAVE GOAL_COMPAT_RE MODE"   if debug?
                @re = RE
                ##  note - use/return GOAL_END token   - change to GOAL_END_PAREN(THESIS)
                ##                                or GOAL_PAREN_CLOSE/END ???
                [:GOALS_END, '<|GOALS_END|>']
            else
              nil  ## ignore others (e.g. brackets [])
            end
         else
            ## report error
            puts "!!! TOKENIZE ERROR (GOAL_COMPAT_RE) - no match found"
            nil 
         end
      elsif @re == GOAL_ALT_RE 
         if m[:space] || m[:spaces]
              nil    ## skip space(s)
         elsif m[:prop_name]    ## note - change prop_name to player
             [:PLAYER, m[:name]] 
         elsif m[:goal_minute]
              minute = _build_goal_minute( m )
             [:GOAL_MINUTE, [m[:goal_minute], minute]]
         elsif m[:goal_type]
              goal_type = _build_goal_type( m )
             [:GOAL_TYPE, [m[:goal_type], goal_type]]
         elsif m[:score]
            score = {}
             ##  note - score is "generic"
            ##      might be full-time (ft) or
            ##         after extra-time (aet) or such
            ##         or even undecided/unknown
            ##    thus, use score1/score2 and NOT ft1/ft2
            score[:score] = [m[:score1].to_i(10),
                             m[:score2].to_i(10)]  
            ## note - for debugging keep (pass along) "literal" score
            [:SCORE, [m[:score], score]]
         elsif m[:sym]
            sym = m[:sym]
            ## return symbols "inline" as is - why? why not?
            ## (?<sym>[;,@|\[\]-])
 
            case sym
            when ',' then [:',']
            when ')'  ## leave goal mode!!
                puts "  LEAVE GOAL_ALT_RE MODE"   if debug?
                @re = RE
                ##  note - use/return GOAL_END token   - change to GOAL_END_PAREN(THESIS)
                ##                                or GOAL_PAREN_CLOSE/END ???
                [:GOALS_END, '<|GOALS_END|>']
            else
              nil  ## ignore others (e.g. brackets [])
            end
         else
            ## report error
            puts "!!! TOKENIZE ERROR (GOAL_ALT_RE) - no match found"
            nil 
         end
      elsif @re == GOAL_RE 
         if m[:space] || m[:spaces]
              nil    ## skip space(s)
         elsif m[:goals_none]    ## note - eats-up semicolon!! e.g. -; or - ;
             [:GOALS_NONE, "<|GOALS_NONE|>"]
         elsif m[:goal_sep_alt]
             [:GOAL_SEP_ALT, "<|GOAL_SEP_ALT|>" ]   ## e.g. dash (-) WITH leading & trailing space required    
         elsif m[:prop_name]    ## note - change prop_name to player
             [:PLAYER, m[:name]] 
         elsif m[:goal_minute]
              minute = _build_goal_minute( m )
             [:GOAL_MINUTE, [m[:goal_minute], minute]]
         elsif m[:goal_count]
              count = _build_goal_count( m ) 
              [:GOAL_COUNT, [m[:goal_count], count]]
         elsif m[:sym]
            sym = m[:sym]
            ## return symbols "inline" as is - why? why not?
            ## (?<sym>[;,@|\[\]-])
 
            case sym
            when ',' then [:',']
            when ';' then [:';']
            # when '[' then [:'[']
            # when ']' then [:']']
            when ')'  ## leave goal mode!!
                puts "  LEAVE GOAL_RE MODE"   if debug?
                @re = RE
                ##  note - use/return GOAL_END token   - change to GOAL_END_PAREN(THESIS)
                ##                                or GOAL_PAREN_CLOSE/END ???
                [:GOALS_END, '<|GOALS_END|>']
            else
              nil  ## ignore others (e.g. brackets [])
            end
         else
            ## report error
            puts "!!! TOKENIZE ERROR (GOAL_RE) - no match found"
            nil 
         end
      ###################################################
      ## assume TOP_LEVEL (a.k.a. RE) machinery
      else  
        if m[:space] || m[:spaces]
           nil   ## skip space(s)
        elsif m[:text]
          ##  note - top-level (for now always) assumes TEAM for TEXT match!!
          [:TEAM, m[:text]]   ## keep pos - why? why not?
        elsif m[:status]   ## (match) status e.g. cancelled, awarded, etc.
            [:STATUS, [m[:status], _build_status( m ) ]]
        elsif m[:inline_wo]   ## w/o - walkover  (match status)
            [:INLINE_WO, m[:inline_wo]]
        elsif m[:inline_np]   ## n/p - not played (match status)
            [:INLINE_NP, m[:inline_np]]         
        elsif m[:inline_bye]  ## bye  (match status)
            [:INLINE_BYE, m[:inline_bye]]
        elsif m[:inline_abd]  ## abd/abd. - abandoned (match status)
            [:INLINE_ABD, m[:inline_abd]]
        elsif m[:inline_void]  ## abd/abd. - abandoned (match status)
            [:INLINE_VOID, m[:inline_void]]
        elsif m[:inline_susp]  ## susp/susp. - suspended (match status)
            [:INLINE_SUSP, m[:inline_susp]]
        elsif m[:inline_ppd]  ## ppd/ppd. or postp/postp. - postponed (match status)
            [:INLINE_PPD, m[:inline_ppd]]
        elsif m[:inline_awd]  ## awd/awd. - awarded (match status)
            [:INLINE_AWD, m[:inline_awd]]
        elsif m[:inline_canc]  ## canc/canc. - cancelled/canceled (match status)
            [:INLINE_CANC, m[:inline_canc]]

        elsif m[:team_home]
            [:TEAM_HOME, m[:team_home]]
        elsif m[:team_away]
            [:TEAM_AWAY, m[:team_away]]
        elsif m[:team_neutral]
            [:TEAM_NEUTRAL, m[:team_neutral]]

        elsif m[:attendance]
             att = {} 
             att[:value] = m[:value].gsub( '_', '' ).to_i(10)
             ## note - for token id use INLINE_ATTENDANCE  (ATTENDANCE in use for prop!!!) 
            [:INLINE_ATTENDANCE, [m[:attendance], att ]]
        elsif m[:note]
            ###  todo/check:
            ##      use value hash - why? why not? or simplify to:
            ## [:NOTE, [m[:note], {note: m[:note] } ]]
             [:NOTE, m[:note]] 
        elsif m[:time]
            [:TIME, [m[:time], _build_time(m)]]
        elsif m[:date]
            [:DATE, [m[:date], _build_date(m)]]
        elsif m[:date_legs]
            [:DATE_LEGS, [m[:date_legs], _build_date_legs(m)]] 
        elsif m[:score_team]
            [:SCORE_TEAM, [m[:score_team], _build_score_team(m)]] 
        elsif m[:score_team_pen]
            [:SCORE_TEAM_PEN, [m[:score_team_pen], _build_score_team_pen(m)]] 
        elsif m[:score_team_num]
            [:SCORE_TEAM_NUM, [m[:score_team_num], _build_score_team_num(m)]]
          elsif m[:score_legs]
              legs = {}
              
              ### leg1
              score = {}
              score[:ft] = [m[:leg1_ft1].to_i(10),
                            m[:leg1_ft2].to_i(10)] 
              legs['leg1'] = score
              
              ### leg2
              score = {}
              score[:ft] = [m[:leg2_ft1].to_i(10),
                            m[:leg2_ft2].to_i(10)]  if m[:leg2_ft1] && m[:leg2_ft2]
              score[:et] = [m[:leg2_et1].to_i(10),
                            m[:leg2_et2].to_i(10)]  if m[:leg2_et1] && m[:leg2_et2]
              score[:p]  = [m[:leg2_p1].to_i(10),
                            m[:leg2_p2].to_i(10)]  if m[:leg2_p1] && m[:leg2_p2]
              legs['leg2'] = score
              
              ## check for (opt) aggregate - keep on "top-level"
              legs[:agg] = [m[:agg1].to_i(10),
                            m[:agg2].to_i(10)]  if m[:agg1] && m[:agg2]
              legs[:away] = true  if m[:away]  
              
              ## note - for debugging keep (pass along) "literal" score
              [:SCORE_LEGS, [m[:score_legs], legs]]
        elsif m[:score_full]
              score = {}
              score[:p] = [m[:p1].to_i(10),
                           m[:p2].to_i(10)]  if m[:p1] && m[:p2]
              score[:et] = [m[:et1].to_i(10),
                            m[:et2].to_i(10)]  if m[:et1] && m[:et2]
              score[:ft] = [m[:ft1].to_i(10),
                            m[:ft2].to_i(10)]  if m[:ft1] && m[:ft2]
              score[:ht] = [m[:ht1].to_i(10),
                            m[:ht2].to_i(10)]  if m[:ht1] && m[:ht2]
              score[:agg] = [m[:agg1].to_i(10),
                             m[:agg2].to_i(10)]  if m[:agg1] && m[:agg2]

              if m[:away1] && m[:away2]               
                 score[:away] = [m[:away1].to_i(10),
                                 m[:away2].to_i(10)]
              elsif m[:away]    ## fallback if no away score; check away flag
                 score[:away] = true
              end  

              ## add golden/silver flags
              score[:golden] = true   if m[:aetgg]  ## golden goal (gg)/sudden death (sd)
              score[:silver] = true   if m[:aetsg]  ## silver goal (sg)

            ## note - for debugging keep (pass along) "literal" score
            [:SCORE_FULL, [m[:score_full], score]]
        elsif m[:score_fuller]
              score = {}
              score[:p] = [m[:p1].to_i(10),
                           m[:p2].to_i(10)]  if m[:p1] && m[:p2]
              score[:et] = [m[:et1].to_i(10),
                            m[:et2].to_i(10)]  if m[:et1] && m[:et2]
              score[:ft] = [m[:ft1].to_i(10),
                            m[:ft2].to_i(10)]  if m[:ft1] && m[:ft2]
              score[:ht] = [m[:ht1].to_i(10),
                            m[:ht2].to_i(10)]  if m[:ht1] && m[:ht2]
              score[:agg] = [m[:agg1].to_i(10),
                             m[:agg2].to_i(10)]  if m[:agg1] && m[:agg2]
              if m[:away1] && m[:away2]               
                 score[:away] = [m[:away1].to_i(10),
                                 m[:away2].to_i(10)]
              elsif m[:away]    ## fallback if no away score; check away flag
                 score[:away] = true
              end  

              ## add aet flag true/false
              # score[:aet] = true   if m[:aet] || m[:aetgg] || m[:aetsg]
              
              ## add golden/silver flags
              score[:golden] = true   if m[:aetgg]  ## golden goal (gg)/sudden death (sd)
              score[:silver] = true   if m[:aetsg]  ## silver goal (sg)

            ## note - for debugging keep (pass along) "literal" score
            [:SCORE_FULLER, [m[:score_fuller], score]]
        elsif m[:score_fuller_more]
               ##    SCORE + SCORE_FULLER_MORE
               ## note -  after extra-time (aet) or full-time (ft) 
               ##           score may be present in SCORE!!! 
              score = {}
              score[:p] = [m[:p1].to_i(10),
                           m[:p2].to_i(10)]  if m[:p1] && m[:p2]
              score[:et] = [m[:et1].to_i(10),
                            m[:et2].to_i(10)]  if m[:et1] && m[:et2]
              score[:ft] = [m[:ft1].to_i(10),
                            m[:ft2].to_i(10)]  if m[:ft1] && m[:ft2]
              score[:ht] = [m[:ht1].to_i(10),
                            m[:ht2].to_i(10)]  if m[:ht1] && m[:ht2]
              score[:agg] = [m[:agg1].to_i(10),
                             m[:agg2].to_i(10)]  if m[:agg1] && m[:agg2]
              if m[:away1] && m[:away2]               
                 score[:away] = [m[:away1].to_i(10),
                                 m[:away2].to_i(10)]
              elsif m[:away]    ## fallback if no away score; check away flag
                 score[:away] = true
              end  

              ## add flag in score for et/ft/ht
              score[:score] = 'et'   if m[:aet] || m[:aetgg] || m[:aetsg]
              score[:score] = 'ft'   if m[:ft]
              score[:score] = 'ht'   if m[:ht]

              ## add golden/silver flags
              score[:golden] = true   if m[:aetgg]  ## golden goal (gg)/sudden death (sd)
              score[:silver] = true   if m[:aetsg]  ## silver goal (sg)

            ## note - for debugging keep (pass along) "literal" score
            [:SCORE_FULLER_MORE, [m[:score_fuller_more], score]]
        elsif m[:score]
            score = {}
             ##  note - score is "generic"
            ##      might be full-time (ft) or
            ##         after extra-time (aet) or such
            ##         or even undecided/unknown
            ##    thus, use score1/score2 and NOT ft1/ft2
            score[:score] = [m[:score1].to_i(10),
                             m[:score2].to_i(10)]  
         ## note - for debugging keep (pass along) "literal" score
          [:SCORE, [m[:score], score]]
        elsif m[:score_awd]   ## score awarded (awd/awd.)
            score = {}
            ### note - use "generic" score for now
            ##         to match  A 3-0 B [awarded] etc.
            score[:score] = [m[:score1].to_i(10),
                             m[:score2].to_i(10)]  
            ## add score[:awarded] = true ???
            ##    or only use match status to avoid duplicate?
            [:SCORE_AWD, [m[:score_awd], score]]
        elsif m[:score_abd]   ## score abandonded (abd/abd.)
            score = {}
            ### note - use "generic" score for now
            score[:score] = [m[:score1].to_i(10),
                             m[:score2].to_i(10)]  
            ## add score[:awarded] = true ???
            ##    or only use match status to avoid duplicate?
            [:SCORE_ABD, [m[:score_abd], score]]
      elsif m[:minute]
              minute = {}
              minute[:m]      = m[:value].to_i(10)
              minute[:offset] = m[:value2].to_i(10)   if m[:value2]
             ## note - for debugging keep (pass along) "literal" minute
             [:MINUTE, [m[:minute], minute]]
        elsif m[:vs]
           [:VS, m[:vs]]
        elsif m[:sym]
          sym = m[:sym]
          ## return symbols "inline" as is - why? why not?
          ## (?<sym>[;,@|\[\]-])
 
          case sym
          when '@'    ##  enter geo mode
            puts "  ENTER GEO_RE MODE"  if debug?
            @re = GEO_RE
            geo_count = 0
            [:'@']
          when ',' then [:',']
          when ';' then [:';']
          when '/' then [:'/']
          when '|' then [:'|']
          when '[' then [:'[']
          when ']' then [:']']
          when '-' then [:'-']    
          when '('    ## enter goal scorer mode on "free-floating" open paranthesis!!!
             puts "  ENTER GOAL_RE MODE"   if debug?
             @re = GOAL_RE
              ## note - eat-up ( for now; do NOT pass along as token
              ##       pass along "virutal" INLINE GOALS - why? why not?
              [:INLINE_GOALS, "<|INLINE_GOALS|>"]
          when ')' then [:')']
          else
            puts "!!! TOKENIZE ERROR (sym) - ignore sym >#{sym}<"
            nil  ## ignore others (e.g. brackets [])
          end
        elsif m[:any]
           ## todo/check log error
           msg = "parse error (tokenize) - skipping any match>#{m[:any]}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
           puts "!! WARN - #{msg}"

           errors << msg
           log( "!! WARN - #{msg}" )
     
           nil   
        else
          ## report error
           puts "!!! TOKENIZE ERROR - no match found"
           nil 
        end
      end


    tokens << t    if t

#    if debug?
#      print ">"
#      print "*" * pos
#      puts "#{line[pos..-1]}<"
#    end
  end

  ## check if no match in end of string
  if offsets[1] != line.size
    msg =  "!! WARN - parse error - skipping >#{line[offsets[1]..-1]}< @#{offsets[1]},#{line.size} in line >#{line}<"
    puts msg
    log( msg )

    errors << "parse error (tokenize) - skipping >#{line[offsets[1]..-1]}< @#{offsets[1]},#{line.size} in line >#{line}<"
  end


  # if @re == GOAL_RE   ### ALWAYS switch back to top level mode
  #   puts "  LEAVE GOAL_RE MODE, BACK TO TOP_LEVEL/RE"  if debug?
  #   @re = RE 
  # end
 
   if @re == GEO_RE   ### ALWAYS switch back to top level mode
     puts "  LEAVE GEO_RE MODE, BACK TO TOP_LEVEL/RE"  if debug?
     @re = RE 
   end
 
   @re = RE  if @re == GROUP_DEF_RE   ### ALWAYS switch back to top level mode
   @re = RE  if @re == ROUND_DEF_RE

   ##
   ## if in prop mode continue if   last token is [,-]
   ##        otherwise change back to "standard" mode
   if @re == PROP_RE            || @re == PROP_CARDS_RE ||
      @re == PROP_PENALTIES_RE ||
      @re == PROP_ATTENDANCE_RE || @re == PROP_REFEREE_RE
     if [:',', :'-', :';'].include?( tokens[-1][0] )
        ## continue/stay in PROP_RE mode
        ##  todo/check - auto-add PROP_CONT token or such
        ##                to help parser with possible NEWLINE
        ##                  conflicts  - why? why not?
     else
        ## switch back to top-level mode!!
        puts "  LEAVE PROP_RE MODE, BACK TO TOP_LEVEL/RE"  if debug?
        @re = RE 
        ## note - auto-add PROP_END (<PROP_END>)
        tokens << [:PROP_END, "<|PROP_END|>"]    
     end
   end

  
  [tokens,errors]
end

#_tokenize_tty_line(line) ⇒ `Object`

# File 'lib/sportdb/parser/lexer_tty.rb', line 59

##
## Tokenize a single teletype (tty) line.
##
##  The line is stripped first, then scanned left-to-right with TTY_RE.
##  Text runs become [:TTY_TEXT, "text"] pairs, numbers become
##  [:TTY_NUM, Integer] pairs; space(s) are dropped.
##
##  Any text that gets skipped over (between two matches or after
##  the last match) is reported via puts and log - no error list is
##  collected here.
##
##  Returns the array of token pairs (possibly empty).
def _tokenize_tty_line( line )
  stripped = line.strip
  result   = []

  ## cursor   - where the next match is expected to start
  ## last_end - where the most recent match ended (0 if none so far)
  cursor   = 0
  last_end = 0

  while (md = TTY_RE.match( stripped, cursor ))
    first    = md.begin(0)
    last_end = md.end(0)

    ## regex skipped over some chars before matching - report the gap
    unless first == cursor
      gap     = stripped[cursor..(first-1)]
      warning = "!! WARN - tokenize (tty) error - skipping >#{gap}< @#{first},#{last_end} in line >#{stripped}<"
      puts warning
      log( warning )
    end

    cursor = last_end

    if md[:spaces] || md[:space]
      next      ## space(s) never produce a token
    elsif md[:text]
      result << [:TTY_TEXT, md[:text]]
    elsif md[:num]
      result << [:TTY_NUM, md[:num].to_i(10)]
    else
      ## matched, but none of the known named groups is set
      puts "!!! TTY TOKENIZE ERROR - no match found"
    end
  end

  ## anything left after the last match? report the unmatched tail
  unless last_end == stripped.size
    tail    = stripped[last_end..-1]
    warning = "!! WARN - tokenize (tty) error - skipping >#{tail}< @#{last_end},#{stripped.size} in line >#{stripped}<"
    puts warning
    log( warning )
  end

  result
end

#debug? ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/sportdb/parser/lexer.rb', line 32

def debug?() @debug == true; end

#is_group?(text) ⇒ `Boolean`

todo/fix - use LangHelper or such

 e.g.     class Lexer
              include LangHelper
          end

merge back Lang into Lexer - why? why not?

keep the “old” access for checking group, round & friends for now (for compatibility)

Returns:

(Boolean)

# File 'lib/sportdb/parser/lexer.rb', line 26

def is_group?( text ) Lang.is_group?( text ); end

#is_round?(text) ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/sportdb/parser/lexer.rb', line 27

def is_round?( text ) Lang.is_round?( text ); end

#log(msg) ⇒ `Object`

# File 'lib/sportdb/parser/lexer.rb', line 7

##
## Append msg plus a trailing newline to ./logs.txt
##   (relative to the current working directory; opened in
##    append mode with utf-8 encoding).
def log( msg )
   logfile = File.open( './logs.txt', 'a:utf-8' )
   begin
     logfile.write( msg )
     logfile.write( "\n" )
   ensure
     ## always release the file handle - even if a write fails
     logfile.close
   end
end

#tokenize_with_errors ⇒ `Object`

# File 'lib/sportdb/parser/lexer.rb', line 107

##
# Tokenize the lexer's source text (@txt) and collect tokenize errors.
#
# Processing steps (in order):
#  1. preprocess the whole text
#       - normalize windows-style newlines (\r\n) to \n
#       - replace html-style comments (<!-- -->) with two spaces
#       - fold multi-line nota bene blocks and [] blocks into a single line
#         (by replacing \n with ↵)
#  2. tokenize line-by-line; comment lines are skipped
#     (magic comments may turn on teletype mode), __END__ stops processing
#  3. transform token sequences per line (e.g. DATE TIME => DATETIME)
#     to help along the (racc look ahead 1 - LA1) parser
#  4. flatten the per-line tokens and auto-add NEWLINE tokens
#
# Side effects:
#   - resets @teletype and updates @re (regex mode is kept between calls)
#   - prints debug / warn messages via puts
#
# Returns a two-element array [tokens, errors]:
#   - tokens - flat array of tokens (each token is a [type, value] pair)
#   - errors - errors as returned (and accumulated) from _tokenize_line
def tokenize_with_errors

####
##   flags / modes
    @teletype = false     # use magic comment - tty/teletype: true



    tokens_by_line = []   ## note: add tokens line-by-line (flatten later)
    errors         = []   ## keep a list of errors - why? why not?

    ##  preprocess automagically - why? why not?
    ##   strip lines with comments and empty lines stripped / removed
    ##      keep empty lines? why? why not?
    ##      keep leading spaces (indent) - why?
    ##
    ##  note - KEEP empty lines (get turned into BLANK token!!!!)


    ##  "universal" newlines
    ##    replace all windows-style  cr+lf (\r\n) to lf (\n) only
    txt = @txt.gsub( "\r\n", "\n" )



    ###
    ## quick hack for now
    ##   remove  html-style comments <!-- -->
    ##           (incl. multi-line)  with two spaces
    ##       will mess-up lineno tracking!!!
    ##    fix later to have function lineno & colno!!!
    ##
    ##  note - MUST continue with txt (NOT @txt) here;
    ##           otherwise the newline normalization from above gets lost
    ##           and the multi-line regexes below (matching on \n only)
    ##           will NOT work with windows-style newlines
    txt = txt.gsub( HTML_COMMENT_RE ) do |m|
                        puts " [debug] preproc html comment:"
                        puts m
                        '  '
                   end


=begin                 
##
##  todo/fix - add a command line switch/option for auto-format fixes !!!
   ##  quick hack - remove later
   ##    auto-convert "old" legacy round markers (») 
   txt = txt.gsub( %r{^ [ ]*
                          »
                        (?= [ ]+)  ## require one trailing space for now!!
                        }ix ) do |_|
                     puts "!! WARN - auto-fix format; replacing old (alternate/legacy) round marker (»)"
                        '▪'
                    end   


###  16.00 => 16:00
##     todo/check - use space for positive lookbehind & ahead
##                      (instead of \b) - why? why not?
##  note - check for/exclude 12.12.  date in match
##             use negative lookahead
##   check for 12.12.94
##      use   positive lookbehind   !!!
##               must be space, comma or begin-of-line [ ,]|^
##    or use negative lookbehind
##               must NOT be dot 
   txt = txt.gsub(  %r{  
                        ## check NEGATIVE lookbehind
                         (?<! [.])  ## do NOT match 12.94 in 12.12.94  
                          \b
                        (?<h>\d{1,2})
                           \.
                        (?<m>\d{2})
                          \b
                        (?! [.] )   ## do NOT match 12.12.  
                        }ix ) do |_|
                           m = $~   ## is $LAST_MATCH_DATA
                        puts "!! WARN - auto-fix format; replacing old (alternate/legacy) time format #{m[0]}"
                           "#{m[:h]}:#{m[:m]}"   ## '\1:\2'
                        end
=end




    ###
    ## add more "native" multi-line comment-styles
    ##  e.g.    #[[ ... ]]  or  #<<< .. >>> or #<< .. >>
    ##                 or such - why? why not?


    ## fold (multi-line) note/nota bene blocks into a single line
    txt = txt.gsub( PREPROC_NOTA_BENE_RE ) do |m|
       if m.include?( "\n" )   ## check for newlines (\n) and replace
         puts " [debug] preproc (multi-line) note/nota bene block:"
         puts m
         ## todo/check: replace with two spaces instead of ↵ - why? why not?
         m.gsub( "\n", '↵' )
       else
         m
       end
    end


    ##
    ## e.g. used in (multi-line) TableNote
    ##  1.SOUTH KOREA   6  5  1  0 22- 1 16  [0-0]
    ##  2.LEBANON       6  3  1  2 11- 8 10  [0-2, 0-0]
    ##  3.Turkmenistan  6  3  0  3  8-11  9  [3-1]
    ##  4.Sri Lanka     6  0  0  6  2-23  0  [0-1]
    ##  -.North Korea   [withdrew after playing 5 matches due to safety concerns in
    ##                   connection with the Covid-19 pandemic; all results annulled]
    ##
    ##  note - no longer used for now
    ##     enclose multi-line notes in []
    ##         removes need for line continuation for now

##
##   txt = txt.gsub( LINE_CONTINUATION_RE ) do |_|
##            puts " [debug] preproc line continuation"
##              ## todo/check: replace with two spaces instead of ↵ - why? why not?
##               '↵'
##         end



    #####
    ## (another) quick hack for now
    ##   turn multi-line note blocks into
    ##             single-line note blocks
    ##             by changing newline (\n) to ↵ (unicode U+21B5)
    ##              or why not  to ___ ?
    ##
    ##  unicode options for return/arrows:
    ##   -  ↵ (U+21B5): Downwards Arrow With Corner Leftwards.
    ##                This is the most common "carriage return" symbol.
    ##   -  ⏎ (U+23CE): Return Symbol.
    ##               Specifically designated as the keyboard's "Return" key symbol,
    ##                often used in user interfaces.

    txt = txt.gsub( PREPROC_BLOCK_RE ) do |m|
       if m.include?( "\n" )   ## check for newlines (\n) and replace
         puts " [debug] preproc (multi-line) block:"
         puts m
         ## todo/check: replace with two spaces instead of ↵ - why? why not?
         m.gsub( "\n", '↵' )
       else
         m
       end
    end


    ####
    ## quick hack - keep re state/mode between tokenize calls!!!
    @re  ||= RE     ## note - switch between RE & INSIDE_RE


    txt.each_line do |line|
        ## line = line.rstrip   ## note - MUST remove/strip trailing newline (spaces optional)!!!
        line = line.strip   ## note - strip leading AND trailing whitespaces
                            ## note - trailing whitespace may incl. \n or \r\n!!!


        ##
        ###
        ##  check for magic comments
        ##     e.g  # teletype: true    or TELETYPE: TRUE
        ##             tty/teletype

        if line.start_with?('#')   ###  skip comments (& check magic comments!!)

           if (m = MAGIC_COMMENT_RE.match(line))
              magic_comment_key   = m[:magic_comment_key].downcase
              magic_comment_value = m[:magic_comment_value].downcase

              ##   turn on teletype mode
              ## e.g.  tty: true  or teletype: true
              if ['tty', 'teletype'].include?( magic_comment_key ) &&
                 ['true'].include?( magic_comment_value )
                 puts " magic comment - turn on teletype (tty) mode"
                 @teletype = true
              end
           end

           next
        end

        line = line.sub( /#.*/, '' ).strip   ###  cut-off end-of line comments too


        ####
        #  support __END__ marker to cut-off input
        break if line.strip == '__END__'



        ##
        ##  first check for tabs
        ##    add error/warn
        ##    for auto-fix - replace tabs with two spaces

        line = line.gsub( "\t" ) do |_|
                  ## report error here
                  ## todo/add error here
                  puts "!! WARN - auto-fix; replacing tab (\\t) with two spaces in line #{line.inspect}"
                   "  "   ## replace with two spaces
                 end


        ## U+00A0 (160)  -- non-breaking space (unicode)
        line = line.gsub( "\u00A0" ) do |uni|
                  ## report error here
                  ## todo/add error here
                  puts "!! WARN - auto-fix; replacing non-breaking unicode space (#{uni}/#{uni.ord}) w/ ascii space ( /#{" ".ord}) in line #{line.inspect}"
                   " "   ## replace with space
                 end

        ###
        ## todo/fix - print unicode numbers for [–−]
        ##                different candidates to differentiate and document!!!
        ##   – => U+2013 (8211)     -- En Dash     (unicode)
        ##   − => U+2212 (8722)     -- Minus Sign  (unicode)
        line = line.gsub( /[–−]/ ) do |uni|
                  ## report error here
                  ## todo/add error here
                  puts "!! WARN - auto-fix; replacing unicode dash (#{uni}/#{uni.ord}) w/ ascii dash (-/#{"-".ord}) in line #{line.inspect}"
                   '-'   ## replace with ascii dash (-)
                  end



        puts "line: >#{line}<"    if debug?

        ######
        ### special case for empty line (aka BLANK)
        if line.empty?
           ## note - blank always resets parser mode to std/top-level!!!
           @re = RE
           tokens_by_line << [[:BLANK, '<|BLANK|>']]
        elsif (m = HEADING_RE.match(line))
           ## note - heading always resets parser mode to std/top-level!!!
           @re = RE
           puts "   HEADING"  if debug?
           ## note - derive heading level from no of (leading) markers
           ##             e.g. = is 1, == is 2, === is 3, etc.
           heading_level = m[:heading_marker].size
           tokens_by_line << [[:"H#{heading_level}", m[:heading]]]
        elsif (m = NOTA_BENE_RE.match(line))
           ## note - nota bene always resets parser mode to std/top-level!!!
           @re = RE
           tokens_by_line << [[:NOTA_BENE, m[:nota_bene]]]
        elsif @re == RE && (m = TABLE_RE.match(line))
            @re = TABLE_MORE_RE  ## switch into table mode
            if m[:table_heading]
              tokens_by_line << [[:TABLE_HEADING, m[:table_heading]]]
            else  ## assume table (line) e.g. m[:table]
              tokens_by_line << [[:TABLE_LINE, line]]
            end
        elsif @re == TABLE_MORE_RE
            ### todo/fix - check if no match and report/add error!!
            ##        for now (unmatched) line gets auto-added as table line!!!
            ##
            ##   note - MUST be followed by blank line (or nota bene/heading)
            ##            to switch back into to top-level!!!!
            m = TABLE_MORE_RE.match(line)
            ## note - m is nil if there is no match; guard the lookups
            ##          to fall through to (plain) table line
            if m && m[:table_note]
              tokens_by_line << [[:TABLE_NOTE, m[:table_note]]]
            elsif m && m[:table_divider]
              tokens_by_line << [[:TABLE_DIVIDER, m[:table_divider]]]
            else  ## assume table (line) e.g. m[:table]
              tokens_by_line << [[:TABLE_LINE, line]]
            end
        elsif @re != TABLE_MORE_RE &&  (m = HRULER_RE.match(line))
           ## note - hruler (---)
           ##          will only match if NOT in table mode!!!
           ##   otherwise
           ##      hruler always resets parser mode to std/top-level!!!
           @re = RE
           tokens_by_line << [[:HRULER, '<|HRULER|>']]
        elsif @teletype && (@re == RE && IS_TTY_LINE_RE.match(line))
            ## try experimental TELETYPE (TTY) mode!!!
            ##    note - turn on via magic comment e.g.  tty/teletype: true
            ###
            ###    move inside _tokenize_line - why? why not?


            tokens_by_line << _tokenize_tty_line( line )

            ##   note - dates such as
            ##         APR 11 or 11 APR   will trigger TELETYPE
            ###    ## check letter
        else

          more_tokens, more_errors = _tokenize_line( line )

          tokens_by_line  << more_tokens
          errors          += more_errors
        end
    end # each line





    tokens_by_line = tokens_by_line.map do |tokens|
        #################
        ##    transform tokens (using simple patterns)
        ##      to help along the (racc look ahead 1 - LA1) parser
        nodes = []

        buf = Tokens.new( tokens )
        ## pp buf

        loop do
          break if buf.eos?

          if buf.match?( :DATE, :TIME )   ## merge DATE TIME into DATETIME
               date = buf.next[1]
               time = buf.next[1]
               ## puts "DATETIME:"
               ## pp date, time
               ##  note:  time value is { time: {} } or
               ##                       { time: {}, time_local {} }
               val =  [date[0] + ' ' + time[0],  ## concat string of two tokens
                        { date: date[1] }.merge( time[1] )
                      ]
               nodes << [:DATETIME, val]
          ### support  date time with comma too - why? why not?
          elsif buf.match?( :DATE, :',', :TIME )
               date  = buf.next[1]
               _    = buf.next  ## ignore comma
               time = buf.next[1]
               ## puts "DATETIME:"
               ## pp date, time
               val =  [date[0] + ', ' + time[0],  ## concat string of two tokens
                        { date: date[1] }.merge( time[1] )
                      ]
               nodes << [:DATETIME, val]
          elsif buf.match?( :TEAM, :SCORE_TEAM )
               ## merge TEAM SCORE_TEAM into TEAMALT
               ##     (use TEAMENTRY or TEAMRESULT - why? why not?)
               team       = buf.next[1]
               score_team = buf.next[1]
               val =  [team + ' ' + score_team[0],  ## concat string of two tokens
                        { team: team }.merge( score_team[1] )
                      ]
               nodes << [:TEAMALT, val]
          elsif buf.match?( :TEAM, :SCORE_TEAM_PEN )
               team           = buf.next[1]
               score_team_pen = buf.next[1]
               val =  [team + ' ' + score_team_pen[0],  ## concat string of two tokens
                        { team: team }.merge( score_team_pen[1] )
                      ]
               nodes << [:TEAMALT_PEN, val]
          elsif buf.match?( :TEAM, :SCORE_TEAM_NUM )
               team           = buf.next[1]
               score_team_num = buf.next[1]
               val =  [team + ' ' + score_team_num[0],  ## concat string of two tokens
                        { team: team }.merge( score_team_num[1] )
                      ]
               nodes << [:TEAMALT_NUM, val]
          elsif buf.match?( :GOAL_MINUTE, :',', :GOAL_MINUTE )
             ## note - only advance by two tokens!
             ##     allows more :GOAL_MINUTE sequences!! e.g. 12,13,14 etc!!!
             ##
             ## help parser with comma shift/reduce conflict
             ##   change ',' to GOAL_MINUTE_SEP !!!
             nodes << buf.next   ## pass through goal_minute
             _ = buf.next  ## eat-up goal_minute_sep a.k.a. comma (,)
                           ##   and replace with dedicated sep(arator)
             nodes << [:GOAL_MINUTE_SEP,"<|GOAL_MINUTE_SEP|>"]
          elsif buf.match?( :',', :INLINE_ATTENDANCE )
             ## note  - allow optional comma before inline attendance
             ## help parser with comma shift/reduce conflict
             ##   change ',' to INLINE_ATTENDANCE_SEP !!!
             nodes << [:INLINE_ATTENDANCE_SEP, "<|INLINE_ATTENDANCE_SEP|>"]
             _ = buf.next  ## eat-up inline_attendance_sep a.k.a. comma (,)
                           ##   and replace with dedicated sep(arator)
             nodes << buf.next   ## pass through inline_attendance
          else
             ## pass through
             nodes << buf.next
          end
        end  # loop
        nodes
    end  # map tokens_by_line




    ## flatten tokens
    tokens = []
    tokens_by_line.each do |tok|

         if debug?
           pp tok
         end


         ###############
         ##   "hacky" (automagic) line merges (remove newline)
         ## if line start with @  - check if incl. teams

         ###
         ### quick merge lines hack
         ##    if line starts with geo-marker token @
         ##            check if line incl. TEAM
         ##           if yes, leave alone
         ##            otherwise  merge line into previous line!!
         ##       - todo/fix - handle in possibly in grammar!!!
         ##        for now match_line CAN start with @ London
         ##                 resulting in parser conflict(s)!!!
         ##    e.g.
         ##       England v Scotland
         ##          @ London
         ##          =>
         ##        England v Scotland @ London
         ##

         ##
         ##  note/todo - if INDENT / SPACES get added
         ##                adjust here
         ##   tok[0][0] == :INDENT  (or :SPACES) &&
         ##   tok[1][0] == :'@'

         if tok[0] && tok[0][0] == :'@'
              team =  tok.find { |t| t[0] == :TEAM }
              if team
                 ## do nothing - keep as is (assume match_line starting w/ @)
              else
                ## no team(s) found in line
                ##    remove last token (that is, NEWLINE)
                ##   note - possibly is blank ?!  keep blank
                ##   note - tokens is empty if this is the very first line;
                ##            nothing to merge into - guard against nil
                tokens.pop  if tokens[-1] && tokens[-1][0] == :NEWLINE
              end
         end


         tokens  += tok
         ## auto-add newlines  (unless BLANK!!)
         tokens  << [:NEWLINE, "\n"]   unless tok[0] && tok[0][0] == :BLANK
    end

    [tokens,errors]
end

Class: SportDb::Lexer

Constant Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(lines, debug: false) ⇒ Lexer

Class Method Details

._build_date(m) ⇒ Object

._build_date_legs(m) ⇒ Object

._build_duration(m) ⇒ Object

._build_goal_count(m) ⇒ Object

._build_goal_minute(m) ⇒ Object

._build_goal_type(m) ⇒ Object

._build_minute(m) ⇒ Object

._build_score_team(m) ⇒ Object

._build_score_team_num(m) ⇒ Object

._build_score_team_pen(m) ⇒ Object

._build_status(m) ⇒ Object

._build_time(m) ⇒ Object

._mk_score_fuller_agg(win:) ⇒ Object

._mk_score_fuller_p(win:) ⇒ Object

._parse_date(str) ⇒ Object

._parse_goal_count(str) ⇒ Object

._parse_goal_minute(str) ⇒ Object

._parse_score_full(str) ⇒ Object

._parse_team(str) ⇒ Object

.build_map(lines, downcase: false) ⇒ Object

.build_names(lines) ⇒ Object

.parse_date(str, start:) ⇒ Object

.parse_names(txt) ⇒ Object

Instance Method Details

#_build_date(m) ⇒ Object

#_build_date_legs(m) ⇒ Object

#_build_duration(m) ⇒ Object

#_build_goal_count(m) ⇒ Object

#_build_goal_minute(m) ⇒ Object

#_build_goal_type(m) ⇒ Object

#_build_minute(m) ⇒ Object

#_build_score_team(m) ⇒ Object

#_build_score_team_num(m) ⇒ Object

#_build_score_team_pen(m) ⇒ Object

#_build_status(m) ⇒ Object

#_build_time(m) ⇒ Object

#_tokenize_line(line) ⇒ Object

#_tokenize_tty_line(line) ⇒ Object

#debug? ⇒ Boolean

#is_group?(text) ⇒ Boolean

#is_round?(text) ⇒ Boolean

#log(msg) ⇒ Object

#tokenize_with_errors ⇒ Object

#initialize(lines, debug: false) ⇒ `Lexer`

._build_date(m) ⇒ `Object`

._build_date_legs(m) ⇒ `Object`

._build_duration(m) ⇒ `Object`

._build_goal_count(m) ⇒ `Object`

._build_goal_minute(m) ⇒ `Object`

._build_goal_type(m) ⇒ `Object`

._build_minute(m) ⇒ `Object`

._build_score_team(m) ⇒ `Object`

._build_score_team_num(m) ⇒ `Object`

._build_score_team_pen(m) ⇒ `Object`

._build_status(m) ⇒ `Object`

._build_time(m) ⇒ `Object`

._mk_score_fuller_agg(win:) ⇒ `Object`

._mk_score_fuller_p(win:) ⇒ `Object`

._parse_date(str) ⇒ `Object`

._parse_goal_count(str) ⇒ `Object`

._parse_goal_minute(str) ⇒ `Object`

._parse_score_full(str) ⇒ `Object`

._parse_team(str) ⇒ `Object`

.build_map(lines, downcase: false) ⇒ `Object`

.build_names(lines) ⇒ `Object`

.parse_date(str, start:) ⇒ `Object`

.parse_names(txt) ⇒ `Object`

#_build_date(m) ⇒ `Object`

#_build_date_legs(m) ⇒ `Object`

#_build_duration(m) ⇒ `Object`

#_build_goal_count(m) ⇒ `Object`

#_build_goal_minute(m) ⇒ `Object`

#_build_goal_type(m) ⇒ `Object`

#_build_minute(m) ⇒ `Object`

#_build_score_team(m) ⇒ `Object`

#_build_score_team_num(m) ⇒ `Object`

#_build_score_team_pen(m) ⇒ `Object`

#_build_status(m) ⇒ `Object`

#_build_time(m) ⇒ `Object`

#_tokenize_line(line) ⇒ `Object`

#_tokenize_tty_line(line) ⇒ `Object`

#debug? ⇒ `Boolean`

#is_group?(text) ⇒ `Boolean`

#is_round?(text) ⇒ `Boolean`

#log(msg) ⇒ `Object`

#tokenize_with_errors ⇒ `Object`