Class: SportDb::Lexer

Inherits:

Object

Object
SportDb::Lexer

show all

Defined in: lib/sportdb/parser.rb,
lib/sportdb/parser/lexer.rb,
lib/sportdb/parser/token.rb,
lib/sportdb/parser/token-geo.rb,
lib/sportdb/parser/token-date.rb,
lib/sportdb/parser/token-note.rb,
lib/sportdb/parser/token-prop.rb,
lib/sportdb/parser/token-text.rb,
lib/sportdb/parser/token-time.rb,
lib/sportdb/parser/lexer_token.rb,
lib/sportdb/parser/token-goals.rb,
lib/sportdb/parser/token-group.rb,
lib/sportdb/parser/token-round.rb,
lib/sportdb/parser/token-score.rb,
lib/sportdb/parser/lexer-on_top.rb,
lib/sportdb/parser/token-status.rb,
lib/sportdb/parser/lexer-on_goal.rb,
lib/sportdb/parser/lexer-prep_doc.rb,
lib/sportdb/parser/lexer-tokenize.rb,
lib/sportdb/parser/lexer-prep_line.rb,
lib/sportdb/parser/token-prop_name.rb,
lib/sportdb/parser/token-score_full.rb,
lib/sportdb/parser/token-score_legs.rb,
lib/sportdb/parser/token-date--names.rb,
lib/sportdb/parser/lexer-on_group_def.rb,
lib/sportdb/parser/lexer-on_prop_misc.rb,
lib/sportdb/parser/lexer-on_round_def.rb,
lib/sportdb/parser/token-score_fuller.rb,
lib/sportdb/parser/token-date--helpers.rb,
lib/sportdb/parser/token-date_duration.rb,
lib/sportdb/parser/token-status_inline.rb,
lib/sportdb/parser/lexer-on_prop_lineup.rb,
lib/sportdb/parser/token-goals--helpers.rb,
lib/sportdb/parser/token-score--helpers.rb,
lib/sportdb/parser/lexer-on_prop_penalties.rb

Defined Under Namespace

Classes: Context, Token

Constant Summary collapse

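ANY_RE = general catch-all (matches any single character). RECOMMENDED: ALWAYS use as the last entry in a union, so that something always matches at the current position (avoids having to advance pos separately when nothing else matches)!!!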

%r{
     (?<any> .)
}ix

SPACES_RE =

%r{
     (?<spaces> [ ]{2,})
   | (?<space>  [ ])
}ix

ATTENDANCE_RE = attendance (att or attendance), e.g. att: 18000 - also allowed inline in a match line, e.g. A v B 2-1 att: 18000

%r{
    (?<attendance>
     \b
        (?: attendance|att )
            : [ ]*
         (?<value>
              [1-9]
              (?: _? \d+ )*
         )
     \b
)}ix

TEAM_HOME_RE = home/away/neutral markers - (h), (a), (n). todo/check - add support for bare h/a/n, lower-case only and with \b boundaries, e.g. (?-i: \b [han] \b) - why? why not?

%r{  (?<team_home> \(h\) )}ix

TEAM_AWAY_RE =

%r{  (?<team_away> \(a\)  )}ix

TEAM_NEUTRAL_RE =

%r{  (?<team_neutral> \(n\) )}ix

VS_RE = note - matches case-sensitively (lower-case letters only)!!! note - the longer alternative goes first, e.g. vs before v, etc.

%r{
    (?<vs>
       (?<=[ ])	# positive lookBEHIND for space
       (?-i:
           vs\.?|v
       )
       (?=[ ])   # positive lookAHEAD for space
    )
}ix

RE = “top-level” regex used for: - date_header - match_header & match_line_more - match_line

Regexp.union(
                    SPACES_RE,
                    STATUS_RE,   ## match status e.g. [cancelled], etc.

                    INLINE_WO_RE,    ## (inline) match status - w/o (walkout)
                    INLINE_NP_RE,    ## (inline) match status - n/p (not played)
                    INLINE_BYE_RE,   ## (inline) match status - bye (advance to next round)
                    INLINE_ABD_RE,   ## (inline) match status - abd/abd. (abandoned)
                    INLINE_SUSP_RE,  ## (inline) match status - susp/susp.  (suspended)
                    INLINE_PPD_RE,   ## (inline) match status - ppd/ppd. or pstp/pstp. or postp/postp. or p-p (postponed)
                    INLINE_VOID_RE,  ## (inline) match status - x-x (voided)
                    INLINE_AWD_RE,   ## (inline) match status - awd/awd. (awarded)
                    INLINE_CANC_RE,  ## (inline) match status - canc/canc. (cancelled/canceled)


                    TEAM_HOME_RE,     ## (H)
                    TEAM_AWAY_RE,     ## (A)
                    TEAM_NEUTRAL_RE,  ## (N)

                    NOTE_RE,  ### fix - change to INLINE_NOTE !!!
                    DATE_LEGS_RE,  # note - must go before date!!!
                    DATE_RE,  ## note - date must go before time (e.g. 12.12. vs 12.12)
                    TIME_RE,

                    ATTENDANCE_RE,   # note - allow att: for now inline in matches too - why? why not?

                    SCORE_FULL_1ST_RE, # note - MUST go before SCORE_LEGS_RE!!
                 ##   e.g. 2-2, 5-1 pen.
                    SCORE_LEGS_RE,
                    SCORE_FULL_RE,
                    SCORE_FULLER_RE,
                    SCORE_FULLER_MORE_RE,
                    SCORE_AWD_RE,   #  (inline) score awarded e.g. 3-0 awd or 0-1 awd. etc.
                    SCORE_ABD_RE,   #  (inline) score abandoned e.g. 2-1 abd.
                    SCORE_RE,   ## note basic score e.g. 1-1 must go after SCORE_FULL_RE!!!

                    VS_RE,

                   TEXT_RE,

              %r{ (?<sym> [,@()-] ) }x,   ## todo - check if "standalone" comma (,) in use?
                   ANY_RE,
)

START_WITH_ORD = ord (for ordinal number) e.g. (51) or (1) etc. - limit digits of number - why? why not???

%r{
   \A
    [ ]*    ## ignore leading spaces (if any)
(?<ord>
  \(
   (?<value>\d+)
  \)
)}ix

START_WITH_YEAR = e.g. 1930, 1986, 2002, 2010, 2022, 2026. note - only YYYY. note - look out for clubs starting with a year, e.g. 1860 München (de), 1899 Hoffenheim (de), 1896 Löwenherz (ch - a.k.a. FC Winterthur ??) - any others starting with YYYY ?! note - YEAR requires TWO (trailing) spaces !!!!! (or a single space followed by the @ geo marker, or end of line), e.g. "1930  Uruguay 4-2 Argentina"; "1934  Italy 2-1 Czechoslovakia (AET)"; "2022  Argentina 3-3 France (AET, 4-2 pen)". does NOT match iso dates!! - e.g. 2020-11-12, 2020/11/12, 2020.11.12, etc.

%r{
   \A
       [ ]*    ## ignore leading spaces (if any)
     (?<year>
        \d{4}
     )
     ## positive lookahead
       (?= [ ]{2} |   ## min. TWO spaces!!! or
           [ ]@ |   ##   space with geo marker or
           [ ]* \z  ##    year (date) header (end-of-line/string)
        )
}x

HEADING_RE =

%r{   \A
    [ ]*  ## ignore leading spaces (if any)
  (?<heading_marker> ={1,6} )
    [ ]*
     (?<heading>
        ## must start with letter - why? why not?
        ###   1st round
        ##  allow numbers e.g. Group A - 1
        [^=]+?   ## use non-greedy
     )
    [ ]*  ## ignore trailing spaces (if any)
     (?: =*)  ## allow any trailing heading markers
    [ ]*  ## ignore trailing spaces (if any)
  \z
}ix

GEO_TEXT_RE =

%r{
    ## must start with alpha (allow unicode letters!!)
    (?<text>
          ## positive lookbehind -  for now space (or beginning of line - for testing) only
           ##  (MUST be fixed number of chars - no quantifier e.g. +? etc.)
            (?<= [ ,›>\[\]]|^)
            (?:
                # opt 1 - start with alpha
                 \p{L}+    ## all unicode letters (e.g. [a-z])
                   |
                # opt 2 - start with num!! -
                     \d+  # check for num lookahead (MUST be space or dot)
                      ## MAY be followed by (optional space) !
                      ## MUST be follow by a to z!!!!
                      [ ]?   ## make space optional too  - why? why not?
                             ##  yes - eg. 1st, 2nd, 5th etc.
                       \p{L}+
                  |
                ## opt 3 - add another weirdo case
                ##   e.g.   's Gravenwezel-Schilde
                ##   add more letters (or sequences here - why? why not?)
                    '\p{L}+
               )

               ##
               ## todo/check - find a different "more intuitive" regex/rule if possible?
               ##    for single spaces only (and _/ MUST not be surround by spaces)

              (?:
                  (?:
                    [ ]?   # only single (inline) space allowed - double spaces are breaks!!!
                    (?:
                       \p{L} | \d  | [.&'°]
                        |
                       (?: (?<! [ ])  ## no space allowed before (but possible after)
                            [-]
                       )
                         |
                       (?: (?<! [ ])  ## no spaces allowed around these characters
                           [_/]
                          (?! [ ])
                       )
                    )+
                  )
                  |
              ## for now allow auto-add optional
              ##   parenthesis enclosed closed text
              ##   e.g. Dublin (Dalymount Park)
              ##        Bucuresti (23 August)
              ##        Paris (Parc des Princes)
              ##        Ost-Berlin (Walter-Ulbricht)
              ##        Athinai (OAKA - Maroussi)
              ##
              ##   or   Valencia (Spain) or Solna
              (?:
                    [ ]
                    \(
                        [^()\[\],;:›<>]+    ## todo - add more special chars
                                            ##   maybe list only allowed ones??
                                            ##   make pattern more strict - why? why not?
                    \)
              )
          )*


              ## must NOT end with space or dash(-)
              ##  todo/fix - possible in regex here
              ##     only end in alphanum a-z0-9 (not dot or & ???)

            ## add lookahead/lookbehind
           ##    must be space!!!
           ##   (or comma or  start/end of string)
           ##   kind of \b !!!
            ## POSITIVE lookahead
            (?=[ ,›>\[\]]|$)

   )
}ix

GEO_END_RE =

%r{
   (?<geo_end>
        ,
    )
    ## POSITIVE lookahead for props
    ##   todo/fix - use generic [a-z]+ - why? why not?
    (?=
        [ ]*  ## optional spaces
         (?:     attendance|att
              |  referee?s|refs?
          )
         :
    )
}ix

GEO_RE =

Regexp.union(
                    SPACES_RE,
                    GEO_END_RE,
                    GEO_TEXT_RE,
                    /  (?<sym> [,›>\[] ) /x,
                    ANY_RE,
)

DATE_I_RE = e.g. Fri Aug 9 | Fri, Aug 9 | Fri, Aug 9 2024 | Fri, Aug 9, 2024 | Aug 9, 2024 (one or more spaces allowed after the day name). note - eat-up optional comma after DAY_NAMES!! note - Fri Aug/9 no longer supported!!!

%r{
(?<date>
  \b
     ## optional day name
     ((?<day_name>#{DAY_NAMES})
           (?: ,?[ ]+)
     )?
     (?<month_name>#{MONTH_NAMES})
          [ ]
     (?<day>\d{1,2})
          \b
     ## optional year
     (      ,? [ ]       ## note - comma optinal with single space required for now
            (?<year>\d{4})        ## optional year 2025 (yyyy)
              \b
     )?
)}ix

DATE_LEGS_I_RE = todo/fix - add (opt) day_name later; add (opt) year later. e.g. Aug 9 & Aug 10. note - allow shortcut e.g. Aug 9 & 10

%r{
(?<date_legs>
 \b
     (?<month_name1>#{MONTH_NAMES})
          [ ]
     (?<day1>\d{1,2})
    [ ] & [ ]
     (?:
        (?<month_name2>#{MONTH_NAMES})
          [ ]
      )?  ## note - make 2nd month_name optional
     (?<day2>\d{1,2})
  \b
)}ix

DATE_II_RE = e.g. 3 June or 10 June. note - allow more spaces between DAY_NAMES and DAY, e.g. Sun 1 Mar | Wed 4 Mar | Sat 14 Mar | Sat 11 Apr | Sat 11 Apr 2021 | Sat 11 Apr 21 | Sat, 11 Apr. note - eat-up optional comma after DAY_NAMES!! note - for input like Sat 14 Mar 17:30 the two-digit year is guarded by a NEGATIVE lookahead for time, so that 17 is not taken as the year!!!

%r{
(?<date>
  \b
     ## optional day name
     ((?<day_name>#{DAY_NAMES})
           (?: ,?[ ]+)
     )?
     (?<day>\d{1,2})
         [ ]
     (?<month_name>#{MONTH_NAMES})
          \b
     ## optional year
     (  [ ]
        (?:
           (?<year>\d{4})        ## optional year 2025 (yyyy)
               |
            (?:
               (?<yy>\d{2})           ## optional year 25 (yy)
                ## check NEGATIVE lookahead
               (?! :|[:h]\d{2})
            )
        )
        \b
     )?
)}ix

DATE_III_A_RE = e.g. iso-date - 2011-08-25 note - allow/support ("shortcuts") e.g 2011-8-25 or 2011-8-3 / 2011-08-03 etc.

%r{
(?<date>
  \b
   (?<year>\d{4})
       -
   (?<month>\d{1,2})
       -
   (?<day>\d{1,2})
  \b
)}ix

DATE_III_B_RE = starting w/ day/month/year e.g. 25-08-2011

%r{
(?<date>
  \b
     ## optional day name
     ((?<day_name>#{DAY_NAMES})
          (?: ,?[ ]+)
     )?
   (?<day>\d{1,2})
       -
   (?<month>\d{1,2})
       -
   (?<year>\d{4})
  \b
)}ix

DATE_IIII_RE = allow (short)“european” style 8.8. note - assume day/month!!!

%r{
(?<date>
  \b
     ## optional day name
     ((?<day_name>#{DAY_NAMES})
           (?: ,?[ ]+)
     )?
   (?<day>\d{1,2})
       \.
   (?<month>\d{1,2})
       \.
   (?: (?:
          (?<year>\d{4})        ## optional year 2025 (yyyy)
              |
          (?<yy>\d{2})           ## optional year 25 (yy)
       )
        \b
   )?
)
}ix

DATE_IIIII_RE = e.g. 04/03/2026 or 4/3/2026; 04/03/26 or 4/3/26; 04/03 or 4/3

%r{
(?<date>
  \b
     ## optional day name
     ((?<day_name>#{DAY_NAMES})
          (?: ,?[ ]+)
     )?
   (?<day>\d{1,2})
       /
   (?<month>\d{1,2})
    \b
   (?:
        /
       (?:
          (?<year>\d{4})         ## optional year 2025 (yyyy)
              |
          (?<yy>\d{2})           ## optional year 25 (yy)
       )
      \b
   )?
)
}ix

DATE_RE = map tables note: order matters; first come-first matched/served

Regexp.union(
   DATE_I_RE,
   DATE_II_RE,
   DATE_III_A_RE,    ## e.g. 1973-08-14
   DATE_III_B_RE,
   DATE_IIII_RE,    ## e.g. 8.8. or 8.13.79 or 08.14.1973
   DATE_IIIII_RE,   ## e.g.  08/14/1973
)

DATE_LEGS_RE = todo - add more format style here; change to Regexp.union later!!!

DATE_LEGS_I_RE

NOTE_RE = fix - use (?<text> ...) - text capture for inner text!! use (?<note> ...) for the complete match as a convention!!

%r{
\[ 
  (?<note>
     [^\[\]\#]*?    ## note - non-greedy/lazy operator
                    ##    exclude comments inside note block - why? why not?
  )
\]
}xi

NOTA_BENE_RE = check for “literal” (multi-line) note blocks e.g. nb: or note: - space required after the colon - why? why not? note - use \A (instead of ^) - \A strictly matches the start of the string.

%r{   \A
    [ ]*  ## ignore leading spaces (if any)
 (?: nb | note) [ ]* : [ ]+   
  (?<nota_bene>
       .+?  ## use non-greedy 
   )
    [ ]*  ## ignore trailing spaces (if any) 
   \z
}xi

PROP_KEY_WORD_ = (i) starting w/ letters note - incl./allows digits (0-9) e.g. a1, a2000, etc. note - added back optional trailing dot (.) for abbrev. word !!!

%r{
       \p{L}
         [\p{L}\d]*
         \.?
}ix

PROP_KEY_NUM_ = note - incl. optional dot or degree sign, e.g. 1. or 1°

%r{
           \d+
           [.°]?
}ix

PROP_KEY_NUMALPHA_ = e.g. 1A, 1FC etc. note - no trailing dot (.) for now - check if any cases exist in real world

%r{
         \d+
         \p{L}
          [\p{L}\d]*
}ix

START_WITH_PROP_KEY_RE =

%r{
  \A         ## note - MUST start line; leading spaces optional (eat-up)
(?<prop_key>
    [ ]*     ##  optional leading spaces
  (?<key>
      (?:
          ## (i) starting w/ letters
            #{PROP_KEY_WORD_}

          ## (ii) starting w/ number
          ##  e.g. 1fc, 1a,
          | #{PROP_KEY_NUMALPHA_}
          ##      followed by optional dot) and
          ##                  optional space
          ##      MUST be follow by letter (a to z)!!!!
          ##   eg. 1[ fc], 1.[ fc], 1.[fc],  etc.
          | #{PROP_KEY_NUM_}   (?= [ ]? \p{L})
      )
      (?:
          ## connectors  - note - no dot (.), must match with abbrev word or num!!
           (?: ## (i)   single space or WITHOUT surrounding spaces!! - slash (/), dash (-)
               ##     e.g. do NOT match   one - two     or one / two
               ##                        only one-two   or  one/two

                 [ /-]

               ## (ii)     surrounded by leading or trailing optional space
               ##            c & a, etc.
               ##            d'ivoire, d' ivoire
               ##            borusia 'gladbach etc.
               ##              exclude space ' space - why? why not? (or ignore for now)
               ##
               ##    check for quotes  ('') - not realy supported here
               ##              e.g. leading or trailing ' will NOT match

                |  [ ]? & [ ]?
                |  [ ]? '
                |  ' [ ]?

               #### (iii)
               ##   note - special "hack"  to connect WITHOUT space
               ##     for   Union 1.FC  and SKN St.Pölten or St.Pölten
               ##       connects      1.FC      => NUM+WORD
               ##                     1°Mayo    => NUM+WORD
               ##                     St.Pölten => ABBREV+WORD
               ##
               ## note - match WITHOUT (space) connector
               ##                  1.FC  (Union 1.FC Stein)
               ##               [WORD: "Union"], [NUM: "1."], [WORD: "FC"]
               ##                  St.Pölten (SKN St.Pölten)
               ##                [WORD: "SKN"], [ABBREV: "St."], [WORD: "Pölten"]
               |   (?<=  [.°] )
                   (?=  \p{L})
           )
            (?:
                  #{PROP_KEY_NUMALPHA_}
               |  #{PROP_KEY_NUM_}
               |  #{PROP_KEY_WORD_}
              )
      )*
     )       ## close <key> capture
   [ ]*?     ## slurp trailing spaces
    :

                ## positive lookahead (must be followed by space!!)
                ##     or allow end-of-line too
   (?= [ ]+|$)
  )  ## close <prop_key> capture
}ix

INLINE_CAPTAIN = [c] or [C] for marking player as captain. support [y ] too - or require Y - why? why not?

%r{ (?<inline_captain>
    \[ [cC] \]
)}x

INLINE_YELLOW =

%r{ (?<inline_yellow>
     \[ [yY]
         ## optional minute
         (?: [ ]+
           (?<minute> \d{1,3})
              '?
           (?:
              \+
              (?<offset>\d{1,2})
               '?
           )?
         )?
     \]
)}x

INLINE_RED =

%r{ (?<inline_red>
     \[ [rR]
         ## optional minute
         (?: [ ]+
           (?<minute> \d{1,3})
              '?
           (?:
              \+
              (?<offset>\d{1,2})
               '?
           )?
         )?
     \]
)}x

INLINE_YELLOW_RED =

%r{ (?<inline_yellow_red>
     \[ (?:y/r |
           Y/R  )
         ## optional minute
         (?: [ ]+
           (?<minute> \d{1,3})
              '?
           (?:
              \+
              (?<offset>\d{1,2})
               '?
           )?
         )?
     \]
)}x

PROP_KEY_INLINE_RE = simple prop key for inline use e.g. Coach: or Trainer: or ... add more here later

%r{
   \b
(?<prop_key>    ## note: use prop_key (NOT prop_key_inline or such)
  (?<key>
      \p{L}+
  )
   ## note - NO spaces allowed for key for now!!!
    :
   ## possitive lookahead (must be followed by space!!)
   (?=[ ]+)
  )
}ix

PROP_NUM_RE = note allow underscore inline e.g. 5_000 discuss/check - allow space inline (e.g. 5 000) - why? why not?

%r{
 \b
  (?<num>
      (?<value> [0-9]+
                 (?: _ [0-9]+)*
      )
  )
 \b
}x

ENCLOSED_NAME_RE = todo/fix - allow more chars in enclosed name - why? why not? e.g. apostrophe (') as in Cote d'Ivoire, etc. change name to PAREN_NAME or PARENTHESIS or such - why? why not?

%r{
        (?<enclosed_name>
           \(
          (?<name>
              \p{L}+
              (?:
                 [ ]
                   \p{L}+
              )*
          )
            \)
        )
}ix

TEXT_RE =

%r{
    ## must start with alpha (allow unicode letters!!)
    (?<text>
           ## positive lookbehind
           ##  (MUST be fixed number of chars - no quantifier e.g. +? etc.)
            (?<=[ ,;@|\[\]]
                 |^
            )
            (?:
                # opt 1 - start with alpha
                 \p{L}+    ## all unicode letters (e.g. [a-z])
                   |

                # opt 2 - start with num!! - allow special case (e.g. 1. FC)
                     \d+  # check for num lookahead (MUST be space or dot)
                      ## MUST be followed by (optional dot) and
                      ##                      required space !!!
                      ## MUST be follow by a to z!!!!
                      [.°]?     ## optional dot (.) or degree(°) - todo - add number sign too!!
                      [ ]?   ## make space optional too  - why? why not?
                             ##  yes - eg. 1st, 2nd, 5th etc.
                       \p{L}+
                  |
                ## opt 3 - add another weirdo case
                ##   e.g.   's Gravenwezel-Schilde
                    '[s] [ ] \p{L}+
               )


              (?:(?:  (?:[ ]   # only single spaces allowed inline!!!
                          ## note - exclude (v[ ]/vs[ ]/vs.[ ])
                          ##    AND switch to case-sensitive (via -i!!!)
                        (?! (?-i: (?:  ## note - (big) V not matching for versus!!!
                                      vs\.?|v|

                                      n/p|N/P|
                                      w/o|W/O|
                                      abd\.?|ABD|
                                      aban\.?|ABAN|
                                      susp\.?|SUSP|
                                      ppd\.?|PPD|
                                      pst\.?|PST|
                                      po?stp\.?|PO?STP|P-P|
                                      x-x|X-X|
                                      awd\.?|AWD|
                                      canc\.?|CANC ) [ ]
                                        |
                                  (?: bye|BYE ) (?:[ ]|$))
                          )
                      )
                      |
                     [/-]   ## must NOT be surrounded by spaces
                  )?
                (?:
                  \p{L}
                     |
                  (?:   ## note - restrict [.&'] to single char usage (no doubled e.g. && etc.)
                    \. (?! \.)  ## allow single points only (now two or more etc.)
                     |
                    & (?! &)
                     |
                    ' (?! ')
                   )
                     |
                 (?:
                   \d+
                   (?!
                     [0-9h'+] |    ## protected break on 12h / 12' / 1-1
                                    ##  check usege for 3+4 - possible? where ? why?
                     (?:[.:-]\d)     ## protected/exclude/break on 12.03 / 12:03 / 12-12
                                      ##  BUT allow Park21-Arena for example e.g. 21-A :-)
                    )
                    [°]?  ## followed by optional ord
                   ## negative lookahead for numbers
                   ##   note - include digits itself!!!
                   ##   note - remove / (slash) e.g. allows UDI'19/Beter Bed
                 )
               )
              )*  ## must NOT end with space or dash(-)
              ##  todo/fix - possible in regex here
              ##     only end in alphanum a-z0-9 (not dot or & ???)


            ## allow optional at the end
            ##  tag or year
            ##   make it and in the future - why? why not?
            ##
            ## change - fix
            ##   do NOT use (A) for amateur
            ##   use A or A. with NO ()!!!
            ## (A) -    allow with predined  alpha only for now
            ##          e.g. (A) - amateur a team or b?
            ###  same for U21 or U9 etc
            ##        use with NO ()!!! - why? why not?
            ##      or U21 U9 etc.   - why? why not?
            ##       or etc.
            ## (1879-1893) or allow years e.g. (1879-1893)
            ###
            ##    add allow country code three to five letters for now
            ##       change to generic 1 to 5 - why? why not?
            ##     e.g. (A), (I),
            ##          (AUT)
            ##          (TRNC)   five? for UEFA code for northern cyprus
            ##     change to 1 to 4 - why? why not?
            ##   check - fix possible for upper case only here
            ##                     inline for this group only?
            (?:
               [ ]
               \(
                  \d{4}-\d{4}
               \)
            )?
             (?:
                ######
                # check for country code (cc)
                #       e.g. (AUT) or ,AUT or AUT
                (?:
               [ ]   ## note - do NOT allow more than one space!!! - why? why not?
                   \(
                       ## note - auto-exclude reserved (aet)  from SCORE_FULLER_MORE!!!
                       ##     plus golden goal (gg)/sudden death (sd), silver goal (sg)
                       ##    (ht), (ft)
                       (?! (?: aet | agget | asdet | asget | ht | ft )
                             \)
                       )
                     (?:
                       [A-Z]{1,5}
                     )
                  \)
                )
                  |
                (?:
                    [ ]*[,›>][ ]*
                        [A-Z]{1,5}
                     \b
                )
             )?
            ## add lookahead/lookbehind
           ##    must be space!!!
           ##   (or comma or  start/end of string)
           ##   kind of \b !!!
            ## positive lookahead
            (?=[ ,;@|\[\]]
                 |$
            )
   )
}ix

TIME_RE =

%r{
        \b
    (?<time>
             (?<hour>\d{1,2})
                   [:h]
              (?<minute>\d{2})

                 #### optional (inline) timezone
                 ##    note - non-utc timezone MUST be hard-coded (added) here!!!
                 ##     avoids eating-up team names (separated by one space)
                 ##            e.g.  18:30 MEX v MEX
                 (?:
                    [ ]  ## require space - why? why not
                     (?<timezone>
                        (?:
                          ## GMT   - Greenwich Mean Time
                          ## BST   - British Summer Time
                          ## CES?T - Central European (Summer) Time
                          ## EES?T - Eastern European (Summer) Time
                          ##
                          (?: GMT|BST|CES?T|EES?T)
                               (?: /
                                   UTC  (?: [+-]\d{1,4} | ±0)
                               )?
                          )
                          |
                          (?:
                             UTC  (?: [+-]\d{1,4} | ±0)
                          )
                     )
                 )?
        )
      \b

####
###  note - local time is now INLINE and MUST follow time
       (?:
           [ ]+   ## todo/check - make space optional - why? why not?
           \(
        (?<time_local>
                (?<local_hour>\d{1,2})
                   [:h]    ### todo/fix - MUST match style in time above!!!
                           ###   use capture with backref!!!!
                (?<local_minute>\d{2})

                ####
                ## optional "local" timezone name eg. BRT or CEST etc.
                (?:
                    [ ] ## require space - why? why not
                   (?<local_timezone>
                      (?:  [A-Z]{3,4}
                           (?: /
                                   UTC (?: [+-]\d{1,4} | ±0)
                           )?
                      )
                      |
                      (?:     ## e.g. 0 or 00 or 0000
                          UTC   (?: [+-]\d{1,4} | ±0)
                      )
                  )
               )?  # note - make timezone  optional!!!
          )
      \)
       )?
}ix

START_GOAL_LINE_RE = note - assume lines starting with opening ( are goal lines!!!! note - use \A (instead of ^) - \A strictly matches the start of the string. note - the NEGATIVE lookahead excludes (a), (h), (n), i.e. the TEAM_AWAY, TEAM_HOME, TEAM_NEUTRAL tokens. note - ord (numbers) e.g. (1), (42), etc. are NOT excluded by this regex itself - presumably checked separately (see START_WITH_ORD); todo - confirm!!!

%r{
                    \A
                       [ ]*    ## ignore leading spaces (if any)
                      \(

                      # check NEGATIVE lookahead
                      (?!
                            ##  exclude (a), (h), (n)
                            ##    TEAM_AWAY, TEAM_HOME, TEAM_NEUTRAL
                            (?: a|h|n )
                            \)
                       )

}xi

START_GOAL_LINE_COMPAT_RE =

%r{
                   \A
                        [ ]*    ## ignore leading spaces (if any)
                      \(

                      ## (i) check NEGATIVE lookahead
                      ##    exclude score e.g. 1-1 etc.
                          (?! [ ]* \b \d-\d \b)

                      ## (ii) check POSITIVE lookahead
                          (?= [ ]*
                               \d{1,3}
                                   '?    ## optional minute marker
                                  (?: \+
                                      \d{1,2}
                                    '?    ## optional minute marker
                                  )?
                            )
}xi

START_GOAL_LINE_ALT_RE = check for goal line (alternate syntax), e.g. (1-0 Player, 1-1 Player, ...). line must start with an opening ( and include a score. note - the score need NOT come first, i.e. allow "centered" style e.g. ( Player 44' (p) 1-0 1-1 Player 64' )

%r{
    \A
       [ ]*    ## ignore leading spaces (if any)
     \(

     # check POSITIVE lookahead
      (?=  .*?         ## note - non-greedy
               \b \d-\d \b    ## score e.g. 0-1
        )
}xi

GOAL_NONE_RE = e.g. (-; Metzger)

%r{ (?<goals_none>
       -[ ]*;
   )
}x

GOAL_SEP_ALT_RE =

%r{
          (?<goal_sep_alt>
 (?<=[ ])   ## positive lookbehind - space required
 -
 (?=[ ]|\z)    ## positive lookahead - speace required
)}x

GOAL_COUNT_RE = e.g. (2) (2/p), (2/pen.), (3/2p), (3/ 2 pen.) -or- (2,1pen), (3, 2 pens) (p), (pen.) (2 pen.), (2p) (og), (o.g.), (2og), (2 o.g.), (2ogs)

%r{
   (?<goal_count>
      \(
        (?:
          ## opt penalties
            (?<pen>
              (?:  (?<pen_value> \d{1,2}) [ ]? )?
                 (?:pens|pen\.?|p)
           )
            |
          ## opt own goals (og)
            (?<og>
             (?: (?<og_value> \d{1,2}) [ ]? )?
                (?:ogs?|o\.g\.|o)
            )
            |
          ## opt fallback - classic count/number
          (?:  (?<value> [1-9])
                ## check for option penalties
                (?<pen>
                     [,/] [ ]*
                     (?: (?<pen_value> \d{1,2}) [ ]? )?
                     (?:pens|pen\.?|p)
                )?
           )
         )
      \)
)}ix

MINUTE_RE = minute with optional stoppage (offset). note - the inline \b check in MINUTE_RE excludes 85pen or 90+4pen or 38p (possible and NOT excluded in GOAL_MINUTE_RE !!!)

%r{
     (?<minute>
               \b
             (?<value>\d{1,3})      ## constrain numbers to 0 to 999!!!
                \b
                '?    ## optional minute marker

                (?: \+ (?<value2>\d{1,2})
                       \b
                      '?    ## optional minute marker
                 )?
      )
}ix

GOAL_MINUTE_NA_RE = keep separate? or add simply inside GOAL_MINUTE_RE - why? why not? fix-fix-fix - move into GOAL_MINUTE_RE !!!

%r{
     (?<goal_minute_na>

       # positive lookbehind
       (?<=[ ,;])

       (?<value> \?{1,2})
            '?    ## optional minute marker
     ## note - add goal minute qualifiers here inline!!!
        (?:
            (?: [ ]? (?<og>   (?: \((?:og|o\.g\.|o)\))   ## allow (og)
                                   |
                              (?: (?:og|o\.g\.|o))      ## allow plain og
                      )
            )
            |
            (?: [ ]? (?<pen>  (?: \((?:pen\.?|p)\))   ## allow ()
                                   |
                              (?: (?:pen\.?|p))
                      )
            )
            |
            ## add experimental header qualifier
            (?: [ ]? (?<hdr> \( (?:hdr\.?|h ) \) | (?: hdr\.?|h ) ))
            |
            ## add experimental free kick qualifier
            (?: [ ]? (?<fk> \( (?:fk\.?|f ) \) | (?: fk\.?|f) ))
        )?

     ## note - check positive lookahead
     (?=[ ,;)]|$)
   )
}ix

GOAL_MINUTE_RE = goal types: (pen.) or (pen) or (p.) or (p); (o.g.) or (og). note - (p.) is listed here, but the pattern shown only accepts pen, pen. or p - todo confirm. todo/check - keep case-insensitive or allow OG or P or PEN or only lower case - why? why not? add (gg) for golden goal - why? why not? add (sg) for silver goal - why? why not??

%r{
     (?<goal_minute>
               \b
             (?<value>\d{1,3})      ## constrain numbers to 0 to 999!!!
                '?    ## optional minute marker

                 (?: \+ (?<value2>\d{1,2})
                      '?    ## optional minute marker
                 )?

        ## note - add goal minute qualifiers here inline!!!
        (?:
            (?: [ ]? (?<og>   (?: \((?:og|o\.g\.|o)\))   ## allow (og)
                                   |
                              (?: (?:og|o\.g\.|o))      ## allow plain og
                      )
            )
            |
            (?: [ ]? (?<pen>  (?: \((?:pen\.?|p)\))   ## allow ()
                                   |
                              (?: (?:pen\.?|p))
                      )
            )
            |
            ## add experimental header qualifier
            (?: [ ]? (?<hdr> \( (?:hdr\.?|h ) \) | (?: hdr\.?|h ) ))
            |
            ## add experimental free kick qualifier
            (?: [ ]? (?<fk> \( (?:fk\.?|f ) \) | (?: fk\.?|f) ))
        )?

        ##  add experimental seconds
        ##    e.g. (95 secs) or (95sec) etc.
        (?: [ ]*  \(
                      (?<secs>\d{1,3})
                         [ ]?secs?
                   \)
        )?
     )

     ## note - check positive lookahead
     (?=[ ,;)]|$)
}ix

GOAL_TYPE_RE =

%r{
     (?<goal_type>
               \(
                 (?:
                      (?<og>  og|o\.g\.|o )
                         |
                      (?<pen> pen\.?|p )
                         |
                     ## add experimental header qualifier
                      (?<hdr>  hdr\.?|h )
                         |
                     ## add experimental free kick qualifier
                       (?<fk>  fk\.?|f )
                  )
                \)
)}xi

START_WITH_GROUP_DEF_LINE_RE = check for start of group def line e.g. Group A | ... Group 1 : .... Group A2 | .... note - use \A (instead of ^) - \A strictly matches the start of the string.

%r{
   \A
   [ ]*  ## ignore leading spaces (if any)
   (?<group_def>
       Group
        [ ]
        [a-z0-9]+   ## todo/check - allow dot (.) too e.g. 1.A etc.- why? why not?
   )
   ###   positive lookahead MUST be : OR |
   (?= [ ]*
       [:|]
       [ ])  ## note: requires space for now after [:|] - keep - why? why not?
}ix

ROUND_OUTLINE_I_RE = note - use \A (instead of ^) - \A strictly matches the start of the string. todo - add support for trailing markers e.g. ▪ Round 1 ▪▪▪▪▪▪▪▪ :: Round 1 :::::::::::: check - allow without space (like in heading =Heading 1=) - why? why not? ▪Round 1▪▪▪▪▪▪▪▪ ::Round 1::::::::::::

%r{   \A
    [ ]*  ## ignore leading spaces (if any)
  (?<round_marker>
        [▪]{1,3}     ## BLACK SMALL SQUARE e.g. ▪,▪▪,▪▪▪
   )
    [ ]+
     (?<round_outline>
        ## must start with letter - why? why not?
        ###   1st round
        ##  allow numbers e.g. Group A - 1
        ##
        ##  note - CANNOT incl. :| !!!
        ##   used for markers for defs/definitions
        [^:|]+?   ## use non-greedy
     )
     (?:
        [ ]+
        [▪]+
     )?
     [ ]*  ## ignore trailing spaces (if any)
   \z
}xi

ROUND_OUTLINE_II_RE =

%r{   \A
    [ ]*  ## ignore leading spaces (if any)
  (?<round_marker>
         ::{1,3}     ## e.g. ::,:::,::::
   )
    [ ]+
     (?<round_outline>
        ## must start with letter - why? why not?
        ###   1st round
        ##  allow numbers e.g. Group A - 1
        ##
        ##  note - CANNOT incl. :| !!!
        ##   used for markers for defs/definitions
        [^:|]+?   ## use non-greedy
     )
     (?:
        [ ]+
        ::+
     )?
    [ ]*  ## ignore trailing spaces (if any)
   \z
}xi

ROUND_OUTLINE_RE =

Regexp.union(  ROUND_OUTLINE_I_RE,
   ROUND_OUTLINE_II_RE,
)

ROUND_DEF_OUTLINE_RE = note - for def(initions) only one level support that is, no round outline additions possible (e.g ▪▪ 1st leg etc.)

%r{   \A
     [ ]*  ## ignore leading spaces (if any)
    (?: [▪]  ## BLACK SMALL SQUARE
         |
        :: )
     [ ]+
      (?<round_outline>
         [^:|]+?   ## use non-greedy
      )
     [ ]*  ## ignore trailing spaces (if any)
    ###   possitive lookahead MUST be : OR |
     (?= [:|]
         [ ])  ## note: requires space for now after [:|] - keep - why? why not?
}ix

SCORE_AWD_RE = note - keep AWD w/o dot - why? why not?

%r{
            (?<score_awd>
 \b
  (?<score1>\d{1,2}) - (?<score2>\d{1,2})
    [ ]?
      (?-i: awd\.? | AWD )
  ## POSITIVE lookahead - requires space
  (?= [ ])
)}ix

SCORE_ABD_RE = add support for score abandoned (inline style) 2-1 abd. or 2-1 ABD

%r{
            (?<score_abd>
 \b
  (?<score1>\d{1,2}) - (?<score2>\d{1,2})
    [ ]?
     (?-i: abd\.? | ABD )
  ## POSITIVE lookahead - requires space
  (?= [ ])
)}ix

SCORE_RE = 2-1 note - was SCORE__FT__RE changed to "generic" SCORE_RE and (?<ft1>\d{1,2}) - (?<ft2>\d{1,2}) changed (?<score1>\d{1,2}) - (?<score2>\d{1,2}) to pattern match not necessarily the full-time (ft) scoreline!!! - pattern also used for goal seq(uence) e.g. 1-0 Kane, 1-1 Johnson

%r{
            (?<score>
 \b
  (?<score1>\d{1,2}) - (?<score2>\d{1,2})
 \b
)}ix

POSTPONED =

%Q{ (?<postponed> postponed  | pst\\.? | po?stp\\.?  | ppd\\.? ) }

CANCELED = add can/can. - why? why not?

%Q{ (?<canceled>  cancell?ed | canc\\.? ) }

WALKOVER = add o/w too - why? why not?

%Q{ (?<walkover>  walkover   | w/o  | wo ) }

AWARDED =

%Q{ (?<awarded>   awarded    | awd\\.? ) }

SUSPENDED =

%Q{ (?<suspended> suspended  | susp\\.? ) }

ABANDONED =

%Q{ (?<abandoned> abandoned  | aban\\.?  | abd\\.? ) }

ANNULLED =

%Q{ (?<annulled>  annulled ) }

VOIDED = note - alternative (name) to annulled

%Q{ (?<voided>    voided     | void ) }

REPLAY =

%Q{ (?<replay>    replay     | repl\\.? ) }

STATUS_RE = note - status_note incl. complete text incl. <status> (not normalized) <status> gets normalized e.g. ppt => postponed etc.

%r{
            \[
      (?:
#############################################
### opt 1 - allow long forms with note/comment for some stati
##                    e.g. [postponed due to tropical storm "Hanna"]
##                         [suspended at 84' by storm; result stood]
#########################
           (?: (?<status_note>
                  (?<status>
               ####################
               ## pre-match (not played)
                    #{POSTPONED}
                           |
                    #{CANCELED}
                           |
                    #{WALKOVER}
                           |
               ######################
               ## pre/post match
                     #{AWARDED}
                            |
               ########################
               ## post match - (partially) played
                    #{SUSPENDED}
                            |
                    #{ABANDONED}
                            |
                    #{ANNULLED}
                            |
                    #{VOIDED} ### note - alternative to annulled
              )     ## end-of-<status>
                  [ :;,-]+     ## leading spaces (or separators)
                  [^\]]+?      ## note - add non-greedy match
              ) ## end-of-<status-note>
              [ ]*  ## eat-up optional trailing spaces
            )
            |
########################################
## opt 2 - short form only (no note/comments) e.g. [postponed], [Canceled], etc.
####################################
            (?<status>
         ####################
         ## pre-match (not played)
               #{POSTPONED}
                 |
               #{CANCELED}
                 |
               #{WALKOVER}
                 |
         ######################
         ## pre/post match
               #{AWARDED}
                 |
         ########################
         ## post match - (partially) played
               #{SUSPENDED}
                 |
               #{ABANDONED}
                 |
               #{ANNULLED}
                 |
               #{VOIDED}   ### note - alternative to annulled
                 |
               #{REPLAY}       ### todo/fix - keep replay - why? why not?
                                  ###   prefer replay in round e.g.
                                  ##       ▪ Round 17, Replay
                                  ##       ▪ Semi-finals, Replays
            )
      )
    \]
}ix

GOAL_RE =

Regexp.union(
    SPACES_RE,
    GOAL_NONE_RE,
    GOAL_MINUTE_RE,
    GOAL_MINUTE_NA_RE,
    GOAL_COUNT_RE,
    PROP_NAME_RE,    ## note - (re)use prop name for now for (player) name
    GOAL_SEP_ALT_RE,   ##  note - add dash (-) with (required) spaces
    /  (?<sym> [;,)])  /x
    ## todo/fix - add ANY_RE !!!!
)

GOAL_ALT_RE =

Regexp.union(
    SPACES_RE,
    SCORE_RE,        ## e.g.  1-0, 0-1, etc.
    GOAL_MINUTE_RE,
    GOAL_TYPE_RE,
    PROP_NAME_RE,    ## note - (re)use prop name for now for (player) name
    /  (?<sym> [,)])  /x    ## note - no semicolon (;)
    ## todo/fix - add ANY_RE !!!!
)

GOAL_COMPAT_RE =

Regexp.union(
    SPACES_RE,
    SCORE_RE,        ## e.g.  1-0, 0-1, etc.
    MINUTE_RE,          ## note - matches minute e.g.  92, 7, 7' 7+3, etc.
    GOAL_TYPE_RE,
    PROP_NAME_RE,    ## note - (re)use prop name for now for (player) name
    /  (?<sym> [,)])  /x    ## note - no semicolon (;)
    ## todo/fix - add ANY_RE !!!!
)

HTML_COMMENT_RE =

%r{  <!--
     .*?   ## note - use non-greedy/lazy *? match
  -->
}xm

PREPROC_NOTA_BENE_RE = check for “literal” (multi-line) note blocks eg. nb: or note: space required after double colon - why? why not?

%r{
         ^
    [ ]* (?: nb | note) [ ]* : [ ]+
       .+?  ## non-greedy

    ## positive lookahead
    ##    note - must end with blank line or end-of-file/document
      (?=          \n[ ]*\n
                 | \z
        )
}xim

PREPROC_BLOCK_RE = note - [] block may NOT incl. square brackets what about comments (e.g. #)? todo/check - rename to NOTE_BLOCK or TEXT_BLOCK or ???

%r{  \[
                      [^\[\]\#]*?  ## note - use non-greedy/lazy *? match
                  \]
}xm

PROP_NAME_WORD_ =

%r{
       \p{L}+
         \.?     ## optional dot
}ix

PROP_NAME_RE =

name different from text (**does NOT allow number in name/text**)

different from PROP_KEY too

%r{
                 (?<prop_name>
                      \b
                   (?<name>
                        #{PROP_NAME_WORD_}

                          ## connectors
                          (?:
                             ## (i) space - only one single space allowed inline!!!
                              (?:
                               ### check if negative lookbehind is redudant!!
                               ##    next char is \p{L} and NOT space
                               ##    thus double space not possible!!
                                (?<! [ ])             ## use negative lookbehind
                                  [ ]
                                (?=  \p{L}|['"]\p{L})      ## use lookahead
                              )
                              ## (ii) support (inline) quoted name e.g. "Rodri" or such
                                 | (?:
                                     (?<=[ ])   ## use positive lookbehind
                                     " \p{L}+ "
                                      ## require space here too - why? why not?
                                   )
                              ## (iii) dash (-)
                              | (?:
                                ## use  POSITIVE lookBEHIND
                                ## note - allow leading dot (.) e.g. K.-H.Förster
                                ##                short for          Karl-Heinz Förster
                                ##
                                ##    change to negative lookBEHIND   [ '"-]
                                ##      \p{L}\. | \p{L} - not MUST be fixed size
                                 (?<=
                                         [\p{L}.]
                                      )
                                 [-]   ## must be surrounded by letters
                                       ## e.g. One-Two NOT
                                       ##      One- Two or One - Two or One -Two etc.
                                (?= \p{L})      ## use lookahead
                              )
                                 |
                              (?:  ## flex rule for quote - allow any
                                    ##  only check for double quotes e.g. cannot follow other ' for now - why? why not?
                                    ##        allows  rodrigez 'rodri' for example
                                (?<!')  ## use negative lookbehind
                                   '
                              )
                            |   ## standard case with letter(s) and optional dot
                              #{PROP_NAME_WORD_}
                          )*
                    )
                ## add lookahead - must be non-alphanum
                ##    add colon (:) too - why? why not?
                  (?= [ ,;\]\)]|$)
)}ix

P_EN = english helpers (penalty, extra time, …) note - p must go last (shortest match) pso = penalty shootout - note - remove PSO for now (may add later back) - why? why not? todo/fix/clean-up - keep it simple - remove optional trailing dot (.) from pen., p., agg. etc. - why? why not? always use (simply) pen, p, agg (also) remove a.e.t. / a.e.t option - why? why not? UPDATE mar/2026: added pens too - keep - why? why not? (4-3 pens) (4-3 Pens) -- keep mixed Pens/Pen. too - why? why not? (4-3 Pen.)

'(?-i: PEN | P |' +
'[Pp]ens | [Pp]en\.? | p\.? )'

ET_EN = fix - change ET_EN to AET_EN!!! - why? why not? check - allow Aet too - why? why not? or A.e.t ??

'(?-i: AET | ' +
'aet | a\.e\.t\.? )'

AETGG_EN = after (golden goal/sudden death) extra time - add more options/styles - why? why not?

'(?-i: AET/GG | AGGET | ASDET | ' +
'aet/gg | a\.e\.t\.?/g\.g\.? | agget | asdet )'

AETSG_EN = after (silver goal) extra time

'(?-i: AET/SG | ASGET | ' +
'aet/sg | a\.e\.t\.?/s\.g\.? | asget  )'

AGG_EN = agg/agg. or AGG

'(?-i: AGG | agg\.? )'

SCORE_P = fix - change SCORE_P to SCORE_FULL_P SCORE_ET to SCORE_FULL_ET (re)use SCORE_P, SCORE_ET for score only part!!! fix/fix/fix - rename to SCORE_P_ SCORE_ET_ mark internals with TRAILING underscore (leading NOT possible!)

%Q<  (?<p1>\\d{1,2}) - (?<p2>\\d{1,2})
        [ ]? #{P_EN}
>

SCORE_ET =

%Q<  (?<et1>\\d{1,2}) - (?<et2>\\d{1,2})
        [ ]? #{ET_EN}
>

SCORE_LOOKAHEAD =

'(?= [ ,\]] | $)'

SCORE__ET_GG_SG__RE = after extra-time with golden goal/sudden death & silver goal rule note - golden goal & silver goal EXCLUDE penalties!!! 4-3 a.e.t/g.g. 4-3 aet/gg 4-3agget -or- 4-3 asdet 2-1 aet/sg -or- 4-3 aet/gg (3-3, 2-1)

%r{
    (?<score_full>
       \b
       (?<et1>\d{1,2}) - (?<et2>\d{1,2})
                      [ ]? (?:
                               (?<aetgg> #{AETGG_EN})
                                  |
                               (?<aetsg> #{AETSG_EN})
                            )
       ### note:
       ## add optional full-time, half-time score
         (?:
             [ ]+
             \(
                [ ]*
               (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
                  [ ]*
                (?:
                   , [ ]*
                   (?: (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
                     [ ]*
                  )?
               )? # note: make half time (HT) score optional for now
             \)
         )?
        #{SCORE_LOOKAHEAD}
)}ix

SCORE__P_ET__RE = note: allow SPECIAL cases WITHOUT full time scores (just a.e.t or pen. + a.e.t.) 3-4 pen. 2-2 a.e.t. 3-4 pen. 2-2 a.e.t. 2-2 a.e.t.

%r{
(?<score_full>
   \b
    (?: #{SCORE_P} [ ]+
     )?             ## note: make penalty (P) score optional for now
    #{SCORE_ET}
    #{SCORE_LOOKAHEAD}
)}ix

SCORE__ET_P__RE = note: allow SPECIAL cases WITHOUT full time scores AND with pen in last position! 2-2 a.e.t., 3-4 pen. 2-2 a.e.t. 3-4 pen. ## or without comma separator - why? why not?

%r{
(?<score_full>
   \b
    #{SCORE_ET}
       (?: [ ]*,[ ]* | [ ]+ )
    #{SCORE_P}
    #{SCORE_LOOKAHEAD}
)}ix

SCORE__FT_P__RE = special case (i) - full time with penalties 2-2, 3-4 pen.

%r{
(?<score_full>
   \b
    (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
        [ ]*,[ ]*    ## note - comma required!!!
    #{SCORE_P}
    #{SCORE_LOOKAHEAD}
)}ix

SCORE__FT_HT_P__RE = special case (ii) - full time & half-time with penalties 2-2 (1-1), 3-4 pen.

%r{
(?<score_full>
   \b
    (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
        [ ]*
         \(
             (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
         \)
        [ ]*,[ ]*    ## note - comma required!!!
    #{SCORE_P}
    #{SCORE_LOOKAHEAD}
)}ix

SCORE__P__RE = note: allow SPECIAL with penalty only 3-4 pen. or 3-4p etc.

%r{
        (?<score_full>
  \b
    #{SCORE_P}
    #{SCORE_LOOKAHEAD}
)}ix

SCORE__P_ET_FT_HT_V2__RE = support short all-in-one e.g. 3-4 pen. 2-2 a.e.t. ( 1-1, 1-1 ) becomes 3-4 pen. (2-2, 1-1, 1-1)

%r{
          (?<score_full>
   \b
    #{SCORE_P} [ ]+
       \(
       [ ]*
   (?<et1>\d{1,2}) - (?<et2>\d{1,2})
       [ ]*, [ ]*
   (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
       [ ]*, [ ]*
   (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
       [ ]*
    \)
   #{SCORE_LOOKAHEAD}
)}ix

SCORE__ET_FT_HT_P__RE = e.g. 2-2 a.e.t. (1-1, 1-0), 5-1 pen.

%r{
          (?<score_full>
   \b
   #{SCORE_ET} [ ]+
       \(
       [ ]*
  (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
       [ ]*
    (?:
         , [ ]*
        (?: (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
            [ ]*
        )?
    )?              # note: make half time (HT) score optional for now
  \)
   (?: [ ]*,[ ]* | [ ]+)
   #{SCORE_P}
   #{SCORE_LOOKAHEAD}
)}ix

SCORE__P_ET_FT_HT__RE = e.g. 3-4 pen. 2-2 a.e.t. (1-1, 1-1) or 3-4p 2-2aet (1-1, ) or 3-4 pen. 2-2 a.e.t. (1-1) or 2-2 a.e.t. (1-1, 1-1) or 2-2 a.e.t. (1-1, ) or 2-2 a.e.t. (1-1)

%r{
          (?<score_full>
   \b
   (?:
      #{SCORE_P} [ ]+
    )?            ## note - make penalty (P) score optional for now
   #{SCORE_ET} [ ]+
       \(
       [ ]*
  (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
       [ ]*
    (?:
         , [ ]*
        (?: (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
            [ ]*
        )?
    )?              # note: make half time (HT) score optional for now
  \)
 #{SCORE_LOOKAHEAD}
)}ix

SCORE__P_FT_HT__RE = special case for case WITHOUT extra time!! same as above (but WITHOUT extra time and pen required)

%r{
         (?<score_full>
            \b
           #{SCORE_P} [ ]+
    \(
    [ ]*
  (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
    [ ]*
 (?:
      , [ ]*
     (?: (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
         [ ]*
     )?
 )?              # note: make half time (HT) score optional for now
   \)
#{SCORE_LOOKAHEAD}
)}ix

SCORE__FT_HT__RE = e.g. 2-1 (1-1)

%r{
            (?<score_full>
 \b
 (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
      [ ]+ \( [ ]*
   (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
      [ ]* \)
#{SCORE_LOOKAHEAD}
)}ix

SCORE_FULL_1ST_RE = note 2-2, 5-1 pen. must get priority (get before SCORE_LEGS!!!) break out note - no need for Regexp.union for now (only single regex!)

SCORE__FT_P__RE

SCORE_FULL_RE =

Regexp.union(
  SCORE__ET_GG_SG__RE,       # e.g. 3-1 aet/gg
  SCORE__P_ET_FT_HT_V2__RE,  # e.g. 5-1 pen. (2-2, 1-1, 1-0)
  SCORE__ET_FT_HT_P__RE,    # e.g. 2-2 a.e.t. (1-1, 1-0), 5-1 pen.
  SCORE__P_ET_FT_HT__RE,    # e.g. 5-1 pen. 2-2 a.e.t. (1-1, 1-0)
  SCORE__P_FT_HT__RE,     # e.g. 5-1 pen. (1-1)
  SCORE__ET_P__RE,        # e.g. 2-2 a.e.t., 5-1 pen.
  SCORE__FT_HT_P__RE,     # e.g. 2-2 (1-1), 5-1 pen.
  SCORE__P_ET__RE,        # e.g.  5-1 pen. 2-2 a.e.t.  or  2-2 a.e.t. (w/o pen)
  SCORE__P__RE,           # e.g. 5-1 pen.
  SCORE__FT_HT__RE,        # e.g. 1-1 (1-0)
  ##  note - keep basic score as its own token!!!!
  ##   that is, SCORE & SCORE_MORE
  ### SCORE__FT__RE,           # e.g. 1-1  -- note - must go last!!!
)

SCORE_LEGS_RE = win on away goals aet

%r{
(?<score_legs>
   \b   
    (?<leg1_ft1>\d{1,2}) - (?<leg1_ft2>\d{1,2})
       (?: [ ]+ |  [ ]*,[ ]*)   # separate by spaces OR comma
    (?:
        ## opt 1 - after extra-time (et) score
            (?<leg2_et1>\d{1,2}) - (?<leg2_et2>\d{1,2})
               [ ]? #{ET_EN}   ## a.e.t./aet
                ### note - might end in dot (.) not alpha
                ###  thus, wordboundary NOT working
               #{SCORE_LOOKAHEAD}   
          |
        ## opt 2 - full-time (ft)  
        (?<leg2_ft1>\d{1,2}) - (?<leg2_ft2>\d{1,2})
            \b 
    )                
    (?:   ## check optional aggregate e.g. (agg 4-4)
        [ ]+
         \(
             agg [ ]
              (?<agg1>\d{1,2}) - (?<agg2>\d{1,2}) 
              
             ### add win options 
             (?:
                 ## opt 1 - on away goals
                (?<away> [ ]*,[ ]*
                         (?:win [ ])? on [ ] away [ ] goals?
                 )
                   |
                 ## opt 2 - on penalties  
                (?:
                   [ ]*,[ ]*
                   (?:win [ ])?
                    (?<leg2_p1>\d{1,2}) - (?<leg2_p2>\d{1,2})
                    [ ] on [ ] pens
                )
             )?
         \)
    )?
)}ix

MONTH_LINES =

parse_names( <<TXT )
January    Jan
February   Feb
March      Mar
April      Apr
May
June       Jun
July       Jul
August     Aug
September  Sept  Sep
October    Oct
November   Nov
December   Dec
TXT

MONTH_NAMES =

build_names( MONTH_LINES )

MONTH_MAP = pp MONTH_NAMES

build_map( MONTH_LINES, downcase: true )

DAY_LINES =

parse_names( <<TXT )
Monday                   Mon  Mo
Tuesday            Tues  Tue  Tu
Wednesday                Wed  We
Thursday    Thurs  Thur  Thu  Th
Friday                   Fri  Fr
Saturday                 Sat  Sa
Sunday                   Sun  Su
TXT

DAY_NAMES =

build_names( DAY_LINES )

DAY_MAP = pp DAY_NAMES

build_map( DAY_LINES, downcase: true )

GROUP_DEF_RE = note - add comma (,) as optional separator

Regexp.union(  SPACES_RE,
   TEXT_RE,
   / (?<sym> [:|,] )  /x,
   ANY_RE,
)

PROP_CARDS_RE = note - no inline keys possible todo/fix - use custom (limited) prop basics too

Regexp.union(
   SPACES_RE,
   MINUTE_RE,
   PROP_NAME_RE,
   /  (?<sym>  [;,-]) /x
   ## todo/fix - add ANY_RE here too!!!
)

PROP_ATTENDANCE_RE =

Regexp.union(
   SPACES_RE,
   ENCLOSED_NAME_RE,       # e.g. (sold out) etc.  why? why not?
   PROP_NUM_RE,                 # e.g. 28 000 or 28_000  (NOT 28,000 is not valid!!!)
   ## todo/fix - add ANY_RE here too!!!
)

PROP_REFEREE_RE =

## token patterns tried (in listed order) when lexing a referee prop line
Regexp.union(
   SPACES_RE,
   ENCLOSED_NAME_RE,       # enclosed name - NOTE(review): former example "(sold out)" was copied from PROP_ATTENDANCE_RE; confirm what is expected for referees
   PROP_NUM_RE,                 # number - NOTE(review): former example "28 000 or 28_000" was copied from PROP_ATTENDANCE_RE; confirm a number is really needed here
   PROP_KEY_INLINE_RE,
   PROP_NAME_RE,
   /  (?<sym>  [;,]) /x
   ## todo/fix - add ANY_RE here too!!!
)

ROUND_DEF_RE = note - add comma (,) as optional separator

Regexp.union(  SPACES_RE,
   DURATION_RE,  # note - duration MUST match before date
   DATE_RE,  ## note - date must go before time (e.g. 12.12. vs 12.12)
   / (?<sym> [:|,] ) /x,
   ANY_RE
)

SCORE_FULLER_AGG =

_mk_score_fuller_agg( win: false )

SCORE_FULLER_AGG_WIN =

_mk_score_fuller_agg( win: true )

SCORE_FULLER_P =

_mk_score_fuller_p( win: false )

SCORE_FULLER_P_WIN =

_mk_score_fuller_p( win: true )

SCORE_FULLER_AWAY_WIN =

%Q<
     (?:
      (?<away>
        ############
        ## opt 1)  with win
        (?:
            (?: win [ ] )?
            (?: (?<away1>\\d{1,2}) - (?<away2>\\d{1,2}) [ ] )?
             on [ ] away [ ] goals?     # goal or goals
        )
        |        
        #####
        ## opt 2)  "classic" (post)
        (?:
           (?: (?<away1>\\d{1,2}) - (?<away2>\\d{1,2}) [ ] )?
              [ ]* away  
        )
        |
        #####
        ## opt 3) up-front (pre)
        (?:
              away 
           (?:  [ ]
                (?<away1>\\d{1,2}) - (?<away2>\\d{1,2})
           )?   
        )
     ))                   
>

SCORE_FULLER_HT_OPT =

%Q<
  (?:   HT [ ]
      (?: (?<ht1>\\d{1,2}) - (?<ht2>\\d{1,2})) 
      [ ]*,[ ]*
  )?  ## note - make optional
>

SCORE_FULLER_FT_OPT =

%Q<
  (?:   FT [ ]
      (?: (?<ft1>\\d{1,2}) - (?<ft2>\\d{1,2})) 
      [ ]*,[ ]*
  )?  ## note - make optional
>

SCORE_FULLER__HT = 4-4 (HT 2-1) or Team A 4-1 Team B (HT 2-1)

%Q<
             \\(  HT [ ]
                  (?<ht1>\\d{1,2}) - (?<ht2>\\d{1,2}) 
             \\)
>

SCORE_FULLER__HT_FT__RE =

%r{
(?<score_fuller>
   \b   
    (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
    [ ]+
     #{SCORE_FULLER__HT}
)}ix

SCORE_FULLER_MORE__HT_FT__RE =

%r{
(?<score_fuller_more>
     #{SCORE_FULLER__HT}
)}ix

SCORE_FULLER__ET =

%Q<
             \\(
                #{SCORE_FULLER_HT_OPT} 
                #{SCORE_FULLER_FT_OPT} 
                (?:
                  (?<aetgg> #{AETGG_EN})
                   |
                  (?<aetsg> #{AETSG_EN}) 
                   |
                  (?<aet> #{ET_EN})
                 )
             \\)
>

SCORE_FULLER__ET__RE =

%r{
(?<score_fuller>
   \b   
    (?<et1>\d{1,2}) - (?<et2>\d{1,2})
    [ ]+
     #{SCORE_FULLER__ET}
)}ix

SCORE_FULLER_MORE__ET__RE =

%r{
(?<score_fuller_more>
     #{SCORE_FULLER__ET}
)}ix

SCORE_FULLER__ET_P = 4-4 (aet, win 3-5 on pens) 4-4 (aet, 3-5 on pens) 4-4 (aet, 3-5 pen) 4-4 (a.e.t., 3-5 pen.) or Team A 4-4 Team B (aet, win 3-5 on pens) Team A 4-4 Team B (aet, 3-5 on pens) Team A 4-4 Team B (aet, 3-5 pen) Team A 4-4 Team B (a.e.t., 3-5 pen.)

%Q<
             \\(
                #{SCORE_FULLER_HT_OPT} 
                #{SCORE_FULLER_FT_OPT} 
                (?<aet> #{ET_EN})
                 [ ]*,[ ]*
                 #{SCORE_FULLER_P_WIN}
             \\)
>

SCORE_FULLER__ET_P__RE =

%r{
(?<score_fuller>
   \b   
    (?<et1>\d{1,2}) - (?<et2>\d{1,2})
    [ ]+
     #{SCORE_FULLER__ET_P}
)}ix

SCORE_FULLER_MORE__ET_P__RE =

%r{
(?<score_fuller_more>
     #{SCORE_FULLER__ET_P}
)}ix

SCORE_FULLER__FT_P = 4-4 (win 3-5 on pens) 4-4 (3-5 pen) 4-4 (3-5p) or Team A 4-4 Team B (win 3-5 on pens) Team A 4-4 Team B (3-5 pen) Team A 4-4 Team B (3-5p)

%Q<
             \\(
                  #{SCORE_FULLER_HT_OPT} 
                  #{SCORE_FULLER_P_WIN}
             \\)
>

SCORE_FULLER__FT_P__RE =

%r{
## full-time score followed by the penalty shootout note in parentheses
##   e.g. 4-4 (win 3-5 on pens)
## note - (re)use the shared SCORE_FULLER__FT_P fragment
##   (same as all other SCORE_FULLER__*__RE patterns do with their fragment);
##   keeps this pattern in sync with SCORE_FULLER_MORE__FT_P__RE
##   and allows the optional leading half-time part (HT 2-1, ...) here too
(?<score_fuller>
   \b
    (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
    [ ]+
     #{SCORE_FULLER__FT_P}
)}ix

SCORE_FULLER_MORE__FT_P__RE =

%r{
(?<score_fuller_more>
     #{SCORE_FULLER__FT_P}
)}ix

SCORE_FULLER__FT_AGG = 3-2 (win 4-5 on aggregate) 3-2 (4-5 on aggregate) 3-2 (4-5 on agg) 3-2 (4-5 agg) 3-2 (4-5 agg.) or 3-2 (agg 4-5)

%Q<
             \\(
                 #{SCORE_FULLER_HT_OPT} 
                 #{SCORE_FULLER_AGG_WIN}
             \\)
>

SCORE_FULLER__FT_AGG__RE =

%r{
(?<score_fuller>
   \b   
    (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
    [ ]+
     #{SCORE_FULLER__FT_AGG}
)}ix

SCORE_FULLER_MORE__FT_AGG__RE =

%r{
(?<score_fuller_more>
     #{SCORE_FULLER__FT_AGG}
)}ix

SCORE_FULLER__FT_AGG_AWAY = ft + agg + away 2-1 (3-3 on aggregate, win on away goals) 2-1 (3-3 on aggregate, win 2-1 on away goals)

%Q<
             \\(
                #{SCORE_FULLER_HT_OPT} 
                #{SCORE_FULLER_AGG}
                   [ ]*,[ ]*
                 #{SCORE_FULLER_AWAY_WIN}
             \\)
>

SCORE_FULLER__FT_AGG_AWAY__RE =

%r{
(?<score_fuller>
   \b   
    (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
    [ ]+
     #{SCORE_FULLER__FT_AGG_AWAY}
)}ix

SCORE_FULLER_MORE__FT_AGG_AWAY__RE =

%r{
(?<score_fuller_more>
     #{SCORE_FULLER__FT_AGG_AWAY}
)}ix

SCORE_FULLER__ET_AGG_P = 2-1 (aet, 3-3 on aggregate, win 5-2 on pens) 2-1 (aet, 3-3 agg, 5-2 pen.)

%Q<
             \\(
                #{SCORE_FULLER_HT_OPT} 
                #{SCORE_FULLER_FT_OPT} 
                (?<aet> #{ET_EN})
                    [ ]*,[ ]*
                    #{SCORE_FULLER_AGG}  
                    [ ]*,[ ]*
                    #{SCORE_FULLER_P_WIN}                     
             \\)
>

SCORE_FULLER__ET_AGG_P__RE =

%r{
(?<score_fuller>
   \b   
    (?<et1>\d{1,2}) - (?<et2>\d{1,2})
    [ ]+
     #{SCORE_FULLER__ET_AGG_P}
)}ix

SCORE_FULLER_MORE__ET_AGG_P__RE =

%r{
(?<score_fuller_more>
     #{SCORE_FULLER__ET_AGG_P}
)}ix

SCORE_FULLER_RE = map tables note: order matters - first come-first matched/served

## union of all "fuller" score patterns (score plus details in parentheses)
##   note - order matters; alternatives are listed in match priority
Regexp.union(
SCORE_FULLER__HT_FT__RE,       ## e.g.  3-2 (HT 2-1)
SCORE_FULLER__ET_P__RE,        ## e.g.  2-2 (aet, win 5-3 on pens)
SCORE_FULLER__ET__RE,          ## e.g.  2-3 (aet)
SCORE_FULLER__FT_P__RE,        ## e.g.  2-2 (win 5-3 on pens)
SCORE_FULLER__FT_AGG__RE,      ## e.g.  2-3 (win 5-4 on aggregate)
SCORE_FULLER__FT_AGG_AWAY__RE, ## e.g.  2-1 (3-3 on aggregate, win 2-1 on away goals)
SCORE_FULLER__ET_AGG_P__RE,    ## e.g.  2-1 (aet, 3-3 on aggregate, win 5-2 on pens)
)

SCORE_FULLER_MORE__HT__RE = add support for “stand-alone” (HT) and (FT) - keep why? why not?

%r{
(?<score_fuller_more>
    \( (?<ht> ht ) \)
)}ix

SCORE_FULLER_MORE__FT__RE =

%r{
(?<score_fuller_more>
     \( (?<ft> ft ) \)  
)}ix

SCORE_FULLER_MORE__FT_ET__RE = add special for fuller_more (aet 4-3) - core score is ft, and fuller more incl. et!!!

%r{
(?<score_fuller_more>
      \(#{ET_EN}
           [ ]
       (?<et1>\d{1,2}) - (?<et2>\d{1,2})
      \) 
)}ix

SCORE_FULLER_MORE__HT_FT__CLASSIC_RE = note - simply (1-1) !!!!! note - special attention needed for placement in processing order!!! make sure it is the last (or one of the last) match(es)

%r{
(?<score_fuller_more>
     \(  
          (?<ht1>\d{1,2}) - (?<ht2>\d{1,2}) 
     \)
)}ix

SCORE_FULLER_MORE_RE =

## union of all "fuller more" patterns, that is,
##   the parenthesized details only (without the leading core score)
##   note - the "classic" (2-1) half-time form is listed last
Regexp.union(
  SCORE_FULLER_MORE__FT__RE,          ## e.g. (ft)
  SCORE_FULLER_MORE__HT__RE,          ## e.g. (ht)
  SCORE_FULLER_MORE__HT_FT__RE,       ## e.g. (HT 2-1)
  SCORE_FULLER_MORE__ET_P__RE,        ## e.g. (aet, win 5-3 on pens)
  SCORE_FULLER_MORE__ET__RE,          ## e.g. (aet)
  SCORE_FULLER_MORE__FT_ET__RE,       ## e.g. (aet 3-2) - (SPECIAL) incl. after extra-time score!!
  SCORE_FULLER_MORE__FT_P__RE,        ## e.g. (win 5-3 on pens)
  SCORE_FULLER_MORE__FT_AGG__RE,      ## e.g. (win 5-4 on aggregate)
  SCORE_FULLER_MORE__FT_AGG_AWAY__RE, ## e.g. (3-3 on aggregate, win 2-1 on away goals)
  SCORE_FULLER_MORE__ET_AGG_P__RE,    ## e.g. (aet, 3-3 on aggregate, win 5-2 on pens)

  SCORE_FULLER_MORE__HT_FT__CLASSIC_RE,   ## e.g. (2-1)  half-time !!!!
)

DURATION_I_RE =

%r{
## duration variant i - two full dates separated by a dash
##   each side is  [day name] month name, day [, year]
## note - all helper groups are explicit non-capturing groups (same as in DURATION_II_RE);
##   only the named groups are meant to capture
(?<duration>
    \b
  (?:
   ## optional day name
   (?: (?<day_name1>#{DAY_NAMES})
      [ ]
   )?
   (?<month_name1>#{MONTH_NAMES})
      [ ]
   (?<day1>\d{1,2})
   ## optional year
   (?:  ,?   # optional comma
      [ ]
      (?<year1>\d{4})
   )?

   ## range separator - only dash (-) supported for now  (add + or .. or such - why??)
   [ ]* - [ ]*

   ## optional day name
   (?: (?<day_name2>#{DAY_NAMES})
      [ ]
   )?
   (?<month_name2>#{MONTH_NAMES})
      [ ]
   (?<day2>\d{1,2})
   ## optional year
   (?:  ,?   # optional comma
      [ ]
      (?<year2>\d{4})
   )?
  )
   \b
)}ix

DURATION_II_RE = variant ii add support for shorthand August 16-18, 2011 September 13-15, 2011 October 18-20, 2011 March 6-8 2012 March 6-8 - add support for August 16+17 or such (and check 16+18) use <op> to check if day2 is a plus or range or such - why? why not?

%r{
(?<duration>
    \b
   (?:
       (?<month_name1>#{MONTH_NAMES})
           [ ]
        (?<day1>\d{1,2})
             -
        (?<day2>\d{1,2})
          (?:
            ,?     ## optional comma
            [ ]
            (?<year1>\d{4})
          )?     ## optional year
   )
   \b
)}ix

DURATION_RE = map tables note: order matters; first come-first matched/served

Regexp.union(
   DURATION_I_RE,
   DURATION_II_RE,
)

INLINE_WO_RE = add support for WO or W-0 too - why? why not?

%r{
    (?<inline_wo>
        \b (?: w/o | W/O ) \b
)}x

INLINE_BYE_RE = note - NOT case insensitive

%r{
   (?<inline_bye>
       \b (?: bye | BYE ) \b
)}x

INLINE_NP_RE = A n/p B (note - basically a inline short form of A v B [cancelled] ) N/P

%r{
    (?<inline_np>
        \b (?: n/p | N/P ) \b
)}x

INLINE_ABD_RE = abd/abd. or aban/aban. [abandoned] ABD/ABAN

%r{
    (?<inline_abd>
        \b (?: abd\.? |
               aban\.? |
               ABD | ABAN
           )
   ## POSITIVE lookahead - requires space
          (?= [ ])
)}x

INLINE_SUSP_RE = susp/susp. [suspended] SUSP

%r{
    (?<inline_susp>
        \b (?: susp\.? |
                SUSP )
   ## POSITIVE lookahead - requires space
          (?= [ ])
)}x

INLINE_PPD_RE = ppd/ppd. or pst/pst. or pstp/pstp. or postp/postp. [postponed] PPD/PSTP/POSTP/P-P todo/check - add/allow p-p too - why? why not?

%r{
    (?<inline_ppd>
        \b (?: ppd\.? |
               pst\.? |
               po?stp\.? |
               PPD | PST | PO?STP | P-P
            )
   ## POSITIVE lookahead - requires space
          (?= [ ])
)}x

INLINE_VOID_RE = void via x-x X-X todo/check - only allow X-X - why? why not?

%r{
      (?<inline_void>
          \b (?: x-x |
                 X-X
             )
        ## POSITIVE lookahead - requires space
           (?= [ ])
)}x

INLINE_AWD_RE = awd/awd. [awarded] AWD note - recommendation is to always include score thus, use/prefer SCORE_AWD e.g. 0-3 awd

%r{
    (?<inline_awd>
        \b (?: awd\.? | AWD )
   ## POSITIVE lookahead - requires space
          (?= [ ])
)}x

INLINE_CANC_RE = canc/canc. [cancelled] CANC

%r{
    (?<inline_canc>
        \b (?: canc\.?  | CANC )
   ## POSITIVE lookahead - requires space
          (?= [ ])
)}x

PROP_LINEUP_RE =

Regexp.union(
   SPACES_RE,
   MINUTE_RE,   ## e.g.  44 or 44' or 45+1 or 45+1' etc.

   INLINE_CAPTAIN,  ## e.g. [c]
   INLINE_YELLOW,   ## e.g. [Y] or [Y 44] or [Y 44'] or [Y 45+1']
   INLINE_YELLOW_RED,  ## e.g. [Y/R] or [Y/R 78]
   INLINE_RED,         ## e.g. [R] or [R 42] or [R 42']

   PROP_KEY_INLINE_RE,
   PROP_NAME_RE,
   /  (?<sym>  [;,()\[\]-]) /x
   ## todo/fix - add ANY_RE here too!!!
)

PROP_PENALTIES_RE =

Regexp.union(
   SPACES_RE,
   SCORE_RE,               # e.g. 1-1 etc.
   ENCLOSED_NAME_RE,       # e.g. (save), (post), etc.
   PROP_NAME_RE,
    /  (?<sym>  [;,]) /x    ## add [] too - why? why not?
   ## todo/fix - add ANY_RE here too!!!
)

Class Method Summary collapse

._build_date(m) ⇒ Object

“internal” date helpers.
._build_date_legs(m) ⇒ Object
._build_duration(m) ⇒ Object
._build_goal_count(m) ⇒ Object
._build_goal_minute(m) ⇒ Object
._build_goal_minute_na(m) ⇒ Object
._build_goal_type(m) ⇒ Object
._build_minute(m) ⇒ Object
._build_score(m) ⇒ Object
._build_score_abd(m) ⇒ Object

score abandoned (abd/abd.).
._build_score_awd(m) ⇒ Object

score awarded (awd/awd.).
._build_score_full(m) ⇒ Object
._build_score_fuller(m) ⇒ Object
._build_score_fuller_more(m) ⇒ Object
._build_score_legs(m) ⇒ Object
._build_status(m) ⇒ Object
._build_time(m) ⇒ Object
._mk_score_fuller_agg(win:) ⇒ Object

regex score helpers note - MUST double escape \d e.g.
._mk_score_fuller_p(win:) ⇒ Object

with optional win - true|false.
._parse_date(str) ⇒ Object
._parse_goal_count(str) ⇒ Object
._parse_goal_minute(str) ⇒ Object

parse helpers.
._parse_score_full(str) ⇒ Object

add parser helpers.
._parse_team(str) ⇒ Object

helper for testing regex match for team names.
.build_map(lines, downcase: false) ⇒ Object
.build_names(lines) ⇒ Object
.parse_date(str, start: nil) ⇒ Object

note: parse_date - returns Date object _parse_date (with underscore) - return hash of “parsed” regex match data!!.
.parse_names(txt) ⇒ Object

Instance Method Summary collapse

#_build_date(m) ⇒ Object
#_build_date_legs(m) ⇒ Object
#_build_duration(m) ⇒ Object
#_build_goal_count(m) ⇒ Object
#_build_goal_minute(m) ⇒ Object
#_build_goal_minute_na(m) ⇒ Object
#_build_goal_type(m) ⇒ Object
#_build_minute(m) ⇒ Object
#_build_score(m) ⇒ Object
#_build_score_abd(m) ⇒ Object
#_build_score_awd(m) ⇒ Object
#_build_score_full(m) ⇒ Object
#_build_score_fuller(m) ⇒ Object
#_build_score_fuller_more(m) ⇒ Object
#_build_score_legs(m) ⇒ Object
#_build_status(m) ⇒ Object
#_build_time(m) ⇒ Object
#_info(*args) ⇒ Object
#_on_goal(m, ctx:) ⇒ Object
#_on_goal_alt(m, ctx:) ⇒ Object
#_on_goal_compat(m, ctx:) ⇒ Object

note - m is MatchData object.
#_on_group_def(m, ctx:) ⇒ Object

note - m is MatchData object.
#_on_prop_attendance(m, ctx:) ⇒ Object

note - m is MatchData object.
#_on_prop_cards(m, ctx:) ⇒ Object

note - m is MatchData object.
#_on_prop_lineup(m, ctx:) ⇒ Object

note - m is MatchData object.
#_on_prop_penalties(m, ctx:) ⇒ Object

note - m is MatchData object.
#_on_prop_referee(m, ctx:) ⇒ Object

note - m is MatchData object.
#_on_round_def(m, ctx:) ⇒ Object

note - m is MatchData object.
#_on_top(m, ctx:) ⇒ Object

note - m is MatchData object.
#_prep_doc(txt) ⇒ Object
#_prep_line(line) ⇒ Object

auto-fix checks line-by-line.
#_tokenize_line(line, lineno) ⇒ Object
#_trace(*args) ⇒ Object
#_warn(*args) ⇒ Object
#debug? ⇒ Boolean
#initialize(txt, debug: false) ⇒ Lexer constructor

A new instance of Lexer.
#log(msg) ⇒ Object
#tokenize_with_errors ⇒ Object

Constructor Details

#initialize(txt, debug: false) ⇒ `Lexer`

Returns a new instance of Lexer.

Raises:

(ArgumentError)

# File 'lib/sportdb/parser/lexer.rb', line 44

def initialize( txt, debug: false )
   ## the lexer works on a string only - reject anything else up-front
   unless txt.is_a?( String )
      raise ArgumentError, "text as string expected for lexer; got #{txt.class.name}"
   end

   ## keep the text to tokenize and the debug flag around
   @txt   = txt
   @debug = debug
end

Class Method Details

._build_date(m) ⇒ `Object`

“internal” date helpers

# File 'lib/sportdb/parser/token-date--helpers.rb', line 6

def self._build_date( m )
   ## turn the captures of a date match into a hash; absent captures are skipped
   parts = {}

   ## four-digit year (:y) and two-digit year (:yy, e.g. 25 or 78) are kept separate
   year = m[:year]
   parts[:y]  = year.to_i(10)   if year
   yy = m[:yy]
   parts[:yy] = yy.to_i(10)     if yy

   ## month - numeric first; a month name (if captured) is assigned after and wins
   ##   note - names may be in any case (JULY/Jul etc.), thus, ALWAYS downcase for lookup
   month = m[:month]
   parts[:m] = month.to_i(10)   if month
   month_name = m[:month_name]
   parts[:m] = MONTH_MAP[ month_name.downcase ]   if month_name

   day = m[:day]
   parts[:d] = day.to_i(10)     if day
   day_name = m[:day_name]
   parts[:wday] = DAY_MAP[ day_name.downcase ]    if day_name

   parts
end

._build_date_legs(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-date--helpers.rb', line 21

def self._build_date_legs( m )
   ## note - month names may be in any case (JULY/Jul etc.), thus, ALWAYS downcase for lookup

   ## leg 1 - month name and day are read unconditionally
   first = { m: MONTH_MAP[ m[:month_name1].downcase ],
             d: m[:day1].to_i(10) }

   ## leg 2 - month name only if captured; day is read unconditionally
   second = {}
   second[:m] = MONTH_MAP[ m[:month_name2].downcase ]   if m[:month_name2]
   second[:d] = m[:day2].to_i(10)

   { date1: first, date2: second }
end

._build_duration(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-date--helpers.rb', line 39

def self._build_duration( m )
   ## todo/check/fix - if end: works for kwargs!!!!!

   ## the start captures end in 1 (year1, day1, ..), the end captures in 2
   [[:start, 1], [:end, 2]].each_with_object( {} ) do |(side, no), duration|
      year       = m[:"year#{no}"]
      month_name = m[:"month_name#{no}"]
      day        = m[:"day#{no}"]
      day_name   = m[:"day_name#{no}"]

      ## only add what got captured; names are downcased for the lookup
      part = {}
      part[:y]    = year.to_i(10)                      if year
      part[:m]    = MONTH_MAP[ month_name.downcase ]   if month_name
      part[:d]    = day.to_i(10)                       if day
      part[:wday] = DAY_MAP[ day_name.downcase ]       if day_name

      duration[side] = part
   end
end

._build_goal_count(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-goals--helpers.rb', line 50

def self._build_goal_count( m )
    tally = {}
    tally[:count] = m[:value].to_i(10)   if m[:value]

    ## og/pen are flags; the number that may come along is optional (1 if absent)
    if m[:og]
       tally[:og] = m[:og_value] ? m[:og_value].to_i(10) : 1
    end
    if m[:pen]
       tally[:pen] = m[:pen_value] ? m[:pen_value].to_i(10) : 1
    end

    tally
end

._build_goal_minute(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-goals--helpers.rb', line 6

def self._build_goal_minute( m )
    ## the minute value is always required
    minute = { m: m[:value].to_i(10) }

    ## stoppage/injury time (offset)
    minute[:offset] = m[:value2].to_i(10)   if m[:value2]

    ## goal type flags - hash key => capture name
    { og: :og, pen: :pen, freekick: :fk, header: :hdr }.each do |flag, capture|
       minute[flag] = true   if m[capture]
    end

    minute[:secs] = m[:secs].to_i(10)   if m[:secs]

    minute
end

._build_goal_minute_na(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-goals--helpers.rb', line 24

def self._build_goal_minute_na( m )
    ## minute not available - uses '?' as placeholder
    ##   (or use nil or 999 or -1 or ???)
    minute = { m: '?' }

    ## goal type flags - hash key => capture name
    { og: :og, pen: :pen, freekick: :fk, header: :hdr }.each do |flag, capture|
       minute[flag] = true   if m[capture]
    end

    minute
end

._build_goal_type(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-goals--helpers.rb', line 58

def self._build_goal_type( m )
    ## goal type flags - hash key => capture name; only set if captured
    { og: :og, pen: :pen, freekick: :fk, header: :hdr }.each_with_object( {} ) do |(flag, capture), goal|
       goal[flag] = true   if m[capture]
    end
end

._build_minute(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-goals--helpers.rb', line 39

def self._build_minute( m )
    ## the minute value is always required
    base  = m[:value].to_i(10)

    ## stoppage/injury time (offset) is optional
    extra = m[:value2]

    extra ? { m: base, offset: extra.to_i(10) } : { m: base }
end

._build_score(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-score--helpers.rb', line 5

def self._build_score( m )
   ##  note - score is "generic", that is,
   ##      might be full-time (ft) or after extra-time (aet) or such
   ##      or even undecided/unknown
   ##    thus, the captures are score1/score2 and NOT ft1/ft2
   ##      and the result is (simply an) array e.g. [1,2]
   ##          and NOT a hash (table) e.g. { ft: [1,2] } !!!
   %i[score1 score2].map { |capture| m[capture].to_i(10) }
end

._build_score_abd(m) ⇒ `Object`

score abandoned (abd/abd.)

# File 'lib/sportdb/parser/token-score--helpers.rb', line 30

def self._build_score_abd( m )      # score abandoned (abd/abd.)
   ### note - returns a "generic" score (array) for now
   ##    add score[:abd] = true ???
   ##    note - for now uses its own token e.g SCORE_ABD
   home = m[:score1].to_i(10)
   away = m[:score2].to_i(10)

   [home, away]
end

._build_score_awd(m) ⇒ `Object`

score awarded (awd/awd.)

# File 'lib/sportdb/parser/token-score--helpers.rb', line 20

def self._build_score_awd( m )    # score awarded (awd/awd.)
   ### note - returns a "generic" score (array) for now
   ##         to match  A 3-0 B [awarded] etc.
   ##    add score[:awarded] = true ???
   ##    note - for now uses its own token e.g SCORE_AWD
   %i[score1 score2].map do |capture|
      m[capture].to_i(10)
   end
end

._build_score_full(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-score--helpers.rb', line 40

def self._build_score_full( m )
   score = {}

   ## penalties (p), extra-time (et), full-time (ft), half-time (ht)
   ##   note - a pair is only added if BOTH sides got captured
   %w[p et ft ht].each do |kind|
      left, right = :"#{kind}1", :"#{kind}2"
      next unless m[left] && m[right]

      score[kind.to_sym] = [m[left].to_i(10), m[right].to_i(10)]
   end

   ## add golden/silver flags
   score[:golden] = true   if m[:aetgg]  ## golden goal (gg)/sudden death (sd)
   score[:silver] = true   if m[:aetsg]  ## silver goal (sg)

   score
end

._build_score_fuller(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-score--helpers.rb', line 58

def self._build_score_fuller( m )
   score = {}

   ## penalties (p), extra-time (et), full-time (ft), half-time (ht), aggregate (agg)
   ##   note - a pair is only added if BOTH sides got captured
   %w[p et ft ht agg].each do |kind|
      left, right = :"#{kind}1", :"#{kind}2"
      next unless m[left] && m[right]

      score[kind.to_sym] = [m[left].to_i(10), m[right].to_i(10)]
   end

   ## away - score pair if captured; otherwise fallback to the away flag (if any)
   if m[:away1] && m[:away2]
      score[:away] = [m[:away1].to_i(10), m[:away2].to_i(10)]
   elsif m[:away]
      score[:away] = true
   end

   ## add golden/silver flags
   score[:golden] = true   if m[:aetgg]  ## golden goal (gg)/sudden death (sd)
   score[:silver] = true   if m[:aetsg]  ## silver goal (sg)

   score
end

._build_score_fuller_more(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-score--helpers.rb', line 86

def self._build_score_fuller_more( m )
   ##    SCORE + SCORE_FULLER_MORE
   ## note -  after extra-time (aet) or full-time (ft)
   ##           score may be present in SCORE!!!
   score = {}

   ## penalties (p), extra-time (et), full-time (ft), half-time (ht), aggregate (agg)
   ##   note - a pair is only added if BOTH sides got captured
   %w[p et ft ht agg].each do |kind|
      left, right = :"#{kind}1", :"#{kind}2"
      next unless m[left] && m[right]

      score[kind.to_sym] = [m[left].to_i(10), m[right].to_i(10)]
   end

   ## away - score pair if captured; otherwise fallback to the away flag (if any)
   if m[:away1] && m[:away2]
      score[:away] = [m[:away1].to_i(10), m[:away2].to_i(10)]
   elsif m[:away]
      score[:away] = true
   end

   ## add golden/silver flags
   score[:golden] = true   if m[:aetgg]  ## golden goal (gg)/sudden death (sd)
   score[:silver] = true   if m[:aetsg]  ## silver goal (sg)

   ## add flag in score for et/ft/ht
   ##    used for "dangling" (generic) score
   ##    note - checked in order et, ft, ht; the last one captured wins
   tag = nil
   tag = 'et'   if m[:aet] || m[:aetgg] || m[:aetsg]
   tag = 'ft'   if m[:ft]
   tag = 'ht'   if m[:ht]
   score[:score] = tag   if tag

   score
end

._build_score_legs(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-score--helpers.rb', line 123

def self._build_score_legs( m )
   ############
   ### build leg1 (score) - full-time is read unconditionally
   leg1 = { ft: [m[:leg1_ft1].to_i(10),
                 m[:leg1_ft2].to_i(10)] }

   ##################
   ### build leg2 (score) - full-time (ft), extra-time (et), penalties (p)
   ##     note - a pair is only added if BOTH sides got captured
   leg2 = {}
   %w[ft et p].each do |kind|
      left, right = :"leg2_#{kind}1", :"leg2_#{kind}2"
      next unless m[left] && m[right]

      leg2[kind.to_sym] = [m[left].to_i(10), m[right].to_i(10)]
   end

   ## note - the legs use string keys; agg & away use symbol keys
   legs = { 'leg1' => leg1, 'leg2' => leg2 }

   ## check for (opt) aggregate - keep on "top-level"
   if m[:agg1] && m[:agg2]
      legs[:agg] = [m[:agg1].to_i(10), m[:agg2].to_i(10)]
   end
   legs[:away] = true   if m[:away]

   legs
end

._build_status(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-status.rb', line 100

def self._build_status( m )
   ## note - norm status text - why? why not?
   ##   capture name => norm(alized) text; checked in order, first capture present wins
   ##   note - annulled and voided both map to 'annulled'
   norms = [[:postponed, 'postponed'],
            [:canceled,  'canceled'],
            [:walkover,  'walkover'],
            [:awarded,   'awarded'],
            [:suspended, 'suspended'],
            [:abandoned, 'abandoned'],
            [:annulled,  'annulled'],
            [:voided,    'annulled'],
            [:replay,    'replay']]

   hit = norms.find { |capture, _text| m[capture] }

   ## fallback on "generic" status (shouldn't happen)
   status = { status: hit ? hit[1] : m[:status] }

   ## includes note? e.g.  awarded; originally 2-0
   status[:status_note] = m[:status_note]   if m[:status_note]

   status
end

._build_time(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-time.rb', line 96

def self._build_time( m )
   ## build the time data (hash) from a time match
   ##
   ##  m - match data; reads the captures
   ##        hour, minute, timezone (opt) and
   ##        time_local, local_hour, local_minute, local_timezone (all opt)
   ##      plus time (for the error message only)
   ##
   ##  returns a hash e.g.
   ##     { time:       { h: 18, m: 30 },
   ##       time_local: { h: 19, m: 30, timezone: 'BST' } }
   ##    note - time_local is only present if captured
   ##
   ##  raises ArgumentError if hour/minute of the time or
   ##     of the local time is out-of-range

   ## unify to iso-format
   ###   12.40 => 12:40
   ##    12h40 => 12:40 etc.
   ##  keep hour/minute as numbers (no time-only type in ruby)
   data = { time: {} }

   hour     = m[:hour].to_i(10)  ## allow 08/07/etc.
   minute   = m[:minute].to_i(10)

   ##   check if 24:00 possible? or only 0:00 (23:59)
   unless hour.between?( 0, 23 ) && minute.between?( 0, 59 )
      raise ArgumentError, "parse error - time >#{m[:time]}< out-of-range"
   end

   data[:time][:h] = hour
   data[:time][:m] = minute
   data[:time][:timezone] = m[:timezone]    if m[:timezone]


   ## check if local time present e.g.
   ##    18:30 (19:30)
   ##    18:30 (19:30 BST)  etc.
   if m[:time_local]
      data[:time_local] = {}

      local_hour     = m[:local_hour].to_i(10)  ## allow 08/07/etc.
      local_minute   = m[:local_minute].to_i(10)

      ##   check if 24:00 possible? or only 0:00 (23:59)
      ##   fix - range check MUST use the local hour/minute
      ##           (was checking hour/minute of the "main" time again)
      unless local_hour.between?( 0, 23 ) && local_minute.between?( 0, 59 )
         raise ArgumentError, "parse error - local time >#{m[:time_local]}< out-of-range"
      end

      data[:time_local][:h] = local_hour
      data[:time_local][:m] = local_minute
      data[:time_local][:timezone] = m[:local_timezone]    if m[:local_timezone]
   end

   data
end

._mk_score_fuller_agg(win:) ⇒ `Object`

regex score helpers

note - MUST double escape \d e.g. \\d!!!   if not "simple" string (e.g. '' but %Q<>)

# File 'lib/sportdb/parser/token-score_fuller.rb', line 24

## build the regex source text (a string, NOT a Regexp) for the aggregate (agg) score
##   with three alternatives:
##     opt 1)  score followed by "on agg" or "on aggregate"
##     opt 2)  "classic" (post) - score followed by the interpolated AGG_EN
##     opt 3)  agg up-front (pre) - "agg" followed by score
##
##   win - if true, adds an optional leading "win " to opt 1); no extra text otherwise
##
##   note - \d is double escaped (\\d) because the %Q<> string processes backslash escapes
##   note - the text includes [ ] for spaces and ## comments, thus,
##            presumably gets used inside an extended (x) regex - confirm at the call site
def self._mk_score_fuller_agg( win: )    ## with optional win - true|false
   %Q<
                 (?:
                    ############
                    ## opt 1)  with win
                    (?:
                       #{ win ? '(?: win [ ] )?' : '' }   
                        (?<agg1>\\d{1,2}) - (?<agg2>\\d{1,2})
                          [ ] on [ ] agg (?: regate )?  
                    )
                    |        
                    #####
                    ## opt 2)  "classic" (post)
                    (?:
                       (?<agg1>\\d{1,2}) - (?<agg2>\\d{1,2})
                          [ ]*
                        #{AGG_EN}   
                    )
                    |
                    #####
                    ## opt 3) agg up-front (pre)
                    (?:
                         agg [ ]
                       (?<agg1>\\d{1,2}) - (?<agg2>\\d{1,2})   
                    )
                 )
    >
end

._mk_score_fuller_p(win:) ⇒ `Object`

with optional win - true|false

# File 'lib/sportdb/parser/token-score_fuller.rb', line 53

## build the regex source text (a string, NOT a Regexp) for the penalties (p) score
##   with three alternatives:
##     opt 1)  score followed by "on pens"
##     opt 2)  "classic" (post) - score followed by the interpolated P_EN
##     opt 3)  up-front (pre) - "pen" or "p" followed by score
##
##   win - if true, adds an optional leading "win " to opt 1); no extra text otherwise
##
##   note - \d is double escaped (\\d) because the %Q<> string processes backslash escapes
##   note - the text includes [ ] for spaces and ## comments, thus,
##            presumably gets used inside an extended (x) regex - confirm at the call site
def self._mk_score_fuller_p( win: )    ## with optional win - true|false
   %Q<
                 (?:
                    ############
                    ## opt 1)  with win
                    (?:
                        #{ win ? '(?: win [ ] )?' : '' }
                        (?<p1>\\d{1,2}) - (?<p2>\\d{1,2})
                          [ ] on [ ] pens
                    )
                    |        
                    #####
                    ## opt 2)  "classic" (post)
                    (?:
                       (?<p1>\\d{1,2}) - (?<p2>\\d{1,2})
                          [ ]*
                        #{P_EN}   
                    )
                    |
                    #####
                    ## opt 3) up-front (pre)
                    (?:
                         (?: pen|p) [ ]
                       (?<p1>\\d{1,2}) - (?<p2>\\d{1,2})   
                    )
                 )                   
    >
end

._parse_date(str) ⇒ `Object`

# File 'lib/sportdb/parser/token-date--helpers.rb', line 111

def self._parse_date( str )
    ## note - strip - leading/trailing spaces automatic - why? why not?
    m = DATE_RE.match( str.strip )

    ## no match - return nil
    return nil   if m.nil?

    ## note - match BUT not anchored to start and end-of-string!!!
    ##  report, error somehow??
    return nil   unless m.pre_match == '' && m.post_match == ''

    ## return hash table with captured components
    _build_date( m )
end

._parse_goal_count(str) ⇒ `Object`

# File 'lib/sportdb/parser/token-goals--helpers.rb', line 95

def self._parse_goal_count( str )
    ## note - strip - leading/trailing spaces
    m = GOAL_COUNT_RE.match( str.strip )

    ## no match - return nil
    return nil   if m.nil?

    ## note - match BUT not anchored to start and end-of-string!!!
    ##  report, error somehow??
    return nil   unless m.pre_match == '' && m.post_match == ''

    _build_goal_count( m )
end

._parse_goal_minute(str) ⇒ `Object`

parse helpers

# File 'lib/sportdb/parser/token-goals--helpers.rb', line 81

def self._parse_goal_minute( str )
    ## note - strip - leading/trailing spaces
    m = GOAL_MINUTE_RE.match( str.strip )

    ## no match - return nil
    return nil   if m.nil?

    ## note - match BUT not anchored to start and end-of-string!!!
    ##  report, error somehow??
    return nil   unless m.pre_match == '' && m.post_match == ''

    _build_goal_minute( m )
end

._parse_score_full(str) ⇒ `Object`

add parser helpers

# File 'lib/sportdb/parser/token-score--helpers.rb', line 167

def self._parse_score_full( str )
    ## parse a full score; str MUST match in full (start to end-of-string)
    ##
    ##   returns the score hash built by _build_score_full or
    ##           nil if no match or if only a part of str matches
    ##
    ## note - strip - leading/trailing spaces automatic - why? why not?

    m = Regexp.union(
              SCORE_FULL_1ST_RE,
              SCORE_FULL_RE ).match( str.strip )

    if m && m.pre_match == '' && m.post_match == ''
       ## fix - removed debug leftover (pp m) that printed
       ##         the match data to stdout on every successful parse
       _build_score_full( m )
    elsif  m
        ## note - match BUT not anchored to start and end-of-string!!!
        ##  report, error somehow??
      nil
    else
      nil  ## no match - return nil
    end
end

._parse_team(str) ⇒ `Object`

helper for testing regex match for team names

# File 'lib/sportdb/parser/token-text.rb', line 205

def self._parse_team( str )
    ## note - strip - leading/trailing spaces
    m = TEXT_RE.match( str.strip )

    ## no match - return nil
    return nil   if m.nil?

    ## note - match BUT not anchored to start and end-of-string!!!
    ##  report, error somehow??
    return nil   unless m.pre_match == '' && m.post_match == ''

    ## note - returns the match data (as is)
    m
end

.build_map(lines, downcase: false) ⇒ `Object`

# File 'lib/sportdb/parser/token-date--names.rb', line 39

def self.build_map( lines, downcase: false )
  ## build a lookup map that maps every word to its line no (index plus 1) e.g.
  ##  {"january" => 1,  "jan" => 1,
  ##   "february" => 2, "feb" => 2,
  ##   "march" => 3,    "mar" => 3, ...
  ##
  ##  note: words only get downcased if downcase is true
  map = {}
  lines.each_with_index do |names, idx|
    no = idx + 1    ## note: start mapping with 1 (and NOT zero-based, that is, 0)
    names.each do |name|
      key = downcase ? name.downcase : name
      map[ key ] = no
    end
  end
  map
end

.build_names(lines) ⇒ `Object`

# File 'lib/sportdb/parser/token-date--names.rb', line 32

def self.build_names( lines )
  ## join the words of every line and then all lines into a single string e.g.
  ##   January|Jan|February|Feb|March|Mar|April|Apr|May|June|Jun|...
  per_line = lines.each_with_object( [] ) do |words, acc|
    acc << words.join( '|' )
  end
  per_line.join( '|' )
end

.parse_date(str, start: nil) ⇒ `Object`

note: parse_date - returns Date object

_parse_date (with underscore) - return  hash of "parsed" regex match data!!

# File 'lib/sportdb/parser/token-date--helpers.rb', line 72

def self.parse_date( str, start: nil )
    ## note - _parse_date (with underscore) returns the hash of "parsed" date parts
    ##          or nil if str is not a date
    parts = _parse_date( str )
    raise ArgumentError, "unexpected date format; cannot parse >#{str}<"   unless parts

    month = parts[:m]
    day   = parts[:d]
    year  = parts[:y]
    yy    = parts[:yy]

    ####
    ## support two digit shortcut for year
    ##   for now assume 00,01 to 30 is 2000,2001 to 2030
    ##     and          31 to 99   is  1931 to 1999
    if yy && year.nil?
       year = yy <= 30 ? 2000+yy : 1900+yy
    end

    ## no year at all - try to calculate the year using the start date
    if year.nil?
       raise ArgumentError, "year required in date >#{str}< or pass along start date"   if start.nil?

       on_or_after_start = month > start.month ||
                           (month == start.month && day >= start.day)

       ## assume same year as start_at event (e.g. 2013 for 2013/14 season)
       ##   or   year+1  as start_at event (e.g. 2014 for 2013/14 season)
       year = on_or_after_start ? start.year : start.year+1
    end

    Date.new( year, month, day )
end

.parse_names(txt) ⇒ `Object`

# File 'lib/sportdb/parser/token-date--names.rb', line 5

def self.parse_names( txt )
  ## returns an array of lines (with words)
  txt.each_line.each_with_object( [] ) do |raw, lines|
    line = raw.strip

    ## skip blank lines and comment lines
    next if line.empty? || line.start_with?( '#' )

    ## strip inline (until end-of-line) comments too
    ##   e.g. Janvier  Janv  Jan  ## check janv in use??
    ##   =>   Janvier  Janv  Jan
    ## and split into words on space(s) or tab(s)
    words = line.sub( /#.*/, '' ).strip.split( /[ \t]+/ )

    ## todo/fix -- add check for duplicates
    lines << words
  end
end

Instance Method Details

#_build_date(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-date--helpers.rb', line 59

def _build_date( m ) self.class._build_date( m ); end

#_build_date_legs(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-date--helpers.rb', line 60

def _build_date_legs( m ) self.class._build_date_legs( m ); end

#_build_duration(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-date--helpers.rb', line 61

def _build_duration( m ) self.class._build_duration( m ); end

#_build_goal_count(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-goals--helpers.rb', line 71

def _build_goal_count( m ) self.class._build_goal_count( m ); end

#_build_goal_minute(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-goals--helpers.rb', line 68

def _build_goal_minute( m ) self.class._build_goal_minute( m ); end

#_build_goal_minute_na(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-goals--helpers.rb', line 69

def _build_goal_minute_na( m ) self.class._build_goal_minute_na( m ); end

#_build_goal_type(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-goals--helpers.rb', line 72

def _build_goal_type( m ) self.class._build_goal_type( m ); end

#_build_minute(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-goals--helpers.rb', line 70

def _build_minute( m ) self.class._build_minute( m ); end

#_build_score(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-score--helpers.rb', line 153

def _build_score( m ) self.class._build_score( m ); end

#_build_score_abd(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-score--helpers.rb', line 155

def _build_score_abd( m ) self.class._build_score_abd( m ); end

#_build_score_awd(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-score--helpers.rb', line 154

def _build_score_awd( m ) self.class._build_score_awd( m ); end

#_build_score_full(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-score--helpers.rb', line 156

def _build_score_full( m ) self.class._build_score_full( m ); end

#_build_score_fuller(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-score--helpers.rb', line 157

def _build_score_fuller( m ) self.class._build_score_fuller( m ); end

#_build_score_fuller_more(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-score--helpers.rb', line 158

def _build_score_fuller_more( m ) self.class._build_score_fuller_more( m ); end

#_build_score_legs(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-score--helpers.rb', line 159

def _build_score_legs( m ) self.class._build_score_legs( m ); end

#_build_status(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-status.rb', line 121

def _build_status( m ) self.class._build_status( m ); end

#_build_time(m) ⇒ `Object`

# File 'lib/sportdb/parser/token-time.rb', line 139

def _build_time(m) self.class._build_time(m); end

#_info(*args) ⇒ `Object`

# File 'lib/sportdb/parser/lexer.rb', line 32

def _info( *args )
  ## print an info message to stdout -
  ##   the [INFO] prefix followed by every arg on a line of its own
  print "[INFO] Lexer -- "
  ## fix - print the (single) arg and NOT all args again and again
  ##         (was puts args, that is, n args printed n times)
  args.each { |arg| puts arg }
end

#_on_goal(m, ctx:) ⇒ `Object`

# File 'lib/sportdb/parser/lexer-on_goal.rb', line 19

def _on_goal( m, ctx: )
   ## map a match (data) in goal mode to a token;
   ##   returns nil for space(s) and if nothing known got captured (with a warning)

   ## skip space(s)
   return nil   if m[:space] || m[:spaces]

   ## note - eats-up semicolon!! e.g. -; or - ;
   ##   was:[:GOALS_NONE,"<|GOALS_NONE|>"]  - use literal text!!
   if m[:goals_none]
      return Token.new( :GOALS_NONE, m[:goals_none],
                          lineno: ctx.lineno, offset: m.offset(:goals_none))
   end

   ## was: [:GOAL_SEP_ALT, "<|GOAL_SEP_ALT|>" ]   ## e.g. dash (-) WITH leading & trailing space required
   if m[:goal_sep_alt]
      return Token.new( :GOAL_SEP_ALT, m[:goal_sep_alt],
                          lineno: ctx.lineno, offset: m.offset(:goal_sep_alt))
   end

   ## note - change prop_name to player
   if m[:prop_name]
      return Token.new( :PLAYER, m[:name],
                          lineno: ctx.lineno, offset: m.offset(:name))
   end

   if m[:goal_minute]
      return Token.new( :GOAL_MINUTE, m[:goal_minute],
                          lineno: ctx.lineno, offset: m.offset(:goal_minute),
                          value: _build_goal_minute( m ))
   end

   ## note -  (re)use GOAL_MINUTE token; no extra GOAL_MINUTE_NA or such - why? why not?
   ##          make sure to handle the minute placeholder from _build_goal_minute_na upstream!!!
   ##                     change to  999 or -1 or such - why? why not?
   if m[:goal_minute_na]
      return Token.new( :GOAL_MINUTE, m[:goal_minute_na],
                          lineno: ctx.lineno, offset: m.offset(:goal_minute_na),
                          value: _build_goal_minute_na( m ))
   end

   if m[:goal_count]
      return Token.new( :GOAL_COUNT, m[:goal_count],
                          lineno: ctx.lineno, offset: m.offset(:goal_count),
                          value: _build_goal_count( m ))
   end

   if m[:sym]
      ## any symbol but the closing paren passes through as a literal
      unless m[:sym] == ')'
         return Token.literal( m[:sym], lineno: ctx.lineno, offset: m.offset(:sym))
      end

      ## closing paren - leave goal mode!!
      _trace( "LEAVE GOAL_RE MODE" )
      @re = RE
      ##  note - use/return GOAL_END token   - change to GOAL_END_PAREN(THESIS)
      ##                                or GOAL_PAREN_CLOSE/END ???
      ##   fix - use ) too - why? why not?
      ## was: [:GOALS_END, '<|GOALS_END|>']
      return Token.virtual( :GOALS_END, lineno: ctx.lineno  )
   end

   ctx.warn_on_else( m, mode: 'GOAL' )
   nil
end

#_on_goal_alt(m, ctx:) ⇒ `Object`

# File 'lib/sportdb/parser/lexer-on_goal.rb', line 82

def _on_goal_alt( m, ctx: )
   ## map a match (data) in goal alt(ernate) mode to a token;
   ##   returns nil for space(s) and if nothing known got captured (with a warning)

   ## skip space(s)
   return nil   if m[:space] || m[:spaces]

   ## note - change prop_name to player
   if m[:prop_name]
      return Token.new( :PLAYER, m[:name],
                          lineno: ctx.lineno, offset: m.offset(:name))
   end

   if m[:goal_minute]
      return Token.new( :GOAL_MINUTE, m[:goal_minute],
                          lineno: ctx.lineno, offset: m.offset(:goal_minute),
                          value: _build_goal_minute( m ))
   end

   if m[:goal_type]
      return Token.new( :GOAL_TYPE, m[:goal_type],
                          lineno: ctx.lineno, offset: m.offset(:goal_type),
                          value: _build_goal_type( m ))
   end

   if m[:score]
      return Token.new( :SCORE, m[:score],
                          lineno: ctx.lineno, offset: m.offset(:score),
                          value: _build_score( m ))
   end

   if m[:sym]
      ## any symbol but the closing paren passes through as a literal
      unless m[:sym] == ')'
         return Token.literal( m[:sym], lineno: ctx.lineno, offset: m.offset(:sym))
      end

      ## closing paren - leave goal mode!!
      _trace( "LEAVE GOAL_ALT_RE MODE" )
      @re = RE
      ##  note - use/return GOAL_END token   - change to GOAL_END_PAREN(THESIS)
      ##                                or GOAL_PAREN_CLOSE/END ???
      ## [:GOALS_END, '<|GOALS_END|>']
      return Token.virtual( :GOALS_END, lineno: ctx.lineno  )
   end

   ctx.warn_on_else( m, mode: 'GOAL_ALT' )
   nil
end

#_on_goal_compat(m, ctx:) ⇒ `Object`

note - m is MatchData object

# File 'lib/sportdb/parser/lexer-on_goal.rb', line 132

def _on_goal_compat( m, ctx: )      ## note - m is MatchData object
   ## map a match (data) in goal compat mode to a token;
   ##   returns nil for space(s) and if nothing known got captured (with a warning)

   ## skip space(s)
   return nil   if m[:space] || m[:spaces]

   ## note - change prop_name to player
   if m[:prop_name]
      return Token.new( :PLAYER, m[:name],
                          lineno: ctx.lineno, offset: m.offset(:name))
   end

   if m[:minute]
      return Token.new( :MINUTE, m[:minute],
                          lineno: ctx.lineno, offset: m.offset(:minute),
                          value: _build_minute( m ))
   end

   if m[:goal_type]
      return Token.new( :GOAL_TYPE, m[:goal_type],
                          lineno: ctx.lineno, offset: m.offset(:goal_type),
                          value: _build_goal_type( m ))
   end

   if m[:score]
      return Token.new( :SCORE, m[:score],
                          lineno: ctx.lineno, offset: m.offset(:score),
                          value: _build_score( m ))
   end

   if m[:sym]
      ## any symbol but the closing paren passes through as a literal
      unless m[:sym] == ')'
         return Token.literal( m[:sym], lineno: ctx.lineno, offset: m.offset(:sym))
      end

      ## closing paren - leave goal mode!!
      _trace( "LEAVE GOAL_COMPAT_RE MODE" )
      @re = RE
      ##  note - use/return GOAL_END token   - change to GOAL_END_PAREN(THESIS)
      ##                                or GOAL_PAREN_CLOSE/END ???
      ## [:GOALS_END, '<|GOALS_END|>']
      return Token.virtual( :GOALS_END, lineno: ctx.lineno  )
   end

   ctx.warn_on_else( m, mode: 'GOAL_COMPAT' )
   nil
end

#_on_group_def(m, ctx:) ⇒ `Object`

note - m is MatchData object

# File 'lib/sportdb/parser/lexer-on_group_def.rb', line 14

##
# Turns one match made while the lexer is in GROUP_DEF_RE mode
# into a token.
#
# m   - MatchData object of the current match
# ctx - per-line context (lineno, warning sink)
#
# Returns a TEAM token for text, a literal token for symbols and
# nil otherwise (whitespace or an unhandled match).
def _on_group_def( m, ctx: )
  token = nil

  if m[:spaces] || m[:space]
    ## skip spaces - nothing to emit
  elsif m[:text]
    token = Token.new( :TEAM, m[:text],
                       lineno: ctx.lineno, offset: m.offset(:text) )
  elsif m[:sym]
    token = Token.literal( m[:sym],
                           lineno: ctx.lineno, offset: m.offset(:sym) )
  else
    ctx.warn_on_else( m, mode: 'GROUP_DEF' )
  end

  token
end

#_on_prop_attendance(m, ctx:) ⇒ `Object`

note - m is MatchData object

# File 'lib/sportdb/parser/lexer-on_prop_misc.rb', line 45

##
# Turns one match made while the lexer is in PROP_ATTENDANCE_RE mode
# into a token.
#
# m   - MatchData object of the current match
# ctx - per-line context (lineno, warning sink)
#
# Returns an ENCLOSED_NAME or PROP_NUM token, or nil for whitespace
# and unhandled matches.
def _on_prop_attendance( m, ctx: )
  return nil   if m[:space] || m[:spaces]    ## skip space(s)

  if m[:enclosed_name]
    ## reserved for sold out or such (in the future) - why? why not?
    return Token.new( :ENCLOSED_NAME, m[:name],
                      lineno: ctx.lineno, offset: m.offset(:name) )
  end

  if m[:num]
    ## the token text is the full num match;
    ##   the value capture is converted to an integer
    return Token.new( :PROP_NUM, m[:num],
                      lineno: ctx.lineno, offset: m.offset(:num),
                      value: m[:value].to_i(10) )
  end

  ctx.warn_on_else( m, mode: 'PROP_ATTENDANCE' )
  nil
end

#_on_prop_cards(m, ctx:) ⇒ `Object`

note - m is MatchData object

# File 'lib/sportdb/parser/lexer-on_prop_misc.rb', line 17

##
# Turns one match made while the lexer is in PROP_CARDS_RE mode
# into a token.
#
# m   - MatchData object of the current match
# ctx - per-line context (lineno, warning sink)
#
# Returns a PROP_NAME, MINUTE or literal symbol token, or nil for
# whitespace and unhandled matches.
def _on_prop_cards( m, ctx: )
  return nil   if m[:space] || m[:spaces]    ## skip space(s)

  if m[:prop_name]
    ## note - token text is the name capture,
    ##          offset is the one of the whole prop_name match
    return Token.new( :PROP_NAME, m[:name],
                      lineno: ctx.lineno, offset: m.offset(:prop_name) )
  end

  if m[:minute]
    return Token.new( :MINUTE, m[:minute],
                      lineno: ctx.lineno, offset: m.offset(:minute),
                      value: _build_minute( m ) )
  end

  if m[:sym]
    return Token.literal( m[:sym],
                          lineno: ctx.lineno, offset: m.offset(:sym) )
  end

  ctx.warn_on_else( m, mode: 'PROP_CARDS' )
  nil
end

#_on_prop_lineup(m, ctx:) ⇒ `Object`

note - m is MatchData object

# File 'lib/sportdb/parser/lexer-on_prop_lineup.rb', line 22

##
# Turns one match made while the lexer is in PROP_LINEUP_RE mode
# into a token.
#
# m   - MatchData object of the current match
# ctx - per-line context (lineno, warning sink)
#
# Returns a Token, or nil for whitespace, for an inline prop key
# that is not supported and for unhandled matches.
def _on_prop_lineup( m, ctx: )
  return nil   if m[:space] || m[:spaces]    ## skip space(s)

  ## inline prop keys - supported for now coach/trainer (add manager?)
  if m[:prop_key]
    key = m[:key]
    unless ['coach', 'trainer'].include?( key.downcase )
      ## todo - report error for unknown (inline) prop key in lineup
      return nil
    end
    ## use PROP_COACH or COACH_KEY or such - why? why not?
    return Token.new( :COACH, m[:key],
                      lineno: ctx.lineno, offset: m.offset(:key) )
  end

  if m[:inline_captain]
    return Token.new( :INLINE_CAPTAIN, m[:inline_captain],
                      lineno: ctx.lineno, offset: m.offset(:inline_captain) )
  end

  ## inline cards (checked in this order) all carry the same value hash
  ##   with the optional minute (:m) and the optional :offset
  [[:inline_yellow,     :INLINE_YELLOW],
   [:inline_red,        :INLINE_RED],
   [:inline_yellow_red, :INLINE_YELLOW_RED]].each do |group, type|
    next  unless m[group]

    card = {}
    card[:m]      = m[:minute].to_i(10)  if m[:minute]
    card[:offset] = m[:offset].to_i(10)  if m[:offset]
    return Token.new( type, m[group],
                      lineno: ctx.lineno, offset: m.offset(group),
                      value: card )
  end

  if m[:prop_name]
    return Token.new( :PROP_NAME, m[:name],
                      lineno: ctx.lineno, offset: m.offset(:prop_name) )
  end

  if m[:minute]
    return Token.new( :MINUTE, m[:minute],
                      lineno: ctx.lineno, offset: m.offset(:minute),
                      value: _build_minute( m ) )
  end

  if m[:sym]
    return Token.literal( m[:sym],
                          lineno: ctx.lineno, offset: m.offset(:sym) )
  end

  ctx.warn_on_else( m, mode: 'PROP_LINEUP' )
  nil
end

#_on_prop_penalties(m, ctx:) ⇒ `Object`

note - m is MatchData object

# File 'lib/sportdb/parser/lexer-on_prop_penalties.rb', line 16

##
# Turns one match made while the lexer is in PROP_PENALTIES_RE mode
# into a token.
#
# m   - MatchData object of the current match
# ctx - per-line context; read for lineno and used to report
#       matches that no branch below handles
#
# Returns a PROP_NAME, ENCLOSED_NAME, SCORE or literal symbol token,
# or nil for whitespace and unhandled matches.
def _on_prop_penalties( m, ctx: )      ## note - m is MatchData object
  if m[:space] || m[:spaces]
    nil    ## skip space(s)
  elsif m[:prop_name]    ## note - change prop_name to player
    ## token text is the name capture,
    ##   offset is the one of the whole prop_name match
    Token.new( :PROP_NAME, m[:name],
               lineno: ctx.lineno, offset: m.offset(:prop_name) )
  elsif m[:enclosed_name]
    ## use HOLD,SAVE,POST or such keys - why? why not?
    Token.new( :ENCLOSED_NAME, m[:name],
               lineno: ctx.lineno, offset: m.offset(:name) )
  elsif m[:score]
    Token.new( :SCORE, m[:score],
               lineno: ctx.lineno, offset: m.offset(:score),
               value: _build_score( m ) )
  elsif m[:sym]
    Token.literal( m[:sym], lineno: ctx.lineno, offset: m.offset(:sym) )
  else
    ## note - mode label without trailing space
    ##          (same shape as the labels of the other handlers)
    ctx.warn_on_else( m, mode: 'PROP_PENALTIES' )
    nil
  end
end

#_on_prop_referee(m, ctx:) ⇒ `Object`

note - m is MatchData object

# File 'lib/sportdb/parser/lexer-on_prop_misc.rb', line 75

##
# Turns one match made while the lexer is in PROP_REFEREE_RE mode
# into a token.
#
# m   - MatchData object of the current match
# ctx - per-line context (lineno, warning sink)
#
# Returns a Token, or nil for whitespace, for an inline prop key
# that is not supported and for unhandled matches.
def _on_prop_referee( m, ctx: )
  return nil   if m[:space] || m[:spaces]    ## skip space(s)

  ## inline prop keys - supported for now att/attn/attendance
  if m[:prop_key]
    key = m[:key]
    unless ['att', 'attn', 'attendance' ].include?( key.downcase )
      ## todo - report error for unknown (inline) prop key
      return nil
    end
    ## use ATTENDANCE_PROP or ATTENDANCE_KEY or such - why? why not?
    return Token.new( :ATTENDANCE, m[:key],
                      lineno: ctx.lineno, offset: m.offset(:key) )
  end

  ## note - change prop_name to player or to (plain) name?
  if m[:prop_name]
    return Token.new( :PROP_NAME, m[:name],
                      lineno: ctx.lineno, offset: m.offset(:prop_name) )
  end

  if m[:num]
    return Token.new( :PROP_NUM, m[:num],
                      lineno: ctx.lineno, offset: m.offset(:num),
                      value: m[:value].to_i(10) )
  end

  if m[:enclosed_name]
    return Token.new( :ENCLOSED_NAME, m[:name],
                      lineno: ctx.lineno, offset: m.offset(:name) )
  end

  if m[:sym]
    return Token.literal( m[:sym],
                          lineno: ctx.lineno, offset: m.offset(:sym) )
  end

  ctx.warn_on_else( m, mode: 'PROP_REFEREE' )
  nil
end

#_on_round_def(m, ctx:) ⇒ `Object`

note - m is MatchData object

# File 'lib/sportdb/parser/lexer-on_round_def.rb', line 15

##
# Turns one match made while the lexer is in ROUND_DEF_RE mode
# into a token.
#
# m   - MatchData object of the current match
# ctx - per-line context (lineno, warning sink)
#
# Returns a DATE, DURATION or literal symbol token, or nil for
# whitespace and unhandled matches.
def _on_round_def( m, ctx: )
  token = nil

  if m[:spaces] || m[:space]
    ## skip spaces - nothing to emit
  elsif m[:date]
    token = Token.new( :DATE, m[:date],
                       lineno: ctx.lineno, offset: m.offset(:date),
                       value: _build_date( m ) )
  elsif m[:duration]
    token = Token.new( :DURATION, m[:duration],
                       lineno: ctx.lineno, offset: m.offset(:duration),
                       value: _build_duration( m ) )
  elsif m[:sym]
    token = Token.literal( m[:sym],
                           lineno: ctx.lineno, offset: m.offset(:sym) )
  else
    ctx.warn_on_else( m, mode: 'ROUND_DEF' )
  end

  token
end

#_on_top(m, ctx:) ⇒ `Object`

note - m is MatchData object

# File 'lib/sportdb/parser/lexer-on_top.rb', line 5

##
# Turns one match made in top-level (RE) mode into a token.
#
# The checks run strictly in the order written below; the first
# named group that matched decides the token.
#
# m   - MatchData object of the current match
# ctx - per-line context (lineno, warning sink)
#
# Side effects: the '@' symbol switches @re to GEO_RE (and resets
# @geo_count), the '(' symbol switches @re to GOAL_RE.
#
# Returns a Token, or nil for whitespace and unhandled matches.
def _on_top( m, ctx: )
  return nil   if m[:space] || m[:spaces]    ## skip space(s)

  ## local token builders - token text and offset always come
  ##   from the same named group
  plain  = ->(type, group) do
             Token.new( type, m[group],
                        lineno: ctx.lineno, offset: m.offset(group) )
           end
  valued = ->(type, group, value) do
             Token.new( type, m[group],
                        lineno: ctx.lineno, offset: m.offset(group),
                        value: value )
           end

  ##  note - top-level (for now always) assumes TEAM for TEXT match!!
  ##           fix/fix/fix change TEXT_RE/:text to  TEAM_RE/:team !!!
  return plain.( :TEAM,         :text )          if m[:text]
  return plain.( :TEAM_HOME,    :team_home )     if m[:team_home]
  return plain.( :TEAM_AWAY,    :team_away )     if m[:team_away]
  return plain.( :TEAM_NEUTRAL, :team_neutral )  if m[:team_neutral]

  ## (match) status e.g. cancelled, awarded, etc.
  ##  inline:  w/o - walkover
  ##           n/p - not played
  ##           bye
  ##           abd/abd. - abandoned
  ##           void
  ##           susp/susp. - suspended
  ##           ppd/ppd. or postp/postp. - postponed
  ##           awd/awd. - awarded
  ##           canc/canc. - cancelled/canceled
  return plain.( :INLINE_WO,   :inline_wo )    if m[:inline_wo]
  return plain.( :INLINE_NP,   :inline_np )    if m[:inline_np]
  return plain.( :INLINE_BYE,  :inline_bye )   if m[:inline_bye]
  return plain.( :INLINE_ABD,  :inline_abd )   if m[:inline_abd]
  return plain.( :INLINE_VOID, :inline_void )  if m[:inline_void]
  return plain.( :INLINE_SUSP, :inline_susp )  if m[:inline_susp]
  return plain.( :INLINE_PPD,  :inline_ppd )   if m[:inline_ppd]
  return plain.( :INLINE_AWD,  :inline_awd )   if m[:inline_awd]
  return plain.( :INLINE_CANC, :inline_canc )  if m[:inline_canc]
  return valued.( :STATUS, :status, _build_status( m ))  if m[:status]

  ###  todo/check:
  ##      use value hash for note - why? why not?
  return plain.( :NOTE, :note )  if m[:note]

  if m[:attendance]
    ## underscores (digit group separators) get removed before conversion
    ## note - for token id use INLINE_ATTENDANCE  (ATTENDANCE in use for prop!!!)
    att = { value: m[:value].gsub( '_', '' ).to_i(10) }
    return valued.( :INLINE_ATTENDANCE, :attendance, att )
  end

  return valued.( :TIME,      :time,      _build_time( m ))       if m[:time]
  return valued.( :DATE,      :date,      _build_date( m ))       if m[:date]
  return valued.( :DATE_LEGS, :date_legs, _build_date_legs( m ))  if m[:date_legs]

  return valued.( :SCORE_LEGS,   :score_legs,   _build_score_legs( m ))    if m[:score_legs]
  return valued.( :SCORE_FULL,   :score_full,   _build_score_full( m ))    if m[:score_full]
  return valued.( :SCORE_FULLER, :score_fuller, _build_score_fuller( m ))  if m[:score_fuller]
  if m[:score_fuller_more]
    return valued.( :SCORE_FULLER_MORE, :score_fuller_more,
                    _build_score_fuller_more( m ))
  end
  return valued.( :SCORE,     :score,     _build_score( m ))      if m[:score]
  return valued.( :SCORE_AWD, :score_awd, _build_score_awd( m ))  if m[:score_awd]
  return valued.( :SCORE_ABD, :score_abd, _build_score_abd( m ))  if m[:score_abd]

  return plain.( :VS, :vs )  if m[:vs]

  sym = m[:sym]
  unless sym
    ctx.warn_on_else( m )
    return nil
  end

  ## return symbols "inline" as is - why? why not?
  if sym == '@'       ##  enter geo mode
    _trace( 'ENTER GEO_RE MODE' )
    @re = GEO_RE
    @geo_count = 0
    Token.literal( sym, lineno: ctx.lineno, offset: m.offset(:sym) )
  elsif sym == '('    ## enter goal scorer mode on "free-floating" open parenthesis!!!
    _trace( 'ENTER GOAL_RE MODE' )
    @re = GOAL_RE
    ## note - eat-up ( for now; do NOT pass along as token
    ##       pass along "virtual" INLINE GOALS - why? why not?
    Token.virtual( :INLINE_GOALS, lineno: ctx.lineno )
  else
    Token.literal( sym, lineno: ctx.lineno, offset: m.offset(:sym) )
  end
end

#_prep_doc(txt) ⇒ `Object`

# File 'lib/sportdb/parser/lexer-prep_doc.rb', line 45

##
# Preprocesses the complete document text before it gets split
# into lines.
#
# Steps (in order):
#   1. normalize unicode to NFC (decomposed chars to composed chars)
#        e.g. e (101) + ́ (769)  =>  é (233)
#   2. "universal" newlines - change windows-style cr+lf (\r\n) to lf (\n)
#   3. replace every HTML_COMMENT_RE match (incl. multi-line) with
#        two spaces
#        note - will mess-up lineno tracking!!! (todo/fix)
#   4. for PREPROC_NOTA_BENE_RE and PREPROC_BLOCK_RE matches that
#        contain newlines - change every newline (\n) to ↵ (U+21B5),
#        that is, turn multi-line blocks into single-line blocks
#
# note - empty lines and leading spaces (indent) are kept.
#
# txt - String with the document text
#
# Returns the preprocessed String.
def _prep_doc( txt )
  ## fold a (possibly multi-line) block into one line;
  ##   blocks without newline pass through as is
  fold_newlines = lambda do |label, block|
    next block   unless block.include?( "\n" )

    _trace( label, block )
    block.gsub( "\n", '↵' )
  end

  doc = txt.unicode_normalize(:nfc).gsub( "\r\n", "\n" )

  doc = doc.gsub( HTML_COMMENT_RE ) do |comment|
          _trace('preproc html comment:', comment )
          '  '
        end

  doc = doc.gsub( PREPROC_NOTA_BENE_RE ) do |block|
          fold_newlines.( 'preproc (multi-line) note/nota bene block:', block )
        end

  doc.gsub( PREPROC_BLOCK_RE ) do |block|
    fold_newlines.( 'preproc (multi-line) block:', block )
  end
end

#_prep_line(line) ⇒ `Object`

auto-fix checks line-by-line

# File 'lib/sportdb/parser/lexer-prep_line.rb', line 8

##
# Auto-fixes a single line; every replacement gets reported
# via _warn.
#
# Replacements (in order):
#   - tab (\t)                         => two spaces
#   - U+00A0 (160) non-breaking space  => ascii space
#   - – U+2013 (8211) En Dash and
#     − U+2212 (8722) Minus Sign       => ascii dash (-)
#   - ‘ ’ (smart) quotes               => ascii quote (')
#   - “ ” (smart) double quotes        => ascii double quote (")
#
# note - the line shown in a warning is the line as it was
#          before the step that reports it.
#
# line - String with the line to check
#
# Returns the fixed-up String.
def _prep_line( line )
  fixed = line

  ## todo/add errors (not just warnings) for the auto-fixes - why? why not?
  fixed = fixed.gsub( "\t" ) do |_|
            _warn( "auto-fix; replacing tab (\\t) with two spaces in line #{fixed.inspect}" )
            '  '
          end

  fixed = fixed.gsub( "\u00A0" ) do |ch|
            _warn( "auto-fix; replacing non-breaking unicode space (#{ch}/#{ch.ord}) w/ ascii space ( /#{" ".ord}) in line #{fixed.inspect}" )
            ' '
          end

  fixed = fixed.gsub( /[–−]/ ) do |ch|
            _warn( "auto-fix; replacing unicode dash (#{ch}/#{ch.ord}) w/ ascii dash (-/#{"-".ord}) in line #{fixed.inspect}" )
            '-'
          end

  ## todo - add more unsmart quotes
  fixed = fixed.gsub( /[‘’]/ ) do |ch|
            _warn( "auto-fix; replacing unicode (smart) quote (#{ch}/#{ch.ord}) w/ ascii quote ('/#{"'".ord}) in line #{fixed.inspect}" )
            "'"
          end

  fixed.gsub( /[“”]/ ) do |ch|
    _warn( %Q{auto-fix; replacing unicode (smart) double quote (#{ch}/#{ch.ord}) w/ ascii double quote ("/#{'"'.ord}) in line #{fixed.inspect}} )
    '"'
  end
end

#_tokenize_line(line, lineno) ⇒ `Object`

# File 'lib/sportdb/parser/lexer-tokenize.rb', line 72

##
# Tokenizes a single (already preprocessed) line.
#
# The active regex/mode is kept in @re between calls; a line is first
# checked (top-level mode only) for a start-of-line construct that may
# switch the mode, then the rest of the line is matched piece by piece
# with @re and every match is handed to the handler of the active mode.
# At the end of the line the geo, group def and round def modes always
# fall back to top-level; the prop modes continue on the next line only
# if the last token of the line is one of , - ;
#
# line   - String with the line text
# lineno - Integer line number (passed along to tokens and messages)
#
# Side effects: updates @re and @geo_count; parse errors are appended
# to ./logs.txt (via log) and printed.
#
# Returns a two-element array [tokens, errors].
def _tokenize_line( line, lineno )
  tokens = []
  errors = []   ## keep a list of errors - why? why not?


  pos = 0        ## note - usually same as offset[1] aka offset[end] after match
  ## track last offset (begin/end) - to report error on no match
  ##   or no match in end of string
  offset = [0,0]
  m = nil

  ## track number of geo text seen
  ##    (use for - do NOT break on two spaces if no geo text seen yet!!)
  @geo_count = 0

  ####
  ## quick hack - keep re state/mode between tokenize calls!!!
  @re  ||= RE     ## note - switch between RE & INSIDE_RE


  if @re == RE  ## top-level
    ### check for modes once (per line) here to speed-up parsing
    ###   for now goals only possible for start of line!!
    ###        fix - remove optional [] - why? why not?

    ####
    ## note - ord e.g. (45) for match number can only start a (match) line
    ##                "inline" use NOT possible
    ## note -  ord (for ordinal number!!!) e.g match number (1), (42), etc.
    if (m = START_WITH_ORD.match(line))
       ## note -  strip enclosing () and convert to integer
       tokens << Token.new(:ORD, m[:ord],
                                lineno: lineno, offset: m.offset(:ord),
                                value: m[:value].to_i(10)  )

       offset = m.offset(0)
       pos    = offset[1]      ## update pos
    elsif (m = START_WITH_YEAR.match(line))
       tokens << Token.new(:YEAR, m[:year],
                                 lineno: lineno, offset: m.offset(:year),
                                 value:  m[:year].to_i(10) )

       offset = m.offset(0)
       pos    = offset[1]    ## update pos

    elsif (m = START_WITH_GROUP_DEF_LINE_RE.match( line ))
      _trace( "ENTER GROUP_DEF_RE MODE" )
      @re = GROUP_DEF_RE

      tokens << Token.new( :GROUP_DEF, m[:group_def],
                               lineno: lineno, offset: m.offset(:group_def) )


      offset = m.offset(0)
      pos = offset[1]    ## update pos

    elsif (m = START_WITH_PROP_KEY_RE.match( line ))
      ##  start with prop key (match will switch into prop mode!!!)
      ##   - fix - remove leading spaces in regex (upstream) - why? why not?
      ##
      ###  switch into new mode
      ##  switch context  to PROP_RE
        _trace("ENTER PROP_RE MODE" )
        key = m[:key]


        ### todo/fix - add prop yellow/red cards too - why? why not?
        ##  todo/fix - separate sent off and red card
        ##     sent-off - incl. red card, yellow/red card and the era before red cards!!
        if ['sent off'].include?( key.downcase)
          @re = PROP_CARDS_RE    ## use CARDS_RE ???
          tokens << Token.new(:PROP_SENTOFF, m[:key],
                                   lineno: lineno, offset: m.offset(:key))
        elsif ['red cards'].include?( key.downcase )
          @re = PROP_CARDS_RE    ## use CARDS_RE ???
          tokens << Token.new(:PROP_REDCARDS, m[:key],
                                   lineno: lineno, offset: m.offset(:key))
        elsif ['yellow cards'].include?( key.downcase )
          @re = PROP_CARDS_RE
          tokens << Token.new(:PROP_YELLOWCARDS, m[:key],
                                   lineno: lineno, offset: m.offset(:key))
        elsif ['ref', 'referee',
               'refs', 'referees'   ## note - allow/support assistant refs
              ].include?( key.downcase )
          @re = PROP_REFEREE_RE
          tokens << Token.new(:PROP_REFEREE, m[:key],
                                   lineno: lineno, offset: m.offset(:key))
        elsif ['att', 'attn', 'attendance'].include?( key.downcase )
          @re = PROP_ATTENDANCE_RE
          tokens << Token.new(:PROP_ATTENDANCE, m[:key],
                                   lineno: lineno, offset: m.offset(:key))

     #   elsif ['goals'].include?( key.downcase )
     #     @re = PROP_GOAL_RE
     #     tokens << [:PROP_GOALS, m[:key]]

        elsif ['penalties',
               'penalty shootout',
               'penalty shoot-out',
               'penalty kicks'].include?( key.downcase )
          @re = PROP_PENALTIES_RE
          tokens << Token.new(:PROP_PENALTIES, m[:key],
                                  lineno: lineno, offset: m.offset(:key))
        else   ## assume (team) line-up
          @re = PROP_LINEUP_RE
          ## fix-fix-fix - rename to PROP_LINEUP !!
          tokens << Token.new(:PROP, m[:key],
                                 lineno: lineno, offset: m.offset(:key))
        end

        offset = m.offset(0)
        pos    = offset[1]     ## update pos
    ###
    ### todo/fix
    ###   rename to START_WITH_ROUND_DEF_OUTLINE_RE !!!!
    elsif (m = ROUND_DEF_OUTLINE_RE.match( line ))
      _trace( "ENTER ROUND_DEF_RE MODE" )
      @re = ROUND_DEF_RE

      ## note - return ROUND_DEF NOT  ROUND_OUTLINE token
      ##   fix - add leading ▪ too!!
      tokens << Token.new( :ROUND_DEF, m[:round_outline],
                            lineno: lineno, offset: m.offset(:round_outline))

      offset = m.offset(0)
      pos    = offset[1]    ## update pos
    elsif (m = ROUND_OUTLINE_RE.match( line ))
      _trace( "ROUND_OUTLINE" )
      ## note - derive round level from no of (leading) markers
      ##             e.g. ▪/:: is 1, ▪▪/::: is 2, ▪▪▪/:::: is 3, etc.
      ##       note  - ascii-style starts with double ::, thus, autodecrement by one!
      round_level = m[:round_marker].size
      round_level -= 1  if m[:round_marker].start_with?( '::' )

      tokens << Token.new( :ROUND_OUTLINE, m[:round_outline],
                           lineno: lineno, offset: m.offset(:round_outline),
                           value: { outline: m[:round_outline],
                                    level: round_level})

      ## note - eats-up line for now (change later to only eat-up marker e.g. »|>>)
      offset = m.offset(0)
      pos    = offset[1]       ## update pos
    elsif (m = START_GOAL_LINE_RE.match( line ))   ## line starting with ( - assume
      ##  switch context to GOAL_RE (goalline(s))
      ####
      ##  note - check for alternate goal line styles / formats
      if START_GOAL_LINE_COMPAT_RE.match(line )
        ## "legacy" style starting with minute e.g.
        ##  (6 Puskás 0-1, 9 Czibor 0-2, 11 Morlock 1-2, 18 Rahn 2-2,
        ##    84 Rahn 3-2)
        @re = GOAL_COMPAT_RE
        _trace( "ENTER GOAL_COMPAT_RE MODE" )

        tokens << Token.virtual( :GOALS_COMPAT, lineno: lineno )
      elsif START_GOAL_LINE_ALT_RE.match( line )
        ##  goals with scores e.g.
        ##    (1-0 Franck Ribéry, 2-0 Ivica Olić, 2-1 Wayne Rooney)
        ##         -or-
        ##      (Dion Beljo  1-0
        ##                   1-1  Andreas Gruber
        ##   Matthias Seidl  2-1)
        @re = GOAL_ALT_RE
        _trace( "ENTER GOAL_ALT_RE MODE" )

        tokens << Token.virtual( :GOALS_ALT, lineno: lineno )
      else
        ## "standard" / default style
        @re = GOAL_RE
        _trace( "ENTER GOAL_RE MODE" )

        tokens << Token.virtual( :GOALS, lineno: lineno )
      end

      ## note - eat-up ( for now
      ##   pass along "virtual" GOALS or GOALS_ALT token
      ##      (see INLINE_GOALS for the starting goal line inline)
      ##
      ## fix-fix-fix
      ##  keep offset at [0,0] - why? why not?
      ##    do NOT eat-up
      ##   or better
      ##    add tokens << Token.literal( '(', lineno: lineno, offset: ...) !!!
      offset = m.offset(0)
      pos    = offset[1]      ## update pos
    end
  end



  old_pos = -1   ## allows to backtrack to old pos (used in geo)


  ctx = Context.new( self,
                     line:   line,
                     lineno: lineno,
                     errors: errors )


  while m = @re.match( line, pos )
    offset = m.offset(0)
    ctx.offset = offset


    if offset[0] != pos
      ## match NOT starting at start/begin position!!!
      ##  report parse error!!!
      msg =  "parse error (tokenize) - skipping >#{line[pos..(offset[0]-1)]}< in line #{lineno}@#{offset[0]},#{offset[1]} >#{line}<"
      errors << msg

      log( msg )
      puts "!! WARN - #{msg}"
    end


    ##
    ## todo/fix - also check if possible
    ##   if no match but not yet end off string!!!!
    ##    report skipped text run too!!!

    old_pos = pos
    pos     = offset[1]

    ##
    ## note: racc requires pairs e.g. [:TOKEN, VAL]
    ##         for VAL use "text" or ["text", { opts }]  array


  t = if    @re == ROUND_DEF_RE      then   _on_round_def( m, ctx: ctx )
      elsif @re == GROUP_DEF_RE      then   _on_group_def( m, ctx: ctx )
      elsif @re == GEO_RE
           ### note - possibly end inline geo on [ (and others?? in the future
           ## note: break on double spaces e.g.
           ## e.g. Jul/16 @ Arena Auf Schalke, Gelsenkirchen  Serbia 0-1 England
           if m[:spaces]
                 ### note - do NOT break out
                 ##           if not text seen yet!!!
                 if @geo_count > 0
                    ## get out-off geo mode and backtrack (w/ next)
                    ##
                    ## todo/fix
                    ##   add virtual geo_end token!!!
                    _trace( "LEAVE GEO_RE MODE, BACK TO TOP_LEVEL/RE" )
                    @re = RE
                    pos = old_pos
                    next   ## backtrack (resume new loop step)
                 else
                     nil   ## skip spaces
                 end
           elsif m[:space]
               nil    ## skip (single) space
           elsif m[:text]
               @geo_count += 1
                ## keep pos - why? why not?
               Token.new(:GEO, m[:text],
                                lineno: lineno, offset: m.offset(:text))
           elsif m[:geo_end]   ## "hacky" special comma; always ends geo mode!!!
                 ## get out-off geo mode and backtrack (w/ next)
                    ## todo/fix
                    ##   add (semi-) virtual geo_end token!!!
                 _trace( "LEAVE GEO_RE MODE, BACK TO TOP_LEVEL/RE" )
                 @re = RE
                 pos = old_pos
                 next   ## backtrack (resume new loop step)
           elsif m[:sym]
              case m[:sym]
                ## note - reset geo_count to 0 (avoids break on two spaces)
                ##                     if separator seen!!
              when ','
                 @geo_count = 0
                 Token.literal( m[:sym], lineno: lineno, offset: m.offset(:sym))
              when '›'
                 ## note - treat geo sep › (unicode) like comma for now!!!
                 @geo_count = 0
                 Token.literal( ',', lineno: lineno, offset: m.offset(:sym))
              when '>'
                 ## note - treat geo sep > (ascii) like comma for now!!!
                 @geo_count = 0
                 Token.literal( ',', lineno: lineno, offset: m.offset(:sym))
              when '['
                    ##
                    ## todo/fix
                    ##   add virtual geo_end token!!!
                 ## get out-off geo mode and backtrack (w/ next)
                 _trace( "LEAVE GEO_RE MODE, BACK TO TOP_LEVEL/RE" )
                 @re = RE
                 pos = old_pos
                 next   ## backtrack (resume new loop step)
              else
                 Token.literal( m[:sym], lineno: lineno, offset: m.offset(:sym))
              end
           else
             ctx.warn_on_else( m, mode: 'GEO' )
             nil
           end
      elsif @re == PROP_CARDS_RE       then  _on_prop_cards( m, ctx: ctx )
      elsif @re == PROP_LINEUP_RE      then  _on_prop_lineup( m, ctx: ctx )
      elsif @re == PROP_ATTENDANCE_RE  then  _on_prop_attendance( m, ctx: ctx )
      elsif @re == PROP_REFEREE_RE     then  _on_prop_referee( m, ctx: ctx )
      elsif @re == PROP_PENALTIES_RE   then  _on_prop_penalties( m, ctx: ctx )
      elsif @re == GOAL_COMPAT_RE      then  _on_goal_compat( m, ctx: ctx )
      elsif @re == GOAL_ALT_RE         then  _on_goal_alt( m, ctx: ctx )
      elsif @re == GOAL_RE             then  _on_goal( m, ctx: ctx )
      ###################################################
      ## assume TOP_LEVEL (a.k.a. RE) machinery
      else
          _on_top( m, ctx: ctx )
      end


    tokens << t    if t
  end

  ## check if no match in end of string
  if offset[1] != line.size
    msg =  "parse error (tokenize) - skipping >#{line[offset[1]..-1]}< in line #{lineno}@#{offset[1]},#{line.size} >#{line}<"
    errors << msg

    log( msg )
    puts "!! WARN - #{msg}"
  end


  # if @re == GOAL_RE   ### ALWAYS switch back to top level mode
  #   puts "  LEAVE GOAL_RE MODE, BACK TO TOP_LEVEL/RE"  if debug?
  #   @re = RE
  # end

   if @re == GEO_RE   ### ALWAYS switch back to top level mode
     _trace( "LEAVE GEO_RE MODE, BACK TO TOP_LEVEL/RE" )
     @re = RE
   end

   ### ALWAYS switch back to top level mode
   @re = RE  if @re == GROUP_DEF_RE ||
                @re == ROUND_DEF_RE

   ##
   ## if in prop mode continue if   last token is [,-;]
   ##        otherwise change back to "standard" mode
   if @re == PROP_LINEUP_RE     ||
      @re == PROP_CARDS_RE      ||
      @re == PROP_PENALTIES_RE  ||
      @re == PROP_ATTENDANCE_RE ||
      @re == PROP_REFEREE_RE
     ## note - a (continuation) line may end up with no tokens at all
     ##          (e.g. only skipped or unknown input);
     ##        use safe navigation to avoid calling type on nil and
     ##          treat "no last token" as "do NOT continue"
     last_type = tokens.last&.type
     if [',', '-', ';'].include?( last_type )
        ## continue/stay in PROP_RE mode
        ##  todo/check - auto-add PROP_CONT token or such
        ##                to help parser with possible NEWLINE
        ##                  conflicts  - why? why not?
     else
        ## switch back to top-level mode!!
        _trace( "LEAVE PROP_RE MODE, BACK TO TOP_LEVEL/RE" )
        @re = RE
        ## note - auto-add PROP_END (<PROP_END>)
        tokens << Token.virtual(:PROP_END, lineno: lineno)
     end
   end


  [tokens,errors]
end

#_trace(*args) ⇒ `Object`

# File 'lib/sportdb/parser/lexer.rb', line 20

# Print debug trace message(s) to stdout - only if debug mode is on (see debug?).
#
#   args - zero or more messages; the "[DEBUG] Lexer -- " prefix gets printed
#          once, followed by every message on a line of its own
def _trace( *args )
  if debug?
    print "[DEBUG] Lexer -- "
    ## note - print the single arg (NOT the complete args list);
    ##          otherwise all messages get repeated once per argument
    args.each { |arg| puts arg }
  end
end

#_warn(*args) ⇒ `Object`

# File 'lib/sportdb/parser/lexer.rb', line 27

# Print warning message(s) to stdout (always - independent of debug mode).
#
#   args - zero or more messages; the "!! [WARN] Lexer -- " prefix gets printed
#          once, followed by every message on a line of its own
def _warn( *args )
  print "!! [WARN] Lexer -- "
  ## note - print the single arg (NOT the complete args list);
  ##          otherwise all messages get repeated once per argument
  args.each { |arg| puts arg }
end

#debug? ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/sportdb/parser/lexer.rb', line 38

def debug?() @debug == true; end

#log(msg) ⇒ `Object`

# File 'lib/sportdb/parser/lexer.rb', line 7

# Append msg (followed by a newline) to the ./logs.txt file
#   in the current working directory (opened in append mode, utf-8).
#
#  open questions (kept from the original notes):
#    - use ./errors.txt or ./logs_lexer.txt instead - why? why not?
#    - auto-add/prepend [Lexer] and a timestamp to msg - why? why not?
def log( msg )
   io = File.open( './logs.txt', 'a:utf-8' )
   begin
     io.write( msg )
     io.write( "\n" )
   ensure
     ## always close the file - even if a write fails
     io.close
   end
end

#tokenize_with_errors ⇒ `Object`

# File 'lib/sportdb/parser/lexer.rb', line 54

# Tokenize the complete document text (@txt) line-by-line.
#
#  steps:
#   1) run _prep_doc on the text, then walk it line-by-line;
#        full-line comments get skipped, inline (end-of-line) comments stripped,
#        and an __END__ marker cuts off the rest of the input
#   2) blank lines, headings and nota bene lines become single-token lines
#        and reset the lexer mode (@re) to RE;
#        all other lines get passed on to _tokenize_line
#   3) transform the tokens of every line
#        (merge DATE + TIME into DATETIME, replace some commas with dedicated separators)
#   4) flatten all lines into one token list
#        and add a newline token after every line (unless BLANK)
#
#  returns a two-element array, that is, [tokens, errors]
def tokenize_with_errors

    tokens_by_line = []   ## note: add tokens line-by-line (flatten later)
    errors         = []   ## keep a list of errors - why? why not?


    txt = _prep_doc( @txt )



    ####
    ## quick hack - keep re state/mode between tokenize calls!!!
    @re  ||= RE     ## note - switch between RE & INSIDE_RE

    lineno = 0
    txt.each_line do |line|
        lineno += 1

        ## todo - "inlined virtual/collapsed/folded newlines"
        ##   check for "↵" !!!
        ##   and add to lineno


        ## note - KEEP leading spaces for indent
        ##         use rstrip (NOT left/leading & right/trailing strip) only!!
        ## note -   remove/strip trailing newline (and optional spaces)!!!
        ##          trailing whitespace may incl. \n or \r\n!!!
        line = line.rstrip


        ###  skip comments
        ##      todo/check - change to blank line
        ##                     to keep lineno (closer to original) - why? why not?
        next  if line.match?(/\A  [ ]* ## optional leading space(s)
                                   \#
                                    /x )

        ##  strip (inline) end-of-line comments (from line)
        ##    check/discuss: make - inline comment require trailing space
        ##                      e.g.   #1 vs # 1   - why? why not?
        line = line.sub( /   [ ]*      ## (eat-up) optional leading space(s)
                              \#{1,}.*?
                             \z
                            /x, '' )


        ####
        #  support __END__ marker to cut-off input
        break if line.match?( /\A [ ]*   ## optional leading space(s)
                                   __END__
                                 \z
                               /x )



        ## auto-fixes line-by-line (e.g. check for tabs, smart quotes, etc.)
        line = _prep_line( line )


        _trace( "line #{lineno}: >#{line}<" )


        ######
        ### special case for empty line (aka BLANK)
        if line.empty?
           ## note - blank always resets parser mode to std/top-level!!!
           @re = RE
           tokens_by_line << [Token.virtual(:BLANK, lineno: lineno)]
        elsif (m = HEADING_RE.match(line))
           ## note - heading always resets parser mode to std/top-level!!!
           @re = RE
           _trace( 'HEADING' )
           ## note - derive heading level from no of (leading) markers
           ##             e.g. = is 1, == is 2, === is 3, etc.
           heading_level = m[:heading_marker].size
           tokens_by_line << [Token.new(:"H#{heading_level}", m[:heading], lineno: lineno)]
        elsif (m = NOTA_BENE_RE.match(line))
           ## note - nota bene always resets parser mode to std/top-level!!!
           @re = RE
           tokens_by_line << [Token.new(:NOTA_BENE, m[:nota_bene], lineno: lineno)]
        else

          more_tokens, more_errors = _tokenize_line( line, lineno )

          tokens_by_line  << more_tokens
          errors          += more_errors
        end
    end # each line




    tokens_by_line = tokens_by_line.map do |tokens|

        #################
        ##    transform tokens (using simple patterns)
        ##      to help along the (racc look ahead 1 - LA1) parser
        nodes = []

        buf = Tokens.new( tokens )
        ## pp buf


        loop do
          break if buf.eos?

          if buf.match?( :DATE, :TIME )   ## merge DATE TIME into DATETIME
               date = buf.next
               time = buf.next
               ## puts "DATETIME:"
               ## pp date, time

               ##  note:  time value is { time: {} } or
               ##                       { time: {}, time_local {} }
               ##
               ##  note:  NO trailing comma after the text expression!!
               ##           a trailing comma turns the assignment into
               ##           text = [string, value] (that is, an array)
               text  = date.text + ' ' + time.text   ## concat string of two tokens
               value = { date: date.value }.merge( time.value )

               nodes << Token.new(:DATETIME, text,
                                      lineno: date.lineno,
                                      offset: [date.offset[0],
                                               time.offset[1]],
                                      value: value )
          ### support  date time with comma too - why? why not?
          elsif buf.match?( :DATE, ',', :TIME )
               date = buf.next
               _    = buf.next  ## ignore comma
               time = buf.next
               ## puts "DATETIME:"
               ## pp date, time
               text  = date.text + ', ' + time.text  ## concat string of two tokens
               value =  { date: date.value }.merge( time.value )

               nodes << Token.new(:DATETIME, text,
                                      lineno: date.lineno,
                                      offset: [date.offset[0],
                                               time.offset[1]],
                                      value: value )
          elsif buf.match?( :GOAL_MINUTE, ',', :GOAL_MINUTE )
             ## note - only advance by two tokens!
             ##     allows more :GOAL_MINUTE sequences!! e.g. 12,13,14 etc!!!
             ##
             ## help parser with comma shift/reduce conflict
             ##   change ',' to GOAL_MINUTE_SEP !!!
             nodes << buf.next   ## pass through goal_minute
             comma = buf.next  ## eat-up goal_minute_sep a.k.a. comma (,)
                           ##   and replace with dedicated sep(arator)
             nodes << Token.new( :GOAL_MINUTE_SEP,
                                      comma.text,
                                      lineno: comma.lineno,
                                      offset: comma.offset,
                                      value:  comma.value)
          elsif buf.match?( ',', :INLINE_ATTENDANCE )
             ## note  - allow optional comma before inline attendance
             ## help parser with comma shift/reduce conflict
             ##   change ',' to INLINE_ATTENDANCE_SEP !!!
             comma = buf.next  ## eat-up inline_attendance_sep a.k.a. comma (,)
                           ##   and replace with dedicated sep(arator)
             nodes << Token.new(:INLINE_ATTENDANCE_SEP,
                                    comma.text,
                                    lineno: comma.lineno,
                                    offset: comma.offset,
                                    value:  comma.value)
             nodes << buf.next   ## pass through inline_attendance
          else
             ## pass through
             nodes << buf.next
          end
        end  # loop
        nodes
    end  # map tokens_by_line


    ## puts "tokens_by_line:"
    ## pp tokens_by_line


    ## flatten tokens
    tokens = []
    tokens_by_line.each do |tok_line|

        ## if debug?
        ##   pp tok_line
        ## end

         ## note - a line may yield no tokens at all;
         ##          nothing to add and no first token to get the lineno from
         ##          (tok_line[0].lineno would raise NoMethodError on nil)
         next  if tok_line.empty?

         tokens  += tok_line

         ## auto-add newlines  (unless BLANK!!)
         unless tok_line[0].type == :BLANK
            ## note - reuse lineno from first token in line
            ##                  use last - why? why not?
            tokens  << Token.newline( lineno: tok_line[0].lineno )
         end
    end

    [tokens,errors]

end

Class: SportDb::Lexer

Defined Under Namespace

Constant Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(txt, debug: false) ⇒ Lexer

Class Method Details

._build_date(m) ⇒ Object

._build_date_legs(m) ⇒ Object

._build_duration(m) ⇒ Object

._build_goal_count(m) ⇒ Object

._build_goal_minute(m) ⇒ Object

._build_goal_minute_na(m) ⇒ Object

._build_goal_type(m) ⇒ Object

._build_minute(m) ⇒ Object

._build_score(m) ⇒ Object

._build_score_abd(m) ⇒ Object

._build_score_awd(m) ⇒ Object

._build_score_full(m) ⇒ Object

._build_score_fuller(m) ⇒ Object

._build_score_fuller_more(m) ⇒ Object

._build_score_legs(m) ⇒ Object

._build_status(m) ⇒ Object

._build_time(m) ⇒ Object

._mk_score_fuller_agg(win:) ⇒ Object

._mk_score_fuller_p(win:) ⇒ Object

._parse_date(str) ⇒ Object

._parse_goal_count(str) ⇒ Object

._parse_goal_minute(str) ⇒ Object

._parse_score_full(str) ⇒ Object

._parse_team(str) ⇒ Object

.build_map(lines, downcase: false) ⇒ Object

.build_names(lines) ⇒ Object

.parse_date(str, start: nil) ⇒ Object

.parse_names(txt) ⇒ Object

Instance Method Details

#_build_date(m) ⇒ Object

#_build_date_legs(m) ⇒ Object

#_build_duration(m) ⇒ Object

#_build_goal_count(m) ⇒ Object

#_build_goal_minute(m) ⇒ Object

#_build_goal_minute_na(m) ⇒ Object

#_build_goal_type(m) ⇒ Object

#_build_minute(m) ⇒ Object

#_build_score(m) ⇒ Object

#_build_score_abd(m) ⇒ Object

#_build_score_awd(m) ⇒ Object

#_build_score_full(m) ⇒ Object

#_build_score_fuller(m) ⇒ Object

#_build_score_fuller_more(m) ⇒ Object

#_build_score_legs(m) ⇒ Object

#_build_status(m) ⇒ Object

#_build_time(m) ⇒ Object

#_info(*args) ⇒ Object

#_on_goal(m, ctx:) ⇒ Object

#_on_goal_alt(m, ctx:) ⇒ Object

#_on_goal_compat(m, ctx:) ⇒ Object

#_on_group_def(m, ctx:) ⇒ Object

#_on_prop_attendance(m, ctx:) ⇒ Object

#_on_prop_cards(m, ctx:) ⇒ Object

#_on_prop_lineup(m, ctx:) ⇒ Object

#_on_prop_penalties(m, ctx:) ⇒ Object

#_on_prop_referee(m, ctx:) ⇒ Object

#_on_round_def(m, ctx:) ⇒ Object

#_on_top(m, ctx:) ⇒ Object

#_prep_doc(txt) ⇒ Object

#_prep_line(line) ⇒ Object

#_tokenize_line(line, lineno) ⇒ Object

#_trace(*args) ⇒ Object

#_warn(*args) ⇒ Object

#debug? ⇒ Boolean

#log(msg) ⇒ Object

#tokenize_with_errors ⇒ Object

#initialize(txt, debug: false) ⇒ `Lexer`

._build_date(m) ⇒ `Object`

._build_date_legs(m) ⇒ `Object`

._build_duration(m) ⇒ `Object`

._build_goal_count(m) ⇒ `Object`

._build_goal_minute(m) ⇒ `Object`

._build_goal_minute_na(m) ⇒ `Object`

._build_goal_type(m) ⇒ `Object`

._build_minute(m) ⇒ `Object`

._build_score(m) ⇒ `Object`

._build_score_abd(m) ⇒ `Object`

._build_score_awd(m) ⇒ `Object`

._build_score_full(m) ⇒ `Object`

._build_score_fuller(m) ⇒ `Object`

._build_score_fuller_more(m) ⇒ `Object`

._build_score_legs(m) ⇒ `Object`

._build_status(m) ⇒ `Object`

._build_time(m) ⇒ `Object`

._mk_score_fuller_agg(win:) ⇒ `Object`

._mk_score_fuller_p(win:) ⇒ `Object`

._parse_date(str) ⇒ `Object`

._parse_goal_count(str) ⇒ `Object`

._parse_goal_minute(str) ⇒ `Object`

._parse_score_full(str) ⇒ `Object`

._parse_team(str) ⇒ `Object`

.build_map(lines, downcase: false) ⇒ `Object`

.build_names(lines) ⇒ `Object`

.parse_date(str, start: nil) ⇒ `Object`

.parse_names(txt) ⇒ `Object`

#_build_date(m) ⇒ `Object`

#_build_date_legs(m) ⇒ `Object`

#_build_duration(m) ⇒ `Object`

#_build_goal_count(m) ⇒ `Object`

#_build_goal_minute(m) ⇒ `Object`

#_build_goal_minute_na(m) ⇒ `Object`

#_build_goal_type(m) ⇒ `Object`

#_build_minute(m) ⇒ `Object`

#_build_score(m) ⇒ `Object`

#_build_score_abd(m) ⇒ `Object`

#_build_score_awd(m) ⇒ `Object`

#_build_score_full(m) ⇒ `Object`

#_build_score_fuller(m) ⇒ `Object`

#_build_score_fuller_more(m) ⇒ `Object`

#_build_score_legs(m) ⇒ `Object`

#_build_status(m) ⇒ `Object`

#_build_time(m) ⇒ `Object`

#_info(*args) ⇒ `Object`

#_on_goal(m, ctx:) ⇒ `Object`

#_on_goal_alt(m, ctx:) ⇒ `Object`

#_on_goal_compat(m, ctx:) ⇒ `Object`

#_on_group_def(m, ctx:) ⇒ `Object`

#_on_prop_attendance(m, ctx:) ⇒ `Object`

#_on_prop_cards(m, ctx:) ⇒ `Object`

#_on_prop_lineup(m, ctx:) ⇒ `Object`

#_on_prop_penalties(m, ctx:) ⇒ `Object`

#_on_prop_referee(m, ctx:) ⇒ `Object`

#_on_round_def(m, ctx:) ⇒ `Object`

#_on_top(m, ctx:) ⇒ `Object`

#_prep_doc(txt) ⇒ `Object`

#_prep_line(line) ⇒ `Object`

#_tokenize_line(line, lineno) ⇒ `Object`

#_trace(*args) ⇒ `Object`

#_warn(*args) ⇒ `Object`

#debug? ⇒ `Boolean`

#log(msg) ⇒ `Object`

#tokenize_with_errors ⇒ `Object`