Class: SportDb::Lexer

Inherits:

Object

Object
SportDb::Lexer

show all

Defined in:: lib/sportdb/parser/lexer.rb,
lib/sportdb/parser/token.rb,
lib/sportdb/parser/token-geo.rb,
lib/sportdb/parser/token-date.rb,
lib/sportdb/parser/token-prop.rb,
lib/sportdb/parser/token-text.rb,
lib/sportdb/parser/token-score.rb,
lib/sportdb/parser/token-minute.rb,
lib/sportdb/parser/token-status.rb

Defined Under Namespace

Classes: Tokens

Constant Summary collapse

QUICK_PLAYER_WITH_MINUTE_RE = quick check (QUICK_PLAYER_WITH_MINUTE) - looks only for a minute token such as 45' or 90+2', not for the player name

## quick check for a minute token followed by the minute marker ('):
##    - 1 to 3 digits, optionally with +n added time (e.g. 45' or 90+2'), or
##    - the n/a placeholder (?? or __) e.g. ??' or __'
##  fix - the n/a placeholder used to sit AFTER the mandatory digits
##        (only 45??' or such matched), so a bare ??' or __' never matched;
##        digits and placeholder are now alternatives of each other
##        (same shape as the minute group in PLAYER_WITH_MINUTE_RE).
##  note - every text that matched before still matches
##         (the matched span may start later, at the placeholder)
%r{
      (?:
         \b
         \d{1,3}      ## constrain numbers to 0 to 999!!!
         (?:
             \+\d{1,3}   ## optional added time e.g. 90+2
         )?
           |
         (?: \?{2} | _{2} )  ## add support for n/a (not/available)
      )
        '   ## must have minute marker!!!!
}ix

TIME_RE = time of day, e.g. 18.30 (or 18:30 or 18h30). Open questions: keep 18h30 - why? why not? add support for 6:30pm, 8:20am etc. - why? why not? check - only support lowercase h (e.g. 18h30) or 18H30 too - why? why not?

## time of day: 1-2 digit hour, one separator (colon, dot or h)
##   and a 2-digit minute; captured as time / hour / minute.
##  note - no range check on the digits (e.g. 99:99 matches too);
##         with the i flag an upper case H separator matches as well
%r{
    (?<time>  \b
        (?:   (?<hour>\d{1,2})
                 (?: :|\.|h )
              (?<minute>\d{2})) 
              \b
    )
}ix

WDAY_RE = stand-alone week day (wday). Open question: keep it as a separate regex, or use TEXT with an is_wday? check (or such), with the requirement of beginning of line (anchored to line) only?? - why? why not?

## stand-alone weekday abbreviation (Mon/Mo, Tue/Tu, ... Sun/Su);
##   captured as wday / day_name; must be followed by two spaces
##   (lookahead only - the spaces are not consumed).
##  note - matching IS case sensitive: the regex has the x flag only
##         (no i) and the names sit in a (?-i: ) group, so the inline
##         "NOT case sensitive" remark below does not hold
%r{
(?<wday>
  \b     # note - alternation (|) is lowest precedence (such 
         #    parathenes required around \b()\b !!!
         ## note - NOT case sensitive!!!    
       (?<day_name>
        (?-i:
          Mon|Mo|
          Tue|Tu|
          Wed|We|
          Thu|Th|
          Fri|Fr|
          Sat|Sa|
          Sun|Su
       ))
       (?=[ ]{2})   # positive lookahead for two space  
       ## todo/check - must be followed by two spaces or space + [( etc.
         ##   to allow words starting with weekday abbrevations - why? why not?
         ##     check if any names (teams, rounds, etc) come up in practice 
         ##   or maybe remove three letter abbrevations Mon/Tue
         ##    and keep only Mo/Tu/We etc. - why? why not?
)}x

BASICS_RE =

## basic tokens, tried in this order:
##    num    - number in parentheses e.g. (51)
##    vs     - lower case vs or v with a space on both sides
##             (case sensitive via (?-i: ) despite the i flag)
##    spaces - two or more spaces;  space - one space
##    sym    - dash ruler (----, --- or --) after line start or space
##             and before a space
##    sym    - one of ; , / @ | [ ] -
##  note - the group name sym is used by two alternatives
##  note - (?<=^|[ ]) in the ruler alternative is a lookbehind
##         (the inline remark calls it a lookahead)
%r{
    ## e.g. (51) or (1) etc.  - limit digits of number???
    ##  todo/fix - change num  to ord (for ordinal number)!!!!!
    (?<num> \(  (?<value>\d+) \) )
       |
    (?<vs>
       (?<=[ ])	# positive lookbehind for space
       (?-i: 
         vs|v 
       )        # note - only match case sensitive (downcased letters)!!!
                # note -  bigger match first e.g. vs than v etc.
       (?=[ ])   # positive lookahead for space
    )
       |
    (?<spaces> [ ]{2,}) |
    (?<space>  [ ])
        |
    (?<sym>  (?<=^|[ ])  ## positive lookahead 
                  (?: ----|
                      ---|
                      --
                  )
             (?=[ ])   ## positive lookahead
    )
        |
    (?<sym> [;,/@|\[\]-] )
}ix

RE =

## master union for regular lines; the members become alternatives
##   in the listed order, so earlier entries win on overlapping input
##   (duration before date, date before time, score_more before score)
##   and ANY_RE is the last-resort fallback.
##  NOTE(review): the inline remark asks for wday AFTER text, yet WDAY_RE
##    is listed BEFORE TEXT_RE here - confirm which order is intended
##    (WDAY_RE only matches when two spaces follow the day name)
Regexp.union(
                    STATUS_RE,
                    SCORE_NOTE_RE,
                    NOTE_RE,
                    DURATION_RE,  # note - duration MUST match before date
                    DATE_RE,  ## note - date must go before time (e.g. 12.12. vs 12.12)
                     TIME_RE,
                    SCORE_MORE_RE, 
                    SCORE_RE,   ## note basic score e.g. 1-1 must go after SCORE_MORE_RE!!!
                    BASICS_RE, 
                    WDAY_RE,  # allow standalone weekday name (e.g. Mo/Tu/etc.) - why? why not?
        #    note - wday MUST be after text e.g. Sun Ke 68' is Sun Ke (NOT Sun) etc.
                   TEXT_RE,
                   ANY_RE,
)

GOAL_BASICS_RE = goal mode (switched to by PLAYER_WITH_MINUTE_RE)

## basic tokens for goal lines: spaces (two or more), space (one)
##   and sym - one of ; , [ ]  (no dash, no parentheses)
%r{
    (?<spaces> [ ]{2,}) |
    (?<space>  [ ])
        |
    (?<sym>  
        [;,\[\]]   ## add (-) dash too - why? why not?   
    )   
}ix

GOAL_RE =

## union for goal lines; members are tried in the listed order:
##   basics, minute, n/a minute, own goal / penalty markers,
##   score and (player) name
Regexp.union(
    GOAL_BASICS_RE,
    MINUTE_RE,
    MINUTE_NA_RE,   ## note - add/allow not/available (n/a,na) minutes hack for now
    GOAL_OG_RE, GOAL_PEN_RE,
    SCORE_RE,
    PROP_NAME_RE,    ## note - (re)use prop name for now for (player) name
)

PROP_GOAL_RE = note - leave out n/a minute in goals - make minutes optional!!!

## same members and order as GOAL_RE except that MINUTE_NA_RE
##   (the n/a minute) is left out - see the disabled entry below
Regexp.union(
    GOAL_BASICS_RE,
    MINUTE_RE,
   ## MINUTE_NA_RE,   ## note - add/allow not/available (n/a,na) minutes hack for now
    GOAL_OG_RE, GOAL_PEN_RE,
    SCORE_RE,
    PROP_NAME_RE,    ## note - (re)use prop name for now for (player) name
)

ROUND_OUTLINE_RE = note - use \A (instead of ^) - \A strictly matches the start of the string.

## round outline line: optional leading spaces, a marker (» or >>),
##   one or more spaces, then the round text (captured non-greedy as
##   round_outline), optional trailing spaces.
##  note - the start is anchored with \A (start of string)
##         while the end uses $ (end of line)
%r{   \A
    [ ]*  ## ignore leading spaces (if any)
  (?: »|>> ) 
    [ ]+
     (?<round_outline>
        ## must start with letter - why? why not?
        ###   1st round
        ##  allow numbers e.g. Group A - 1 
        .+?   ## use non-greedy 
     )
    [ ]*  ## ignore trailing spaces (if any) 
  $
}ix

GEO_TEXT_RE =

%r{
    ## must start with alpha (allow unicode letters!!)
    (?<text>
           ## positive lookbehind -  for now space (or beginning of line - for testing) only
           ##  (MUST be fixed number of chars - no quantifier e.g. +? etc.)
            (?<= [ ,›>\[\]]|^)
            (?:
                # opt 1 - start with alpha
                 \p{L}+    ## all unicode letters (e.g. [a-z])
                   |
                # opt 2 - start with num!! - 
                     \d+  # check for num lookahead (MUST be space or dot)
                      ## MAY be followed by (optional space) !
                      ## MUST be follow by a to z!!!!
                      [ ]?   ## make space optional too  - why? why not?
                             ##  yes - eg. 1st, 2nd, 5th etc.
                       \p{L}+
                  |
                ## opt 3 - add another weirdo case
                ##   e.g.   's Gravenwezel-Schilde
                ##   add more letters (or sequences here - why? why not?)
                    '\p{L}+
               )

               ##
               ## todo/check - find a different "more intuitive" regex/rule if possible?
               ##    for single spaces only (and _/ MUST not be surround by spaces) 

              (?: 
                  [ ]?   # only single spaces allowed inline!!!  
                  (?:
                     \p{L} | \d | [.&'°]
                      |
                     (?: (?<! [ ])  ## no space allowed before (but possible after)
                          [-]
                     )
                       |
                     (?: (?<! [ ])  ## no spaces allowed around these characters
                          [_/]
                         (?! [ ])
                     )
                  )+
              )*
         
              ## must NOT end with space or dash(-)
              ##  todo/fix - possible in regex here
              ##     only end in alphanum a-z0-9 (not dot or & ???)

            ## add lookahead/lookbehind
           ##    must be space!!!
           ##   (or comma or  start/end of string)
           ##   kind of \b !!!
            ## positive lookahead
            (?=[ ,›>\[\]]|$)
   )
}ix

TIMEZONE_RE = timezone in parentheses. For the timezone format use for now: (BRT/UTC-3) - e.g. Brazil time; (CET/UTC+1) - central european time; (CEST/UTC+2) - central european summer time, i.e. daylight saving time (DST); (EET/UTC+2) - eastern european time; (EEST/UTC+3) - eastern european summer time, i.e. daylight saving time (DST); or UTC+3, UTC+4, UTC+0, UTC+00, UTC+0000. Allow +01 or +0100 - why? why not? And +0130 (01:30)? See https://en.wikipedia.org/wiki/Time_zone, https://en.wikipedia.org/wiki/List_of_UTC_offsets, https://en.wikipedia.org/wiki/UTC−04:00 etc. Examples: (UTC-2) or (CEST/UTC-2) etc. todo/check - only allow upcase, or (utc-2) and (cest/utc-2) too - why? why not?

## timezone in parentheses: optional "local" name plus slash,
##   then letters, a + or - sign and 1 to 4 digits
##   e.g. (UTC-2) or (CEST/UTC+2); captured as timezone.
##  note - the letters are NOT limited to "utc" (any a-z run matches)
##         and, with the i flag, lower case such as (utc-2) matches too
%r{
   (?<timezone>
      \(
           ## optional "local" timezone name eg. BRT or CEST etc.
           (?:  [a-z]+
                 /
           )?
            [a-z]+
            [+-]
            \d{1,4}   ## e.g. 0 or 00 or 0000
      \)
   )
}ix

GEO_BASICS_RE =

## basic tokens for geo lines: spaces (two or more), space (one)
##   and sym - one of , › > [
##  NOTE(review): the closing ] is not in the sym set although
##    GEO_TEXT_RE accepts both [ and ] as delimiters - confirm
%r{
    (?<spaces> [ ]{2,}) |
    (?<space>  [ ])
        |
    (?<sym> [,›>\[] )
}ix

GEO_RE =

## union for geo lines; members are tried in the listed order:
##   timezone, basics, geo text and ANY_RE as the last-resort fallback
Regexp.union(
                    TIMEZONE_RE,
                    GEO_BASICS_RE, 
                    GEO_TEXT_RE,
                    ANY_RE,
)

MONTH_LINES =

## month names passed as heredoc text to parse_names:
##   one month per line - the full name first, followed by its
##   abbreviations (May has none; September has two - Sept and Sep)
parse_names( <<TXT )
January    Jan
February   Feb
March      Mar
April      Apr
May
June       Jun
July       Jul
August     Aug
September  Sept  Sep
October    Oct
November   Nov
December   Dec
TXT

MONTH_NAMES =

build_names( MONTH_LINES )

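MONTH_MAP = map built from MONTH_LINES with downcase: true (the text "pp MONTH_NAMES" shown here looks like a leftover debug comment from the source, not a description of this constant)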

build_map( MONTH_LINES, downcase: true )

DAY_LINES =

## weekday names passed as heredoc text to parse_names:
##   one day per line - the full name first, followed by its
##   abbreviations from longest to shortest (e.g. Thurs Thur Thu Th)
parse_names( <<TXT )
Monday                   Mon  Mo
Tuesday            Tues  Tue  Tu
Wednesday                Wed  We
Thursday    Thurs  Thur  Thu  Th
Friday                   Fri  Fr
Saturday                 Sat  Sa
Sunday                   Sun  Su
TXT

DAY_NAMES =

build_names( DAY_LINES )

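DAY_MAP = map built from DAY_LINES with downcase: true (the text "pp DAY_NAMES" shown here looks like a leftover debug comment from the source, not a description of this constant)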

build_map( DAY_LINES, downcase: true )

DATE_I_RE = e.g. Fri Aug/9 or Fri Aug 9

## month-first date: optional day name plus one space, month name,
##   a slash or one space, 1-2 digit day, optional space plus 4-digit year;
##   captured as date / day_name / month_name / day / year.
##  note - DAY_NAMES and MONTH_NAMES are interpolated into the pattern
%r{
(?<date>
  \b
     ## optional day name
     ((?<day_name>#{DAY_NAMES})
          [ ]
     )?
     (?<month_name>#{MONTH_NAMES})
         (?: \/|[ ] )
     (?<day>\d{1,2})
     ## optional year
     (  [ ]
        (?<year>\d{4})
     )?
  \b
)}ix

DATE_II_RE = e.g. 3 June or 10 June

## day-first date: optional day name plus one space, 1-2 digit day,
##   one space, month name, optional space plus 4-digit year;
##   captured as date / day_name / day / month_name / year.
##  note - DAY_NAMES and MONTH_NAMES are interpolated into the pattern
%r{
(?<date>
  \b
     ## optional day name
     ((?<day_name>#{DAY_NAMES})
          [ ]
     )?
     (?<day>\d{1,2})
         [ ]
     (?<month_name>#{MONTH_NAMES})
     ## optional year
     (  [ ]
        (?<year>\d{4})
     )?
  \b
)}ix

DATE_III_RE = e.g. iso-date - 2011-08-25 note - allow/support ("shortcuts") e.g 2011-8-25 or 2011-8-3 / 2011-08-03 etc.

## iso-style date: 4-digit year, dash, 1-2 digit month, dash,
##   1-2 digit day; captured as date / year / month / day.
##  note - the digits are not range checked (e.g. month 99 matches)
%r{
(?<date>
  \b
   (?<year>\d{4})
       -
   (?<month>\d{1,2})
       -
   (?<day>\d{1,2})
  \b
)}ix

DATE_IIII_RE = allow (short)“european” style 8.8. note - assume day/month!!!

## dotted date: 1-2 digit day, dot, 1-2 digit month, dot and an
##   optional year - four digits (year) or two digits (yy);
##   captured as date / day / month / year or yy.
##  note - both dots are required (8.8. matches, 8.8 does not);
##         the trailing \b only applies when a year is present
%r{
(?<date>
  \b
   (?<day>\d{1,2})
       \.
   (?<month>\d{1,2})
       \.
   (?: (?: 
          (?<year>\d{4})        ## optional year 2025 (yyyy)
              |
          (?<yy>\d{2})           ## optional year 25 (yy)
       )
        \b
   )?
)
}ix

DATE_RE = union of the date variants (DATE_I_RE to DATE_IIII_RE). note: order matters; first come-first matched/served

## all date variants; tried in the listed order (first match wins).
##  NOTE(review): the inline examples 8.13.79 and 08.14.1973 read as
##    month.day while DATE_IIII_RE names its groups day then month
##    (no range check there) - confirm the intended reading
Regexp.union(
   DATE_I_RE,
   DATE_II_RE,
   DATE_III_RE,
   DATE_IIII_RE,    ## e.g. 8.8. or 8.13.79 or 08.14.1973 
)

DURATION_I_RE =

%r{
(?<duration>
    \b
  (?:
   ## optional day name
   ((?<day_name1>#{DAY_NAMES})
      [ ]
   )?
   (?<month_name1>#{MONTH_NAMES})
      (?: \/|[ ] )
   (?<day1>\d{1,2})
   ## optional year
   (  ,?   # optional comma
      [ ]
      (?<year1>\d{4})
   )?

   ## support + and -  (add .. or such - why??)
   [ ]* - [ ]*

   ## optional day name
   ((?<day_name2>#{DAY_NAMES})
      [ ]
   )?
   (?<month_name2>#{MONTH_NAMES})
      (?: \/|[ ] )
   (?<day2>\d{1,2})
   ## optional year
   (  ,?   # optional comma
      [ ]
      (?<year2>\d{4})
   )?
  )
   \b
)}ix

DURATION_II_RE = variant ii - add support for shorthand, e.g. "August 16-18, 2011", "September 13-15, 2011", "October 18-20, 2011", "March/6-8, 2012", "March 6-8 2012" or "March 6-8". todo - add support for August 16+17 or such (and check 16+18); use <op> to check if day2 is a plus or range or such - why? why not?

## shorthand duration within one month: month name, one space or
##   slash, day1, dash, day2 and an optional year (optional comma,
##   one space, four digits); captured as duration / month_name1 /
##   day1 / day2 / year1.
##  note - only the dash is supported between the days (no + yet)
%r{
(?<duration>
    \b
   (?:
       (?<month_name1>#{MONTH_NAMES})
           [ /]
        (?<day1>\d{1,2})
             -
        (?<day2>\d{1,2})
          (?:
            ,?     ## optional comma
            [ ]
            (?<year1>\d{4})
          )?     ## optional year   
   )
   \b
)}ix

DURATION_RE = union of the duration variants (DURATION_I_RE and DURATION_II_RE). note: order matters; first come-first matched/served

## both duration variants; the full form (date - date) is tried
##   before the shorthand (month day1-day2)
Regexp.union(
   DURATION_I_RE,
   DURATION_II_RE,
)

PROP_NAME_RE = name different from text (does NOT allow number in name/text)

%r{
                 (?<prop_name> 
                      \b
                   (?<name>
                      \p{L}+       
                        \.?    ## optional dot
                          (?:
                             ## rule for space; only one single space allowed inline!!!
                              (?:
                                (?<![ ])  ## use negative lookbehind                             
                                  [ ] 
                                (?=\p{L}|['"])      ## use lookahead        
                              )
                              ## support (inline) quoted name e.g. "Rodri" or such
                                  |
                                  (?:
                                     (?<=[ ])  ## use positive lookbehind                             
                                     " \p{L}+ " 
                                      ## require space here too - why? why not?
                                   )                      
                                  |   
                             (?:
                                (?<=\p{L})   ## use lookbehind
                                 [-]   ## must be surrounded by letters
                                       ## e.g. One/Two NOT
                                       ##      One/ Two or One / Two or One /Two etc.
                                (?=\p{L})      ## use lookahead        
                              )
                                 |   
                              (?:  ## flex rule for quote - allow any
                                    ##  only check for double quotes e.g. cannot follow other ' for now - why? why not?
                                    ##        allows  rodrigez 'rodri' for example
                                (?<!')  ## use negative lookbehind                             
                                   '         
                              )      
                                 |   ## standard case with letter(s) and optinal dot
                              (?: \p{L}+
                                    \.?  ## optional dot
                              )
                          )*
                    )
               ## add lookahead - must be non-alphanum 
                  (?=[ ,;\]\)]|$)
                  )
}ix

PROP_KEY_RE =

## property key at the start of a line: optional leading spaces,
##   then the key - either letters, or digits with optional dot and
##   optional space followed by letters (e.g. 1st) - continued lazily
##   with digits, letters, ' / ° space or dash, then optional spaces
##   and a colon; the colon must be followed by a space (lookahead);
##   captured as prop_key (with colon) / key.
##  note - uses ^ (start of line), not \A (start of string)
%r{ 
   ^     # note - MUST start line; leading spaces optional (eat-up)
   [ ]*  
(?<prop_key>
  (?<key>
      (?:\p{L}+
          |
          \d+  # check for num lookahead (MUST be space or dot)
       ## MUST be followed by (optional dot) and
       ##                      required space !!!
       ## MUST be follow by a to z!!!!
        \.?     ## optional dot
        [ ]?   ## make space optional too  - why? why not?
            ##  yes - eg. 1st, 2nd, 5th etc.
        \p{L}+
       )
       [\d\p{L}'/° -]*?   ## allow almost anyting 
                         ## fix - add negative lookahead 
                         ##         no space and dash etc.
                         ##    only allowed "inline" not at the end
                         ## must end with latter or digit!
  )
   [ ]*?     # slurp trailing spaces
    :
   (?=[ ]+)  ## possitive lookahead (must be followed by space!!)
  )
}ix

PROP_KEY_INLINE_RE = simple prop key for inline use e.g. Coach: or Trainer: or ... add more here later

## inline property key: a run of letters directly followed by a
##   colon, which in turn must be followed by a space (lookahead);
##   starts at a word boundary (not anchored to the line start);
##   captured as prop_key (with colon) / key
%r{ 
   \b  
(?<prop_key>    ## note: use prop_key (NOT prop_key_inline or such)
  (?<key>
      \p{L}+
  )
   ## note - NO spaces allowed for key for now!!! 
    :
   (?=[ ]+)  ## possitive lookahead (must be followed by space!!)
  )
}ix

PROP_NUM_RE =

## number: first digit 1-9 (no leading zero), then any number of
##   digit runs each optionally preceded by one underscore
##   e.g. 5000 or 5_000; captured as num / value.
##  note - a space inside the number is NOT matched
##         (5 000 is not one num token)
%r{
 \b
  (?<num>
        ## note allow underscore inline or space e.g.
        ##  5_000
        ##  allow space inline (e.g. 5 000) - why? why not?
      (?<value> [1-9]
                (?: _? 
                    [0-9]+
                 )* 
      )
  )
 \b
}ix

ENCLOSED_NAME_RE = name enclosed in parentheses. todo/fix - allow more chars in enclosed name - why? why not? e.g. the apostrophe (') in Côte d'Ivoire etc. Change the name to PAREN_NAME or PARENTHESIS or such - why? why not?

## name in parentheses: one or more letter-only words separated by
##   single spaces e.g. (save) or (sold out); captured as
##   enclosed_name (with parentheses) / name (without).
##  note - digits, dots, dashes and apostrophes are not accepted
%r{ 
        (?<enclosed_name>  
           \( 
          (?<name>   
              \p{L}+
              (?:
                 [ ] 
                   \p{L}+ 
              )*
          )
            \)
        )
}ix

PROP_BASICS_RE =

## basic tokens for property lines: spaces (two or more),
##   space (one) and sym - one of ; , ( ) [ ] -
%r{
    (?<spaces> [ ]{2,}) |
    (?<space>  [ ])
        |
    (?<sym>  
        [;,\(\)\[\]-] 
    )   
}ix

PROP_RE =

## union for generic property lines; tried in the listed order:
##   minute, inline key, name, basics.
##  note - there is no ANY_RE fallback here (see the todo below)
Regexp.union(
   MINUTE_RE,
   PROP_KEY_INLINE_RE,
   PROP_NAME_RE,
   PROP_BASICS_RE, 
   ## todo/fix - add ANY_RE here too!!!
)

PROP_CARDS_RE = note - no inline keys possible todo/fix - use custom (limited) prop basics too

Regexp.union(
   MINUTE_RE,
   PROP_NAME_RE,
   PROP_BASICS_RE, 
   ## todo/fix - add ANY_RE here too!!!
)

PROP_PENALTIES_RE =

Regexp.union(
   SCORE_RE,               # e.g. 1-1 etc.
   ENCLOSED_NAME_RE,       # e.g. (save), (post), etc.
   PROP_NAME_RE,
   PROP_BASICS_RE, 
   ## todo/fix - add ANY_RE here too!!!
)

PROP_REFEREE_RE =

## union for referee property lines; tried in the listed order:
##   enclosed name, number, inline key, name, basics
##   (no ANY_RE fallback - see the todo below).
##  NOTE(review): the inline example "28 000" is not a single
##    PROP_NUM_RE match - that regex only accepts an underscore
##    inside a number; confirm the remark
Regexp.union(
   ENCLOSED_NAME_RE,       # e.g. (sold out) etc.  why? why not?
   PROP_NUM_RE,                 # e.g. 28 000 or 28_000  (NOT 28,000 is not valid!!!)
   PROP_KEY_INLINE_RE,
   PROP_NAME_RE,
   PROP_BASICS_RE, 
   ## todo/fix - add ANY_RE here too!!!
)

PROP_ATTENDANCE_RE =

## union for attendance property lines; tried in the listed order:
##   enclosed name, number, basics
##   (no names, no inline keys, no ANY_RE fallback).
##  NOTE(review): the inline example "28 000" is not a single
##    PROP_NUM_RE match - that regex only accepts an underscore
##    inside a number; confirm the remark
Regexp.union(
   ENCLOSED_NAME_RE,       # e.g. (sold out) etc.  why? why not?
   PROP_NUM_RE,                 # e.g. 28 000 or 28_000  (NOT 28,000 is not valid!!!)
   PROP_BASICS_RE, 
   ## todo/fix - add ANY_RE here too!!!
)

ANY_RE = general catch-all (RECOMMENDED (ALWAYS) use as last entry in union) to avoid advance of pos match!!!

## catch-all: exactly one character (any except newline, since the
##   m flag is not set); captured as any
%r{
     (?<any> .)
}ix

TEXT_RE =

%r{
    ## must start with alpha (allow unicode letters!!)
    (?<text>
           ## positive lookbehind
           ##  (MUST be fixed number of chars - no quantifier e.g. +? etc.)
            (?<=[ ,;@|\[\]]
                 |^
            )
            (?:
                # opt 1 - start with alpha
                 \p{L}+    ## all unicode letters (e.g. [a-z])
                   |

                # opt 2 - start with num!! - allow special case (e.g. 1. FC)
                     \d+  # check for num lookahead (MUST be space or dot)
                      ## MUST be followed by (optional dot) and
                      ##                      required space !!!
                      ## MUST be follow by a to z!!!!
                      \.?     ## optional dot
                      [ ]?   ## make space optional too  - why? why not?
                             ##  yes - eg. 1st, 2nd, 5th etc.
                       \p{L}+
                  |
                ## opt 3 - add weirdo case
                ##   e.g.  1/8 Finals  1/4 1/2 ...
                    1/ \d{1,2} [ ] \p{L}+
                  |
                ## opt 4 - add another weirdo case
                ##   e.g.   's Gravenwezel-Schilde
                    '[s]
                  |
                ## opt 5 - add another weirdo case
                ##   e.g. 5.-8. Platz Playoffs  - keep - why? why not?
                    \d+\.-\d+\.  [ ]? \p{L}+                 
               )

              (?:(?:  (?:[ ]   # only single spaces allowed inline!!! 
                        (?! (?-i: vs?[ ])
                          )    ## note - exclude (v[ ]/vs[ ])
                               ##    AND switch to case-sensitive (via -i!!!)
                      )
                      |     
                     [/-]   ## must NOT be surrounded by spaces 
                  )?
                (?:
                  \p{L} 
                     |
                  [.&'°]
                     |
                 (?:
                   \d+
                   (?!
                     [0-9h'+] |    ## protected break on 12h / 12' / 1-1
                                    ##  check usege for 3+4 - possible? where ? why?     
                     (?:[.:-]\d)     ## protected/exclude/break on 12.03 / 12:03 / 12-12
                                     ##  BUT allow Park21-Arena for example e.g. 21-A :-)
                    )
                   ## negative lookahead for numbers
                   ##   note - include digits itself!!!
                   ##   note - remove / (slash) e.g. allows UDI'19/Beter Bed
                 )
               )
              )*  ## must NOT end with space or dash(-)
              ##  todo/fix - possible in regex here
              ##     only end in alphanum a-z0-9 (not dot or & ???)


            ## allow optional at the end
            ##  tag or year
            ##   make it and in the future - why? why not?
            ##
            ## change - fix
            ##   do NOT use (A) for amateur
            ##   use A or A. with NO ()!!!
            ## (A) -    allow with predined  alpha only for now
            ##          e.g. (A) - amateur a team or b?
            ###  same for U21 or U9 etc
            ##        use with NO ()!!! - why? why not?
            ##      or U21 U9 etc.   - why? why not?
            ##       or etc.
            ## (1879-1893) or allow years e.g. (1879-1893)
            ###
            ##    add allow country code three to five letters for now
            ##       change to generic 1 to 5 - why? why not?
            ##     e.g. (A), (I),
            ##          (AUT)
            ##          (TRNC)   five? for UEFA code for northern cyprus
            ##     change to 1 to 4 - why? why not?
            ##   check - fix possible for upper case only here
            ##                     inline for this group only?
            (?:
               [ ]
               \(
                  \d{4}-\d{4}
               \)
            )?
             (?:
               [ ]+   ## allow more than once space - why? why not?
                  \( (?:
                       [A-Z]{1,5}
                     )
                  \)
             )?
            ## add lookahead/lookbehind
           ##    must be space!!!
           ##   (or comma or  start/end of string)
           ##   kind of \b !!!
            ## positive lookahead
            (?=[ ,;@|\[\]]
                 |$
            )
   )
}ix

P_EN = english helpers (penalty, extra time, …) - penalty variants, e.g. p., p, pen, pen., PSO, etc. note - p must go last (shortest match); pso = penalty shootout

'(?: pso | pen\.? | p\.? )'

ET_EN = extra time variants, e.g. aet, a.e.t, a.e.t. (the examples "p., p, pen, pen., PSO" originally shown here describe P_EN)

'(?: aet | a\.e\.t\.? )'

SCORE__P_ET__RE = note: allow SPECIAL cases WITHOUT full time scores (just a.e.t. or pen. + a.e.t.), e.g. "3-4 pen. 2-2 a.e.t." or "2-2 a.e.t."

## extra-time score with optional penalty score in front:
##   [p1-p2 + penalty marker + spaces] et1-et2 + extra-time marker;
##   must be followed by space, comma, ] or end of line (lookahead);
##   captured as score_more / p1 / p2 / et1 / et2.
##  note - P_EN and ET_EN are interpolated into the pattern
%r{
(?<score_more>
   \b
    (?:
       (?<p1>\d{1,2}) - (?<p2>\d{1,2})
         [ ]* #{P_EN} [ ]+
     )?             # note: make penalty (P) score optional for now
    (?<et1>\d{1,2}) - (?<et2>\d{1,2})
       [ ]* #{ET_EN}
       (?=[ ,\]]|$)
)}ix

SCORE__P__RE = note: allow SPECIAL with penalty only 3-4 pen.

## penalty score only: p1-p2, optional spaces and a penalty marker
##   (P_EN, interpolated); must be followed by space, comma, ] or
##   end of line (lookahead); captured as score_more / p1 / p2
%r{
        (?<score_more>
  \b
     (?<p1>\d{1,2}) - (?<p2>\d{1,2})
       [ ]* #{P_EN}
       (?=[ ,\]]|$)
)}ix

SCORE__P_ET_FT_HT_V2__RE = support short all-in-one form, e.g. "3-4 pen. 2-2 a.e.t. (1-1, 1-1)" becomes "3-4 pen. (2-2, 1-1, 1-1)"

%r{
          (?<score_more>
   \b
    (?<p1>\d{1,2}) - (?<p2>\d{1,2})
       [ ]* #{P_EN} [ ]+       
       \(
   (?<et1>\d{1,2}) - (?<et2>\d{1,2})
       [ ]*, [ ]*
   (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
       [ ]*, [ ]*
   (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
       [ ]*
    \)
   (?=[ ,\]]|$)
)}ix

SCORE__P_ET_FT_HT__RE = e.g. "3-4 pen. 2-2 a.e.t. (1-1, 1-1)" or "3-4p 2-2aet (1-1, )" or "3-4 pen. 2-2 a.e.t. (1-1)" or "2-2 a.e.t. (1-1, 1-1)" or "2-2 a.e.t. (1-1, )" or "2-2 a.e.t. (1-1)"

%r{
          (?<score_more>
   \b
   (?:
    (?<p1>\d{1,2}) - (?<p2>\d{1,2})
       [ ]* #{P_EN} [ ]+
    )?            # note: make penalty (P) score optional for now
   (?<et1>\d{1,2}) - (?<et2>\d{1,2})
       [ ]* #{ET_EN} [ ]+
       \(
       [ ]*
  (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
       [ ]*
    (?:
         , [ ]*
        (?: (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
            [ ]*
        )?
    )?              # note: make half time (HT) score optional for now
  \)
 (?=[ ,\]]|$)
)}ix

SCORE__P_FT_HT__RE = special case for case WITHOUT extra time!! same as above (but WITHOUT extra time and pen required)

%r{
         (?<score_more>
            \b
 (?<p1>\d{1,2}) - (?<p2>\d{1,2})
    [ ]* #{P_EN} [ ]+
    \(
    [ ]*
  (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
    [ ]*
 (?:
      , [ ]*
     (?: (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
         [ ]*
     )?
 )?              # note: make half time (HT) score optional for now
   \)
  (?=[ ,\]]|$)
)}ix

SCORE__FT_HT__RE = e.g. 2-1 (1-1)

## full-time score with half-time score in parentheses:
##   ft1-ft2, one or more spaces, ( ht1-ht2 ) with optional spaces
##   inside the parentheses; must be followed by space, comma, ] or
##   end of line (lookahead);
##   captured as score_more / ft1 / ft2 / ht1 / ht2
%r{
            (?<score_more>
 \b
 (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
      [ ]+ \( [ ]*
   (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
      [ ]* \)
(?=[ ,\]]|$)
)}ix

SCORE__FT__RE = 2-1

## basic score: ft1-ft2 (1-2 digits each, no spaces around the dash)
##   between word boundaries; captured as score (NOT score_more) /
##   ft1 / ft2
%r{
            (?<score>
 \b
 (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
 \b
)}ix

SCORE_MORE_RE = union of the extended score variants. note: order matters; first come-first matched/served. check - find a better name for SCORE_MORE - SCORE_EX, SCORE_BIG, or _ - why? why not?

## all extended score variants (group score_more); tried in the
##   listed order, longest / most specific forms first.
##  note - the basic score (SCORE__FT__RE) is deliberately NOT a
##         member; it is exposed on its own as SCORE_RE
Regexp.union(
  SCORE__P_ET_FT_HT_V2__RE,  # e.g. 5-1 pen. (2-2, 1-1, 1-0)  
  SCORE__P_ET_FT_HT__RE,    # e.g. 5-1 pen. 2-2 a.e.t. (1-1, 1-0)
  SCORE__P_FT_HT__RE,     # e.g. 5-1 pen. (1-1)
  SCORE__P_ET__RE,        # e.g. 2-2 a.e.t.  or  5-1 pen. 2-2 a.e.t.
  SCORE__P__RE,           # e.g. 5-1 pen.
  SCORE__FT_HT__RE,        # e.g. 1-1 (1-0)
  ##  note - keep basic score as its own token!!!!
  ##   that is, SCORE & SCORE_MORE
  ### SCORE__FT__RE,           # e.g. 1-1  -- note - must go last!!!
)

SCORE_RE =

SCORE__FT__RE

GOAL_PEN_RE = goal types: penalty - (pen.) or (pen) or (p.) or (p); own goal - (o.g.) or (og), see GOAL_OG_RE. todo/check - keep case-insensitive, or allow OG or P or PEN, or only lower case - why? why not?

## penalty marker in parentheses: (pen) (pen.) (p) or (p.);
##   captured as pen.
##  note - with the i flag upper case e.g. (PEN) or (P) matches too
%r{
   (?<pen> \(
           (?:pen|p)\.?
           \)
    )
}ix

GOAL_OG_RE =

## own goal marker in parentheses: (og) or (o.g.); captured as og.
##  note - (o.g) without the final dot does not match;
##         with the i flag upper case e.g. (OG) matches too
%r{
   (?<og> \(
          (?:og|o\.g\.)
          \)
   )
}ix

MINUTE_NA_RE = minute variant for N/A (not/available). todo/check - find a better syntax - why? why not? note - "??".to_i(10) returns 0 and "__".to_i(10) returns 0 too; quick hack - assume 0 for n/a for now

## n/a minute: ?? or __ directly followed by the minute marker (')
##   and preceded by a space or an opening parenthesis (lookbehind);
##   captured as minute / value
%r{
   (?<minute>
      (?<=[ (])	 # positive lookbehind for space or opening 
        (?<value> \?{2} | _{2} )
        '   ## must have minute marker!!!!
    )
}ix

MINUTE_RE =

## minute: 1 to 3 digits, optional + with 1 to 3 more digits
##   (e.g. 45' or 90+2'), directly followed by the minute marker (')
##   and preceded by a space or an opening parenthesis (lookbehind);
##   captured as minute / value / value2
%r{
     (?<minute>
       (?<=[ (])	 # positive lookbehind for space or opening ( e.g. (61') required
                     #    todo - add more lookbehinds e.g.  ,) etc. - why? why not?
             (?<value>\d{1,3})      ## constrain numbers to 0 to 999!!!
                   (?: \+
                     (?<value2>\d{1,3})   
                   )?           
        '     ## must have minute marker!!!!
     )
}ix

PLAYER_WITH_MINUTE_RE =

%r{
           \A    ### note - MUST start line; leading spaces optional (eat-up)
           [ ]*
             (?:      # optional open bracket ([) -- remove later
                (?<open_bracket> \[ )
                [ ]*
             )?
             (?:     # optional none a.k.a. -;   - what todo here?
               (?<none>  - [ ]* ; [ ]* )
             )?
   (?<player_with_minute>
                   (?<name>
                      \p{L}+       
                        \.?    ## optional dot
       
                          (?:
                              ## rule for space; only one single space allowed inline!!!
                              (?:
                                (?<![ ])  ## use negative lookbehind                             
                                  [ ] 
                                (?=\p{L}|')      ## use lookahead        
                              )
                                  |
                              (?:
                                (?<=\p{L})   ## use lookbehind
                                 ['-]   ## must be surrounded by letters
                                       ## e.g. One/Two NOT
                                       ##      One/ Two or One / Two or One /Two etc.
                                (?=\p{L})      ## use lookahead        
                              )
                                 |   
                              (?:
                                (?<=[ ])   ## use lookbehind  -- add letter (plus dot) or such - why? why not?
                                 [']   ## must be surrounded by leading space and
                                       ## traling letters  (e.g. UDI 'Beter Bed)
                                (?=\p{L})      ## use lookahead        
                              )   
                                 |
                              (?:
                                (?<=\p{L})   ## use lookbehind
                                 [']   ## must be surrounded by leading letter and
                                       ## trailing space PLUS letter  (e.g. UDI' Beter Bed)
                                (?=[ ]\p{L})      ## use lookahead (space WITH letter         
                              )   
                                 |   ## standard case with letter(s) and optinal dot
                              (?: \p{L}+
                                    \.?  ## optional dot
                              )
                          )*
                   )
#### spaces
     (?: [ ]+)
#### minute (see above)
#####   use MINUTE_RE.source or such - for inline (reference) use? do not copy
     (?<minute>
       (?<=[ (])	 # positive lookbehind for space or opening ( e.g. (61') required
                     #    todo - add more lookbehinds e.g.  ,) etc. - why? why not?
           (?: 
              (?<value>\d{1,3})      ## constrain numbers to 0 to 999!!!
                   (?: \+
                     (?<value2>\d{1,3})   
                   )?
               |
              (?<value> \?{2} | _{2} )  ## add support for n/a (not/available)
           )           
        '     ## must have minute marker!!!!
     )
 
   )   
}ix

PLAYER_WITH_SCORE_RE = note - use \A (instead of ^) - \A strictly matches the start of the string.

%r{
           \A    ### note - MUST start line; leading spaces optional (eat-up)
           [ ]*
   (?<player_with_score>
                   (?<score>
                     (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
                   )
                      [ ]+
                   (?<name>
                      \p{L}+       
                        \.?    ## optional dot
       
                          (?:
                              ## rule for space; only one single space allowed inline!!!
                              (?:
                                (?<![ ])  ## use negative lookbehind                             
                                  [ ] 
                                (?=\p{L}|')      ## use lookahead        
                              )
                                  |
                              (?:
                                (?<=\p{L})   ## use lookbehind
                                 ['-]   ## must be surrounded by letters
                                       ## e.g. One/Two NOT
                                       ##      One/ Two or One / Two or One /Two etc.
                                (?=\p{L})      ## use lookahead        
                              )
                                 |   
                              (?:
                                (?<=[ ])   ## use lookbehind  -- add letter (plus dot) or such - why? why not?
                                 [']   ## must be surrounded by leading space and
                                       ## traling letters  (e.g. UDI 'Beter Bed)
                                (?=\p{L})      ## use lookahead        
                              )   
                                 |
                              (?:
                                (?<=\p{L})   ## use lookbehind
                                 [']   ## must be surrounded by leading letter and
                                       ## trailing space PLUS letter  (e.g. UDI' Beter Bed)
                                (?=[ ]\p{L})      ## use lookahead (space WITH letter         
                              )   
                                 |   ## standard case with letter(s) and optinal dot
                              (?: \p{L}+
                                    \.?  ## optional dot
                              )
                          )*
                   )   ## name
            ### check/todo - add lookahead  (e.g. must be space or ,$) why? why not?               
    )  ## player_with_score 
}ix

## Matches a bracketed match status, e.g. [cancelled] or
##   [awarded 3-0; originally 0-2].
##
##  named captures:
##    status      - the status keyword as written (case-insensitive match)
##    status_note - optional free text following a long-form status;
##                    leading separators (space ; ,) and trailing spaces
##                    are NOT part of the note
##
##  note - the name status is used in both alternatives (long & short form)
STATUS_RE =

%r{
            \[
      (?:
            ### opt 1 - allow long forms with note/comment for some stati
           (?: (?<status> awarded
             ## e.g. [awarded match to Leones Negros by undue alignment; original result 1-2]
             ##     [awarded 3-0 to Cafetaleros by undue alignment; originally ended 2-0]
             ##     [awarded 3-0; originally 0-2, América used ineligible player (Federico Viñas)]
                            |
                          annulled
                            |
                          abandoned
             ## e.g. [abandoned at 1-1 in 65' due to cardiac arrest Luton player Tom Lockyer]
             ##      [abandoned at 0-0 in 6' due to waterlogged pitch]
             ##     [abandoned at 5-0 in 80' due to attack on assistant referee by Cerro; result stood]
             ##    [abandoned at 1-0 in 31']
             ##    [abandoned at 0-1' in 85 due to crowd trouble]
                            |
                          postponed
             ## e.g. [postponed due to problems with the screen of the stadium]
             ##      [postponed by storm]
             ##      [postponed due to tropical storm "Hanna"]
             ##      [postponed from Sep 10-12 due to death Queen Elizabeth II]
                           |
                        suspended
             ## e.g. [suspended at 0-0 in 12' due to storm]
             ##      [suspended at 84' by storm; result stood]
                           |
                         verified
             ## e.g.  [verified 2:0 wo.]

               ) [ ;,]*
                 ## note - use non-greedy here; a greedy match would slurp
                 ##          trailing spaces into the note and leave
                 ##          nothing for the space trimmer that follows
                 (?<status_note> [^\]]+? )
                 [ ]*
            )
            |

            ## opt 2 - short from only (no note/comments)
            (?<status>
               cancelled|canceled|can\.
                 |
               abandoned|abd\.
                 |
               postponed
                 |
               awarded|awd\.
                 |
               replay
                 |
               annulled
                 |
               suspended    ### todo/fix - add status upstream - why? why not?
                            ###  move to note(s) - do NOT interpret as status - why? why not?
                 |
               verified     ### todo/fix - add status upstream (same as ??) - why? why not?
                            ###  move to note(s) - do NOT interpret as status - why? why not?
            )
      )
    \]
}ix

NOTE_RE = todo/fix - move to token-note.rb (standalone) file

## Matches a bracketed free-text note, captured (without the brackets)
##   in the named group note.
##  The text must start with one of the keywords nb: / rescheduled /
##   declared / remaining, followed by a single space and at least one
##   more character; it runs up to the next closing bracket.
##  Flags: i (case-insensitive) and x (extended / free-spacing).
%r{
    \[ 
   (?<note>
     (?:  ##  starting with ___   PLUS requiring more text
       (?:
          nb:
          ##  e.g. [NB: between top-8 of regular season]
          #        [NB: América, Morelia and Tigres qualified on better record regular season]
          #        [NB: Celaya qualified on away goals]
          #        [NB: Alebrijes qualified on away goal]
          #        [NB: Leones Negros qualified on away goals]
          #
          # todo/fix:
          # add "top-level" NB: version
          ##   with full (end-of) line note - why? why not?
          |
          rescheduled
          ## e.g.  [rescheduled due to earthquake occurred in Mexico on September 19]
          |
          declared
          ## e.g.  [declared void]
          |
          remaining
          ## e.g. [remaining 79']   
          ##      [remaining 84'] 
          ##      [remaining 59']   
          ##      [remaining 5']
       )
      [ ]
      [^\]]+?    ## slurp all to next ] - (use non-greedy) 
     )
   )
   \] 
}ix

## Matches a bracketed score note, captured (without the brackets)
##   in the named group score_note.
##  Alternatives (tried in this order):
##    - plain extra time        e.g. [aet]
##    - plain penalties         e.g. [3-2 pen]
##    - extra time + penalties  e.g. [aet; 4-3 pen]
##    - (team) won/win/wins (score) on pens/penalties/aggregate,
##        optionally prefixed with aet plus , or ;
##    - aggregate score         e.g. [agg 3-2]
##    - team won/win/wins on away goals,
##        optionally prefixed with an aggregate score
##  Flags: i (case-insensitive) and x (extended / free-spacing).
SCORE_NOTE_RE =

%r{
    \[ 
    (?<score_note>
      (?:   # plain aet e.g. [aet]
             aet | a\.e\.t\. |
             after [ ] extra [ -] time
       )
      |
       (?:  # plain penalties e.g. [3-2 pen]
             \d{1,2}-\d{1,2}
                [ ]* (?: p|pen )
       )
      |
        (?:  # plain aet with penalties e.g. [aet; 4-3 pen] or [aet, 4-3p]
              aet [ ]* [,;]
                [ ]*
              \d{1,2}-\d{1,2}
                [ ]* (?: p|pen )
         )
      |
      (?:
         ## e.g. Spain wins on penalties
         ##       1860 München wins on penalties etc.
         ##   must start with digit 1-9 or letter
         ##     todo - add more special chars - why? why not?
         ##     
               (?:
                    aet [ ]*   ## allow space here - why? why not
                       [,;][ ]
                )?
           
              (?:
              (?:  # opt 1 - no team listed/named - requires score
                 (?: won|wins? ) [ ]     ## note - allow won,win or wins
                (?:   ## score
                   \d{1,2}-\d{1,2}
                   [ ]
                ) 
                on [ ]  (?: pens | penalties |
                          aggregate  )   
               )
              |
              (?:  # opt 2 - team required; score optional
                (?:  ## team required
                      [1-9\p{L}][0-9\p{L} .-]+?    
                     [ ]
                 )
                 (?: won|wins? ) [ ]     ## won/win/wins
                 (?:   ## score optional
                    \d{1,2}-\d{1,2}
                    [ ]
                  )?            
                  on [ ] (?:  pens | penalties |
                              aggregate  )
             ###  [^\]]*?   ## allow more? use non-greedy
          )
        ))
         |
         (?:  ## e.g. agg 3-2 etc.
             agg [ ] \d{1,2}-\d{1,2}
         )
         |
         (?:   ## e.g. agg 4-4, Ajax win on away goals
              (?:   ## agg 4-4, optional for now - why? why not? 
                 agg [ ] \d{1,2}-\d{1,2} 
                 [ ]*[,;][ ]
               )?
             (?:  ## team required
                      [1-9\p{L}][0-9\p{L} .-]+?    
                     [ ]
              )
              (?: won|wins? ) [ ]     # won/win/wins
              on [ ] away [ ] goals
         )
      )   # score_note ref
    \]
}ix

Class Method Summary collapse

.build_map(lines, downcase: false) ⇒ Object
.build_names(lines) ⇒ Object
.parse_date(str, start:) ⇒ Object

add a date parser helper.
.parse_names(txt) ⇒ Object

Instance Method Summary collapse

#_tokenize_line(line) ⇒ Object
#debug? ⇒ Boolean
#initialize(lines, debug: false) ⇒ Lexer constructor

A new instance of Lexer.
#is_group?(text) ⇒ Boolean

todo/fix - use LangHelper or such e.g.
#is_leg?(text) ⇒ Boolean
#is_round?(text) ⇒ Boolean
#is_zone?(text) ⇒ Boolean
#log(msg) ⇒ Object
#tokenize_with_errors ⇒ Object

Constructor Details

#initialize(lines, debug: false) ⇒ `Lexer`

Returns a new instance of Lexer.

# File 'lib/sportdb/parser/lexer.rb', line 126

## Set up the lexer with the text to tokenize.
##
##   lines - an array of lines (each gets a newline appended)
##             or a single all-in-one text
##   debug - turn on debug output (see debug?)
##
##  The text gets preprocessed into @txt line-by-line:
##    - every line is stripped (leading & trailing whitespace)
##    - lines starting with # (comments) are dropped
##    - inline end-of-line comments (# ...) are cut off
##    - note - empty lines are KEPT
def initialize( lines, debug: false )
  @debug = debug

  ## unify input to a single text
  source = if lines.is_a?( Array )
             lines.each_with_object( String.new ) do |entry, buf|
               buf << entry << "\n"
             end
           else
             lines    ## assume text is passed in as is
           end

  @txt = String.new
  source.each_line do |raw|
    stripped = raw.strip

    ## comment-only lines get dropped; everything else is kept
    ##   (minus any inline end-of-line comment)
    unless stripped.start_with?( '#' )
      @txt << stripped.sub( /#.*/, '' ).strip << "\n"
    end
  end
end

Class Method Details

.build_map(lines, downcase: false) ⇒ `Object`

# File 'lib/sportdb/parser/token-date.rb', line 40

## Build a lookup table that maps every name to
##   the (one-based) number of the line it is listed on, e.g.
##     [["January", "Jan"], ["February", "Feb"]]
##   =>
##     {"January" => 1,  "Jan" => 1,
##      "February" => 2, "Feb" => 2}
##
##   lines    - array of arrays of names (one array per line)
##   downcase - if true, use the downcased name as key
##
##  note - if a name is listed more than once the last line wins
def self.build_map( lines, downcase: false )
  lookup = {}

  ## note: numbering starts with 1 (and NOT with zero)
  lines.each.with_index( 1 ) do |names, num|
    names.each do |name|
      key = downcase ? name.downcase : name
      lookup[ key ] = num
    end
  end

  lookup
end

.build_names(lines) ⇒ `Object`

# File 'lib/sportdb/parser/token-date.rb', line 33

## Join all names of all lines into a single
##   alternation string (separated by |), e.g.
##     January|Jan|February|Feb|March|Mar|...
##
##   lines - array of arrays of names (one array per line)
def self.build_names( lines )
  per_line = lines.map { |names| names.join( '|' ) }
  per_line.join( '|' )
end

.parse_date(str, start:) ⇒ `Object`

add a date parser helper

# File 'lib/sportdb/parser/token-date.rb', line 197

## Parse a date string (using DATE_RE) into a Date.
##
##   str   - the text to parse
##   start - reference date (responds to year, month, day);
##             used to work out the year if the text has none:
##             month/day on or after start => start.year
##             otherwise                   => start.year+1
##
##  note - if the text does not match DATE_RE an error message
##           gets printed and the program is terminated (exit 1)
def self.parse_date( str, start: )
  m = DATE_RE.match( str )

  unless m
    puts "!! ERROR - unexpected date format; cannot parse >#{str}<"
    exit 1
  end

  year  = m[:year]       ? m[:year].to_i(10)                    : nil
  month = m[:month_name] ? MONTH_MAP[ m[:month_name].downcase ] : nil
  day   = m[:day]        ? m[:day].to_i(10)                     : nil
  ## note - week day gets looked up but is not used for building the date
  _wday = m[:day_name]   ? DAY_MAP[ m[:day_name].downcase ]     : nil

  if year.nil?   ## no year in text - work it out from start
    on_or_after_start = month > start.month ||
                        (month == start.month && day >= start.day)

    ## e.g. 2013 for 2013/14 season if on or after start
    ##      2014 for 2013/14 season otherwise
    year = on_or_after_start ? start.year : start.year+1
  end

  Date.new( year, month, day )
end

.parse_names(txt) ⇒ `Object`

# File 'lib/sportdb/parser/token-date.rb', line 6

## Parse a text with names (one or more per line,
##   separated by spaces or tabs) into an array of arrays, e.g.
##     January  Jan
##     February Feb
##   =>
##     [["January", "Jan"], ["February", "Feb"]]
##
##  note - empty lines and comment lines (starting with #) get skipped;
##         inline end-of-line comments get cut off, e.g.
##           Janvier  Janv  Jan  ## check janv in use??
##         =>
##           Janvier  Janv  Jan
def self.parse_names( txt )
  ## todo/fix -- add check for duplicates
  txt.each_line.each_with_object( [] ) do |raw, rows|
    entry = raw.strip
    next if entry.empty? || entry.start_with?( '#' )

    words = entry.sub( /#.*/, '' ).strip.split( /[ \t]+/ )
    rows << words
  end
end

Instance Method Details

#_tokenize_line(line) ⇒ `Object`

# File 'lib/sportdb/parser/lexer.rb', line 307

def _tokenize_line( line )
  tokens = []
  errors = []   ## keep a list of errors - why? why not?

  puts "line: >#{line}<"    if debug?


   ### special case for empty line (aka BLANK)
   if line.empty?
       ## note - blank always resets parser mode to std/top-level!!!
       @re = RE

       tokens << [:BLANK, '<|BLANK|>']
       return [tokens, errors]
   end


  pos = 0
  ## track last offsets - to report error on no match
  ##   or no match in end of string
  offsets = [0,0]
  m = nil


  ####
  ## quick hack - keep re state/mode between tokenize calls!!!
  @re  ||= RE     ## note - switch between RE & INSIDE_RE


  if @re == RE  ## top-level
    ### check for modes once (per line) here to speed-up parsing
    ###   for now goals only possible for start of line!!
    ###        fix - remove optional [] - why? why not?  
  
    ##  start with prop key (match will switch into prop mode!!!)
    ##   - fix - remove leading spaces in regex (upstream) - why? why not?
    if (m = PROP_KEY_RE.match( line ))
      ###  switch into new mode
      ##  switch context  to PROP_RE
        puts "  ENTER PROP_RE MODE"   if debug?
        key = m[:key]


        ### todo - add prop yellow/red cards too - why? why not?
        if ['sent off', 'red cards'].include?( key.downcase) 
          @re = PROP_CARDS_RE    ## use CARDS_RE ???
          tokens << [:PROP_REDCARDS, m[:key]]
        elsif ['yellow cards'].include?( key.downcase )
          @re = PROP_CARDS_RE  
          tokens << [:PROP_YELLOWCARDS, m[:key]]
        elsif ['ref', 'referee'].include?( key.downcase )
          @re = PROP_REFEREE_RE     
          tokens << [:PROP_REFEREE, m[:key]]
        elsif ['att', 'attn', 'attendance'].include?( key.downcase )
          @re = PROP_ATTENDANCE_RE
          tokens << [:PROP_ATTENDANCE, m[:key]]         
        elsif ['goals'].include?( key.downcase )
          @re = PROP_GOAL_RE
          tokens << [:PROP_GOALS, m[:key]]
        elsif ['penalties', 'penalty shootout'].include?( key.downcase )
          @re = PROP_PENALTIES_RE
          tokens << [:PROP_PENALTIES, m[:key]]
        else   ## assume (team) line-up
          @re = PROP_RE           ## use LINEUP_RE ???
          tokens << [:PROP, m[:key]]
        end

        offsets = [m.begin(0), m.end(0)]
        pos = offsets[1]    ## update pos
    elsif (m = ROUND_OUTLINE_RE.match( line ))
      puts "   ROUND_OUTLINE"  if debug?

      tokens << [:ROUND_OUTLINE, m[:round_outline]]

      ## note - eats-up line for now (change later to only eat-up marker e.g. »|>>)
      offsets = [m.begin(0), m.end(0)]
      pos = offsets[1]    ## update pos
    elsif (m = PLAYER_WITH_SCORE_RE.match( line ))
      ##  switch context to GOAL_RE (goalline(s)
      ##   split token (automagically) into two!! - player AND minute!!!
      @re = GOAL_RE
      puts "  ENTER GOAL_RE MODE"   if debug?

      score = {}
      ## must always have ft for now e.g. 1-1 or such
      ###  change to (generic) score from ft -
      ##     might be score a.e.t. or such - why? why not?
      score[:ft] = [m[:ft1].to_i(10),
                    m[:ft2].to_i(10)]  
      ## note - for debugging keep (pass along) "literal" score
      tokens << [:SCORE, [m[:score], score]]

      ## auto-add player token 
      tokens << [:PLAYER, m[:name]]
  
      offsets = [m.begin(0), m.end(0)]
      pos = offsets[1]    ## update pos

    ####  FIX/FIX/TODO
    ### looks to hang in player with minute 
    ###  FIX - improve / rework PLAYER_WITH_MINUTE_RE  regex!!!!
    elsif (_quick = QUICK_PLAYER_WITH_MINUTE_RE.match(line) &&
                m = PLAYER_WITH_MINUTE_RE.match( line ))
      ##  switch context to GOAL_RE (goalline(s)
      ##   split token (automagically) into two!! - player AND minute!!!
      @re = GOAL_RE
      puts "  ENTER GOAL_RE MODE"   if debug?

      ## check for optional open_bracket
      tokens << [:'[']     if m[:open_bracket]

      ## check for  -;  (none with separator)
      ##    todo - find a better way? how possible?
      tokens << [:NONE, "<|NONE|>"]   if m[:none]
      
      ## auto-add player token first
      tokens << [:PLAYER, m[:name]]
      ## minute props
      minute = {}
      minute[:m]      = m[:value].to_i(10)
      minute[:offset] = m[:value2].to_i(10)   if m[:value2]
      ##  t is minute only
      tokens << [:MINUTE, [m[:minute], minute]]

      offsets = [m.begin(0), m.end(0)]
      pos = offsets[1]    ## update pos
    end
  end



  old_pos = -1   ## allows to backtrack to old pos (used in geo)

  while m = @re.match( line, pos )
    # if debug?
    #  pp m
    #  puts "pos: #{pos}"
    # end
    offsets = [m.begin(0), m.end(0)]

    if offsets[0] != pos
      ## match NOT starting at start/begin position!!!
      ##  report parse error!!!
      msg =  "!! WARN - parse error (tokenize) - skipping >#{line[pos..(offsets[0]-1)]}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
      puts msg

      errors << "parse error (tokenize) - skipping >#{line[pos..(offsets[0]-1)]}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
      log( msg )
    end


    ##
    ## todo/fix - also check if possible
    ##   if no match but not yet end off string!!!!
    ##    report skipped text run too!!!

    old_pos = pos
    pos     = offsets[1]

#    pp offsets   if debug?

    ##
    ## note: racc requires pairs e.g. [:TOKEN, VAL]
    ##         for VAL use "text" or ["text", { opts }]  array

  t = if @re == GEO_RE
         ### note - possibly end inline geo on [ (and others?? in the future
         if m[:space] || m[:spaces]
            nil    ## skip space(s)
         elsif m[:text]
            [:GEO, m[:text]]   ## keep pos - why? why not?
         elsif m[:timezone]
            [:TIMEZONE, m[:timezone]]
         elsif m[:sym]
            sym = m[:sym]
            ## return symbols "inline" as is - why? why not?
            ## (?<sym>[;,@|\[\]-])
   
            case sym
            when ',' then [:',']
            when '›' then [:',']  ## note - treat geo sep › (unicode) like comma for now!!!
            when '>' then [:',']  ## note - treat geo sep > (ascii) like comma for now!!!
            when '[' then
                 ## get out-off geo mode and backtrack (w/ next)
                 puts "  LEAVE GEO_RE MODE, BACK TO TOP_LEVEL/RE"  if debug?
                 @re = RE
                 pos = old_pos
                 next   ## backtrack (resume new loop step)                 
            else
              puts "!!! TOKENIZE ERROR (sym) - ignore sym >#{sym}<"
              nil  ## ignore others (e.g. brackets [])
            end
          elsif m[:any]
             ## todo/check log error
             msg = "parse error (tokenize geo) - skipping any match>#{m[:any]}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
             puts "!! WARN - #{msg}"
  
             errors << msg
             log( "!! WARN - #{msg}" )
       
             nil   
          else
            ## report error/raise expection
             puts "!!! TOKENIZE ERROR - no match found"
             nil 
          end
      elsif @re == PROP_CARDS_RE 
        if m[:space] || m[:spaces]
              nil    ## skip space(s)
         elsif m[:prop_name]
              [:PROP_NAME, m[:name]]
         elsif m[:minute]
              minute = {}
              minute[:m]      = m[:value].to_i(10)
              minute[:offset] = m[:value2].to_i(10)   if m[:value2]
             ## note - for debugging keep (pass along) "literal" minute
             [:MINUTE, [m[:minute], minute]]
         elsif m[:sym]
            sym = m[:sym]
            case sym
            when ',' then [:',']
            when ';' then [:';']
            when '-' then [:'-']
            else
              nil  ## ignore others (e.g. brackets [])
            end
         else
            ## report error
             puts "!!! TOKENIZE ERROR (PROP_CARDS_RE) - no match found"
             nil 
         end    
      elsif @re == PROP_RE   ### todo/fix - change to LINEUP_RE !!!!
         if m[:space] || m[:spaces]
              nil    ## skip space(s)
         elsif m[:prop_key]   ## check for inline prop keys
              key = m[:key]   
              ##  supported for now coach/trainer (add manager?)
              if ['coach', 
                  'trainer'].include?( key.downcase )
                [:COACH, m[:key]]   ## use COACH_KEY or such - why? why not?
              else
                ## report error - for unknown (inline) prop key in lineup
                nil
              end
         elsif m[:prop_name]
               if m[:name] == 'Y'
                 [:YELLOW_CARD, m[:name]]
               elsif m[:name] == 'R'
                 [:RED_CARD, m[:name]]
               else 
                 [:PROP_NAME, m[:name]]
               end
         elsif m[:minute]
              minute = {}
              minute[:m]      = m[:value].to_i(10)
              minute[:offset] = m[:value2].to_i(10)   if m[:value2]
             ## note - for debugging keep (pass along) "literal" minute
             [:MINUTE, [m[:minute], minute]]
         elsif m[:sym]
            sym = m[:sym]
            ## return symbols "inline" as is - why? why not?
            ## (?<sym>[;,@|\[\]-])
 
            case sym
            when ',' then [:',']
            when ';' then [:';']
            when '[' then [:'[']
            when ']' then [:']']
            when '(' then [:'(']
            when ')' then [:')']
            when '-' then [:'-']
            else
              nil  ## ignore others (e.g. brackets [])
            end
         else
            ## report error
             puts "!!! TOKENIZE ERROR (PROP_RE) - no match found"
             nil 
         end
      elsif @re == PROP_ATTENDANCE_RE
         if m[:space] || m[:spaces]
              nil    ## skip space(s)
         elsif m[:enclosed_name]
              ## reserverd for use for sold out or such (in the future) - why? why not?
             [:ENCLOSED_NAME, m[:name]]
         elsif m[:num]
             [:PROP_NUM, [m[:num], { value: m[:value].to_i(10) } ]]
=begin             
         elsif m[:sym]
            sym = m[:sym]
            case sym
            when ',' then [:',']
            when ';' then [:';']
            # when '[' then [:'[']
            # when ']' then [:']']
            else
              nil  ## ignore others (e.g. brackets [])
            end
=end
         else
            ## report error
            puts "!!! TOKENIZE ERROR (PROP_ATTENDANCE_RE) - no match found"
            nil 
         end
      elsif @re == PROP_REFEREE_RE
         if m[:space] || m[:spaces]
              nil    ## skip space(s)
         elsif m[:prop_key]   ## check for inline prop keys
              key = m[:key]   
              ##  supported for now coach/trainer (add manager?)
              if ['att', 'attn', 'attendance' ].include?( key.downcase )
                [:ATTENDANCE, m[:key]]   ## use COACH_KEY or such - why? why not?
              else
                ## report error - for unknown (inline) prop key in lineup
                nil
              end
         elsif m[:prop_name]    ## note - change prop_name to player
             [:PROP_NAME, m[:name]]    ### use PLAYER for token - why? why not?
         elsif m[:num]
             [:PROP_NUM, [m[:num], { value: m[:value].to_i(10) } ]]
         elsif m[:enclosed_name]
              ## use HOLD,SAVE,POST or such keys - why? why not?
             [:ENCLOSED_NAME, m[:name]]
         elsif m[:sym]
            sym = m[:sym]
            case sym
            when ',' then [:',']
            when ';' then [:';']
 #           when '[' then [:'[']
 #           when ']' then [:']']
            else
              nil  ## ignore others (e.g. brackets [])
            end
         else
            ## report error
            puts "!!! TOKENIZE ERROR (PROP_REFEREE_RE) - no match found"
            nil 
         end       
      elsif @re == PROP_PENALTIES_RE
        if m[:space] || m[:spaces]
              nil    ## skip space(s)
         elsif m[:prop_name]    ## note - change prop_name to player
             [:PROP_NAME, m[:name]]    ### use PLAYER for token - why? why not?
         elsif m[:enclosed_name]
              ## use HOLD,SAVE,POST or such keys - why? why not?
             [:ENCLOSED_NAME, m[:name]]
         elsif m[:score]
              score = {}
              ## must always have ft for now e.g. 1-1 or such
              ###  change to (generic) score from ft -
              ##     might be score a.e.t. or such - why? why not?
              score[:ft] = [m[:ft1].to_i(10),
                            m[:ft2].to_i(10)]  
              ## note - for debugging keep (pass along) "literal" score
              [:SCORE, [m[:score], score]]
         elsif m[:sym]
            sym = m[:sym]
            case sym
            when ',' then [:',']
            when ';' then [:';']
            when '[' then [:'[']
            when ']' then [:']']
            else
              nil  ## ignore others (e.g. brackets [])
            end
         else
            ## report error
            puts "!!! TOKENIZE ERROR (PROP_PENALTIES_RE) - no match found"
            nil 
         end
      elsif @re == GOAL_RE || @re == PROP_GOAL_RE
         if m[:space] || m[:spaces]
              nil    ## skip space(s)
         elsif m[:prop_name]    ## note - change prop_name to player
             [:PLAYER, m[:name]] 
         elsif m[:minute]
              minute = {}
              minute[:m]      = m[:value].to_i(10)
              minute[:offset] = m[:value2].to_i(10)   if m[:value2]
             ## note - for debugging keep (pass along) "literal" minute
             [:MINUTE, [m[:minute], minute]]
         elsif m[:score]
              score = {}
              ## must always have ft for now e.g. 1-1 or such
              ###  change to (generic) score from ft -
              ##     might be score a.e.t. or such - why? why not?
              score[:ft] = [m[:ft1].to_i(10),
                            m[:ft2].to_i(10)]  
              ## note - for debugging keep (pass along) "literal" score
              [:SCORE, [m[:score], score]]
         elsif m[:og]
             [:OG, m[:og]]    ## for typed drop - string version/variants ??  why? why not?
         elsif m[:pen]
             [:PEN, m[:pen]]
         elsif m[:sym]
            sym = m[:sym]
            ## return symbols "inline" as is - why? why not?
            ## (?<sym>[;,@|\[\]-])
 
            case sym
            when ',' then [:',']
            when ';' then [:';']
            when '[' then [:'[']
            when ']' then [:']']
            else
              nil  ## ignore others (e.g. brackets [])
            end
         else
            ## report error
            puts "!!! TOKENIZE ERROR (GOAL_RE) - no match found"
            nil 
         end
      ###################################################
      ## assume TOP_LEVEL (a.k.a. RE) machinery
      else  
        if m[:space] || m[:spaces]
           nil   ## skip space(s)
        elsif m[:text]
          [:TEXT, m[:text]]   ## keep pos - why? why not?
        elsif m[:status]   ## (match) status e.g. cancelled, awarded, etc.
          ## todo/check - add text (or status) 
          #     to opts hash {} by default (for value)
          if m[:status_note]   ## includes note? e.g.  awarded; originally 2-0
             [:STATUS, [m[:status], {status: m[:status], 
                                     note:   m[:status_note]} ]]
          else
             [:STATUS, [m[:status], {status: m[:status] } ]]
          end
        elsif m[:note]
            ###  todo/check:
            ##      use value hash - why? why not? or simplify to:
            ## [:NOTE, [m[:note], {note: m[:note] } ]]
             [:NOTE, m[:note]] 
        elsif m[:score_note]
             [:SCORE_NOTE, m[:score_note]]
        elsif m[:time]
              ## unify to iso-format
              ###   12.40 => 12:40
              ##    12h40 => 12:40 etc.
              ##  keep string (no time-only type in ruby)
              hour =   m[:hour].to_i(10)  ## allow 08/07/etc.
              minute = m[:minute].to_i(10)
              ## check if valid -  0:00 - 24:00
              ##   check if 24:00 possible? or only 0:00 (23:59)
              if (hour >= 0 && hour <= 24) &&
                 (minute >=0 && minute <= 59)
               ## note - for debugging keep (pass along) "literal" time
               ##   might use/add support for am/pm later
               [:TIME, [m[:time], {h:hour,m:minute}]]
              else
                 raise ArgumentError, "parse error - time >#{m[:time]}< out-of-range"
              end
        elsif m[:date]
            date = {}
 ## map month names
 ## note - allow any/upcase JULY/JUL etc. thus ALWAYS downcase for lookup
            date[:y]  = m[:year].to_i(10)  if m[:year]
            ## check - use y too for two-digit year or keep separate - why? why not?
            date[:yy] = m[:yy].to_i(10)    if m[:yy]    ## two digit year (e.g. 25 or 78 etc.)
            date[:m] = m[:month].to_i(10)  if m[:month]
            date[:m] = MONTH_MAP[ m[:month_name].downcase ]   if m[:month_name]
            date[:d]  = m[:day].to_i(10)   if m[:day]
            date[:wday] = DAY_MAP[ m[:day_name].downcase ]   if m[:day_name]
            ## note - for debugging keep (pass along) "literal" date
            [:DATE, [m[:date], date]]
        elsif m[:duration]
            ## todo/check/fix - if end: works for kwargs!!!!!
            duration = { start: {}, end: {}}
            duration[:start][:y] = m[:year1].to_i(10)  if m[:year1]
            duration[:start][:m] = MONTH_MAP[ m[:month_name1].downcase ]   if m[:month_name1]
            duration[:start][:d]  = m[:day1].to_i(10)   if m[:day1]
            duration[:start][:wday] = DAY_MAP[ m[:day_name1].downcase ]   if m[:day_name1]
            duration[:end][:y] = m[:year2].to_i(10)  if m[:year2]
            duration[:end][:m] = MONTH_MAP[ m[:month_name2].downcase ]   if m[:month_name2]
            duration[:end][:d]  = m[:day2].to_i(10)   if m[:day2]
            duration[:end][:wday] = DAY_MAP[ m[:day_name2].downcase ]   if m[:day_name2]
            ## note - for debugging keep (pass along) "literal" duration
            [:DURATION, [m[:duration], duration]]
        elsif m[:wday]    ## standalone weekday e.g. Mo/Tu/We/etc.
             [:WDAY, [m[:wday], { wday: DAY_MAP[ m[:day_name].downcase ] } ]]
        elsif m[:num]   ## fix - change to ord (for ordinal number!!!)
              ## note -  strip enclosing () and convert to integer
             [:ORD, [m[:num], { value: m[:value].to_i(10) } ]]
        elsif m[:score_more]
              score = {}
              ## check for pen
              score[:p] = [m[:p1].to_i(10),
                           m[:p2].to_i(10)]  if m[:p1] && m[:p2]
              score[:et] = [m[:et1].to_i(10),
                            m[:et2].to_i(10)]  if m[:et1] && m[:et2]
              score[:ft] = [m[:ft1].to_i(10),
                            m[:ft2].to_i(10)]  if m[:ft1] && m[:ft2]
              score[:ht] = [m[:ht1].to_i(10),
                            m[:ht2].to_i(10)]  if m[:ht1] && m[:ht2]

            ## note - for debugging keep (pass along) "literal" score
            [:SCORE_MORE, [m[:score_more], score]]
        elsif m[:score]
            score = {}
            ## must always have ft for now e.g. 1-1 or such
            ###  change to (generic) score from ft -
            ##     might be score a.e.t. or such - why? why not?
            score[:ft] = [m[:ft1].to_i(10),
                          m[:ft2].to_i(10)]  
          ## note - for debugging keep (pass along) "literal" score
          [:SCORE, [m[:score], score]]
      elsif m[:minute]
              minute = {}
              minute[:m]      = m[:value].to_i(10)
              minute[:offset] = m[:value2].to_i(10)   if m[:value2]
             ## note - for debugging keep (pass along) "literal" minute
             [:MINUTE, [m[:minute], minute]]
        elsif m[:vs]
           [:VS, m[:vs]]
        elsif m[:sym]
          sym = m[:sym]
          ## return symbols "inline" as is - why? why not?
          ## (?<sym>[;,@|\[\]-])
 
          case sym
          when '@'    ##  enter geo mode
            puts "  ENTER GEO_RE MODE"  if debug?
            @re = GEO_RE
            [:'@']
          when ',' then [:',']
          when ';' then [:';']
          when '/' then [:'/']
          when '|' then [:'|']
          when '[' then [:'[']
          when ']' then [:']']
          when '-' then [:'-']        # level 1 OR (classic) dash
          when '--'   then [:'--']    # level 2
          when '---'  then [:'---']   # level 3
          when '----' then [:'----']  # level 4
          else
            puts "!!! TOKENIZE ERROR (sym) - ignore sym >#{sym}<"
            nil  ## ignore others (e.g. brackets [])
          end
        elsif m[:any]
           ## todo/check log error
           msg = "parse error (tokenize) - skipping any match>#{m[:any]}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
           puts "!! WARN - #{msg}"

           errors << msg
           log( "!! WARN - #{msg}" )
     
           nil   
        else
          ## report error
           puts "!!! TOKENIZE ERROR - no match found"
           nil 
        end
      end


    tokens << t    if t

#    if debug?
#      print ">"
#      print "*" * pos
#      puts "#{line[pos..-1]}<"
#    end
  end

  ## check if no match in end of string
  if offsets[1] != line.size
    msg =  "!! WARN - parse error - skipping >#{line[offsets[1]..-1]}< @#{offsets[1]},#{line.size} in line >#{line}<"
    puts msg
    log( msg )

    errors << "parse error (tokenize) - skipping >#{line[offsets[1]..-1]}< @#{offsets[1]},#{line.size} in line >#{line}<"
  end


   if @re == GOAL_RE   ### ALWAYS switch back to top level mode
     puts "  LEAVE GOAL_RE MODE, BACK TO TOP_LEVEL/RE"  if debug?
     @re = RE 
   end
 
   if @re == GEO_RE   ### ALWAYS switch back to top level mode
     puts "  LEAVE GEO_RE MODE, BACK TO TOP_LEVEL/RE"  if debug?
     @re = RE 
   end

   ##
   ## if in prop mode continue if   last token is [,-]
   ##        otherwise change back to "standard" mode
   if @re == PROP_RE            || @re == PROP_CARDS_RE ||
      @re == PROP_GOAL_RE       || @re == PROP_PENALTIES_RE ||
      @re == PROP_ATTENDANCE_RE || @re == PROP_REFEREE_RE
     if [:',', :'-', :';'].include?( tokens[-1][0] )
        ## continue/stay in PROP_RE mode
        ##  todo/check - auto-add PROP_CONT token or such
        ##                to help parser with possible NEWLINE
        ##                  conflicts  - why? why not?
     else
        ## switch back to top-level mode!!
        puts "  LEAVE PROP_RE MODE, BACK TO TOP_LEVEL/RE"  if debug?
        @re = RE 
        ## note - auto-add PROP_END (<PROP_END>)
        tokens << [:PROP_END, "<|PROP_END|>"]    
     end
   end

  
  [tokens,errors]
end

#debug? ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/sportdb/parser/lexer.rb', line 124

def debug?() @debug == true; end

#is_group?(text) ⇒ `Boolean`

todo/fix - use LangHelper or such

 e.g.     class Lexer
              include LangHelper
          end

merge back Lang into Lexer - why? why not?

Keep the “old” access for checking group, round & friends here for now, for compatibility.

Returns:

(Boolean)

# File 'lib/sportdb/parser/lexer.rb', line 26

def is_group?( text ) Lang.is_group?( text ); end

#is_leg?(text) ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/sportdb/parser/lexer.rb', line 28

def is_leg?( text ) Lang.is_leg?( text ); end

#is_round?(text) ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/sportdb/parser/lexer.rb', line 27

def is_round?( text ) Lang.is_round?( text ); end

#is_zone?(text) ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/sportdb/parser/lexer.rb', line 29

def is_zone?( text ) Lang.is_zone?( text ); end

#log(msg) ⇒ `Object`

# File 'lib/sportdb/parser/lexer.rb', line 7

## Append a single message, followed by a newline, to the
##   log file ./logs.txt (opened in append mode as utf-8).
##
##   todo/check - use ./errors.txt instead - why? why not?
def log( msg )
   File.open( './logs.txt', 'a:utf-8' ) { |out|
     out.write( msg )
     out.write( "\n" )   ## one message per line
   }
end

#tokenize_with_errors ⇒ `Object`

# File 'lib/sportdb/parser/lexer.rb', line 167

## Tokenize the text in @txt line-by-line and post-process the tokens
##   of every line (keyword tagging, section starters, pattern merging).
##
##  Returns a two-element array [tokens, errors]:
##   - tokens - flat list of all tokens; a [:NEWLINE, "\n"] token
##               gets auto-added after every line
##               (unless the line starts with a :BLANK token)
##   - errors - all errors collected from _tokenize_line
##
##  note - a line that yields NO tokens at all (e.g. nothing matched)
##           gets skipped, that is, adds no tokens and no NEWLINE;
##           its errors are still reported.
##           (before such a line raised NoMethodError on nil)
def tokenize_with_errors
  tokens_by_line = []   ## note: add tokens line-by-line (flatten later)
  errors         = []   ## keep a list of errors - why? why not?

  @txt.each_line do |line|
    line = line.rstrip   ## note - MUST remove/strip trailing newline (spaces optional)!!!

    more_tokens, more_errors = _tokenize_line( line )

    tokens_by_line << more_tokens
    errors.concat( more_errors )
  end

  ## transform & flatten tokens
  tokens = []
  tokens_by_line.each do |line_tokens|
    nodes = if line_tokens.empty?
              []     ## nothing to tag or transform
            else
              tagged = _tag_keywords( line_tokens )
              _tag_section_starter( tagged )
              _transform_tokens( tagged )
            end

    pp nodes   if debug?

    next if nodes.empty?   ## skip lines without any tokens

    tokens.concat( nodes )
    ## auto-add newlines  (unless BLANK!!)
    tokens << [:NEWLINE, "\n"]   unless nodes[0][0] == :BLANK
  end

  [tokens, errors]
end

## pass 1 - replace all texts with keyword matches
##            (e.g. group, round, leg, etc.)
##
##   note - let is_round? go first (before is_group?)
##            will match group stage as round (NOT group)
##
##  returns a new token list; non-matching tokens pass through as-is (1:1)
def _tag_keywords( tokens )
  tokens.map do |t|
    next t   unless t[0] == :TEXT

    text = t[1]
    if is_round?( text ) || is_leg?( text ) || is_zone?( text )
      [:ROUND, text]
    elsif is_group?( text )
      [:GROUP, text]
    else
      t
    end
  end
end

## check for "section" starters e.g. Teams or such
##   note - changes the type of the first token in-place
##            if it is a text that reads teams or blank (case-insensitive)
def _tag_section_starter( tokens )
  first = tokens[0]
  return   unless first && first[0] == :TEXT

  case first[1]
  when /\Ateams\z/i  then first[0] = :TEAMS
  when /\Ablank\z/i  then first[0] = :BLANK   ### todo/fix -- remove!!! add real blanks!!
  end
end

## pass 2 - transform tokens (using simple patterns)
##            to help along the (racc look ahead 1 - LA1) parser
##
##  returns the list of (transformed) tokens for one line
def _transform_tokens( tokens )
  nodes = []
  buf   = Tokens.new( tokens )

  until buf.eos?
    if buf.pos == 0   ## MUST start line
      ## check for group def or round def
      if buf.match?( :ROUND, :'|' )    ## assume round def (change round to round_def)
        nodes << [:ROUND_DEF, buf.next[1]]
        nodes << buf.next
        nodes.concat( buf.collect )    ## rest of line passes through as-is
        break
      end
      if buf.match?( :GROUP, :'|' )    ## assume group def (change group to group_def)
        nodes << [:GROUP_DEF, buf.next[1]]
        nodes << buf.next
        ## change all text to team - why? why not?
        nodes.concat( buf.collect { |t| t[0] == :TEXT ? [:TEAM, t[1]] : t } )
        break
      end
    end

    if buf.match?( :TEXT, [:SCORE, :SCORE_MORE, :VS, :'-'], :TEXT )
      ## text on both sides of a score / vs / dash => teams
      nodes << [:TEAM, buf.next[1]]
      nodes << buf.next
      nodes << [:TEAM, buf.next[1]]
    elsif buf.match?( :DATE, :TIME )   ## merge DATE TIME into DATETIME
      date = buf.next[1]
      time = buf.next[1]
      nodes << [:DATETIME, ["#{date[0]} #{time[0]}",   ## concat string of two tokens
                            { date: date[1], time: time[1] }]]
    else
      ## pass through
      nodes << buf.next
    end
  end

  nodes
end

Class: SportDb::Lexer

Defined Under Namespace

Constant Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(lines, debug: false) ⇒ Lexer

Class Method Details

.build_map(lines, downcase: false) ⇒ Object

.build_names(lines) ⇒ Object

.parse_date(str, start:) ⇒ Object

.parse_names(txt) ⇒ Object

Instance Method Details

#_tokenize_line(line) ⇒ Object

#debug? ⇒ Boolean

#is_group?(text) ⇒ Boolean

#is_leg?(text) ⇒ Boolean

#is_round?(text) ⇒ Boolean

#is_zone?(text) ⇒ Boolean

#log(msg) ⇒ Object

#tokenize_with_errors ⇒ Object

#initialize(lines, debug: false) ⇒ `Lexer`

.build_map(lines, downcase: false) ⇒ `Object`

.build_names(lines) ⇒ `Object`

.parse_date(str, start:) ⇒ `Object`

.parse_names(txt) ⇒ `Object`

#_tokenize_line(line) ⇒ `Object`

#debug? ⇒ `Boolean`

#is_group?(text) ⇒ `Boolean`

#is_leg?(text) ⇒ `Boolean`

#is_round?(text) ⇒ `Boolean`

#is_zone?(text) ⇒ `Boolean`

#log(msg) ⇒ `Object`

#tokenize_with_errors ⇒ `Object`