Class: SportDb::Lexer

Inherits:

Object

Object
SportDb::Lexer

show all

Defined in:: lib/sportdb/parser/lexer.rb,
lib/sportdb/parser/token.rb,
lib/sportdb/parser/token-date.rb,
lib/sportdb/parser/token-prop.rb,
lib/sportdb/parser/token-text.rb,
lib/sportdb/parser/token-score.rb,
lib/sportdb/parser/token-minute.rb,
lib/sportdb/parser/token-status.rb

Defined Under Namespace

Classes: Tokens

Constant Summary collapse

TIME_RE = keep 18h30 - why? why not? add support for 6:30pm 8:20am etc. - why? why not? check - only support h e.g. 18h30 or 18H30 too - why? why not? e.g. 18.30 (or 18:30 or 18h30)

%r{
    (?<time>  \b
        (?:   (?<hour>\d{1,2})
                 (?: :|\.|h )
              (?<minute>\d{2})) 
              \b
    )
}ix

TIMEZONE_RE = for timezone format use for now: (BRT/UTC-3) (e.g. brazil time) (CET/UTC+1) - central european time (CEST/UTC+2) - central european summer time - daylight saving time (DST). (EET/UTC+2) - eastern european time (EEST/UTC+3) - eastern european summer time - daylight saving time (DST). UTC+3 UTC+4 UTC+0 UTC+00 UTC+0000 - allow +01 or +0100 - why? why not - +0130 (01:30) see https://en.wikipedia.org/wiki/Time_zone https://en.wikipedia.org/wiki/List_of_UTC_offsets https://en.wikipedia.org/wiki/UTC−04:00 etc. e.g. (UTC-2) or (CEST/UTC-2) etc. todo check - only allow upcase or (utc-2) and (cest/utc-2) too - why? why not?

%r{
   (?<timezone>
      \(
           ## optional "local" timezone name eg. BRT or CEST etc.
           (?:  [a-z]+
                 /
           )?
            [a-z]+
            [+-]
            \d{1,4}   ## e.g. 0 or 00 or 0000
      \)
   )
}ix

WDAY_RE = add wday / stand-alone week day - as separate regex or use TEXT with is_wday? check or such with requirement of beginning of line (anchored to line) only?? - why? why not?

%r{
(?<wday>
  \b     # note - alternation (|) is lowest precedence (such 
         #    parathenes required around \b()\b !!!
         ## note - NOT case sensitive!!!    
       (?<day_name>
        (?-i:
          Mon|Mo|
          Tue|Tu|
          Wed|We|
          Thu|Th|
          Fri|Fr|
          Sat|Sa|
          Sun|Su
       ))
  \b     ## todo/check - must be followed by two spaces or space + [( etc.
         ##   to allow words starting with weekday abbrevations - why? why not?
         ##     check if any names (teams, rounds, etc) come up in practice 
         ##   or maybe remove three letter abbrevations Mon/Tue
         ##    and keep only Mo/Tu/We etc. - why? why not?
)}x

BASICS_RE =

%r{
    ## basic (catch-all) tokens - alternatives are tried in the order listed:
    ##   num    - number in parentheses
    ##   vs     - lower-case vs or v with a space on both sides
    ##   spaces - run of two or more spaces
    ##   space  - single space
    ##   sym    - run of two to four dashes or a single symbol char
    ##            (note - the sym group name is used twice)

    ## e.g. (51) or (1) etc.  - limit digits of number???
    ##  todo/fix - change num  to ord (for ordinal number)!!!!!
    (?<num> \(  (?<value>\d+) \) )
       |
    (?<vs>
       (?<=[ ])	# positive lookbehind for space
       (?-i: 
         vs|v 
       )        # note - only match case sensitive (downcased letters)!!!
                # note -  bigger match first e.g. vs than v etc.
       (?=[ ])   # positive lookahead for space
    )
       |
    (?<spaces> [ ]{2,}) |
    (?<space>  [ ])
        |
    (?<sym>  (?<=^|[ ])  ## positive lookbehind - start of line or space
                  (?: ----|
                      ---|
                      --
                  )      ## note - longest dash run listed first
             (?=[ ])   ## positive lookahead - must be followed by space
    )
        |
    (?<sym> [;,/@|\[\]-] )
}ix

RE = start with prop key (match will/should switch into prop mode!!!)

Regexp.union(  PROP_KEY_RE, ##  start with prop key (match will/should switch into prop mode!!!)
                    STATUS_RE,
                    NOTE_RE,
                    TIMEZONE_RE,
                     TIME_RE,
                     DURATION_RE,  # note - duration MUST match before date
                    DATE_RE,
                    SCORE_MORE_RE, 
                    SCORE_RE,   ## note basic score e.g. 1-1 must go after SCORE_MORE_RE!!!
                    BASICS_RE, 
                    MINUTE_RE,
                    MINUTE_NA_RE,   ## note - add/allow not/available (n/a,na) minutes hack for now
                    GOAL_OG_RE, GOAL_PEN_RE,
                     TEXT_RE,
                     WDAY_RE,  # allow standalone weekday name (e.g. Mo/Tu/etc.) - why? why not?
         #    note - wday MUST be after text e.g. Sun Ke 68' is Sun Ke (NOT Sun) etc.
)

MONTH_LINES =

parse_names( <<TXT )
January    Jan
February   Feb
March      Mar
April      Apr
May
June       Jun
July       Jul
August     Aug
September  Sept  Sep
October    Oct
November   Nov
December   Dec
TXT

MONTH_NAMES =

build_names( MONTH_LINES )

MONTH_MAP = pp MONTH_NAMES

build_map( MONTH_LINES, downcase: true )

DAY_LINES =

parse_names( <<TXT )
Monday                   Mon  Mo
Tuesday            Tues  Tue  Tu
Wednesday                Wed  We
Thursday    Thurs  Thur  Thu  Th
Friday                   Fri  Fr
Saturday                 Sat  Sa
Sunday                   Sun  Su
TXT

DAY_NAMES =

build_names( DAY_LINES )

DAY_MAP = pp DAY_NAMES

build_map( DAY_LINES, downcase: true )

DATE_I_RE = e.g. Fri Aug/9 or Fri Aug 9

%r{
(?<date>
  \b
     ## optional day name
     ((?<day_name>#{DAY_NAMES})
          [ ]
     )?
     (?<month_name>#{MONTH_NAMES})
         (?: \/|[ ] )
     (?<day>\d{1,2})
     ## optional year
     (  [ ]
        (?<year>\d{4})
     )?
  \b
)}ix

DATE_II_RE = e.g. 3 June or 10 June

%r{
(?<date>
  \b
     ## optional day name
     ((?<day_name>#{DAY_NAMES})
          [ ]
     )?
     (?<day>\d{1,2})
         [ ]
     (?<month_name>#{MONTH_NAMES})
     ## optional year
     (  [ ]
        (?<year>\d{4})
     )?
  \b
)}ix

DATE_III_RE = e.g. iso-date - 2011-08-25 note - allow/support ("shortcuts") e.g 2011-8-25 or 2011-8-3 / 2011-08-03 etc.

%r{
(?<date>
  \b
   (?<year>\d{4})
       -
   (?<month>\d{1,2})
       -
   (?<day>\d{1,2})
  \b
)}ix

DATE_RE = map tables note: order matters; first come-first matched/served

Regexp.union(
   DATE_I_RE,
   DATE_II_RE,
   DATE_III_RE,
)

DURATION_I_RE =

%r{
(?<duration>
    \b
  (?:
   ## optional day name
   ((?<day_name1>#{DAY_NAMES})
      [ ]
   )?
   (?<month_name1>#{MONTH_NAMES})
      (?: \/|[ ] )
   (?<day1>\d{1,2})
   ## optional year
   (  ,?   # optional comma
      [ ]
      (?<year1>\d{4})
   )?

   ## support + and -  (add .. or such - why??)
   [ ]* - [ ]*

   ## optional day name
   ((?<day_name2>#{DAY_NAMES})
      [ ]
   )?
   (?<month_name2>#{MONTH_NAMES})
      (?: \/|[ ] )
   (?<day2>\d{1,2})
   ## optional year
   (  ,?   # optional comma
      [ ]
      (?<year2>\d{4})
   )?
  )
   \b
)}ix

DURATION_II_RE = variant ii add support for shorthand August 16-18, 2011 September 13-15, 2011 October 18-20, 2011 March/6-8, 2012 March 6-8 2012 March 6-8 - add support for August 16+17 or such (and check 16+18) use <op> to check if day2 is a plus or range or such - why? why not?

%r{
(?<duration>
    \b
   (?:
       (?<month_name1>#{MONTH_NAMES})
           [ /]
        (?<day1>\d{1,2})
             -
        (?<day2>\d{1,2})
          (?:
            ,?     ## optional comma
            [ ]
            (?<year1>\d{4})
          )?     ## optional year   
   )
   \b
)}ix

DURATION_RE = map tables note: order matters; first come-first matched/served

Regexp.union(
   DURATION_I_RE,
   DURATION_II_RE,
)

PROP_NAME_RE = name different from text (does NOT allow number in name/text)

%r{
                 (?<prop_name> \b
                   (?<name>
                      \p{L}+       
                        \.?    ## optional dot
                      (?: 
                          [ ]?    # only single spaces allowed inline!!!
                          (?:
                              (?:
                                (?<=\p{L})   ## use lookbehind
                                 [/'-]   ## must be surrounded by letters
                                       ## e.g. One/Two NOT
                                       ##      One/ Two or One / Two or One /Two etc.
                                (?=\p{L})      ## use lookahead        
                              )
                                 |   
                              (?:
                                (?<=[ ])   ## use lookbehind  -- add letter (plus dot) or such - why? why not?
                                 [']   ## must be surrounded by leading space and
                                       ## traling letters  (e.g. UDI 'Beter Bed)
                                (?=\p{L})      ## use lookahead        
                              )   
                                 |
                              (?:
                                (?<=\p{L})   ## use lookbehind
                                 [']   ## must be surrounded by leading letter and
                                       ## trailing space PLUS letter  (e.g. UDI' Beter Bed)
                                (?=[ ]\p{L})      ## use lookahead (space WITH letter         
                              )   
                                 |   ## standard case with letter(s) and optinal dot
                              (?: \p{L}+
                                    \.?  ## optional dot
                              )
                          )+
                     )*
                   )
               ## add lookahead - must be non-alphanum 
                  (?=[ ,;\]\)]|$)
                  )
}ix

PROP_KEY_RE =

%r{ 
(?<prop_key> \b
  (?<key>
      (?:\p{L}+
          |
          \d+  # check for num lookahead (MUST be space or dot)
       ## MUST be followed by (optional dot) and
       ##                      required space !!!
       ## MUST be follow by a to z!!!!
        \.?     ## optional dot
        [ ]?   ## make space optional too  - why? why not?
            ##  yes - eg. 1st, 2nd, 5th etc.
        \p{L}+
       )
       [\d\p{L}'/° -]*?   ## allow almost anyting 
                         ## fix - add negative lookahead 
                         ##         no space and dash etc.
                         ##    only allowed "inline" not at the end
                         ## must end with latter or digit!
  )
   [ ]*?     # slurp trailing spaces
    :
   (?=[ ]+)  ## possitive lookahead (must be followed by space!!)
  )
}ix

PROP_BASICS_RE =

%r{
    (?<spaces> [ ]{2,}) |
    (?<space>  [ ])
        |
    (?<sym>  
        [;,\(\)\[\]-] 
    )   
}ix

PROP_RE =

Regexp.union(
   PROP_BASICS_RE, 
   MINUTE_RE,
   PROP_NAME_RE,
)

TEXT_RE =

%r{
    ## must start with alpha (allow unicode letters!!)
    (?<text>
           ## positive lookbehind
           ##  (MUST be fixed number of chars - no quantifier e.g. +? etc.)
            (?<=[ ,;@|\[\]]
                 |^
            )
            (?:
                # opt 1 - start with alpha
                 \p{L}+    ## all unicode letters (e.g. [a-z])
                   |

                # opt 2 - start with num!! - allow special case (e.g. 1. FC)
                     \d+  # check for num lookahead (MUST be space or dot)
                      ## MUST be followed by (optional dot) and
                      ##                      required space !!!
                      ## MUST be follow by a to z!!!!
                      \.?     ## optional dot
                      [ ]?   ## make space optional too  - why? why not?
                             ##  yes - eg. 1st, 2nd, 5th etc.
                       \p{L}+
                  |
                ## opt 3 - add weirdo case
                ##   e.g. 5.-8. Platz Playoffs  - keep - why? why not?
                    \d+\.-\d+\.  [ ]? \p{L}+
               )

              (?:(?:  (?:[ ] 
                        (?! (?-i: vs?[ ])
                          )    ## note - exclude (v[ ]/vs[ ])
                               ##    AND switch to case-sensitive (via -i!!!)
                      )
                      |     # only single spaces allowed inline!!!
                     [-/]
                  )?
                (?:
                  \p{L} |
                  [&'°]
                    |
                 (?:
                   \d+
                   (?!
                     [0-9h'+-] |    ## protected break on 12h / 12' / 1-1
                                    ##  check usege for 3+4 - possible? where ? why?
                     (?:[.:]\d)     ## protected/exclude/break on 12.03 / 12:03
                    )
                   ## negative lookahead for numbers
                   ##   note - include digits itself!!!
                   ##   note - remove / (slash) e.g. allows UDI'19/Beter Bed
                 )|
                 \.
               )
              )*  ## must NOT end with space or dash(-)
              ##  todo/fix - possible in regex here
              ##     only end in alphanum a-z0-9 (not dot or & ???)


            ## allow optional at the end
            ##  tag or year
            ##   make it and in the future - why? why not?
            ##
            ## change - fix
            ##   do NOT use (A) for amateur
            ##   use A or A. with NO ()!!!
            ## (A) -    allow with predined  alpha only for now
            ##          e.g. (A) - amateur a team or b?
            ###  same for U21 or U9 etc
            ##        use with NO ()!!! - why? why not?
            ##      or U21 U9 etc.   - why? why not?
            ##       or etc.
            ## (1879-1893) or allow years e.g. (1879-1893)
            ###
            ##    add allow country code three to five letters for now
            ##       change to generic 1 to 5 - why? why not?
            ##     e.g. (A), (I),
            ##          (AUT)
            ##          (TRNC)   five? for UEFA code for northern cyprus
            ##     change to 1 to 4 - why? why not?
            ##   check - fix possible for upper case only here
            ##                     inline for this group only?
            (?:
               [ ]
               \(
                  \d{4}-\d{4}
               \)
            )?
             (?:
               [ ]+   ## allow more than once space - why? why not?
                  \( (?:
                       [A-Z]{1,5}
                     )
                  \)
             )?
            ## add lookahead/lookbehind
           ##    must be space!!!
           ##   (or comma or  start/end of string)
           ##   kind of \b !!!
            ## positive lookahead
            (?=[ ,;@|\[\]]
                 |$
            )
   )
}ix

P_EN = english helpers (penalty, extra time, …) note - p must go last (shortest match) pso = penalty shootout

'(?: pso | pen\.? | p\.? )'

ET_EN = extra time marker e.g. aet, a.e.t. or a.e.t (note - the preceding P_EN matches e.g. p., p, pen, pen., PSO, etc.)

'(?: aet | a\.e\.t\.? )'

SCORE__P_ET__RE = note: allow SPECIAL cases WITHOUT full time scores (just a.e.t or pen. + a.e.t.) 3-4 pen. 2-2 a.e.t. 3-4 pen. 2-2 a.e.t. 2-2 a.e.t.

%r{
(?<score_more>
   \b
    (?:
       (?<p1>\d{1,2}) - (?<p2>\d{1,2})
         [ ]* #{P_EN} [ ]+
     )?             # note: make penalty (P) score optional for now
    (?<et1>\d{1,2}) - (?<et2>\d{1,2})
       [ ]* #{ET_EN}
       (?=[ ,\]]|$)
)}ix

SCORE__P__RE = note: allow SPECIAL with penalty only 3-4 pen.

%r{
        (?<score_more>
  \b
     (?<p1>\d{1,2}) - (?<p2>\d{1,2})
       [ ]* #{P_EN}
       (?=[ ,\]]|$)
)}ix

SCORE__P_ET_FT_HT__RE = e.g. 3-4 pen. 2-2 a.e.t. (1-1, 1-1) or 3-4p 2-2aet (1-1, ) or 3-4 pen. 2-2 a.e.t. (1-1) or 2-2 a.e.t. (1-1, 1-1) or 2-2 a.e.t. (1-1, ) or 2-2 a.e.t. (1-1)

%r{
          (?<score_more>
   \b
   (?:
    (?<p1>\d{1,2}) - (?<p2>\d{1,2})
       [ ]* #{P_EN} [ ]+
    )?            # note: make penalty (P) score optional for now
   (?<et1>\d{1,2}) - (?<et2>\d{1,2})
       [ ]* #{ET_EN} [ ]+
       \(
       [ ]*
  (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
       [ ]*
    (?:
         , [ ]*
        (?: (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
            [ ]*
        )?
    )?              # note: make half time (HT) score optional for now
  \)
 (?=[ ,\]]|$)
)}ix

SCORE__P_FT_HT__RE = special case for case WITHOUT extra time!! same as above (but WITHOUT extra time and pen required)

%r{
         (?<score_more>
            \b
 (?<p1>\d{1,2}) - (?<p2>\d{1,2})
    [ ]* #{P_EN} [ ]+
    \(
    [ ]*
  (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
    [ ]*
 (?:
      , [ ]*
     (?: (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
         [ ]*
     )?
 )?              # note: make half time (HT) score optional for now
   \)
  (?=[ ,\]]|$)
)}ix

SCORE__FT_HT__RE = e.g. 2-1 (1-1)

%r{
            (?<score_more>
 \b
 (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
      [ ]+ \( [ ]*
   (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
      [ ]* \)
(?=[ ,\]]|$)
)}ix

SCORE__FT__RE = 2-1

%r{
            (?<score>
 \b
 (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
 \b
)}ix

SCORE_MORE_RE = map tables note: order matters; first come-first matched/served check - find a better name for SCORE_MORE - SCORE_EX, SCORE_BIG, or _ - why? why not?

Regexp.union(
  SCORE__P_ET_FT_HT__RE,  # e.g. 5-1 pen. 2-2 a.e.t. (1-1, 1-0)
  SCORE__P_FT_HT__RE,     # e.g. 5-1 pen. (1-1)
  SCORE__P_ET__RE,        # e.g. 2-2 a.e.t.  or  5-1 pen. 2-2 a.e.t.
  SCORE__P__RE,           # e.g. 5-1 pen.
  SCORE__FT_HT__RE,        # e.g. 1-1 (1-0)
  ##  note - keep basic score as its own token!!!!
  ##   that is, SCORE & SCORE_MORE
  ### SCORE__FT__RE,           # e.g. 1-1  -- note - must go last!!!
)

SCORE_RE =

SCORE__FT__RE

GOAL_PEN_RE = goal types (pen.) or (pen) or (p.) or (p) (o.g.) or (og) todo/check - keep case-insensitive or allow OG or P or PEN or only lower case - why? why not?

%r{
   (?<pen> \(
           (?:pen|p)\.?
           \)
    )
}ix

GOAL_OG_RE =

%r{
   (?<og> \(
          (?:og|o\.g\.)
          \)
   )
}ix

MINUTE_NA_RE = minute variant for N/A not/available todo/check - find a better syntax - why? why not? note "??".to_i(10) returns 0 or "__".to_i(10) returns 0 quick hack - assume 0 for n/a for now

%r{
   (?<minute>
      (?<=[ (])	 # positive lookbehind for space or opening 
        (?<value> \?{2} | _{2} )
        '   ## must have minute marker!!!!
    )
}ix

MINUTE_RE =

%r{
     (?<minute>
       (?<=[ (])	 # positive lookbehind for space or opening ( e.g. (61') required
                     #    todo - add more lookbehinds e.g.  ,) etc. - why? why not?
             (?<value>\d{1,3})      ## constrain numbers to 0 to 999!!!
                   (?: \+
                     (?<value2>\d{1,3})   
                   )?           
        '     ## must have minute marker!!!!
     )
}ix

STATUS_RE =

%r{
            \[
      (?:    
            ### opt 1 - allow long forms with note/comment for some stati
           (?: (?<status> awarded
             ## e.g. [awarded match to Leones Negros by undue alignment; original result 1-2]
             ##     [awarded 3-0 to Cafetaleros by undue alignment; originally ended 2-0]
             ##     [awarded 3-0; originally 0-2, América used ineligible player (Federico Viñas)]
                            |
                          annulled
                            |
                          abandoned
             ## e.g. [abandoned at 1-1 in 65' due to cardiac arrest Luton player Tom Lockyer]
             ##      [abandoned at 0-0 in 6' due to waterlogged pitch]
             ##     [abandoned at 5-0 in 80' due to attack on assistant referee by Cerro; result stood]
             ##    [abandoned at 1-0 in 31']
             ##    [abandoned at 0-1' in 85 due to crowd trouble]
                            |
                          postponed
             ## e.g. [postponed due to problems with the screen of the stadium]
             ##      [postponed by storm]
             ##      [postponed due to tropical storm "Hanna"]
             ##      [postponed from Sep 10-12 due to death Queen Elizabeth II]
                           |
                        suspended
             ## e.g. [suspended at 0-0 in 12' due to storm]  
             ##      [suspended at 84' by storm; result stood]
                           |
                         verified
             ## e.g.  [verified 2:0 wo.]
   

               ) [ ;,]* (?<status_note> [^\]]+ )
                 [ ]*
            )
            |
        
            ## opt 2 - short from only (no note/comments)
            (?<status>
               cancelled|canceled|can\.
                 |
               abandoned|abd\.
                 |
               postponed
                 |
               awarded|awd\.
                 |
               replay
                 |
               annulled
                 |
               suspended    ### todo/fix - add status upstream - why? why not?
                            ###  move to note(s) - do NOT interpret as status - why? why not?
                 |
               verified     ### todo/fix - add status upstream (same as ??) - why? why not? 
                            ###  move to note(s) - do NOT interpret as status - why? why not?
            )
      )
    \]
}ix

NOTE_RE = todo/fix - move to token-note.rb (standalone) file

%r{
    \[ 
   (?<note>
     (?:  ##  starting with ___   PLUS requiring more text
       (?:
          nb:
          ##  e.g. [NB: between top-8 of regular season]
          #        [NB: América, Morelia and Tigres qualified on better record regular season]
          #        [NB: Celaya qualified on away goals]
          #        [NB: Alebrijes qualified on away goal]
          #        [NB: Leones Negros qualified on away goals]
          #
          # todo/fix:
          # add "top-level" NB: version
          ##   with full (end-of) line note - why? why not?
          |
          (?: originally[ ])? scheduled
          ## e.g. [originally scheduled to play in Mexico City] 
          |
          rescheduled
          ## e.g.  [Rescheduled due to earthquake occurred in Mexico on September 19]
          |
          remaining
          ## e.g. [remaining 79']   
          ##      [remaining 84'] 
          ##      [remaining 59']   
          ##      [remaining 5']
          |
          played  
          ## e.g. [played in Macaé-RJ]
          ##      [played in Caxias do Sul-RS]
          ##      [played in Sete Lagoas-MG]
          ##      [played in Uberlândia-MG]
          ##      [played in Brasília-DF]
          ##      [played in Vöcklabruck]
          ##      [played in Pasching]
          |
          declared
          ## e.g.  [declared void]
          |
          inter-group
          ## e.g. [inter-group A-B]
          ##      [inter-group C-D]
       )
      [ ]
      [^\]]+?    ## slurp all to next ] - (use non-greedy) 
     )
      |
     (?:
       ## starting with in  - do NOT allow digits
       ##   name starting with in possible - why? why not?
           in[ ]
            [^0-9\]]+?
       ## e.g. [In Estadio La Corregidora] 
       ##      [in Unidad Deportiva Centenario]
       ##      [in Estadio Olímpico Universitario]
       ##      [in Estadio Victoria]
       ##      [in UD José Brindis]
       ##      [in Colomos Alfredo "Pistache" Torres stadium]
     )
       |
      (?:
         ## e.g. Spain wins on penalties
         ##       1860 München wins on penalties etc.
         ##   must start with digit 1-9 or letter
         ##     todo - add more special chars - why? why not?
            [1-9\p{L}][0-9\p{L} .-]+?    
            [ ]wins[ ]on[ ]penalties
             [^\]]*?   ## use non-greedy
      )
   )
   \] 
}ix

Class Method Summary collapse

.build_map(lines, downcase: false) ⇒ Object
.build_names(lines) ⇒ Object
.parse_date(str, start:) ⇒ Object

add a date parser helper.
.parse_names(txt) ⇒ Object

Instance Method Summary collapse

#_tokenize_line(line) ⇒ Object
#debug? ⇒ Boolean
#initialize(lines, debug: false) ⇒ Lexer constructor

A new instance of Lexer.
#is_group?(text) ⇒ Boolean

todo/fix - use LangHelper or such e.g.
#is_leg?(text) ⇒ Boolean
#is_round?(text) ⇒ Boolean
#log(msg) ⇒ Object
#tokenize_with_errors ⇒ Object

Constructor Details

#initialize(lines, debug: false) ⇒ `Lexer`

Returns a new instance of Lexer.

# File 'lib/sportdb/parser/lexer.rb', line 126

def initialize( lines, debug: false )
   ## Builds the preprocessed text buffer (@txt) that gets tokenized later.
   ##
   ##  lines  - either an array of line strings or a single all-in-one string
   ##  debug  - debug flag; stored as-is in @debug
   ##
   ##  Preprocessing (centralized here in one place):
   ##    - every line gets stripped (leading and trailing whitespace)
   ##    - lines starting with # (comments) get dropped
   ##    - inline end-of-line comments (# ...) get cut off
   ##    - empty lines are KEPT (get turned into BLANK token later!!!)
   @debug = debug

   txt_pre = if lines.is_a?( Array )
                ## join together with newline
                ##   note - only add a newline if the line does NOT end
                ##          with one already (e.g. lines from File.readlines);
                ##          otherwise every line would be followed by
                ##          a bogus empty line
                lines.each_with_object( String.new ) do |line, buf|
                   buf << line
                   buf << "\n"   unless line.end_with?( "\n" )
                end
             else  ## assume single-all-in-one txt
                lines
             end

   @txt = String.new
   txt_pre.each_line do |line|    ## preprocess
      line = line.strip
      next if line.start_with?( '#' )   ## skip comments

      ## cut-off end-of-line comments too
      @txt << line.sub( /#.*/, '' ).strip
      @txt << "\n"
   end
end

Class Method Details

.build_map(lines, downcase: false) ⇒ `Object`

# File 'lib/sportdb/parser/token-date.rb', line 40

def self.build_map( lines, downcase: false )
  ## Returns a lookup hash that maps every name to the (one-based)
  ##  number of the line it is listed on e.g.
  ##   {"january" => 1,  "jan" => 1,
  ##    "february" => 2, "feb" => 2,
  ##    "may" => 5, ...
  ##
  ##  lines    - array of name arrays (one array per line)
  ##  downcase - if true, use the downcased name as key
  lookup = {}
  lines.each_with_index do |names, idx|
    num = idx + 1    ## note: numbering starts with 1 (NOT zero-based)
    names.each do |name|
      key = downcase ? name.downcase : name
      lookup[ key ] = num
    end
  end
  lookup
end

.build_names(lines) ⇒ `Object`

# File 'lib/sportdb/parser/token-date.rb', line 33

def self.build_names( lines )
  ## Returns all names of all lines as a single string
  ##  separated by the pipe char e.g.
  ##   January|Jan|February|Feb|March|Mar|April|Apr|May|June|Jun|...
  ##
  ##  lines - array of name arrays (one array per line)
  per_line = lines.map do |names|
    names.join( '|' )
  end
  per_line.join( '|' )
end

.parse_date(str, start:) ⇒ `Object`

add a date parser helper

# File 'lib/sportdb/parser/token-date.rb', line 177

def self.parse_date( str, start: )
    ## Parses a date string using DATE_RE and returns a Date.
    ##
    ##  str   - the date text e.g. Fri Aug/9 or 3 June or 2011-08-25
    ##  start - used to calculate a missing year; must respond to
    ##          year, month and day (presumably a Date - todo/confirm)
    ##
    ##  note - on no match prints an error message and exits (exit 1)
    m = DATE_RE.match( str )
    if m.nil?
      puts "!! ERROR - unexpected date format; cannot parse >#{str}<"
      exit 1
    end

    year  = m[:year].to_i(10)  if m[:year]
    day   = m[:day].to_i(10)   if m[:day]

    ## note - month is either a name (e.g. Aug, June)
    ##          or a number (iso-date e.g. 2011-08-25);
    ##        the number variant MUST be handled too or
    ##        Date.new gets passed in nil for the month
    month = if m[:month_name]
              MONTH_MAP[ m[:month_name].downcase ]
            elsif m[:month]
              m[:month].to_i(10)
            end

    if year.nil?   ## try to calculate year
      year =  if  month > start.month ||
                 (month == start.month && day >= start.day)
                # assume same year as start_at event (e.g. 2013 for 2013/14 season)
                start.year
              else
                # assume year+1 as start_at event (e.g. 2014 for 2013/14 season)
                start.year+1
              end
    end

    Date.new( year, month, day )
end

.parse_names(txt) ⇒ `Object`

# File 'lib/sportdb/parser/token-date.rb', line 6

def self.parse_names( txt )
  ## Splits the text into lines and every line into words (names);
  ##  returns an array of lines (with words) e.g.
  ##    [["January", "Jan"], ["February", "Feb"], ...]
  ##
  ##  - empty lines and comment lines (starting with #) get skipped
  ##  - inline (until end-of-line) comments get stripped too
  ##      e.g. Janvier  Janv  Jan  ## check janv in use??
  ##      =>   Janvier  Janv  Jan
  ##
  ## todo/fix -- add check for duplicates
  rows = txt.each_line.map { |raw| raw.strip }

  rows = rows.reject { |row| row.empty? || row.start_with?( '#' ) }

  rows.map do |row|
    row.sub( /#.*/, '' ).strip.split( /[ \t]+/ )
  end
end

Instance Method Details

#_tokenize_line(line) ⇒ `Object`

# File 'lib/sportdb/parser/lexer.rb', line 289

##
# Tokenizes one line of text into racc-style token pairs.
#
# The active regex is kept in @re between calls (RE for top-level,
# PROP_RE once a property key was matched), so the result depends on
# the lines tokenized before. An empty line always resets @re to RE
# and yields a single BLANK token.
#
# Text that the active regex cannot match is skipped; every skipped
# run is printed, passed to #log and recorded in the returned errors.
#
# When the line ends while in PROP_RE mode, the mode is kept only if
# the last token is ',' or '-'; otherwise (including a line that
# produced no token at all) the lexer switches back to RE and
# appends a PROP_END token.
#
# @param line [String] the line to tokenize (without trailing newline)
# @return [Array] two-element array [tokens, errors]; tokens are
#   arrays starting with the token type symbol, errors are strings
# @raise [ArgumentError] if a matched time has an hour outside 0..24
#   or a minute outside 0..59
def _tokenize_line( line )
  tokens = []
  errors = []   ## keep a list of errors - why? why not?

  puts "line: >#{line}<"    if debug?

  ### special case for empty line (aka BLANK)
  if line.empty?
    ## note - blank always resets parser mode to std/top-level!!!
    @re = RE

    tokens << [:BLANK, '<|BLANK|>']
    return [tokens, errors]
  end

  pos = 0
  ## track last offsets - to report error on no match
  ##   or no match in end of string
  offsets = [0,0]

  ####
  ## quick hack - keep re state/mode between tokenize calls!!!
  @re ||= RE     ## note - switch between RE & PROP_RE

  while (m = @re.match( line, pos ))
    offsets = [m.begin(0), m.end(0)]

    if offsets[0] != pos
      ## match NOT starting at start/begin position!!!
      ##  report parse error!!!
      detail = "parse error (tokenize) - skipping >#{line[pos..(offsets[0]-1)]}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
      msg    = "!! WARN - #{detail}"
      puts msg

      errors << detail
      log( msg )
    end

    ##
    ## todo/fix - also check if possible
    ##   if no match but not yet end off string!!!!
    ##    report skipped text run too!!!

    pos = offsets[1]

    ##
    ## note: racc requires pairs e.g. [:TOKEN, VAL]
    ##         for VAL use "text" or ["text", { opts }]  array

    t = if @re == PROP_RE
          if m[:space] || m[:spaces]
            nil    ## skip space(s)
          elsif m[:prop_name]
            if m[:name] == 'Y'
              [:YELLOW_CARD, m[:name]]
            elsif m[:name] == 'R'
              [:RED_CARD, m[:name]]
            else
              [:PROP_NAME, m[:name]]
            end
          elsif m[:minute]
            minute = {}
            minute[:m]      = m[:value].to_i(10)
            minute[:offset] = m[:value2].to_i(10)   if m[:value2]
            ## note - for debugging keep (pass along) "literal" minute
            [:MINUTE, [m[:minute], minute]]
          elsif m[:sym]
            ## return symbols "inline" as is - why? why not?
            case m[:sym]
            when ',' then [:',']
            when ';' then [:';']
            when '[' then [:'[']
            when ']' then [:']']
            when '(' then [:'(']
            when ')' then [:')']
            when '-' then [:'-']
            else
              nil  ## ignore all other symbols
            end
          else
            ## report error
            puts "!!! TOKENIZE ERROR (PROP_RE) - no match found"
            nil
          end
        ###################################################
        ## assume TOP_LEVEL (a.k.a. RE) machinery
        else
          if m[:space] || m[:spaces]
            nil   ## skip space(s)
          elsif m[:prop_key]
            ##  switch context  to PROP_RE
            @re = PROP_RE
            puts "  ENTER PROP_RE MODE"  if debug?
            [:PROP, m[:key]]
          elsif m[:text]
            [:TEXT, m[:text]]   ## keep pos - why? why not?
          elsif m[:status]   ## (match) status e.g. cancelled, awarded, etc.
            ## todo/check - add text (or status)
            #     to opts hash {} by default (for value)
            if m[:status_note]   ## includes note? e.g.  awarded; originally 2-0
              [:STATUS, [m[:status], {status: m[:status],
                                      note:   m[:status_note]} ]]
            else
              [:STATUS, [m[:status], {status: m[:status] } ]]
            end
          elsif m[:note]
            ###  todo/check:
            ##      use value hash - why? why not? or simplify to:
            ##  [:NOTE, m[:note]]
            [:NOTE, [m[:note], {note: m[:note] } ]]
          elsif m[:time]
            ## unify to iso-format
            ###   12.40 => 12:40
            ##    12h40 => 12:40 etc.
            ##  keep string (no time-only type in ruby)
            hour   = m[:hour].to_i(10)  ## allow 08/07/etc.
            minute = m[:minute].to_i(10)
            ## check if valid -  0:00 - 24:00
            ##   check if 24:00 possible? or only 0:00 (23:59)
            unless hour.between?( 0, 24 ) && minute.between?( 0, 59 )
              raise ArgumentError, "parse error - time >#{m[:time]}< out-of-range"
            end
            ## note - for debugging keep (pass along) "literal" time
            ##   might use/add support for am/pm later
            [:TIME, [m[:time], {h:hour,m:minute}]]
          elsif m[:date]
            date = {}
            ## map month names
            ## note - allow any/upcase JULY/JUL etc. thus ALWAYS downcase for lookup
            date[:y] = m[:year].to_i(10)  if m[:year]
            date[:m] = m[:month].to_i(10)  if m[:month]
            date[:m] = MONTH_MAP[ m[:month_name].downcase ]   if m[:month_name]
            date[:d]  = m[:day].to_i(10)   if m[:day]
            date[:wday] = DAY_MAP[ m[:day_name].downcase ]   if m[:day_name]
            ## note - for debugging keep (pass along) "literal" date
            [:DATE, [m[:date], date]]
          elsif m[:timezone]
            [:TIMEZONE, m[:timezone]]
          elsif m[:duration]
            ## todo/check/fix - if end: works for kwargs!!!!!
            duration = { start: {}, end: {}}
            duration[:start][:y] = m[:year1].to_i(10)  if m[:year1]
            duration[:start][:m] = MONTH_MAP[ m[:month_name1].downcase ]   if m[:month_name1]
            duration[:start][:d]  = m[:day1].to_i(10)   if m[:day1]
            duration[:start][:wday] = DAY_MAP[ m[:day_name1].downcase ]   if m[:day_name1]
            duration[:end][:y] = m[:year2].to_i(10)  if m[:year2]
            duration[:end][:m] = MONTH_MAP[ m[:month_name2].downcase ]   if m[:month_name2]
            duration[:end][:d]  = m[:day2].to_i(10)   if m[:day2]
            duration[:end][:wday] = DAY_MAP[ m[:day_name2].downcase ]   if m[:day_name2]
            ## note - for debugging keep (pass along) "literal" duration
            [:DURATION, [m[:duration], duration]]
          elsif m[:wday]    ## standalone weekday e.g. Mo/Tu/We/etc.
            [:WDAY, [m[:wday], { wday: DAY_MAP[ m[:day_name].downcase ] } ]]
          elsif m[:num]   ## fix - change to ord (for ordinal number!!!)
            ## note - the integer comes from the separate value capture
            [:ORD, [m[:num], { value: m[:value].to_i(10) } ]]
          elsif m[:score_more]
            score = {}
            ## check for pen
            score[:p] = [m[:p1].to_i(10),
                         m[:p2].to_i(10)]  if m[:p1] && m[:p2]
            score[:et] = [m[:et1].to_i(10),
                          m[:et2].to_i(10)]  if m[:et1] && m[:et2]
            score[:ft] = [m[:ft1].to_i(10),
                          m[:ft2].to_i(10)]  if m[:ft1] && m[:ft2]
            score[:ht] = [m[:ht1].to_i(10),
                          m[:ht2].to_i(10)]  if m[:ht1] && m[:ht2]

            ## note - for debugging keep (pass along) "literal" score
            [:SCORE_MORE, [m[:score_more], score]]
          elsif m[:score]
            score = {}
            ## must always have ft for now e.g. 1-1 or such
            score[:ft] = [m[:ft1].to_i(10),
                          m[:ft2].to_i(10)]
            ## note - for debugging keep (pass along) "literal" score
            [:SCORE, [m[:score], score]]
          elsif m[:minute]
            minute = {}
            minute[:m]      = m[:value].to_i(10)
            minute[:offset] = m[:value2].to_i(10)   if m[:value2]
            ## note - for debugging keep (pass along) "literal" minute
            [:MINUTE, [m[:minute], minute]]
          elsif m[:og]
            [:OG, m[:og]]    ## for typed drop - string version/variants ??  why? why not?
          elsif m[:pen]
            [:PEN, m[:pen]]
          elsif m[:vs]
            [:VS, m[:vs]]
          elsif m[:sym]
            ## return symbols "inline" as is - why? why not?
            case m[:sym]
            when ',' then [:',']
            when ';' then [:';']
            when '/' then [:'/']
            when '@' then [:'@']
            when '|' then [:'|']
            when '[' then [:'[']
            when ']' then [:']']
            when '-' then [:'-']        # level 1 OR (classic) dash
            when '--'   then [:'--']    # level 2
            when '---'  then [:'---']   # level 3
            when '----' then [:'----']  # level 4
            else
              nil  ## ignore all other symbols
            end
          else
            ## report error
            puts "!!! TOKENIZE ERROR - no match found"
            nil
          end
        end

    tokens << t    if t
  end

  ## check if no match in end of string
  if offsets[1] != line.size
    msg =  "!! WARN - parse error - skipping >#{line[offsets[1]..-1]}< @#{offsets[1]},#{line.size} in line >#{line}<"
    puts msg
    log( msg )

    errors << "parse error (tokenize) - skipping >#{line[offsets[1]..-1]}< @#{offsets[1]},#{line.size} in line >#{line}<"
  end

  ##
  ## if in prop mode continue if   last token is [,-]
  ##        otherwise change back to "standard" mode
  if @re == PROP_RE
    ## note - the line may have produced no token at all
    ##   (nothing matched or only ignored symbols);
    ##   guard against nil and treat it like any other prop end
    last_token = tokens.last
    continued  = last_token && [:',', :'-'].include?( last_token[0] )

    ## if continued - stay in PROP_RE mode
    ##  todo/check - auto-add PROP_CONT token or such
    ##                to help parser with possible NEWLINE
    ##                  conflicts  - why? why not?
    unless continued
      ## switch back to top-level mode!!
      puts "  LEAVE PROP_RE MODE, BACK TO TOP_LEVEL/RE"  if debug?
      @re = RE
      ## note - auto-add PROP_END (<PROP_END>)
      tokens << [:PROP_END, "<|PROP_END|>"]
    end
  end

  [tokens,errors]
end

#debug? ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/sportdb/parser/lexer.rb', line 124

def debug?() @debug == true; end

#is_group?(text) ⇒ `Boolean`

todo/fix - use LangHelper or such

 e.g.     class Lexer
              include LangHelper
          end

merge back Lang into Lexer - why? why not?

keep “old” access to checking for group, round & friends

for now for compatibility

Returns:

(Boolean)

# File 'lib/sportdb/parser/lexer.rb', line 26

def is_group?( text ) Lang.is_group?( text ); end

#is_leg?(text) ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/sportdb/parser/lexer.rb', line 28

def is_leg?( text ) Lang.is_leg?( text ); end

#is_round?(text) ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/sportdb/parser/lexer.rb', line 27

def is_round?( text ) Lang.is_round?( text ); end

#log(msg) ⇒ `Object`

# File 'lib/sportdb/parser/lexer.rb', line 7

##
# Appends a message plus a trailing newline to ./logs.txt
# (opened in utf-8 append mode, relative to the current working directory).
#
# @param msg [String] the text to append
def log( msg )
   ##     use ./errors.txt - why? why not?
   File.open( './logs.txt', 'a:utf-8' ) { |out|
     out.write( msg )
     out.write( "\n" )
   }
end

#tokenize_with_errors ⇒ `Object`

# File 'lib/sportdb/parser/lexer.rb', line 167

##
# Tokenizes the whole text (@txt) line-by-line and post-processes the
# tokens to help the (racc look ahead 1 - LA1) parser.
#
# Steps per line:
#   1. tokenize the right-stripped line with #_tokenize_line
#   2. pass 1 - replace TEXT tokens matching a group / round / leg
#      keyword with GROUP / ROUND; mark a leading "teams" / "blank"
#      TEXT token as TEAMS / BLANK
#   3. pass 2 - rewrite simple token patterns (round/group defs,
#      team - score - team, player + minute, date + time)
# All lines are then flattened into one token stream with a NEWLINE
# token after every line that does not start with BLANK.
#
# A line that produced no tokens at all (e.g. nothing on it could be
# tokenized) contributes nothing to the stream - not even a NEWLINE;
# any errors reported for it are still returned.
#
# @return [Array] two-element array [tokens, errors]
def tokenize_with_errors
    tokens_by_line = []   ## note: add tokens line-by-line (flatten later)
    errors         = []   ## keep a list of errors - why? why not?

    @txt.each_line do |line|
        line = line.rstrip   ## note - MUST remove/strip trailing newline (spaces optional)!!!

        more_tokens, more_errors = _tokenize_line( line )

        tokens_by_line  << more_tokens
        errors          += more_errors
    end # each line

    tokens_by_line = tokens_by_line.map do |tokens|
        #############
        ## pass 1
        ##   replace all texts with keyword matches
        ##     (e.g. group, round, leg, etc.)
        tokens = tokens.map do |t|
                    if t[0] == :TEXT
                       text = t[1]
                       if is_group?( text )
                         [:GROUP, text]
                       elsif is_round?( text ) || is_leg?( text )
                         [:ROUND, text]
                       else
                         t  ## pass through as-is (1:1)
                       end
                    else
                       t
                    end
                 end

        ### check for "section" starters e.g. Teams or such
        ##    note - the line may have no tokens at all; guard against nil
        first = tokens[0]
        if first && first[0] == :TEXT
            text = first[1]
            if text =~ /^teams$/i
               first[0] = :TEAMS
            elsif text =~  /^blank$/i   ### todo/fix -- remove!!! add real blanks!!
               first[0] = :BLANK
            end
        end

        #################
        ## pass 2
        ##    transform tokens (using simple patterns)
        ##      to help along the (racc look ahead 1 - LA1) parser
        nodes = []

        buf = Tokens.new( tokens )

        loop do
          break if buf.eos?

          if buf.pos == 0   ## MUST start line
            ## check for
            ##    group def or round def
            if buf.match?( :ROUND, :'|' )    ## assume round def (change round to round_def)
                      nodes << [:ROUND_DEF, buf.next[1]]
                      nodes << buf.next
                      nodes += buf.collect
                      break
            end
            if buf.match?( :GROUP, :'|' )    ## assume group def (change group to group_def)
                      nodes << [:GROUP_DEF, buf.next[1]]
                      nodes << buf.next
                      ## change all text to team - why? why not?
                      nodes += buf.collect { |t|
                                t[0] == :TEXT ? [:TEAM, t[1]] : t
                               }
                      break
            end
          end

          if buf.match?( :TEXT, [:SCORE, :SCORE_MORE, :VS, :'-'], :TEXT )
             nodes << [:TEAM, buf.next[1]]
             nodes << buf.next
             nodes << [:TEAM, buf.next[1]]
          elsif buf.match?( :TEXT, :MINUTE )
             nodes << [:PLAYER, buf.next[1]]
             nodes << buf.next
          elsif buf.match?( :DATE, :TIME )   ## merge DATE TIME into DATETIME
               date = buf.next[1]
               time = buf.next[1]
               val =  [date[0] + ' ' + time[0],  ## concat string of two tokens
                        { date: date[1], time: time[1] }
                      ]
               nodes << [:DATETIME, val]
          else
             ## pass through
             nodes << buf.next
          end
        end  # loop
        nodes
    end  # map tokens_by_line

    ## flatten tokens
    tokens = []
    tokens_by_line.each do |tok|
         pp tok   if debug?

         ## note - skip lines without any tokens
         ##          (avoids nil access and a stray NEWLINE)
         next if tok.empty?

         tokens  += tok
         ## auto-add newlines  (unless BLANK!!)
         tokens  << [:NEWLINE, "\n"]   unless tok[0][0] == :BLANK
    end

    [tokens,errors]
end

Class: SportDb::Lexer

Defined Under Namespace

Constant Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(lines, debug: false) ⇒ Lexer

Class Method Details

.build_map(lines, downcase: false) ⇒ Object

.build_names(lines) ⇒ Object

.parse_date(str, start:) ⇒ Object

.parse_names(txt) ⇒ Object

Instance Method Details

#_tokenize_line(line) ⇒ Object

#debug? ⇒ Boolean

#is_group?(text) ⇒ Boolean

#is_leg?(text) ⇒ Boolean

#is_round?(text) ⇒ Boolean

#log(msg) ⇒ Object

#tokenize_with_errors ⇒ Object

#initialize(lines, debug: false) ⇒ `Lexer`

.build_map(lines, downcase: false) ⇒ `Object`

.build_names(lines) ⇒ `Object`

.parse_date(str, start:) ⇒ `Object`

.parse_names(txt) ⇒ `Object`

#_tokenize_line(line) ⇒ `Object`

#debug? ⇒ `Boolean`

#is_group?(text) ⇒ `Boolean`

#is_leg?(text) ⇒ `Boolean`

#is_round?(text) ⇒ `Boolean`

#log(msg) ⇒ `Object`

#tokenize_with_errors ⇒ `Object`