Class: SportDb::Lexer
- Inherits:
-
Object
- Object
- SportDb::Lexer
- Defined in:
- lib/sportdb/parser/lexer.rb,
lib/sportdb/parser/token.rb,
lib/sportdb/parser/token-geo.rb,
lib/sportdb/parser/token-date.rb,
lib/sportdb/parser/token-prop.rb,
lib/sportdb/parser/token-text.rb,
lib/sportdb/parser/token-score.rb,
lib/sportdb/parser/token-minute.rb,
lib/sportdb/parser/token-status.rb
Defined Under Namespace
Classes: Tokens
Constant Summary collapse
- QUICK_PLAYER_WITH_MINUTE_RE =
add a QUICK_PLAYER_WITH_MINUTE check
%r{ \b \d{1,3} ## constrain numbers to 0 to 999!!! (?: (?: \+\d{1,3} )? | (?: \?{2} | _{2} ) ## add support for n/a (not/available) ) ' ## must have minute marker!!!! }ix- TIME_RE =
keep 18h30 - why? why not?
add support for 6:30pm 8:20am etc. - why? why not? check - only support h e.g. 18h30 or 18H30 too - why? why not? e.g. 18.30 (or 18:30 or 18h30)
%r{ (?<time> \b (?: (?<hour>\d{1,2}) (?: :|\.|h ) (?<minute>\d{2})) \b ) }ix- WDAY_RE =
add wday / stand-alone week day - as separate regex or
use TEXT with is_wday? check or such with requirement of beginning of line (anchored to line) only?? - why? why not? %r{ (?<wday> \b # note - alternation (|) is lowest precedence (such # parathenes required around \b()\b !!! ## note - NOT case sensitive!!! (?<day_name> (?-i: Mon|Mo| Tue|Tu| Wed|We| Thu|Th| Fri|Fr| Sat|Sa| Sun|Su )) (?=[ ]{2}) # positive lookahead for two space ## todo/check - must be followed by two spaces or space + [( etc. ## to allow words starting with weekday abbrevations - why? why not? ## check if any names (teams, rounds, etc) come up in practice ## or maybe remove three letter abbrevations Mon/Tue ## and keep only Mo/Tu/We etc. - why? why not? )}x- BASICS_RE =
%r{ ## e.g. (51) or (1) etc. - limit digits of number??? ## todo/fix - change num to ord (for ordinal number)!!!!! (?<num> \( (?<value>\d+) \) ) | (?<vs> (?<=[ ]) # positive lookbehind for space (?-i: vs|v ) # note - only match case sensitive (downcased letters)!!! # note - bigger match first e.g. vs than v etc. (?=[ ]) # positive lookahead for space ) | (?<spaces> [ ]{2,}) | (?<space> [ ]) | (?<sym> (?<=^|[ ]) ## positive lookahead (?: ----| ---| -- ) (?=[ ]) ## positive lookahead ) | (?<sym> [;,/@|\[\]-] ) }ix- RE =
Regexp.union( STATUS_RE, SCORE_NOTE_RE, NOTE_RE, DURATION_RE, # note - duration MUST match before date DATE_RE, ## note - date must go before time (e.g. 12.12. vs 12.12) TIME_RE, SCORE_MORE_RE, SCORE_RE, ## note basic score e.g. 1-1 must go after SCORE_MORE_RE!!! BASICS_RE, WDAY_RE, # allow standalone weekday name (e.g. Mo/Tu/etc.) - why? why not? # note - wday MUST be after text e.g. Sun Ke 68' is Sun Ke (NOT Sun) etc. TEXT_RE, ANY_RE, )
- GOAL_BASICS_RE =
goal mode (switched to by PLAYER_WITH_MINUTE_RE)
%r{ (?<spaces> [ ]{2,}) | (?<space> [ ]) | (?<sym> [;,\[\]] ## add (-) dash too - why? why not? ) }ix- GOAL_RE =
Regexp.union( GOAL_BASICS_RE, MINUTE_RE, MINUTE_NA_RE, ## note - add/allow not/available (n/a,na) minutes hack for now GOAL_OG_RE, GOAL_PEN_RE, SCORE_RE, PROP_NAME_RE, ## note - (re)use prop name for now for (player) name )
- PROP_GOAL_RE =
note - leave out n/a minute in goals - make minutes optional!!!
Regexp.union( GOAL_BASICS_RE, MINUTE_RE, ## MINUTE_NA_RE, ## note - add/allow not/available (n/a,na) minutes hack for now GOAL_OG_RE, GOAL_PEN_RE, SCORE_RE, PROP_NAME_RE, ## note - (re)use prop name for now for (player) name )
- ROUND_OUTLINE_RE =
note - use \A (instead of ^) - \A strictly matches the start of the string.
%r{ \A [ ]* ## ignore leading spaces (if any) (?: »|>> ) [ ]+ (?<round_outline> ## must start with letter - why? why not? ### 1st round ## allow numbers e.g. Group A - 1 .+? ## use non-greedy ) [ ]* ## ignore trailing spaces (if any) $ }ix- GEO_TEXT_RE =
%r{ ## must start with alpha (allow unicode letters!!) (?<text> ## positive lookbehind - for now space (or beginning of line - for testing) only ## (MUST be fixed number of chars - no quantifier e.g. +? etc.) (?<= [ ,›>\[\]]|^) (?: # opt 1 - start with alpha \p{L}+ ## all unicode letters (e.g. [a-z]) | # opt 2 - start with num!! - \d+ # check for num lookahead (MUST be space or dot) ## MAY be followed by (optional space) ! ## MUST be follow by a to z!!!! [ ]? ## make space optional too - why? why not? ## yes - eg. 1st, 2nd, 5th etc. \p{L}+ | ## opt 3 - add another weirdo case ## e.g. 's Gravenwezel-Schilde ## add more letters (or sequences here - why? why not?) '\p{L}+ ) ## ## todo/check - find a different "more intuitive" regex/rule if possible? ## for single spaces only (and _/ MUST not be surround by spaces) (?: [ ]? # only single spaces allowed inline!!! (?: \p{L} | \d | [.&'°] | (?: (?<! [ ]) ## no space allowed before (but possible after) [-] ) | (?: (?<! [ ]) ## no spaces allowed around these characters [_/] (?! [ ]) ) )+ )* ## must NOT end with space or dash(-) ## todo/fix - possible in regex here ## only end in alphanum a-z0-9 (not dot or & ???) ## add lookahead/lookbehind ## must be space!!! ## (or comma or start/end of string) ## kind of \b !!! ## positive lookahead (?=[ ,›>\[\]]|$) ) }ix- TIMEZONE_RE =
for timezone format use for now: (BRT/UTC-3) (e.g. brazil time)
(CET/UTC+1) - central european time (CEST/UTC+2) - central european summer time - daylight saving time (DST). (EET/UTC+2) - eastern european time (EEST/UTC+3) - eastern european summer time - daylight saving time (DST).
UTC+3 UTC+4 UTC+0 UTC+00 UTC+0000
- allow +01 or +0100 - why? why not - +0130 (01:30) see
https://en.wikipedia.org/wiki/Time_zone https://en.wikipedia.org/wiki/List_of_UTC_offsets https://en.wikipedia.org/wiki/UTC−04:00 etc. e.g. (UTC-2) or (CEST/UTC-2) etc. todo check - only allow upcase or (utc-2) and (cest/utc-2) too - why? why not? %r{ (?<timezone> \( ## optional "local" timezone name eg. BRT or CEST etc. (?: [a-z]+ / )? [a-z]+ [+-] \d{1,4} ## e.g. 0 or 00 or 0000 \) ) }ix- GEO_BASICS_RE =
%r{ (?<spaces> [ ]{2,}) | (?<space> [ ]) | (?<sym> [,›>\[] ) }ix- GEO_RE =
Regexp.union( TIMEZONE_RE, GEO_BASICS_RE, GEO_TEXT_RE, ANY_RE, )
- MONTH_LINES =
parse_names( <<TXT ) January Jan February Feb March Mar April Apr May June Jun July Jul August Aug September Sept Sep October Oct November Nov December Dec TXT
- MONTH_NAMES =
build_names( MONTH_LINES )
- MONTH_MAP =
pp MONTH_NAMES
build_map( MONTH_LINES, downcase: true )
- DAY_LINES =
parse_names( <<TXT ) Monday Mon Mo Tuesday Tues Tue Tu Wednesday Wed We Thursday Thurs Thur Thu Th Friday Fri Fr Saturday Sat Sa Sunday Sun Su TXT
- DAY_NAMES =
build_names( DAY_LINES )
- DAY_MAP =
pp DAY_NAMES
build_map( DAY_LINES, downcase: true )
- DATE_I_RE =
e.g. Fri Aug/9 or Fri Aug 9
%r{ (?<date> \b ## optional day name ((?<day_name>#{DAY_NAMES}) [ ] )? (?<month_name>#{MONTH_NAMES}) (?: \/|[ ] ) (?<day>\d{1,2}) ## optional year ( [ ] (?<year>\d{4}) )? \b )}ix- DATE_II_RE =
e.g. 3 June or 10 June
%r{ (?<date> \b ## optional day name ((?<day_name>#{DAY_NAMES}) [ ] )? (?<day>\d{1,2}) [ ] (?<month_name>#{MONTH_NAMES}) ## optional year ( [ ] (?<year>\d{4}) )? \b )}ix- DATE_III_RE =
e.g. iso-date - 2011-08-25
note - allow/support ("shortcuts") e.g 2011-8-25 or 2011-8-3 / 2011-08-03 etc. %r{ (?<date> \b (?<year>\d{4}) - (?<month>\d{1,2}) - (?<day>\d{1,2}) \b )}ix- DATE_IIII_RE =
allow (short)“european” style 8.8.
note - assume day/month!!! %r{ (?<date> \b (?<day>\d{1,2}) \. (?<month>\d{1,2}) \. (?: (?: (?<year>\d{4}) ## optional year 2025 (yyyy) | (?<yy>\d{2}) ## optional year 25 (yy) ) \b )? ) }ix- DATE_RE =
map tables
note: order matters; first come-first matched/served Regexp.union( DATE_I_RE, DATE_II_RE, DATE_III_RE, DATE_IIII_RE, ## e.g. 8.8. or 8.13.79 or 08.14.1973 )
- DURATION_I_RE =
%r{ (?<duration> \b (?: ## optional day name ((?<day_name1>#{DAY_NAMES}) [ ] )? (?<month_name1>#{MONTH_NAMES}) (?: \/|[ ] ) (?<day1>\d{1,2}) ## optional year ( ,? # optional comma [ ] (?<year1>\d{4}) )? ## support + and - (add .. or such - why??) [ ]* - [ ]* ## optional day name ((?<day_name2>#{DAY_NAMES}) [ ] )? (?<month_name2>#{MONTH_NAMES}) (?: \/|[ ] ) (?<day2>\d{1,2}) ## optional year ( ,? # optional comma [ ] (?<year2>\d{4}) )? ) \b )}ix- DURATION_II_RE =
variant ii
add support for shorthand August 16-18, 2011 September 13-15, 2011 October 18-20, 2011 March/6-8, 2012 March 6-8 2012 March 6-8 - add support for August 16+17 or such (and check 16+18) use <op> to check if day2 is a plus or range or such - why? why not? %r{ (?<duration> \b (?: (?<month_name1>#{MONTH_NAMES}) [ /] (?<day1>\d{1,2}) - (?<day2>\d{1,2}) (?: ,? ## optional comma [ ] (?<year1>\d{4}) )? ## optional year ) \b )}ix- DURATION_RE =
map tables
note: order matters; first come-first matched/served Regexp.union( DURATION_I_RE, DURATION_II_RE, )
- PROP_NAME_RE =
name different from text (does NOT allow number in name/text)
%r{ (?<prop_name> \b (?<name> \p{L}+ \.? ## optional dot (?: ## rule for space; only one single space allowed inline!!! (?: (?<![ ]) ## use negative lookbehind [ ] (?=\p{L}|['"]) ## use lookahead ) ## support (inline) quoted name e.g. "Rodri" or such | (?: (?<=[ ]) ## use positive lookbehind " \p{L}+ " ## require space here too - why? why not? ) | (?: (?<=\p{L}) ## use lookbehind [-] ## must be surrounded by letters ## e.g. One/Two NOT ## One/ Two or One / Two or One /Two etc. (?=\p{L}) ## use lookahead ) | (?: ## flex rule for quote - allow any ## only check for double quotes e.g. cannot follow other ' for now - why? why not? ## allows rodrigez 'rodri' for example (?<!') ## use negative lookbehind ' ) | ## standard case with letter(s) and optinal dot (?: \p{L}+ \.? ## optional dot ) )* ) ## add lookahead - must be non-alphanum (?=[ ,;\]\)]|$) ) }ix- PROP_KEY_RE =
%r{ ^ # note - MUST start line; leading spaces optional (eat-up) [ ]* (?<prop_key> (?<key> (?:\p{L}+ | \d+ # check for num lookahead (MUST be space or dot) ## MUST be followed by (optional dot) and ## required space !!! ## MUST be follow by a to z!!!! \.? ## optional dot [ ]? ## make space optional too - why? why not? ## yes - eg. 1st, 2nd, 5th etc. \p{L}+ ) [\d\p{L}'/° -]*? ## allow almost anyting ## fix - add negative lookahead ## no space and dash etc. ## only allowed "inline" not at the end ## must end with latter or digit! ) [ ]*? # slurp trailing spaces : (?=[ ]+) ## possitive lookahead (must be followed by space!!) ) }ix- PROP_KEY_INLINE_RE =
simple prop key for inline use e.g.
Coach: or Trainer: or ... add more here later %r{ \b (?<prop_key> ## note: use prop_key (NOT prop_key_inline or such) (?<key> \p{L}+ ) ## note - NO spaces allowed for key for now!!! : (?=[ ]+) ## possitive lookahead (must be followed by space!!) ) }ix- PROP_NUM_RE =
%r{ \b (?<num> ## note allow underscore inline or space e.g. ## 5_000 ## allow space inline (e.g. 5 000) - why? why not? (?<value> [1-9] (?: _? [0-9]+ )* ) ) \b }ix- ENCLOSED_NAME_RE =
todo/fix - allow more chars in enclosed name - why? why not?
e.g. (') - Cote D'Ivore etc. change to PAREN_NAME or PARENTHESIS or such - why? why not? %r{ (?<enclosed_name> \( (?<name> \p{L}+ (?: [ ] \p{L}+ )* ) \) ) }ix- PROP_BASICS_RE =
%r{ (?<spaces> [ ]{2,}) | (?<space> [ ]) | (?<sym> [;,\(\)\[\]-] ) }ix- PROP_RE =
Regexp.union( MINUTE_RE, PROP_KEY_INLINE_RE, PROP_NAME_RE, PROP_BASICS_RE, ## todo/fix - add ANY_RE here too!!! )
- PROP_CARDS_RE =
note - no inline keys possible
todo/fix - use custom (limited) prop basics too Regexp.union( MINUTE_RE, PROP_NAME_RE, PROP_BASICS_RE, ## todo/fix - add ANY_RE here too!!! )
- PROP_PENALTIES_RE =
Regexp.union( SCORE_RE, # e.g. 1-1 etc. ENCLOSED_NAME_RE, # e.g. (save), (post), etc. PROP_NAME_RE, PROP_BASICS_RE, ## todo/fix - add ANY_RE here too!!! )
- PROP_REFEREE_RE =
Regexp.union( ENCLOSED_NAME_RE, # e.g. (sold out) etc. why? why not? PROP_NUM_RE, # e.g. 28 000 or 28_000 (NOT 28,000 is not valid!!!) PROP_KEY_INLINE_RE, PROP_NAME_RE, PROP_BASICS_RE, ## todo/fix - add ANY_RE here too!!! )
- PROP_ATTENDANCE_RE =
Regexp.union( ENCLOSED_NAME_RE, # e.g. (sold out) etc. why? why not? PROP_NUM_RE, # e.g. 28 000 or 28_000 (NOT 28,000 is not valid!!!) PROP_BASICS_RE, ## todo/fix - add ANY_RE here too!!! )
- ANY_RE =
general catch-all (RECOMMENDED (ALWAYS) use as last entry in union)
to avoid advance of pos match!!! %r{ (?<any> .) }ix- TEXT_RE =
%r{ ## must start with alpha (allow unicode letters!!) (?<text> ## positive lookbehind ## (MUST be fixed number of chars - no quantifier e.g. +? etc.) (?<=[ ,;@|\[\]] |^ ) (?: # opt 1 - start with alpha \p{L}+ ## all unicode letters (e.g. [a-z]) | # opt 2 - start with num!! - allow special case (e.g. 1. FC) \d+ # check for num lookahead (MUST be space or dot) ## MUST be followed by (optional dot) and ## required space !!! ## MUST be follow by a to z!!!! \.? ## optional dot [ ]? ## make space optional too - why? why not? ## yes - eg. 1st, 2nd, 5th etc. \p{L}+ | ## opt 3 - add weirdo case ## e.g. 1/8 Finals 1/4 1/2 ... 1/ \d{1,2} [ ] \p{L}+ | ## opt 4 - add another weirdo case ## e.g. 's Gravenwezel-Schilde '[s] | ## opt 5 - add another weirdo case ## e.g. 5.-8. Platz Playoffs - keep - why? why not? \d+\.-\d+\. [ ]? \p{L}+ ) (?:(?: (?:[ ] # only single spaces allowed inline!!! (?! (?-i: vs?[ ]) ) ## note - exclude (v[ ]/vs[ ]) ## AND switch to case-sensitive (via -i!!!) ) | [/-] ## must NOT be surrounded by spaces )? (?: \p{L} | [.&'°] | (?: \d+ (?! [0-9h'+] | ## protected break on 12h / 12' / 1-1 ## check usege for 3+4 - possible? where ? why? (?:[.:-]\d) ## protected/exclude/break on 12.03 / 12:03 / 12-12 ## BUT allow Park21-Arena for example e.g. 21-A :-) ) ## negative lookahead for numbers ## note - include digits itself!!! ## note - remove / (slash) e.g. allows UDI'19/Beter Bed ) ) )* ## must NOT end with space or dash(-) ## todo/fix - possible in regex here ## only end in alphanum a-z0-9 (not dot or & ???) ## allow optional at the end ## tag or year ## make it and in the future - why? why not? ## ## change - fix ## do NOT use (A) for amateur ## use A or A. with NO ()!!! ## (A) - allow with predined alpha only for now ## e.g. (A) - amateur a team or b? ### same for U21 or U9 etc ## use with NO ()!!! - why? why not? ## or U21 U9 etc. - why? why not? ## or etc. ## (1879-1893) or allow years e.g. 
(1879-1893) ### ## add allow country code three to five letters for now ## change to generic 1 to 5 - why? why not? ## e.g. (A), (I), ## (AUT) ## (TRNC) five? for UEFA code for northern cyprus ## change to 1 to 4 - why? why not? ## check - fix possible for upper case only here ## inline for this group only? (?: [ ] \( \d{4}-\d{4} \) )? (?: [ ]+ ## allow more than once space - why? why not? \( (?: [A-Z]{1,5} ) \) )? ## add lookahead/lookbehind ## must be space!!! ## (or comma or start/end of string) ## kind of \b !!! ## positive lookahead (?=[ ,;@|\[\]] |$ ) ) }ix- P_EN =
english helpers (penalty, extra time, …)
note - p must go last (shortest match) pso = penalty shootout '(?: pso | pen\.? | p\.? )'- ET_EN =
e.g. p., p, pen, pen., PSO, etc.
'(?: aet | a\.e\.t\.? )'- SCORE__P_ET__RE =
note: allow SPECIAL cases WITHOUT full time scores (just a.e.t or pen. + a.e.t.)
3-4 pen. 2-2 a.e.t. 3-4 pen. 2-2 a.e.t. 2-2 a.e.t. %r{ (?<score_more> \b (?: (?<p1>\d{1,2}) - (?<p2>\d{1,2}) [ ]* #{P_EN} [ ]+ )? # note: make penalty (P) score optional for now (?<et1>\d{1,2}) - (?<et2>\d{1,2}) [ ]* #{ET_EN} (?=[ ,\]]|$) )}ix- SCORE__P__RE =
note: allow SPECIAL with penalty only
3-4 pen. %r{ (?<score_more> \b (?<p1>\d{1,2}) - (?<p2>\d{1,2}) [ ]* #{P_EN} (?=[ ,\]]|$) )}ix- SCORE__P_ET_FT_HT_V2__RE =
support short all-in-one e.g.
e.g. 3-4 pen. 2-2 a.e.t. (1-1, 1-1) becomes 3-4 pen. (2-2, 1-1, 1-1) %r{ (?<score_more> \b (?<p1>\d{1,2}) - (?<p2>\d{1,2}) [ ]* #{P_EN} [ ]+ \( (?<et1>\d{1,2}) - (?<et2>\d{1,2}) [ ]*, [ ]* (?<ft1>\d{1,2}) - (?<ft2>\d{1,2}) [ ]*, [ ]* (?<ht1>\d{1,2}) - (?<ht2>\d{1,2}) [ ]* \) (?=[ ,\]]|$) )}ix- SCORE__P_ET_FT_HT__RE =
e.g. 3-4 pen. 2-2 a.e.t. (1-1, 1-1) or
3-4p 2-2aet (1-1, ) or 3-4 pen. 2-2 a.e.t. (1-1) or 2-2 a.e.t. (1-1, 1-1) or 2-2 a.e.t. (1-1, ) or 2-2 a.e.t. (1-1) %r{ (?<score_more> \b (?: (?<p1>\d{1,2}) - (?<p2>\d{1,2}) [ ]* #{P_EN} [ ]+ )? # note: make penalty (P) score optional for now (?<et1>\d{1,2}) - (?<et2>\d{1,2}) [ ]* #{ET_EN} [ ]+ \( [ ]* (?<ft1>\d{1,2}) - (?<ft2>\d{1,2}) [ ]* (?: , [ ]* (?: (?<ht1>\d{1,2}) - (?<ht2>\d{1,2}) [ ]* )? )? # note: make half time (HT) score optional for now \) (?=[ ,\]]|$) )}ix- SCORE__P_FT_HT__RE =
special case for case WITHOUT extra time!!
same as above (but WITHOUT extra time and pen required) %r{ (?<score_more> \b (?<p1>\d{1,2}) - (?<p2>\d{1,2}) [ ]* #{P_EN} [ ]+ \( [ ]* (?<ft1>\d{1,2}) - (?<ft2>\d{1,2}) [ ]* (?: , [ ]* (?: (?<ht1>\d{1,2}) - (?<ht2>\d{1,2}) [ ]* )? )? # note: make half time (HT) score optional for now \) (?=[ ,\]]|$) )}ix- SCORE__FT_HT__RE =
e.g. 2-1 (1-1)
%r{ (?<score_more> \b (?<ft1>\d{1,2}) - (?<ft2>\d{1,2}) [ ]+ \( [ ]* (?<ht1>\d{1,2}) - (?<ht2>\d{1,2}) [ ]* \) (?=[ ,\]]|$) )}ix- SCORE__FT__RE =
2-1
%r{ (?<score> \b (?<ft1>\d{1,2}) - (?<ft2>\d{1,2}) \b )}ix- SCORE_MORE_RE =
map tables
note: order matters; first come-first matched/served. check - find a better name for SCORE_MORE - SCORE_EX, SCORE_BIG, or _ - why? why not?
Regexp.union( SCORE__P_ET_FT_HT_V2__RE, # e.g. 5-1 pen. (2-2, 1-1, 1-0) SCORE__P_ET_FT_HT__RE, # e.g. 5-1 pen. 2-2 a.e.t. (1-1, 1-0) SCORE__P_FT_HT__RE, # e.g. 5-1 pen. (1-1) SCORE__P_ET__RE, # e.g. 2-2 a.e.t. or 5-1 pen. 2-2 a.e.t. SCORE__P__RE, # e.g. 5-1 pen. SCORE__FT_HT__RE, # e.g. 1-1 (1-0) ## note - keep basic score as its own token!!!! ## that is, SCORE & SCORE_MORE ### SCORE__FT__RE, # e.g. 1-1 -- note - must go last!!! )
- SCORE_RE =
SCORE__FT__RE- GOAL_PEN_RE =
goal types (pen.) or (pen) or (p.) or (p) (o.g.) or (og)
todo/check - keep case-insensitive or allow OG or P or PEN or only lower case - why? why not? %r{ (?<pen> \( (?:pen|p)\.? \) ) }ix- GOAL_OG_RE =
%r{ (?<og> \( (?:og|o\.g\.) \) ) }ix- MINUTE_NA_RE =
minute variant for N/A not/available
todo/check - find a better syntax - why? why not? note "??".to_i(10) returns 0 or "__".to_i(10) returns 0 quick hack - assume 0 for n/a for now %r{ (?<minute> (?<=[ (]) # positive lookbehind for space or opening (?<value> \?{2} | _{2} ) ' ## must have minute marker!!!! ) }ix- MINUTE_RE =
%r{ (?<minute> (?<=[ (]) # positive lookbehind for space or opening ( e.g. (61') required # todo - add more lookbehinds e.g. ,) etc. - why? why not? (?<value>\d{1,3}) ## constrain numbers to 0 to 999!!! (?: \+ (?<value2>\d{1,3}) )? ' ## must have minute marker!!!! ) }ix- PLAYER_WITH_MINUTE_RE =
%r{ \A ### note - MUST start line; leading spaces optional (eat-up) [ ]* (?: # optional open bracket ([) -- remove later (?<open_bracket> \[ ) [ ]* )? (?: # optional none a.k.a. -; - what todo here? (?<none> - [ ]* ; [ ]* ) )? (?<player_with_minute> (?<name> \p{L}+ \.? ## optional dot (?: ## rule for space; only one single space allowed inline!!! (?: (?<![ ]) ## use negative lookbehind [ ] (?=\p{L}|') ## use lookahead ) | (?: (?<=\p{L}) ## use lookbehind ['-] ## must be surrounded by letters ## e.g. One/Two NOT ## One/ Two or One / Two or One /Two etc. (?=\p{L}) ## use lookahead ) | (?: (?<=[ ]) ## use lookbehind -- add letter (plus dot) or such - why? why not? ['] ## must be surrounded by leading space and ## traling letters (e.g. UDI 'Beter Bed) (?=\p{L}) ## use lookahead ) | (?: (?<=\p{L}) ## use lookbehind ['] ## must be surrounded by leading letter and ## trailing space PLUS letter (e.g. UDI' Beter Bed) (?=[ ]\p{L}) ## use lookahead (space WITH letter ) | ## standard case with letter(s) and optinal dot (?: \p{L}+ \.? ## optional dot ) )* ) #### spaces (?: [ ]+) #### minute (see above) ##### use MINUTE_RE.source or such - for inline (reference) use? do not copy (?<minute> (?<=[ (]) # positive lookbehind for space or opening ( e.g. (61') required # todo - add more lookbehinds e.g. ,) etc. - why? why not? (?: (?<value>\d{1,3}) ## constrain numbers to 0 to 999!!! (?: \+ (?<value2>\d{1,3}) )? | (?<value> \?{2} | _{2} ) ## add support for n/a (not/available) ) ' ## must have minute marker!!!! ) ) }ix- PLAYER_WITH_SCORE_RE =
note - use \A (instead of ^) - \A strictly matches the start of the string.
%r{ \A ### note - MUST start line; leading spaces optional (eat-up) [ ]* (?<player_with_score> (?<score> (?<ft1>\d{1,2}) - (?<ft2>\d{1,2}) ) [ ]+ (?<name> \p{L}+ \.? ## optional dot (?: ## rule for space; only one single space allowed inline!!! (?: (?<![ ]) ## use negative lookbehind [ ] (?=\p{L}|') ## use lookahead ) | (?: (?<=\p{L}) ## use lookbehind ['-] ## must be surrounded by letters ## e.g. One/Two NOT ## One/ Two or One / Two or One /Two etc. (?=\p{L}) ## use lookahead ) | (?: (?<=[ ]) ## use lookbehind -- add letter (plus dot) or such - why? why not? ['] ## must be surrounded by leading space and ## traling letters (e.g. UDI 'Beter Bed) (?=\p{L}) ## use lookahead ) | (?: (?<=\p{L}) ## use lookbehind ['] ## must be surrounded by leading letter and ## trailing space PLUS letter (e.g. UDI' Beter Bed) (?=[ ]\p{L}) ## use lookahead (space WITH letter ) | ## standard case with letter(s) and optinal dot (?: \p{L}+ \.? ## optional dot ) )* ) ## name ### check/todo - add lookahead (e.g. must be space or ,$) why? why not? ) ## player_with_score }ix- STATUS_RE =
%r{ \[ (?: ### opt 1 - allow long forms with note/comment for some stati (?: (?<status> awarded ## e.g. [awarded match to Leones Negros by undue alignment; original result 1-2] ## [awarded 3-0 to Cafetaleros by undue alignment; originally ended 2-0] ## [awarded 3-0; originally 0-2, América used ineligible player (Federico Viñas)] | annulled | abandoned ## e.g. [abandoned at 1-1 in 65' due to cardiac arrest Luton player Tom Lockyer] ## [abandoned at 0-0 in 6' due to waterlogged pitch] ## [abandoned at 5-0 in 80' due to attack on assistant referee by Cerro; result stood] ## [abandoned at 1-0 in 31'] ## [abandoned at 0-1' in 85 due to crowd trouble] | postponed ## e.g. [postponed due to problems with the screen of the stadium] ## [postponed by storm] ## [postponed due to tropical storm "Hanna"] ## [postponed from Sep 10-12 due to death Queen Elizabeth II] | suspended ## e.g. [suspended at 0-0 in 12' due to storm] ## [suspended at 84' by storm; result stood] | verified ## e.g. [verified 2:0 wo.] ) [ ;,]* (?<status_note> [^\]]+ ) [ ]* ) | ## opt 2 - short from only (no note/comments) (?<status> cancelled|canceled|can\. | abandoned|abd\. | postponed | awarded|awd\. | replay | annulled | suspended ### todo/fix - add status upstream - why? why not? ### move to note(s) - do NOT interpret as status - why? why not? | verified ### todo/fix - add status upstream (same as ??) - why? why not? ### move to note(s) - do NOT interpret as status - why? why not? ) ) \] }ix- NOTE_RE =
todo/fix - move to token-note.rb (standalone) file
%r{ \[ (?<note> (?: ## starting with ___ PLUS requiring more text (?: nb: ## e.g. [NB: between top-8 of regular season] # [NB: América, Morelia and Tigres qualified on better record regular season] # [NB: Celaya qualified on away goals] # [NB: Alebrijes qualified on away goal] # [NB: Leones Negros qualified on away goals] # # todo/fix: # add "top-level" NB: version ## with full (end-of) line note - why? why not? | rescheduled ## e.g. [rescheduled due to earthquake occurred in Mexico on September 19] | declared ## e.g. [declared void] | remaining ## e.g. [remaining 79'] ## [remaining 84'] ## [remaining 59'] ## [remaining 5'] ) [ ] [^\]]+? ## slurp all to next ] - (use non-greedy) ) ) \] }ix- SCORE_NOTE_RE =
%r{ \[ (?<score_note> (?: # plain aet e.g. [aet] aet | a\.e\.t\. | after [ ] extra [ -] time ) | (?: # plain penalties e.g. [3-2 pen] \d{1,2}-\d{1,2} [ ]* (?: p|pen ) ) | (?: # plain aet with penalties e.g. [aet; 4-3 pen] or [aet, 4-3p] aet [ ]* [,;] [ ]* \d{1,2}-\d{1,2} [ ]* (?: p|pen ) ) | (?: ## e.g. Spain wins on penalties ## 1860 München wins on penalties etc. ## must start with digit 1-9 or letter ## todo - add more special chars - why? why not? ## (?: aet [ ]* ## allow space here - why? why not [,;][ ] )? (?: (?: # opt 1 - no team listed/named - requires score (?: won|wins? ) [ ] ## note - allow won,win or wins (?: ## score \d{1,2}-\d{1,2} [ ] ) on [ ] (?: pens | penalties | aggregate ) ) | (?: # opt 2 - team required; score optional (?: ## team required [1-9\p{L}][0-9\p{L} .-]+? [ ] ) (?: won|wins? ) [ ] ## won/win/wins (?: ## score optional \d{1,2}-\d{1,2} [ ] )? on [ ] (?: pens | penalties | aggregate ) ### [^\]]*? ## allow more? use non-greedy ) )) | (?: ## e.g. agg 3-2 etc. agg [ ] \d{1,2}-\d{1,2} ) | (?: ## e.g. agg 4-4, Ajax win on away goals (?: ## agg 4-4, optional for now - why? why not? agg [ ] \d{1,2}-\d{1,2} [ ]*[,;][ ] )? (?: ## team required [1-9\p{L}][0-9\p{L} .-]+? [ ] ) (?: won|wins? ) [ ] # won/win/wins on [ ] away [ ] goals ) ) # score_note ref \] }ix
Class Method Summary collapse
- .build_map(lines, downcase: false) ⇒ Object
- .build_names(lines) ⇒ Object
-
.parse_date(str, start:) ⇒ Object
add a date parser helper.
- .parse_names(txt) ⇒ Object
Instance Method Summary collapse
- #_tokenize_line(line) ⇒ Object
- #debug? ⇒ Boolean
-
#initialize(lines, debug: false) ⇒ Lexer
constructor
A new instance of Lexer.
-
#is_group?(text) ⇒ Boolean
todo/fix - use LangHelper or such e.g.
- #is_leg?(text) ⇒ Boolean
- #is_round?(text) ⇒ Boolean
- #is_zone?(text) ⇒ Boolean
- #log(msg) ⇒ Object
- #tokenize_with_errors ⇒ Object
Constructor Details
#initialize(lines, debug: false) ⇒ Lexer
Returns a new instance of Lexer.
126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 |
# File 'lib/sportdb/parser/lexer.rb', line 126
##
#  Sets up the lexer: remembers the debug flag and builds the
#  preprocessed text (@txt) that gets tokenized later.
#
#  lines - either an Array of lines (glued together, each line terminated
#          by a newline) or anything else, which is assumed to be the
#          single all-in-one text
#  debug - stored as-is in @debug
#
#  preprocessing (kept "centralized" here in one place):
#   - every line gets stripped (leading and trailing whitespace removed)
#   - lines starting with # (comment lines) are dropped altogether
#   - inline end-of-line comments (# to end-of-line) are cut off
#   - note - empty lines are KEPT (as empty lines)
def initialize( lines, debug: false )
  @debug = debug

  ## an array of lines gets joined into one text first
  source = lines
  if lines.is_a?( Array )
    source = lines.each_with_object( String.new ) do |entry, buf|
      buf << entry
      buf << "\n"
    end
  end

  @txt = String.new
  source.each_line do |raw|
    stripped = raw.strip
    next if stripped.start_with?( '#' )    ### skip comment lines

    ## cut off end-of-line comment (if any) and strip again
    @txt << stripped.sub( /#.*/, '' ).strip
    @txt << "\n"
  end
end
Class Method Details
.build_map(lines, downcase: false) ⇒ Object
40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 |
# File 'lib/sportdb/parser/token-date.rb', line 40
##
#  Builds a lookup table that maps every name to the (one-based)
#  number of the line it is listed on, e.g.
#    {"january"  => 1, "jan" => 1,
#     "february" => 2, "feb" => 2,
#     "march"    => 3, "mar" => 3, ...}
#
#  lines    - array of lines; every line is an array of names
#  downcase - if true, the names get downcased before being used as keys
#
#  returns the lookup hash (name => line number)
def self.build_map( lines, downcase: false )
  lookup = {}
  ## note: numbering starts with 1 (and NOT zero-based, that is, 0)
  lines.each.with_index( 1 ) do |names, number|
    names.each do |name|
      key = downcase ? name.downcase : name
      lookup[ key ] = number
    end
  end
  lookup
end
.build_names(lines) ⇒ Object
33 34 35 36 37 |
# File 'lib/sportdb/parser/token-date.rb', line 33
##
#  Glues all names of all lines together into a single string,
#  separated by | (pipe), e.g.
#    January|Jan|February|Feb|March|Mar|April|Apr|May|June|Jun|...
#
#  lines - array of lines; every line is an array of names
#
#  returns the joined string
def self.build_names( lines )
  ## step 1 - one alternation per line
  per_line = lines.each_with_object( [] ) do |names, acc|
    acc << names.join( '|' )
  end
  ## step 2 - all lines together
  per_line.join( '|' )
end
.parse_date(str, start:) ⇒ Object
add a date parser helper
197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 |
# File 'lib/sportdb/parser/token-date.rb', line 197
##
#  add a date parser helper
#
#  Matches str against DATE_RE and builds a Date from the captures.
#
#  str   - the text to parse
#  start - keyword; an object responding to year, month and day
#          (used to work out the year if the date itself has none)
#
#  returns a Date
#
#  note - if str does not match DATE_RE an error message gets printed
#         and the process exits (with status 1)
#
#  fix - the month is now also read from the numeric month capture
#        (as used by the iso-date and "european" formats e.g. 2011-08-25 or 8.8.);
#        before only month names got mapped and numeric dates ended up with
#        a nil month
def self.parse_date( str, start: )
  m = DATE_RE.match( str )
  if m.nil?
    puts "!! ERROR - unexpected date format; cannot parse >#{str}<"
    exit 1
  end

  year = m[:year].to_i(10)  if m[:year]
  day  = m[:day].to_i(10)   if m[:day]

  ## month - by name (e.g. Aug) or by number (e.g. 08)
  month = if m[:month_name]
            MONTH_MAP[ m[:month_name].downcase ]
          elsif m[:month]
            m[:month].to_i(10)
          end

  ## NOTE(review): a two-digit year capture (yy) is NOT interpreted here;
  ##   such dates fall through to the year calculation below - confirm if intended

  if year.nil?   ## try to calculate year
    on_or_after_start = month > start.month ||
                        (month == start.month && day >= start.day)
    year = if on_or_after_start
             # assume same year as start_at event (e.g. 2013 for 2013/14 season)
             start.year
           else
             # assume year+1 as start_at event (e.g. 2014 for 2013/14 season)
             start.year+1
           end
  end

  Date.new( year, month, day )
end
.parse_names(txt) ⇒ Object
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 |
# File 'lib/sportdb/parser/token-date.rb', line 6
##
#  Turns a text block with names (one entry per line; names separated
#  by spaces or tabs) into an array of lines (with words).
#
#   - empty lines get skipped
#   - lines starting with # (comments) get skipped too
#   - inline (until end-of-line) comments get stripped e.g.
#       Janvier Janv Jan  ## check janv in use??
#         =>  Janvier Janv Jan
#
#  txt - the text block
#
#  returns an array of arrays (of words)
#
#  todo/fix -- add check for duplicates
def self.parse_names( txt )
  txt.each_line.each_with_object( [] ) do |raw, rows|
    entry = raw.strip
    next if entry.empty? || entry.start_with?( '#' )

    ## cut off inline comment (if any), then split into words
    rows << entry.sub( /#.*/, '' ).strip.split( /[ \t]+/ )
  end
end
Instance Method Details
#_tokenize_line(line) ⇒ Object
307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 
807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 |
# File 'lib/sportdb/parser/lexer.rb', line 307

##
# Tokenize a single line.
#
# The active regex (@re) is kept between calls, that is, the "mode"
# (top-level, geo, goal line, props) may carry over from line to line:
# - an empty line always resets the mode to top-level (RE)
# - goal & geo mode always end with the line
# - prop modes continue on the next line if the last token is , - or ;
#   otherwise a PROP_END token gets auto-added and
#   the mode switches back to top-level
#
# note - racc requires pairs e.g. [:TOKEN, VAL];
#          for VAL "text" or ["text", { opts }] is used
#
# @param line [String] the line (without trailing newline)
# @return [Array(Array, Array<String>)] tokens and error messages
# @raise [ArgumentError] if a time is out-of-range (e.g. 25:00)
def _tokenize_line( line )
  tokens = []
  errors = []   ## keep a list of errors - why? why not?

  puts "line: >#{line}<"   if debug?

  ### special case for empty line (aka BLANK)
  if line.empty?
    ## note - blank always resets parser mode to std/top-level!!!
    @re = RE
    tokens << [:BLANK, '<|BLANK|>']
    return [tokens, errors]
  end

  pos = 0
  ## track last offsets - to report error on no match
  ##   or no match in end of string
  offsets = [0,0]
  m = nil

  ####
  ## quick hack - keep re state/mode between tokenize calls!!!
  @re ||= RE     ## note - switch between RE & INSIDE_RE

  if @re == RE   ## top-level
    ### check for modes once (per line) here to speed-up parsing
    if (m = PROP_KEY_RE.match( line ))
      ## start with prop key (match will switch into prop mode!!!)
      puts " ENTER PROP_RE MODE"   if debug?

      @re, type = case m[:key].downcase
                  when 'sent off', 'red cards'          then [PROP_CARDS_RE,      :PROP_REDCARDS]
                  when 'yellow cards'                   then [PROP_CARDS_RE,      :PROP_YELLOWCARDS]
                  when 'ref', 'referee'                 then [PROP_REFEREE_RE,    :PROP_REFEREE]
                  when 'att', 'attn', 'attendance'      then [PROP_ATTENDANCE_RE, :PROP_ATTENDANCE]
                  when 'goals'                          then [PROP_GOAL_RE,       :PROP_GOALS]
                  when 'penalties', 'penalty shootout'  then [PROP_PENALTIES_RE,  :PROP_PENALTIES]
                  else                                       [PROP_RE,            :PROP]   ## assume (team) line-up
                  end
      tokens << [type, m[:key]]

      offsets = [m.begin(0), m.end(0)]
      pos = offsets[1]    ## update pos
    elsif (m = ROUND_OUTLINE_RE.match( line ))
      puts " ROUND_OUTLINE"   if debug?

      tokens << [:ROUND_OUTLINE, m[:round_outline]]

      ## note - eats-up line for now (change later to only eat-up marker e.g. »|>>)
      offsets = [m.begin(0), m.end(0)]
      pos = offsets[1]    ## update pos
    elsif (m = PLAYER_WITH_SCORE_RE.match( line ))
      ## switch context to GOAL_RE (goalline(s))
      ##   split token (automagically) into two!! - score AND player
      @re = GOAL_RE
      puts " ENTER GOAL_RE MODE"   if debug?

      tokens << _score_ft_token( m )
      tokens << [:PLAYER, m[:name]]     ## auto-add player token

      offsets = [m.begin(0), m.end(0)]
      pos = offsets[1]    ## update pos
    elsif QUICK_PLAYER_WITH_MINUTE_RE.match( line ) &&
          (m = PLAYER_WITH_MINUTE_RE.match( line ))
      ## note - the quick check runs first; the player-with-minute regex
      ##          only gets tried if the quick check matches
      ## switch context to GOAL_RE (goalline(s))
      ##   split token (automagically) into two!! - player AND minute!!!
      @re = GOAL_RE
      puts " ENTER GOAL_RE MODE"   if debug?

      ## check for optional open_bracket
      tokens << [:'[']  if m[:open_bracket]
      ## check for -; (none with separator)
      tokens << [:NONE, "<|NONE|>"]  if m[:none]

      tokens << [:PLAYER, m[:name]]     ## auto-add player token first
      tokens << _minute_token( m )

      offsets = [m.begin(0), m.end(0)]
      pos = offsets[1]    ## update pos
    end
  end

  old_pos = -1   ## allows to backtrack to old pos (used in geo)

  while m = @re.match( line, pos )
    offsets = [m.begin(0), m.end(0)]

    if offsets[0] != pos
      ## match NOT starting at start/begin position!!!
      ##  report parse error!!!
      err = "parse error (tokenize) - skipping >#{line[pos..(offsets[0]-1)]}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
      msg = "!! WARN - #{err}"
      puts msg
      errors << err
      log( msg )
    end

    ## todo/fix - also check if possible
    ##   if no match but not yet end off string!!!!
    ##    report skipped text run too!!!

    old_pos = pos
    pos = offsets[1]

    ## note - if/elsif with == required here (and NOT case/when);
    ##           Regexp#=== tries to match (a string) and
    ##           does NOT compare two regexes
    t = if @re == GEO_RE
          if m[:space] || m[:spaces]
            nil    ## skip space(s)
          elsif m[:text]
            [:GEO, m[:text]]
          elsif m[:timezone]
            [:TIMEZONE, m[:timezone]]
          elsif m[:sym]
            sym = m[:sym]
            case sym
            when ',', '›', '>'
              ## note - treat geo sep › (unicode) and > (ascii) like comma for now!!!
              [:',']
            when '['
              ## get out-off geo mode and backtrack (w/ next)
              puts " LEAVE GEO_RE MODE, BACK TO TOP_LEVEL/RE"  if debug?
              @re = RE
              pos = old_pos
              next   ## backtrack (resume new loop step)
            else
              puts "!!! TOKENIZE ERROR (sym) - ignore sym >#{sym}<"
              nil    ## ignore others (e.g. brackets [])
            end
          elsif m[:any]
            err = "parse error (tokenize geo) - skipping any match>#{m[:any]}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
            puts "!! WARN - #{err}"
            errors << err
            log( "!! WARN - #{err}" )
            nil
          else
            puts "!!! TOKENIZE ERROR - no match found"
            nil
          end
        elsif @re == PROP_CARDS_RE
          if m[:space] || m[:spaces]
            nil    ## skip space(s)
          elsif m[:prop_name]
            [:PROP_NAME, m[:name]]
          elsif m[:minute]
            _minute_token( m )
          elsif m[:sym]
            sym = m[:sym]
            ## ignore others (e.g. brackets [])
            [sym.to_sym]  if [',', ';', '-'].include?( sym )
          else
            puts "!!! TOKENIZE ERROR (PROP_CARDS_RE) - no match found"
            nil
          end
        elsif @re == PROP_RE     ### todo/fix - change to LINEUP_RE !!!!
          if m[:space] || m[:spaces]
            nil    ## skip space(s)
          elsif m[:prop_key]     ## check for inline prop keys
            ## supported for now coach/trainer (add manager?)
            if ['coach', 'trainer'].include?( m[:key].downcase )
              [:COACH, m[:key]]   ## use COACH_KEY or such - why? why not?
            else
              ## todo - report error for unknown (inline) prop key in lineup
              nil
            end
          elsif m[:prop_name]
            case m[:name]
            when 'Y' then [:YELLOW_CARD, m[:name]]
            when 'R' then [:RED_CARD, m[:name]]
            else          [:PROP_NAME, m[:name]]
            end
          elsif m[:minute]
            _minute_token( m )
          elsif m[:sym]
            sym = m[:sym]
            ## return symbols "inline" as is; ignore others
            [sym.to_sym]  if [',', ';', '[', ']', '(', ')', '-'].include?( sym )
          else
            puts "!!! TOKENIZE ERROR (PROP_RE) - no match found"
            nil
          end
        elsif @re == PROP_ATTENDANCE_RE
          if m[:space] || m[:spaces]
            nil    ## skip space(s)
          elsif m[:enclosed_name]
            ## reserved for use for sold out or such (in the future) - why? why not?
            [:ENCLOSED_NAME, m[:name]]
          elsif m[:num]
            [:PROP_NUM, [m[:num], { value: m[:value].to_i(10) } ]]
          else
            puts "!!! TOKENIZE ERROR (PROP_ATTENDANCE_RE) - no match found"
            nil
          end
        elsif @re == PROP_REFEREE_RE
          if m[:space] || m[:spaces]
            nil    ## skip space(s)
          elsif m[:prop_key]     ## check for inline prop keys
            ## supported for now att/attn/attendance
            if ['att', 'attn', 'attendance'].include?( m[:key].downcase )
              [:ATTENDANCE, m[:key]]
            else
              ## todo - report error for unknown (inline) prop key
              nil
            end
          elsif m[:prop_name]
            [:PROP_NAME, m[:name]]
          elsif m[:num]
            [:PROP_NUM, [m[:num], { value: m[:value].to_i(10) } ]]
          elsif m[:enclosed_name]
            [:ENCLOSED_NAME, m[:name]]
          elsif m[:sym]
            sym = m[:sym]
            ## ignore others (e.g. brackets [])
            [sym.to_sym]  if [',', ';'].include?( sym )
          else
            puts "!!! TOKENIZE ERROR (PROP_REFEREE_RE) - no match found"
            nil
          end
        elsif @re == PROP_PENALTIES_RE
          if m[:space] || m[:spaces]
            nil    ## skip space(s)
          elsif m[:prop_name]
            [:PROP_NAME, m[:name]]    ### use PLAYER for token - why? why not?
          elsif m[:enclosed_name]
            ## use HOLD,SAVE,POST or such keys - why? why not?
            [:ENCLOSED_NAME, m[:name]]
          elsif m[:score]
            _score_ft_token( m )
          elsif m[:sym]
            sym = m[:sym]
            [sym.to_sym]  if [',', ';', '[', ']'].include?( sym )
          else
            puts "!!! TOKENIZE ERROR (PROP_PENALTIES_RE) - no match found"
            nil
          end
        elsif @re == GOAL_RE || @re == PROP_GOAL_RE
          if m[:space] || m[:spaces]
            nil    ## skip space(s)
          elsif m[:prop_name]
            [:PLAYER, m[:name]]
          elsif m[:minute]
            _minute_token( m )
          elsif m[:score]
            _score_ft_token( m )
          elsif m[:og]
            [:OG, m[:og]]
          elsif m[:pen]
            [:PEN, m[:pen]]
          elsif m[:sym]
            sym = m[:sym]
            [sym.to_sym]  if [',', ';', '[', ']'].include?( sym )
          else
            puts "!!! TOKENIZE ERROR (GOAL_RE) - no match found"
            nil
          end
        ###################################################
        ## assume TOP_LEVEL (a.k.a. RE) machinery
        else
          if m[:space] || m[:spaces]
            nil    ## skip space(s)
          elsif m[:text]
            [:TEXT, m[:text]]
          elsif m[:status]   ## (match) status e.g. cancelled, awarded, etc.
            opts = { status: m[:status] }
            ## includes note? e.g. awarded; originally 2-0
            opts[:note] = m[:status_note]  if m[:status_note]
            [:STATUS, [m[:status], opts]]
          elsif m[:note]
            [:NOTE, m[:note]]
          elsif m[:score_note]
            [:SCORE_NOTE, m[:score_note]]
          elsif m[:time]
            ## unify to iso-format  e.g. 12.40 => 12:40 or 12h40 => 12:40 etc.
            hour   = m[:hour].to_i(10)    ## allow 08/07/etc.
            minute = m[:minute].to_i(10)

            ## check if valid -  0:00 - 24:00
            unless (hour >= 0 && hour <= 24) && (minute >= 0 && minute <= 59)
              raise ArgumentError, "parse error - time >#{m[:time]}< out-of-range"
            end
            ## note - for debugging keep (pass along) "literal" time
            [:TIME, [m[:time], {h:hour,m:minute}]]
          elsif m[:date]
            date = {}
            ## note - allow any/upcase JULY/JUL etc. thus ALWAYS downcase for lookup
            date[:y]    = m[:year].to_i(10)   if m[:year]
            date[:yy]   = m[:yy].to_i(10)     if m[:yy]      ## two digit year (e.g. 25 or 78 etc.)
            date[:m]    = m[:month].to_i(10)  if m[:month]
            date[:m]    = MONTH_MAP[ m[:month_name].downcase ]  if m[:month_name]
            date[:d]    = m[:day].to_i(10)    if m[:day]
            date[:wday] = DAY_MAP[ m[:day_name].downcase ]      if m[:day_name]
            ## note - for debugging keep (pass along) "literal" date
            [:DATE, [m[:date], date]]
          elsif m[:duration]
            duration = { start: {}, end: {} }
            duration[:start][:y]    = m[:year1].to_i(10)  if m[:year1]
            duration[:start][:m]    = MONTH_MAP[ m[:month_name1].downcase ]  if m[:month_name1]
            duration[:start][:d]    = m[:day1].to_i(10)   if m[:day1]
            duration[:start][:wday] = DAY_MAP[ m[:day_name1].downcase ]      if m[:day_name1]
            duration[:end][:y]      = m[:year2].to_i(10)  if m[:year2]
            duration[:end][:m]      = MONTH_MAP[ m[:month_name2].downcase ]  if m[:month_name2]
            duration[:end][:d]      = m[:day2].to_i(10)   if m[:day2]
            duration[:end][:wday]   = DAY_MAP[ m[:day_name2].downcase ]      if m[:day_name2]
            ## note - for debugging keep (pass along) "literal" duration
            [:DURATION, [m[:duration], duration]]
          elsif m[:wday]    ## standalone weekday e.g. Mo/Tu/We/etc.
            [:WDAY, [m[:wday], { wday: DAY_MAP[ m[:day_name].downcase ] } ]]
          elsif m[:num]     ## fix - change to ord (for ordinal number!!!)
            ## note - strip enclosing () and convert to integer
            [:ORD, [m[:num], { value: m[:value].to_i(10) } ]]
          elsif m[:score_more]
            score = {}
            score[:p]  = [m[:p1].to_i(10),  m[:p2].to_i(10)]   if m[:p1]  && m[:p2]
            score[:et] = [m[:et1].to_i(10), m[:et2].to_i(10)]  if m[:et1] && m[:et2]
            score[:ft] = [m[:ft1].to_i(10), m[:ft2].to_i(10)]  if m[:ft1] && m[:ft2]
            score[:ht] = [m[:ht1].to_i(10), m[:ht2].to_i(10)]  if m[:ht1] && m[:ht2]
            ## note - for debugging keep (pass along) "literal" score
            [:SCORE_MORE, [m[:score_more], score]]
          elsif m[:score]
            _score_ft_token( m )
          elsif m[:minute]
            _minute_token( m )
          elsif m[:vs]
            [:VS, m[:vs]]
          elsif m[:sym]
            sym = m[:sym]
            case sym
            when '@'    ## enter geo mode
              puts " ENTER GEO_RE MODE"  if debug?
              @re = GEO_RE
              [:'@']
            when ',', ';', '/', '|', '[', ']',
                 '-',      # level 1 OR (classic) dash
                 '--',     # level 2
                 '---',    # level 3
                 '----'    # level 4
              [sym.to_sym]     ## return symbols "inline" as is
            else
              puts "!!! TOKENIZE ERROR (sym) - ignore sym >#{sym}<"
              nil
            end
          elsif m[:any]
            err = "parse error (tokenize) - skipping any match>#{m[:any]}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
            puts "!! WARN - #{err}"
            errors << err
            log( "!! WARN - #{err}" )
            nil
          else
            puts "!!! TOKENIZE ERROR - no match found"
            nil
          end
        end

    tokens << t    if t
  end

  ## check if no match in end of string
  if offsets[1] != line.size
    skipped = line[offsets[1]..-1]
    msg = "!! WARN - parse error - skipping >#{skipped}< @#{offsets[1]},#{line.size} in line >#{line}<"
    puts msg
    log( msg )
    errors << "parse error (tokenize) - skipping >#{skipped}< @#{offsets[1]},#{line.size} in line >#{line}<"
  end

  if @re == GOAL_RE   ### ALWAYS switch back to top level mode
    puts " LEAVE GOAL_RE MODE, BACK TO TOP_LEVEL/RE"  if debug?
    @re = RE
  end

  if @re == GEO_RE    ### ALWAYS switch back to top level mode
    puts " LEAVE GEO_RE MODE, BACK TO TOP_LEVEL/RE"  if debug?
    @re = RE
  end

  ##
  ## if in prop mode continue if last token is [,-;]
  ##   otherwise change back to "standard" mode
  if @re == PROP_RE           || @re == PROP_CARDS_RE     ||
     @re == PROP_GOAL_RE      || @re == PROP_PENALTIES_RE ||
     @re == PROP_ATTENDANCE_RE || @re == PROP_REFEREE_RE
    ## note - the line might have produced no tokens at all
    ##          (e.g. nothing matched); treat like "no continuation"
    last = tokens[-1]
    unless last && [:',', :'-', :';'].include?( last[0] )
      ## switch back to top-level mode!!
      puts " LEAVE PROP_RE MODE, BACK TO TOP_LEVEL/RE"  if debug?
      @re = RE
      ## note - auto-add PROP_END (<PROP_END>)
      tokens << [:PROP_END, "<|PROP_END|>"]
    end
  end

  [tokens,errors]
end

## helper - build the minute token from a match with the named groups
##   minute (literal), value (minute) and value2 (optional offset)
def _minute_token( m )
  minute = { m: m[:value].to_i(10) }
  minute[:offset] = m[:value2].to_i(10)  if m[:value2]
  ## note - for debugging keep (pass along) "literal" minute
  [:MINUTE, [m[:minute], minute]]
end

## helper - build the (ft-only) score token from a match with the named groups
##   score (literal), ft1 and ft2;  must always have ft for now e.g. 1-1 or such
def _score_ft_token( m )
  ## note - for debugging keep (pass along) "literal" score
  [:SCORE, [m[:score], { ft: [m[:ft1].to_i(10), m[:ft2].to_i(10)] }]]
end
#debug? ⇒ Boolean
124 |
# File 'lib/sportdb/parser/lexer.rb', line 124

## Debug flag check.
##
## @return [Boolean] result of comparing @debug with true
##   (that is, false if @debug is unset/nil or any other value)
def debug?
  @debug == true
end
#is_group?(text) ⇒ Boolean
todo/fix - move these checks into a LangHelper module (or such)
e.g. class Lexer
include LangHelper
end
or merge Lang back into Lexer - why? why not?
keep the “old” (instance-level) access for checking for group, round & friends
for now for compatibility
26 |
# File 'lib/sportdb/parser/lexer.rb', line 26

## Check if text is a group name - passes the check on to Lang.is_group?
##
## @param text [String]
## @return [Boolean] whatever Lang.is_group? returns
def is_group?( text )
  Lang.is_group?( text )
end
#is_leg?(text) ⇒ Boolean
28 |
# File 'lib/sportdb/parser/lexer.rb', line 28

## Check if text is a leg name - passes the check on to Lang.is_leg?
##
## @param text [String]
## @return [Boolean] whatever Lang.is_leg? returns
def is_leg?( text )
  Lang.is_leg?( text )
end
#is_round?(text) ⇒ Boolean
27 |
# File 'lib/sportdb/parser/lexer.rb', line 27

## Check if text is a round name - passes the check on to Lang.is_round?
##
## @param text [String]
## @return [Boolean] whatever Lang.is_round? returns
def is_round?( text )
  Lang.is_round?( text )
end
#is_zone?(text) ⇒ Boolean
29 |
# File 'lib/sportdb/parser/lexer.rb', line 29

## Check if text is a zone name - passes the check on to Lang.is_zone?
##
## @param text [String]
## @return [Boolean] whatever Lang.is_zone? returns
def is_zone?( text )
  Lang.is_zone?( text )
end
#log(msg) ⇒ Object
7 8 9 10 11 12 13 14 |
# File 'lib/sportdb/parser/lexer.rb', line 7

## Append msg (plus a newline) to ./logs.txt (opened in utf-8 append mode).
##   use ./errors.txt - why? why not?
##
## @param msg [String] the message to log
def log( msg )
  File.open( './logs.txt', 'a:utf-8' ) do |file|
    file << msg << "\n"
  end
end
#tokenize_with_errors ⇒ Object
167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 |
# File 'lib/sportdb/parser/lexer.rb', line 167

##
# Tokenize the text (@txt) line-by-line and return all tokens
# (flattened, with auto-added NEWLINE tokens) plus all collected errors.
#
# Every line runs through _tokenize_line and then two passes:
# - pass 1 - replace TEXT tokens matching a keyword
#            (round, leg, zone or group) with ROUND or GROUP and
#            mark "section" starters (e.g. Teams) in the first token
# - pass 2 - transform tokens (using simple patterns) to help along
#            the (racc look ahead 1 - LA1) parser
#            e.g. ROUND_DEF, GROUP_DEF, TEAM or DATETIME
#
# note - a line that yields no tokens at all (e.g. nothing matched)
#          passes through as an empty line (followed by a NEWLINE token)
#
# @return [Array(Array, Array<String>)] tokens and error messages
def tokenize_with_errors
  tokens_by_line = []   ## note: add tokens line-by-line (flatten later)
  errors = []           ## keep a list of errors - why? why not?

  @txt.each_line do |line|
    line = line.rstrip   ## note - MUST remove/strip trailing newline (spaces optional)!!!

    more_tokens, more_errors = _tokenize_line( line )

    tokens_by_line << more_tokens
    errors += more_errors
  end # each line

  tokens_by_line = tokens_by_line.map do |tokens|
    #############
    ## pass 1
    ##   replace all texts with keyword matches
    ##     (e.g. group, round, leg, etc.)
    ##
    ## note - let is_round? get first (before is_group?)
    ##          will match group stage as round (NOT group)
    tokens = tokens.map do |t|
      next t  unless t[0] == :TEXT

      text = t[1]
      if is_round?( text ) || is_leg?( text ) || is_zone?( text )
        [:ROUND, text]
      elsif is_group?( text )
        [:GROUP, text]
      else
        t   ## pass through as-is (1:1)
      end
    end

    ### check for "section" starters e.g. Teams or such
    ##    note - the line might have no tokens at all
    first = tokens[0]
    if first && first[0] == :TEXT
      case first[1]
      when /^teams$/i then first[0] = :TEAMS
      when /^blank$/i then first[0] = :BLANK   ### todo/fix -- remove!!! add real blanks!!
      end
    end

    #################
    ## pass 2
    ##   transform tokens (using simple patterns)
    ##     to help along the (racc look ahead 1 - LA1) parser
    nodes = []

    buf = Tokens.new( tokens )

    until buf.eos?
      if buf.pos == 0   ## MUST start line
        ## check for group def or round def
        if buf.match?( :ROUND, :'|' )
          ## assume round def (change round to round_def)
          nodes << [:ROUND_DEF, buf.next[1]]
          nodes << buf.next
          nodes += buf.collect
          break
        end
        if buf.match?( :GROUP, :'|' )
          ## assume group def (change group to group_def)
          nodes << [:GROUP_DEF, buf.next[1]]
          nodes << buf.next
          ## change all text to team - why? why not?
          nodes += buf.collect { |t| t[0] == :TEXT ? [:TEAM, t[1]] : t }
          break
        end
      end

      if buf.match?( :TEXT, [:SCORE, :SCORE_MORE, :VS, :'-'], :TEXT )
        nodes << [:TEAM, buf.next[1]]
        nodes << buf.next
        nodes << [:TEAM, buf.next[1]]
      elsif buf.match?( :DATE, :TIME )
        ## merge DATE TIME into DATETIME
        date = buf.next[1]
        time = buf.next[1]
        val = [date[0] + ' ' + time[0],   ## concat string of two tokens
               { date: date[1], time: time[1] }]
        nodes << [:DATETIME, val]
      else
        ## pass through
        nodes << buf.next
      end
    end

    nodes
  end # map tokens_by_line

  ## flatten tokens
  tokens = []
  tokens_by_line.each do |tok|
    pp tok   if debug?

    tokens += tok
    ## auto-add newlines (unless BLANK!!)
    ##   note - guard against lines with no tokens at all
    blank = !tok.empty? && tok[0][0] == :BLANK
    tokens << [:NEWLINE, "\n"]  unless blank
  end

  [tokens,errors]
end