Class: SportDb::Lexer
- Inherits:
-
Object
- Object
- SportDb::Lexer
- Defined in:
- lib/sportdb/parser.rb,
lib/sportdb/parser/lexer.rb,
lib/sportdb/parser/token.rb,
lib/sportdb/parser/token-geo.rb,
lib/sportdb/parser/token-date.rb,
lib/sportdb/parser/token-note.rb,
lib/sportdb/parser/token-prop.rb,
lib/sportdb/parser/token-text.rb,
lib/sportdb/parser/token-time.rb,
lib/sportdb/parser/lexer_token.rb,
lib/sportdb/parser/token-goals.rb,
lib/sportdb/parser/token-group.rb,
lib/sportdb/parser/token-round.rb,
lib/sportdb/parser/token-score.rb,
lib/sportdb/parser/lexer-on_top.rb,
lib/sportdb/parser/token-status.rb,
lib/sportdb/parser/lexer-on_goal.rb,
lib/sportdb/parser/lexer-prep_doc.rb,
lib/sportdb/parser/lexer-tokenize.rb,
lib/sportdb/parser/lexer-prep_line.rb,
lib/sportdb/parser/token-prop_name.rb,
lib/sportdb/parser/token-score_full.rb,
lib/sportdb/parser/token-score_legs.rb,
lib/sportdb/parser/token-date--names.rb,
lib/sportdb/parser/lexer-on_group_def.rb,
lib/sportdb/parser/lexer-on_prop_misc.rb,
lib/sportdb/parser/lexer-on_round_def.rb,
lib/sportdb/parser/token-score_fuller.rb,
lib/sportdb/parser/token-date--helpers.rb,
lib/sportdb/parser/token-date_duration.rb,
lib/sportdb/parser/token-status_inline.rb,
lib/sportdb/parser/lexer-on_prop_lineup.rb,
lib/sportdb/parser/token-goals--helpers.rb,
lib/sportdb/parser/token-score--helpers.rb,
lib/sportdb/parser/lexer-on_prop_penalties.rb
Defined Under Namespace
Constant Summary collapse
- ANY_RE =
general catch-all (RECOMMENDED (ALWAYS) use as last entry in union)
to avoid advance of pos match!!! %r{ (?<any> .) }ix- SPACES_RE =
%r{ (?<spaces> [ ]{2,}) | (?<space> [ ]) }ix- ATTENDANCE_RE =
add att(endance) e.g. att: 18000
A v B 2-1 att: 18000 %r{ (?<attendance> \b (?: attendance|att ) : [ ]* (?<value> [1-9] (?: _? \d+ )* ) \b )}ix- TEAM_HOME_RE =
home/away/neutral - (h), (a), (n)
add support for h/a/n with (?-i \b [han] \b) lower-case and \b boundary - why? why not? %r{ (?<team_home> \(h\) )}ix- TEAM_AWAY_RE =
%r{ (?<team_away> \(a\) )}ix- TEAM_NEUTRAL_RE =
%r{ (?<team_neutral> \(n\) )}ix- VS_RE =
note - only match case sensitive (downcased letters)!!! note - bigger match first e.g. vs than v etc.
%r{ (?<vs> (?<=[ ]) # positive lookBEHIND for space (?-i: vs\.?|v ) (?=[ ]) # positive lookAHEAD for space ) }ix- RE =
“top-level” regex used for:
- date_header - match_header & match_line_more - match_line Regexp.union( SPACES_RE, STATUS_RE, ## match status e.g. [cancelled], etc. INLINE_WO_RE, ## (inline) match status - w/o (walkout) INLINE_NP_RE, ## (inline) match status - n/p (not played) INLINE_BYE_RE, ## (inline) match status - bye (advance to next round) INLINE_ABD_RE, ## (inline) match status - abd/abd. (abandoned) INLINE_SUSP_RE, ## (inline) match status - susp/susp. (suspended) INLINE_PPD_RE, ## (inline) match status - ppd/ppd. or pstp/pstp. or postp/postp. or p-p (postponed) INLINE_VOID_RE, ## (inline) match status - x-x (voided) INLINE_AWD_RE, ## (inline) match status - awd/awd. (awarded) INLINE_CANC_RE, ## (inline) match status - canc/canc. (cancelled/canceled) TEAM_HOME_RE, ## (H) TEAM_AWAY_RE, ## (A) TEAM_NEUTRAL_RE, ## (N) NOTE_RE, ### fix - change to INLINE_NOTE !!! DATE_LEGS_RE, # note - must go before date!!! DATE_RE, ## note - date must go before time (e.g. 12.12. vs 12.12) TIME_RE, ATTENDANCE_RE, # note - allow att: for now inline in matches too - why? why not? SCORE_FULL_1ST_RE, # note - MUST go before SCORE_LEGS_RE!! ## e.g. 2-2, 5-1 pen. SCORE_LEGS_RE, SCORE_FULL_RE, SCORE_FULLER_RE, SCORE_FULLER_MORE_RE, SCORE_AWD_RE, # (inline) score awarded e.g. 3-0 awd or 0-1 awd. etc. SCORE_ABD_RE, # (inline) score abandoned e.g. 2-1 abd. SCORE_RE, ## note basic score e.g. 1-1 must go after SCORE_FULL_RE!!! VS_RE, TEXT_RE, %r{ (?<sym> [,@()-] ) }x, ## todo - check if "standalone" comma (,) in use? ANY_RE, )
- START_WITH_ORD =
ord (for ordinal number)
e.g. (51) or (1) etc. - limit digits of number - why? why not??? %r{ \A [ ]* ## ignore leading spaces (if any) (?<ord> \( (?<value>\d+) \) )}ix- START_WITH_YEAR =
e.g. 1930, 1986, 2002, 2010, 2022, 2026
note - only YYYY note - look out for clubs like 1860 München (de) !!! 1899 Hoffenheim (de) 1896 Löwenherz (ch - a.k.a. FC Winterthur ??) any others starting with YYYY ?! note - YEAR requires TWO (trailing) spaces !!!!! e.g. 1930 Uruguay 4-2 Argentina 1934 Italy 2-1 Czechoslovakia (AET) 2022 Argentina 3-3 France (AET, 4-2 pen) do NOT match (iso date!!) - 2020-11-12 2020/11/12 2020.11.12 etc. %r{ \A [ ]* ## ignore leading spaces (if any) (?<year> \d{4} ) ## positive lookahead (?= [ ]{2} | ## min. TWO spaces!!! or [ ]@ | ## space with geo marker or [ ]* \z ## year (date) header (end-of-line/string) ) }x- HEADING_RE =
%r{ \A [ ]* ## ignore leading spaces (if any) (?<heading_marker> ={1,6} ) [ ]* (?<heading> ## must start with letter - why? why not? ### 1st round ## allow numbers e.g. Group A - 1 [^=]+? ## use non-greedy ) [ ]* ## ignore trailing spaces (if any) (?: =*) ## allow any trailing heading markers [ ]* ## ignore trailing spaces (if any) \z }ix- GEO_TEXT_RE =
%r{ ## must start with alpha (allow unicode letters!!) (?<text> ## positive lookbehind - for now space (or beginning of line - for testing) only ## (MUST be fixed number of chars - no quantifier e.g. +? etc.) (?<= [ ,›>\[\]]|^) (?: # opt 1 - start with alpha \p{L}+ ## all unicode letters (e.g. [a-z]) | # opt 2 - start with num!! - \d+ # check for num lookahead (MUST be space or dot) ## MAY be followed by (optional space) ! ## MUST be follow by a to z!!!! [ ]? ## make space optional too - why? why not? ## yes - eg. 1st, 2nd, 5th etc. \p{L}+ | ## opt 3 - add another weirdo case ## e.g. 's Gravenwezel-Schilde ## add more letters (or sequences here - why? why not?) '\p{L}+ ) ## ## todo/check - find a different "more intuitive" regex/rule if possible? ## for single spaces only (and _/ MUST not be surround by spaces) (?: (?: [ ]? # only single (inline) space allowed - double spaces are breaks!!! (?: \p{L} | \d | [.&'°] | (?: (?<! [ ]) ## no space allowed before (but possible after) [-] ) | (?: (?<! [ ]) ## no spaces allowed around these characters [_/] (?! [ ]) ) )+ ) | ## for now allow auto-add optional ## parenthesis enclosed closed text ## e.g. Dublin (Dalymount Park) ## Bucuresti (23 August) ## Paris (Parc des Princes) ## Ost-Berlin (Walter-Ulbricht) ## Athinai (OAKA - Maroussi) ## ## or Valencia (Spain) or Solna (?: [ ] \( [^()\[\],;:›<>]+ ## todo - add more special chars ## maybe list only allowed ones?? ## make pattern more strict - why? why not? \) ) )* ## must NOT end with space or dash(-) ## todo/fix - possible in regex here ## only end in alphanum a-z0-9 (not dot or & ???) ## add lookahead/lookbehind ## must be space!!! ## (or comma or start/end of string) ## kind of \b !!! ## POSITIVE lookahead (?=[ ,›>\[\]]|$) ) }ix- GEO_END_RE =
%r{ (?<geo_end> , ) ## POSITIVE lookahead for props ## todo/fix - use generic [a-z]+ - why? why not? (?= [ ]* ## optional spaces (?: attendance|att | referee?s|refs? ) : ) }ix- GEO_RE =
Regexp.union( SPACES_RE, GEO_END_RE, GEO_TEXT_RE, / (?<sym> [,›>\[] ) /x, ANY_RE, )
- DATE_I_RE =
e.g. Fri Aug 9
Fri Aug 9 Fri, Aug 9 Fri, Aug 9 2024 Fri, Aug 9, 2024 Aug 9, 2024 Aug 9, 2024 note - eat-up optional comma after DAY_NAMES!! note - Fri Aug/9 no longer supported!!! %r{ (?<date> \b ## optional day name ((?<day_name>#{DAY_NAMES}) (?: ,?[ ]+) )? (?<month_name>#{MONTH_NAMES}) [ ] (?<day>\d{1,2}) \b ## optional year ( ,? [ ] ## note - comma optinal with single space required for now (?<year>\d{4}) ## optional year 2025 (yyyy) \b )? )}ix- DATE_LEGS_I_RE =
todo/fix - add (opt) day_name later
add (opt) year later. e.g. Aug 9 & Aug 10 note - allow shortcut e.g. Aug 9 & 10
%r{ (?<date_legs> \b (?<month_name1>#{MONTH_NAMES}) [ ] (?<day1>\d{1,2}) [ ] & [ ] (?: (?<month_name2>#{MONTH_NAMES}) [ ] )? ## note - make 2nd month_name optional (?<day2>\d{1,2}) \b )}ix- DATE_II_RE =
e.g. 3 June or 10 June
note - allow more spaces between DAY_NAMES and DAY e.g. Sun 1 Mar Wed 4 Mar Sat 14 Mar Sat 11 Apr Sat 11 Apr 2021 Sat 11 Apr 21 Sat, 11 Apr note - eat-up optional comma after DAY_NAMES!! note - Sat 14 Mar 17:30 check two-digit year (with NEGATIVE lookahead for time!!!) %r{ (?<date> \b ## optional day name ((?<day_name>#{DAY_NAMES}) (?: ,?[ ]+) )? (?<day>\d{1,2}) [ ] (?<month_name>#{MONTH_NAMES}) \b ## optional year ( [ ] (?: (?<year>\d{4}) ## optional year 2025 (yyyy) | (?: (?<yy>\d{2}) ## optional year 25 (yy) ## check NEGATIVE lookahead (?! :|[:h]\d{2}) ) ) \b )? )}ix- DATE_III_A_RE =
e.g. iso-date - 2011-08-25
note - allow/support ("shortcuts") e.g 2011-8-25 or 2011-8-3 / 2011-08-03 etc. %r{ (?<date> \b (?<year>\d{4}) - (?<month>\d{1,2}) - (?<day>\d{1,2}) \b )}ix- DATE_III_B_RE =
starting w/ day/month/year e.g. 25-08-2011
%r{ (?<date> \b ## optional day name ((?<day_name>#{DAY_NAMES}) (?: ,?[ ]+) )? (?<day>\d{1,2}) - (?<month>\d{1,2}) - (?<year>\d{4}) \b )}ix- DATE_IIII_RE =
allow (short)“european” style 8.8.
note - assume day/month!!! %r{ (?<date> \b ## optional day name ((?<day_name>#{DAY_NAMES}) (?: ,?[ ]+) )? (?<day>\d{1,2}) \. (?<month>\d{1,2}) \. (?: (?: (?<year>\d{4}) ## optional year 2025 (yyyy) | (?<yy>\d{2}) ## optional year 25 (yy) ) \b )? ) }ix- DATE_IIIII_RE =
04/03/2026 or 4/3/2026
04/03/26 or 4/3/26 04/03 or 4/3 %r{ (?<date> \b ## optional day name ((?<day_name>#{DAY_NAMES}) (?: ,?[ ]+) )? (?<day>\d{1,2}) / (?<month>\d{1,2}) \b (?: / (?: (?<year>\d{4}) ## optional year 2025 (yyyy) | (?<yy>\d{2}) ## optional year 25 (yy) ) \b )? ) }ix- DATE_RE =
map tables
note: order matters; first come-first matched/served Regexp.union( DATE_I_RE, DATE_II_RE, DATE_III_A_RE, ## e.g. 1973-08-14 DATE_III_B_RE, DATE_IIII_RE, ## e.g. 8.8. or 8.13.79 or 08.14.1973 DATE_IIIII_RE, ## e.g. 08/14/1973 )
- DATE_LEGS_RE =
todo - add more format style here; change to Regexp.union later!!!
DATE_LEGS_I_RE- NOTE_RE =
fix - use (?<text>) - text capture for inner text!!
use (?<note> for complete match as a convention!! ) %r{ \[ (?<note> [^\[\]\#]*? ## note - non-greedy/lazy operator ## exclude comments inside note block - why? why not? ) \] }xi- NOTA_BENE_RE =
check for “literal” (multi-line) note blocks
eg. nb: or note: space required after colon (:) - why? why not? note - use \A (instead of ^) - \A strictly matches the start of the string. %r{ \A [ ]* ## ignore leading spaces (if any) (?: nb | note) [ ]* : [ ]+ (?<nota_bene> .+? ## use non-greedy ) [ ]* ## ignore trailing spaces (if any) \z }xi- PROP_KEY_WORD_ =
(i) starting w/ letters
note - incl./allows digits (0-9) e.g. a1, a2000, etc. note - added back optional trailing dot (.) for abbrev. word !!!
%r{ \p{L} [\p{L}\d]* \.? }ix- PROP_KEY_NUM_ =
note - incl. optional dot or numsign e.g. 1. or 1°
%r{ \d+ [.°]? }ix- PROP_KEY_NUMALPHA_ =
e.g. 1A, 1FC etc.
note - no trailing dot (.) for now - check if any cases exist in real world %r{ \d+ \p{L} [\p{L}\d]* }ix- START_WITH_PROP_KEY_RE =
%r{ \A ## note - MUST start line; leading spaces optional (eat-up) (?<prop_key> [ ]* ## optional leading spaces (?<key> (?: ## (i) starting w/ letters #{PROP_KEY_WORD_} ## (ii) starting w/ number ## e.g. 1fc, 1a, | #{PROP_KEY_NUMALPHA_} ## followed by optional dot) and ## optional space ## MUST be follow by letter (a to z)!!!! ## eg. 1[ fc], 1.[ fc], 1.[fc], etc. | #{PROP_KEY_NUM_} (?= [ ]? \p{L}) ) (?: ## connectors - note - no dot (.), must match with abbrev word or num!! (?: ## (i) single space or WITHOUT surrounding spaces!! - slash (/), dash (-) ## e.g. do NOT match one - two or one / two ## only one-two or one/two [ /-] ## (ii) surrounded by leading or trailing optional space ## c & a, etc. ## d'ivoire, d' ivoire ## borusia 'gladbach etc. ## exclude space ' space - why? why not? (or ignore for now) ## ## check for quotes ('') - not realy supported here ## e.g. leading or trailing ' will NOT match | [ ]? & [ ]? | [ ]? ' | ' [ ]? #### (iii) ## note - special "hack" to connect WITHOUT space ## for Union 1.FC and SKN St.Pölten or St.Pölten ## connects 1.FC => NUM+WORD ## 1°Mayo => NUM+WORD ## St.Pölten => ABBREV+WORD ## ## note - match WITHOUT (space) connector ## 1.FC (Union 1.FC Stein) ## [WORD: "Union"], [NUM: "1."], [WORD: "FC"] ## St.Pölten (SKN St.Pölten) ## [WORD: "SKN"], [ABBREV: "St."], [WORD: "Pölten"] | (?<= [.°] ) (?= \p{L}) ) (?: #{PROP_KEY_NUMALPHA_} | #{PROP_KEY_NUM_} | #{PROP_KEY_WORD_} ) )* ) ## close <key> capture [ ]*? ## slurp trailing spaces : ## positive lookahead (must be followed by space!!) ## or allow end-of-line too (?= [ ]+|$) ) ## close <prop_key> capture }ix- INLINE_CAPTAIN =
[c] or [C] for marking player as captain
support [y ] too - or require Y - why? why not? %r{ (?<inline_captain> \[ [cC] \] )}x- INLINE_YELLOW =
%r{ (?<inline_yellow> \[ [yY] ## optional minute (?: [ ]+ (?<minute> \d{1,3}) '? (?: \+ (?<offset>\d{1,2}) '? )? )? \] )}x- INLINE_RED =
%r{ (?<inline_red> \[ [rR] ## optional minute (?: [ ]+ (?<minute> \d{1,3}) '? (?: \+ (?<offset>\d{1,2}) '? )? )? \] )}x- INLINE_YELLOW_RED =
%r{ (?<inline_yellow_red> \[ (?:y/r | Y/R ) ## optional minute (?: [ ]+ (?<minute> \d{1,3}) '? (?: \+ (?<offset>\d{1,2}) '? )? )? \] )}x- PROP_KEY_INLINE_RE =
simple prop key for inline use e.g.
Coach: or Trainer: or ... add more here later %r{ \b (?<prop_key> ## note: use prop_key (NOT prop_key_inline or such) (?<key> \p{L}+ ) ## note - NO spaces allowed for key for now!!! : ## possitive lookahead (must be followed by space!!) (?=[ ]+) ) }ix- PROP_NUM_RE =
note allow underscore inline e.g.
5_000 - discuss/check - allow space inline (e.g. 5 000) - why? why not?
%r{ \b (?<num> (?<value> [0-9]+ (?: _ [0-9]+)* ) ) \b }x- ENCLOSED_NAME_RE =
todo/fix - allow more chars in enclosed name - why? why not?
e.g. (') - Cote D'Ivoire etc. change to PAREN_NAME or PARENTHESIS or such - why? why not? %r{ (?<enclosed_name> \( (?<name> \p{L}+ (?: [ ] \p{L}+ )* ) \) ) }ix- TEXT_RE =
%r{ ## must start with alpha (allow unicode letters!!) (?<text> ## positive lookbehind ## (MUST be fixed number of chars - no quantifier e.g. +? etc.) (?<=[ ,;@|\[\]] |^ ) (?: # opt 1 - start with alpha \p{L}+ ## all unicode letters (e.g. [a-z]) | # opt 2 - start with num!! - allow special case (e.g. 1. FC) \d+ # check for num lookahead (MUST be space or dot) ## MUST be followed by (optional dot) and ## required space !!! ## MUST be follow by a to z!!!! [.°]? ## optional dot (.) or degree(°) - todo - add number sign too!! [ ]? ## make space optional too - why? why not? ## yes - eg. 1st, 2nd, 5th etc. \p{L}+ | ## opt 3 - add another weirdo case ## e.g. 's Gravenwezel-Schilde '[s] [ ] \p{L}+ ) (?:(?: (?:[ ] # only single spaces allowed inline!!! ## note - exclude (v[ ]/vs[ ]/vs.[ ]) ## AND switch to case-sensitive (via -i!!!) (?! (?-i: (?: ## note - (big) V not matching for versus!!! vs\.?|v| n/p|N/P| w/o|W/O| abd\.?|ABD| aban\.?|ABAN| susp\.?|SUSP| ppd\.?|PPD| pst\.?|PST| po?stp\.?|PO?STP|P-P| x-x|X-X| awd\.?|AWD| canc\.?|CANC ) [ ] | (?: bye|BYE ) (?:[ ]|$)) ) ) | [/-] ## must NOT be surrounded by spaces )? (?: \p{L} | (?: ## note - restrict [.&'] to single char usage (no doubled e.g. && etc.) \. (?! \.) ## allow single points only (now two or more etc.) | & (?! &) | ' (?! ') ) | (?: \d+ (?! [0-9h'+] | ## protected break on 12h / 12' / 1-1 ## check usege for 3+4 - possible? where ? why? (?:[.:-]\d) ## protected/exclude/break on 12.03 / 12:03 / 12-12 ## BUT allow Park21-Arena for example e.g. 21-A :-) ) [°]? ## followed by optional ord ## negative lookahead for numbers ## note - include digits itself!!! ## note - remove / (slash) e.g. allows UDI'19/Beter Bed ) ) )* ## must NOT end with space or dash(-) ## todo/fix - possible in regex here ## only end in alphanum a-z0-9 (not dot or & ???) ## allow optional at the end ## tag or year ## make it and in the future - why? why not? ## ## change - fix ## do NOT use (A) for amateur ## use A or A. with NO ()!!! 
## (A) - allow with predined alpha only for now ## e.g. (A) - amateur a team or b? ### same for U21 or U9 etc ## use with NO ()!!! - why? why not? ## or U21 U9 etc. - why? why not? ## or etc. ## (1879-1893) or allow years e.g. (1879-1893) ### ## add allow country code three to five letters for now ## change to generic 1 to 5 - why? why not? ## e.g. (A), (I), ## (AUT) ## (TRNC) five? for UEFA code for northern cyprus ## change to 1 to 4 - why? why not? ## check - fix possible for upper case only here ## inline for this group only? (?: [ ] \( \d{4}-\d{4} \) )? (?: ###### # check for country code (cc) # e.g. (AUT) or ,AUT or AUT (?: [ ] ## note - do NOT allow more than one space!!! - why? why not? \( ## note - auto-exclude reserved (aet) from SCORE_FULLER_MORE!!! ## plus golden goal (gg)/sudden death (sd), silver goal (sg) ## (ht), (ft) (?! (?: aet | agget | asdet | asget | ht | ft ) \) ) (?: [A-Z]{1,5} ) \) ) | (?: [ ]*[,›>][ ]* [A-Z]{1,5} \b ) )? ## add lookahead/lookbehind ## must be space!!! ## (or comma or start/end of string) ## kind of \b !!! ## positive lookahead (?=[ ,;@|\[\]] |$ ) ) }ix- TIME_RE =
%r{ \b (?<time> (?<hour>\d{1,2}) [:h] (?<minute>\d{2}) #### optional (inline) timezone ## note - non-utc timezone MUST be hard-coded (added) here!!! ## avoids eating-up team names (separated by one space) ## e.g. 18:30 MEX v MEX (?: [ ] ## require space - why? why not (?<timezone> (?: ## GMT - Greenwich Mean Time ## BST - British Summer Time ## CES?T - Central European (Summer) Time ## EES?T - Eastern European (Summer) Time ## (?: GMT|BST|CES?T|EES?T) (?: / UTC (?: [+-]\d{1,4} | ±0) )? ) | (?: UTC (?: [+-]\d{1,4} | ±0) ) ) )? ) \b #### ### note - local time is now INLINE and MUST follow time (?: [ ]+ ## todo/check - make space optional - why? why not? \( (?<time_local> (?<local_hour>\d{1,2}) [:h] ### todo/fix - MUST match style in time above!!! ### use capture with backref!!!! (?<local_minute>\d{2}) #### ## optional "local" timezone name eg. BRT or CEST etc. (?: [ ] ## require space - why? why not (?<local_timezone> (?: [A-Z]{3,4} (?: / UTC (?: [+-]\d{1,4} | ±0) )? ) | (?: ## e.g. 0 or 00 or 0000 UTC (?: [+-]\d{1,4} | ±0) ) ) )? # note - make timezone optional!!! ) \) )? }ix- START_GOAL_LINE_RE =
note - assume lines starting with opening ( are goal lines!!!!
note - use \A (instead of ^) - \A strictly matches the start of the string. note - check for negative lookahead to exclude ord (numbers) e.g. (1), (42), etc.!!! todo/fix -- exclude (a), (h), (n) - TEAM_AWAY, TEAM_HOME, TEAM_NEUTRAL tokens!! %r{ \A [ ]* ## ignore leading spaces (if any) \( # check NEGATIVE lookahead (?! ## exclude (a), (h), (n) ## TEAM_AWAY, TEAM_HOME, TEAM_NEUTRAL (?: a|h|n ) \) ) }xi- START_GOAL_LINE_COMPAT_RE =
%r{ \A [ ]* ## ignore leading spaces (if any) \( ## (i) check NEGATIVE lookahead ## exclude score e.g. 1-1 etc. (?! [ ]* \b \d-\d \b) ## (ii) check POSITIVE lookahead (?= [ ]* \d{1,3} '? ## optional minute marker (?: \+ \d{1,2} '? ## optional minute marker )? ) }xi- START_GOAL_LINE_ALT_RE =
check for goal line (alternate syntax)
(1-0 Player, 1-1 Player, ...) must start-off OR yes, include score note - allow "centered" style e.g. ( Player 44' (p) 1-0 1-1 Player 64' ) %r{ \A [ ]* ## ignore leading spaces (if any) \( # check POSITIVE lookahead (?= .*? ## note - non-greedy \b \d-\d \b ## score e.g. 0-1 ) }xi- GOAL_NONE_RE =
e.g. (-; Metzger)
%r{ (?<goals_none> -[ ]*; ) }x- GOAL_SEP_ALT_RE =
%r{ (?<goal_sep_alt> (?<=[ ]) ## positive lookbehind - space required - (?=[ ]|\z) ## positive lookahead - speace required )}x- GOAL_COUNT_RE =
e.g. (2)
(2/p), (2/pen.), (3/2p), (3/ 2 pen.) -or- (2,1pen), (3, 2 pens) (p), (pen.) (2 pen.), (2p) (og), (o.g.), (2og), (2 o.g.), (2ogs) %r{ (?<goal_count> \( (?: ## opt penalties (?<pen> (?: (?<pen_value> \d{1,2}) [ ]? )? (?:pens|pen\.?|p) ) | ## opt own goals (og) (?<og> (?: (?<og_value> \d{1,2}) [ ]? )? (?:ogs?|o\.g\.|o) ) | ## opt fallback - classic count/number (?: (?<value> [1-9]) ## check for option penalties (?<pen> [,/] [ ]* (?: (?<pen_value> \d{1,2}) [ ]? )? (?:pens|pen\.?|p) )? ) ) \) )}ix- MINUTE_RE =
note - inline \b check in MINUTE_RE excludes
85pen or 90+4pen or 38p (possible and NOT excluded in GOAL_MINUTE_RE !!!) minute with optional stoppage (offset) %r{ (?<minute> \b (?<value>\d{1,3}) ## constrain numbers to 0 to 999!!! \b '? ## optional minute marker (?: \+ (?<value2>\d{1,2}) \b '? ## optional minute marker )? ) }ix- GOAL_MINUTE_NA_RE =
keep separate? or add simply inside GOAL_MINUTE_RE - why? why not?
fix-fix-fix - move into GOAL_MINUTE_RE !!! %r{ (?<goal_minute_na> # positive lookbehind (?<=[ ,;]) (?<value> \?{1,2}) '? ## optional minute marker ## note - add goal minute qualifiers here inline!!! (?: (?: [ ]? (?<og> (?: \((?:og|o\.g\.|o)\)) ## allow (og) | (?: (?:og|o\.g\.|o)) ## allow plain og ) ) | (?: [ ]? (?<pen> (?: \((?:pen\.?|p)\)) ## allow () | (?: (?:pen\.?|p)) ) ) | ## add experimental header qualifier (?: [ ]? (?<hdr> \( (?:hdr\.?|h ) \) | (?: hdr\.?|h ) )) | ## add experimental free kick qualifier (?: [ ]? (?<fk> \( (?:fk\.?|f ) \) | (?: fk\.?|f) )) )? ## note - check positive lookahead (?=[ ,;)]|$) ) }ix- GOAL_MINUTE_RE =
goal types (pen.) or (pen) or (p.) or (p) (o.g.) or (og)
todo/check - keep case-insensitive or allow OG or P or PEN or only lower case - why? why not? add (gg) for golden goal - why? why not? add (sg) for silver goal - why? why not?? %r{ (?<goal_minute> \b (?<value>\d{1,3}) ## constrain numbers to 0 to 999!!! '? ## optional minute marker (?: \+ (?<value2>\d{1,2}) '? ## optional minute marker )? ## note - add goal minute qualifiers here inline!!! (?: (?: [ ]? (?<og> (?: \((?:og|o\.g\.|o)\)) ## allow (og) | (?: (?:og|o\.g\.|o)) ## allow plain og ) ) | (?: [ ]? (?<pen> (?: \((?:pen\.?|p)\)) ## allow () | (?: (?:pen\.?|p)) ) ) | ## add experimental header qualifier (?: [ ]? (?<hdr> \( (?:hdr\.?|h ) \) | (?: hdr\.?|h ) )) | ## add experimental free kick qualifier (?: [ ]? (?<fk> \( (?:fk\.?|f ) \) | (?: fk\.?|f) )) )? ## add experimental seconds ## e.g. (95 secs) or (95sec) etc. (?: [ ]* \( (?<secs>\d{1,3}) [ ]?secs? \) )? ) ## note - check positive lookahead (?=[ ,;)]|$) }ix- GOAL_TYPE_RE =
%r{ (?<goal_type> \( (?: (?<og> og|o\.g\.|o ) | (?<pen> pen\.?|p ) | ## add experimental header qualifier (?<hdr> hdr\.?|h ) | ## add experimental free kick qualifier (?<fk> fk\.?|f ) ) \) )}xi- START_WITH_GROUP_DEF_LINE_RE =
check for start of group def line e.g.
Group A | ... Group 1 : .... Group A2 | .... note - use \A (instead of ^) - \A strictly matches the start of the string. %r{ \A [ ]* ## ignore leading spaces (if any) (?<group_def> Group [ ] [a-z0-9]+ ## todo/check - allow dot (.) too e.g. 1.A etc.- why? why not? ) ### positive lookahead MUST be : OR | (?= [ ]* [:|] [ ]) ## note: requires space for now after [:|] - keep - why? why not? }ix- ROUND_OUTLINE_I_RE =
note - use \A (instead of ^) - \A strictly matches the start of the string.
todo - add support for trailing markers e.g. ▪ Round 1 ▪▪▪▪▪▪▪▪ :: Round 1 :::::::::::: check - allow without space (like in heading =Heading 1=) - why? why not? ▪Round 1▪▪▪▪▪▪▪▪ ::Round 1:::::::::::: %r{ \A [ ]* ## ignore leading spaces (if any) (?<round_marker> [▪]{1,3} ## BLACK SMALL SQUARE e.g. ▪,▪▪,▪▪▪ ) [ ]+ (?<round_outline> ## must start with letter - why? why not? ### 1st round ## allow numbers e.g. Group A - 1 ## ## note - CANNOT incl. :| !!! ## used for markers for defs/definitions [^:|]+? ## use non-greedy ) (?: [ ]+ [▪]+ )? [ ]* ## ignore trailing spaces (if any) \z }xi- ROUND_OUTLINE_II_RE =
%r{ \A [ ]* ## ignore leading spaces (if any) (?<round_marker> ::{1,3} ## e.g. ::,:::,:::: ) [ ]+ (?<round_outline> ## must start with letter - why? why not? ### 1st round ## allow numbers e.g. Group A - 1 ## ## note - CANNOT incl. :| !!! ## used for markers for defs/definitions [^:|]+? ## use non-greedy ) (?: [ ]+ ::+ )? [ ]* ## ignore trailing spaces (if any) \z }xi- ROUND_OUTLINE_RE =
Regexp.union( ROUND_OUTLINE_I_RE, ROUND_OUTLINE_II_RE, )
- ROUND_DEF_OUTLINE_RE =
note - for def(initions) only one level support
that is, no round outline additions possible (e.g ▪▪ 1st leg etc.) %r{ \A [ ]* ## ignore leading spaces (if any) (?: [▪] ## BLACK SMALL SQUARE | :: ) [ ]+ (?<round_outline> [^:|]+? ## use non-greedy ) [ ]* ## ignore trailing spaces (if any) ### possitive lookahead MUST be : OR | (?= [:|] [ ]) ## note: requires space for now after [:|] - keep - why? why not? }ix- SCORE_AWD_RE =
note - keep AWD w/o dot - why? why not?
%r{ (?<score_awd> \b (?<score1>\d{1,2}) - (?<score2>\d{1,2}) [ ]? (?-i: awd\.? | AWD ) ## POSITIVE lookahead - requires space (?= [ ]) )}ix- SCORE_ABD_RE =
add support for score abandoned (inline style)
2-1 abd. or 2-1 ABD %r{ (?<score_abd> \b (?<score1>\d{1,2}) - (?<score2>\d{1,2}) [ ]? (?-i: abd\.? | ABD ) ## POSITIVE lookahead - requires space (?= [ ]) )}ix- SCORE_RE =
2-1
note - was SCORE__FT__RE changed to "generic" SCORE_RE and (?<ft1>\d{1,2}) - (?<ft2>\d{1,2}) changed (?<score1>\d{1,2}) - (?<score2>\d{1,2}) to pattern match not necessarily the full-time (ft) scoreline!!! - pattern also used for goal seq(uence) e.g. 1-0 Kane, 1-1 Johnson %r{ (?<score> \b (?<score1>\d{1,2}) - (?<score2>\d{1,2}) \b )}ix- POSTPONED =
%Q{ (?<postponed> postponed | pst\\.? | po?stp\\.? | ppd\\.? ) }- CANCELED =
add can/can. - why? why not?
%Q{ (?<canceled> cancell?ed | canc\\.? ) }- WALKOVER =
add o/w too - why? why not?
%Q{ (?<walkover> walkover | w/o | wo ) }- AWARDED =
%Q{ (?<awarded> awarded | awd\\.? ) }- SUSPENDED =
%Q{ (?<suspended> suspended | susp\\.? ) }- ABANDONED =
%Q{ (?<abandoned> abandoned | aban\\.? | abd\\.? ) }- ANNULLED =
%Q{ (?<annulled> annulled ) }- VOIDED =
note - alternative (name) to annulled
%Q{ (?<voided> voided | void ) }- REPLAY =
%Q{ (?<replay> replay | repl\\.? ) }- STATUS_RE =
note - status_note incl. complete text incl. <status> (not normalized)
<status> gets normalized e.g. ppt => postponed etc. %r{ \[ (?: ############################################# ### opt 1 - allow long forms with note/comment for some stati ## e.g. [postponed due to tropical storm "Hanna"] ## [suspended at 84' by storm; result stood] ######################### (?: (?<status_note> (?<status> #################### ## pre-match (not played) #{POSTPONED} | #{CANCELED} | #{WALKOVER} | ###################### ## pre/post match #{AWARDED} | ######################## ## post match - (partially) played #{SUSPENDED} | #{ABANDONED} | #{ANNULLED} | #{VOIDED} ### note - alternative to annulled ) ## end-of-<status> [ :;,-]+ ## leading spaces (or separators) [^\]]+? ## note - add non-greedy match ) ## end-of-<status-note> [ ]* ## eat-up optional trailing spaces ) | ######################################## ## opt 2 - short form only (no note/comments) e.g. [postponed], [Canceled], etc. #################################### (?<status> #################### ## pre-match (not played) #{POSTPONED} | #{CANCELED} | #{WALKOVER} | ###################### ## pre/post match #{AWARDED} | ######################## ## post match - (partially) played #{SUSPENDED} | #{ABANDONED} | #{ANNULLED} | #{VOIDED} ### note - alternative to annulled | #{REPLAY} ### todo/fix - keep replay - why? why not? ### prefer replay in round e.g. ## ▪ Round 17, Replay ## ▪ Semi-finals, Replays ) ) \] }ix- GOAL_RE =
Regexp.union( SPACES_RE, GOAL_NONE_RE, GOAL_MINUTE_RE, GOAL_MINUTE_NA_RE, GOAL_COUNT_RE, PROP_NAME_RE, ## note - (re)use prop name for now for (player) name GOAL_SEP_ALT_RE, ## note - add dash (-) with (required) spaces / (?<sym> [;,)]) /x ## todo/fix - add ANY_RE !!!! )
- GOAL_ALT_RE =
Regexp.union( SPACES_RE, SCORE_RE, ## e.g. 1-0, 0-1, etc. GOAL_MINUTE_RE, GOAL_TYPE_RE, PROP_NAME_RE, ## note - (re)use prop name for now for (player) name / (?<sym> [,)]) /x ## note - no semicolon (;) ## todo/fix - add ANY_RE !!!! )
- GOAL_COMPAT_RE =
Regexp.union( SPACES_RE, SCORE_RE, ## e.g. 1-0, 0-1, etc. MINUTE_RE, ## note - matches minute e.g. 92, 7, 7' 7+3, etc. GOAL_TYPE_RE, PROP_NAME_RE, ## note - (re)use prop name for now for (player) name / (?<sym> [,)]) /x ## note - no semicolon (;) ## todo/fix - add ANY_RE !!!! )
- HTML_COMMENT_RE =
%r{ <!-- .*? ## note - use non-greedy/lazy *? match --> }xm- PREPROC_NOTA_BENE_RE =
check for “literal” (multi-line) note blocks
eg. nb: or note: space required after colon (:) - why? why not? %r{ ^ [ ]* (?: nb | note) [ ]* : [ ]+ .+? ## non-greedy ## positive lookahead ## note - must end with blank line or end-of-file/document (?= \n[ ]*\n | \z ) }xim- PREPROC_BLOCK_RE =
note - [] block may NOT incl. square brackets
what about comments (e.g. #)? todo/check - rename to NOTE_BLOCK or TEXT_BLOCK or ??? %r{ \[ [^\[\]\#]*? ## note - use non-greedy/lazy *? match \] }xm- PROP_NAME_WORD_ =
%r{ \p{L}+ \.? ## optional dot }ix- PROP_NAME_RE =
name different from text (**does NOT allow number in name/text**)
different from PROP_KEY too %r{ (?<prop_name> \b (?<name> #{PROP_NAME_WORD_} ## connectors (?: ## (i) space - only one single space allowed inline!!! (?: ### check if negative lookbehind is redudant!! ## next char is \p{L} and NOT space ## thus double space not possible!! (?<! [ ]) ## use negative lookbehind [ ] (?= \p{L}|['"]\p{L}) ## use lookahead ) ## (ii) support (inline) quoted name e.g. "Rodri" or such | (?: (?<=[ ]) ## use positive lookbehind " \p{L}+ " ## require space here too - why? why not? ) ## (iii) dash (-) | (?: ## use POSITIVE lookBEHIND ## note - allow leading dot (.) e.g. K.-H.Förster ## short for Karl-Heinz Förster ## ## change to negative lookBEHIND [ '"-] ## \p{L}\. | \p{L} - not MUST be fixed size (?<= [\p{L}.] ) [-] ## must be surrounded by letters ## e.g. One-Two NOT ## One- Two or One - Two or One -Two etc. (?= \p{L}) ## use lookahead ) | (?: ## flex rule for quote - allow any ## only check for double quotes e.g. cannot follow other ' for now - why? why not? ## allows rodrigez 'rodri' for example (?<!') ## use negative lookbehind ' ) | ## standard case with letter(s) and optional dot #{PROP_NAME_WORD_} )* ) ## add lookahead - must be non-alphanum ## add colon (:) too - why? why not? (?= [ ,;\]\)]|$) )}ix- P_EN =
english helpers (penalty, extra time, …)
note - p must go last (shortest match) pso = penalty shootout - note - remove PSO for now (may add later back) - why? why not? todo/fix/clean-up - keep it simple - remove optional trailing dot (.) from pen., p., agg. etc. - why? why not? always use (simply) pen, p, agg (also) remove a.e.t. / a.e.t option - why? why not? UPDATE mar/2026: added pens too - keep - why? why not? (4-3 pens) (4-3 Pens) -- keep mixed Pens/Pen. too - why? why not? (4-3 Pen.) '(?-i: PEN | P |' + '[Pp]ens | [Pp]en\.? | p\.? )'
- ET_EN =
fix - change ET_EN to AET_EN!!! - why? why not?
check - allow Aet too - why? why not? or A.e.t ?? '(?-i: AET | ' + 'aet | a\.e\.t\.? )'
- AETGG_EN =
after (golden goal/sudden death) extra time - add more options/styles - why? why not?
'(?-i: AET/GG | AGGET | ASDET | ' + 'aet/gg | a\.e\.t\.?/g\.g\.? | agget | asdet )'
- AETSG_EN =
after (silver goal) extra time
'(?-i: AET/SG | ASGET | ' + 'aet/sg | a\.e\.t\.?/s\.g\.? | asget )'
- AGG_EN =
agg/agg. or AGG
'(?-i: AGG | agg\.? )'- SCORE_P =
fix - change SCORE_P to SCORE_FULL_P
SCORE_ET to SCORE_FULL_ET (re)use SCORE_P, SCORE_ET for score only part!!! fix/fix/fix - rename to SCORE_P_ SCORE_ET_ mark internals with TRAILING underscore (leading NOT possible!) %Q< (?<p1>\\d{1,2}) - (?<p2>\\d{1,2}) [ ]? #{P_EN} >- SCORE_ET =
%Q< (?<et1>\\d{1,2}) - (?<et2>\\d{1,2}) [ ]? #{ET_EN} >- SCORE_LOOKAHEAD =
'(?= [ ,\]] | $)'- SCORE__ET_GG_SG__RE =
after extra-time with golden goal/sudden death & silver goal rule
note - golden goal & silver goal EXCLUDE penalties!!! 4-3 a.e.t/g.g. 4-3 aet/gg 4-3agget -or- 4-3 asdet 2-1 aet/sg -or- 4-3 aet/gg (3-3, 2-1) %r{ (?<score_full> \b (?<et1>\d{1,2}) - (?<et2>\d{1,2}) [ ]? (?: (?<aetgg> #{AETGG_EN}) | (?<aetsg> #{AETSG_EN}) ) ### note: ## add optional full-time, half-time score (?: [ ]+ \( [ ]* (?<ft1>\d{1,2}) - (?<ft2>\d{1,2}) [ ]* (?: , [ ]* (?: (?<ht1>\d{1,2}) - (?<ht2>\d{1,2}) [ ]* )? )? # note: make half time (HT) score optional for now \) )? #{SCORE_LOOKAHEAD} )}ix- SCORE__P_ET__RE =
note: allow SPECIAL cases WITHOUT full time scores (just a.e.t or pen. + a.e.t.)
3-4 pen. 2-2 a.e.t. 3-4 pen. 2-2 a.e.t. 2-2 a.e.t. %r{ (?<score_full> \b (?: #{SCORE_P} [ ]+ )? ## note: make penalty (P) score optional for now #{SCORE_ET} #{SCORE_LOOKAHEAD} )}ix- SCORE__ET_P__RE =
note: allow SPECIAL cases WITHOUT full time scores
AND with pen in last position! 2-2 a.e.t., 3-4 pen. 2-2 a.e.t. 3-4 pen. ## or without comma separator - why? why not? %r{ (?<score_full> \b #{SCORE_ET} (?: [ ]*,[ ]* | [ ]+ ) #{SCORE_P} #{SCORE_LOOKAHEAD} )}ix- SCORE__FT_P__RE =
special case (i) - full time with penalties
2-2, 3-4 pen. %r{ (?<score_full> \b (?<ft1>\d{1,2}) - (?<ft2>\d{1,2}) [ ]*,[ ]* ## note - comma required!!! #{SCORE_P} #{SCORE_LOOKAHEAD} )}ix- SCORE__FT_HT_P__RE =
special case (ii) - full time & half-time with penalties
2-2 (1-1), 3-4 pen. %r{ (?<score_full> \b (?<ft1>\d{1,2}) - (?<ft2>\d{1,2}) [ ]* \( (?<ht1>\d{1,2}) - (?<ht2>\d{1,2}) \) [ ]*,[ ]* ## note - comma required!!! #{SCORE_P} #{SCORE_LOOKAHEAD} )}ix- SCORE__P__RE =
note: allow SPECIAL with penalty only
3-4 pen. or 3-4p etc. %r{ (?<score_full> \b #{SCORE_P} #{SCORE_LOOKAHEAD} )}ix- SCORE__P_ET_FT_HT_V2__RE =
support short all-in-one e.g.
e.g. 3-4 pen. 2-2 a.e.t. ( 1-1, 1-1 ) becomes 3-4 pen. (2-2, 1-1, 1-1) %r{ (?<score_full> \b #{SCORE_P} [ ]+ \( [ ]* (?<et1>\d{1,2}) - (?<et2>\d{1,2}) [ ]*, [ ]* (?<ft1>\d{1,2}) - (?<ft2>\d{1,2}) [ ]*, [ ]* (?<ht1>\d{1,2}) - (?<ht2>\d{1,2}) [ ]* \) #{SCORE_LOOKAHEAD} )}ix- SCORE__ET_FT_HT_P__RE =
e.g. 2-2 a.e.t. (1-1, 1-0), 5-1 pen.
%r{ (?<score_full> \b #{SCORE_ET} [ ]+ \( [ ]* (?<ft1>\d{1,2}) - (?<ft2>\d{1,2}) [ ]* (?: , [ ]* (?: (?<ht1>\d{1,2}) - (?<ht2>\d{1,2}) [ ]* )? )? # note: make half time (HT) score optional for now \) (?: [ ]*,[ ]* | [ ]+) #{SCORE_P} #{SCORE_LOOKAHEAD} )}ix- SCORE__P_ET_FT_HT__RE =
e.g. 3-4 pen. 2-2 a.e.t. (1-1, 1-1) or
3-4p 2-2aet (1-1, ) or 3-4 pen. 2-2 a.e.t. (1-1) or 2-2 a.e.t. (1-1, 1-1) or 2-2 a.e.t. (1-1, ) or 2-2 a.e.t. (1-1) %r{ (?<score_full> \b (?: #{SCORE_P} [ ]+ )? ## note - make penalty (P) score optional for now #{SCORE_ET} [ ]+ \( [ ]* (?<ft1>\d{1,2}) - (?<ft2>\d{1,2}) [ ]* (?: , [ ]* (?: (?<ht1>\d{1,2}) - (?<ht2>\d{1,2}) [ ]* )? )? # note: make half time (HT) score optional for now \) #{SCORE_LOOKAHEAD} )}ix- SCORE__P_FT_HT__RE =
special case for case WITHOUT extra time!!
same as above (but WITHOUT extra time and pen required) %r{ (?<score_full> \b #{SCORE_P} [ ]+ \( [ ]* (?<ft1>\d{1,2}) - (?<ft2>\d{1,2}) [ ]* (?: , [ ]* (?: (?<ht1>\d{1,2}) - (?<ht2>\d{1,2}) [ ]* )? )? # note: make half time (HT) score optional for now \) #{SCORE_LOOKAHEAD} )}ix- SCORE__FT_HT__RE =
e.g. 2-1 (1-1)
%r{ (?<score_full> \b (?<ft1>\d{1,2}) - (?<ft2>\d{1,2}) [ ]+ \( [ ]* (?<ht1>\d{1,2}) - (?<ht2>\d{1,2}) [ ]* \) #{SCORE_LOOKAHEAD} )}ix- SCORE_FULL_1ST_RE =
note 2-2, 5-1 pen. must get priority (get before SCORE_LEGS!!!)
break out note - no need for Regexp.union for now (only single regex!) SCORE__FT_P__RE- SCORE_FULL_RE =
Regexp.union( SCORE__ET_GG_SG__RE, # e.g. 3-1 aet/gg SCORE__P_ET_FT_HT_V2__RE, # e.g. 5-1 pen. (2-2, 1-1, 1-0) SCORE__ET_FT_HT_P__RE, # e.g. 2-2 a.e.t. (1-1, 1-0), 5-1 pen. SCORE__P_ET_FT_HT__RE, # e.g. 5-1 pen. 2-2 a.e.t. (1-1, 1-0) SCORE__P_FT_HT__RE, # e.g. 5-1 pen. (1-1) SCORE__ET_P__RE, # e.g. 2-2 a.e.t., 5-1 pen. SCORE__FT_HT_P__RE, # e.g. 2-2 (1-1), 5-1 pen. SCORE__P_ET__RE, # e.g. 5-1 pen. 2-2 a.e.t. or 2-2 a.e.t. (w/o pen) SCORE__P__RE, # e.g. 5-1 pen. SCORE__FT_HT__RE, # e.g. 1-1 (1-0) ## note - keep basic score as its own token!!!! ## that is, SCORE & SCORE_MORE ### SCORE__FT__RE, # e.g. 1-1 -- note - must go last!!! )
- SCORE_LEGS_RE =
win on away goals
aet %r{ (?<score_legs> \b (?<leg1_ft1>\d{1,2}) - (?<leg1_ft2>\d{1,2}) (?: [ ]+ | [ ]*,[ ]*) # separate by spaces OR comma (?: ## opt 1 - after extra-time (et) score (?<leg2_et1>\d{1,2}) - (?<leg2_et2>\d{1,2}) [ ]? #{ET_EN} ## a.e.t./aet ### note - might end in dot (.) not alpha ### thus, wordboundary NOT working #{SCORE_LOOKAHEAD} | ## opt 2 - full-time (ft) (?<leg2_ft1>\d{1,2}) - (?<leg2_ft2>\d{1,2}) \b ) (?: ## check optional aggregate e.g. (agg 4-4) [ ]+ \( agg [ ] (?<agg1>\d{1,2}) - (?<agg2>\d{1,2}) ### add win options (?: ## opt 1 - on away goals (?<away> [ ]*,[ ]* (?:win [ ])? on [ ] away [ ] goals? ) | ## opt 2 - on penalties (?: [ ]*,[ ]* (?:win [ ])? (?<leg2_p1>\d{1,2}) - (?<leg2_p2>\d{1,2}) [ ] on [ ] pens ) )? \) )? )}ix- MONTH_LINES =
parse_names( <<TXT ) January Jan February Feb March Mar April Apr May June Jun July Jul August Aug September Sept Sep October Oct November Nov December Dec TXT
- MONTH_NAMES =
build_names( MONTH_LINES )
- MONTH_MAP =
pp MONTH_NAMES
build_map( MONTH_LINES, downcase: true )
- DAY_LINES =
parse_names( <<TXT ) Monday Mon Mo Tuesday Tues Tue Tu Wednesday Wed We Thursday Thurs Thur Thu Th Friday Fri Fr Saturday Sat Sa Sunday Sun Su TXT
- DAY_NAMES =
build_names( DAY_LINES )
- DAY_MAP =
pp DAY_NAMES
build_map( DAY_LINES, downcase: true )
- GROUP_DEF_RE =
note - add comma (,) as optional separator
Regexp.union( SPACES_RE, TEXT_RE, / (?<sym> [:|,] ) /x, ANY_RE, )
- PROP_CARDS_RE =
note - no inline keys possible
todo/fix - use custom (limited) prop basics too Regexp.union( SPACES_RE, MINUTE_RE, PROP_NAME_RE, / (?<sym> [;,-]) /x ## todo/fix - add ANY_RE here too!!! )
- PROP_ATTENDANCE_RE =
Regexp.union( SPACES_RE, ENCLOSED_NAME_RE, # e.g. (sold out) etc. why? why not? PROP_NUM_RE, # e.g. 28 000 or 28_000 (NOT 28,000 is not valid!!!) ## todo/fix - add ANY_RE here too!!! )
- PROP_REFEREE_RE =
Regexp.union( SPACES_RE, ENCLOSED_NAME_RE, # e.g. (sold out) etc. why? why not? PROP_NUM_RE, # e.g. 28 000 or 28_000 (NOT 28,000 is not valid!!!) PROP_KEY_INLINE_RE, PROP_NAME_RE, / (?<sym> [;,]) /x ## todo/fix - add ANY_RE here too!!! )
- ROUND_DEF_RE =
note - add comma (,) as optional separator
Regexp.union( SPACES_RE, DURATION_RE, # note - duration MUST match before date DATE_RE, ## note - date must go before time (e.g. 12.12. vs 12.12) / (?<sym> [:|,] ) /x, ANY_RE )
- SCORE_FULLER_AGG =
_mk_score_fuller_agg( win: false )
- SCORE_FULLER_AGG_WIN =
_mk_score_fuller_agg( win: true )
- SCORE_FULLER_P =
_mk_score_fuller_p( win: false )
- SCORE_FULLER_P_WIN =
_mk_score_fuller_p( win: true )
- SCORE_FULLER_AWAY_WIN =
%Q< (?: (?<away> ############ ## opt 1) with win (?: (?: win [ ] )? (?: (?<away1>\\d{1,2}) - (?<away2>\\d{1,2}) [ ] )? on [ ] away [ ] goals? # goal or goals ) | ##### ## opt 2) "classic" (post) (?: (?: (?<away1>\\d{1,2}) - (?<away2>\\d{1,2}) [ ] )? [ ]* away ) | ##### ## opt 3) up-front (pre) (?: away (?: [ ] (?<away1>\\d{1,2}) - (?<away2>\\d{1,2}) )? ) )) >- SCORE_FULLER_HT_OPT =
%Q< (?: HT [ ] (?: (?<ht1>\\d{1,2}) - (?<ht2>\\d{1,2})) [ ]*,[ ]* )? ## note - make optional >- SCORE_FULLER_FT_OPT =
%Q< (?: FT [ ] (?: (?<ft1>\\d{1,2}) - (?<ft2>\\d{1,2})) [ ]*,[ ]* )? ## note - make optional >- SCORE_FULLER__HT =
4-4 (HT 2-1)
or Team A 4-1 Team B (HT 2-1) %Q< \\( HT [ ] (?<ht1>\\d{1,2}) - (?<ht2>\\d{1,2}) \\) >- SCORE_FULLER__HT_FT__RE =
%r{ (?<score_fuller> \b (?<ft1>\d{1,2}) - (?<ft2>\d{1,2}) [ ]+ #{SCORE_FULLER__HT} )}ix- SCORE_FULLER_MORE__HT_FT__RE =
%r{ (?<score_fuller_more> #{SCORE_FULLER__HT} )}ix- SCORE_FULLER__ET =
%Q< \\( #{SCORE_FULLER_HT_OPT} #{SCORE_FULLER_FT_OPT} (?: (?<aetgg> #{AETGG_EN}) | (?<aetsg> #{AETSG_EN}) | (?<aet> #{ET_EN}) ) \\) >- SCORE_FULLER__ET__RE =
%r{ (?<score_fuller> \b (?<et1>\d{1,2}) - (?<et2>\d{1,2}) [ ]+ #{SCORE_FULLER__ET} )}ix- SCORE_FULLER_MORE__ET__RE =
%r{ (?<score_fuller_more> #{SCORE_FULLER__ET} )}ix- SCORE_FULLER__ET_P =
4-4 (aet, win 3-5 on pens)
4-4 (aet, 3-5 on pens) 4-4 (aet, 3-5 pen) 4-4 (a.e.t., 3-5 pen.) or Team A 4-4 Team B (aet, win 3-5 on pens) Team A 4-4 Team B (aet, 3-5 on pens) Team A 4-4 Team B (aet, 3-5 pen) Team A 4-4 Team B (a.e.t., 3-5 pen.) %Q< \\( #{SCORE_FULLER_HT_OPT} #{SCORE_FULLER_FT_OPT} (?<aet> #{ET_EN}) [ ]*,[ ]* #{SCORE_FULLER_P_WIN} \\) >- SCORE_FULLER__ET_P__RE =
%r{ (?<score_fuller> \b (?<et1>\d{1,2}) - (?<et2>\d{1,2}) [ ]+ #{SCORE_FULLER__ET_P} )}ix- SCORE_FULLER_MORE__ET_P__RE =
%r{ (?<score_fuller_more> #{SCORE_FULLER__ET_P} )}ix- SCORE_FULLER__FT_P =
4-4 (win 3-5 on pens)
4-4 (3-5 pen) 4-4 (3-5p) or Team A 4-4 Team B (win 3-5 on pens) Team A 4-4 Team B (3-5 pen) Team A 4-4 Team B (3-5p) %Q< \\( #{SCORE_FULLER_HT_OPT} #{SCORE_FULLER_P_WIN} \\) >- SCORE_FULLER__FT_P__RE =
%r{ (?<score_fuller> \b (?<ft1>\d{1,2}) - (?<ft2>\d{1,2}) [ ]+ \( #{SCORE_FULLER_P_WIN} \) )}ix- SCORE_FULLER_MORE__FT_P__RE =
%r{ (?<score_fuller_more> #{SCORE_FULLER__FT_P} )}ix- SCORE_FULLER__FT_AGG =
3-2 (win 4-5 on aggregate)
3-2 (4-5 on aggregate) 3-2 (4-5 on agg) 3-2 (4-5 agg) 3-2 (4-5 agg.) or 3-2 (agg 4-5) %Q< \\( #{SCORE_FULLER_HT_OPT} #{SCORE_FULLER_AGG_WIN} \\) >- SCORE_FULLER__FT_AGG__RE =
%r{ (?<score_fuller> \b (?<ft1>\d{1,2}) - (?<ft2>\d{1,2}) [ ]+ #{SCORE_FULLER__FT_AGG} )}ix- SCORE_FULLER_MORE__FT_AGG__RE =
%r{ (?<score_fuller_more> #{SCORE_FULLER__FT_AGG} )}ix- SCORE_FULLER__FT_AGG_AWAY =
ft + agg + away
2-1 (3-3 on aggregate, win on away goals) 2-1 (3-3 on aggregate, win 2-1 on away goals) %Q< \\( #{SCORE_FULLER_HT_OPT} #{SCORE_FULLER_AGG} [ ]*,[ ]* #{SCORE_FULLER_AWAY_WIN} \\) >- SCORE_FULLER__FT_AGG_AWAY__RE =
%r{ (?<score_fuller> \b (?<ft1>\d{1,2}) - (?<ft2>\d{1,2}) [ ]+ #{SCORE_FULLER__FT_AGG_AWAY} )}ix- SCORE_FULLER_MORE__FT_AGG_AWAY__RE =
%r{ (?<score_fuller_more> #{SCORE_FULLER__FT_AGG_AWAY} )}ix- SCORE_FULLER__ET_AGG_P =
2-1 (aet, 3-3 on aggregate, win 5-2 on pens)
2-1 (aet, 3-3 agg, 5-2 pen.) %Q< \\( #{SCORE_FULLER_HT_OPT} #{SCORE_FULLER_FT_OPT} (?<aet> #{ET_EN}) [ ]*,[ ]* #{SCORE_FULLER_AGG} [ ]*,[ ]* #{SCORE_FULLER_P_WIN} \\) >- SCORE_FULLER__ET_AGG_P__RE =
%r{ (?<score_fuller> \b (?<et1>\d{1,2}) - (?<et2>\d{1,2}) [ ]+ #{SCORE_FULLER__ET_AGG_P} )}ix- SCORE_FULLER_MORE__ET_AGG_P__RE =
%r{ (?<score_fuller_more> #{SCORE_FULLER__ET_AGG_P} )}ix- SCORE_FULLER_RE =
map tables
note: order matters - first come-first matched/served Regexp.union( SCORE_FULLER__HT_FT__RE, ## e.g. 3-2 (HT 2-1) SCORE_FULLER__ET_P__RE, ## e.g. 2-2 (aet, win 5-3 on pens) SCORE_FULLER__ET__RE, ## e.g. 2-3 (aet) SCORE_FULLER__FT_P__RE, ## e.g. 2-2 (win 5-3 on pens) SCORE_FULLER__FT_AGG__RE, ## e.g. 2-3 (win 5-4 on aggregate) SCORE_FULLER__FT_AGG_AWAY__RE, ## e.g. 2-1 (3-3 on aggregate, win 2-1 on away goals) SCORE_FULLER__ET_AGG_P__RE, ## e.g. 2-1 (aet, 3-3 on aggregate, win 5-2 on pens) )
- SCORE_FULLER_MORE__HT__RE =
add support for “stand-alone” (HT) and (FT) - keep why? why not?
%r{ (?<score_fuller_more> \( (?<ht> ht ) \) )}ix- SCORE_FULLER_MORE__FT__RE =
%r{ (?<score_fuller_more> \( (?<ft> ft ) \) )}ix- SCORE_FULLER_MORE__FT_ET__RE =
add special for fuller_more
(aet 4-3) - core score is ft, and fuller more incl. et!!! %r{ (?<score_fuller_more> \(#{ET_EN} [ ] (?<et1>\d{1,2}) - (?<et2>\d{1,2}) \) )}ix- SCORE_FULLER_MORE__HT_FT__CLASSIC_RE =
note - simply (1-1) !!!!!
note - special attention needed for placement in processing order!!! make sure it is the last (or one of the last) match(es) %r{ (?<score_fuller_more> \( (?<ht1>\d{1,2}) - (?<ht2>\d{1,2}) \) )}ix- SCORE_FULLER_MORE_RE =
Regexp.union( SCORE_FULLER_MORE__FT__RE, ## e.g. (ft) SCORE_FULLER_MORE__HT__RE, ## e.g. (ht) SCORE_FULLER_MORE__HT_FT__RE, ## e.g. (HT 2-1) SCORE_FULLER_MORE__ET_P__RE, ## e.g. (aet, win 5-3 on pens) SCORE_FULLER_MORE__ET__RE, ## e.g. (aet) SCORE_FULLER_MORE__FT_ET__RE, ## e.g. (aet 3-2) - (SPECIAL) incl. after extra-time score!! SCORE_FULLER_MORE__FT_P__RE, ## e.g. (win 5-3 on pens) SCORE_FULLER_MORE__FT_AGG__RE, ## e.g. (win 5-4 on aggregate) SCORE_FULLER_MORE__FT_AGG_AWAY__RE, ## e.g. (3-3 on aggregate, win 2-1 on away goals) SCORE_FULLER_MORE__ET_AGG_P__RE, ## e.g. (aet, 3-3 on aggregate, win 5-2 on pens) SCORE_FULLER_MORE__HT_FT__CLASSIC_RE, ## e.g. (2-1) half-time !!!! )
- DURATION_I_RE =
%r{ (?<duration> \b (?: ## optional day name ((?<day_name1>#{DAY_NAMES}) [ ] )? (?<month_name1>#{MONTH_NAMES}) [ ] (?<day1>\d{1,2}) ## optional year ( ,? # optional comma [ ] (?<year1>\d{4}) )? ## support + and - (add .. or such - why??) [ ]* - [ ]* ## optional day name ((?<day_name2>#{DAY_NAMES}) [ ] )? (?<month_name2>#{MONTH_NAMES}) [ ] (?<day2>\d{1,2}) ## optional year ( ,? # optional comma [ ] (?<year2>\d{4}) )? ) \b )}ix- DURATION_II_RE =
variant ii
add support for shorthand August 16-18, 2011 September 13-15, 2011 October 18-20, 2011 March 6-8 2012 March 6-8 - add support for August 16+17 or such (and check 16+18) use <op> to check if day2 is a plus or range or such - why? why not? %r{ (?<duration> \b (?: (?<month_name1>#{MONTH_NAMES}) [ ] (?<day1>\d{1,2}) - (?<day2>\d{1,2}) (?: ,? ## optional comma [ ] (?<year1>\d{4}) )? ## optional year ) \b )}ix- DURATION_RE =
map tables
note: order matters; first come-first matched/served Regexp.union( DURATION_I_RE, DURATION_II_RE, )
- INLINE_WO_RE =
add support for WO or W-0 too - why? why not?
%r{ (?<inline_wo> \b (?: w/o | W/O ) \b )}x- INLINE_BYE_RE =
note - NOT case insensitive
%r{ (?<inline_bye> \b (?: bye | BYE ) \b )}x- INLINE_NP_RE =
A n/p B (note - basically a inline short form of A v B [cancelled] )
N/P %r{ (?<inline_np> \b (?: n/p | N/P ) \b )}x- INLINE_ABD_RE =
abd/abd. or aban/aban. [abandoned]
ABD/ABAN %r{ (?<inline_abd> \b (?: abd\.? | aban\.? | ABD | ABAN ) ## POSITIVE lookahead - requires space (?= [ ]) )}x- INLINE_SUSP_RE =
susp/susp. [suspended]
SUSP %r{ (?<inline_susp> \b (?: susp\.? | SUSP ) ## POSITIVE lookahead - requires space (?= [ ]) )}x- INLINE_PPD_RE =
ppd/ppd. or pst/pst. or pstp/pstp. or postp/postp. [postponed]
PPD/PSTP/POSTP/P-P todo/check - add/allow p-p too - why? why not? %r{ (?<inline_ppd> \b (?: ppd\.? | pst\.? | po?stp\.? | PPD | PST | PO?STP | P-P ) ## POSITIVE lookahead - requires space (?= [ ]) )}x- INLINE_VOID_RE =
void via x-x X-X
todo/check - only allow X-X - why? why not? %r{ (?<inline_void> \b (?: x-x | X-X ) ## POSITIVE lookahead - requires space (?= [ ]) )}x- INLINE_AWD_RE =
awd/awd. [awarded]
AWD note - recommendation is to always include score thus, use/prefer SCORE_AWD e.g. 0-3 awd %r{ (?<inline_awd> \b (?: awd\.? | AWD ) ## POSITIVE lookahead - requires space (?= [ ]) )}x- INLINE_CANC_RE =
canc/canc. [cancelled]
CANC %r{ (?<inline_canc> \b (?: canc\.? | CANC ) ## POSITIVE lookahead - requires space (?= [ ]) )}x- PROP_LINEUP_RE =
Regexp.union( SPACES_RE, MINUTE_RE, ## e.g. 44 or 44' or 45+1 or 45+1' etc. INLINE_CAPTAIN, ## e.g. [c] INLINE_YELLOW, ## e.g. [Y] or [Y 44] or [Y 44'] or [Y 45+1'] INLINE_YELLOW_RED, ## e.g. [Y/R] or [Y/R 78] INLINE_RED, ## e.g. [R] or [R 42] or [R 42'] PROP_KEY_INLINE_RE, PROP_NAME_RE, / (?<sym> [;,()\[\]-]) /x ## todo/fix - add ANY_RE here too!!! )
- PROP_PENALTIES_RE =
Regexp.union( SPACES_RE, SCORE_RE, # e.g. 1-1 etc. ENCLOSED_NAME_RE, # e.g. (save), (post), etc. PROP_NAME_RE, / (?<sym> [;,]) /x ## add [] too - why? why not? ## todo/fix - add ANY_RE here too!!! )
Class Method Summary collapse
-
._build_date(m) ⇒ Object
“internal” date helpers.
- ._build_date_legs(m) ⇒ Object
- ._build_duration(m) ⇒ Object
- ._build_goal_count(m) ⇒ Object
- ._build_goal_minute(m) ⇒ Object
- ._build_goal_minute_na(m) ⇒ Object
- ._build_goal_type(m) ⇒ Object
- ._build_minute(m) ⇒ Object
- ._build_score(m) ⇒ Object
-
._build_score_abd(m) ⇒ Object
score abandoned (abd/abd.).
-
._build_score_awd(m) ⇒ Object
score awarded (awd/awd.).
- ._build_score_full(m) ⇒ Object
- ._build_score_fuller(m) ⇒ Object
- ._build_score_fuller_more(m) ⇒ Object
- ._build_score_legs(m) ⇒ Object
- ._build_status(m) ⇒ Object
- ._build_time(m) ⇒ Object
-
._mk_score_fuller_agg(win:) ⇒ Object
regex score helpers; note - MUST double escape \d (e.g. \\d) if not a "simple" string (e.g. '' but %Q<>).
-
._mk_score_fuller_p(win:) ⇒ Object
with optional win - true|false.
- ._parse_date(str) ⇒ Object
- ._parse_goal_count(str) ⇒ Object
-
._parse_goal_minute(str) ⇒ Object
parse helpers.
-
._parse_score_full(str) ⇒ Object
add parser helpers.
-
._parse_team(str) ⇒ Object
helper for testing regex match for team names.
- .build_map(lines, downcase: false) ⇒ Object
- .build_names(lines) ⇒ Object
-
.parse_date(str, start: nil) ⇒ Object
note: parse_date - returns Date object _parse_date (with underscore) - return hash of “parsed” regex match data!!.
- .parse_names(txt) ⇒ Object
Instance Method Summary collapse
- #_build_date(m) ⇒ Object
- #_build_date_legs(m) ⇒ Object
- #_build_duration(m) ⇒ Object
- #_build_goal_count(m) ⇒ Object
- #_build_goal_minute(m) ⇒ Object
- #_build_goal_minute_na(m) ⇒ Object
- #_build_goal_type(m) ⇒ Object
- #_build_minute(m) ⇒ Object
- #_build_score(m) ⇒ Object
- #_build_score_abd(m) ⇒ Object
- #_build_score_awd(m) ⇒ Object
- #_build_score_full(m) ⇒ Object
- #_build_score_fuller(m) ⇒ Object
- #_build_score_fuller_more(m) ⇒ Object
- #_build_score_legs(m) ⇒ Object
- #_build_status(m) ⇒ Object
- #_build_time(m) ⇒ Object
- #_info(*args) ⇒ Object
- #_on_goal(m, ctx:) ⇒ Object
- #_on_goal_alt(m, ctx:) ⇒ Object
-
#_on_goal_compat(m, ctx:) ⇒ Object
note - m is MatchData object.
-
#_on_group_def(m, ctx:) ⇒ Object
note - m is MatchData object.
-
#_on_prop_attendance(m, ctx:) ⇒ Object
note - m is MatchData object.
-
#_on_prop_cards(m, ctx:) ⇒ Object
note - m is MatchData object.
-
#_on_prop_lineup(m, ctx:) ⇒ Object
note - m is MatchData object.
-
#_on_prop_penalties(m, ctx:) ⇒ Object
note - m is MatchData object.
-
#_on_prop_referee(m, ctx:) ⇒ Object
note - m is MatchData object.
-
#_on_round_def(m, ctx:) ⇒ Object
note - m is MatchData object.
-
#_on_top(m, ctx:) ⇒ Object
note - m is MatchData object.
- #_prep_doc(txt) ⇒ Object
-
#_prep_line(line) ⇒ Object
auto-fix checks line-by-line.
- #_tokenize_line(line, lineno) ⇒ Object
- #_trace(*args) ⇒ Object
- #_warn(*args) ⇒ Object
- #debug? ⇒ Boolean
-
#initialize(txt, debug: false) ⇒ Lexer
constructor
A new instance of Lexer.
- #log(msg) ⇒ Object
- #tokenize_with_errors ⇒ Object
Constructor Details
#initialize(txt, debug: false) ⇒ Lexer
Returns a new instance of Lexer.
44 45 46 47 48 49 |
# File 'lib/sportdb/parser/lexer.rb', line 44

# Sets up a lexer for the given source text.
#
# @param txt [String] text to tokenize; any other type is rejected
# @param debug [Boolean] kept in @debug (defaults to false)
# @raise [ArgumentError] if txt is not a String
def initialize( txt, debug: false )
  unless txt.is_a?(String)
    raise ArgumentError, "text as string expected for lexer; got #{txt.class.name}"
  end

  @txt, @debug = txt, debug
end
Class Method Details
._build_date(m) ⇒ Object
“internal” date helpers
6 7 8 9 10 11 12 13 14 15 16 17 18 19 |
# File 'lib/sportdb/parser/token-date--helpers.rb', line 6

# Turns the named captures of a date match into a hash of date parts.
#
# Only captures that are present end up in the result:
#   :y (year), :yy (two-digit year), :m (month), :d (day), :wday (weekday).
# A month name (if captured) wins over a numeric month; names are
# downcased before the MONTH_MAP / DAY_MAP lookup so JULY/Jul/jul all work.
#
# @param m [MatchData] match object (anything answering m[:name] works)
# @return [Hash] date parts keyed by symbol
def self._build_date( m )
  {}.tap do |date|
    if (year = m[:year])
      date[:y] = year.to_i(10)
    end
    ## two digit year (e.g. 25 or 78 etc.) - kept separate from :y
    if (yy = m[:yy])
      date[:yy] = yy.to_i(10)
    end
    if (month = m[:month])
      date[:m] = month.to_i(10)
    end
    if (month_name = m[:month_name])
      date[:m] = MONTH_MAP[ month_name.downcase ]
    end
    if (day = m[:day])
      date[:d] = day.to_i(10)
    end
    if (day_name = m[:day_name])
      date[:wday] = DAY_MAP[ day_name.downcase ]
    end
  end
end
._build_date_legs(m) ⇒ Object
21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 |
# File 'lib/sportdb/parser/token-date--helpers.rb', line 21

# Builds the two dates of a "legs" match.
#
# The first date always takes month (month_name1, looked up downcased in
# MONTH_MAP) and day (day1). The second date always takes day2 but only
# gets a month when month_name2 was captured.
#
# @param m [MatchData] match object (anything answering m[:name] works)
# @return [Hash] { date1: {m:, d:}, date2: {d:} or {m:, d:} }
def self._build_date_legs( m )
  first = { m: MONTH_MAP[ m[:month_name1].downcase ],
            d: m[:day1].to_i(10) }

  second = {}
  if (name2 = m[:month_name2])
    second[:m] = MONTH_MAP[ name2.downcase ]
  end
  second[:d] = m[:day2].to_i(10)

  { date1: first, date2: second }
end
._build_duration(m) ⇒ Object
39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 |
# File 'lib/sportdb/parser/token-date--helpers.rb', line 39

# Converts a duration match into { start: {...}, end: {...} }.
#
# For each side the captures year<N>, month_name<N>, day<N> and
# day_name<N> (N = 1 for start, 2 for end) are copied over as
# :y, :m, :d and :wday - but only if they were captured.
#
# @param m [MatchData] match object (anything answering m[:name] works)
# @return [Hash] always has the :start and :end keys (possibly empty hashes)
def self._build_duration( m )
  [[:start, 1], [:end, 2]].each_with_object( {} ) do |(side, no), duration|
    part = {}
    if (year = m[:"year#{no}"])
      part[:y] = year.to_i(10)
    end
    if (month_name = m[:"month_name#{no}"])
      part[:m] = MONTH_MAP[ month_name.downcase ]
    end
    if (day = m[:"day#{no}"])
      part[:d] = day.to_i(10)
    end
    if (day_name = m[:"day_name#{no}"])
      part[:wday] = DAY_MAP[ day_name.downcase ]
    end
    duration[side] = part
  end
end
._build_goal_count(m) ⇒ Object
50 51 52 53 54 55 56 |
# File 'lib/sportdb/parser/token-goals--helpers.rb', line 50

# Builds a goal-count hash from a match.
#
#   :count - integer from the "value" capture (if present)
#   :og    - set only if the "og" flag matched; og_value as integer, else 1
#   :pen   - set only if the "pen" flag matched; pen_value as integer, else 1
#
# @param m [MatchData] match object (anything answering m[:name] works)
# @return [Hash]
def self._build_goal_count( m )
  tally = {}
  if (value = m[:value])
    tally[:count] = value.to_i(10)
  end

  [:og, :pen].each do |flag|
    next unless m[flag]     ## check flag first; value is optional
    number = m[:"#{flag}_value"]
    tally[flag] = number ? number.to_i(10) : 1
  end

  tally
end
._build_goal_minute(m) ⇒ Object
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 |
# File 'lib/sportdb/parser/token-goals--helpers.rb', line 6

# Builds a goal-minute hash from a match.
#
#   :m        - minute from the (required) "value" capture
#   :offset   - stoppage/injury time from "value2" (optional)
#   :og, :pen, :freekick, :header - true if og / pen / fk / hdr matched
#   :secs     - seconds from "secs" (optional)
#
# @param m [MatchData] match object (anything answering m[:name] works)
# @return [Hash]
def self._build_goal_minute( m )
  minute = { m: m[:value].to_i(10) }    ## always required

  if (extra = m[:value2])
    minute[:offset] = extra.to_i(10)
  end

  { og: :og, pen: :pen, freekick: :fk, header: :hdr }.each do |key, group|
    minute[key] = true if m[group]
  end

  if (secs = m[:secs])
    minute[:secs] = secs.to_i(10)
  end

  minute
end
._build_goal_minute_na(m) ⇒ Object
24 25 26 27 28 29 30 31 32 33 34 35 |
# File 'lib/sportdb/parser/token-goals--helpers.rb', line 24

# Builds a goal-minute hash for a goal with no minute available.
# The minute is the placeholder string '?'; the goal-type flags
# (:og, :pen, :freekick, :header) are set to true when the og / pen /
# fk / hdr captures matched.
#
# @param m [MatchData] match object (anything answering m[:name] works)
# @return [Hash]
def self._build_goal_minute_na( m )
  flags = { og: :og, pen: :pen, freekick: :fk, header: :hdr }

  flags.each_with_object( { m: '?' } ) do |(key, group), minute|
    minute[key] = true if m[group]
  end
end
._build_goal_type(m) ⇒ Object
58 59 60 61 62 63 64 65 |
# File 'lib/sportdb/parser/token-goals--helpers.rb', line 58

# Collects the goal-type flags of a match.
# Each of :og, :pen, :freekick, :header is set to true when the
# corresponding capture (og, pen, fk, hdr) matched; otherwise it is left out.
#
# @param m [MatchData] match object (anything answering m[:name] works)
# @return [Hash]
def self._build_goal_type( m )
  [[:og, :og], [:pen, :pen], [:freekick, :fk], [:header, :hdr]]
    .select { |_key, group| m[group] }
    .map    { |key, _group| [key, true] }
    .to_h
end
._build_minute(m) ⇒ Object
39 40 41 42 43 44 45 46 47 |
# File 'lib/sportdb/parser/token-goals--helpers.rb', line 39

# Builds a plain minute hash from a match.
#
#   :m      - minute from the (required) "value" capture
#   :offset - stoppage/injury time from "value2" (optional)
#
# @param m [MatchData] match object (anything answering m[:name] works)
# @return [Hash]
def self._build_minute( m )
  result = { m: m[:value].to_i(10) }     ## always required

  extra = m[:value2]
  result[:offset] = extra.to_i(10) if extra

  result
end
._build_score(m) ⇒ Object
5 6 7 8 9 10 11 12 13 14 15 16 17 18 |
# File 'lib/sportdb/parser/token-score--helpers.rb', line 5

# Builds a "generic" score from the score1/score2 captures.
#
# The score may be full-time, after extra-time or undecided - that is
# why the result is a plain two-element array (e.g. [1,2]) and NOT
# a hash such as { ft: [1,2] }.
#
# @param m [MatchData] match object (anything answering m[:name] works)
# @return [Array<Integer>] [score1, score2]
def self._build_score( m )
  [:score1, :score2].map { |name| m[name].to_i(10) }
end
._build_score_abd(m) ⇒ Object
score abandoned (abd/abd.)
30 31 32 33 34 35 36 37 |
# File 'lib/sportdb/parser/token-score--helpers.rb', line 30

# Builds the score for an abandoned (abd/abd.) match.
#
# Uses the "generic" two-element score array for now; no abandoned flag
# is added here (the source notes that a separate SCORE_ABD token is used).
#
# @param m [MatchData] match object (anything answering m[:name] works)
# @return [Array<Integer>] [score1, score2]
def self._build_score_abd( m )
  home = m[:score1].to_i(10)
  away = m[:score2].to_i(10)
  [home, away]
end
._build_score_awd(m) ⇒ Object
score awarded (awd/awd.)
20 21 22 23 24 25 26 27 28 |
# File 'lib/sportdb/parser/token-score--helpers.rb', line 20

# Builds the score for an awarded (awd/awd.) match,
# e.g. to match  A 3-0 B [awarded].
#
# Uses the "generic" two-element score array for now; no awarded flag
# is added here (the source notes that a separate SCORE_AWD token is used).
#
# @param m [MatchData] match object (anything answering m[:name] works)
# @return [Array<Integer>] [score1, score2]
def self._build_score_awd( m )
  [:score1, :score2].map do |name|
    m[name].to_i(10)
  end
end
._build_score_full(m) ⇒ Object
40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 |
# File 'lib/sportdb/parser/token-score--helpers.rb', line 40

# Builds a "full" score hash from a match.
#
# For each of p (penalties), et (extra time), ft (full time) and
# ht (half time) a two-element integer array is added when BOTH
# captures (e.g. ft1 and ft2) are present. The flags :golden / :silver
# are set when the aetgg / aetsg captures matched.
#
# @param m [MatchData] match object (anything answering m[:name] works)
# @return [Hash]
def self._build_score_full( m )
  score = {}

  [:p, :et, :ft, :ht].each do |part|
    left  = m[:"#{part}1"]
    right = left && m[:"#{part}2"]
    next unless right
    score[part] = [left.to_i(10), right.to_i(10)]
  end

  ## add golden/silver flags
  score[:golden] = true  if m[:aetgg]   ## golden goal (gg)/sudden death (sd)
  score[:silver] = true  if m[:aetsg]   ## silver goal (sg)

  score
end
._build_score_fuller(m) ⇒ Object
58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 |
# File 'lib/sportdb/parser/token-score--helpers.rb', line 58

# Builds a "fuller" score hash from a match.
#
# For each of p, et, ft, ht and agg a two-element integer array is added
# when BOTH captures (e.g. agg1 and agg2) are present.
# :away is the away-goals score [away1, away2] if both were captured;
# otherwise it falls back to true when the plain "away" capture matched.
# The flags :golden / :silver are set when aetgg / aetsg matched.
#
# @param m [MatchData] match object (anything answering m[:name] works)
# @return [Hash]
def self._build_score_fuller( m )
  score = {}

  [:p, :et, :ft, :ht, :agg].each do |part|
    left  = m[:"#{part}1"]
    right = left && m[:"#{part}2"]
    next unless right
    score[part] = [left.to_i(10), right.to_i(10)]
  end

  away1 = m[:away1]
  away2 = away1 && m[:away2]
  if away2
    score[:away] = [away1.to_i(10), away2.to_i(10)]
  elsif m[:away]      ## fallback if no away score; check away flag
    score[:away] = true
  end

  ## add golden/silver flags
  score[:golden] = true  if m[:aetgg]   ## golden goal (gg)/sudden death (sd)
  score[:silver] = true  if m[:aetsg]   ## silver goal (sg)

  score
end
._build_score_fuller_more(m) ⇒ Object
86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 |
# File 'lib/sportdb/parser/token-score--helpers.rb', line 86

# Builds the score hash for a SCORE + SCORE_FULLER_MORE combination.
# note - the after extra-time (aet) or full-time (ft) score itself
#        may be present in the (separate) SCORE!!!
#
# Same parts as the "fuller" score (p, et, ft, ht, agg, away, golden,
# silver) plus :score - a marker ('et', 'ft' or 'ht') used for the
# "dangling" (generic) score. If several markers match the later one
# wins, that is, ht over ft over et.
#
# @param m [MatchData] match object (anything answering m[:name] works)
# @return [Hash]
def self._build_score_fuller_more( m )
  score = {}

  [:p, :et, :ft, :ht, :agg].each do |part|
    left  = m[:"#{part}1"]
    right = left && m[:"#{part}2"]
    next unless right
    score[part] = [left.to_i(10), right.to_i(10)]
  end

  away1 = m[:away1]
  away2 = away1 && m[:away2]
  if away2
    score[:away] = [away1.to_i(10), away2.to_i(10)]
  elsif m[:away]      ## fallback if no away score; check away flag
    score[:away] = true
  end

  ## add golden/silver flags
  score[:golden] = true  if m[:aetgg]   ## golden goal (gg)/sudden death (sd)
  score[:silver] = true  if m[:aetsg]   ## silver goal (sg)

  ## add flag in score for et/ft/ht
  kind = nil
  kind = 'et'  if m[:aet] || m[:aetgg] || m[:aetsg]
  kind = 'ft'  if m[:ft]
  kind = 'ht'  if m[:ht]
  score[:score] = kind  if kind

  score
end
._build_score_legs(m) ⇒ Object
123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 |
# File 'lib/sportdb/parser/token-score--helpers.rb', line 123

# Builds the score hash for a two-legged tie.
#
#   'leg1' - { ft: [leg1_ft1, leg1_ft2] }   (always)
#   'leg2' - any of :ft, :et, :p for which BOTH captures are present
#   :agg   - aggregate [agg1, agg2] (optional) - kept on "top-level"
#   :away  - true if the "away" capture matched (optional)
#
# note - the leg keys are strings, the agg/away keys are symbols.
#
# @param m [MatchData] match object (anything answering m[:name] works)
# @return [Hash]
def self._build_score_legs( m )
  first_leg = { ft: [m[:leg1_ft1].to_i(10), m[:leg1_ft2].to_i(10)] }

  second_leg = {}
  [[:ft, :leg2_ft1, :leg2_ft2],
   [:et, :leg2_et1, :leg2_et2],
   [:p,  :leg2_p1,  :leg2_p2]].each do |key, group1, group2|
    left  = m[group1]
    right = left && m[group2]
    second_leg[key] = [left.to_i(10), right.to_i(10)]  if right
  end

  legs = { 'leg1' => first_leg, 'leg2' => second_leg }

  ## check for (opt) aggregate - keep on "top-level"
  agg1 = m[:agg1]
  agg2 = agg1 && m[:agg2]
  legs[:agg]  = [agg1.to_i(10), agg2.to_i(10)]  if agg2
  legs[:away] = true  if m[:away]

  legs
end
._build_status(m) ⇒ Object
100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 |
# File 'lib/sportdb/parser/token-status.rb', line 100

# Builds a normalized status hash from a match.
#
# :status is the normalized text of the first capture that matched,
# checked in this order: postponed, canceled, walkover, awarded,
# suspended, abandoned, annulled / voided (both => 'annulled'), replay.
# If none matched the raw "status" capture is used as fallback
# (shouldn't happen). :status_note is added when captured,
# e.g. awarded; originally 2-0.
#
# @param m [MatchData] match object (anything answering m[:name] works)
# @return [Hash]
def self._build_status( m )
  ## note - insertion order is the lookup order
  names = {
    postponed: 'postponed',
    canceled:  'canceled',
    walkover:  'walkover',
    awarded:   'awarded',
    suspended: 'suspended',
    abandoned: 'abandoned',
    annulled:  'annulled',
    voided:    'annulled',
    replay:    'replay',
  }

  hit = names.find { |group, _text| m[group] }

  status = { status: hit ? hit.last : m[:status] }

  ## includes note? e.g. awarded; originally 2-0
  note = m[:status_note]
  status[:status_note] = note  if note

  status
end
._build_time(m) ⇒ Object
96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 |
# File 'lib/sportdb/parser/token-time.rb', line 96

# Builds a time hash from a match; unifies formats such as
# 12.40 or 12h40 into hour / minute integers
# (kept as plain numbers - no time-only type in ruby).
#
#   { time:       { h:, m:, timezone: (optional) },
#     time_local: { h:, m:, timezone: (optional) } }   ## only if time_local matched
#
# e.g. 18:30 (19:30) or 18:30 (19:30 BST)
#
# @param m [MatchData] match object (anything answering m[:name] works)
# @return [Hash]
# @raise [ArgumentError] if the time OR the local time is not within 0:00-23:59
def self._build_time( m )
  data = { time: {} }

  hour   = m[:hour].to_i(10)    ## allow 08/07/etc.
  minute = m[:minute].to_i(10)

  ## check if 24:00 possible? or only 0:00 (23:59)
  unless (0..23).cover?( hour ) && (0..59).cover?( minute )
    raise ArgumentError, "parse error - time >#{m[:time]}< out-of-range"
  end

  data[:time][:h] = hour
  data[:time][:m] = minute
  data[:time][:timezone] = m[:timezone]  if m[:timezone]

  if m[:time_local]
    local_hour   = m[:local_hour].to_i(10)    ## allow 08/07/etc.
    local_minute = m[:local_minute].to_i(10)

    ## fix - validate the LOCAL hour/minute here
    ##   (was a re-check of the already validated hour/minute)
    unless (0..23).cover?( local_hour ) && (0..59).cover?( local_minute )
      raise ArgumentError, "parse error - local time >#{m[:time_local]}< out-of-range"
    end

    data[:time_local] = { h: local_hour, m: local_minute }
    data[:time_local][:timezone] = m[:local_timezone]  if m[:local_timezone]
  end

  data
end
._mk_score_fuller_agg(win:) ⇒ Object
regex score helpers
note - MUST double escape \d e.g. \\d!!! if not "simple" string (e.g. '' but %Q<>)
24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 |
# Returns the regex SOURCE (a %Q<> String, not a Regexp) for an aggregate
# score in one of three layouts:
#   opt 1)  [win ]N-N on agg / on aggregate  - the leading "win " is only
#                                              allowed (optionally) if win: is true
#   opt 2)  N-N followed by AGG_EN           - "classic" (post)
#   opt 3)  agg N-N                          - up-front (pre)
# The named groups agg1/agg2 are repeated in every alternative.
# note - \d is double escaped (\\d) because the string is interpolated.
# NOTE(review): this listing is collapsed onto one line; the ## comments
#   inside the string presumably sit on their own lines in the file - confirm.
# File 'lib/sportdb/parser/token-score_fuller.rb', line 24 def self._mk_score_fuller_agg( win: ) ## with optional win - true|false %Q< (?: ############ ## opt 1) with win (?: #{ win ? '(?: win [ ] )?' : '' } (?<agg1>\\d{1,2}) - (?<agg2>\\d{1,2}) [ ] on [ ] agg (?: regate )? ) | ##### ## opt 2) "classic" (post) (?: (?<agg1>\\d{1,2}) - (?<agg2>\\d{1,2}) [ ]* #{AGG_EN} ) | ##### ## opt 3) agg up-front (pre) (?: agg [ ] (?<agg1>\\d{1,2}) - (?<agg2>\\d{1,2}) ) ) > end |
._mk_score_fuller_p(win:) ⇒ Object
with optional win - true|false
53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 |
# Returns the regex SOURCE (a %Q<> String, not a Regexp) for a penalty
# shootout score in one of three layouts:
#   opt 1)  [win ]N-N on pens          - the leading "win " is only
#                                        allowed (optionally) if win: is true
#   opt 2)  N-N followed by P_EN       - "classic" (post)
#   opt 3)  pen N-N  or  p N-N         - up-front (pre)
# The named groups p1/p2 are repeated in every alternative.
# note - \d is double escaped (\\d) because the string is interpolated.
# NOTE(review): this listing is collapsed onto one line; the ## comments
#   inside the string presumably sit on their own lines in the file - confirm.
# File 'lib/sportdb/parser/token-score_fuller.rb', line 53 def self._mk_score_fuller_p( win: ) ## with optional win - true|false %Q< (?: ############ ## opt 1) with win (?: #{ win ? '(?: win [ ] )?' : '' } (?<p1>\\d{1,2}) - (?<p2>\\d{1,2}) [ ] on [ ] pens ) | ##### ## opt 2) "classic" (post) (?: (?<p1>\\d{1,2}) - (?<p2>\\d{1,2}) [ ]* #{P_EN} ) | ##### ## opt 3) up-front (pre) (?: (?: pen|p) [ ] (?<p1>\\d{1,2}) - (?<p2>\\d{1,2}) ) ) > end |
._parse_date(str) ⇒ Object
111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 |
# File 'lib/sportdb/parser/token-date--helpers.rb', line 111

# Parses a date string into a hash of date parts (see _build_date).
#
# Leading/trailing spaces are stripped first. The DATE_RE match must
# cover the WHOLE (stripped) string; a partial match counts as a miss.
#
# @param str [String] text to parse
# @return [Hash, nil] captured date components or nil if not (fully) matched
def self._parse_date( str )
  m = DATE_RE.match( str.strip )
  return nil  unless m    ## no match

  ## note - match BUT not anchored to start and end-of-string!!!
  ##   report, error somehow??
  return nil  unless m.pre_match == '' && m.post_match == ''

  _build_date( m )
end
._parse_goal_count(str) ⇒ Object
95 96 97 98 99 100 101 102 103 104 105 106 107 |
# File 'lib/sportdb/parser/token-goals--helpers.rb', line 95

# Parses a goal count string (see _build_goal_count).
#
# Leading/trailing spaces are stripped first. The GOAL_COUNT_RE match
# must cover the WHOLE (stripped) string; a partial match counts as a miss.
#
# @param str [String] text to parse
# @return [Hash, nil] goal count or nil if not (fully) matched
def self._parse_goal_count( str )
  m = GOAL_COUNT_RE.match( str.strip )

  ## note - a match that is NOT anchored to start and end-of-string
  ##   gets dropped (report, error somehow??)
  anchored = m && m.pre_match == '' && m.post_match == ''

  anchored ? _build_goal_count( m ) : nil
end
._parse_goal_minute(str) ⇒ Object
parse helpers
81 82 83 84 85 86 87 88 89 90 91 92 93 |
# File 'lib/sportdb/parser/token-goals--helpers.rb', line 81

# Parses a goal minute string (see _build_goal_minute).
#
# Leading/trailing spaces are stripped first. The GOAL_MINUTE_RE match
# must cover the WHOLE (stripped) string; a partial match counts as a miss.
#
# @param str [String] text to parse
# @return [Hash, nil] goal minute or nil if not (fully) matched
def self._parse_goal_minute( str )
  m = GOAL_MINUTE_RE.match( str.strip )

  ## no match -or- match BUT not anchored to start and end-of-string!!!
  ##   report, error somehow??
  return nil  unless m && m.pre_match == '' && m.post_match == ''

  _build_goal_minute( m )
end
._parse_score_full(str) ⇒ Object
add parser helpers
167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 |
# File 'lib/sportdb/parser/token-score--helpers.rb', line 167

# Parses a "full" score string (see _build_score_full).
#
# Leading/trailing spaces are stripped first. The string is matched
# against the union of SCORE_FULL_1ST_RE and SCORE_FULL_RE (in that
# order); the match must cover the WHOLE (stripped) string -
# a partial match counts as a miss.
#
# note - the leftover debug dump of the match (pp m) got removed;
#        a parse helper should not write to stdout.
#
# @param str [String] text to parse
# @return [Hash, nil] score hash or nil if not (fully) matched
def self._parse_score_full( str )
  m = Regexp.union( SCORE_FULL_1ST_RE,
                    SCORE_FULL_RE ).match( str.strip )

  ## no match -or- match BUT not anchored to start and end-of-string!!!
  ##   report, error somehow??
  return nil  unless m && m.pre_match == '' && m.post_match == ''

  _build_score_full( m )
end
._parse_team(str) ⇒ Object
helper for testing regex match for team names
205 206 207 208 209 210 211 212 213 214 215 216 217 |
# File 'lib/sportdb/parser/token-text.rb', line 205

# Helper for testing the regex match for team names.
#
# Leading/trailing spaces are stripped first. Returns the TEXT_RE match
# itself - but only if it covers the WHOLE (stripped) string.
#
# @param str [String] text to check
# @return [MatchData, nil] the match or nil if not (fully) matched
def self._parse_team( str )
  m = TEXT_RE.match( str.strip )

  ## note - a match that is NOT anchored to start and end-of-string
  ##   is treated like no match (report, error somehow??)
  fully_matched = m && m.pre_match == '' && m.post_match == ''

  fully_matched ? m : nil
end
.build_map(lines, downcase: false) ⇒ Object
39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 |
# File 'lib/sportdb/parser/token-date--names.rb', line 39

# Builds a lookup map from every word to its line number (starting with 1
# and NOT zero-based), e.g.
#
#   {"january"  => 1, "jan" => 1,
#    "february" => 2, "feb" => 2,
#    "march"    => 3, "mar" => 3, ... }
#
# @param lines [Array<Array<String>>] one array of words per line
# @param downcase [Boolean] downcase the words used as keys (default false)
# @return [Hash{String => Integer}]
def self.build_map( lines, downcase: false )
  map = {}
  lines.each.with_index( 1 ) do |names, no|
    names.each do |name|
      key = downcase ? name.downcase : name
      map[ key ] = no
    end
  end
  map
end
.build_names(lines) ⇒ Object
32 33 34 35 36 |
# File 'lib/sportdb/parser/token-date--names.rb', line 32

# Joins all words of all lines into a single regex alternation string, e.g.
#   January|Jan|February|Feb|March|Mar|April|Apr|May|June|Jun|...
#
# @param lines [Array<Array<String>>] one array of words per line
# @return [String] words separated by |
def self.build_names( lines )
  per_line = lines.each_with_object( [] ) do |words, acc|
    acc << words.join( '|' )
  end
  per_line.join( '|' )
end
.parse_date(str, start: nil) ⇒ Object
note: parse_date - returns Date object
_parse_date (with underscore) - return hash of "parsed" regex match data!!
72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 |
# File 'lib/sportdb/parser/token-date--helpers.rb', line 72

# Parses a date string into a Date object.
# note - _parse_date (with underscore) returns the hash of "parsed"
#        regex match data; this method turns that hash into a Date.
#
# Year resolution:
#   1) four-digit year (:y) if present
#   2) two-digit year (:yy) - 00 to 30 is 2000 to 2030
#                             and 31 to 99 is 1931 to 1999
#   3) derived from start - same year if month/day is on or after
#      start's month/day, else start.year+1 (e.g. 2014 for 2013/14 season)
#
# @param str [String] text to parse
# @param start [Date, nil] reference date; required if str has no year
# @return [Date]
# @raise [ArgumentError] if str cannot be parsed or
#                        the year is missing and no start is passed along
def self.parse_date( str, start: nil )
  parts = _parse_date( str )
  raise ArgumentError, "unexpected date format; cannot parse >#{str}<"  unless parts

  year = parts[:y]
  yy   = parts[:yy]

  ## support two digit shortcut for year
  year = (yy <= 30 ? 2000+yy : 1900+yy)  if yy && year.nil?

  month = parts[:m]
  day   = parts[:d]

  if year.nil?   ## try to calculate year
    raise ArgumentError, "year required in date >#{str}< or pass along start date"  if start.nil?

    on_or_after_start = month > start.month ||
                        (month == start.month && day >= start.day)
    year = on_or_after_start ? start.year : start.year+1
  end

  Date.new( year, month, day )
end
.parse_names(txt) ⇒ Object
5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 |
# File 'lib/sportdb/parser/token-date--names.rb', line 5

# Parses a text block of names into an array of lines (with words).
#
# - blank lines and lines starting with # (comments) are skipped
# - inline (until end-of-line) comments are stripped too,
#     e.g. Janvier Janv Jan  ## check janv in use??
#       => Janvier Janv Jan
# - words are split on spaces/tabs
#
# todo/fix -- add check for duplicates
#
# @param txt [String] one line per name with all its variants
# @return [Array<Array<String>>] array of lines (with words)
def self.parse_names( txt )
  txt.each_line
     .map    { |raw| raw.strip }
     .reject { |line| line.empty? || line.start_with?( '#' ) }
     .map    { |line| line.sub( /#.*/, '' ).strip.split( /[ \t]+/ ) }
end
Instance Method Details
#_build_date(m) ⇒ Object
59 |
# File 'lib/sportdb/parser/token-date--helpers.rb', line 59
## instance-level shortcut - forwards to the class-level _build_date
def _build_date( m )
  self.class._build_date( m )
end
#_build_date_legs(m) ⇒ Object
60 |
# File 'lib/sportdb/parser/token-date--helpers.rb', line 60
## instance-level shortcut - forwards to the class-level _build_date_legs
def _build_date_legs( m )
  self.class._build_date_legs( m )
end
#_build_duration(m) ⇒ Object
61 |
# File 'lib/sportdb/parser/token-date--helpers.rb', line 61
## instance-level shortcut - forwards to the class-level _build_duration
def _build_duration( m )
  self.class._build_duration( m )
end
#_build_goal_count(m) ⇒ Object
71 |
# File 'lib/sportdb/parser/token-goals--helpers.rb', line 71
## instance-level shortcut - forwards to the class-level _build_goal_count
def _build_goal_count( m )
  self.class._build_goal_count( m )
end
#_build_goal_minute(m) ⇒ Object
68 |
# File 'lib/sportdb/parser/token-goals--helpers.rb', line 68
## instance-level shortcut - forwards to the class-level _build_goal_minute
def _build_goal_minute( m )
  self.class._build_goal_minute( m )
end
#_build_goal_minute_na(m) ⇒ Object
69 |
# File 'lib/sportdb/parser/token-goals--helpers.rb', line 69
## instance-level shortcut - forwards to the class-level _build_goal_minute_na
def _build_goal_minute_na( m )
  self.class._build_goal_minute_na( m )
end
#_build_goal_type(m) ⇒ Object
72 |
# File 'lib/sportdb/parser/token-goals--helpers.rb', line 72
## instance-level shortcut - forwards to the class-level _build_goal_type
def _build_goal_type( m )
  self.class._build_goal_type( m )
end
#_build_minute(m) ⇒ Object
70 |
# File 'lib/sportdb/parser/token-goals--helpers.rb', line 70
## instance-level shortcut - forwards to the class-level _build_minute
def _build_minute( m )
  self.class._build_minute( m )
end
#_build_score(m) ⇒ Object
153 |
# File 'lib/sportdb/parser/token-score--helpers.rb', line 153
## instance-level shortcut - forwards to the class-level _build_score
def _build_score( m )
  self.class._build_score( m )
end
#_build_score_abd(m) ⇒ Object
155 |
# File 'lib/sportdb/parser/token-score--helpers.rb', line 155
## instance-level shortcut - forwards to the class-level _build_score_abd
def _build_score_abd( m )
  self.class._build_score_abd( m )
end
#_build_score_awd(m) ⇒ Object
154 |
# File 'lib/sportdb/parser/token-score--helpers.rb', line 154
## instance-level shortcut - forwards to the class-level _build_score_awd
def _build_score_awd( m )
  self.class._build_score_awd( m )
end
#_build_score_full(m) ⇒ Object
156 |
# File 'lib/sportdb/parser/token-score--helpers.rb', line 156
## instance-level shortcut - forwards to the class-level _build_score_full
def _build_score_full( m )
  self.class._build_score_full( m )
end
#_build_score_fuller(m) ⇒ Object
157 |
# File 'lib/sportdb/parser/token-score--helpers.rb', line 157
## instance-level shortcut - forwards to the class-level _build_score_fuller
def _build_score_fuller( m )
  self.class._build_score_fuller( m )
end
#_build_score_fuller_more(m) ⇒ Object
158 |
# File 'lib/sportdb/parser/token-score--helpers.rb', line 158
## instance-level shortcut - forwards to the class-level _build_score_fuller_more
def _build_score_fuller_more( m )
  self.class._build_score_fuller_more( m )
end
#_build_score_legs(m) ⇒ Object
159 |
# File 'lib/sportdb/parser/token-score--helpers.rb', line 159
## instance-level shortcut - forwards to the class-level _build_score_legs
def _build_score_legs( m )
  self.class._build_score_legs( m )
end
#_build_status(m) ⇒ Object
121 |
# File 'lib/sportdb/parser/token-status.rb', line 121
## instance-level shortcut - forwards to the class-level _build_status
def _build_status( m )
  self.class._build_status( m )
end
#_build_time(m) ⇒ Object
139 |
# File 'lib/sportdb/parser/token-time.rb', line 139
## instance-level shortcut - forwards to the class-level _build_time
def _build_time( m )
  self.class._build_time( m )
end
#_info(*args) ⇒ Object
32 33 34 35 |
# File 'lib/sportdb/parser/lexer.rb', line 32
##
# Print an info message to stdout - prefixed with "[INFO] Lexer -- ".
# Every argument gets printed once (via puts) on its own line.
#
# fix: was `puts args` inside the loop - printing ALL arguments
#      once for EVERY argument (duplicated output for 2+ arguments)
def _info( *args )
  print "[INFO] Lexer -- "
  args.each { |arg| puts arg }
end
#_on_goal(m, ctx:) ⇒ Object
19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 |
# File 'lib/sportdb/parser/lexer-on_goal.rb', line 19
##
# Turn one match (m) in goal mode (GOAL_RE) into a token.
#
# m   - match data; read via named groups
# ctx - (line) context; provides lineno and warn_on_else
#
# Returns a Token or nil (for skipped space(s) or unhandled matches).
# note - a closing ) leaves goal mode (switches @re back to RE).
def _on_goal( m, ctx: )
  return nil  if m[:space] || m[:spaces]    ## skip space(s)

  if m[:goals_none]
    ## note - eats-up semicolon!!  e.g. -; or - ;
    ##   (passes along the literal text)
    return Token.new( :GOALS_NONE, m[:goals_none],
                      lineno: ctx.lineno,
                      offset: m.offset(:goals_none))
  end

  if m[:goal_sep_alt]
    ## e.g. dash (-) WITH leading & trailing space required
    return Token.new( :GOAL_SEP_ALT, m[:goal_sep_alt],
                      lineno: ctx.lineno,
                      offset: m.offset(:goal_sep_alt))
  end

  if m[:prop_name]
    ## note - change prop_name to player
    return Token.new( :PLAYER, m[:name],
                      lineno: ctx.lineno,
                      offset: m.offset(:name))
  end

  if m[:goal_minute]
    return Token.new( :GOAL_MINUTE, m[:goal_minute],
                      lineno: ctx.lineno,
                      offset: m.offset(:goal_minute),
                      value: _build_goal_minute( m ))
  end

  if m[:goal_minute_na]
    ## note - (re)use GOAL_MINUTE token; no extra GOAL_MINUTE_NA or such
    ##   make sure to handle 'm' => nil upstream!!!
    return Token.new( :GOAL_MINUTE, m[:goal_minute_na],
                      lineno: ctx.lineno,
                      offset: m.offset(:goal_minute_na),
                      value: _build_goal_minute_na( m ))
  end

  if m[:goal_count]
    return Token.new( :GOAL_COUNT, m[:goal_count],
                      lineno: ctx.lineno,
                      offset: m.offset(:goal_count),
                      value: _build_goal_count( m ))
  end

  if m[:sym]
    if m[:sym] == ')'
      ## leave goal mode!! - pass along "virtual" GOALS_END token
      _trace( "LEAVE GOAL_RE MODE" )
      @re = RE
      return Token.virtual( :GOALS_END, lineno: ctx.lineno )
    end
    return Token.literal( m[:sym], lineno: ctx.lineno,
                                   offset: m.offset(:sym))
  end

  ctx.warn_on_else( m, mode: 'GOAL' )
  nil
end
#_on_goal_alt(m, ctx:) ⇒ Object
82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 |
# File 'lib/sportdb/parser/lexer-on_goal.rb', line 82
##
# Turn one match (m) in alternate goal mode (GOAL_ALT_RE -
# goals with scores) into a token.
#
# m   - match data; read via named groups
# ctx - (line) context; provides lineno and warn_on_else
#
# Returns a Token or nil (for skipped space(s) or unhandled matches).
# note - a closing ) leaves goal mode (switches @re back to RE).
def _on_goal_alt( m, ctx: )
  return nil  if m[:space] || m[:spaces]    ## skip space(s)

  if m[:prop_name]
    ## note - change prop_name to player
    return Token.new( :PLAYER, m[:name],
                      lineno: ctx.lineno,
                      offset: m.offset(:name))
  end

  if m[:goal_minute]
    return Token.new( :GOAL_MINUTE, m[:goal_minute],
                      lineno: ctx.lineno,
                      offset: m.offset(:goal_minute),
                      value: _build_goal_minute( m ))
  end

  if m[:goal_type]
    return Token.new( :GOAL_TYPE, m[:goal_type],
                      lineno: ctx.lineno,
                      offset: m.offset(:goal_type),
                      value: _build_goal_type( m ))
  end

  if m[:score]
    return Token.new( :SCORE, m[:score],
                      lineno: ctx.lineno,
                      offset: m.offset(:score),
                      value: _build_score( m ))
  end

  if m[:sym]
    if m[:sym] == ')'
      ## leave goal mode!! - pass along "virtual" GOALS_END token
      _trace( "LEAVE GOAL_ALT_RE MODE" )
      @re = RE
      return Token.virtual( :GOALS_END, lineno: ctx.lineno )
    end
    return Token.literal( m[:sym], lineno: ctx.lineno,
                                   offset: m.offset(:sym))
  end

  ctx.warn_on_else( m, mode: 'GOAL_ALT' )
  nil
end
#_on_goal_compat(m, ctx:) ⇒ Object
note - m is MatchData object
132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 |
# File 'lib/sportdb/parser/lexer-on_goal.rb', line 132
##
# Turn one match (m) in "legacy" goal mode (GOAL_COMPAT_RE -
# goal lines starting with the minute) into a token.
#
# m   - MatchData object; read via named groups
# ctx - (line) context; provides lineno and warn_on_else
#
# Returns a Token or nil (for skipped space(s) or unhandled matches).
# note - a closing ) leaves goal mode (switches @re back to RE).
def _on_goal_compat( m, ctx: )
  return nil  if m[:space] || m[:spaces]    ## skip space(s)

  if m[:prop_name]
    ## note - change prop_name to player
    return Token.new( :PLAYER, m[:name],
                      lineno: ctx.lineno,
                      offset: m.offset(:name))
  end

  if m[:minute]
    return Token.new( :MINUTE, m[:minute],
                      lineno: ctx.lineno,
                      offset: m.offset(:minute),
                      value: _build_minute( m ))
  end

  if m[:goal_type]
    return Token.new( :GOAL_TYPE, m[:goal_type],
                      lineno: ctx.lineno,
                      offset: m.offset(:goal_type),
                      value: _build_goal_type( m ))
  end

  if m[:score]
    return Token.new( :SCORE, m[:score],
                      lineno: ctx.lineno,
                      offset: m.offset(:score),
                      value: _build_score( m ))
  end

  if m[:sym]
    if m[:sym] == ')'
      ## leave goal mode!! - pass along "virtual" GOALS_END token
      _trace( "LEAVE GOAL_COMPAT_RE MODE" )
      @re = RE
      return Token.virtual( :GOALS_END, lineno: ctx.lineno )
    end
    return Token.literal( m[:sym], lineno: ctx.lineno,
                                   offset: m.offset(:sym))
  end

  ctx.warn_on_else( m, mode: 'GOAL_COMPAT' )
  nil
end
#_on_group_def(m, ctx:) ⇒ Object
note - m is MatchData object
14 15 16 17 18 19 20 21 22 23 24 25 26 27 |
# File 'lib/sportdb/parser/lexer-on_group_def.rb', line 14
##
# Turn one match (m) in group definition mode (GROUP_DEF_RE) into a token.
#   text runs become TEAM tokens; symbols get passed along as literals.
#
# m   - MatchData object; read via named groups
# ctx - (line) context; provides lineno and warn_on_else
#
# Returns a Token or nil (for skipped space(s) or unhandled matches).
def _on_group_def( m, ctx: )
  return nil  if m[:spaces] || m[:space]    ## skip spaces

  if m[:text]
    return Token.new( :TEAM, m[:text],
                      lineno: ctx.lineno,
                      offset: m.offset(:text))
  end

  if m[:sym]
    return Token.literal( m[:sym], lineno: ctx.lineno,
                                   offset: m.offset(:sym))
  end

  ctx.warn_on_else( m, mode: 'GROUP_DEF' )
  nil
end
#_on_prop_attendance(m, ctx:) ⇒ Object
note - m is MatchData object
45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 |
# File 'lib/sportdb/parser/lexer-on_prop_misc.rb', line 45
##
# Turn one match (m) in attendance prop mode (PROP_ATTENDANCE_RE) into a token.
#
# m   - MatchData object; read via named groups
# ctx - (line) context; provides lineno and warn_on_else
#
# Returns a Token or nil (for skipped space(s) or unhandled matches).
def _on_prop_attendance( m, ctx: )
  return nil  if m[:space] || m[:spaces]    ## skip space(s)

  if m[:enclosed_name]
    ## reserved for use for sold out or such (in the future) - why? why not?
    return Token.new( :ENCLOSED_NAME, m[:name],
                      lineno: ctx.lineno,
                      offset: m.offset(:name))
  end

  if m[:num]
    return Token.new( :PROP_NUM, m[:num],
                      lineno: ctx.lineno,
                      offset: m.offset(:num),
                      value: m[:value].to_i(10))
  end

  ctx.warn_on_else( m, mode: 'PROP_ATTENDANCE' )
  nil
end
#_on_prop_cards(m, ctx:) ⇒ Object
note - m is MatchData object
17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 |
# File 'lib/sportdb/parser/lexer-on_prop_misc.rb', line 17
##
# Turn one match (m) in cards prop mode (PROP_CARDS_RE) into a token.
#
# m   - MatchData object; read via named groups
# ctx - (line) context; provides lineno and warn_on_else
#
# Returns a Token or nil (for skipped space(s) or unhandled matches).
def _on_prop_cards( m, ctx: )
  return nil  if m[:space] || m[:spaces]    ## skip space(s)

  if m[:prop_name]
    ## note - token text is the name; offset is the (full) prop_name match
    return Token.new( :PROP_NAME, m[:name],
                      lineno: ctx.lineno,
                      offset: m.offset(:prop_name))
  end

  if m[:minute]
    return Token.new( :MINUTE, m[:minute],
                      lineno: ctx.lineno,
                      offset: m.offset(:minute),
                      value: _build_minute( m ))
  end

  if m[:sym]
    return Token.literal( m[:sym], lineno: ctx.lineno,
                                   offset: m.offset(:sym))
  end

  ctx.warn_on_else( m, mode: 'PROP_CARDS' )
  nil
end
#_on_prop_lineup(m, ctx:) ⇒ Object
note - m is MatchData object
22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 |
# File 'lib/sportdb/parser/lexer-on_prop_lineup.rb', line 22
##
# Turn one match (m) in (team) line-up prop mode (PROP_LINEUP_RE) into a token.
#
# m   - MatchData object; read via named groups
# ctx - (line) context; provides lineno and warn_on_else
#
# Returns a Token or nil (for skipped space(s), unknown inline
#   prop keys or unhandled matches).
def _on_prop_lineup( m, ctx: )
  return nil  if m[:space] || m[:spaces]    ## skip space(s)

  if m[:prop_key]   ## check for inline prop keys
    key = m[:key]
    ## supported for now coach/trainer (add manager?)
    ##  todo - report error for unknown (inline) prop key in lineup
    return nil  unless ['coach', 'trainer'].include?( key.downcase )

    ## use PROP_COACH or COACH_KEY or such - why? why not?
    return Token.new( :COACH, m[:key],
                      lineno: ctx.lineno,
                      offset: m.offset(:key))
  end

  if m[:inline_captain]
    return Token.new( :INLINE_CAPTAIN, m[:inline_captain],
                      lineno: ctx.lineno,
                      offset: m.offset(:inline_captain))
  end

  ## inline cards - value is a hash with the (optional)
  ##   minute (:m) and offset (:offset) as integers
  card_value = lambda do
    card = {}
    card[:m]      = m[:minute].to_i(10)  if m[:minute]
    card[:offset] = m[:offset].to_i(10)  if m[:offset]
    card
  end

  [[:inline_yellow,     :INLINE_YELLOW],
   [:inline_red,        :INLINE_RED],
   [:inline_yellow_red, :INLINE_YELLOW_RED]].each do |group, type|
    next  unless m[group]

    return Token.new( type, m[group],
                      lineno: ctx.lineno,
                      offset: m.offset(group),
                      value: card_value.call )
  end

  if m[:prop_name]
    return Token.new( :PROP_NAME, m[:name],
                      lineno: ctx.lineno,
                      offset: m.offset(:prop_name))
  end

  if m[:minute]
    return Token.new( :MINUTE, m[:minute],
                      lineno: ctx.lineno,
                      offset: m.offset(:minute),
                      value: _build_minute( m ))
  end

  if m[:sym]
    return Token.literal( m[:sym], lineno: ctx.lineno,
                                   offset: m.offset(:sym))
  end

  ctx.warn_on_else( m, mode: 'PROP_LINEUP' )
  nil
end
#_on_prop_penalties(m, ctx:) ⇒ Object
note - m is MatchData object
16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 |
# File 'lib/sportdb/parser/lexer-on_prop_penalties.rb', line 16
##
# Turn one match (m) in penalties prop mode (PROP_PENALTIES_RE) into a token.
#
# m   - MatchData object; read via named groups
# ctx - (line) context; provides lineno and warn_on_else
#
# Returns a Token or nil (for skipped space(s) or unhandled matches).
#
# fix: the mode label passed to warn_on_else was 'PROP_PENALTIES '
#      (with a stray trailing space); now 'PROP_PENALTIES' - matching
#      the labels used by the other prop handlers
def _on_prop_penalties( m, ctx: )
  if m[:space] || m[:spaces]
    nil    ## skip space(s)
  elsif m[:prop_name]
    ## note - change prop_name to player
    Token.new(:PROP_NAME, m[:name],
                  lineno: ctx.lineno,
                  offset: m.offset(:prop_name))
  elsif m[:enclosed_name]
    ## use HOLD,SAVE,POST or such keys - why? why not?
    Token.new(:ENCLOSED_NAME, m[:name],
                  lineno: ctx.lineno,
                  offset: m.offset(:name))
  elsif m[:score]
    Token.new( :SCORE, m[:score],
                  lineno: ctx.lineno,
                  offset: m.offset(:score),
                  value: _build_score( m ))
  elsif m[:sym]
    Token.literal( m[:sym], lineno: ctx.lineno,
                            offset: m.offset(:sym))
  else
    ctx.warn_on_else( m, mode: 'PROP_PENALTIES' )
    nil
  end
end
#_on_prop_referee(m, ctx:) ⇒ Object
note - m is MatchData object
75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 |
# File 'lib/sportdb/parser/lexer-on_prop_misc.rb', line 75
##
# Turn one match (m) in referee prop mode (PROP_REFEREE_RE) into a token.
#
# m   - MatchData object; read via named groups
# ctx - (line) context; provides lineno and warn_on_else
#
# Returns a Token or nil (for skipped space(s), unknown inline
#   prop keys or unhandled matches).
def _on_prop_referee( m, ctx: )
  return nil  if m[:space] || m[:spaces]    ## skip space(s)

  if m[:prop_key]   ## check for inline prop keys
    key = m[:key]
    ## supported for now att/attn/attendance only
    ##  todo - report error for unknown (inline) prop key
    return nil  unless ['att', 'attn', 'attendance' ].include?( key.downcase )

    ## use ATTENDANCE_PROP or ATTENDANCE_KEY or such - why? why not?
    return Token.new( :ATTENDANCE, m[:key],
                      lineno: ctx.lineno,
                      offset: m.offset(:key))
  end

  if m[:prop_name]
    ## note - change prop_name to player or to (plain) name?
    return Token.new( :PROP_NAME, m[:name],
                      lineno: ctx.lineno,
                      offset: m.offset(:prop_name))
  end

  if m[:num]
    return Token.new( :PROP_NUM, m[:num],
                      lineno: ctx.lineno,
                      offset: m.offset(:num),
                      value: m[:value].to_i(10))
  end

  if m[:enclosed_name]
    ## use HOLD,SAVE,POST or such keys - why? why not?
    return Token.new( :ENCLOSED_NAME, m[:name],
                      lineno: ctx.lineno,
                      offset: m.offset(:name))
  end

  if m[:sym]
    return Token.literal( m[:sym], lineno: ctx.lineno,
                                   offset: m.offset(:sym))
  end

  ctx.warn_on_else( m, mode: 'PROP_REFEREE' )
  nil
end
#_on_round_def(m, ctx:) ⇒ Object
note - m is MatchData object
15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 |
# File 'lib/sportdb/parser/lexer-on_round_def.rb', line 15
##
# Turn one match (m) in round definition mode (ROUND_DEF_RE) into a token.
#
# m   - MatchData object; read via named groups
# ctx - (line) context; provides lineno and warn_on_else
#
# Returns a Token or nil (for skipped space(s) or unhandled matches).
def _on_round_def( m, ctx: )
  return nil  if m[:spaces] || m[:space]    ## skip spaces

  if m[:date]
    return Token.new( :DATE, m[:date],
                      lineno: ctx.lineno,
                      offset: m.offset(:date),
                      value: _build_date(m))
  end

  if m[:duration]
    return Token.new( :DURATION, m[:duration],
                      lineno: ctx.lineno,
                      offset: m.offset(:duration),
                      value: _build_duration( m ))
  end

  if m[:sym]
    return Token.literal( m[:sym], lineno: ctx.lineno,
                                   offset: m.offset(:sym))
  end

  ctx.warn_on_else( m, mode: 'ROUND_DEF' )
  nil
end
#_on_top(m, ctx:) ⇒ Object
note - m is MatchData object
5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 |
# File 'lib/sportdb/parser/lexer-on_top.rb', line 5
##
# Turn one match (m) in top-level mode (RE) into a token.
#
# m   - MatchData object; read via named groups
# ctx - (line) context; provides lineno and warn_on_else
#
# Returns a Token or nil (for skipped space(s) or unhandled matches).
#
# Side effects (mode switches):
#   - on @  switches @re to GEO_RE  and resets @geo_count to 0
#   - on (  switches @re to GOAL_RE and returns a "virtual" INLINE_GOALS token
def _on_top( m, ctx: )
  ## note - m is MatchData object
  ## note - top-level (for now always) assumes TEAM for TEXT match!!
  ##    fix/fix/fix  change TEXT_RE/:text to TEAM_RE/:team !!!
  if m[:space] || m[:spaces]
    nil   ## skip space(s)
  elsif m[:text] then Token.new(:TEAM, m[:text], lineno: ctx.lineno, offset: m.offset(:text))
  elsif m[:team_home] then Token.new(:TEAM_HOME, m[:team_home], lineno: ctx.lineno, offset: m.offset(:team_home))
  elsif m[:team_away] then Token.new(:TEAM_AWAY, m[:team_away], lineno: ctx.lineno, offset: m.offset(:team_away))
  elsif m[:team_neutral] then Token.new(:TEAM_NEUTRAL, m[:team_neutral], lineno: ctx.lineno, offset: m.offset(:team_neutral))
  ## (match) status e.g. cancelled, awarded, etc.
  ##   inline:  w/o               - walkover
  ##            n/p               - not played
  ##            bye
  ##            abd/abd.          - abandoned
  ##            void
  ##            susp/susp.        - suspended
  ##            ppd/ppd. or postp/postp. - postponed
  ##            awd/awd.          - awarded
  ##            canc/canc.        - cancelled/canceled
  elsif m[:inline_wo] then Token.new(:INLINE_WO, m[:inline_wo], lineno: ctx.lineno, offset: m.offset(:inline_wo))
  elsif m[:inline_np] then Token.new(:INLINE_NP, m[:inline_np], lineno: ctx.lineno, offset: m.offset(:inline_np))
  elsif m[:inline_bye] then Token.new(:INLINE_BYE, m[:inline_bye], lineno: ctx.lineno, offset: m.offset(:inline_bye))
  elsif m[:inline_abd] then Token.new(:INLINE_ABD, m[:inline_abd], lineno: ctx.lineno, offset: m.offset(:inline_abd))
  elsif m[:inline_void] then Token.new(:INLINE_VOID, m[:inline_void], lineno: ctx.lineno, offset: m.offset(:inline_void))
  elsif m[:inline_susp] then Token.new(:INLINE_SUSP, m[:inline_susp], lineno: ctx.lineno, offset: m.offset(:inline_susp))
  elsif m[:inline_ppd] then Token.new(:INLINE_PPD, m[:inline_ppd], lineno: ctx.lineno, offset: m.offset(:inline_ppd))
  elsif m[:inline_awd] then Token.new(:INLINE_AWD, m[:inline_awd], lineno: ctx.lineno, offset: m.offset(:inline_awd))
  elsif m[:inline_canc] then Token.new(:INLINE_CANC, m[:inline_canc], lineno: ctx.lineno, offset: m.offset(:inline_canc))
  elsif m[:status] then Token.new(:STATUS, m[:status], lineno: ctx.lineno, offset: m.offset(:status), value: _build_status( m ))
  elsif m[:note]
    ### todo/check:
    ##    use value hash - why? why not? or simplify to:
    ##       [:NOTE, [m[:note], {note: m[:note] } ]]
    Token.new(:NOTE, m[:note], lineno: ctx.lineno, offset: m.offset(:note))
  elsif m[:attendance]
    att = {}
    ## strip the (optional) underscore digit separators before converting
    att[:value] = m[:value].gsub( '_', '' ).to_i(10)
    ## note - for token id use INLINE_ATTENDANCE (ATTENDANCE in use for prop!!!)
    Token.new(:INLINE_ATTENDANCE, m[:attendance], lineno: ctx.lineno, offset: m.offset(:attendance), value: att)
  elsif m[:time] then Token.new(:TIME, m[:time], lineno: ctx.lineno, offset: m.offset(:time), value: _build_time(m))
  elsif m[:date] then Token.new(:DATE, m[:date], lineno: ctx.lineno, offset: m.offset(:date), value: _build_date(m))
  elsif m[:date_legs] then Token.new(:DATE_LEGS, m[:date_legs], lineno: ctx.lineno, offset: m.offset(:date_legs), value: _build_date_legs(m))
  elsif m[:score_legs] then Token.new(:SCORE_LEGS, m[:score_legs], lineno: ctx.lineno, offset: m.offset(:score_legs), value: _build_score_legs( m ))
  elsif m[:score_full] then Token.new(:SCORE_FULL, m[:score_full], lineno: ctx.lineno, offset: m.offset(:score_full), value: _build_score_full( m ))
  elsif m[:score_fuller] then Token.new(:SCORE_FULLER, m[:score_fuller], lineno: ctx.lineno, offset: m.offset(:score_fuller), value: _build_score_fuller( m ))
  elsif m[:score_fuller_more] then Token.new(:SCORE_FULLER_MORE, m[:score_fuller_more], lineno: ctx.lineno, offset: m.offset(:score_fuller_more), value: _build_score_fuller_more( m ))
  elsif m[:score] then Token.new(:SCORE, m[:score], lineno: ctx.lineno, offset: m.offset(:score), value: _build_score( m ))
  elsif m[:score_awd] then Token.new(:SCORE_AWD, m[:score_awd], lineno: ctx.lineno, offset: m.offset(:score_awd), value: _build_score_awd( m ))
  elsif m[:score_abd] then Token.new(:SCORE_ABD, m[:score_abd], lineno: ctx.lineno, offset: m.offset(:score_abd), value: _build_score_abd( m ))
  elsif m[:vs] then Token.new(:VS, m[:vs], lineno: ctx.lineno, offset: m.offset(:vs))
  elsif m[:sym]
    case m[:sym]
    ## return symbols "inline" as is - why? why not?
    when '@'   ## enter geo mode
      _trace( 'ENTER GEO_RE MODE' )
      @re = GEO_RE
      @geo_count = 0
      Token.literal( m[:sym], lineno: ctx.lineno, offset: m.offset(:sym))
    when '('
      ## enter goal scorer mode on "free-floating" open parenthesis!!!
      _trace( 'ENTER GOAL_RE MODE' )
      @re = GOAL_RE
      ## note - eat-up ( for now; do NOT pass along as token
      ##   pass along "virtual" INLINE GOALS - why? why not?
      Token.virtual( :INLINE_GOALS, lineno: ctx.lineno )
    else
      Token.literal( m[:sym], lineno: ctx.lineno, offset: m.offset(:sym))
    end
  else
    ## no known named group matched - report via context (no mode label passed)
    ctx.warn_on_else( m )
    nil
  end
end
#_prep_doc(txt) ⇒ Object
45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 |
# File 'lib/sportdb/parser/lexer-prep_doc.rb', line 45
##
# Preprocess the (complete) document text "automagically"
# before tokenizing line-by-line:
#   1) normalize unicode (decomposed chars to composed chars / NFC)
#   2) "universal" newlines - change windows-style cr+lf (\r\n) to lf (\n)
#   3) replace html-style comments <!-- --> (incl. multi-line) with two spaces
#   4) fold multi-line note/nota bene blocks into a single line
#   5) fold multi-line blocks into a single line
#
# note - empty lines and leading spaces (indent) are KEPT
#        (empty lines get turned into BLANK tokens later)
#
# fix: html comments now get replaced with two spaces as documented
#      (the replacement was a single space)
#
# txt - the document text (String)
# Returns the preprocessed text (String).
def _prep_doc( txt )
  ### normalize unicode (decomposed chars to composed chars)
  ##  note:  é is decomposed (in two chars e.g.)
  ##           e  (101)  + ́  (769)    vs   é  (233)
  txt = txt.unicode_normalize(:nfc)

  ## "universal" newlines
  txt = txt.gsub( "\r\n", "\n" )

  ###
  ## quick hack for now
  ##   will mess-up lineno tracking (for multi-line comments)!!!
  ##   todo/fix - replace with space and newline
  ##                to keep lineno intact - why? why not?
  txt = txt.gsub( HTML_COMMENT_RE ) do |m|
    _trace('preproc html comment:', m )
    '  '    ## replace with two spaces
  end

  #####
  ## (another) quick hack for now
  ##   turn multi-line blocks into single-line blocks
  ##   by changing newline (\n) to ↵ (U+21B5 - downwards arrow
  ##   with corner leftwards); single-line blocks pass through as is
  fold_newlines = lambda do |block, label|
    next block  unless block.include?( "\n" )

    _trace( label, block )
    block.gsub( "\n", '↵' )
  end

  txt = txt.gsub( PREPROC_NOTA_BENE_RE ) do |m|
    fold_newlines.call( m, 'preproc (multi-line) note/nota bene block:' )
  end

  txt = txt.gsub( PREPROC_BLOCK_RE ) do |m|
    fold_newlines.call( m, 'preproc (multi-line) block:' )
  end

  txt
end
#_prep_line(line) ⇒ Object
auto-fix checks line-by-line
8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 |
# File 'lib/sportdb/parser/lexer-prep_line.rb', line 8
##
# Auto-fix checks line-by-line; every replacement gets reported via _warn.
#   - tab                          => two spaces
#   - non-breaking space (U+00A0)  => ascii space
#   - en dash (U+2013) and
#     minus sign (U+2212)          => ascii dash (-)
#   - smart (single) quotes        => ascii quote (')
#   - smart double quotes          => ascii double quote (")
#
# fix: tabs now get replaced with two spaces as documented
#      (and as reported in the warning); the replacement was a single space
#
# line - the line (String)
# Returns the auto-fixed line (String).
def _prep_line( line )
  ## note - inside the blocks line still holds the text BEFORE the
  ##          replacement (used for reporting)

  ## first check for tabs
  line = line.gsub( "\t" ) do |_|
    ## todo/add error here
    _warn( "auto-fix; replacing tab (\\t) with two spaces in line #{line.inspect}" )
    '  '   ## replace with two spaces
  end

  ## U+00A0 (160) -- non-breaking space (unicode)
  line = line.gsub( "\u00A0" ) do |uni|
    ## todo/add error here
    _warn( "auto-fix; replacing non-breaking unicode space (#{uni}/#{uni.ord}) w/ ascii space ( /#{" ".ord}) in line #{line.inspect}" )
    ' '   ## replace with space
  end

  ###
  ##   different candidates to differentiate and document!!!
  ## – => U+2013 (8211) -- En Dash (unicode)
  ## − => U+2212 (8722) -- Minus Sign (unicode)
  line = line.gsub( /[–−]/ ) do |uni|
    ## todo/add error here
    _warn( "auto-fix; replacing unicode dash (#{uni}/#{uni.ord}) w/ ascii dash (-/#{"-".ord}) in line #{line.inspect}" )
    '-'   ## replace with ascii dash (-)
  end

  ## smart quotes
  line = line.gsub( /[‘’]/ ) do |uni|
    ## todo/add error here
    _warn( "auto-fix; replacing unicode (smart) quote (#{uni}/#{uni.ord}) w/ ascii quote ('/#{"'".ord}) in line #{line.inspect}" )
    "'"
  end

  line = line.gsub( /[“”]/ ) do |uni|
    ## todo/add error here
    _warn( %Q{auto-fix; replacing unicode (smart) double quote (#{uni}/#{uni.ord}) w/ ascii double quote ("/#{'"'.ord}) in line #{line.inspect}} )
    '"'
  end

  line
end
#_tokenize_line(line, lineno) ⇒ Object
72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 |
# File 'lib/sportdb/parser/lexer-tokenize.rb', line 72
##
# Tokenize a single (prepared) line.
#
# The lexer mode (@re) is kept between calls; in top-level mode (RE)
# the start of the line gets checked once for a mode switch
# (ord, year, group def, prop key, round def, round outline, goal line).
# Then the line gets matched step-by-step with the regex of the
# current mode and every match is handed to the mode's _on_* handler.
#
# fix: the end-of-line prop mode check crashed with NoMethodError (on nil)
#      if the line resulted in no tokens at all (e.g. a prop continuation
#      line with an unknown inline key only); an empty token list
#      now ends the prop mode (and adds PROP_END) like any other
#      line not ending in a continuation symbol
#
# line   - the line (String)
# lineno - the line number
# Returns [tokens, errors].
def _tokenize_line( line, lineno )
  tokens = []
  errors = []   ## keep a list of errors - why? why not?

  pos = 0
  ## note - usually same as offset[1] aka offset[end] after match
  ## track last offset (begin/end) - to report error on no match
  ##   or no match in end of string
  offset = [0,0]
  m = nil

  ## track number of geo text seen
  ##   (use for - do NOT break on two spaces if no geo text seen yet!!)
  @geo_count = 0

  ####
  ## quick hack - keep re state/mode between tokenize calls!!!
  @re ||= RE     ## note - switch between RE & INSIDE_RE

  if @re == RE  ## top-level
    ### check for modes once (per line) here to speed-up parsing
    ### for now goals only possible for start of line!!

    ## note - ord (for ordinal number!!!) e.g. match number (1), (42), etc.
    ##   can only start a (match) line; "inline" use NOT possible
    if (m = START_WITH_ORD.match(line))
      ## note - strip enclosing () and convert to integer
      tokens << Token.new(:ORD, m[:ord],
                            lineno: lineno,
                            offset: m.offset(:ord),
                            value: m[:value].to_i(10) )
      offset = m.offset(0)
      pos = offset[1]    ## update pos
    elsif (m = START_WITH_YEAR.match(line))
      tokens << Token.new(:YEAR, m[:year],
                            lineno: lineno,
                            offset: m.offset(:year),
                            value: m[:year].to_i(10) )
      offset = m.offset(0)
      pos = offset[1]    ## update pos
    elsif (m = START_WITH_GROUP_DEF_LINE_RE.match( line ))
      _trace( "ENTER GROUP_DEF_RE MODE" )
      @re = GROUP_DEF_RE
      tokens << Token.new( :GROUP_DEF, m[:group_def],
                            lineno: lineno,
                            offset: m.offset(:group_def) )
      offset = m.offset(0)
      pos = offset[1]    ## update pos
    elsif (m = START_WITH_PROP_KEY_RE.match( line ))
      ## start with prop key (match will switch into prop mode!!!)
      ##   - fix - remove leading spaces in regex (upstream) - why? why not?
      _trace("ENTER PROP_RE MODE" )

      key = m[:key]
      ### todo/fix - add prop yellow/red cards too - why? why not?
      ## todo/fix - separate sent off and red card
      ##   sent-off - incl. red card, yellow/red card and the era before red cards!!
      if ['sent off'].include?( key.downcase)
        @re = PROP_CARDS_RE    ## use CARDS_RE ???
        tokens << Token.new(:PROP_SENTOFF, m[:key],
                              lineno: lineno,
                              offset: m.offset(:key))
      elsif ['red cards'].include?( key.downcase )
        @re = PROP_CARDS_RE    ## use CARDS_RE ???
        tokens << Token.new(:PROP_REDCARDS, m[:key],
                              lineno: lineno,
                              offset: m.offset(:key))
      elsif ['yellow cards'].include?( key.downcase )
        @re = PROP_CARDS_RE
        tokens << Token.new(:PROP_YELLOWCARDS, m[:key],
                              lineno: lineno,
                              offset: m.offset(:key))
      elsif ['ref', 'referee',
             'refs', 'referees'     ## note - allow/support assistant refs
            ].include?( key.downcase )
        @re = PROP_REFEREE_RE
        tokens << Token.new(:PROP_REFEREE, m[:key],
                              lineno: lineno,
                              offset: m.offset(:key))
      elsif ['att', 'attn', 'attendance'].include?( key.downcase )
        @re = PROP_ATTENDANCE_RE
        tokens << Token.new(:PROP_ATTENDANCE, m[:key],
                              lineno: lineno,
                              offset: m.offset(:key))
      # elsif ['goals'].include?( key.downcase )
      #   @re = PROP_GOAL_RE
      #   tokens << [:PROP_GOALS, m[:key]]
      elsif ['penalties', 'penalty shootout',
             'penalty shoot-out', 'penalty kicks'].include?( key.downcase )
        @re = PROP_PENALTIES_RE
        tokens << Token.new(:PROP_PENALTIES, m[:key],
                              lineno: lineno,
                              offset: m.offset(:key))
      else   ## assume (team) line-up
        @re = PROP_LINEUP_RE
        ## fix-fix-fix - rename to PROP_LINEUP !!
        tokens << Token.new(:PROP, m[:key],
                              lineno: lineno,
                              offset: m.offset(:key))
      end

      offset = m.offset(0)
      pos = offset[1]    ## update pos
    ###
    ### todo/fix
    ###   rename to START_WITH_ROUND_DEF_OUTLINE_RE !!!!
    elsif (m = ROUND_DEF_OUTLINE_RE.match( line ))
      _trace( "ENTER ROUND_DEF_RE MODE" )
      @re = ROUND_DEF_RE
      ## note - return ROUND_DEF NOT ROUND_OUTLINE token
      ##   fix - add leading ▪ too!!
      tokens << Token.new( :ROUND_DEF, m[:round_outline],
                            lineno: lineno,
                            offset: m.offset(:round_outline))
      offset = m.offset(0)
      pos = offset[1]    ## update pos
    elsif (m = ROUND_OUTLINE_RE.match( line ))
      _trace( "ROUND_OUTLINE" )
      ## note - derive round level from no of (leading) markers
      ##   e.g. ▪/:: is 1, ▪▪/::: is 2, ▪▪▪/:::: is 3, etc.
      ## note - ascii-style starts with double ::, thus, autodecrement by one!
      round_level = m[:round_marker].size
      round_level -= 1   if m[:round_marker].start_with?( '::' )

      tokens << Token.new( :ROUND_OUTLINE, m[:round_outline],
                            lineno: lineno,
                            offset: m.offset(:round_outline),
                            value: { outline: m[:round_outline],
                                     level: round_level})
      ## note - eats-up line for now (change later to only eat-up marker e.g. »|>>)
      offset = m.offset(0)
      pos = offset[1]    ## update pos
    elsif (m = START_GOAL_LINE_RE.match( line ))
      ## line starting with ( - assume
      ##   switch context to GOAL_RE (goalline(s))
      ####
      ## note - check for alternate goal line styles / formats
      if START_GOAL_LINE_COMPAT_RE.match(line )
        ## "legacy" style starting with minute e.g.
        ##   (6 Puskás 0-1, 9 Czibor 0-2, 11 Morlock 1-2, 18 Rahn 2-2,
        ##    84 Rahn 3-2)
        @re = GOAL_COMPAT_RE
        _trace( "ENTER GOAL_COMPAT_RE MODE" )
        tokens << Token.virtual( :GOALS_COMPAT, lineno: lineno )
      elsif START_GOAL_LINE_ALT_RE.match( line )
        ## goals with scores e.g.
        ##   (1-0 Franck Ribéry, 2-0 Ivica Olić, 2-1 Wayne Rooney)
        ##  -or-
        ##   (Dion Beljo 1-0
        ##      1-1 Andreas Gruber
        ##    Matthias Seidl 2-1)
        @re = GOAL_ALT_RE
        _trace( "ENTER GOAL_ALT_RE MODE" )
        tokens << Token.virtual( :GOALS_ALT, lineno: lineno )
      else   ## "standard" / default style
        @re = GOAL_RE
        _trace( "ENTER GOAL_RE MODE" )
        tokens << Token.virtual( :GOALS, lineno: lineno )
      end

      ## note - eat-up ( for now
      ##   pass along "virtual" GOALS or GOALS_ALT token
      ##   (see INLINE_GOALS for the starting goal line inline)
      ##
      ## fix-fix-fix
      ##   keep offset at [0,0] - why? why not?  do NOT eat-up
      ##   or better
      ##   add tokens << Token.literal( '(', lineno: lineno, offset: ...) !!!
      offset = m.offset(0)
      pos = offset[1]    ## update pos
    end
  end

  old_pos = -1   ## allows to backtrack to old pos (used in geo)

  ctx = Context.new( self, line: line,
                           lineno: lineno,
                           errors: errors )

  while m = @re.match( line, pos )
    offset = m.offset(0)
    ctx.offset = offset

    if offset[0] != pos
      ## match NOT starting at start/begin position!!!
      ##  report parse error!!!
      msg = "parse error (tokenize) - skipping >#{line[pos..(offset[0]-1)]}< in line #{lineno}@#{offset[0]},#{offset[1]} >#{line}<"
      errors << msg
      log( msg )
      puts "!! WARN - #{msg}"
    end

    ##
    ## todo/fix - also check if possible
    ##   if no match but not yet end off string!!!!
    ##   report skipped text run too!!!

    old_pos = pos
    pos = offset[1]

    ##
    ## note: racc requires pairs e.g. [:TOKEN, VAL]
    ##   for VAL use "text" or ["text", { opts }] array
    t = if @re == ROUND_DEF_RE then _on_round_def( m, ctx: ctx )
        elsif @re == GROUP_DEF_RE then _on_group_def( m, ctx: ctx )
        elsif @re == GEO_RE
          ### note - possibly end inline geo on [ (and others?? in the future
          ## note: break on double spaces e.g.
          ##   e.g. Jul/16 @ Arena Auf Schalke, Gelsenkirchen  Serbia 0-1 England
          if m[:spaces]
            ### note - do NOT break out
            ##    if not text seen yet!!!
            if @geo_count > 0
              ## get out-off geo mode and backtrack (w/ next)
              ##
              ## todo/fix
              ##   add virtual geo_end token!!!
              _trace( "LEAVE GEO_RE MODE, BACK TO TOP_LEVEL/RE" )
              @re = RE
              pos = old_pos
              next   ## backtrack (resume new loop step)
            else
              nil   ## skip spaces
            end
          elsif m[:space]
            nil    ## skip (single) space
          elsif m[:text]
            @geo_count += 1
            ## keep pos - why? why not?
            Token.new(:GEO, m[:text],
                        lineno: lineno,
                        offset: m.offset(:text))
          elsif m[:geo_end]
            ## "hacky" special comma; always ends geo mode!!!
            ## get out-off geo mode and backtrack (w/ next)
            ## todo/fix
            ##   add (semi-) virtual geo_end token!!!
            _trace( "LEAVE GEO_RE MODE, BACK TO TOP_LEVEL/RE" )
            @re = RE
            pos = old_pos
            next   ## backtrack (resume new loop step)
          elsif m[:sym]
            case m[:sym]
            ## note - reset geo_count to 0 (avoids break on two spaces)
            ##   if separator seen!!
            when ',' then
              @geo_count = 0
              Token.literal( m[:sym], lineno: lineno, offset: m.offset(:sym))
            when '›' then
              ## note - treat geo sep › (unicode) like comma for now!!!
              @geo_count = 0
              Token.literal( ',', lineno: lineno, offset: m.offset(:sym))
            when '>' then
              ## note - treat geo sep > (ascii) like comma for now!!!
              @geo_count = 0
              Token.literal( ',', lineno: lineno, offset: m.offset(:sym))
            when '[' then
              ##
              ## todo/fix
              ##   add virtual geo_end token!!!
              ## get out-off geo mode and backtrack (w/ next)
              _trace( "LEAVE GEO_RE MODE, BACK TO TOP_LEVEL/RE" )
              @re = RE
              pos = old_pos
              next   ## backtrack (resume new loop step)
            else
              Token.literal( m[:sym], lineno: lineno, offset: m.offset(:sym))
            end
          else
            ctx.warn_on_else( m, mode: 'GEO' )
            nil
          end
        elsif @re == PROP_CARDS_RE then _on_prop_cards( m, ctx: ctx )
        elsif @re == PROP_LINEUP_RE then _on_prop_lineup( m, ctx: ctx )
        elsif @re == PROP_ATTENDANCE_RE then _on_prop_attendance( m, ctx: ctx )
        elsif @re == PROP_REFEREE_RE then _on_prop_referee( m, ctx: ctx )
        elsif @re == PROP_PENALTIES_RE then _on_prop_penalties( m, ctx: ctx )
        elsif @re == GOAL_COMPAT_RE then _on_goal_compat( m, ctx: ctx )
        elsif @re == GOAL_ALT_RE then _on_goal_alt( m, ctx: ctx )
        elsif @re == GOAL_RE then _on_goal( m, ctx: ctx )
        ###################################################
        ## assume TOP_LEVEL (a.k.a. RE) machinery
        else _on_top( m, ctx: ctx )
        end

    tokens << t    if t
  end

  ## check if no match in end of string
  if offset[1] != line.size
    msg = "parse error (tokenize) - skipping >#{line[offset[1]..-1]}< in line #{lineno}@#{offset[1]},#{line.size} >#{line}<"
    errors << msg
    log( msg )
    puts "!! WARN - #{msg}"
  end

  # if @re == GOAL_RE   ### ALWAYS switch back to top level mode
  #   puts " LEAVE GOAL_RE MODE, BACK TO TOP_LEVEL/RE"  if debug?
  #   @re = RE
  # end

  if @re == GEO_RE   ### ALWAYS switch back to top level mode
    _trace( "LEAVE GEO_RE MODE, BACK TO TOP_LEVEL/RE" )
    @re = RE
  end

  ### ALWAYS switch back to top level mode
  @re = RE   if @re == GROUP_DEF_RE || @re == ROUND_DEF_RE

  ##
  ## if in prop mode continue if last token is [,-]
  ##   otherwise change back to "standard" mode
  if @re == PROP_LINEUP_RE ||
     @re == PROP_CARDS_RE ||
     @re == PROP_PENALTIES_RE ||
     @re == PROP_ATTENDANCE_RE ||
     @re == PROP_REFEREE_RE
    ## note - tokens may be empty (no last token); treated like
    ##          a line NOT ending in a continuation symbol
    last_type = tokens.empty? ? nil : tokens[-1].type
    if [',', '-', ';'].include?( last_type )
      ## continue/stay in PROP_RE mode
      ## todo/check - auto-add PROP_CONT token or such
      ##   to help parser with possible NEWLINE
      ##   conflicts - why? why not?
    else
      ## switch back to top-level mode!!
      _trace( "LEAVE PROP_RE MODE, BACK TO TOP_LEVEL/RE" )
      @re = RE
      ## note - auto-add PROP_END (<PROP_END>)
      tokens << Token.virtual(:PROP_END, lineno: lineno)
    end
  end

  [tokens,errors]
end
#_trace(*args) ⇒ Object
20 21 22 23 24 25 |
# File 'lib/sportdb/parser/lexer.rb', line 20

## Print a debug trace message to stdout - only if debug? is true.
##
##  The "[DEBUG] Lexer -- " prefix gets printed (without newline) first,
##  followed by every argument via puts (one per line).
##
##  note - fix: print each arg once (puts arg)
##           and NOT the complete args array for every arg (puts args)
def _trace( *args )
  return unless debug?

  print "[DEBUG] Lexer -- "
  args.each { |arg| puts arg }
end
#_warn(*args) ⇒ Object
27 28 29 30 |
# File 'lib/sportdb/parser/lexer.rb', line 27

## Print a warning message to stdout (always - independent of debug?).
##
##  The "!! [WARN] Lexer -- " prefix gets printed (without newline) first,
##  followed by every argument via puts (one per line).
##
##  note - fix: print each arg once (puts arg)
##           and NOT the complete args array for every arg (puts args)
def _warn( *args )
  print "!! [WARN] Lexer -- "
  args.each { |arg| puts arg }
end
#debug? ⇒ Boolean
38 |
# File 'lib/sportdb/parser/lexer.rb', line 38

## Is debug (trace) output turned on?
##   note - returns true only if @debug compares equal to true
##            (any other value incl. nil or "truthy" objects results in false)
def debug?
  @debug == true
end
#log(msg) ⇒ Object
7 8 9 10 11 12 13 14 15 16 17 |
# File 'lib/sportdb/parser/lexer.rb', line 7

## Append msg (plus a trailing newline) to ./logs.txt
##   (path is relative to the current working directory; file opened in utf-8 append mode)
##
##  todo/check:
##   - use ./errors.txt - why? why not?
##   - change to ./logs_lexer.txt or such - why? why not?
##   - auto-add/prepend [Lexer] and timestamp!!! to msg - why? why not?
def log( msg )
  File.open( './logs.txt', 'a:utf-8' ) do |io|
    io.write( msg )
    io.write( "\n" )
  end
end
#tokenize_with_errors ⇒ Object
54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 |
# File 'lib/sportdb/parser/lexer.rb', line 54

## Tokenize the document (@txt) line-by-line.
##
##  Steps:
##   1) prepare the document (via _prep_doc) and walk through every line;
##        comment lines get skipped, inline end-of-line comments get stripped
##        and the __END__ marker cuts-off the remaining input
##   2) blank lines, headings and nota bene lines are handled here
##        (and always reset the lexer mode @re to top-level RE);
##        all other lines get passed on to _tokenize_line
##   3) transform the tokens of every line using simple patterns
##        (e.g. merge DATE + TIME into DATETIME) to help along the parser
##   4) flatten the tokens and auto-add a newline token after every line
##        (unless the line is a BLANK)
##
##  returns a two-element array - [tokens, errors]
def tokenize_with_errors
  tokens_by_line = []   ## note: add tokens line-by-line (flatten later)
  errors         = []   ## keep a list of errors - why? why not?

  txt = _prep_doc( @txt )

  ####
  ## quick hack - keep re state/mode between tokenize calls!!!
  @re ||= RE     ## note - switch between RE & INSIDE_RE

  lineno = 0
  txt.each_line do |line|
    lineno += 1     ## note - skipped (comment) lines count too

    ## todo - "inlined virtual/collapsed/folded newlines"
    ##          check for "↵" !!!
    ##          and add to lineno

    ## note - KEEP leading spaces for indent
    ##   use rstrip (NOT left/leading & right/trailing strip) only!!
    ## note - remove/strip trailing newline (and optional spaces)!!!
    ##   trailing whitespace may incl. \n or \r\n!!!
    line = line.rstrip

    ### skip comments
    ##   todo/check - change to blank line
    ##     to keep lineno (closer to original) - why? why not?
    next if line.match?(/\A [ ]*   ## optional leading space(s)
                          \#
                        /x )

    ## strip (inline) end-of-line comments (from line)
    ##   check/discuss: make - inline comment require trailing space
    ##     e.g. #1  vs # 1  - why? why not?
    line = line.sub( / [ ]*   ## (eat-up) optional leading space(s)
                       \#{1,}.*?
                       \z
                     /x, '' )

    ####
    # support __END__ marker to cut-off input
    break if line.match?( /\A [ ]*   ## optional leading space(s)
                            __END__
                            \z
                          /x )

    ## auto-fixes line-by-line (e.g. check for tabs, smart quotes, etc.)
    line = _prep_line( line )

    _trace( "line #{lineno}: >#{line}<" )

    ######
    ### special case for empty line (aka BLANK)
    if line.empty?
      ## note - blank always resets parser mode to std/top-level!!!
      @re = RE
      tokens_by_line << [Token.virtual(:BLANK, lineno: lineno)]
    elsif (m = HEADING_RE.match(line))
      ## note - heading always resets parser mode to std/top-level!!!
      @re = RE
      _trace( 'HEADING' )
      ## note - derive heading level from no of (leading) markers
      ##   e.g. = is 1, == is 2, === is 3, etc.
      heading_level = m[:heading_marker].size
      tokens_by_line << [Token.new(:"H#{heading_level}", m[:heading], lineno: lineno)]
    elsif (m = NOTA_BENE_RE.match(line))
      ## note - nota bene always resets parser mode to std/top-level!!!
      @re = RE
      tokens_by_line << [Token.new(:NOTA_BENE, m[:nota_bene], lineno: lineno)]
    else
      more_tokens, more_errors = _tokenize_line( line, lineno )
      tokens_by_line << more_tokens
      errors.concat( more_errors )   ## note - append in-place (no new array per line)
    end
  end # each line

  tokens_by_line = tokens_by_line.map do |line_tokens|
    #################
    ## transform tokens (using simple patterns)
    ##   to help along the (racc look ahead 1 - LA1) parser
    nodes = []

    buf = Tokens.new( line_tokens )

    until buf.eos?
      if buf.match?( :DATE, :TIME )   ## merge DATE TIME into DATETIME
        date = buf.next
        time = buf.next
        ## note: time value is { time: {} } or
        ##                     { time: {}, time_local {} }
        ## note - fix: NO trailing comma after the text assignment
        ##         (otherwise text ends up as an array [string, value])
        text  = date.text + ' ' + time.text    ## concat string of two tokens
        value = { date: date.value }.merge( time.value )
        nodes << Token.new(:DATETIME, text,
                           lineno: date.lineno,
                           offset: [date.offset[0], time.offset[1]],
                           value:  value )
      ### support date time with comma too - why? why not?
      elsif buf.match?( :DATE, ',', :TIME )
        date = buf.next
        buf.next     ## ignore comma
        time = buf.next
        text  = date.text + ', ' + time.text   ## concat string of two tokens
        value = { date: date.value }.merge( time.value )
        nodes << Token.new(:DATETIME, text,
                           lineno: date.lineno,
                           offset: [date.offset[0], time.offset[1]],
                           value:  value )
      elsif buf.match?( :GOAL_MINUTE, ',', :GOAL_MINUTE )
        ## note - only advance by two tokens!
        ##   allows more :GOAL_MINUTE sequences!! e.g. 12,13,14 etc!!!
        ##
        ## help parser with comma shift/reduce conflict
        ##   change ',' to GOAL_MINUTE_SEP !!!
        nodes << buf.next    ## pass through goal_minute
        comma = buf.next     ## eat-up goal_minute_sep a.k.a. comma (,)
        ##  and replace with dedicated sep(arator)
        nodes << Token.new(:GOAL_MINUTE_SEP, comma.text,
                           lineno: comma.lineno,
                           offset: comma.offset,
                           value:  comma.value )
      elsif buf.match?( ',', :INLINE_ATTENDANCE )
        ## note - allow optional comma before inline attendance
        ## help parser with comma shift/reduce conflict
        ##   change ',' to INLINE_ATTENDANCE_SEP !!!
        comma = buf.next     ## eat-up inline_attendance_sep a.k.a. comma (,)
        ##  and replace with dedicated sep(arator)
        nodes << Token.new(:INLINE_ATTENDANCE_SEP, comma.text,
                           lineno: comma.lineno,
                           offset: comma.offset,
                           value:  comma.value )
        nodes << buf.next    ## pass through inline_attendance
      else
        ## pass through
        nodes << buf.next
      end
    end

    nodes
  end # map tokens_by_line

  ## flatten tokens
  tokens = []
  tokens_by_line.each do |tok_line|
    ## note - a line may result in no tokens at all (e.g. errors only);
    ##   nothing to add - and no first token to get the lineno for a newline from
    next if tok_line.empty?

    tokens.concat( tok_line )

    ## auto-add newlines (unless BLANK!!)
    ##   note - reuse lineno from first token in line
    ##     use last - why? why not?
    unless tok_line[0].type == :BLANK
      tokens << Token.newline( lineno: tok_line[0].lineno )
    end
  end

  [tokens, errors]
end