Class: SportDb::Parser

Inherits:

Object

Object
SportDb::Parser

show all

Defined in: lib/sportdb/parser.rb,
lib/sportdb/parser/token.rb,
lib/sportdb/parser/tokenizer.rb,
lib/sportdb/parser/token-date.rb,
lib/sportdb/parser/token-text.rb,
lib/sportdb/parser/token-score.rb,
lib/sportdb/parser/token-status.rb

Constant Summary collapse

TIME_RE = keep 18h30 - why? why not? add support for 6:30pm 8:20am etc. - why? why not?

%r{
    ## e.g. 18.30 (or 18:30 or 18h30)
    (?<time>  \b
              (?<hour>\d{1,2})
                 (?: :|\.|h )
              (?<minute>\d{2})
              \b
    )
}ix

TIMEZONE_RE = for timezone format use for now: (BRT/UTC-3) (e.g. brazil time) (CET/UTC+1) - central european time (CEST/UTC+2) - central european summer time - daylight saving time (DST). (EET/UTC+1) - eastern european time (EEST/UTC+2) - eastern european summer time - daylight saving time (DST). UTC+3 UTC+4 UTC+0 UTC+00 UTC+0000 - allow +01 or +0100 - why? why not - +0130 (01:30) see https://en.wikipedia.org/wiki/Time_zone https://en.wikipedia.org/wiki/List_of_UTC_offsets https://en.wikipedia.org/wiki/UTC−04:00 etc.

%r{
   ## e.g. (UTC-2) or (CEST/UTC-2) etc.
   (?<timezone>
      \(
           ## optional "local" timezone name eg. BRT or CEST etc.
           (?:  [a-z]+
                 /
           )?
            [a-z]+
            [+-]
            \d{1,4}   ## e.g. 0 or 00 or 0000
      \)
   )
}ix

BASICS_RE =

%r{
    ## e.g. (51) or (1) etc.  - limit digits of number???
    (?<num> \(  (?<value>\d+) \) )
       |
    (?<vs>
       (?<=[ ])	# Positive lookbehind for space
       (?:
          vs|v
       )  
           # not bigger match first e.g. vs than v etc.
           # todo/fix - make vs|v case sensitive!!! only match v/vs - why? why not?
       (?=[ ])   # positive lookahead for space
    )
       |
    (?<spaces> [ ]{2,}) |
    (?<space>  [ ])
        |
    (?<sym>[;,@|\[\]-])
}ix

MINUTE_RE =

%r{
     (?<minute>
       (?<=[ (])	 # Positive lookbehind for space or opening ( e.g. (61') required
           (?<value>\d{1,3})      ## constrain numbers to 0 to 999!!!
        (?: \+
            (?<value2>\d{1,3})
        )?
        '     ## must have minute marker!!!!
     )
}ix

GOAL_PEN_RE = goal types (pen.) or (pen) or (p.) or (p) (o.g.) or (og)

%r{
   (?<pen> \(
           (?:pen|p)\.?
           \)
    )
}ix

GOAL_OG_RE =

%r{
   (?<og> \(
          (?:og|o\.g\.)
          \)
   )
}ix

PROP_BASICS_RE =

%r{
    (?<spaces> [ ]{2,}) |
    (?<space>  [ ])
        |
    (?<sym>[.;,\(\)\[\]-])   ## note - dot (.) is the (all-important) end-of-prop marker!!!
}ix

PROP_NAME_RE = name different from text (does not allow number in name/text) note - includes special handling for dot (.) if at the end of line!!! end-of-line dot (.) is the prop end-of-marker - do NOT eat-up!!!

%r{
                 (?<prop_name> \b
                   (?<name>
                      \p{L}+       
                       (?: \. (?: (?![ ]*$) )
                        )?      ## edge case - check for end of prop marker! (e.g. Stop.)
                      (?: 
                          [ ]?    # only single spaces allowed inline!!!
                          (?:
                              (?:
                                (?<=\p{L})   ## use lookbehind
                                 [/'-]   ## must be surrounded by letters
                                       ## e.g. One/Two NOT
                                       ##      One/ Two or One / Two or One /Two etc.
                                (?=\p{L})      ## use lookahead        
                              )
                                 |   
                              (?:
                                (?<=[ ])   ## use lookbehind  -- add letter (plus dot) or such - why? why not?
                                 [']   ## must be surrounded by leading space and
                                       ## traling letters  (e.g. UDI 'Beter Bed)
                                (?=\p{L})      ## use lookahead        
                              )   
                                 |
                              (?:
                                (?<=\p{L})   ## use lookbehind
                                 [']   ## must be surrounded by leading letter and
                                       ## trailing space PLUS letter  (e.g. UDI' Beter Bed)
                                (?=[ ]\p{L})      ## use lookahead (space WITH letter         
                              )   
                                 |
                              (?: \p{L}+
                                  (?: \. 
                                      (?: (?![ ]*$) )
                                  )?  ## last dot is delimiter!!!
                              )
                          )+
                     )*
                   )
               ## add lookahead - must be non-alphanum (or dot)
                  (?=[ .,;\]\)]|$)
                  )
}ix

PROP_KEY_RE =

%r{ 
(?<prop_key> \b
  (?<key>
      (?:\p{L}+
          |
          \d+  # check for num lookahead (MUST be space or dot)
       ## MUST be followed by (optional dot) and
       ##                      required space !!!
       ## MUST be follow by a to z!!!!
        \.?     ## optional dot
        [ ]?   ## make space optional too  - why? why not?
            ##  yes - eg. 1st, 2nd, 5th etc.
        \p{L}+
       )
       [\d\p{L}'/° -]*?   ## allow almost anyting 
                         ## fix - add negative lookahead 
                         ##         no space and dash etc.
                         ##    only allowed "inline" not at the end
                         ## must end with latter or digit!
  )
   [ ]*?     # slurp trailing spaces
    :
   (?=[ ]+)  ## possitive lookahead (must be followed by space!!)
  )
}ix

PROP_RE =

Regexp.union(
   PROP_BASICS_RE, 
   MINUTE_RE,
   PROP_NAME_RE,
)

RE = start with prop key (match will/should switch into prop mode!!!)

Regexp.union(  PROP_KEY_RE, ##  start with prop key (match will/should switch into prop mode!!!)
                    STATUS_RE,
                    TIMEZONE_RE,
TIME_RE,
DURATION_RE,  # note - duration MUST match before date
                    DATE_RE,
                    SCORE_RE,
                    BASICS_RE, MINUTE_RE,
                    GOAL_OG_RE, GOAL_PEN_RE,
TEXT_RE )

MONTH_LINES =

parse_names( <<TXT )
January    Jan
February   Feb
March      Mar
April      Apr
May
June       Jun
July       Jul
August     Aug
September  Sept  Sep
October    Oct
November   Nov
December   Dec
TXT

MONTH_NAMES =

build_names( MONTH_LINES )

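MONTH_MAP = lookup map from (downcased) month name or abbreviation to month number (starting with 1), e.g. "january" => 1, "jan" => 1, "february" => 2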

build_map( MONTH_LINES, downcase: true )

DAY_LINES =

parse_names( <<TXT )
Monday                   Mon  Mo
Tuesday            Tues  Tue  Tu
Wednesday                Wed  We
Thursday    Thurs  Thur  Thu  Th
Friday                   Fri  Fr
Saturday                 Sat  Sa
Sunday                   Sun  Su
TXT

DAY_NAMES =

build_names( DAY_LINES )

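DAY_MAP = lookup map from (downcased) day name or abbreviation to day number (starting with 1 for Monday), e.g. "monday" => 1, "mon" => 1, "mo" => 1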

build_map( DAY_LINES, downcase: true )

DATE_I_RE = e.g. Fri Aug/9 or Fri Aug 9

%r{
(?<date>
  \b
     ## optional day name
     ((?<day_name>#{DAY_NAMES})
          [ ]
     )?
     (?<month_name>#{MONTH_NAMES})
         (?: \/|[ ] )
     (?<day>\d{1,2})
     ## optional year
     (  [ ]
        (?<year>\d{4})
     )?
  \b
)}ix

DATE_II_RE = e.g. 3 June or 10 June

%r{
(?<date>
  \b
     ## optional day name
     ((?<day_name>#{DAY_NAMES})
          [ ]
     )?
     (?<day>\d{1,2})
         [ ]
     (?<month_name>#{MONTH_NAMES})
     ## optional year
     (  [ ]
        (?<year>\d{4})
     )?
  \b
)}ix

DATE_RE = map tables note: order matters; first come-first matched/served

Regexp.union(
   DATE_I_RE,
   DATE_II_RE
)

DURATION_I_RE = todo add plus later on - why? why not?

%r{
(?<duration>
    \b
   ## optional day name
   ((?<day_name1>#{DAY_NAMES})
      [ ]
   )?
   (?<month_name1>#{MONTH_NAMES})
      (?: \/|[ ] )
   (?<day1>\d{1,2})
   ## optional year
   ( [ ]
      (?<year1>\d{4})
   )?

   ## support + and -  (add .. or such - why??)
   [ ]*[-][ ]*

   ## optional day name
   ((?<day_name2>#{DAY_NAMES})
      [ ]
   )?
   (?<month_name2>#{MONTH_NAMES})
      (?: \/|[ ] )
   (?<day2>\d{1,2})
   ## optional year
   ( [ ]
      (?<year2>\d{4})
   )?
   \b
)}ix

DURATION_II_RE = variant ii e.g. 26 July - 27 July

%r{
(?<duration>
    \b
   ## optional day name
   ((?<day_name1>#{DAY_NAMES})
      [ ]
   )?
   (?<day1>\d{1,2})
      [ ]
   (?<month_name1>#{MONTH_NAMES})
   ## optional year
   ( [ ]
      (?<year1>\d{4})
   )?

   ## support + and -  (add .. or such - why??)
   [ ]*[-][ ]*

   ## optional day name
   ((?<day_name2>#{DAY_NAMES})
      [ ]
   )?
   (?<day2>\d{1,2})
      [ ]
   (?<month_name2>#{MONTH_NAMES})
   ## optional year
   ( [ ]
      (?<year2>\d{4})
   )?
   \b
)}ix

DURATION_RE = map tables note: order matters; first come-first matched/served

Regexp.union(
   DURATION_I_RE,
   DURATION_II_RE
)

TEXT_RE =

%r{
    ## must start with alpha (allow unicode letters!!)
    (?<text>
           ## positive lookbehind
           ##  (MUST be fixed number of chars - no quantifier e.g. +? etc.)
            (?<=[ ,;@|\[\]]
                 |^
            )
            (?:
                # opt 1 - start with alpha
                 \p{L}+    ## all unicode letters (e.g. [a-z])
                   |

                # opt 2 - start with num!! - allow special case (e.g. 1. FC)
                     \d+  # check for num lookahead (MUST be space or dot)
                      ## MUST be followed by (optional dot) and
                      ##                      required space !!!
                      ## MUST be follow by a to z!!!!
                      \.?     ## optional dot
                      [ ]?   ## make space optional too  - why? why not?
                             ##  yes - eg. 1st, 2nd, 5th etc.
                       \p{L}+
                  |
                ## opt 3 - add weirdo case
                ##   e.g. 5.-8. Platz Playoffs  - keep - why? why not?
                    \d+\.-\d+\.  [ ]? \p{L}+
               )

              (?:(?:  (?:[ ]
                        (?!vs?[ ])    ## note - exclude (v[ ]/vs[ ])
                       )
                      |     # only single spaces allowed inline!!!
                     [-]
                  )?
                (?:
                  \p{L} |
                  [&/'°]
                    |
                 (?:
                   \d+
                   (?!
                     [0-9h'+-] |    ## protected break on 12h / 12' / 1-1
                                    ##  check usege for 3+4 - possible? where ? why?
                     (?:[.:]\d)     ## protected/exclude/break on 12.03 / 12:03
                    )
                   ## negative lookahead for numbers
                   ##   note - include digits itself!!!
                   ##   note - remove / (slash) e.g. allows UDI'19/Beter Bed
                 )|
                 \.
               )
              )*  ## must NOT end with space or dash(-)
              ##  todo/fix - possible in regex here
              ##     only end in alphanum a-z0-9 (not dot or & ???)


            ## allow optional at the end
            ##  tag or year
            ##   make it and in the future - why? why not?
            ##
            ## change - fix
            ##   do NOT use (A) for amateur
            ##   use A or A. with NO ()!!!
            ## (A) -    allow with predined  alpha only for now
            ##          e.g. (A) - amateur a team or b?
            ###  same for U21 or U9 etc
            ##        use with NO ()!!! - why? why not?
            ##      or U21 U9 etc.   - why? why not?
            ##       or etc.
            ## (1879-1893) or allow years e.g. (1879-1893)
            ###
            ##    add allow country code three to five letters for now
            ##       change to generic 1 to 5 - why? why not?
            ##     e.g. (A), (I),
            ##          (AUT)
            ##          (TRNC)   five? for UEFA code for northern cyprus
            ##     change to 1 to 4 - why? why not?
            ##   check - fix possible for upper case only here
            ##                     inline for this group only?
            (?:
               [ ]
               \(
                  \d{4}-\d{4}
               \)
            )?
             (?:
               [ ]+   ## allow more than once space - why? why not?
                  \( (?:
                       [A-Z]{1,5}
                     )
                  \)
             )?
            ## add lookahead/lookbehind
           ##    must be space!!!
           ##   (or comma or  start/end of string)
           ##   kind of \b !!!
            ## positive lookahead
            (?=[ ,;@|\[\]]
                 |$
            )
   )
}ix

P_EN = english helpers (penalty, extra time, …) - penalty variants e.g. p., p, pen, pen., PSO, etc. note - p must go last (shortest match); pso = penalty shootout

'(?: pso | pen\.? | p\.? )'

ET_EN = english helper for (after) extra time - e.g. aet, a.e.t or a.e.t.

'(?: aet | a\.e\.t\.? )'

SCORE__P_ET__RE = note: allow SPECIAL cases WITHOUT full time scores (just a.e.t or pen. + a.e.t.) 3-4 pen. 2-2 a.e.t. 3-4 pen. 2-2 a.e.t. 2-2 a.e.t.

%r{
(?<score>
   \b
    (?:
       (?<p1>\d{1,2}) - (?<p2>\d{1,2})
         [ ]* #{P_EN} [ ]+
     )?             # note: make penalty (P) score optional for now
    (?<et1>\d{1,2}) - (?<et2>\d{1,2})
       [ ]* #{ET_EN}
       (?=[ ,\]]|$)
)}ix

SCORE__P__RE = note: allow SPECIAL with penalty only 3-4 pen.

%r{
        (?<score>
  \b
     (?<p1>\d{1,2}) - (?<p2>\d{1,2})
       [ ]* #{P_EN}
       (?=[ ,\]]|$)
)}ix

SCORE__P_ET_FT_HT__RE = e.g. 3-4 pen. 2-2 a.e.t. (1-1, 1-1) or 3-4p 2-2aet (1-1, ) or 3-4 pen. 2-2 a.e.t. (1-1) or 2-2 a.e.t. (1-1, 1-1) or 2-2 a.e.t. (1-1, ) or 2-2 a.e.t. (1-1)

%r{
          (?<score>
   \b
   (?:
    (?<p1>\d{1,2}) - (?<p2>\d{1,2})
       [ ]* #{P_EN} [ ]+
    )?            # note: make penalty (P) score optional for now
   (?<et1>\d{1,2}) - (?<et2>\d{1,2})
       [ ]* #{ET_EN} [ ]+
       \(
       [ ]*
  (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
       [ ]*
    (?:
         , [ ]*
        (?: (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
            [ ]*
        )?
    )?              # note: make half time (HT) score optional for now
  \)
 (?=[ ,\]]|$)
)}ix

SCORE__P_FT_HT__RE = special case for case WITHOUT extra time!! same as above (but WITHOUT extra time and pen required)

%r{
         (?<score>
            \b
 (?<p1>\d{1,2}) - (?<p2>\d{1,2})
    [ ]* #{P_EN} [ ]+
    \(
    [ ]*
  (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
    [ ]*
 (?:
      , [ ]*
     (?: (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
         [ ]*
     )?
 )?              # note: make half time (HT) score optional for now
   \)
  (?=[ ,\]]|$)
)}ix

SCORE__FT_HT__RE = e.g. 2-1 (1-1) or 2-1

%r{
            (?<score>
 \b
 (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
  (?:
      [ ]+ \( [ ]*
   (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
      [ ]* \)
  )?   # note: make half time (HT) score optional for now
(?=[ ,\]]|$)
)}ix

SCORE_RE = map tables note: order matters; first come-first matched/served

Regexp.union(
  SCORE__P_ET_FT_HT__RE,  # e.g. 5-1 pen. 2-2 a.e.t. (1-1, 1-0)
  SCORE__P_FT_HT__RE,     # e.g. 5-1 pen. (1-1)
  SCORE__P_ET__RE,        # e.g. 2-2 a.e.t.  or  5-1 pen. 2-2 a.e.t.
  SCORE__P__RE,           # e.g. 5-1 pen.
  SCORE__FT_HT__RE,        # e.g. 1-1 (1-0) or 1-1  -- note - must go last!!!
)

STATUS_RE =

%r{
            \[
      (?:    
            ### opt 1 - allow long forms with note/comment for some stati
           (?: (?<status> awarded
                            |
                          annulled
                            |
                          abandoned
               ) [ ;,]* (?<status_note> [^\]]+ )
                 [ ]*
            )
            |
        
            ## opt 2 - short from only (no note/comments)
            (?<status>
               cancelled|canceled|can\.
                 |
               abandoned|abd\.
                 |
               postponed
                 |
               awarded|awd\.
                 |
               replay
                 |
               annulled
            )
      )
    \]
}ix

Class Method Summary collapse

.build_map(lines, downcase: false) ⇒ Object
.build_names(lines) ⇒ Object
.parse_date(str, start:) ⇒ Object

add a date parser helper.
.parse_names(txt) ⇒ Object

Instance Method Summary collapse

#is_group?(text) ⇒ Boolean

keep “old” access to checking for group, round & friends for now for compatibility.
#is_leg?(text) ⇒ Boolean
#is_round?(text) ⇒ Boolean
#log(msg) ⇒ Object
#tokenize(line, debug: false) ⇒ Object

convenience helper - ignore errors by default.
#tokenize_with_errors(line, debug: false) ⇒ Object

Class Method Details

.build_map(lines, downcase: false) ⇒ `Object`

# File 'lib/sportdb/parser/token-date.rb', line 40

def self.build_map( lines, downcase: false )
  ## Build a lookup table that maps every word of a row to the
  ##  row's number (1-based, that is, first row => 1 and NOT 0), e.g.
  ##   {"january" => 1,  "jan" => 1,
  ##    "february" => 2, "feb" => 2,
  ##    "march" => 3,    "mar" => 3, ...}
  ##
  ##  lines    - array of rows; every row is an array of words
  ##  downcase - if true, store the downcased word as key
  ##
  ## returns the (word => row number) hash
  lines.each_with_index.each_with_object( {} ) do |(words, idx), map|
    words.each do |word|
      key = downcase ? word.downcase : word
      map[ key ] = idx + 1
    end
  end
end

.build_names(lines) ⇒ `Object`

# File 'lib/sportdb/parser/token-date.rb', line 33

def self.build_names( lines )
  ## Collapse all rows (arrays of words) into a single
  ##  pipe-separated string - ready for use as a regex alternation, e.g.
  ##   January|Jan|February|Feb|March|Mar|April|Apr|May|June|Jun|...
  per_row = lines.map { |words| words.join( '|' ) }
  per_row.join( '|' )
end

.parse_date(str, start:) ⇒ `Object`

add a date parser helper

# File 'lib/sportdb/parser/token-date.rb', line 160

def self.parse_date( str, start: )
    ## Parse a date string (see DATE_RE) into a Date.
    ##
    ##  str   - text to match against DATE_RE
    ##  start - reference date (responds to year, month & day);
    ##           used to work out the year if the text has none
    ##
    ## returns a Date;
    ##  prints an error and exits (status 1) if str does not match
    m = DATE_RE.match( str )

    if m.nil?
      puts "!! ERROR - unexpected date format; cannot parse >#{str}<"
      exit 1
    end

    year   = m[:year]       ? m[:year].to_i(10)                  : nil
    month  = m[:month_name] ? MONTH_MAP[ m[:month_name].downcase ] : nil
    day    = m[:day]        ? m[:day].to_i(10)                   : nil
    ## note - weekday gets looked up but is NOT used (or checked) below
    _wday  = m[:day_name]   ? DAY_MAP[ m[:day_name].downcase ]     : nil

    ## no year in text? derive it from the start date
    if year.nil?
      on_or_after_start = month > start.month ||
                          (month == start.month && day >= start.day)

      ##  same year as start (e.g. 2013 for 2013/14 season)
      ##    or the year after (e.g. 2014 for 2013/14 season)
      year = on_or_after_start ? start.year : start.year+1
    end

    Date.new( year,month,day )
end

.parse_names(txt) ⇒ `Object`

# File 'lib/sportdb/parser/token-date.rb', line 6

def self.parse_names( txt )
  ## Split a block of text into rows of words.
  ##
  ##  - blank lines and lines starting with # (comments) get skipped
  ##  - inline comments (# until end-of-line) get cut off, e.g.
  ##       Janvier  Janv  Jan  ## check janv in use??
  ##    => Janvier  Janv  Jan
  ##  - words are separated by (one or more) spaces or tabs
  ##
  ## returns an array of rows (every row is an array of words)
  ##
  ## todo/fix -- add check for duplicates
  txt.each_line.each_with_object( [] ) do |raw, rows|
    entry = raw.strip
    next if entry.empty? || entry.start_with?( '#' )

    rows << entry.sub( /#.*/, '' ).strip.split( /[ \t]+/ )
  end
end

Instance Method Details

#is_group?(text) ⇒ `Boolean`

keep “old” access to checking for group, round & friends

for now for compatibility

Returns:

(Boolean)

# File 'lib/sportdb/parser.rb', line 62

## keep "old" access (for compatibility) - forwards to Lang.is_group?
def is_group?( text )
  Lang.is_group?( text )
end

#is_leg?(text) ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/sportdb/parser.rb', line 64

## forwards to Lang.is_leg?
def is_leg?( text )
  Lang.is_leg?( text )
end

#is_round?(text) ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/sportdb/parser.rb', line 63

## forwards to Lang.is_round?
def is_round?( text )
  Lang.is_round?( text )
end

#log(msg) ⇒ `Object`

# File 'lib/sportdb/parser/tokenizer.rb', line 7

def log( msg )
   ## Append msg (plus a newline) to ./logs.txt (opened in utf-8 append mode).
   ##     use ./errors.txt - why? why not?
   logfile = './logs.txt'
   File.open( logfile, 'a:utf-8' ) { |out| out.write( msg ); out.write( "\n" ) }
end

#tokenize(line, debug: false) ⇒ `Object`

convenience helper - ignore errors by default

# File 'lib/sportdb/parser/tokenizer.rb', line 256

def tokenize(  line, debug: false )
   ## Tokenize line and hand back the tokens only;
   ##   the error list from tokenize_with_errors gets dropped.
   tokenize_with_errors( line, debug: debug ).first
end

#tokenize_with_errors(line, debug: false) ⇒ `Object`

# File 'lib/sportdb/parser/tokenizer.rb', line 18

## Tokenize a single line and collect parse errors along the way.
##
##  line  - the text to tokenize
##  debug - if true, print the line, every match, positions and progress
##
## returns a pair [tokens, errors]
##   tokens - array of racc-style tokens, e.g. [:TEXT, "..."] or
##            [:DATE, ["literal", {...}]]; (single or multiple) spaces
##            produce no token
##   errors - array of "parse error - skipping ..." messages for text
##            runs that no regex matched (in the middle or at the end)
##
## raises ArgumentError if a matched time is out-of-range
##
## note - the active regex (@re) is kept between calls:
##    a prop key match switches from RE to PROP_RE and
##    a dot (.) symbol in prop mode switches back to RE.
def tokenize_with_errors( line, debug: false )
  tokens = []
  errors = []   ## keep a list of errors - why? why not?

  puts ">#{line}<"    if debug

  pos = 0
  ## track last offsets - to report error on no match
  ##   or no match in end of string
  offsets = [0,0]
  m = nil


  ####
  ## quick hack - keep re state/mode between tokenize calls!!!
  @re  ||= RE     ## note - switch between RE (top-level) & PROP_RE (inside prop)


  ## note - match( line, pos ) searches from pos onwards;
  ##   the match may start later than pos (see check below)
  while m = @re.match( line, pos )
    if debug
      pp m
      puts "pos: #{pos}"
    end
    offsets = [m.begin(0), m.end(0)]

    if offsets[0] != pos
      ## match NOT starting at start/begin position!!!
      ##  report parse error!!!
      ##  (printed, added to errors and appended to the log file)
      msg =  "!! WARN - parse error - skipping >#{line[pos..(offsets[0]-1)]}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
      puts msg

      errors << "parse error - skipping >#{line[pos..(offsets[0]-1)]}< @#{offsets[0]},#{offsets[1]}"
      log( msg )
    end

    ##
    ## todo/fix - also check if possible
    ##   if no match but not yet end off string!!!!
    ##    report skipped text run too!!!

    ## continue after the end of this match
    pos = offsets[1]

    pp offsets   if debug

    ##
    ## note: racc requires pairs e.g. [:TOKEN, VAL]
    ##         for VAL use "text" or ["text", { opts }]  array


  ## note - t stays nil for matches that produce no token (e.g. spaces)
  t = if @re == PROP_RE
         if m[:space]
              ## skip space
              nil
         elsif m[:spaces]
              ## skip spaces
              nil
         elsif m[:prop_name]
               ## note - single letter Y and R get their own card tokens
               if m[:name] == 'Y'
                 [:YELLOW_CARD, m[:name]]
               elsif m[:name] == 'R'
                 [:RED_CARD, m[:name]]
               else 
                 [:PROP_NAME, m[:name]]
               end
         elsif m[:minute]
              minute = {}
              minute[:m]      = m[:value].to_i(10)
              minute[:offset] = m[:value2].to_i(10)   if m[:value2]
             ## note - for debugging keep (pass along) "literal" minute
             [:MINUTE, [m[:minute], minute]]
         elsif m[:sym]
            sym = m[:sym]
            ## return symbols "inline" as is - why? why not?
            ## see PROP_BASICS_RE - (?<sym>[.;,\(\)\[\]-])
 
            case sym
            when ',' then [:',']
            when ';' then [:';']
            when '[' then [:'[']
            when ']' then [:']']
            when '(' then [:'(']
            when ')' then [:')']
            when '-' then [:'-']
            when '.' then 
                ## switch back to top-level mode!!
                ## NOTE(review): printed always (not only if debug) - confirm intended
                puts "  LEAVE PROP_RE MODE, BACK TO TOP_LEVEL/RE"
                @re = RE 
                [:'.']
            else
              nil  ## ignore others (none expected - all symbols in the sym set are listed above)
            end
         else
            ## report error
             puts "!!! TOKENIZE ERROR (PROP_RE) - no match found"
             nil 
         end
      else  ## assume TOP_LEVEL (a.k.a. RE) machinery
        if m[:space]
           ## skip space
           nil
        elsif m[:spaces]
           ## skip spaces
           nil
        elsif m[:prop_key]
           ##  switch context  to PROP_RE
           ## NOTE(review): printed always (not only if debug) - confirm intended
           @re = PROP_RE
           puts "  ENTER PROP_RE MODE"
           [:PROP, m[:key]]
        elsif m[:text]
          [:TEXT, m[:text]]   ## keep pos - why? why not?
        elsif m[:status]   ## (match) status e.g. cancelled, awarded, etc.
          ## todo/check - add text (or status) 
          #     to opts hash {} by default (for value)
          if m[:status_note]   ## includes note? e.g.  awarded; originally 2-0
             [:STATUS, [m[:status], {status: m[:status], 
                                     note:   m[:status_note]} ]]
          else
             [:STATUS, [m[:status], {status: m[:status] } ]]
          end
        elsif m[:time]
              ## unify to iso-format
              ###   12.40 => 12:40
              ##    12h40 => 12:40 etc.
              ##  keep string (no time-only type in ruby)
              hour =   m[:hour].to_i(10)  ## allow 08/07/etc.
              minute = m[:minute].to_i(10)
              ## check if valid -  0:00 - 24:00
              ##   check if 24:00 possible? or only 0:00 (23:59)
              ##   note - as written hour 24 passes with any minute 0-59 (e.g. 24:30)
              if (hour >= 0 && hour <= 24) &&
                 (minute >=0 && minute <= 59)
               ## note - for debugging keep (pass along) "literal" time
               ##   might use/add support for am/pm later
               [:TIME, [m[:time], {h:hour,m:minute}]]
              else
                 raise ArgumentError, "parse error - time >#{m[:time]}< out-of-range"
              end
        elsif m[:date]
            date = {}
 ## map month names
 ## note - allow any/upcase JULY/JUL etc. thus ALWAYS downcase for lookup
            date[:y] = m[:year].to_i(10)  if m[:year]
            date[:m] = MONTH_MAP[ m[:month_name].downcase ]   if m[:month_name]
            date[:d]  = m[:day].to_i(10)   if m[:day]
            date[:wday] = DAY_MAP[ m[:day_name].downcase ]   if m[:day_name]
            ## note - for debugging keep (pass along) "literal" date
            [:DATE, [m[:date], date]]
        elsif m[:timezone]
          [:TIMEZONE, m[:timezone]]
        elsif m[:duration]
            ## todo/check/fix - if end: works for kwargs!!!!!
            duration = { start: {}, end: {}}
            duration[:start][:y] = m[:year1].to_i(10)  if m[:year1]
            duration[:start][:m] = MONTH_MAP[ m[:month_name1].downcase ]   if m[:month_name1]
            duration[:start][:d]  = m[:day1].to_i(10)   if m[:day1]
            duration[:start][:wday] = DAY_MAP[ m[:day_name1].downcase ]   if m[:day_name1]
            duration[:end][:y] = m[:year2].to_i(10)  if m[:year2]
            duration[:end][:m] = MONTH_MAP[ m[:month_name2].downcase ]   if m[:month_name2]
            duration[:end][:d]  = m[:day2].to_i(10)   if m[:day2]
            duration[:end][:wday] = DAY_MAP[ m[:day_name2].downcase ]   if m[:day_name2]
            ## note - for debugging keep (pass along) "literal" duration
            [:DURATION, [m[:duration], duration]]
        elsif m[:num]   ## fix - change to ord (for ordinal number!!!)
              ## note -  strip enclosing () and convert to integer
             [:ORD, [m[:num], { value: m[:value].to_i(10) } ]]
        elsif m[:score]
              score = {}
              ## check for pen
              ##  note - only score parts with both sides captured get added
              score[:p] = [m[:p1].to_i(10),
                           m[:p2].to_i(10)]  if m[:p1] && m[:p2]
              score[:et] = [m[:et1].to_i(10),
                            m[:et2].to_i(10)]  if m[:et1] && m[:et2]
              score[:ft] = [m[:ft1].to_i(10),
                            m[:ft2].to_i(10)]  if m[:ft1] && m[:ft2]
              score[:ht] = [m[:ht1].to_i(10),
                            m[:ht2].to_i(10)]  if m[:ht1] && m[:ht2]

            ## note - for debugging keep (pass along) "literal" score
            [:SCORE, [m[:score], score]]
        elsif m[:minute]
              minute = {}
              minute[:m]      = m[:value].to_i(10)
              minute[:offset] = m[:value2].to_i(10)   if m[:value2]
             ## note - for debugging keep (pass along) "literal" minute
             [:MINUTE, [m[:minute], minute]]
        elsif m[:og]
           [:OG, m[:og]]    ## for typed drop - string version/variants ??  why? why not?
        elsif m[:pen]
           [:PEN, m[:pen]]
        elsif m[:vs]
           [:VS, m[:vs]]
        elsif m[:sym]
          sym = m[:sym]
          ## return symbols "inline" as is - why? why not?
          ## see BASICS_RE - (?<sym>[;,@|\[\]-])
 
          case sym
          when ',' then [:',']
          when ';' then [:';']
          when '@' then [:'@']
          when '|' then [:'|']
          when '[' then [:'[']
          when ']' then [:']']
          when '-' then [:'-']
          else
            nil  ## ignore others (none expected - all symbols in the sym set are listed above)
          end
        else
          ## report error
           puts "!!! TOKENIZE ERROR - no match found"
           nil 
        end
      end


    tokens << t    if t

    if debug
      ## show progress - consumed chars as stars (*) followed by the rest of the line
      print ">"
      print "*" * pos
      puts "#{line[pos..-1]}<"
    end
  end

  ## check if no match in end of string
  ##   (offsets holds the last match or [0,0] if there was no match at all)
  if offsets[1] != line.size
    msg =  "!! WARN - parse error - skipping >#{line[offsets[1]..-1]}< @#{offsets[1]},#{line.size} in line >#{line}<"
    puts msg
    log( msg )

    errors << "parse error - skipping >#{line[offsets[1]..-1]}< @#{offsets[1]},#{line.size}"
  end


  [tokens,errors]
end

Class: SportDb::Parser

Constant Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.build_map(lines, downcase: false) ⇒ Object

.build_names(lines) ⇒ Object

.parse_date(str, start:) ⇒ Object

.parse_names(txt) ⇒ Object

Instance Method Details

#is_group?(text) ⇒ Boolean

#is_leg?(text) ⇒ Boolean

#is_round?(text) ⇒ Boolean

#log(msg) ⇒ Object

#tokenize(line, debug: false) ⇒ Object

#tokenize_with_errors(line, debug: false) ⇒ Object

.build_map(lines, downcase: false) ⇒ `Object`

.build_names(lines) ⇒ `Object`

.parse_date(str, start:) ⇒ `Object`

.parse_names(txt) ⇒ `Object`

#is_group?(text) ⇒ `Boolean`

#is_leg?(text) ⇒ `Boolean`

#is_round?(text) ⇒ `Boolean`

#log(msg) ⇒ `Object`

#tokenize(line, debug: false) ⇒ `Object`

#tokenize_with_errors(line, debug: false) ⇒ `Object`