Class: SportDb::Parser

Inherits:
Object
  • Object
show all
Defined in:
lib/sportdb/parser/lang.rb,
lib/sportdb/parser/token.rb,
lib/sportdb/parser/token-date.rb,
lib/sportdb/parser/token-text.rb,
lib/sportdb/parser/token-score.rb

Constant Summary collapse

GROUP_RE =

Group A-Z, Group 1-99, Group HEX (used in CONCACAF World Cup qualifying), Group 1A or A1, B1 - used anywhere

use "key" of group - why? why not?
%r{^
  Group [ ]
     (?<key>[a-z0-9]+)
$}ix
ROUND_RE =
%r{^(
   ## add special case for group play-off rounds!
   ##  group 2 play-off   (e.g. worldcup 1954, 1958)
   ##
   ### note - allow Group ("stand-alone") as "generic" round for now
   ##      BUT do NOT allow Group 1, Group 2, Group A, Group B, etc.
     (?: Group [ ] [A-Z0-9]+ [ ] Play-?offs?  |
         Group (?: [ ] phase)?  |
         League (?: [ ] phase)?
     )
        |
   # round  - note - requiers number e.g. round 1,2, etc.
   #   note - use 1-9 regex (cannot start with 0) - why? why not?
   #             make week 01 or round 01 or matchday 01 possible?
      (?: (?: Round |
              Matchday |
              Week
           )
           [ ] [1-9][0-9]*
      )
       |
   ##  starting with qual(ification)
   ##   Qual. Round 1 / Qual. Round 2 / Qual. Round 3
   ##  or
   ##  Playoff Round 1
   ##  Play-in Round 1
     (?:  (?: Qual \. |
              Play-?off |
              Play-?in
          )
           [ ] Round [ ] [1-9][0-9]* )
       |
   ## 1. Round / 2. Round / 3. Round / etc.
   ##  First Round
   ##  Play-off Round
   ##  Final Round   (e.g. Worldcup 1950)
      (?:
           (?: [1-9][0-9]* \.  |
                1st | First   |
                2nd | Second  |
                Play-?off   |
                Final
           )
             [ ] Round
       )
       |
  ## starting with preliminary
  #   e.g.  Preliminary round
     (?:  Preliminary  [ ]
           (?:  Round |
                Semi-?finals |
                Final
           )
     )
     |
   # more (kockout) rounds
   # playoffs  - playoff, play-off, play-offs
        (?: Play-?offs?
           (?: [ ]for[ ]quarter-?finals )?
        )
        |
   # round32
        (?: Round[ ]of[ ]32 |
            Last[ ]32 )
          |
   # round16
        (?: Round[ ]of[ ]16 |
            Last[ ]16 |
            8th[ ]finals )
           |
   # fifthplace
         (?:
             (?: (Fifth|5th)[ -]place
                  (?: [ ] (?: match|play-?off|final ))?
              ) |
             (?: Match[ ]for[ ](?: fifth|5th )[ -]place )
         )
          |
   # thirdplace
          (?:
              (?: (Third|3rd)[ -]place
                     (?: [ ] (?: match|play-?off|final ))?
               ) |
              (?: Match[ ]for[ ](?: third|3rd )[ -]place )
           )
           |
   # quarterfinals
         (?:
              Quarter-?finals? |
              Quarters |
              Last[ ]8
          )
          |
   # semifinals
        (?:
             Semi-?finals? |
             Semis |
             Last[ ]4
        )
        |
   # final
         Finals?
         |
   # decider e.g. Entscheidungsspiel
         Decider
         |
    ## add replays
    ##  e.g. Final Replay
    ##       Quarter-finals replays
    ##       First round replays
     (?:
        (?: First [ ] Round |
            Quarter-?finals? |
            Finals?
         )
        [ ] Replays?
      )
     |
  ## more
     (?:
          Reclassification
     )
)$}ix
LEG_RE =

keep leg separate (from round) - why? why not?

%r{^
  # leg1
     (?: 1st|First)[ ]leg
     |
  # leg2
     (?: 2nd|Second)[ ]leg
$}ix
TIME_RE =

keep 18h30 - why? why not?

add support for 6:30pm 8:20am etc. - why? why not?
%r{
    ## e.g. 18.30 (or 18:30 or 18h30)
    (?<time>  \b
              (?<hour>\d{1,2})
                 (?: :|\.|h )
              (?<minute>\d{2})
              \b
    )
}ix
TIMEZONE_RE =

for timezone format use for now: (BRT/UTC-3) (e.g. brazil time)

(CET/UTC+1) - central european time; (CEST/UTC+2) - central european summer time - daylight saving time (DST). (EET/UTC+2) - eastern european time; (EEST/UTC+3) - eastern european summer time - daylight saving time (DST).

UTC+3 UTC+4 UTC+0 UTC+00 UTC+0000

- allow +01 or +0100  - why? why not
-       +0130 (01:30)

see

https://en.wikipedia.org/wiki/Time_zone
https://en.wikipedia.org/wiki/List_of_UTC_offsets
https://en.wikipedia.org/wiki/UTC−04:00  etc.
%r{
   ## e.g. (UTC-2) or (CEST/UTC-2) etc.
   (?<timezone>
      \(
           ## optional "local" timezone name eg. BRT or CEST etc.
           (?:  [a-z]+
                 /
           )?
            [a-z]+
            [+-]
            \d{1,4}   ## e.g. 0 or 00 or 0000
      \)
   )
}ix
BASICS_RE =
%r{
    ## e.g. (51) or (1) etc.  - limit digits of number???
    (?<num> \(  (?<value>\d+) \) )
       |
    (?<vs>
       (?<=[ ])	# Positive lookbehind for space
       (?:
          vs|v
       )  
           # not bigger match first e.g. vs than v etc.
           # todo/fix - make vs|v case sensitive!!! only match v/vs - why? why not?
       (?=[ ])   # positive lookahead for space
    )
       |
    (?<spaces> [ ]{2,}) |
    (?<space>  [ ])
        |
    (?<sym>[;,@|\[\]-])
}ix
MINUTE_RE =
%r{
     (?<minute>
       (?<=[ ])	 # Positive lookbehind for space required
           (?<value>\d{1,3})      ## constrain numbers to 0 to 999!!!
        (?: \+
            (?<value2>\d{1,3})
        )?
        '     ## must have minute marker!!!!
     )
}ix
GOAL_PEN_RE =

goal types (pen.) or (pen) or (p.) or (p) (o.g.) or (og)

%r{
   (?<pen> \(
           (?:pen|p)\.?
           \)
    )
}ix
GOAL_OG_RE =
%r{
   (?<og> \(
          (?:og|o\.g\.)
          \)
   )
}ix
RE =
Regexp.union( STATUS_RE,
                    TIMEZONE_RE,
TIME_RE,
DURATION_RE,  # note - duration MUST match before date
                    DATE_RE,
                    SCORE_RE,
                    BASICS_RE, MINUTE_RE,
                    GOAL_OG_RE, GOAL_PEN_RE,
TEXT_RE )
MONTH_LINES =
parse_names( <<TXT )
January    Jan
February   Feb
March      Mar
April      Apr
May
June       Jun
July       Jul
August     Aug
September  Sept  Sep
October    Oct
November   Nov
December   Dec
TXT
MONTH_NAMES =
build_names( MONTH_LINES )
MONTH_MAP =

pp MONTH_NAMES

build_map( MONTH_LINES, downcase: true )
DAY_LINES =
parse_names( <<TXT )
Monday                   Mon  Mo
Tuesday            Tues  Tue  Tu
Wednesday                Wed  We
Thursday    Thurs  Thur  Thu  Th
Friday                   Fri  Fr
Saturday                 Sat  Sa
Sunday                   Sun  Su
TXT
DAY_NAMES =
build_names( DAY_LINES )
DAY_MAP =

pp DAY_NAMES

build_map( DAY_LINES, downcase: true )
DATE_I_RE =

e.g. Fri Aug/9 or Fri Aug 9

%r{
(?<date>
  \b
     ## optional day name
     ((?<day_name>#{DAY_NAMES})
          [ ]
     )?
     (?<month_name>#{MONTH_NAMES})
         (?: \/|[ ] )
     (?<day>\d{1,2})
     ## optional year
     (  [ ]
        (?<year>\d{4})
     )?
  \b
)}ix
DATE_II_RE =

e.g. 3 June or 10 June

%r{
(?<date>
  \b
     ## optional day name
     ((?<day_name>#{DAY_NAMES})
          [ ]
     )?
     (?<day>\d{1,2})
         [ ]
     (?<month_name>#{MONTH_NAMES})
     ## optional year
     (  [ ]
        (?<year>\d{4})
     )?
  \b
)}ix
DATE_RE =

map tables

note: order matters; first come-first matched/served
Regexp.union(
   DATE_I_RE,
   DATE_II_RE
)
DURATION_I_RE =

todo add plus later on - why? why not?

%r{
(?<duration>
    \b
   ## optional day name
   ((?<day_name1>#{DAY_NAMES})
      [ ]
   )?
   (?<month_name1>#{MONTH_NAMES})
      (?: \/|[ ] )
   (?<day1>\d{1,2})
   ## optional year
   ( [ ]
      (?<year1>\d{4})
   )?

   ## support + and -  (add .. or such - why??)
   [ ]*[-][ ]*

   ## optional day name
   ((?<day_name2>#{DAY_NAMES})
      [ ]
   )?
   (?<month_name2>#{MONTH_NAMES})
      (?: \/|[ ] )
   (?<day2>\d{1,2})
   ## optional year
   ( [ ]
      (?<year2>\d{4})
   )?
   \b
)}ix
DURATION_II_RE =

variant ii e.g. 26 July - 27 July

%r{
(?<duration>
    \b
   ## optional day name
   ((?<day_name1>#{DAY_NAMES})
      [ ]
   )?
   (?<day1>\d{1,2})
      [ ]
   (?<month_name1>#{MONTH_NAMES})
   ## optional year
   ( [ ]
      (?<year1>\d{4})
   )?

   ## support + and -  (add .. or such - why??)
   [ ]*[-][ ]*

   ## optional day name
   ((?<day_name2>#{DAY_NAMES})
      [ ]
   )?
   (?<day2>\d{1,2})
      [ ]
   (?<month_name2>#{MONTH_NAMES})
   ## optional year
   ( [ ]
      (?<year2>\d{4})
   )?
   \b
)}ix
DURATION_RE =

map tables

note: order matters; first come-first matched/served
Regexp.union(
   DURATION_I_RE,
   DURATION_II_RE
)
TEXT_RE =
%r{
    ## must start with alpha (allow unicode letters!!)
    (?<text>
           ## positive lookbehind
           ##  (MUST be fixed number of chars - no quantifier e.g. +? etc.)
            (?<=[ ,;@|\[\]]
                 |^
            )
            (?:
                # opt 1 - start with alpha
                 \p{L}+    ## all unicode letters (e.g. [a-z])
                   |

                # opt 2 - start with num!! - allow special case (e.g. 1. FC)
                     \d+  # check for num lookahead (MUST be space or dot)
                      ## MUST be followed by (optional dot) and
                      ##                      required space !!!
                      ## MUST be follow by a to z!!!!
                      \.?     ## optional dot
                      [ ]?   ## make space optional too  - why? why not?
                             ##  yes - eg. 1st, 2nd, 5th etc.
                       \p{L}+
                  |
                ## opt 3 - add weirdo case
                ##   e.g. 5.-8. Platz Playoffs  - keep - why? why not?
                    \d+\.-\d+\.  [ ]? \p{L}+
               )

              (?:(?:  (?:[ ]
                        (?!vs?[ ])    ## note - exclude (v[ ]/vs[ ])
                       )
                      |     # only single spaces allowed inline!!!
                     [-]
                  )?
                (?:
                  \p{L} |
                  [&/'°]
                    |
                 (?:
                   \d+
                   (?!
                     [0-9h'+-] |    ## protected break on 12h / 12' / 1-1
                                    ##  check usege for 3+4 - possible? where ? why?
                     (?:[.:]\d)     ## protected/exclude/break on 12.03 / 12:03
                    )
                   ## negative lookahead for numbers
                   ##   note - include digits itself!!!
                   ##   note - remove / (slash) e.g. allows UDI'19/Beter Bed
                 )|
                 \.
               )
              )*  ## must NOT end with space or dash(-)
              ##  todo/fix - possible in regex here
              ##     only end in alphanum a-z0-9 (not dot or & ???)


            ## allow optional at the end
            ##  tag or year
            ##   make it and in the future - why? why not?
            ##
            ## change - fix
            ##   do NOT use (A) for amateur
            ##   use A or A. with NO ()!!!
            ## (A) -    allow with predined  alpha only for now
            ##          e.g. (A) - amateur a team or b?
            ###  same for U21 or U9 etc
            ##        use with NO ()!!! - why? why not?
            ##      or U21 U9 etc.   - why? why not?
            ##       or etc.
            ## (1879-1893) or allow years e.g. (1879-1893)
            ###
            ##    add allow country code three to five letters for now
            ##       change to generic 1 to 5 - why? why not?
            ##     e.g. (A), (I),
            ##          (AUT)
            ##          (TRNC)   five? for UEFA code for northern cyprus
            ##     change to 1 to 4 - why? why not?
            ##   check - fix possible for upper case only here
            ##                     inline for this group only?
            (?:
               [ ]
               \(
                  \d{4}-\d{4}
               \)
            )?
             (?:
               [ ]+   ## allow more than once space - why? why not?
                  \( (?:
                       [A-Z]{1,5}
                     )
                  \)
             )?
            ## add lookahead/lookbehind
           ##    must be space!!!
           ##   (or comma or  start/end of string)
           ##   kind of \b !!!
            ## positive lookahead
            (?=[ ,;@|\[\]]
                 |$
            )
   )
}ix
P_EN =

english helpers (penalty, extra time, …)

note - p must go last (shortest match)
  pso = penalty shootout
'(?: pso | pen\.? | p\.? )'
ET_EN =

e.g. p., p, pen, pen., PSO, etc.

'(?: aet | a\.e\.t\.? )'
SCORE__P_ET__RE =

note: allow SPECIAL cases WITHOUT full time scores (just a.e.t or pen. + a.e.t.)

3-4 pen. 2-2 a.e.t.
3-4 pen.   2-2 a.e.t.
         2-2 a.e.t.
%r{
(?<score>
   \b
    (?:
       (?<p1>\d{1,2}) - (?<p2>\d{1,2})
         [ ]* #{P_EN} [ ]+
     )?             # note: make penalty (P) score optional for now
    (?<et1>\d{1,2}) - (?<et2>\d{1,2})
       [ ]* #{ET_EN}
       (?=[ \]]|$)
)}ix
SCORE__P__RE =

note: allow SPECIAL with penalty only

3-4 pen.
%r{
        (?<score>
  \b
     (?<p1>\d{1,2}) - (?<p2>\d{1,2})
       [ ]* #{P_EN}
       (?=[ \]]|$)
)}ix
SCORE__P_ET_FT_HT__RE =

e.g. 3-4 pen. 2-2 a.e.t. (1-1, 1-1) or

3-4p 2-2aet (1-1, )     or
3-4 pen.  2-2 a.e.t. (1-1)       or
         2-2 a.e.t. (1-1, 1-1)  or
         2-2 a.e.t. (1-1, )     or
         2-2 a.e.t. (1-1)
%r{
          (?<score>
   \b
   (?:
    (?<p1>\d{1,2}) - (?<p2>\d{1,2})
       [ ]* #{P_EN} [ ]+
    )?            # note: make penalty (P) score optional for now
   (?<et1>\d{1,2}) - (?<et2>\d{1,2})
       [ ]* #{ET_EN} [ ]+
       \(
       [ ]*
  (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
       [ ]*
    (?:
         , [ ]*
        (?: (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
            [ ]*
        )?
    )?              # note: make half time (HT) score optional for now
  \)
 (?=[ \]]|$)
)}ix
SCORE__P_FT_HT__RE =

special case for case WITHOUT extra time!!

same as above (but WITHOUT extra time and pen required)
%r{
         (?<score>
            \b
 (?<p1>\d{1,2}) - (?<p2>\d{1,2})
    [ ]* #{P_EN} [ ]+
    \(
    [ ]*
  (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
    [ ]*
 (?:
      , [ ]*
     (?: (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
         [ ]*
     )?
 )?              # note: make half time (HT) score optional for now
   \)
  (?=[ \]]|$)
)}ix
SCORE__FT_HT__RE =

e.g. 2-1 (1-1) or

2-1
%r{
            (?<score>
 \b
 (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
  (?:
      [ ]+ \( [ ]*
   (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
      [ ]* \)
  )?   # note: make half time (HT) score optional for now
(?=[ \]]|$)
)}ix
SCORE_RE =

map tables

note: order matters; first come-first matched/served
Regexp.union(
  SCORE__P_ET_FT_HT__RE,  # e.g. 5-1 pen. 2-2 a.e.t. (1-1, 1-0)
  SCORE__P_FT_HT__RE,     # e.g. 5-1 pen. (1-1)
  SCORE__P_ET__RE,        # e.g. 2-2 a.e.t.  or  5-1 pen. 2-2 a.e.t.
  SCORE__P__RE,           # e.g. 5-1 pen.
  SCORE__FT_HT__RE,        # e.g. 1-1 (1-0) or 1-1  -- note - must go last!!!
)

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.build_map(lines, downcase: false) ⇒ Object



40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# File 'lib/sportdb/parser/token-date.rb', line 40

def self.build_map( lines, downcase: false )
  ## build a lookup table that maps every word to the (1-based) number
  ##  of the line it appears on, e.g.
  ##   {"january" => 1,  "jan" => 1,
  ##    "february" => 2, "feb" => 2,
  ##    "march" => 3,    "mar" => 3, ...
  ##
  ##  lines    - array of lines; every line is an array of words
  ##  downcase - if true, use the downcased word as key
  ##
  ## note: numbering starts with 1 (and NOT zero-based, that is, 0)
  lookup = {}
  lines.each.with_index( 1 ) do |words, no|
    words.each do |word|
      key = downcase ? word.downcase : word
      lookup[ key ] = no
    end
  end
  lookup
end

.build_names(lines) ⇒ Object



33
34
35
36
37
# File 'lib/sportdb/parser/token-date.rb', line 33

def self.build_names( lines )
  ## turn the lines (arrays of words) into one alternation string
  ##  ready for interpolation into a regex, e.g.
  ##   January|Jan|February|Feb|March|Mar|April|Apr|May|June|Jun|...
  per_line = lines.map do |words|
    words.join( '|' )
  end
  per_line.join( '|' )
end

.more_round_names ⇒ Object



180
181
182
183
184
185
186
187
188
189
190
191
# File 'lib/sportdb/parser/lang.rb', line 180

def self.more_round_names
   ## extra round names read from the per-language
   ##   config/rounds_<lang>.txt files (in the order en, de, es, pt, misc);
   ##   the list gets built on first use and cached afterwards
   return @more_round_name   if @more_round_name

   ## sort names by length??
   @more_round_name = ['en', 'de', 'es', 'pt', 'misc'].reduce( [] ) do |names, lang|
                         names + read_names( "#{SportDb::Module::Parser.root}/config/rounds_#{lang}.txt" )
                      end
end

.parse_date(str, start:) ⇒ Object

add a date parser helper



160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
# File 'lib/sportdb/parser/token-date.rb', line 160

## Parse a date string (anything DATE_RE matches, e.g. "Fri Aug/9"
##  or "3 June 2024") into a Date.
##
##  str   - text to parse
##  start - reference date (must respond to year, month and day);
##          used to work out the year if the text has none:
##          a month/day on or after start gets start.year,
##          an earlier month/day gets start.year+1
##          (e.g. 2013 or 2014 for a 2013/14 season)
##
##  Returns a Date.
##  If str does not match DATE_RE, prints an error message and
##  terminates via exit 1.
##
##  note - a matched day name (e.g. Fri) is NOT used for the result;
##           the former (unused) weekday lookup got removed
def self.parse_date( str, start: )
    m = DATE_RE.match( str )

    unless m
      puts "!! ERROR - unexpected date format; cannot parse >#{str}<"
      exit 1
    end

    month = MONTH_MAP[ m[:month_name].downcase ]   if m[:month_name]
    day   = m[:day].to_i(10)                       if m[:day]

    year  = if m[:year]
              m[:year].to_i(10)
            elsif month > start.month ||
                  (month == start.month && day >= start.day)
              # assume same year as start_at event (e.g. 2013 for 2013/14 season)
              start.year
            else
              # assume year+1 as start_at event (e.g. 2014 for 2013/14 season)
              start.year+1
            end

    Date.new( year,month,day )
end

.parse_names(txt) ⇒ Object



6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# File 'lib/sportdb/parser/token-date.rb', line 6

def self.parse_names( txt )
  ## split the text into lines and every line into words;
  ##   returns an array of lines (with words)
  ##
  ##  blank lines and lines starting with # get skipped;
  ##  inline (until end-of-line) comments get stripped too
  ##   e.g. Janvier  Janv  Jan  ## check janv in use??
  ##   =>   Janvier  Janv  Jan
  ##
  ## todo/fix -- add check for duplicates
  txt.each_line.each_with_object( [] ) do |raw, rows|
    text = raw.strip
    next if text.empty? || text.start_with?( '#' )

    rows << text.sub( /#.*/, '' ).strip.split( /[ \t]+/ )
  end
end

.read_names(path) ⇒ Object

add more round names in different languages

  via txt files

for now must match case - maybe make case-insensitive later - why? why not?


158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
# File 'lib/sportdb/parser/lang.rb', line 158

def self.read_names( path )
     ## read the text at path (via read_text) and
     ##   return an array of names - one name per line
     ##
     ##  blank lines and lines starting with # get skipped;
     ##  inline (until end-of-line) comments get stripped too
     ##   e.g. Janvier  Janv  Jan  ## check janv in use??
     ##   =>   Janvier  Janv  Jan
     read_text( path ).each_line.each_with_object( [] ) do |raw, names|
       text = raw.strip
       next if text.empty? || text.start_with?( '#' )

       names << text.sub( /#.*/, '' ).strip
     end
end

Instance Method Details

#is_group?(text) ⇒ Boolean

Returns:

  • (Boolean)


20
21
22
23
# File 'lib/sportdb/parser/lang.rb', line 20

## Check if text is a group heading, that is,
##   the whole text matches GROUP_RE (e.g. Group A, Group 1, Group HEX).
##
##  Returns true or false.
def is_group?( text )
   ## note - match? only tests; no match data gets built
   GROUP_RE.match?( text )
end

#is_leg?(text) ⇒ Boolean

Pair matches/games if marked with leg 1 and leg 2

Returns:

  • (Boolean)


211
212
213
# File 'lib/sportdb/parser/lang.rb', line 211

## Check if text is a leg marker, that is,
##   the whole text matches LEG_RE (e.g. 1st leg, First leg, 2nd leg, Second leg).
##
##  Returns true or false.
def is_leg?( text )
   LEG_RE.match?( text )
end

#is_round?(text) ⇒ Boolean

Returns:

  • (Boolean)


194
195
196
197
# File 'lib/sportdb/parser/lang.rb', line 194

## Check if text is a round name:
##   first try the built-in ROUND_RE pattern,
##   then fall back to the extra names from more_round_names
##    (note - the extra names must match exactly, incl. case).
def is_round?( text )
    return true   if ROUND_RE.match?( text )

    self.class.more_round_names.include?( text )
end

#log(msg) ⇒ Object



150
151
152
153
154
155
156
157
# File 'lib/sportdb/parser/token.rb', line 150

def log( msg )
   ## append msg (plus a newline) to ./logs.txt (utf-8)
   ##     use ./errors.txt - why? why not?
   out = File.open( './logs.txt', 'a:utf-8' )
   begin
     out.write( msg )
     out.write( "\n" )
   ensure
     out.close   ## always close - even if a write fails
   end
end

#tokenize(line, debug: false) ⇒ Object

convenience helper - ignore errors by default



349
350
351
352
# File 'lib/sportdb/parser/token.rb', line 349

## Tokenize line and return the tokens only;
##   the errors collected by tokenize_with_errors get dropped.
def tokenize(  line, debug: false )
   tokenize_with_errors( line, debug: debug ).first
end

#tokenize_with_errors(line, debug: false) ⇒ Object



161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
# File 'lib/sportdb/parser/token.rb', line 161

## Tokenize a single line and collect parse errors along the way.
##
##  Matches RE against line again and again, always continuing
##  at the end of the previous match. Text that gets skipped
##  (before a match or after the last match) is reported as
##  a parse error - printed with puts, appended to the log file
##  via log, and added to errors.
##
##   line  - text to tokenize
##   debug - if true, print the matches and positions while scanning
##
##  Returns a two-element array [tokens, errors]:
##    tokens - array of token arrays, e.g. [:TEXT, "..."],
##               [:TIME, ["18.30", {h:18, m:30}]] or [:','] for symbols;
##               spaces produce no token
##    errors - array of parse error messages (strings)
##
##  Raises ArgumentError if a matched time is out of range
##    (hour not in 0..24 or minute not in 0..59).
def tokenize_with_errors( line, debug: false )
  tokens = []
  errors = []   ## keep a list of errors - why? why not?

  puts ">#{line}<"    if debug

  pos = 0
  ## track last offsets - to report error on no match
  ##   or no match in end of string
  offsets = [0,0]
  m = nil

  while m = RE.match( line, pos )
    if debug
      pp m
      puts "pos: #{pos}"
    end
    offsets = [m.begin(0), m.end(0)]

    if offsets[0] != pos
      ## match NOT starting at start/begin position!!!
      ##  report parse error!!!
      ##   (the skipped text is everything from pos up to the match start)
      msg =  "!! WARN - parse error - skipping >#{line[pos..(offsets[0]-1)]}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
      puts msg

      errors << "parse error - skipping >#{line[pos..(offsets[0]-1)]}< @#{offsets[0]},#{offsets[1]}"
      log( msg )
    end

    ##
    ## todo/fix - also check if possible
    ##   if no match but not yet end of string!!!!
    ##    report skipped text run too!!!

    ## continue scanning right after this match
    pos = offsets[1]

    pp offsets   if debug

    ##
    ## note: racc requires pairs e.g. [:TOKEN, VAL]
    ##         for VAL use "text" or ["text", { opts }]  array

    ## note - the order of the checks matters;
    ##   RE is a union and some capture names are used by more than
    ##   one pattern (e.g. minute in TIME_RE and MINUTE_RE,
    ##   value in BASICS_RE and MINUTE_RE)
    t = if m[:space]
           ## skip space
           nil
        elsif m[:spaces]
           ## skip spaces
           nil
        elsif m[:text]
          [:TEXT, m[:text]]   ## keep pos - why? why not?
        elsif m[:status]   ## (match) status e.g. cancelled, awarded, etc.
          ## todo/check - add text (or status) 
          #     to opts hash {} by default (for value)
          if m[:status_note]   ## includes note? e.g.  awarded; originally 2-0
             [:STATUS, [m[:status], {status: m[:status], 
                                     note:   m[:status_note]} ]]
          else
             [:STATUS, [m[:status], {status: m[:status] } ]]
          end
        elsif m[:time]
              ## unify to iso-format
              ###   12.40 => 12:40
              ##    12h40 => 12:40 etc.
              ##  keep string (no time-only type in ruby)
              hour =   m[:hour].to_i(10)  ## allow 08/07/etc.
              minute = m[:minute].to_i(10)
              ## check if valid -  0:00 - 24:00
              ##   check if 24:00 possible? or only 0:00 (23:59)
              if (hour >= 0 && hour <= 24) &&
                 (minute >=0 && minute <= 59)
               ## note - for debugging keep (pass along) "literal" time
               ##   might use/add support for am/pm later
               [:TIME, [m[:time], {h:hour,m:minute}]]
              else
                 raise ArgumentError, "parse error - time >#{m[:time]}< out-of-range"
              end
        elsif m[:date]
            date = {}
=begin
            ((?<day_name>#{DAY_NAMES})
            [ ]
       )?
       (?<month_name>#{MONTH_NAMES})
           (?: \/|[ ] )
       (?<day>\d{1,2})
       ## optional year
       (  [ ]
          (?<year>\d{4})
       )?
=end
 ## map month names
 ## note - allow any/upcase JULY/JUL etc. thus ALWAYS downcase for lookup
            date[:y] = m[:year].to_i(10)  if m[:year]
            date[:m] = MONTH_MAP[ m[:month_name].downcase ]   if m[:month_name]
            date[:d]  = m[:day].to_i(10)   if m[:day]
            date[:wday] = DAY_MAP[ m[:day_name].downcase ]   if m[:day_name]
            ## note - for debugging keep (pass along) "literal" date
            [:DATE, [m[:date], date]]
        elsif m[:timezone]
          [:TIMEZONE, m[:timezone]]
        elsif m[:duration]
            ## todo/check/fix - if end: works for kwargs!!!!!
            ##  note - same mapping as for date, but once for
            ##           the start (..1) and once for the end (..2) captures
            duration = { start: {}, end: {}}
            duration[:start][:y] = m[:year1].to_i(10)  if m[:year1]
            duration[:start][:m] = MONTH_MAP[ m[:month_name1].downcase ]   if m[:month_name1]
            duration[:start][:d]  = m[:day1].to_i(10)   if m[:day1]
            duration[:start][:wday] = DAY_MAP[ m[:day_name1].downcase ]   if m[:day_name1]
            duration[:end][:y] = m[:year2].to_i(10)  if m[:year2]
            duration[:end][:m] = MONTH_MAP[ m[:month_name2].downcase ]   if m[:month_name2]
            duration[:end][:d]  = m[:day2].to_i(10)   if m[:day2]
            duration[:end][:wday] = DAY_MAP[ m[:day_name2].downcase ]   if m[:day_name2]
            ## note - for debugging keep (pass along) "literal" duration
            [:DURATION, [m[:duration], duration]]
        elsif m[:num]   ## fix - change to ord (for ordinal number!!!)
              ## note -  keep the literal e.g. (51) and
              ##           pass along the number converted to integer as value
             [:ORD, [m[:num], { value: m[:value].to_i(10) } ]]
        elsif m[:score]
              score = {}
              ## check for pen (p), extra time (et), full time (ft), half time (ht)
              ##   - only add the pairs present in the match
              score[:p] = [m[:p1].to_i(10),
                           m[:p2].to_i(10)]  if m[:p1] && m[:p2]
              score[:et] = [m[:et1].to_i(10),
                            m[:et2].to_i(10)]  if m[:et1] && m[:et2]
              score[:ft] = [m[:ft1].to_i(10),
                            m[:ft2].to_i(10)]  if m[:ft1] && m[:ft2]
              score[:ht] = [m[:ht1].to_i(10),
                            m[:ht2].to_i(10)]  if m[:ht1] && m[:ht2]

            ## note - for debugging keep (pass along) "literal" score
            [:SCORE, [m[:score], score]]
        elsif m[:minute]
              ## e.g. 45' or 45+2'  (value2 is the optional +offset)
              minute = {}
              minute[:m]      = m[:value].to_i(10)
              minute[:offset] = m[:value2].to_i(10)   if m[:value2]
             ## note - for debugging keep (pass along) "literal" minute
             [:MINUTE, [m[:minute], minute]]
        elsif m[:og]
           [:OG, m[:og]]    ## for typed drop - string version/variants ??  why? why not?
        elsif m[:pen]
           [:PEN, m[:pen]]
        elsif m[:vs]
           [:VS, m[:vs]]
        elsif m[:sym]
          sym = m[:sym]
          ## return symbols "inline" as is - why? why not?
          ## (?<sym>[;,@|\[\]-])
          ##   note - symbol tokens are one-element arrays (no value)
 
          case sym
          when ',' then [:',']
          when ';' then [:';']
          when '@' then [:'@']
          when '|' then [:'|']
          when '[' then [:'[']
          when ']' then [:']']
          when '-' then [:'-']
          else
            nil  ## ignore others (e.g. brackets [])
          end
        else
          ## report error
           puts "!!! TOKENIZE ERROR - no match found"
           nil 
        end

    tokens << t    if t

    if debug
      ## show progress - processed chars as * followed by the rest of the line
      print ">"
      print "*" * pos
      puts "#{line[pos..-1]}<"
    end
  end

  ## check if no match in end of string
  ##   (that is, the last match did not reach the end of the line)
  if offsets[1] != line.size
    msg =  "!! WARN - parse error - skipping >#{line[offsets[1]..-1]}< @#{offsets[1]},#{line.size} in line >#{line}<"
    puts msg
    log( msg )

    errors << "parse error - skipping >#{line[offsets[1]..-1]}< @#{offsets[1]},#{line.size}"
  end


  [tokens,errors]
end