Class: SportDb::Parser

Inherits:
Object
  • Object
show all
Defined in:
lib/sportdb/parser.rb,
lib/sportdb/parser/token.rb,
lib/sportdb/parser/tokenizer.rb,
lib/sportdb/parser/token-date.rb,
lib/sportdb/parser/token-text.rb,
lib/sportdb/parser/token-score.rb,
lib/sportdb/parser/token-status.rb

Constant Summary collapse

TIME_RE =

keep 18h30 - why? why not?

add support for 6:30pm 8:20am etc. - why? why not?
%r{
    ## e.g. 18.30 (or 18:30 or 18h30)
    (?<time>  \b
              (?<hour>\d{1,2})
                 (?: :|\.|h )
              (?<minute>\d{2})
              \b
    )
}ix
TIMEZONE_RE =

for timezone format use for now: (BRT/UTC-3) (e.g. brazil time)

(CET/UTC+1) - central european time (CEST/UTC+2) - central european summer time - daylight saving time (DST). (EET/UTC+1) - eastern european time (EEST/UTC+2) - eastern european summer time - daylight saving time (DST).

UTC+3 UTC+4 UTC+0 UTC+00 UTC+0000

- allow +01 or +0100  - why? why not
-       +0130 (01:30)

see

https://en.wikipedia.org/wiki/Time_zone
https://en.wikipedia.org/wiki/List_of_UTC_offsets
https://en.wikipedia.org/wiki/UTC−04:00  etc.
%r{
   ## e.g. (UTC-2) or (CEST/UTC-2) etc.
   (?<timezone>
      \(
           ## optional "local" timezone name eg. BRT or CEST etc.
           (?:  [a-z]+
                 /
           )?
            [a-z]+
            [+-]
            \d{1,4}   ## e.g. 0 or 00 or 0000
      \)
   )
}ix
BASICS_RE =
%r{
    ## e.g. (51) or (1) etc.  - limit digits of number???
    (?<num> \(  (?<value>\d+) \) )
       |
    (?<vs>
       (?<=[ ])	# Positive lookbehind for space
       (?:
          vs|v
       )  
           # not bigger match first e.g. vs than v etc.
           # todo/fix - make vs|v case sensitive!!! only match v/vs - why? why not?
       (?=[ ])   # positive lookahead for space
    )
       |
    (?<spaces> [ ]{2,}) |
    (?<space>  [ ])
        |
    (?<sym>[;,@|\[\]-])
}ix
MINUTE_RE =
%r{
     (?<minute>
       (?<=[ (])	 # Positive lookbehind for space or opening ( e.g. (61') required
           (?<value>\d{1,3})      ## constrain numbers to 0 to 999!!!
        (?: \+
            (?<value2>\d{1,3})
        )?
        '     ## must have minute marker!!!!
     )
}ix
GOAL_PEN_RE =

goal types (pen.) or (pen) or (p.) or (p) (o.g.) or (og)

%r{
   (?<pen> \(
           (?:pen|p)\.?
           \)
    )
}ix
GOAL_OG_RE =
%r{
   (?<og> \(
          (?:og|o\.g\.)
          \)
   )
}ix
PROP_BASICS_RE =
%r{
    (?<spaces> [ ]{2,}) |
    (?<space>  [ ])
        |
    (?<sym>[.;,\(\)\[\]-])   ## note - dot (.) is the (all-important) end-of-prop marker!!!
}ix
PROP_NAME_RE =

name different from text (does not allow number in name/text)

note - includes special handling for dot (.) if at the end of line!!!
          end-of-line dot (.) is the prop end-of-marker - do NOT eat-up!!!
%r{
                 (?<prop_name> \b
                   (?<name>
                      \p{L}+       
                       (?: \. (?: (?![ ]*$) )
                        )?      ## edge case - check for end of prop marker! (e.g. Stop.)
                      (?: 
                          [ ]?    # only single spaces allowed inline!!!
                          (?:
                              (?:
                                (?<=\p{L})   ## use lookbehind
                                 [/'-]   ## must be surrounded by letters
                                       ## e.g. One/Two NOT
                                       ##      One/ Two or One / Two or One /Two etc.
                                (?=\p{L})      ## use lookahead        
                              )
                                 |   
                              (?:
                                (?<=[ ])   ## use lookbehind  -- add letter (plus dot) or such - why? why not?
                                 [']   ## must be surrounded by leading space and
                                       ## traling letters  (e.g. UDI 'Beter Bed)
                                (?=\p{L})      ## use lookahead        
                              )   
                                 |
                              (?:
                                (?<=\p{L})   ## use lookbehind
                                 [']   ## must be surrounded by leading letter and
                                       ## trailing space PLUS letter  (e.g. UDI' Beter Bed)
                                (?=[ ]\p{L})      ## use lookahead (space WITH letter         
                              )   
                                 |
                              (?: \p{L}+
                                  (?: \. 
                                      (?: (?![ ]*$) )
                                  )?  ## last dot is delimiter!!!
                              )
                          )+
                     )*
                   )
               ## add lookahead - must be non-alphanum (or dot)
                  (?=[ .,;\]\)]|$)
                  )
}ix
PROP_KEY_RE =
%r{ 
(?<prop_key> \b
  (?<key>
      (?:\p{L}+
          |
          \d+  # check for num lookahead (MUST be space or dot)
       ## MUST be followed by (optional dot) and
       ##                      required space !!!
       ## MUST be follow by a to z!!!!
        \.?     ## optional dot
        [ ]?   ## make space optional too  - why? why not?
            ##  yes - eg. 1st, 2nd, 5th etc.
        \p{L}+
       )
       [\d\p{L}'/° -]*?   ## allow almost anyting 
                         ## fix - add negative lookahead 
                         ##         no space and dash etc.
                         ##    only allowed "inline" not at the end
                         ## must end with latter or digit!
  )
   [ ]*?     # slurp trailing spaces
    :
   (?=[ ]+)  ## possitive lookahead (must be followed by space!!)
  )
}ix
PROP_RE =
Regexp.union(
   PROP_BASICS_RE, 
   MINUTE_RE,
   PROP_NAME_RE,
)
RE =

start with prop key (match will/should switch into prop mode!!!)

Regexp.union(  PROP_KEY_RE, ##  start with prop key (match will/should switch into prop mode!!!)
                    STATUS_RE,
                    TIMEZONE_RE,
TIME_RE,
DURATION_RE,  # note - duration MUST match before date
                    DATE_RE,
                    SCORE_RE,
                    BASICS_RE, MINUTE_RE,
                    GOAL_OG_RE, GOAL_PEN_RE,
TEXT_RE )
MONTH_LINES =
parse_names( <<TXT )
January    Jan
February   Feb
March      Mar
April      Apr
May
June       Jun
July       Jul
August     Aug
September  Sept  Sep
October    Oct
November   Nov
December   Dec
TXT
MONTH_NAMES =
build_names( MONTH_LINES )
MONTH_MAP =

pp MONTH_NAMES

build_map( MONTH_LINES, downcase: true )
DAY_LINES =
parse_names( <<TXT )
Monday                   Mon  Mo
Tuesday            Tues  Tue  Tu
Wednesday                Wed  We
Thursday    Thurs  Thur  Thu  Th
Friday                   Fri  Fr
Saturday                 Sat  Sa
Sunday                   Sun  Su
TXT
DAY_NAMES =
build_names( DAY_LINES )
DAY_MAP =

pp DAY_NAMES

build_map( DAY_LINES, downcase: true )
DATE_I_RE =

e.g. Fri Aug/9 or Fri Aug 9

%r{
(?<date>
  \b
     ## optional day name
     ((?<day_name>#{DAY_NAMES})
          [ ]
     )?
     (?<month_name>#{MONTH_NAMES})
         (?: \/|[ ] )
     (?<day>\d{1,2})
     ## optional year
     (  [ ]
        (?<year>\d{4})
     )?
  \b
)}ix
DATE_II_RE =

e.g. 3 June or 10 June

%r{
(?<date>
  \b
     ## optional day name
     ((?<day_name>#{DAY_NAMES})
          [ ]
     )?
     (?<day>\d{1,2})
         [ ]
     (?<month_name>#{MONTH_NAMES})
     ## optional year
     (  [ ]
        (?<year>\d{4})
     )?
  \b
)}ix
DATE_RE =

map tables

note: order matters; first come-first matched/served
Regexp.union(
   DATE_I_RE,
   DATE_II_RE
)
DURATION_I_RE =

todo add plus later on - why? why not?

%r{
(?<duration>
    \b
   ## optional day name
   ((?<day_name1>#{DAY_NAMES})
      [ ]
   )?
   (?<month_name1>#{MONTH_NAMES})
      (?: \/|[ ] )
   (?<day1>\d{1,2})
   ## optional year
   ( [ ]
      (?<year1>\d{4})
   )?

   ## support + and -  (add .. or such - why??)
   [ ]*[-][ ]*

   ## optional day name
   ((?<day_name2>#{DAY_NAMES})
      [ ]
   )?
   (?<month_name2>#{MONTH_NAMES})
      (?: \/|[ ] )
   (?<day2>\d{1,2})
   ## optional year
   ( [ ]
      (?<year2>\d{4})
   )?
   \b
)}ix
DURATION_II_RE =

variant ii e.g. 26 July - 27 July

%r{
(?<duration>
    \b
   ## optional day name
   ((?<day_name1>#{DAY_NAMES})
      [ ]
   )?
   (?<day1>\d{1,2})
      [ ]
   (?<month_name1>#{MONTH_NAMES})
   ## optional year
   ( [ ]
      (?<year1>\d{4})
   )?

   ## support + and -  (add .. or such - why??)
   [ ]*[-][ ]*

   ## optional day name
   ((?<day_name2>#{DAY_NAMES})
      [ ]
   )?
   (?<day2>\d{1,2})
      [ ]
   (?<month_name2>#{MONTH_NAMES})
   ## optional year
   ( [ ]
      (?<year2>\d{4})
   )?
   \b
)}ix
DURATION_RE =

map tables

note: order matters; first come-first matched/served
Regexp.union(
   DURATION_I_RE,
   DURATION_II_RE
)
TEXT_RE =
%r{
    ## must start with alpha (allow unicode letters!!)
    (?<text>
           ## positive lookbehind
           ##  (MUST be fixed number of chars - no quantifier e.g. +? etc.)
            (?<=[ ,;@|\[\]]
                 |^
            )
            (?:
                # opt 1 - start with alpha
                 \p{L}+    ## all unicode letters (e.g. [a-z])
                   |

                # opt 2 - start with num!! - allow special case (e.g. 1. FC)
                     \d+  # check for num lookahead (MUST be space or dot)
                      ## MUST be followed by (optional dot) and
                      ##                      required space !!!
                      ## MUST be follow by a to z!!!!
                      \.?     ## optional dot
                      [ ]?   ## make space optional too  - why? why not?
                             ##  yes - eg. 1st, 2nd, 5th etc.
                       \p{L}+
                  |
                ## opt 3 - add weirdo case
                ##   e.g. 5.-8. Platz Playoffs  - keep - why? why not?
                    \d+\.-\d+\.  [ ]? \p{L}+
               )

              (?:(?:  (?:[ ]
                        (?!vs?[ ])    ## note - exclude (v[ ]/vs[ ])
                       )
                      |     # only single spaces allowed inline!!!
                     [-]
                  )?
                (?:
                  \p{L} |
                  [&/'°]
                    |
                 (?:
                   \d+
                   (?!
                     [0-9h'+-] |    ## protected break on 12h / 12' / 1-1
                                    ##  check usege for 3+4 - possible? where ? why?
                     (?:[.:]\d)     ## protected/exclude/break on 12.03 / 12:03
                    )
                   ## negative lookahead for numbers
                   ##   note - include digits itself!!!
                   ##   note - remove / (slash) e.g. allows UDI'19/Beter Bed
                 )|
                 \.
               )
              )*  ## must NOT end with space or dash(-)
              ##  todo/fix - possible in regex here
              ##     only end in alphanum a-z0-9 (not dot or & ???)


            ## allow optional at the end
            ##  tag or year
            ##   make it and in the future - why? why not?
            ##
            ## change - fix
            ##   do NOT use (A) for amateur
            ##   use A or A. with NO ()!!!
            ## (A) -    allow with predined  alpha only for now
            ##          e.g. (A) - amateur a team or b?
            ###  same for U21 or U9 etc
            ##        use with NO ()!!! - why? why not?
            ##      or U21 U9 etc.   - why? why not?
            ##       or etc.
            ## (1879-1893) or allow years e.g. (1879-1893)
            ###
            ##    add allow country code three to five letters for now
            ##       change to generic 1 to 5 - why? why not?
            ##     e.g. (A), (I),
            ##          (AUT)
            ##          (TRNC)   five? for UEFA code for northern cyprus
            ##     change to 1 to 4 - why? why not?
            ##   check - fix possible for upper case only here
            ##                     inline for this group only?
            (?:
               [ ]
               \(
                  \d{4}-\d{4}
               \)
            )?
             (?:
               [ ]+   ## allow more than once space - why? why not?
                  \( (?:
                       [A-Z]{1,5}
                     )
                  \)
             )?
            ## add lookahead/lookbehind
           ##    must be space!!!
           ##   (or comma or  start/end of string)
           ##   kind of \b !!!
            ## positive lookahead
            (?=[ ,;@|\[\]]
                 |$
            )
   )
}ix
P_EN =

english helpers (penalty, extra time, …)

note - p must go last (shortest match)
  pso = penalty shootout
'(?: pso | pen\.? | p\.? )'
ET_EN =

e.g. p., p, pen, pen., PSO, etc.

'(?: aet | a\.e\.t\.? )'
SCORE__P_ET__RE =

note: allow SPECIAL cases WITHOUT full time scores (just a.e.t or pen. + a.e.t.)

3-4 pen. 2-2 a.e.t.
3-4 pen.   2-2 a.e.t.
         2-2 a.e.t.
%r{
(?<score>
   \b
    (?:
       (?<p1>\d{1,2}) - (?<p2>\d{1,2})
         [ ]* #{P_EN} [ ]+
     )?             # note: make penalty (P) score optional for now
    (?<et1>\d{1,2}) - (?<et2>\d{1,2})
       [ ]* #{ET_EN}
       (?=[ ,\]]|$)
)}ix
SCORE__P__RE =

note: allow SPECIAL with penalty only

3-4 pen.
%r{
        (?<score>
  \b
     (?<p1>\d{1,2}) - (?<p2>\d{1,2})
       [ ]* #{P_EN}
       (?=[ ,\]]|$)
)}ix
SCORE__P_ET_FT_HT__RE =

e.g. 3-4 pen. 2-2 a.e.t. (1-1, 1-1) or

3-4p 2-2aet (1-1, )     or
3-4 pen.  2-2 a.e.t. (1-1)       or
         2-2 a.e.t. (1-1, 1-1)  or
         2-2 a.e.t. (1-1, )     or
         2-2 a.e.t. (1-1)
%r{
          (?<score>
   \b
   (?:
    (?<p1>\d{1,2}) - (?<p2>\d{1,2})
       [ ]* #{P_EN} [ ]+
    )?            # note: make penalty (P) score optional for now
   (?<et1>\d{1,2}) - (?<et2>\d{1,2})
       [ ]* #{ET_EN} [ ]+
       \(
       [ ]*
  (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
       [ ]*
    (?:
         , [ ]*
        (?: (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
            [ ]*
        )?
    )?              # note: make half time (HT) score optional for now
  \)
 (?=[ ,\]]|$)
)}ix
SCORE__P_FT_HT__RE =

special case for case WITHOUT extra time!!

same as above (but WITHOUT extra time and pen required)
%r{
         (?<score>
            \b
 (?<p1>\d{1,2}) - (?<p2>\d{1,2})
    [ ]* #{P_EN} [ ]+
    \(
    [ ]*
  (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
    [ ]*
 (?:
      , [ ]*
     (?: (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
         [ ]*
     )?
 )?              # note: make half time (HT) score optional for now
   \)
  (?=[ ,\]]|$)
)}ix
SCORE__FT_HT__RE =

e.g. 2-1 (1-1) or

2-1
%r{
            (?<score>
 \b
 (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
  (?:
      [ ]+ \( [ ]*
   (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
      [ ]* \)
  )?   # note: make half time (HT) score optional for now
(?=[ ,\]]|$)
)}ix
SCORE_RE =

map tables

note: order matters; first come-first matched/served
Regexp.union(
  SCORE__P_ET_FT_HT__RE,  # e.g. 5-1 pen. 2-2 a.e.t. (1-1, 1-0)
  SCORE__P_FT_HT__RE,     # e.g. 5-1 pen. (1-1)
  SCORE__P_ET__RE,        # e.g. 2-2 a.e.t.  or  5-1 pen. 2-2 a.e.t.
  SCORE__P__RE,           # e.g. 5-1 pen.
  SCORE__FT_HT__RE,        # e.g. 1-1 (1-0) or 1-1  -- note - must go last!!!
)
STATUS_RE =
%r{
            \[
      (?:    
            ### opt 1 - allow long forms with note/comment for some stati
           (?: (?<status> awarded
                            |
                          annulled
                            |
                          abandoned
               ) [ ;,]* (?<status_note> [^\]]+ )
                 [ ]*
            )
            |
        
            ## opt 2 - short from only (no note/comments)
            (?<status>
               cancelled|canceled|can\.
                 |
               abandoned|abd\.
                 |
               postponed
                 |
               awarded|awd\.
                 |
               replay
                 |
               annulled
            )
      )
    \]
}ix

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.build_map(lines, downcase: false) ⇒ Object



40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# File 'lib/sportdb/parser/token-date.rb', line 40

def self.build_map( lines, downcase: false )
   ## note: downcase name!!!
  ## build a lookup map that maps the word to the index (line no) plus 1 e.g.
  ##  {"january" => 1,  "jan" => 1,
  ##   "february" => 2, "feb" => 2,
  ##   "march" => 3,    "mar" => 3,
  ##   "april" => 4,    "apr" => 4,
  ##   "may" => 5,
  ##   "june" => 6,     "jun" => 6, ...
  lines.each_with_index.reduce( {} ) do |h,(line,i)|
    line.each do |name|
       h[ downcase ? name.downcase : name ] = i+1
    end  ## note: start mapping with 1 (and NOT zero-based, that is, 0)
    h
  end
end

.build_names(lines) ⇒ Object



33
34
35
36
37
# File 'lib/sportdb/parser/token-date.rb', line 33

def self.build_names( lines )
  ## join all words together into a single string e.g.
  ##   January|Jan|February|Feb|March|Mar|April|Apr|May|June|Jun|...
  lines.map { |line| line.join('|') }.join('|')
end

.parse_date(str, start:) ⇒ Object

add a date parser helper



160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
# File 'lib/sportdb/parser/token-date.rb', line 160

def self.parse_date( str, start: )
    if m=DATE_RE.match( str )

      year    = m[:year].to_i(10)  if m[:year]
      month   = MONTH_MAP[ m[:month_name].downcase ]   if m[:month_name]
      day     = m[:day].to_i(10)   if m[:day]
      wday    = DAY_MAP[ m[:day_name].downcase ]   if m[:day_name]

      if year.nil?   ## try to calculate year
        year =  if  month > start.month ||
                   (month == start.month && day >= start.day)
                  # assume same year as start_at event (e.g. 2013 for 2013/14 season)
                  start.year
                else
                  # assume year+1 as start_at event (e.g. 2014 for 2013/14 season)
                  start.year+1
                end
      end
      Date.new( year,month,day )
    else
      puts "!! ERROR - unexpected date format; cannot parse >#{str}<"
      exit 1
    end
end

.parse_names(txt) ⇒ Object



6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# File 'lib/sportdb/parser/token-date.rb', line 6

def self.parse_names( txt )
  lines = [] # array of lines (with words)

  txt.each_line do |line|
    line = line.strip

    next if line.empty?
    next if line.start_with?( '#' )   ## skip comments too

    ## strip inline (until end-of-line) comments too
    ##   e.g. Janvier  Janv  Jan  ## check janv in use??
    ##   =>   Janvier  Janv  Jan

    line = line.sub( /#.*/, '' ).strip
    ## pp line

    values = line.split( /[ \t]+/ )
    ## pp values

    ## todo/fix -- add check for duplicates
    lines << values
  end
  lines

end

Instance Method Details

#is_group?(text) ⇒ Boolean

keep “old” access to checking for group, round & friends

for now for compatibility

Returns:

  • (Boolean)


62
# File 'lib/sportdb/parser.rb', line 62

def is_group?( text )  Lang.is_group?( text ); end

#is_leg?(text) ⇒ Boolean

Returns:

  • (Boolean)


64
# File 'lib/sportdb/parser.rb', line 64

def is_leg?( text )    Lang.is_leg?( text ); end

#is_round?(text) ⇒ Boolean

Returns:

  • (Boolean)


63
# File 'lib/sportdb/parser.rb', line 63

def is_round?( text )  Lang.is_round?( text ); end

#log(msg) ⇒ Object



7
8
9
10
11
12
13
14
# File 'lib/sportdb/parser/tokenizer.rb', line 7

def log( msg )
   ## append msg to ./logs.txt
   ##     use ./errors.txt - why? why not?
   File.open( './logs.txt', 'a:utf-8' ) do |f|
     f.write( msg )
     f.write( "\n" )
   end
end

#tokenize(line, debug: false) ⇒ Object

convience helper - ignore errors by default



256
257
258
259
# File 'lib/sportdb/parser/tokenizer.rb', line 256

def tokenize(  line, debug: false )
   tokens, _ = tokenize_with_errors( line, debug: debug )
   tokens
end

#tokenize_with_errors(line, debug: false) ⇒ Object



18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
# File 'lib/sportdb/parser/tokenizer.rb', line 18

def tokenize_with_errors( line, debug: false )
  tokens = []
  errors = []   ## keep a list of errors - why? why not?

  puts ">#{line}<"    if debug

  pos = 0
  ## track last offsets - to report error on no match
  ##   or no match in end of string
  offsets = [0,0]
  m = nil


  ####
  ## quick hack - keep re state/mode between tokenize calls!!!
  @re  ||= RE     ## note - switch between RE & INSIDE_RE


  while m = @re.match( line, pos )
    if debug
      pp m
      puts "pos: #{pos}"
    end
    offsets = [m.begin(0), m.end(0)]

    if offsets[0] != pos
      ## match NOT starting at start/begin position!!!
      ##  report parse error!!!
      msg =  "!! WARN - parse error - skipping >#{line[pos..(offsets[0]-1)]}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
      puts msg

      errors << "parse error - skipping >#{line[pos..(offsets[0]-1)]}< @#{offsets[0]},#{offsets[1]}"
      log( msg )
    end

    ##
    ## todo/fix - also check if possible
    ##   if no match but not yet end off string!!!!
    ##    report skipped text run too!!!

    pos = offsets[1]

    pp offsets   if debug

    ##
    ## note: racc requires pairs e.g. [:TOKEN, VAL]
    ##         for VAL use "text" or ["text", { opts }]  array


  t = if @re == PROP_RE
         if m[:space]
              ## skip space
              nil
         elsif m[:spaces]
              ## skip spaces
              nil
         elsif m[:prop_name]
               if m[:name] == 'Y'
                 [:YELLOW_CARD, m[:name]]
               elsif m[:name] == 'R'
                 [:RED_CARD, m[:name]]
               else 
                 [:PROP_NAME, m[:name]]
               end
         elsif m[:minute]
              minute = {}
              minute[:m]      = m[:value].to_i(10)
              minute[:offset] = m[:value2].to_i(10)   if m[:value2]
             ## note - for debugging keep (pass along) "literal" minute
             [:MINUTE, [m[:minute], minute]]
         elsif m[:sym]
            sym = m[:sym]
            ## return symbols "inline" as is - why? why not?
            ## (?<sym>[;,@|\[\]-])
 
            case sym
            when ',' then [:',']
            when ';' then [:';']
            when '[' then [:'[']
            when ']' then [:']']
            when '(' then [:'(']
            when ')' then [:')']
            when '-' then [:'-']
            when '.' then 
                ## switch back to top-level mode!!
                puts "  LEAVE PROP_RE MODE, BACK TO TOP_LEVEL/RE"
                @re = RE 
                [:'.']
            else
              nil  ## ignore others (e.g. brackets [])
            end
         else
            ## report error
             puts "!!! TOKENIZE ERROR (PROP_RE) - no match found"
             nil 
         end
      else  ## assume TOP_LEVEL (a.k.a. RE) machinery
        if m[:space]
           ## skip space
           nil
        elsif m[:spaces]
           ## skip spaces
           nil
        elsif m[:prop_key]
           ##  switch context  to PROP_RE
           @re = PROP_RE
           puts "  ENTER PROP_RE MODE"
           [:PROP, m[:key]]
        elsif m[:text]
          [:TEXT, m[:text]]   ## keep pos - why? why not?
        elsif m[:status]   ## (match) status e.g. cancelled, awarded, etc.
          ## todo/check - add text (or status) 
          #     to opts hash {} by default (for value)
          if m[:status_note]   ## includes note? e.g.  awarded; originally 2-0
             [:STATUS, [m[:status], {status: m[:status], 
                                     note:   m[:status_note]} ]]
          else
             [:STATUS, [m[:status], {status: m[:status] } ]]
          end
        elsif m[:time]
              ## unify to iso-format
              ###   12.40 => 12:40
              ##    12h40 => 12:40 etc.
              ##  keep string (no time-only type in ruby)
              hour =   m[:hour].to_i(10)  ## allow 08/07/etc.
              minute = m[:minute].to_i(10)
              ## check if valid -  0:00 - 24:00
              ##   check if 24:00 possible? or only 0:00 (23:59)
              if (hour >= 0 && hour <= 24) &&
                 (minute >=0 && minute <= 59)
               ## note - for debugging keep (pass along) "literal" time
               ##   might use/add support for am/pm later
               [:TIME, [m[:time], {h:hour,m:minute}]]
              else
                 raise ArgumentError, "parse error - time >#{m[:time]}< out-of-range"
              end
        elsif m[:date]
            date = {}
 ## map month names
 ## note - allow any/upcase JULY/JUL etc. thus ALWAYS downcase for lookup
            date[:y] = m[:year].to_i(10)  if m[:year]
            date[:m] = MONTH_MAP[ m[:month_name].downcase ]   if m[:month_name]
            date[:d]  = m[:day].to_i(10)   if m[:day]
            date[:wday] = DAY_MAP[ m[:day_name].downcase ]   if m[:day_name]
            ## note - for debugging keep (pass along) "literal" date
            [:DATE, [m[:date], date]]
        elsif m[:timezone]
          [:TIMEZONE, m[:timezone]]
        elsif m[:duration]
            ## todo/check/fix - if end: works for kwargs!!!!!
            duration = { start: {}, end: {}}
            duration[:start][:y] = m[:year1].to_i(10)  if m[:year1]
            duration[:start][:m] = MONTH_MAP[ m[:month_name1].downcase ]   if m[:month_name1]
            duration[:start][:d]  = m[:day1].to_i(10)   if m[:day1]
            duration[:start][:wday] = DAY_MAP[ m[:day_name1].downcase ]   if m[:day_name1]
            duration[:end][:y] = m[:year2].to_i(10)  if m[:year2]
            duration[:end][:m] = MONTH_MAP[ m[:month_name2].downcase ]   if m[:month_name2]
            duration[:end][:d]  = m[:day2].to_i(10)   if m[:day2]
            duration[:end][:wday] = DAY_MAP[ m[:day_name2].downcase ]   if m[:day_name2]
            ## note - for debugging keep (pass along) "literal" duration
            [:DURATION, [m[:duration], duration]]
        elsif m[:num]   ## fix - change to ord (for ordinal number!!!)
              ## note -  strip enclosing () and convert to integer
             [:ORD, [m[:num], { value: m[:value].to_i(10) } ]]
        elsif m[:score]
              score = {}
              ## check for pen
              score[:p] = [m[:p1].to_i(10),
                           m[:p2].to_i(10)]  if m[:p1] && m[:p2]
              score[:et] = [m[:et1].to_i(10),
                            m[:et2].to_i(10)]  if m[:et1] && m[:et2]
              score[:ft] = [m[:ft1].to_i(10),
                            m[:ft2].to_i(10)]  if m[:ft1] && m[:ft2]
              score[:ht] = [m[:ht1].to_i(10),
                            m[:ht2].to_i(10)]  if m[:ht1] && m[:ht2]

            ## note - for debugging keep (pass along) "literal" score
            [:SCORE, [m[:score], score]]
        elsif m[:minute]
              minute = {}
              minute[:m]      = m[:value].to_i(10)
              minute[:offset] = m[:value2].to_i(10)   if m[:value2]
             ## note - for debugging keep (pass along) "literal" minute
             [:MINUTE, [m[:minute], minute]]
        elsif m[:og]
           [:OG, m[:og]]    ## for typed drop - string version/variants ??  why? why not?
        elsif m[:pen]
           [:PEN, m[:pen]]
        elsif m[:vs]
           [:VS, m[:vs]]
        elsif m[:sym]
          sym = m[:sym]
          ## return symbols "inline" as is - why? why not?
          ## (?<sym>[;,@|\[\]-])
 
          case sym
          when ',' then [:',']
          when ';' then [:';']
          when '@' then [:'@']
          when '|' then [:'|']
          when '[' then [:'[']
          when ']' then [:']']
          when '-' then [:'-']
          else
            nil  ## ignore others (e.g. brackets [])
          end
        else
          ## report error
           puts "!!! TOKENIZE ERROR - no match found"
           nil 
        end
      end


    tokens << t    if t

    if debug
      print ">"
      print "*" * pos
      puts "#{line[pos..-1]}<"
    end
  end

  ## check if no match in end of string
  if offsets[1] != line.size
    msg =  "!! WARN - parse error - skipping >#{line[offsets[1]..-1]}< @#{offsets[1]},#{line.size} in line >#{line}<"
    puts msg
    log( msg )

    errors << "parse error - skipping >#{line[offsets[1]..-1]}< @#{offsets[1]},#{line.size}"
  end


  [tokens,errors]
end