Module: Scrapetor::XPath::Tokenizer

Defined in:
lib/scrapetor/xpath.rb

Overview

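Tokenizer for XPath expressions: `.tokenize` converts an expression string into an array of `[type, value]` token pairs terminated by `[:eof, nil]`.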

Constant Summary collapse

# Operator and punctuation lexemes, in their original order: path and axis
# punctuation, grouping/separators, additive and comparison operators, then
# `*` and the word-shaped operators.
OPERATORS = %w[
  // / .. . ::
  @ ( ) [ ] , |
  + - = != <= >= < >
  * div mod and or
].freeze

Class Method Summary collapse

Class Method Details

.tokenize(s) ⇒ Object



78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
# File 'lib/scrapetor/xpath.rb', line 78

# Splits an XPath expression into a flat list of `[type, value]` token pairs.
#
# Whitespace is skipped. Numeric literals carry an Integer or Float value,
# string literals carry their contents without the surrounding quotes, and
# every other token carries its source text. The list always ends with
# `[:eof, nil]`.
#
# `*` and the names `and`, `or`, `div`, `mod` are ambiguous in XPath 1.0. They
# are resolved from the previous token: directly after a token that can end an
# operand they are operators (`:star_mul`, `:and_op`, `:or_op`, `:div_op`,
# `:mod_op`); in every other position they are name tests (`:star`, `:name`).
#
# @param s [String] the XPath expression to tokenize
# @return [Array<Array>] `[type, value]` pairs, terminated by `[:eof, nil]`
# @raise [ParseError] on a `!` not followed by `=`, an unterminated string
#   literal, or a character that cannot start any token
def self.tokenize(s)
  # Single-character tokens that need neither lookahead nor context.
  # `-` is always emitted as :minus here; a hyphen inside a name never reaches
  # this table because the name scanner below consumes it.
  punctuation = {
    "(" => :lparen, ")" => :rparen, "[" => :lbracket, "]" => :rbracket,
    "," => :comma, "@" => :at, "|" => :pipe, "+" => :plus,
    "-" => :minus, "=" => :eq
  }
  # Name-shaped operators and the token type each maps to.
  word_operators = { "and" => :and_op, "or" => :or_op, "div" => :div_op, "mod" => :mod_op }
  # Token types that can end an operand. Only after one of these may `*` or an
  # operator name be read as an operator. :star (the wildcard name test) ends
  # an operand; :star_mul (the multiply operator) does not.
  operand_end = %i[name number string rparen rbracket dot dot_dot star]
  digit = ->(ch) { ch && ch >= "0" && ch <= "9" }

  tokens = []
  i = 0
  len = s.length
  while i < len
    c = s[i]

    if (type = punctuation[c])
      tokens << [type, c]
      i += 1
      next
    end

    case c
    when " ", "\t", "\n", "\r"
      i += 1
    when "/"
      if s[i + 1] == "/"
        tokens << [:slash_slash, "//"]
        i += 2
      else
        tokens << [:slash, "/"]
        i += 1
      end
    when "!"
      raise ParseError, "stray `!` in `#{s}`" unless s[i + 1] == "="

      tokens << [:neq, "!="]
      i += 2
    when "<"
      if s[i + 1] == "="
        tokens << [:le, "<="]
        i += 2
      else
        tokens << [:lt, "<"]
        i += 1
      end
    when ">"
      if s[i + 1] == "="
        tokens << [:ge, ">="]
        i += 2
      else
        tokens << [:gt, ">"]
        i += 1
      end
    when ":"
      if s[i + 1] == ":"
        tokens << [:axis_sep, "::"]
        i += 2
      else
        tokens << [:colon, ":"]
        i += 1
      end
    when "*"
      # Multiplication after an operand, otherwise the "any name" test.
      prev = tokens.last
      after_operand = prev && operand_end.include?(prev[0])
      tokens << [after_operand ? :star_mul : :star, "*"]
      i += 1
    when "."
      if s[i + 1] == "."
        tokens << [:dot_dot, ".."]
        i += 2
      elsif digit.call(s[i + 1])
        # Numeric literal with no integer part, e.g. `.5`.
        j = i + 1
        j += 1 while j < len && digit.call(s[j])
        tokens << [:number, s[i...j].to_f]
        i = j
      else
        tokens << [:dot, "."]
        i += 1
      end
    when "'", '"'
      # String literal: runs to the next matching quote; no escape sequences.
      j = i + 1
      j += 1 while j < len && s[j] != c
      raise ParseError, "unterminated string in `#{s}`" if j >= len

      tokens << [:string, s[(i + 1)...j]]
      i = j + 1
    when "0".."9"
      j = i
      j += 1 while j < len && digit.call(s[j])
      # A `.` belongs to the number only when followed by a digit or the end of
      # input; otherwise it is left to be tokenized on its own.
      if j < len && s[j] == "." && (j + 1 >= len || digit.call(s[j + 1]))
        j += 1
        j += 1 while j < len && digit.call(s[j])
        tokens << [:number, s[i...j].to_f]
      else
        tokens << [:number, s[i...j].to_i]
      end
      i = j
    else
      raise ParseError, "unrecognised char `#{c}` at #{i} in `#{s}`" unless c =~ /[A-Za-z_]/

      # Name: a letter or `_`, then letters, digits, `_` or `-`.
      j = i
      j += 1 while j < len && s[j] =~ /[A-Za-z0-9_\-]/
      name = s[i...j]
      prev = tokens.last
      after_operand = prev && operand_end.include?(prev[0])
      type = after_operand ? word_operators.fetch(name, :name) : :name
      tokens << [type, name]
      i = j
    end
  end
  tokens << [:eof, nil]
  tokens
end