Class: Parselly::Lexer

Inherits:
Object
  • Object
show all
Defined in:
lib/parselly/lexer.rb

Defined Under Namespace

Classes: Identifier, Token, TokenValue

Constant Summary collapse

# Lookup table from the literal text of each operator/delimiter to the
# symbol used as its token type. Holds both one- and two-character entries.
TOKENS =
{
  # Namespace and column combinators
  '|' => :PIPE,
  '||' => :COLUMN,

  # Combinators
  '>' => :CHILD,
  '+' => :ADJACENT,
  '~' => :SIBLING,

  # Delimiters
  '[' => :LBRACKET,
  ']' => :RBRACKET,
  '(' => :LPAREN,
  ')' => :RPAREN,
  ':' => :COLON,
  ',' => :COMMA,
  '.' => :DOT,
  '#' => :HASH,
  '*' => :STAR,
  '=' => :EQUAL,
  '-' => :MINUS,

  # Attribute operators
  '~=' => :INCLUDES,
  '|=' => :DASHMATCH,
  '^=' => :PREFIXMATCH,
  '$=' => :SUFFIXMATCH,
  '*=' => :SUBSTRINGMATCH
}.freeze
# The two-character entries of TOKENS, repeated as a separate table.
# NOTE(review): presumably consulted before the single-character operators so
# that e.g. `||` is not read as two `|` tokens - confirm in the operator scanner.
MULTI_CHAR_TOKENS =
{
  '~=' => :INCLUDES,
  '|=' => :DASHMATCH,
  '^=' => :PREFIXMATCH,
  '$=' => :SUFFIXMATCH,
  '*=' => :SUBSTRINGMATCH,
  '||' => :COLUMN
}.freeze
# Matches exactly one character that is a single-character key of TOKENS.
SINGLE_CHAR_OPERATOR_REGEX =
/[|>+~\[\]():,.#*=-]/.freeze
# Matches a run of one or more space, tab, newline, carriage return or
# form feed characters.
WHITESPACE_REGEX =
/[ \t\n\r\f]+/.freeze
# Matches one block comment, from `/*` up to and including the first `*/`.
# An unterminated comment does not match.
COMMENT_REGEX =
%r{/\*[^*]*\*+(?:[^/*][^*]*\*+)*/}.freeze
# Matches one backslash escape: either 1-6 hex digits followed by at most one
# whitespace character, or a backslash followed by any single character other
# than newline, carriage return or form feed.
ESCAPE_SEQUENCE =
/\\(?:[0-9a-fA-F]{1,6}[ \t\n\r\f]?|[^\n\r\f])/.freeze
# Matches an identifier. It must start with `--`, or with an optional `-`
# followed by an ASCII letter, underscore, non-ASCII character or escape
# sequence; after that any number of ASCII letters, digits, `_`, `-`,
# non-ASCII characters or escape sequences may follow.
IDENTIFIER_REGEX =
/
  (?:
    --
    |
    -?(?:[a-zA-Z_]|[^\x00-\x7F]|#{ESCAPE_SEQUENCE})
  )
  (?:[a-zA-Z0-9_-]|[^\x00-\x7F]|#{ESCAPE_SEQUENCE})*
/x.freeze
# Matches an unsigned number: one or more digits, optionally followed by `.`
# and one or more digits (the fractional part is capture group 1).
NUMBER_REGEX =
/\d+(\.\d+)?/.freeze
# Matches a hexadecimal escape. Group 1 is the 1-6 hex digits; group 2 is the
# single optional whitespace character that may follow them.
HEX_ESCAPE_REGEX =
/\\([0-9a-fA-F]{1,6})([ \t\n\r\f])?/.freeze
# Matches a backslash immediately followed by a line break (CRLF, newline,
# carriage return or form feed).
ESCAPED_NEWLINE_REGEX =
/\\(?:\r\n|[\n\r\f])/.freeze
# Matches a backslash followed by any single character other than newline,
# carriage return or form feed; group 1 is that character.
SIMPLE_ESCAPE_REGEX =
/\\([^\n\r\f])/.freeze
# U+FFFD REPLACEMENT CHARACTER.
# NOTE(review): presumably substituted for escapes that decode to an invalid
# code point - confirm against the unescaping code.
REPLACEMENT_CHARACTER =
"\uFFFD"

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(input) ⇒ Lexer

Returns a new instance of Lexer.



133
134
135
136
137
138
139
140
141
142
143
# File 'lib/parselly/lexer.rb', line 133

# Builds a lexer over +input+.
#
# The input is rejected through raise_lexer_error (reported at line 1,
# column 1, offset 0) when its bytes are not valid for its encoding.
# Otherwise it is run through preprocess_input, which yields the text that is
# actually scanned together with an offset map that is kept in @offset_map.
# Line and column counters start at 1 and the token list starts empty.
#
# @param input [String] the text to tokenize (must respond to #valid_encoding?)
def initialize(input)
  raise_lexer_error('Invalid input encoding', { line: 1, column: 1, offset: 0 }) unless input.valid_encoding?

  source, @offset_map = preprocess_input(input)
  @tokens = []
  @line = @column = 1
  @scanner = StringScanner.new(source)
end

Instance Attribute Details

#column ⇒ Object (readonly)

Returns the value of attribute column.



131
132
133
# File 'lib/parselly/lexer.rb', line 131

# Reader for the lexer's column counter (@column), which #initialize sets to 1.
#
# @return [Object] the current value of @column
def column
  @column
end

#line ⇒ Object (readonly)

Returns the value of attribute line.



131
132
133
# File 'lib/parselly/lexer.rb', line 131

# Reader for the lexer's line counter (@line), which #initialize sets to 1.
#
# @return [Object] the current value of @line
def line
  @line
end

Instance Method Details

#tokenize ⇒ Object



145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
# File 'lib/parselly/lexer.rb', line 145

# Consumes the remaining scanner input and appends the resulting tokens to
# @tokens.
#
# Each pass first calls skip_ignored, then records the current position and
# tries the scanners in a fixed order - string, number, operator, identifier -
# taking the first one that succeeds. When none succeeds, one character is
# consumed and reported through raise_lexer_error as unexpected.
#
# After the input is exhausted a final Token with type +false+, a nil value
# and the end-of-input position is appended.
# NOTE(review): +false+ looks like the parser-generator convention for
# end of input - confirm against the parser that consumes these tokens.
#
# @return [Array] the accumulated token list (@tokens)
def tokenize
  until @scanner.eos?
    skip_ignored
    break if @scanner.eos?

    origin = current_position

    # Resolve the (type, text) pair first so every branch funnels into a
    # single build_token call below.
    kind, text =
      if (string_token = scan_string(origin))
        string_token
      elsif (number = scan_number)
        [:NUMBER, number]
      elsif (operator = scan_operator)
        # The operator scanner returns only the type; the text is whatever
        # the scanner matched last.
        [operator, @scanner.matched]
      elsif (name = scan_identifier(origin))
        [:IDENT, name]
      else
        raise_lexer_error("Unexpected character: #{@scanner.getch}", origin)
      end

    @tokens << build_token(kind, text, origin)
  end

  # Array#<< returns the receiver, so this is also the method's return value.
  @tokens << Token.new(type: false, value: nil, position: eof_position)
end