Module: LexerKit::DFA::Utf8Range

Defined in:: lib/lexer_kit/dfa/utf8_range.rb

Overview

Build regex AST from UTF-8 codepoint ranges.

Class Method Summary collapse

.ast_for_ranges(ranges) ⇒ Object
.build_for_length(start_cp, end_cp, length) ⇒ Object
.build_range(start_cp, end_cp) ⇒ Object
.build_sequences(start_bytes, end_bytes, min_bytes, max_bytes, idx) ⇒ Object
.byte_bounds_for_lead(lead, length) ⇒ Object
.byte_node(byte) ⇒ Object
.byte_range_node(min_byte, max_byte) ⇒ Object
.codepoint_bounds(length) ⇒ Object
.codepoint_range_for_lead(lead, length) ⇒ Object
.full_range_nodes(min_bytes, max_bytes, idx) ⇒ Object
.lead_bytes_for_length(length) ⇒ Object
.overlaps_surrogates?(start_cp, end_cp) ⇒ Boolean
.utf8_bytes(codepoint) ⇒ Object

Class Method Details

.ast_for_ranges(ranges) ⇒ `Object`

# File 'lib/lexer_kit/dfa/utf8_range.rb', line 7

def self.ast_for_ranges(ranges)
  parts = ranges.flat_map do |start_cp, end_cp|
    build_range(start_cp, end_cp)
  end

  return RegexAST::Literal.new(byte: nil, meta: nil) if parts.empty?
  return parts.first if parts.size == 1

  RegexAST::Alternation.new(children: parts, meta: nil)
end

.build_for_length(start_cp, end_cp, length) ⇒ `Object`

# File 'lib/lexer_kit/dfa/utf8_range.rb', line 29

def self.build_for_length(start_cp, end_cp, length)
  if length == 3 && overlaps_surrogates?(start_cp, end_cp)
    segments = []
    segments.concat(build_for_length(start_cp, 0xD7FF, length)) if start_cp <= 0xD7FF
    segments.concat(build_for_length(0xE000, end_cp, length)) if end_cp >= 0xE000
    return segments
  end

  min_cp, max_cp = codepoint_bounds(length)
  return [] if end_cp < min_cp || start_cp > max_cp

  s = [start_cp, min_cp].max
  e = [end_cp, max_cp].min
  return [] if s > e

  segments = []
  lead_bytes_for_length(length).each do |lead|
    lead_min_cp, lead_max_cp = codepoint_range_for_lead(lead, length)
    next if e < lead_min_cp || s > lead_max_cp

    seg_start = [s, lead_min_cp].max
    seg_end = [e, lead_max_cp].min
    start_bytes = utf8_bytes(seg_start)
    end_bytes = utf8_bytes(seg_end)
    min_bytes, max_bytes = byte_bounds_for_lead(lead, length)
    segments.concat(build_sequences(start_bytes, end_bytes, min_bytes, max_bytes, 1).map do |seq|
      RegexAST::Concat.new(children: [byte_node(lead)] + seq, meta: nil)
    end)
  end

  segments
end

.build_range(start_cp, end_cp) ⇒ `Object`

Raises:

(ArgumentError)

# File 'lib/lexer_kit/dfa/utf8_range.rb', line 18

def self.build_range(start_cp, end_cp)
  raise ArgumentError, "invalid range" if start_cp > end_cp

  segments = []
  segments.concat(build_for_length(start_cp, end_cp, 1))
  segments.concat(build_for_length(start_cp, end_cp, 2))
  segments.concat(build_for_length(start_cp, end_cp, 3))
  segments.concat(build_for_length(start_cp, end_cp, 4))
  segments
end

.build_sequences(start_bytes, end_bytes, min_bytes, max_bytes, idx) ⇒ `Object`

# File 'lib/lexer_kit/dfa/utf8_range.rb', line 62

def self.build_sequences(start_bytes, end_bytes, min_bytes, max_bytes, idx)
  return [[]] if idx >= start_bytes.size

  if start_bytes[idx] == end_bytes[idx]
    tails = build_sequences(start_bytes, end_bytes, min_bytes, max_bytes, idx + 1)
    return tails.map { |t| [byte_node(start_bytes[idx])] + t }
  end

  sequences = []

  end_first = max_bytes.dup
  end_first[0...idx] = start_bytes[0...idx]
  end_first[idx] = start_bytes[idx]
  first_tails = build_sequences(start_bytes, end_first, min_bytes, max_bytes, idx + 1)
  sequences.concat(first_tails.map { |t| [byte_node(start_bytes[idx])] + t })

  if start_bytes[idx] + 1 <= end_bytes[idx] - 1
    middle = byte_range_node(start_bytes[idx] + 1, end_bytes[idx] - 1)
    sequences << ([middle] + full_range_nodes(min_bytes, max_bytes, idx + 1))
  end

  start_last = min_bytes.dup
  start_last[0...idx] = end_bytes[0...idx]
  start_last[idx] = end_bytes[idx]
  last_tails = build_sequences(start_last, end_bytes, min_bytes, max_bytes, idx + 1)
  sequences.concat(last_tails.map { |t| [byte_node(end_bytes[idx])] + t })

  sequences
end

.byte_bounds_for_lead(lead, length) ⇒ `Object`

# File 'lib/lexer_kit/dfa/utf8_range.rb', line 138

def self.byte_bounds_for_lead(lead, length)
  case length
  when 1
    [[lead], [lead]]
  when 2
    [[lead, 0x80], [lead, 0xBF]]
  when 3
    min_second, max_second = if lead == 0xE0
                               [0xA0, 0xBF]
                             elsif lead == 0xED
                               [0x80, 0x9F]
                             else
                               [0x80, 0xBF]
                             end
    [[lead, min_second, 0x80], [lead, max_second, 0xBF]]
  when 4
    min_second, max_second = if lead == 0xF0
                               [0x90, 0xBF]
                             elsif lead == 0xF4
                               [0x80, 0x8F]
                             else
                               [0x80, 0xBF]
                             end
    [[lead, min_second, 0x80, 0x80], [lead, max_second, 0xBF, 0xBF]]
  else
    [[], []]
  end
end

.byte_node(byte) ⇒ `Object`



98
99
100

# File 'lib/lexer_kit/dfa/utf8_range.rb', line 98

def self.byte_node(byte)
  RegexAST::Literal.new(byte: byte, meta: nil)
end

.byte_range_node(min_byte, max_byte) ⇒ `Object`

# File 'lib/lexer_kit/dfa/utf8_range.rb', line 102

def self.byte_range_node(min_byte, max_byte)
  if min_byte == max_byte
    RegexAST::Literal.new(byte: min_byte, meta: nil)
  else
    RegexAST::CharClass.new(ranges: [[min_byte, max_byte]], negated: false, meta: nil)
  end
end

.codepoint_bounds(length) ⇒ `Object`

# File 'lib/lexer_kit/dfa/utf8_range.rb', line 114

def self.codepoint_bounds(length)
  case length
  when 1 then [0x0, 0x7F]
  when 2 then [0x80, 0x7FF]
  when 3 then [0x800, 0xFFFF]
  when 4 then [0x10000, 0x10FFFF]
  else [1, 0]
  end
end

.codepoint_range_for_lead(lead, length) ⇒ `Object`

# File 'lib/lexer_kit/dfa/utf8_range.rb', line 167

def self.codepoint_range_for_lead(lead, length)
  min_bytes, max_bytes = byte_bounds_for_lead(lead, length)
  min_cp = min_bytes.pack("C*").force_encoding("UTF-8").ord
  max_cp = max_bytes.pack("C*").force_encoding("UTF-8").ord
  [min_cp, max_cp]
end

.full_range_nodes(min_bytes, max_bytes, idx) ⇒ `Object`

# File 'lib/lexer_kit/dfa/utf8_range.rb', line 92

def self.full_range_nodes(min_bytes, max_bytes, idx)
  (idx...min_bytes.size).map do |pos|
    byte_range_node(min_bytes[pos], max_bytes[pos])
  end
end

.lead_bytes_for_length(length) ⇒ `Object`

# File 'lib/lexer_kit/dfa/utf8_range.rb', line 128

def self.lead_bytes_for_length(length)
  case length
  when 1 then (0x00..0x7F).to_a
  when 2 then (0xC2..0xDF).to_a
  when 3 then (0xE0..0xEF).to_a
  when 4 then (0xF0..0xF4).to_a
  else []
  end
end

.overlaps_surrogates?(start_cp, end_cp) ⇒ `Boolean`

Returns:

(Boolean)



124
125
126

# File 'lib/lexer_kit/dfa/utf8_range.rb', line 124

def self.overlaps_surrogates?(start_cp, end_cp)
  start_cp <= 0xDFFF && end_cp >= 0xD800
end

.utf8_bytes(codepoint) ⇒ `Object`



110
111
112

# File 'lib/lexer_kit/dfa/utf8_range.rb', line 110

def self.utf8_bytes(codepoint)
  [codepoint].pack("U").bytes
end

Module: LexerKit::DFA::Utf8Range

Overview

Class Method Summary collapse

Class Method Details

.ast_for_ranges(ranges) ⇒ Object

.build_for_length(start_cp, end_cp, length) ⇒ Object

.build_range(start_cp, end_cp) ⇒ Object

.build_sequences(start_bytes, end_bytes, min_bytes, max_bytes, idx) ⇒ Object

.byte_bounds_for_lead(lead, length) ⇒ Object

.byte_node(byte) ⇒ Object

.byte_range_node(min_byte, max_byte) ⇒ Object

.codepoint_bounds(length) ⇒ Object

.codepoint_range_for_lead(lead, length) ⇒ Object

.full_range_nodes(min_bytes, max_bytes, idx) ⇒ Object

.lead_bytes_for_length(length) ⇒ Object

.overlaps_surrogates?(start_cp, end_cp) ⇒ Boolean

.utf8_bytes(codepoint) ⇒ Object

.ast_for_ranges(ranges) ⇒ `Object`

.build_for_length(start_cp, end_cp, length) ⇒ `Object`

.build_range(start_cp, end_cp) ⇒ `Object`

.build_sequences(start_bytes, end_bytes, min_bytes, max_bytes, idx) ⇒ `Object`

.byte_bounds_for_lead(lead, length) ⇒ `Object`

.byte_node(byte) ⇒ `Object`

.byte_range_node(min_byte, max_byte) ⇒ `Object`

.codepoint_bounds(length) ⇒ `Object`

.codepoint_range_for_lead(lead, length) ⇒ `Object`

.full_range_nodes(min_bytes, max_bytes, idx) ⇒ `Object`

.lead_bytes_for_length(length) ⇒ `Object`

.overlaps_surrogates?(start_cp, end_cp) ⇒ `Boolean`

.utf8_bytes(codepoint) ⇒ `Object`