Module: LexerKit::DFA::Utf8Range

Defined in:
lib/lexer_kit/dfa/utf8_range.rb

Overview

Build regex AST from UTF-8 codepoint ranges.

Class Method Summary collapse

Class Method Details

.ast_for_ranges(ranges) ⇒ Object



7
8
9
10
11
12
13
14
15
16
# File 'lib/lexer_kit/dfa/utf8_range.rb', line 7

def self.ast_for_ranges(ranges)
  parts = ranges.flat_map do |start_cp, end_cp|
    build_range(start_cp, end_cp)
  end

  return RegexAST::Literal.new(byte: nil, meta: nil) if parts.empty?
  return parts.first if parts.size == 1

  RegexAST::Alternation.new(children: parts, meta: nil)
end

.build_for_length(start_cp, end_cp, length) ⇒ Object



29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# File 'lib/lexer_kit/dfa/utf8_range.rb', line 29

def self.build_for_length(start_cp, end_cp, length)
  if length == 3 && overlaps_surrogates?(start_cp, end_cp)
    segments = []
    segments.concat(build_for_length(start_cp, 0xD7FF, length)) if start_cp <= 0xD7FF
    segments.concat(build_for_length(0xE000, end_cp, length)) if end_cp >= 0xE000
    return segments
  end

  min_cp, max_cp = codepoint_bounds(length)
  return [] if end_cp < min_cp || start_cp > max_cp

  s = [start_cp, min_cp].max
  e = [end_cp, max_cp].min
  return [] if s > e

  segments = []
  lead_bytes_for_length(length).each do |lead|
    lead_min_cp, lead_max_cp = codepoint_range_for_lead(lead, length)
    next if e < lead_min_cp || s > lead_max_cp

    seg_start = [s, lead_min_cp].max
    seg_end = [e, lead_max_cp].min
    start_bytes = utf8_bytes(seg_start)
    end_bytes = utf8_bytes(seg_end)
    min_bytes, max_bytes = byte_bounds_for_lead(lead, length)
    segments.concat(build_sequences(start_bytes, end_bytes, min_bytes, max_bytes, 1).map do |seq|
      RegexAST::Concat.new(children: [byte_node(lead)] + seq, meta: nil)
    end)
  end

  segments
end

.build_range(start_cp, end_cp) ⇒ Object

Raises:

  • (ArgumentError)


18
19
20
21
22
23
24
25
26
27
# File 'lib/lexer_kit/dfa/utf8_range.rb', line 18

def self.build_range(start_cp, end_cp)
  raise ArgumentError, "invalid range" if start_cp > end_cp

  segments = []
  segments.concat(build_for_length(start_cp, end_cp, 1))
  segments.concat(build_for_length(start_cp, end_cp, 2))
  segments.concat(build_for_length(start_cp, end_cp, 3))
  segments.concat(build_for_length(start_cp, end_cp, 4))
  segments
end

.build_sequences(start_bytes, end_bytes, min_bytes, max_bytes, idx) ⇒ Object



62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# File 'lib/lexer_kit/dfa/utf8_range.rb', line 62

def self.build_sequences(start_bytes, end_bytes, min_bytes, max_bytes, idx)
  return [[]] if idx >= start_bytes.size

  if start_bytes[idx] == end_bytes[idx]
    tails = build_sequences(start_bytes, end_bytes, min_bytes, max_bytes, idx + 1)
    return tails.map { |t| [byte_node(start_bytes[idx])] + t }
  end

  sequences = []

  end_first = max_bytes.dup
  end_first[0...idx] = start_bytes[0...idx]
  end_first[idx] = start_bytes[idx]
  first_tails = build_sequences(start_bytes, end_first, min_bytes, max_bytes, idx + 1)
  sequences.concat(first_tails.map { |t| [byte_node(start_bytes[idx])] + t })

  if start_bytes[idx] + 1 <= end_bytes[idx] - 1
    middle = byte_range_node(start_bytes[idx] + 1, end_bytes[idx] - 1)
    sequences << ([middle] + full_range_nodes(min_bytes, max_bytes, idx + 1))
  end

  start_last = min_bytes.dup
  start_last[0...idx] = end_bytes[0...idx]
  start_last[idx] = end_bytes[idx]
  last_tails = build_sequences(start_last, end_bytes, min_bytes, max_bytes, idx + 1)
  sequences.concat(last_tails.map { |t| [byte_node(end_bytes[idx])] + t })

  sequences
end

.byte_bounds_for_lead(lead, length) ⇒ Object



138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
# File 'lib/lexer_kit/dfa/utf8_range.rb', line 138

def self.byte_bounds_for_lead(lead, length)
  case length
  when 1
    [[lead], [lead]]
  when 2
    [[lead, 0x80], [lead, 0xBF]]
  when 3
    min_second, max_second = if lead == 0xE0
                               [0xA0, 0xBF]
                             elsif lead == 0xED
                               [0x80, 0x9F]
                             else
                               [0x80, 0xBF]
                             end
    [[lead, min_second, 0x80], [lead, max_second, 0xBF]]
  when 4
    min_second, max_second = if lead == 0xF0
                               [0x90, 0xBF]
                             elsif lead == 0xF4
                               [0x80, 0x8F]
                             else
                               [0x80, 0xBF]
                             end
    [[lead, min_second, 0x80, 0x80], [lead, max_second, 0xBF, 0xBF]]
  else
    [[], []]
  end
end

.byte_node(byte) ⇒ Object



98
99
100
# File 'lib/lexer_kit/dfa/utf8_range.rb', line 98

def self.byte_node(byte)
  RegexAST::Literal.new(byte: byte, meta: nil)
end

.byte_range_node(min_byte, max_byte) ⇒ Object



102
103
104
105
106
107
108
# File 'lib/lexer_kit/dfa/utf8_range.rb', line 102

def self.byte_range_node(min_byte, max_byte)
  if min_byte == max_byte
    RegexAST::Literal.new(byte: min_byte, meta: nil)
  else
    RegexAST::CharClass.new(ranges: [[min_byte, max_byte]], negated: false, meta: nil)
  end
end

.codepoint_bounds(length) ⇒ Object



114
115
116
117
118
119
120
121
122
# File 'lib/lexer_kit/dfa/utf8_range.rb', line 114

def self.codepoint_bounds(length)
  case length
  when 1 then [0x0, 0x7F]
  when 2 then [0x80, 0x7FF]
  when 3 then [0x800, 0xFFFF]
  when 4 then [0x10000, 0x10FFFF]
  else [1, 0]
  end
end

.codepoint_range_for_lead(lead, length) ⇒ Object



167
168
169
170
171
172
# File 'lib/lexer_kit/dfa/utf8_range.rb', line 167

def self.codepoint_range_for_lead(lead, length)
  min_bytes, max_bytes = byte_bounds_for_lead(lead, length)
  min_cp = min_bytes.pack("C*").force_encoding("UTF-8").ord
  max_cp = max_bytes.pack("C*").force_encoding("UTF-8").ord
  [min_cp, max_cp]
end

.full_range_nodes(min_bytes, max_bytes, idx) ⇒ Object



92
93
94
95
96
# File 'lib/lexer_kit/dfa/utf8_range.rb', line 92

def self.full_range_nodes(min_bytes, max_bytes, idx)
  (idx...min_bytes.size).map do |pos|
    byte_range_node(min_bytes[pos], max_bytes[pos])
  end
end

.lead_bytes_for_length(length) ⇒ Object



128
129
130
131
132
133
134
135
136
# File 'lib/lexer_kit/dfa/utf8_range.rb', line 128

def self.lead_bytes_for_length(length)
  case length
  when 1 then (0x00..0x7F).to_a
  when 2 then (0xC2..0xDF).to_a
  when 3 then (0xE0..0xEF).to_a
  when 4 then (0xF0..0xF4).to_a
  else []
  end
end

.overlaps_surrogates?(start_cp, end_cp) ⇒ Boolean

Returns:

  • (Boolean)


124
125
126
# File 'lib/lexer_kit/dfa/utf8_range.rb', line 124

def self.overlaps_surrogates?(start_cp, end_cp)
  start_cp <= 0xDFFF && end_cp >= 0xD800
end

.utf8_bytes(codepoint) ⇒ Object



110
111
112
# File 'lib/lexer_kit/dfa/utf8_range.rb', line 110

def self.utf8_bytes(codepoint)
  [codepoint].pack("U").bytes
end