Module: CharacterSet::ExpressionConverter

Defined in:
lib/character_set/expression_converter.rb

Constant Summary collapse

Error =
Class.new(ArgumentError)

Class Method Summary collapse

Class Method Details

.convert(expression, to = CharacterSet, acc = []) ⇒ Object



7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# File 'lib/character_set/expression_converter.rb', line 7

# Recursively converts a parsed regular expression node into a set of the
# characters described by its literals, escapes, character sets, character
# types, unicode properties and posix classes.
#
# Zero-length and repeat expressions (anchors, backreferences, keep marks,
# quantifiers) contribute nothing to the result.
#
# @param expression [Regexp::Expression::Base] node to convert
#   (e.g. the result of Regexp::Parser.parse)
# @param to [Class] set class used to build results; it is sent `[]`, `new`,
#   `from_ranges`, `of_property` and `unicode` depending on the node
# @param acc [Array] collects the partial sets; members of generic
#   subexpressions append to the caller's accumulator
# @return [Object] the `+`-combination of all collected sets, or `to[]`
#   when nothing was collected
# @raise [Error] if `expression` is of an unsupported expression class or
#   character type, or is not a Regexp::Expression at all
def convert(expression, to = CharacterSet, acc = [])
  CharacterSet.require_optional_dependency('regexp_parser', __method__)

  case expression
  when Regexp::Expression::CharacterSet
    # members of a bracket expression are alternatives => combine with `+`
    content = expression.map { |subexp| convert(subexp, to) }.reduce(:+) || to[]
    acc << (expression.negative? ? content.inversion : content)

  when Regexp::Expression::CharacterSet::Intersection
    acc << expression.map { |subexp| convert(subexp, to) }.reduce(:&)

  when Regexp::Expression::CharacterSet::Range
    # a range has two members; span from the lowest start to the highest end
    start, finish = expression.map { |subexp| convert(subexp, to) }
    acc << to.new((start.min)..(finish.max))

  when Regexp::Expression::Subexpression # root, group, alternation, etc.
    # children write into the shared accumulator instead of a fresh one
    expression.each { |subexp| convert(subexp, to, acc) }

  when Regexp::Expression::CharacterType::Any
    acc << to.unicode

  when Regexp::Expression::CharacterType::Base
    # tokens of negated types carry a `non` prefix, e.g. :nondigit
    /(?<negative>non)?(?<base_name>.+)/ =~ expression.token
    content =
      if expression.unicode_classes?
        # in u-mode, most type shortcuts match the same as \p{<long type name>}
        if base_name == 'linebreak'
          to.from_ranges(10..13, 133..133, 8232..8233)
        else
          to.of_property(base_name)
        end
      else
        # in normal mode, types match only ascii chars
        case base_name.to_sym
        when :digit     then to.from_ranges(48..57)
        when :hex       then to.from_ranges(48..57, 65..70, 97..102)
        when :linebreak then to.from_ranges(10..13)
        when :space     then to.from_ranges(9..13, 32..32)
        when :word      then to.from_ranges(48..57, 65..90, 95..95, 97..122)
        else raise Error, "Unsupported CharacterType #{base_name}"
        end
      end
    acc << (negative ? content.inversion : content)

  when Regexp::Expression::EscapeSequence::CodepointList
    content = to.new(expression.codepoints)
    acc << (expression.i? ? content.case_insensitive : content)

  when Regexp::Expression::EscapeSequence::Base
    content = to[expression.codepoint]
    acc << (expression.i? ? content.case_insensitive : content)

  when Regexp::Expression::Literal
    content = to[*expression.text.chars]
    acc << (expression.i? ? content.case_insensitive : content)

  when Regexp::Expression::UnicodeProperty::Base,
       Regexp::Expression::PosixClass
    content = to.of_property(expression.token)
    # Restrict posix classes to ascii in ascii mode. Check the class rather
    # than the type: negated posix classes have type :nonposixclass, and
    # must be restricted *before* inversion so that `(?a)[[:^alpha:]]`
    # still includes non-ascii characters.
    if expression.is_a?(Regexp::Expression::PosixClass) && expression.ascii_classes?
      content = content.ascii_part
    end
    acc << (expression.negative? ? content.inversion : content)

  when Regexp::Expression::Anchor::Base,
       Regexp::Expression::Backreference::Base,
       Regexp::Expression::Keep::Mark,
       Regexp::Expression::Quantifier
    # ignore zero-length and repeat expressions

  when Regexp::Expression::Base
    raise Error, "Unsupported expression class `#{expression.class}`"

  else
    raise Error, 'Pass an expression (result of Regexp::Parser.parse)'
  end

  acc.reduce(:+) || to[]
end