7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
|
# File 'lib/character_set/expression_converter.rb', line 7
# Recursively converts a parsed regular expression (a regexp_parser
# expression tree, i.e. the result of Regexp::Parser.parse) into the
# CharacterSet of codepoints that the expression can match.
#
# Only expressions that describe a single codepoint position are supported:
# sets, ranges, intersections, character types, escapes, properties, posix
# classes, single-codepoint literals, and groups/alternations wrapping these.
#
# @param expression [Regexp::Expression::Base] the node to convert
# @return [CharacterSet] the codepoints described by +expression+
# @raise [Error] if +expression+ is not a regexp_parser expression, is of an
#   unsupported class, or describes a sequence of more than one codepoint
def convert(expression)
  CharacterSet.require_optional_dependency('regexp_parser')

  case expression
  when Regexp::Expression::Root
    if expression.count != 1
      raise Error, 'Pass a Regexp with exactly one expression, e.g. /[a-z]/'
    end
    convert(expression[0])

  when Regexp::Expression::CharacterSet
    # Union of all members. `reduce` returns nil when there are no members,
    # so fall back to the empty set instead of passing nil along.
    content = expression.map { |subexp| convert(subexp) }.reduce(:+) || CharacterSet[]
    expression.negative? ? content.inversion : content

  when Regexp::Expression::CharacterSet::Intersection
    expression.map { |subexp| convert(subexp) }.reduce(:&)

  when Regexp::Expression::CharacterSet::IntersectedSequence
    # One operand of an intersection; it may have no members (e.g. the right
    # side of `[a&&]`), in which case it is the empty set rather than nil.
    expression.map { |subexp| convert(subexp) }.reduce(:+) || CharacterSet[]

  when Regexp::Expression::CharacterSet::Range
    # Both ends convert to sets; span from the lowest start codepoint to the
    # highest end codepoint.
    start, finish = expression.map { |subexp| convert(subexp) }
    CharacterSet.new((start.min)..(finish.max))

  when Regexp::Expression::CharacterType::Any
    CharacterSet.unicode

  when Regexp::Expression::CharacterType::Base
    # Tokens look like `digit` / `nondigit`: split off the optional `non`
    # prefix. The named groups become the locals `negative` and `base_name`.
    /(?<negative>non)?(?<base_name>.+)/ =~ expression.token
    content =
      if expression.unicode_classes?
        CharacterSet.of_property(base_name)
      else
        # ASCII-only definitions of the character types.
        case base_name.to_sym
        when :digit then CharacterSet.from_ranges(48..57) # 0-9
        when :hex   then CharacterSet.from_ranges(48..57, 65..70, 97..102) # 0-9 A-F a-f
        when :space then CharacterSet.from_ranges(9..13, 32..32) # \t..\r and space
        when :word  then CharacterSet.from_ranges(48..57, 65..90, 95..95, 97..122) # 0-9 A-Z _ a-z
        else raise Error, "Unsupported CharacterType #{base_name}"
        end
      end
    negative ? content.inversion : content

  when Regexp::Expression::EscapeSequence::CodepointList
    CharacterSet.new(expression.codepoints)

  when Regexp::Expression::EscapeSequence::Base
    CharacterSet[expression.codepoint]

  when Regexp::Expression::Group::Capture,
       Regexp::Expression::Group::Passive,
       Regexp::Expression::Group::Named,
       Regexp::Expression::Group::Atomic,
       Regexp::Expression::Group::Options
    case expression.count
    when 0 then CharacterSet[]
    when 1 then convert(expression.first)
    else
      raise Error, 'Groups must contain exactly one expression, e.g. ([a-z])'
    end

  when Regexp::Expression::Alternation
    expression.map { |subexp| convert(subexp) }.reduce(:+)

  when Regexp::Expression::Alternative
    case expression.count
    when 0 then CharacterSet[]
    when 1 then convert(expression.first)
    else
      raise Error, 'Alternatives must contain exactly one expression'
    end

  when Regexp::Expression::Literal
    # Outside of a set, a literal with more than one character is a sequence
    # of codepoints, which a set of codepoints cannot represent.
    if expression.set_level == 0 && expression.text.size != 1
      raise Error, 'Literal runs outside of sets are codepoint *sequences*'
    end
    CharacterSet[expression.text.ord]

  when Regexp::Expression::UnicodeProperty::Base,
       Regexp::Expression::PosixClass
    content = CharacterSet.of_property(expression.token)
    # Posix classes are limited to their ASCII part when the expression
    # reports ASCII classes.
    if expression.type == :posixclass && expression.ascii_classes?
      content = content.ascii_part
    end
    expression.negative? ? content.inversion : content

  when Regexp::Expression::Base
    # Any other regexp_parser node (checked last because it is the common
    # superclass in this dispatch).
    raise Error, "Unsupported expression class `#{expression.class}`"

  else
    raise Error, "Pass an expression (result of Regexp::Parser.parse)"
  end
end
|