Module: Regexgen
- Defined in:
- lib/regexgen.rb,
lib/regexgen/ast.rb,
lib/regexgen/set.rb,
lib/regexgen/trie.rb,
lib/regexgen/regex.rb,
lib/regexgen/state.rb,
lib/regexgen/version.rb,
lib/regexgen/minimize.rb
Overview
Generate regular expressions that match a set of strings
Defined Under Namespace
Modules: Ast, SetUtil Classes: State, Trie
Constant Summary collapse
- VERSION =
'0.3.1'
Class Method Summary collapse
- .common_substring(a, b, side) ⇒ Object
- .concat(a, b) ⇒ Object
- .generate(strings, flags = nil) ⇒ Object
-
.minimize(root) ⇒ Object
Hopcroft’s DSA minimization algorithm en.wikipedia.org/wiki/DFA_minimization#Hopcroft’s_algorithm.
- .remove_common_substring(a, b, side) ⇒ Object
- .star(exp) ⇒ Object
-
.to_regex(root) ⇒ Object
Brzozowski algebraic method cs.stackexchange.com/a/2392.
- .union(a, b) ⇒ Object
Class Method Details
.common_substring(a, b, side) ⇒ Object
130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 |
# File 'lib/regexgen/regex.rb', line 130 def common_substring(a, b, side) dir = side == :start ? 1 : -1 a = a.chars b = b.chars ai = dir == 1 ? 0 : a.length - 1 ae = dir == 1 ? a.length : -1 bi = dir == 1 ? 0 : b.length - 1 be = dir == 1 ? b.length : -1 res = '' while ai != ae && bi != be && a[ai] == b[bi] if dir == 1 res += a[ai] else res = a[ai] + res end ai += dir bi += dir end res end |
.concat(a, b) ⇒ Object
153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 |
# File 'lib/regexgen/regex.rb', line 153 def concat(a, b) return unless a && b return b if a.respond_to?(:empty?) && a.empty? return a if b.respond_to?(:empty?) && b.empty? return Ast::Literal.new(a.value + b.value) if a.is_a?(Ast::Literal) && b.is_a?(Ast::Literal) if a.is_a?(Ast::Literal) && b.is_a?(Ast::Concatenation) && b.a.is_a?(Ast::Literal) return Ast::Concatenation.new(Ast::Literal.new(a.value + b.a.value), b.b) end if b.is_a?(Ast::Literal) && a.is_a?(Ast::Concatenation) && a.b.is_a?(Ast::Literal) return Ast::Concatenation.new(a.a, Ast::Literal.new(a.b.value + b.value)) end Ast::Concatenation.new(a, b) end |
.generate(strings, flags = nil) ⇒ Object
9 10 11 12 |
# File 'lib/regexgen.rb', line 9 def generate(strings, flags = nil) Trie.new.tap { |t| strings.each(&t.method(:add)) } .to_regex(flags) end |
.minimize(root) ⇒ Object
Hopcroft’s DSA minimization algorithm en.wikipedia.org/wiki/DFA_minimization#Hopcroft’s_algorithm
Largely ported from github.com/devongovett/regexgen/blob/7ef10aef3a414b10554822cdf6e90389582b1890/src/minimize.js
P := {F, Q \ F};
W := {F, Q \ F};
while (W is not empty) do
choose and remove a set A from W
for each c in
Key:
… is a Set (yes we have sets of sets here) A \ B is complement (A - B) Q is all states F is final states Σ is the DFA’s alphabet c is a letter of the alphabet A ∩ B is intersection (A & B) |A| is the cardinality of A (A.size)
44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 |
# File 'lib/regexgen/minimize.rb', line 44 def minimize(root) states = root.visit final_states = states.select(&:accepting).to_set # Create a map of incoming transitions to each state, grouped by # character. transitions = Hash.new { |h, k| h[k] = Hash.new { |h_, k_| h_[k_] = Set.new } } states.each_with_object(transitions) do |s, acc| s.transitions.each do |t, st| acc[st][t] << s end end p = Set[states, final_states] w = Set.new(p) until w.empty? a = w.shift # Collect states that have transitions leading to states in a, grouped # by character. t = Hash.new { |h, k| h[k] = Set.new } a.each_with_object(t) do |s, acc| transitions[s].each do |t_, x| acc[t_].merge(x) end end t.each_value do |x| p.to_a.each do |y| intersection = x & y next if intersection.empty? complement = y - x next if complement.empty? p.replace_item(y, intersection, complement) if w.include?(y) w.replace_item(y, intersection, complement) elsif intersection.size <= complement.size w.add(intersection) else w.add(complement) end end end end new_states = Hash.new { |hash, key| hash[key] = State.new } initial = nil p.each do |s| first = s.first s_ = new_states[s] first.transitions.each do |c, old| s_.transitions[c] = new_states[p.find { |v| v.include?(old) }] end s_.accepting = first.accepting initial = s_ if s.include?(root) end initial end |
.remove_common_substring(a, b, side) ⇒ Object
116 117 118 119 120 121 122 123 124 125 126 127 128 |
# File 'lib/regexgen/regex.rb', line 116 def remove_common_substring(a, b, side) al = a.literal(side) if a.respond_to?(:literal) bl = b.literal(side) if b.respond_to?(:literal) return [a, b, nil] if al.nil? || bl.nil? || al.empty? || bl.empty? s = common_substring(al, bl, side) return [a, b, ''] if s.empty? a = a.remove_substring(side, s.length) b = b.remove_substring(side, s.length) [a, b, s] end |
.star(exp) ⇒ Object
80 81 82 |
# File 'lib/regexgen/regex.rb', line 80 def star(exp) Ast::Repetition.new(exp, '*') if exp end |
.to_regex(root) ⇒ Object
Brzozowski algebraic method cs.stackexchange.com/a/2392
Largely ported from github.com/devongovett/regexgen/blob/7ef10aef3a414b10554822cdf6e90389582b1890/src/regex.js
Initialize B
for i = 1 to m:
if final(i):
B[i] :=
Initialize A
for i = 1 to m:
for j = 1 to m:
for a in
Solve
for n = m decreasing to 1:
B[n] := star(A[n,n]) . B[n]
for j = 1 to n:
A[n,j] := star(A[n,n]) . A[n,j];
for i = 1 to n:
B[i] += A[i,n] . B[n]
for j = 1 to n:
A[i,j] += A[i,n] . A[n,j]
Result is e := B
43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 |
# File 'lib/regexgen/regex.rb', line 43 def to_regex(root) states = root.visit.to_a a = [] b = [] states.each_with_index do |a_, i| b[i] = Ast::Literal.new('') if a_.accepting a[i] = [] a_.transitions.each do |t, s| j = states.index(s) a[i][j] = a[i][j] ? union(a[i][j], Ast::Literal.new(t)) : Ast::Literal.new(t) end end (states.length - 1).downto(0) do |n| if a[n][n] b[n] = concat(star(a[n][n]), b[n]) (0...n).each do |j| a[n][j] = concat(star(a[n][n]), a[n][j]) end end (0...n).each do |i| next unless a[i][n] b[i] = union(b[i], concat(a[i][n], b[n])) (0...n).each do |j| a[i][j] = union(a[i][j], concat(a[i][n], a[n][j])) end end end b[0].to_s end |
.union(a, b) ⇒ Object
84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 |
# File 'lib/regexgen/regex.rb', line 84 def union(a, b) if a && b && a != b res = nil a, b, start = remove_common_substring(a, b, :start) a, b, end_ = remove_common_substring(a, b, :end) if (a.respond_to?(:empty?) && a.empty?) || (b.respond_to?(:empty?) && b.empty?) res = Ast::Repetition.new(a.empty? ? b : a, '?') elsif a.is_a?(Ast::Repetition) && a.type == '?' res = Ast::Repetition.new(Ast::Alternation.new(a.expr, b), '?') elsif b.is_a?(Ast::Repetition) && b.type == '?' res = Ast::Repetition.new(Ast::Alternation.new(a, b.expr), '?') else ac = a.char_class if a.respond_to?(:char_class) bc = b.char_class if b.respond_to?(:char_class) res = if ac && bc Ast::CharClass.new(ac, bc) else Ast::Alternation.new(a, b) end end res = Ast::Concatenation.new(Ast::Literal.new(start), res) unless start.nil? || start.empty? res = Ast::Concatenation.new(res, Ast::Literal.new(end_)) unless end_.nil? || end_.empty? return res end a || b end |