Module: Regexgen

Defined in:
lib/regexgen.rb,
lib/regexgen/ast.rb,
lib/regexgen/set.rb,
lib/regexgen/trie.rb,
lib/regexgen/regex.rb,
lib/regexgen/state.rb,
lib/regexgen/version.rb,
lib/regexgen/minimize.rb

Overview

Generate regular expressions that match a set of strings

Defined Under Namespace

Modules: Ast, SetUtil Classes: State, Trie

Constant Summary collapse

VERSION =
'0.3.1'

Class Method Summary collapse

Class Method Details

.common_substring(a, b, side) ⇒ Object



130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
# File 'lib/regexgen/regex.rb', line 130

def common_substring(a, b, side)
  dir = side == :start ? 1 : -1
  a = a.chars
  b = b.chars
  ai = dir == 1 ? 0 : a.length - 1
  ae = dir == 1 ? a.length : -1
  bi = dir == 1 ? 0 : b.length - 1
  be = dir == 1 ? b.length : -1
  res = ''

  while ai != ae && bi != be && a[ai] == b[bi]
    if dir == 1
      res += a[ai]
    else
      res = a[ai] + res
    end
    ai += dir
    bi += dir
  end

  res
end

.concat(a, b) ⇒ Object



153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
# File 'lib/regexgen/regex.rb', line 153

def concat(a, b)
  return unless a && b

  return b if a.respond_to?(:empty?) && a.empty?
  return a if b.respond_to?(:empty?) && b.empty?

  return Ast::Literal.new(a.value + b.value) if a.is_a?(Ast::Literal) && b.is_a?(Ast::Literal)

  if a.is_a?(Ast::Literal) && b.is_a?(Ast::Concatenation) && b.a.is_a?(Ast::Literal)
    return Ast::Concatenation.new(Ast::Literal.new(a.value + b.a.value), b.b)
  end

  if b.is_a?(Ast::Literal) && a.is_a?(Ast::Concatenation) && a.b.is_a?(Ast::Literal)
    return Ast::Concatenation.new(a.a, Ast::Literal.new(a.b.value + b.value))
  end

  Ast::Concatenation.new(a, b)
end

.generate(strings, flags = nil) ⇒ Object



9
10
11
12
# File 'lib/regexgen.rb', line 9

def generate(strings, flags = nil)
  Trie.new.tap { |t| strings.each(&t.method(:add)) }
      .to_regex(flags)
end

.minimize(root) ⇒ Object

Hopcroft’s DSA minimization algorithm en.wikipedia.org/wiki/DFA_minimization#Hopcroft’s_algorithm

Largely ported from github.com/devongovett/regexgen/blob/7ef10aef3a414b10554822cdf6e90389582b1890/src/minimize.js

P := {F, Q \ F};
W := {F, Q \ F};
while (W is not empty) do
     choose and remove a set A from W
     for each c in 

Key:

… is a Set (yes we have sets of sets here) A \ B is complement (A - B) Q is all states F is final states Σ is the DFA’s alphabet c is a letter of the alphabet A ∩ B is intersection (A & B) |A| is the cardinality of A (A.size)



44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# File 'lib/regexgen/minimize.rb', line 44

def minimize(root)
  states = root.visit
  final_states = states.select(&:accepting).to_set

  # Create a map of incoming transitions to each state, grouped by
  # character.
  transitions = Hash.new { |h, k| h[k] = Hash.new { |h_, k_| h_[k_] = Set.new } }
  states.each_with_object(transitions) do |s, acc|
    s.transitions.each do |t, st|
      acc[st][t] << s
    end
  end

  p = Set[states, final_states]
  w = Set.new(p)
  until w.empty?
    a = w.shift

    # Collect states that have transitions leading to states in a, grouped
    # by character.
    t = Hash.new { |h, k| h[k] = Set.new }
    a.each_with_object(t) do |s, acc|
      transitions[s].each do |t_, x|
        acc[t_].merge(x)
      end
    end

    t.each_value do |x|
      p.to_a.each do |y|
        intersection = x & y
        next if intersection.empty?

        complement = y - x
        next if complement.empty?

        p.replace_item(y, intersection, complement)
        if w.include?(y)
          w.replace_item(y, intersection, complement)
        elsif intersection.size <= complement.size
          w.add(intersection)
        else
          w.add(complement)
        end
      end
    end
  end

  new_states = Hash.new { |hash, key| hash[key] = State.new }
  initial = nil

  p.each do |s|
    first = s.first
    s_ = new_states[s]
    first.transitions.each do |c, old|
      s_.transitions[c] = new_states[p.find { |v| v.include?(old) }]
    end

    s_.accepting = first.accepting

    initial = s_ if s.include?(root)
  end

  initial
end

.remove_common_substring(a, b, side) ⇒ Object



116
117
118
119
120
121
122
123
124
125
126
127
128
# File 'lib/regexgen/regex.rb', line 116

def remove_common_substring(a, b, side)
  al = a.literal(side) if a.respond_to?(:literal)
  bl = b.literal(side) if b.respond_to?(:literal)
  return [a, b, nil] if al.nil? || bl.nil? || al.empty? || bl.empty?

  s = common_substring(al, bl, side)
  return [a, b, ''] if s.empty?

  a = a.remove_substring(side, s.length)
  b = b.remove_substring(side, s.length)

  [a, b, s]
end

.star(exp) ⇒ Object



80
81
82
# File 'lib/regexgen/regex.rb', line 80

def star(exp)
  Ast::Repetition.new(exp, '*') if exp
end

.to_regex(root) ⇒ Object

Brzozowski algebraic method cs.stackexchange.com/a/2392

Largely ported from github.com/devongovett/regexgen/blob/7ef10aef3a414b10554822cdf6e90389582b1890/src/regex.js

Initialize B

for i = 1 to m:
  if final(i):
    B[i] := 

Initialize A

for i = 1 to m:
  for j = 1 to m:
    for a in 

Solve

for n = m decreasing to 1:
  B[n] := star(A[n,n]) . B[n]
  for j = 1 to n:
    A[n,j] := star(A[n,n]) . A[n,j];
  for i = 1 to n:
    B[i] += A[i,n] . B[n]
    for j = 1 to n:
      A[i,j] += A[i,n] . A[n,j]

Result is e := B



43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# File 'lib/regexgen/regex.rb', line 43

def to_regex(root)
  states = root.visit.to_a

  a = []
  b = []

  states.each_with_index do |a_, i|
    b[i] = Ast::Literal.new('') if a_.accepting

    a[i] = []
    a_.transitions.each do |t, s|
      j = states.index(s)
      a[i][j] = a[i][j] ? union(a[i][j], Ast::Literal.new(t)) : Ast::Literal.new(t)
    end
  end

  (states.length - 1).downto(0) do |n|
    if a[n][n]
      b[n] = concat(star(a[n][n]), b[n])
      (0...n).each do |j|
        a[n][j] = concat(star(a[n][n]), a[n][j])
      end
    end

    (0...n).each do |i|
      next unless a[i][n]

      b[i] = union(b[i], concat(a[i][n], b[n]))
      (0...n).each do |j|
        a[i][j] = union(a[i][j], concat(a[i][n], a[n][j]))
      end
    end
  end

  b[0].to_s
end

.union(a, b) ⇒ Object



84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# File 'lib/regexgen/regex.rb', line 84

def union(a, b)
  if a && b && a != b
    res = nil
    a, b, start = remove_common_substring(a, b, :start)
    a, b, end_ = remove_common_substring(a, b, :end)

    if (a.respond_to?(:empty?) && a.empty?) || (b.respond_to?(:empty?) && b.empty?)
      res = Ast::Repetition.new(a.empty? ? b : a, '?')
    elsif a.is_a?(Ast::Repetition) && a.type == '?'
      res = Ast::Repetition.new(Ast::Alternation.new(a.expr, b), '?')
    elsif b.is_a?(Ast::Repetition) && b.type == '?'
      res = Ast::Repetition.new(Ast::Alternation.new(a, b.expr), '?')
    else
      ac = a.char_class if a.respond_to?(:char_class)
      bc = b.char_class if b.respond_to?(:char_class)
      res = if ac && bc
              Ast::CharClass.new(ac, bc)
            else
              Ast::Alternation.new(a, b)
            end
    end

    res = Ast::Concatenation.new(Ast::Literal.new(start), res) unless start.nil? || start.empty?

    res = Ast::Concatenation.new(res, Ast::Literal.new(end_)) unless end_.nil? || end_.empty?

    return res
  end

  a || b
end