Module: ThompsonConstruction

Included in:
ScannerGenerator::FiniteStateMachine
Defined in:
lib/scanner_generator/thompson_construction.rb

Constant Summary collapse

PENDING =
0

Instance Method Summary collapse

Instance Method Details

#absorb_left_alt(machines) ⇒ Object



134
135
136
137
138
139
140
141
142
143
144
145
146
# File 'lib/scanner_generator/thompson_construction.rb', line 134

def absorb_left_alt(machines)
  new_machines = Array.new
  machines.each_with_index do |mach,ii|
    if mach[1].nil? || mach[1].empty? # This machine is complete.
      new_machines.push([mach[0],nil])
    else
      src, dest = mach[1].shift, mach[1].shift
      m = mach[0].replace_edge(src,PENDING,dest,new_machines.pop[0]) # NON-LAMBDA VERSION
      new_machines.push([m,mach[1]])
    end
  end
  new_machines
end

#absorb_right_alt(machines) ⇒ Object



148
149
150
# File 'lib/scanner_generator/thompson_construction.rb', line 148

def absorb_right_alt(machines)
  absorb_left_alt(machines.reverse).reverse
end

#alt_machineObject



224
225
226
227
228
229
230
231
232
233
234
235
# File 'lib/scanner_generator/thompson_construction.rb', line 224

def alt_machine
  FiniteStateMachine.new({
    :accept_states => {5=>'end'},
    :graph_hash    => {
      0 => {LAMBDA => [1,3]},
      1 => {PENDING => [2]},
      2 => {LAMBDA => [5]},
      3 => {PENDING => [4]},
      4 => {LAMBDA => [5]}
    }
  })
end

#build_machine_stack(re) ⇒ Object

Thompson-McNaughton-Yamada Construction Section



6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# File 'lib/scanner_generator/thompson_construction.rb', line 6

def build_machine_stack(re)
  skip = 0
  escaped = false
  machines = Array.new
  (0...re.length).each do |ii| # the pointer in some cases.
    (skip -= 1) && next if skip != 0 # Advance ptr until past () group
    ch = re[ii] #re[-ii-1]
    if escaped
      case ch
      when 'n'
        machines.push([cat_machine("\n"), nil])
      else
        machines.push([cat_machine(ch), nil])
      end
      escaped = false
      next
    end
    case(ch)
    when '*' then machines.push([kleene_machine, [1,2]])
    when '+' then machines.push([plus_machine, [1,2]])
    #when '+' then machines.push([plus_machine, [[0,1],[1,1]]])
    when '?' then machines.push([question_machine, [0,1]])
    when '|' then machines.push([alt_machine, [1,2,3,4]])
    when ']' then raise "mismatched bracket closed a non-open class"
    when ')' then raise "mismatched paren closed a non-open group"
    when '('# ; puts "#{ms}\tGRPOPEN\nencounted closing paren. following chars #{re[ii+1]}#{re[ii+2]}" 
      subexpression = ''
      nesting = 0
      until (ch2 = re[ii+=1]) == ')' && nesting == 0 # Until the next character is '('
        nesting -= 1 if ch2 == ')'
        nesting += 1 if ch2 == '('
        subexpression << ch2
        #skip += 1
      end
      #skip += 1
      subgraph = re2nfa(subexpression)
      skip = subexpression.length+1 # the +1 is used to skip the closing )
      machines.push([subgraph, nil])
    when '['
      char_class = get_char_class(re[ii..-1]) # search rest of the string for []-expression
      machines.push([cat_machine(/#{char_class}/), nil])
      skip = char_class.length - 1 + char_class.scan(/\\/).length # compensate for 2 '\'s counting as 1
      # The below skip assignment works if we want to allow for odd numbers of slashes, but it's
      # not desirable, because it would allow [\n] to be [n].
      # We're reserving \ for escaping *, +, ?, etc. symbols.
      #skip = char_class.length - 1 +
      #       char_class.scan(/\\/).length*2 -
      #       char_class.scan(/\\\\/).length # compensate for 2 '\'s counting as 1
    when '\\' #; escaped = true unless escaped== true
      if escaped  # '\\' -> cat a slash
        machines.push([cat_machine(ch), nil])
        escaped = false
      else
        escaped = true
      end
    else
      machines.push([cat_machine(ch), nil])
    end
  end
  machines
end

#cat_machine(ch) ⇒ Object



204
205
206
207
208
209
# File 'lib/scanner_generator/thompson_construction.rb', line 204

def cat_machine(ch)
  FiniteStateMachine.new({
    :accept_states => {1=>'end'},
    :graph_hash    => {0 => {ch => [1]}}
  })
end

#catify(machines) ⇒ Object



105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# File 'lib/scanner_generator/thompson_construction.rb', line 105

def catify(machines)
  new_machines = Array.new
  machines.each_with_index do |mach,ii|
    if ii == 0
      new_machines.push([mach[0],nil])
    elsif (mach[1].nil? && machines[ii-1][1].nil?)
        # This machine AND PREVIOUS are each a cat or finished */?/+
        # This code is suspiciously similar to the wrap-up code of re2nfa()
        # which implies that it's not DRY. This is something to revisit.
        lead = new_machines.pop[0]
        offset = lead.get_node_count-1
        acc = lead.accept_states.keys.first || 0
        lead.imp_attach_graph(acc,mach[0])
        lead.accept_states.delete_if do |acc_st|
          !mach[0].accept_states.keys.include?(acc_st-offset)
        end
        new_machines.push([lead,nil])
    else
      new_machines.push([mach[0],mach[1]])
    end
  end
  new_machines
end

#cloneObject



358
359
360
# File 'lib/scanner_generator/thompson_construction.rb', line 358

def clone
  Marshal.load( Marshal.dump(self) )
end

#get_char_class(str) ⇒ Object



68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# File 'lib/scanner_generator/thompson_construction.rb', line 68

def get_char_class(str)
  escaped = false
  result = ''

  str.each_char.with_index do |ch,ii|
    if escaped == false && ch == ']' # done reading current class
      result += ch
      return result
    elsif escaped == true
      result = result[0..-2]+ch
    else
      result += ch
    end
    escaped = (ch == '\\' && escaped==false)    
  end
  raise 'character class improperly closed!'
end

#handle_alternation(machines) ⇒ Object



129
130
131
132
# File 'lib/scanner_generator/thompson_construction.rb', line 129

def handle_alternation(machines)
  machines = absorb_left_alt(machines)
  machines = absorb_right_alt(machines)
end

#imp_attach_graph(attach_point, fsm) ⇒ Object

imp_attach_graph: increments fsm’s node numbers by 1-CALLER.node_count

takes edges outgoing from fsm.origin and adds them to attach_point


313
314
315
316
317
318
319
320
321
322
323
324
325
# File 'lib/scanner_generator/thompson_construction.rb', line 313

def imp_attach_graph(attach_point, fsm)
  my_node_count = get_node_count
  graft = fsm.clone
  graft.increment_node_labels(my_node_count-1) # prevent collisions
  
  graft_root_edges = graft.graph_hash.delete(graft.origin)
  @graph_hash[attach_point] ||= Hash.new
  @graph_hash[attach_point].merge!(graft_root_edges)

  @accept_states.merge!(graft.accept_states)
  @graph_hash.merge!(graft.graph_hash)
  get_node_count
end

#kleene_machineObject



237
238
239
240
241
242
243
244
245
246
# File 'lib/scanner_generator/thompson_construction.rb', line 237

def kleene_machine
  FiniteStateMachine.new({
    :accept_states => {3=>'end'},
    :graph_hash    => {
      0 => {LAMBDA => [1,3]},
      1 => {PENDING => [2]},
      2 => {LAMBDA => [1,3]}
    }
  })
end

#kleene_up(machines) ⇒ Object



86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# File 'lib/scanner_generator/thompson_construction.rb', line 86

def kleene_up(machines)
  new_machines = Array.new
  machines.each_with_index do |mach,ii|
    if mach[1].nil? || mach[1].empty? # This machine is complete.
      new_machines.push([mach[0],nil])
    else
      if mach[1].length == 2 # Deals with *, ?, and +, who all have same precedence
        src, dest = mach[1].shift, mach[1].shift
        #m = mach[0].lambda_replace_edge(src,PENDING,dest,new_machines.pop) # LAMBDA VERSION
        m = mach[0].replace_edge(src,PENDING,dest,new_machines.pop[0]) # NON-LAMBDA VERSION
        new_machines.push([m,nil])
      else # dealing with |
        new_machines.push([mach[0],mach[1]])
      end
    end
  end
  new_machines
end

#lambda_inject_graph(graph, src, dest) ⇒ Object



350
351
352
353
354
355
356
# File 'lib/scanner_generator/thompson_construction.rb', line 350

def lambda_inject_graph(graph, src, dest)
  old_node_count = get_node_count
  lambda_attach_graph(src, graph)
  graph.accept_states.keys.each {|k| add_edge(k+old_node_count, LAMBDA, dest)}
  graph.accept_states.keys.each {|k| @accept_states.delete(k+old_node_count)}
  self
end

#lambda_replace_edge(src, label, dest, graph) ⇒ Object



340
341
342
343
344
345
346
347
348
# File 'lib/scanner_generator/thompson_construction.rb', line 340

def lambda_replace_edge(src, label, dest, graph)
  if @graph_hash[src][label].class == Fixnum
    @graph_hash[src][label] = [@graph_hash[src][label]]
  end
  #@graph_hash[src][label].delete(dest)
  lambda_inject_graph(graph,src,dest)
  delete_edge(src,label,dest)
  self
end

#plus_machineObject



248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
# File 'lib/scanner_generator/thompson_construction.rb', line 248

def plus_machine
  FiniteStateMachine.new({
    :accept_states => {3=>'end'},
    :graph_hash    => {
      0 => {LAMBDA => [1]},
      1 => {PENDING => [2]},
      2 => {LAMBDA => [1,3]}
    }
    # The below machine would be more concise, but we'd need to add in logic to replace TWO EDGES with one absorb.
    #:accept_states => {1=>'end'},
    #:graph_hash => {
    #  0 => {PENDING => [1]},
    #  1 => {PENDING => [1]}
    #}
  })
end

#prepend_graph(fsm) ⇒ Object



199
200
201
202
# File 'lib/scanner_generator/thompson_construction.rb', line 199

def prepend_graph(fsm)
  fsm.imp_attach_graph(fsm.accept_states.keys[0],self)
  copy(fsm)
end

#question_machineObject



211
212
213
214
215
216
217
218
219
220
221
222
# File 'lib/scanner_generator/thompson_construction.rb', line 211

def question_machine
  FiniteStateMachine.new({
    :accept_states => {1=>'end'},
    :graph_hash    => {0 => {PENDING => [1], LAMBDA => [1]}}
    #:accept_states => {3=>'end'},
    #:graph_hash => {
    #  0 => {LAMBDA => [1,3]},
    #  1 => {PENDING => [2]},
    #  2 => {LAMBDA => [3]}
    #}
  })
end

#re2nfa(re) ⇒ Object

make one pass forwards, completing all kleene stars and all alt LHSs make one pass backwards, completing all alt RHSs if any unfulfilled dependencies remain, my assumptions were mistaken



165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
# File 'lib/scanner_generator/thompson_construction.rb', line 165

def re2nfa(re)
  #puts "re2nfa: #{re}"
  fsconstruct = FiniteStateMachine.new({:accept_states => {0=>'eh'},
                                :graph_hash    => {0=>{PENDING=>[0]}}})
  machines = build_machine_stack(re)
  machines = kleene_up(machines)
  machines = catify(machines)
  machines = handle_alternation(machines)

  #puts "New machines:"
  machines.each_with_index do |mach,ii|
    m = mach[0]
    offset = fsconstruct.get_node_count-1
    acc = fsconstruct.accept_states.keys.first || 0 # Attachment point is accept state
    fsconstruct.imp_attach_graph(acc, m)
    fsconstruct.accept_states.delete_if do |acc_st|
      #puts "purging acc #{acc}" if !m.accept_states.keys.include?(acc-offset)
      !m.accept_states.keys.include?(acc_st-offset)
    end
  end

  fsconstruct.delete_edge(0,PENDING,0)
  #@graph_hash = fsconstruct.graph_hash
  #@accept_states = fsconstruct.accept_states
  FiniteStateMachine.new({
    :graph_hash   => fsconstruct.graph_hash,
    :accept_states => fsconstruct.accept_states
    })
end

#renumber!Object

ensure no gaps in our node names!



300
301
302
303
304
305
306
307
308
309
# File 'lib/scanner_generator/thompson_construction.rb', line 300

def renumber!
  get_node_names.each_with_index do |n,ii|
    if n != ii
      retarget_edges(n,ii)
      @accept_states[ii] = @accept_states.delete(n) unless @accept_states[n].nil?
      @graph_hash[ii] = @graph_hash.delete(n)
    end
  end
  self
end

#replace_edge(src, label, dest, graph) ⇒ Object

graph edges going from origin become outgoing from src graph edges going TO final state instead go TO dest

What is "final state?" Any accept state?

@graph_hash[label].delete(dest)



272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
# File 'lib/scanner_generator/thompson_construction.rb', line 272

def replace_edge(src, label, dest, graph)
  raise "can't inject a graph that had no accept states" if graph.accept_states.nil? || graph.accept_states.empty?
  if @graph_hash[src][label].class == Fixnum
    @graph_hash[src][label] = [@graph_hash[src][label]]
  end

  offset = get_node_count-1
  imp_attach_graph(src, graph)

  #draw_graph('intermediate-self')
  #graph.draw_graph('intermediate-graft')

  # for each of the edges pointing at the accept state of the graph
  # redirect them to point at dest
  #draw_graph('retarget-pre')
  graph.accept_states.keys.each do |acc|
    retarget_edges(acc+offset,dest)
    accept_states.delete(acc+offset)
  end
  delete_edge(src,label,dest)

  renumber!
  #draw_graph('retarget-post')

  self
end

#retarget_edges(old_dest, new_dest) ⇒ Object



327
328
329
330
331
332
333
334
335
336
337
338
# File 'lib/scanner_generator/thompson_construction.rb', line 327

def retarget_edges(old_dest, new_dest)
  @graph_hash.each_pair do |node,edge_hash|
    edge_hash.each_pair do |label, dest|
      if dest.include? old_dest
        #puts "#{node}[#{label}] changed from #{dest} to #{new_dest}"
        add_edge(   node, label, new_dest)
        delete_edge(node, label, old_dest)
      end
    end
  end
  self
end

#set_new_accept(node_number, type = 'end') ⇒ Object



195
196
197
# File 'lib/scanner_generator/thompson_construction.rb', line 195

def set_new_accept(node_number, type='end')
  @accept_states = {node_number => 'end'}
end