Class: Tokenizer

Inherits:
Object
  • Object
show all
Defined in:
lib/tsql_shparser/tsql_tokenizer.rb

Overview

Class to tokenize a given string or file.

Constant Summary collapse

VERSION =
"0.0.1"

Instance Method Summary collapse

Constructor Details

#initialize(file = nil) ⇒ Tokenizer

Returns a new instance of Tokenizer.



130
131
132
133
134
# File 'lib/tsql_shparser/tsql_tokenizer.rb', line 130

# Build a tokenizer, optionally bound to an input file path.
# Starts with an empty token list and the read cursor on slot zero.
def initialize(file=nil)
    @input_file = file   # may stay nil; tokenize_file can supply it later
    @tokens     = []     # triplets of [line, col, token-string]
    @position   = 0      # index of the next token to hand out
end

Instance Method Details

#current_tokenObject



356
# File 'lib/tsql_shparser/tsql_tokenizer.rb', line 356

def current_token;    look_back(0);  end

#get_next_tokenObject



340
341
342
343
344
345
346
347
# File 'lib/tsql_shparser/tsql_tokenizer.rb', line 340

# Consume the token under the cursor: wrap it in a Token, advance the
# cursor one slot, and return the Token. Returns nil once the stream is
# exhausted. The explicit @position >= 0 guard prevents Ruby's
# negative-index wrap-around after unget_token has stepped below zero.
def get_next_token
  raw = @position >= 0 ? @tokens[@position] : nil
  return nil if raw.nil?

  wrapped = Token.new(*raw)
  @position += 1
  wrapped
end

#look_back(m) ⇒ Object



349
350
351
352
353
354
# File 'lib/tsql_shparser/tsql_tokenizer.rb', line 349

# Peek m slots behind the cursor without moving it.
# Returns a freshly wrapped Token, or nil when the offset would reach
# before the start of the token stream.
def look_back(m)
  return nil if @position < m

  raw = @tokens[@position - m]
  raw && Token.new(*raw)
end

#previous_tokenObject



357
# File 'lib/tsql_shparser/tsql_tokenizer.rb', line 357

def previous_token;   look_back(1);  end

#tokenize_file(file = nil) ⇒ Object



328
329
330
331
332
333
334
335
336
337
338
# File 'lib/tsql_shparser/tsql_tokenizer.rb', line 328

# Read and tokenize the contents of +file+ (a path given at construction
# time takes precedence). Errors while reading or tokenizing are reported
# on stdout instead of being raised to the caller.
#
# Fix: the original placed `return @tokens.length` inside an `ensure`
# block, which silently discards ANY in-flight exception — including
# SystemExit, Interrupt and anything the bare rescue does not cover
# (RuboCop Lint/EnsureReturn). The count is now returned normally from
# both the success and the rescued path, so only StandardError is
# swallowed (and still reported), as the rescue clause intends.
#
# @param file [String, nil] path to the input file
# @return [Integer] number of tokens collected so far
def tokenize_file(file=nil)
  @input_file ||= file
  if @input_file
    arr = IO.readlines(@input_file)
    tokenize_string(arr.join)
  end
  @tokens.length
rescue
  puts $!.to_s
  @tokens.length
end

#tokenize_string(str) ⇒ Object

Split the string into its sub-strings and return an array of triplets. Each triplet contains the ending line number, ending column number and the sub-string (token string). A token string may spill over multiple lines.



184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
# File 'lib/tsql_shparser/tsql_tokenizer.rb', line 184

# Split +str+ into sub-strings and store an array of triplets in @tokens.
# Each triplet holds the ending line number, ending column number and the
# token string (a token may spill over multiple lines).
#
# Implementation: a single-pass, character-at-a-time state machine with a
# one-character look-behind (prev_c). The five region states below are
# mutually exclusive; each holds the stream position where the region was
# opened (truthy) or nil when inactive.
def tokenize_string(str)
  #puts str; puts
  stream = str.split('')

  slc = nil      # Single Line Comment indicator
  mlc = nil      # Multi  Line Comment indicator
  sq  = nil      # Single Quote indicator
  dq  = nil      # Double Quote indicator
  bkt = nil      # Bracket indicator
  
  qtok  = ""     # accumulator for quoted-string token 
  atok  = ""     # accumulator for all types of tokens except quoted-string
  qstr  = ""     # Final quoted string 
  tok_arr  = []  # token array
  
  col    = 0     # Column number of the token
  line   = 1     # Line number of the token
  i      = 1     # Current Position in the stream
  prev_c = ''    # Single character look behind
  
  while stream != []
      c = stream.shift
      #puts c
      case 
      # Whitespace ends the current plain token (outside any region).
      when c =~ /[ \t]/
        unless (slc or mlc or sq or dq or bkt)
          tok_arr += tok_split(line,col,atok)
          atok = ""
        end
      # '--' opens a single-line comment; the first '-' was already
      # appended to atok on the previous pass, so chop it back off.
      when ((prev_c == '-') and (c == '-'))
        unless (slc or mlc or sq or dq or bkt)
          slc = i
          atok.chop!
          #puts "starting a single-line comment @ #{i}"
        end
      # '/*' opens a multi-line comment; same chop! rationale as above.
      when ((prev_c == '/') and (c == '*'))
        unless (slc or mlc or sq or dq or bkt)
            mlc = i
            atok.chop!
            #puts "starting a  multi-line comment @ #{i}"
        end
      # '*/' closes a multi-line comment. The mlc < (i-1) guard keeps the
      # '*' of an opening "/*" from also serving as the closer (i.e. "/*/"
      # does not terminate the comment it just opened).
      when ((prev_c == '*') and (c == '/'))
        if (mlc and (mlc < (i-1)))
          mlc = nil
          c = ''
          #puts "  ending a  multi-line comment @ #{i}"
        end
      # Line breaks end plain tokens and single-line comments; only '\n'
      # advances the line counter (so "\r\n" counts one line).
      when ((c == "\r") or (c == "\n"))       
        unless (slc or mlc or sq or dq or bkt)
          tok_arr += tok_split(line,col,atok)
          atok = ""
        end

        (col = 0; line += 1) if (c == "\n")
        if slc       
          slc = nil     
          c = ''
          #puts "  ending a single-line comment @ #{i}"     
        end
      when (c == "'")
        unless (slc or mlc or dq or bkt)
          if sq 
            ### WARNING:
            # This logic is wrong: it assumes end of the single-quote token
            # But in case of a embedded/escaped single-quote the token has
            # not yet ended. Needs to be fixed in a later version. 
            ###
            sq = nil
            qtok += c  
            c = ''
            qstr = qtok              
            #puts "   ending single-quote @ #{i}"
          else
            sq = i
            
            # A preceding 'N' marks a T-SQL national (Unicode) string
            # literal: fold the N into the quoted token and back the
            # token's column up by one to account for it.
            if prev_c == 'N'
              qtok = 'N'
              atok.chop!
              temp_pos = col-1
            else
              qtok = ""
              temp_pos = col
            end
            tok_arr += tok_split(line,temp_pos,atok)
            atok = ""              
            #puts " starting single-quote @ #{i}"
          end
        end
      when (c == '"')
        unless (slc or mlc or sq or bkt)
          if dq 
            dq = nil
            qtok += c 
            c = ''
            qstr = qtok
            #puts "   ending double-quote @ #{i}"
          else
            dq = i
            qtok = ""
            tok_arr += tok_split(line,col,atok)
            atok = ""              
            #puts " starting double-quote @ #{i}"
          end
        end
      when (c == '[')
        unless (slc or mlc or sq or dq or bkt)
          bkt = i
          qtok = ""
          tok_arr += tok_split(line,col,atok)
          atok = ""            
          #puts " starting square-bracket @ #{i}"
        end
      when (c == ']')
        if bkt
          bkt = nil  
          qtok += c 
          c = ''
          qstr = qtok
          #puts "   ending square-bracket @ #{i}"
        end
      end

      # Route the (possibly blanked) character into the right accumulator.
      # Region closers set c = '' above so the delimiter itself is not
      # double-appended here.
      qtok += c if (sq or dq or bkt)
      atok += c unless (slc or mlc or sq or dq or bkt)
      
      
      prev_c = c
      col += 1 
      i += 1
      
      # A region closer above latched the finished quoted string into
      # qstr; flush it as a single token (the `true` flag marks it quoted).
      (tok_arr += tok_split(line,col,qstr,true); qstr = "";) if qstr.size > 0 

  end

  # Flush whatever plain token was still being accumulated at end of input.
  tok_arr += tok_split(line, col, atok)
  
  raise "#{@input_file} Umatched quoted string at (#{line},#{col})" if (sq or dq or bkt)
  raise "#{@input_file} Incomplete Comment at (#{line},#{col})"     if mlc
  
  @tokens = tok_arr
  
end

#unget_tokenObject



359
360
361
362
# File 'lib/tsql_shparser/tsql_tokenizer.rb', line 359

# Step the cursor back one slot so the next get_next_token call re-reads
# the token that was just consumed. The cursor never drops below -1.
# Returns the (possibly unchanged) cursor position.
def unget_token
  @position -= 1 if @position > -1
  @position
end