Class: SDL4R::Tokenizer

Inherits:
Object
  • Object
show all
Defined in:
lib/sdl4r/tokenizer.rb

Overview

Tokenizer for SDL.

Because Ruby’s standard IO libraries are not very low-level, this class works line by line. As a result, some token types reflect this line-oriented tokenizing.

The other solution would be to implement a proper tokenizer natively, which I don’t feel like doing right now.

FIXME: implement a way of accumulating errors instead of raising immediately on the first one.

Defined Under Namespace

Classes: Matcher

Constant Summary collapse

@@EOL_STRING =

A string used at the end of each line in order to trigger the EOL token.

"\n"
# Matchers of each tokenizer mode (see #set_mode), in order of precedence: #read tries them in
# sequence at the current scanner position and uses the first one whose regex matches.
#
# Matcher options, as used by this class:
# - :push_back_eol => if the scanner is at the end of the line after the match, #read steps back
#   by the length of @@EOL_STRING
# - :next_mode => mode associated with the token (#unread passes it to #set_mode)
# - :error => message of the parse error raised by #read when the matcher matches
#
# The process_token methods defined in the blocks remove delimiters (quotes, comment markers,
# brackets, whitespace in binaries) from the matched text.
# NOTE(review): presumably called by Matcher/set_matcher_token, which are not visible here.
@@matcher_sets =
{
  :top => [
    Matcher.new(:EOL, /\A\n/),
    Matcher.new(:WHITESPACE, /\A\s+/, :push_back_eol => true),
    Matcher.new(:SEMICOLON, /\A;/),
    Matcher.new(:COLON, /\A:/),
    Matcher.new(:EQUAL, /\A=/),
    Matcher.new(:BLOCK_START, /\A\{/),
    Matcher.new(:BLOCK_END, /\A\}/),
    # the alternatives are grouped so that \A anchors all of them, not only "true"
    Matcher.new(:BOOLEAN, /\A(?:true|false|on|off)/),
    Matcher.new(:NULL, /\Anull/),
    Matcher.new(:ONE_LINE_COMMENT, /\A(?:#|--|\/\/).*\Z/, :push_back_eol => true) do
      def process_token(token)
        token.gsub!(/\A(?:#|--|\/\/)/, "")
      end
    end,
    Matcher.new(:INLINE_COMMENT, /\A\/\*[\s\S]*?\*\//) do
      def process_token(token)
        token.gsub!(/\A\/\*|\*\/\Z/, "")
      end
    end,
    Matcher.new(
      :MULTILINE_COMMENT_START,
      /\A\/\*.*\Z/,
      :next_mode => :multiline_comment,
      :push_back_eol => true) do
      def process_token(token)
        token.gsub!(/\A\/\*/, "")
      end
    end,
    Matcher.new(:CHARACTER, /\A'(?:[^\\']|\\.)'/) do
      def process_token(token)
        token.gsub!(/\A'|'\Z/, "")
      end
    end,
    Matcher.new(:INLINE_BACKQUOTE_STRING, /\A`[^`]*`/, :is_node => true) do
      def process_token(token)
        token.gsub!(/\A`|`\Z/, "")
      end
    end,
    Matcher.new(:INLINE_DOUBLE_QUOTE_STRING, /\A"(?:[^\\"]|\\.)*"/) do
      def process_token(token)
        token.gsub!(/\A"|"\Z/, "")
      end
    end,
    Matcher.new(
      :MULTILINE_BACKQUOTE_STRING_START,
      /\A`[^`]*\Z/,
      :next_mode => :multiline_backquote_string,
      :is_node => true) do
        def process_token(token)
          token.gsub!(/\A`/, "")
        end
      end,
    Matcher.new(
      :MULTILINE_DOUBLE_QUOTE_STRING_START,
      /\A"(?:[^\\"]|\\\S)*\\\s*\Z/,
      :next_mode => :multiline_double_quote_string,
      :push_back_eol => true) do
        def process_token(token)
          token.gsub!(/\A"|\\\s*\Z/, "")
        end
      end,
    Matcher.new(:INLINE_BINARY, /\A\[[\sA-Za-z0-9\/=\+]*\]/) do
        def process_token(token)
          token.gsub!(/\A\[|\s+|\]\Z/, "")
        end
      end,
    Matcher.new(
      :MULTILINE_BINARY_START, /\A\[[\sA-Za-z0-9\/=\+]*\Z/,
      :next_mode => :multiline_binary,
      :push_back_eol => true) do
        def process_token(token)
          token.gsub!(/\A\[|\s+/, "")
        end
      end,
    Matcher.new(
      :IDENTIFIER, /\A#{SDL4R::IDENTIFIER_START_CLASS}#{SDL4R::IDENTIFIER_PART_CLASS}*/),
    Matcher.new(:DATE, /\A-?\d+\/\d+\/\d+/, :is_node => true),
    Matcher.new(
      :TIME_OR_TIMESPAN,
      /\A(?:-?\d+d:)?-?\d+:\d+(?::\d+(?:\.\d+)?)?
        (?:-[a-zA-Z\/]+(?:[+-]\d+(?::\d+)?)?)?/ix),
    Matcher.new(:INTEGER, /\A[\+\-]?\d+L/i), # takes precedence on floats
    # the float regex is meant to also catch bad syntaxed floats like "1.2.2" (otherwise, we
    # would not detect this kind of errors easily).
    Matcher.new(
      :FLOAT, /\A[\+\-]?(?:\d+(?:F|D|BD)|\d*\.[\d\.]+(?:F|D|BD)?)/i),
    Matcher.new(:INTEGER, /\A[\+\-]?\d+L?/i),
    Matcher.new(:LINE_CONTINUATION, /\A\\\s*\Z/), # outside of comments, strings, etc
    # last resort: a double quote that none of the string matchers above could close
    Matcher.new(
      :UNCLOSED_DOUBLE_QUOTE_STRING,
      /\A"(?:[^\\"]|\\\S)*/,
      :error => "unclosed string"),
  ],
   :multiline_comment => [
    Matcher.new(:EOL, /\A\n/),
    Matcher.new(:MULTILINE_COMMENT_END, /\A[\s\S]*?\*\//, :next_mode => :top) do
      def process_token(token)
        token.gsub!(/\*\/\Z/, "")
      end
    end,
    Matcher.new(:MULTILINE_COMMENT_PART, /\A.+\Z/, :push_back_eol => true)
  ],
   :multiline_backquote_string => [
    Matcher.new(:EOL, /\A\n/),
    Matcher.new(:MULTILINE_BACKQUOTE_STRING_END, /\A[^`]*`/, :next_mode => :top) do
      def process_token(token)
        token.gsub!(/`\Z/, "")
      end
    end,
    Matcher.new(:MULTILINE_BACKQUOTE_STRING_PART, /\A[^`]*\Z/)
  ],
   :multiline_double_quote_string => [
    Matcher.new(:EOL, /\A\n/),
    Matcher.new(
      :MULTILINE_DOUBLE_QUOTE_STRING_END, /\A(?:[^\\"]|\\\S)*"/, :next_mode => :top) do
        def process_token(token)
          token.gsub!(/\A\s+|"\Z/, "")
        end
      end,
    Matcher.new(
      :MULTILINE_DOUBLE_QUOTE_STRING_PART,
      /\A(?:[^\\"]|\\\S)*\\\s*\Z/,
      :push_back_eol => true) do
        def process_token(token)
          token.gsub!(/\A\s+|\\\s*\Z/, "")
        end
      end,
    Matcher.new(
      :UNCLOSED_DOUBLE_QUOTE_STRING,
      /\A(?:[^\\"]|\\\S)*\Z/,
      :error => "unclosed multiline string")
  ],
   :multiline_binary => [
    Matcher.new(:EOL, /\A\n/),
    Matcher.new(:MULTILINE_BINARY_END, /\A[\sA-Za-z0-9\/=\+]*\]/, :next_mode => :top) do
      def process_token(token)
        token.gsub!(/\s+|\]\Z/, "")
      end
    end,
    Matcher.new(:MULTILINE_BINARY_PART, /\A[\sA-Za-z0-9\/=\+]*\Z/, :push_back_eol => true) do
      def process_token(token)
        token.gsub!(/\s+/, "")
      end
    end
  ]
}

Instance Method Summary collapse

Constructor Details

#initialize(io) ⇒ Tokenizer

Returns a new instance of Tokenizer.

Parameters:

  • io (IO)

    the IO to read from

Raises:

  • (ArgumentError)

    if io is nil.



238
239
240
241
242
243
244
245
246
247
248
249
250
# File 'lib/sdl4r/tokenizer.rb', line 238

# Creates a tokenizer that reads its input from +io+.
#
# @param io [IO] the IO to read from
# @raise [ArgumentError] if io is nil
def initialize(io)
  raise(ArgumentError, 'io') unless io

  # Input state: no line has been fetched yet, hence no scanner and a negative line number
  # (#read fetches a line when the line number is negative).
  @io = io
  @line_no = -1
  @scanner = nil
  set_mode(:top)

  # Token state: previous, pushed-back (see #unread) and current tokens.
  @previous_token = nil
  @pushed_back_token = nil
  @token = nil

  @token_pool = [] # a pool of reusable Tokens
end

Instance Method Details

#previous_token_type ⇒ Symbol

Returns the type of the previous Token.

Returns:

  • (Symbol)

    the type of the previous Token.



370
371
372
# File 'lib/sdl4r/tokenizer.rb', line 370

# @return [Symbol, nil] the type of the previous Token, or nil when no previous token is
#   recorded.
def previous_token_type
  return nil unless @previous_token

  @previous_token.type
end

#raise_parse_error(msg = "parse error", line_no = @line_no, pos = @scanner.pos) ⇒ Object

Raises:

  • (SdlParseError)

    always; the line number and position passed to the error are incremented by one.



400
401
402
403
# File 'lib/sdl4r/tokenizer.rb', line 400

# Raises an SdlParseError for the given location.
#
# Works even when no line has been read yet (no scanner): the position then defaults to 0 and
# no line text is attached to the error.
#
# @param msg [String] error message
# @param line_no [Integer] line number, defaults to the current line; incremented by one before
#   being passed to SdlParseError
# @param pos [Integer] position in the line, defaults to the scanner position (0 without
#   scanner); incremented by one before being passed to SdlParseError
# @raise [SdlParseError] always
def raise_parse_error(msg = "parse error", line_no = @line_no, pos = (@scanner ? @scanner.pos : 0))
  # The text of the line is only available for the line currently held by the scanner.
  line = (@scanner && line_no == @line_no) ? @scanner.string : nil
  raise SdlParseError.new(msg, line_no + 1, pos + 1, line)
end

#raise_unexpected_char(msg = "unexpected char") ⇒ Object

Raises a standard “unexpected character” error.



396
397
398
# File 'lib/sdl4r/tokenizer.rb', line 396

# Raises a standard "unexpected character" parse error (through #raise_parse_error) that shows
# the next character of the scanner between angle brackets.
#
# @param msg [String] prefix of the error message
def raise_unexpected_char(msg = "unexpected char")
  next_char = @scanner.peek(1)
  raise_parse_error("#{msg}: <#{next_char}>")
end

#read ⇒ Symbol

Goes to the next token.

Returns:

  • (Symbol)

    the current token type. :EOF is returned once when the end of the input is reached; nil is returned by the calls that follow.



312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
# File 'lib/sdl4r/tokenizer.rb', line 312

# Goes to the next token.
#
# A pushed-back token (see #unread) is served first. Otherwise, a new line is fetched if needed
# and the matchers of the current mode are tried in order at the scanner position: the first
# one whose regex matches defines the new current token.
#
# @return [Symbol, nil] the type of the new current token. :EOF is returned once when no more
#   line can be read, nil on the calls that follow.
# @raise a parse error (through #raise_parse_error) if the matching matcher carries an error
#   message or if no matcher matches
def read
  if @pushed_back_token
    read_pushed_back
    return @token.type
  end

  record_previous_token
  @token = nil

  # fetch a line if beginning or at end of line
  need_line = @line_no < 0 || @scanner.eos?
  if need_line && !read_line
    return nil if previous_token_type == :EOF

    @token = Token.new(nil, :EOF, nil, @line_no, @scanner ? @scanner.pos : 0)
    return @token.type
  end

  start_pos = @scanner.pos
  @matcher_set.each do |matcher|
    text = @scanner.scan(matcher.regex)
    next unless text

    failure = matcher.error
    if failure
      raise_parse_error(failure)
    else
      set_matcher_token(matcher, text, start_pos)
      # Step back by the EOL length when the whole line was consumed
      # (presumably so that the EOL gets tokenized by the next call).
      @scanner.pos -= @@EOL_STRING.size if matcher.push_back_eol && @scanner.eos?
    end
    break
  end

  raise_unexpected_char unless @token

  @token.type
end

#set_mode(mode) ⇒ self

Sets the current working mode of this Tokenizer.

Parameters:

  • mode (Symbol)

    the new mode

    • :top (normal default mode)

    • :multiline_comment

    • :multiline_backquote_string

    • :multiline_double_quote_string

    • :multiline_binary

Returns:

  • (self)

Raises:

  • (ArgumentError)

    if the given mode is unknown.



286
287
288
289
290
291
# File 'lib/sdl4r/tokenizer.rb', line 286

# Sets the current working mode of this Tokenizer, i.e. selects the set of matchers registered
# under +mode+ in @@matcher_sets.
#
# @param mode [Symbol] the new mode
# @return [self]
# @raise [ArgumentError] if the given mode is unknown
def set_mode(mode)
  selected = @@matcher_sets[mode]
  unless selected
    raise ArgumentError, "unknown tokenizer mode #{mode.to_s}"
  end

  @matcher_set = selected
  self
end

#token ⇒ String

Returns text of the current token.

Returns:

  • (String)

    text of the current token.



253
254
255
# File 'lib/sdl4r/tokenizer.rb', line 253

# @return [String] the text of the current token
def token
  current = @token
  current.text
end

#token_line_no ⇒ Integer

Returns line number of the current token (only meant for error tracking for the time being).

Returns:

  • (Integer)

    line number of the current token (only meant for error tracking for the time being)



264
265
266
# File 'lib/sdl4r/tokenizer.rb', line 264

# @return [Integer] the line number of the current token (only meant for error tracking for
#   the time being)
def token_line_no
  current = @token
  current.line_no
end

#token_pos ⇒ Integer

Returns position of the current token (only meant for error tracking for the time being).

Returns:

  • (Integer)

    position of the current token (only meant for error tracking for the time being)



270
271
272
# File 'lib/sdl4r/tokenizer.rb', line 270

# @return [Integer] the position of the current token (only meant for error tracking for the
#   time being)
def token_pos
  current = @token
  current.pos
end

#token_type ⇒ Symbol

Returns type of the current token (e.g. :WHITESPACE).

Returns:

  • (Symbol)

    type of the current token (e.g. :WHITESPACE)



258
259
260
# File 'lib/sdl4r/tokenizer.rb', line 258

# @return [Symbol] the type of the current token (e.g. :WHITESPACE)
def token_type
  current = @token
  current.type
end

#unread ⇒ Object

Unreads the current token. The previous token becomes the current one.

Raises:

  • (RuntimeError) if #unread has been called twice in a row (no call to #read in between)



378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
# File 'lib/sdl4r/tokenizer.rb', line 378

# Unreads the current token: it is kept aside as the pushed-back token (which #read serves
# first) and the previous token becomes the current one.
#
# If the token that becomes current has a matcher with a next mode, that mode is set again.
# When there is no previous token (the current token then becomes nil), the mode is left
# untouched instead of failing on nil.
#
# @return [self, nil] the result of #set_mode if the mode was set, nil otherwise
# @raise [RuntimeError] if #unread has been called twice in a row (no call to #read)
def unread
  raise "only one token can be pushed back" if @pushed_back_token

  @pushed_back_token = @token
  @token = @previous_token

  # We have no memory of what happened before
  @previous_token = nil

  # @token is nil when no previous token was recorded, and a token may have no matcher.
  matcher = @token && @token.matcher
  next_mode = matcher && matcher.next_mode
  set_mode(next_mode) if next_mode
end