Class: RKelly::Tokenizer

Inherits:
Object
Defined in:
lib/rkelly/tokenizer.rb

Constant Summary

KEYWORDS =
%w{
  break case catch continue default delete do else finally for function
  if in instanceof new return switch this throw try typeof var void while
  with

  const true false null debugger
}
RESERVED =
%w{
  abstract boolean byte char class double enum export extends
  final float goto implements import int interface long native package
  private protected public short static super synchronized throws
  transient volatile
}
LITERALS =
{
  # Punctuators
  '=='  => :EQEQ,
  '!='  => :NE,
  '===' => :STREQ,
  '!==' => :STRNEQ,
  '<='  => :LE,
  '>='  => :GE,
  '||'  => :OR,
  '&&'  => :AND,
  '++'  => :PLUSPLUS,
  '--'  => :MINUSMINUS,
  '<<'  => :LSHIFT,
  '<<=' => :LSHIFTEQUAL,
  '>>'  => :RSHIFT,
  '>>=' => :RSHIFTEQUAL,
  '>>>' => :URSHIFT,
  '>>>=' => :URSHIFTEQUAL,
  '&='  => :ANDEQUAL,
  '%='  => :MODEQUAL,
  '^='  => :XOREQUAL,
  '|='  => :OREQUAL,
  '+='  => :PLUSEQUAL,
  '-='  => :MINUSEQUAL,
  '*='  => :MULTEQUAL,
  '/='  => :DIVEQUAL,
}
TOKENS_THAT_IMPLY_DIVISION =
[:IDENT, :NUMBER, ')', ']', '}']
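
This last constant resolves JavaScript's lexical ambiguity around '/': immediately after an identifier, a number, or a closing bracket, a '/' must mean division, while elsewhere it may begin a regular expression literal. A quick illustration (the token output shape is assumed from Lexeme#to_racc_token):

tokenizer = RKelly::Tokenizer.new
tokenizer.tokenize('a/b')    # '/' follows an :IDENT, so it scans as division
tokenizer.tokenize('/ab/g')  # nothing precedes '/', so it scans as :REGEXP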

Instance Method Summary

#raw_tokens(string) ⇒ Object
#tokenize(string) ⇒ Object

Constructor Details

#initialize(&block) ⇒ Tokenizer

Returns a new instance of Tokenizer.



# File 'lib/rkelly/tokenizer.rb', line 50

def initialize(&block)
  @lexemes = []

  # Multi-line (/* ... */) and single-line (// ...) comments.
  token(:COMMENT, /\A\/(?:\*(?:.)*?\*\/|\/[^\n]*)/m)
  # Single- or double-quoted string literals with backslash escapes.
  token(:STRING, /\A"(?:[^"\\]*(?:\\.[^"\\]*)*)"|\A'(?:[^'\\]*(?:\\.[^'\\]*)*)'/m)

  # A regexp to match floating point literals (but not integer literals).
  token(:NUMBER, /\A\d+\.\d*(?:[eE][-+]?\d+)?|\A\d+(?:\.\d*)?[eE][-+]?\d+|\A\.\d+(?:[eE][-+]?\d+)?/m) do |type, value|
    # Normalize forms that Ruby's eval would reject: '1.e2' -> '1.0e2',
    # '1.' -> '1.0', and '.5' -> '0.5'.
    value.gsub!(/\.(\D)/, '.0\1') if value =~ /\.\w/
    value.gsub!(/\.$/, '.0') if value =~ /\.$/
    value.gsub!(/^\./, '0.') if value =~ /^\./
    [type, eval(value)]
  end
  # Hexadecimal, octal, and decimal integer literals.
  token(:NUMBER, /\A0[xX][\da-fA-F]+|\A0[0-7]*|\A\d+/) do |type, value|
    [type, eval(value)]
  end

  # Multi-character punctuators, longest first so that '>>>=' is tried
  # before '>>=' and '>>'.
  token(:LITERALS,
    Regexp.new(LITERALS.keys.sort_by { |x|
      x.length
    }.reverse.map { |x| "\\A#{x.gsub(/([|+*^])/, '\\\\\1')}" }.join('|')
  )) do |type, value|
    [LITERALS[value], value]
  end

  # Identifiers; keywords and reserved words are reclassified here.
  token(:IDENT, /\A([_\$A-Za-z][_\$0-9A-Za-z]*)/) do |type, value|
    if KEYWORDS.include?(value)
      [value.upcase.to_sym, value]
    elsif RESERVED.include?(value)
      [:RESERVED, value]
    else
      [type, value]
    end
  end

  # Regular expression literals with optional g/i flags.
  token(:REGEXP, /\A\/(?:[^\/\r\n\\]*(?:\\[^\r\n][^\/\r\n\\]*)*)\/[gi]*/)
  # Runs of whitespace, including newlines.
  token(:S, /\A[\s\r\n]*/m)

  # Any other single character becomes a token named after itself.
  token(:SINGLE_CHAR, /\A./) do |type, value|
    [value, value]
  end
end
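
The token helper called throughout the constructor is private and does not appear on this page. A minimal sketch consistent with its use here, assuming a Lexeme class that pairs a name and pattern with an optional conversion block:

# Assumed sketch, not the library's verbatim source: record each lexeme
# in registration order; raw_tokens scans them in this same order.
def token(name, pattern = nil, &block)
  @lexemes << Lexeme.new(name, pattern, &block)
end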

Instance Method Details

#raw_tokens(string) ⇒ Object



# File 'lib/rkelly/tokenizer.rb', line 97

def raw_tokens(string)
  tokens = []
  line_number = 1
  accepting_regexp = true
  while string.length > 0
    longest_token = nil

    @lexemes.each { |lexeme|
      # A regexp literal may begin here only when the previous token
      # permits one (see followable_by_regex below).
      next if lexeme.name == :REGEXP && !accepting_regexp

      match = lexeme.match(string)
      next if match.nil?
      # Maximal munch: keep whichever lexeme matches the longest prefix.
      longest_token = match if longest_token.nil?
      next if longest_token.value.length >= match.value.length
      longest_token = match
    }

    accepting_regexp = followable_by_regex(longest_token)

    longest_token.line = line_number
    line_number += longest_token.value.scan(/\n/).length
    string = string.slice(Range.new(longest_token.value.length, -1))
    tokens << longest_token
  end
  tokens
end
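
The private followable_by_regex helper is not shown on this page; presumably it consults TOKENS_THAT_IMPLY_DIVISION. A sketch under that assumption:

# Assumed sketch: after a token that implies division (an identifier,
# number, or closing bracket), the next '/' cannot begin a regexp.
def followable_by_regex(current_token)
  name = current_token.name
  name = current_token.value if name == :SINGLE_CHAR
  !TOKENS_THAT_IMPLY_DIVISION.include?(name)
end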

#tokenize(string) ⇒ Object



# File 'lib/rkelly/tokenizer.rb', line 93

def tokenize(string)
  raw_tokens(string).map { |x| x.to_racc_token }
end
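
For example, assuming Lexeme#to_racc_token returns [name, value] pairs:

tokenizer = RKelly::Tokenizer.new
tokenizer.tokenize('var x = 10;')
# => [[:VAR, 'var'], [:S, ' '], [:IDENT, 'x'], [:S, ' '],
#     ['=', '='], [:S, ' '], [:NUMBER, 10], [';', ';']]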