5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
|
# File 'lib/turmali/lexer.rb', line 5
# Converts source text into a flat array of [type, value] token pairs.
#
# Token shapes produced:
# * keywords (members of KEYWORDS) -> [:<UPCASED_NAME>, text]
# * other lowercase-initial names  -> [:IDENTIFIER, text]
# * uppercase-initial names        -> [:CONSTANT, text]
# * integer/decimal literals       -> [:NUMBER, Float]
# * double-quoted strings          -> [:STRING, contents] (no escape handling)
# * ":" + newline + deeper indent  -> [:INDENT, width]
# * newline at the same indent     -> [:NEWLINE, "\n"]
# * newline at a shallower indent  -> one or more [:DEDENT, width], then [:NEWLINE, "\n"]
# * two-character operators        -> [op, op]
# * any other single character     -> [char, char]
#
# Single spaces between tokens are skipped. Blocks still open at the end of
# the input are closed with trailing :DEDENT tokens.
#
# @param code [String] source text; it is left unmodified, so frozen strings
#   are accepted
# @return [Array<Array>] the token list, in source order
# @raise [RuntimeError] when a block's indent is not deeper than the enclosing
#   one, or when the indent grows without a preceding ':'
def tokenize(code)
  # Work on a chomped copy: `chomp!` would mutate the caller's string and
  # raise FrozenError on frozen input.
  code = code.chomp
  tokens = []
  current_indent = 0
  indent_stack = []
  i = 0

  while i < code.size
    chunk = code[i..-1]

    if (identifier = chunk[/\A([a-z]\w*)/, 1])
      if KEYWORDS.include?(identifier)
        tokens << [identifier.upcase.to_sym, identifier]
      else
        tokens << [:IDENTIFIER, identifier]
      end
      i += identifier.size
    elsif (constant = chunk[/\A([A-Z]\w*)/, 1])
      tokens << [:CONSTANT, constant]
      i += constant.size
    elsif (number = chunk[/\A(\d+(\.\d+)?)/, 1])
      tokens << [:NUMBER, number.to_f]
      i += number.size
    elsif (string = chunk[/\A"([^"]*)"/, 1])
      tokens << [:STRING, string]
      # +2 accounts for the surrounding quotes, which are not captured.
      i += string.size + 2
    elsif (indent = chunk[/\A\:\n( +)/m, 1])
      # ':' followed by a newline opens a block; it must be indented deeper.
      if indent.size <= current_indent
        raise "Bad indent level, got #{indent.size} indents, expected > #{current_indent}"
      end

      current_indent = indent.size
      indent_stack.push(current_indent)
      tokens << [:INDENT, indent.size]
      # +2 accounts for the ':' and the newline.
      i += indent.size + 2
    elsif (indent = chunk[/\A\n( *)/m, 1])
      if indent.size == current_indent
        tokens << [:NEWLINE, "\n"]
      elsif indent.size < current_indent
        # Close every block that is deeper than the new indent level.
        while indent.size < current_indent
          indent_stack.pop
          current_indent = indent_stack.last || 0
          tokens << [:DEDENT, indent.size]
        end
        tokens << [:NEWLINE, "\n"]
      else
        # The indent grew without a ':' introducing the block.
        raise "Missing ':'"
      end
      # +1 accounts for the newline.
      i += indent.size + 1
    elsif (operator = chunk[/\A(\|\||&&|==|!=|<=|>=)/, 1])
      tokens << [operator, operator]
      i += operator.size
    elsif chunk.start_with?(" ")
      i += 1
    else
      # Anything else is emitted as a single-character token.
      value = chunk[0, 1]
      tokens << [value, value]
      i += 1
    end
  end

  # Close all blocks that are still open at the end of the input.
  # NOTE(review): the DEDENT value here is the outermost remaining level
  # (`first`), not the level being returned to (`last`) -- confirm that the
  # parser ignores this value before changing it.
  while indent_stack.pop
    tokens << [:DEDENT, indent_stack.first || 0]
  end

  tokens
end
|