Class: NScript::Lexer

Inherits:
Object
  • Object
show all
Defined in:
lib/nscript/lexer/lexer.rb

Constant Summary collapse

# Reserved words of the language. identifier_token tags an identifier found
# in this list with the upcased word as a symbol (e.g. "if" becomes :IF)
# instead of :IDENTIFIER.
# Frozen so the shared list cannot be mutated at runtime.
KEYWORDS =
["if", "else", "then", "unless",
"true", "false", "yes", "no", "on", "off",
"and", "or", "is", "isnt", "not",
"new", "return",
"try", "catch", "finally", "throw",
"break", "continue",
"for", "in", "of", "by", "where", "while",
"delete", "instanceof", "typeof",
"switch", "when",
"super", "extends"].freeze
# A letter, "$" or "_" followed by any number of word characters or "$",
# anchored at the start of the chunk.
IDENTIFIER =
/\A([a-zA-Z$_](\w|\$)*)/
# A hexadecimal literal (0x...) or a decimal literal with optional fraction
# and exponent, delimited by word boundaries; case-insensitive.
NUMBER =
/\A(\b((0(x|X)[0-9a-fA-F]+)|([0-9]+(\.[0-9]+)?(e[+\-]?[0-9]+)?)))\b/i
# An empty or non-empty single- or double-quoted string. The closing quote
# must be preceded by a non-backslash character or by an escaped backslash.
# The /m flag lets "." span newlines.
STRING =
/\A(""|''|"(.*?)([^\\]|\\\\)"|'(.*?)([^\\]|\\\\)')/m
# A triple-quoted block (""" or '''), or the empty six-quote form. The body
# is captured in group 2 (double quotes) or group 4 (single quotes); neither
# group is set for the empty six-quote form.
HEREDOC =
/\A("{6}|'{6}|"{3}\n?(.*?)\n?([ \t]*)"{3}|'{3}\n?(.*?)\n?([ \t]*)'{3})/m
# A backtick-delimited span (possibly empty); js_token emits it as a :JS token.
JS =
/\A(``|`(.*?)([^\\]|\\\\)`)/m
# A run of one or more operator characters.
OPERATOR =
/\A([+\*&|\/\-%=<>:!?]+)/
# A run of spaces and/or tabs (no newlines).
WHITESPACE =
/\A([ \t]+)/
# One or more consecutive "#" comment lines, each optionally preceded by a
# newline and indentation.
COMMENT =
/\A(((\n?[ \t]*)?#.*$)+)/
# The "->" or "=>" arrow at the start of the string.
CODE =
/\A((-|=)>)/
# A slash-delimited regex literal followed by up to four flag characters
# drawn from i, m, g, y.
REGEX =
/\A(\/(.*?)([^\\]|\\\\)\/[imgy]{0,4})/
# One or more newlines, each followed by optional indentation (group 1),
# with an optional "." immediately after captured in group 4.
MULTI_DENT =
/\A((\n([ \t]*))+)(\.)?/
# A newline followed by indentation (captured); unanchored, so it can be
# scanned repeatedly over a MULTI_DENT match.
LAST_DENT =
/\n([ \t]*)/
# A token value that is exactly ":" or "=".
ASSIGNMENT =
/\A(:|=)\Z/
# A leading or trailing backtick, stripped from JS spans.
JS_CLEANER =
/(\A`|`\Z)/
# A single newline; used for counting and splitting lines.
MULTILINER =
/\n/
# A newline together with any spaces/tabs that follow it inside a string.
STRING_NEWLINES =
/\n[ \t]*/
# Either a line's leading indentation plus "#", or a newline followed only
# by spaces/tabs up to the end of a line.
COMMENT_CLEANER =
/(^[ \t]*#|\n[ \t]*$)/
# Whole-value match for one operator/punctuation character optionally
# followed by "<", ">", "=", "&" or "|" characters, or one of the listed
# word operators. indent_token consults it when deciding whether to
# suppress a newline.
NO_NEWLINE =
/\A([+\*&|\/\-%=<>:!.\\][<>=&|]*|and|or|is|isnt|not|delete|typeof|instanceof)\Z/
# The run of spaces/tabs at the start of a line.
HEREDOC_INDENT =
/^[ \t]+/
# Tags after which regex_token refuses to lex a regex literal: it returns
# false when the previous token's tag is in this list.
# Frozen so the shared list cannot be mutated at runtime.
NOT_REGEX =
[
  :IDENTIFIER, :NUMBER, :REGEX, :STRING,
  ')', '++', '--', ']', '}',
  :FALSE, :NULL, :TRUE
].freeze
# Tags after which literal_token retags an unspaced "(" as :CALL_START and
# an unspaced "[" as :INDEX_START.
# Frozen so the shared list cannot be mutated at runtime.
CALLABLE =
[:IDENTIFIER, :SUPER, ')', ']', '}', :STRING].freeze

Instance Method Summary collapse

Instance Method Details

#close_indentation ⇒ Object



224
225
226
# File 'lib/nscript/lexer/lexer.rb', line 224

# Emits outdents for the whole current indentation level by delegating to
# outdent_token with @indent.
#
# @return [Object] whatever outdent_token returns
def close_indentation
  remaining = @indent
  outdent_token(remaining)
end

#comment_token ⇒ Object



133
134
135
136
137
138
139
# File 'lib/nscript/lexer/lexer.rb', line 133

# Lexes a run of consecutive "#" comment lines at the start of @chunk.
#
# Emits one :COMMENT token whose value is the array of comment lines (with
# the leading indentation and "#" stripped), followed by a "\n" token.
#
# @return [false, Integer] false when @chunk does not start with a comment,
#   otherwise the advanced position @i
def comment_token
  comment = @chunk[COMMENT, 1]
  return false unless comment

  # Advance the line counter first so both tokens carry the last comment line.
  @line += comment.scan(MULTILINER).size
  lines = comment.gsub(COMMENT_CLEANER, '').split(MULTILINER)
  token(:COMMENT, lines)
  token("\n", "\n")
  @i += comment.length
end

#extract_next_token ⇒ Object



62
63
64
65
66
67
68
69
70
71
72
73
# File 'lib/nscript/lexer/lexer.rb', line 62

# Tries each specialised scanner in priority order and stops at the first
# one that reports a match (returns a truthy value). When none of them
# matches, falls back to literal_token.
#
# @return [nil, Object] nil when a specialised scanner matched, otherwise
#   the result of literal_token
def extract_next_token
  # Order matters: e.g. heredocs must be tried before plain strings.
  scanners = [
    :identifier_token, :number_token, :heredoc_token, :string_token,
    :js_token, :regex_token, :indent_token, :comment_token,
    :whitespace_token
  ]
  return if scanners.any? { |scanner| send(scanner) }

  literal_token
end

#heredoc_token ⇒ Object



108
109
110
111
112
113
114
115
116
117
118
# File 'lib/nscript/lexer/lexer.rb', line 108

# Lexes a triple-quoted heredoc at the start of @chunk and emits it as a
# double-quoted :STRING token.
#
# The smallest leading-indentation string found in the body (as ordered by
# String comparison) is removed from the start of every line, newlines
# become the two characters "\n", and double quotes are backslash-escaped.
#
# An empty heredoc (six consecutive quotes) matches HEREDOC without setting
# either body capture group; it is lexed as the empty string instead of
# raising NoMethodError on nil.
#
# @return [false, Integer] false when @chunk does not start with a heredoc,
#   otherwise the advanced position @i
def heredoc_token
  match = @chunk.match(HEREDOC)
  return false unless match

  # Group 2 holds a """ body, group 4 a ''' body; both are nil when empty.
  doc = match[2] || match[4] || ''
  indent = doc.scan(HEREDOC_INDENT).min
  doc = doc.gsub(/^#{indent}/, "").gsub("\n", "\\n").gsub('"', '\\"')
  # The token is emitted before @line advances, so it records the line on
  # which the heredoc starts.
  token(:STRING, "\"#{doc}\"")
  @line += match[1].count("\n")
  @i += match[1].length
end

#identifier_token ⇒ Object



75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# File 'lib/nscript/lexer/lexer.rb', line 75

# Lexes an identifier or keyword at the start of @chunk.
#
# Keywords are tagged with their own upcased name ("if" becomes
# [:IF, "if"]); a "when" that directly follows an :OUTDENT, :INDENT or "\n"
# token is tagged :LEADING_WHEN. For plain identifiers the preceding
# accessor token is retagged in place:
#   * "::"  becomes :PROTOTYPE_ACCESS
#   * "."   becomes :PROPERTY_ACCESS, unless the token before it is also "."
#   * "?" followed by "." collapses into a single :SOAK_ACCESS token
#
# The token before the "." is looked up with a nil guard, so input that
# begins with ".name" no longer raises NoMethodError.
#
# @return [false, Integer] false when @chunk does not start with an
#   identifier, otherwise the advanced position @i
def identifier_token
  identifier = @chunk[IDENTIFIER, 1]
  return false unless identifier

  tag = KEYWORDS.include?(identifier) ? identifier.upcase.to_sym : :IDENTIFIER
  tag = :LEADING_WHEN if tag == :WHEN && [:OUTDENT, :INDENT, "\n"].include?(last_tag)

  if tag == :IDENTIFIER
    @tokens[-1][0] = :PROTOTYPE_ACCESS if last_value == '::'

    before_dot = @tokens[-2] # nil when the "." is the very first token
    if last_value == '.' && !(before_dot && before_dot[1] == '.')
      if before_dot && before_dot[0] == "?"
        @tokens[-1][0] = :SOAK_ACCESS
        @tokens.delete_at(-2)
      else
        @tokens[-1][0] = :PROPERTY_ACCESS
      end
    end
  end

  token(tag, identifier)
  @i += identifier.length
end

#indent_token ⇒ Object



141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# File 'lib/nscript/lexer/lexer.rb', line 141

# Lexes a run of newlines plus the indentation of the following line.
#
# The newline is suppressed (via suppress_newlines) when the next
# character is a "." or when the previous token's value matches NO_NEWLINE,
# is not preceded by a "." token and is not a "->"/"=>" arrow. Otherwise
# the new indentation width is compared with @indent and an :INDENT token,
# outdent tokens or a plain newline token is produced.
#
# The token before the previous one is looked up with a nil guard, so a
# newline right after a lone leading operator token no longer raises
# NoMethodError; a missing token is treated as "not a dot".
#
# @return [false, Object] false when @chunk does not start with a newline;
#   otherwise the result of suppress_newlines / newline_token, or the new
#   indentation size
def indent_token
  indent = @chunk[MULTI_DENT, 1]
  return false unless indent

  @line += indent.scan(MULTILINER).size
  @i += indent.size
  next_character = @chunk[MULTI_DENT, 4]
  before_last = @tokens[-2] # nil when fewer than two tokens were emitted
  no_newlines = next_character == '.' ||
                (last_value.to_s.match(NO_NEWLINE) &&
                 !(before_last && before_last[0] == '.') &&
                 !last_value.match(CODE))
  return suppress_newlines(indent) if no_newlines

  # Only the indentation after the final newline of the run counts.
  size = indent.scan(LAST_DENT).last.last.length
  return newline_token(indent) if size == @indent

  if size > @indent
    token(:INDENT, size - @indent)
    @indents << (size - @indent)
  else
    outdent_token(@indent - size)
  end
  @indent = size
end

#js_token ⇒ Object



120
121
122
123
124
# File 'lib/nscript/lexer/lexer.rb', line 120

# Lexes a backtick-delimited span at the start of @chunk and emits it as a
# :JS token with the surrounding backticks removed.
#
# @return [false, Integer] false when @chunk does not start with a backtick
#   span, otherwise the advanced position @i
def js_token
  script = @chunk[JS, 1]
  return false if script.nil?

  body = script.gsub(JS_CLEANER, '')
  token(:JS, body)
  @i += script.length
end

#last_tag ⇒ Object



205
206
207
# File 'lib/nscript/lexer/lexer.rb', line 205

# Tag (first element) of the most recently emitted token.
#
# @return [Object, nil] nil when no token has been emitted yet
def last_tag
  previous = @tokens.last
  previous && previous[0]
end

#last_value ⇒ Object



201
202
203
# File 'lib/nscript/lexer/lexer.rb', line 201

# Value (second element) of the most recently emitted token.
#
# @return [Object, nil] nil when no token has been emitted yet
def last_value
  previous = @tokens.last
  previous && previous[1]
end

#literal_token ⇒ Object



184
185
186
187
188
189
190
191
192
193
194
195
# File 'lib/nscript/lexer/lexer.rb', line 184

# Fallback scanner: lexes an operator run, or failing that a single
# character, from the start of @chunk.
#
# * An operator starting with "->" or "=>" triggers tag_parameters first.
# * A value that is exactly ":" or "=" is tagged :ASSIGN.
# * "(" / "[" directly after a CALLABLE token (with no whitespace recorded
#   in @spaced for that token) are tagged :CALL_START / :INDEX_START.
# * Everything else is tagged with its own text.
#
# @return [Integer] the advanced position @i
def literal_token
  operator = @chunk[OPERATOR, 1]
  tag_parameters if operator && operator.match(CODE)
  value = operator || @chunk[0, 1]

  tag = value.match(ASSIGNMENT) ? :ASSIGN : value
  # @spaced holds the very object that was last followed by whitespace, so
  # identity (not equality) is what tells us whether a space intervened.
  unspaced = !@spaced.equal?(last_value)
  if unspaced && CALLABLE.include?(last_tag)
    case value
    when '(' then tag = :CALL_START
    when '[' then tag = :INDEX_START
    end
  end

  token(tag, value)
  @i += value.length
end

#newline_token(newlines) ⇒ Object



174
175
176
177
# File 'lib/nscript/lexer/lexer.rb', line 174

# Emits a "\n" token unless the previous token's value is already "\n".
#
# @param newlines [Object] not used by this method
# @return [true] always
def newline_token(newlines)
  already_terminated = last_value == "\n"
  token("\n", "\n") unless already_terminated
  true
end

#number_token ⇒ Object



94
95
96
97
98
# File 'lib/nscript/lexer/lexer.rb', line 94

# Lexes a numeric literal (as defined by NUMBER) at the start of @chunk and
# emits it, unparsed, as a :NUMBER token.
#
# @return [false, Integer] false when @chunk does not start with a number,
#   otherwise the advanced position @i
def number_token
  number = @chunk[NUMBER, 1]
  return false if number.nil?

  token(:NUMBER, number)
  @i += number.length
end

#outdent_token(move_out) ⇒ Object



159
160
161
162
163
164
165
166
# File 'lib/nscript/lexer/lexer.rb', line 159

# Pops recorded indentation levels off @indents, emitting one :OUTDENT token
# per level, until move_out columns have been closed or the stack is empty.
# A "\n" token is always emitted afterwards.
#
# @param move_out [Integer] number of columns to move back out
# @return [Object] the result of the final token call
def outdent_token(move_out)
  until move_out <= 0 || @indents.empty?
    closed = @indents.pop
    token(:OUTDENT, closed)
    move_out -= closed
  end
  token("\n", "\n")
end

#regex_token ⇒ Object



126
127
128
129
130
131
# File 'lib/nscript/lexer/lexer.rb', line 126

# Lexes a slash-delimited regex literal at the start of @chunk and emits it
# as a :REGEX token. Nothing is lexed when the previous token's tag is
# listed in NOT_REGEX.
#
# @return [false, Integer] false when no regex is lexed, otherwise the
#   advanced position @i
def regex_token
  regex = @chunk[REGEX, 1]
  return false if regex.nil? || NOT_REGEX.include?(last_tag)

  token(:REGEX, regex)
  @i += regex.length
end

#string_token ⇒ Object



100
101
102
103
104
105
106
# File 'lib/nscript/lexer/lexer.rb', line 100

# Lexes a single- or double-quoted string at the start of @chunk and emits
# it as a :STRING token. Each newline inside the string, together with the
# spaces/tabs that follow it, is replaced by a space, a backslash and a
# newline.
#
# @return [false, Integer] false when @chunk does not start with a string,
#   otherwise the advanced position @i
def string_token
  string = @chunk[STRING, 1]
  return false unless string

  # Emit before bumping @line so the token records the string's first line.
  token(:STRING, string.gsub(STRING_NEWLINES, " \\\n"))
  @line += string.count("\n")
  @i += string.length
end

#suppress_newlines(newlines) ⇒ Object



179
180
181
182
# File 'lib/nscript/lexer/lexer.rb', line 179

# Swallows a newline: nothing is emitted, and a trailing "\\" token (an
# explicit line continuation) is removed from the token stream.
#
# @param newlines [Object] not used by this method
# @return [true] always
def suppress_newlines(newlines)
  continuation = last_value == "\\"
  @tokens.pop if continuation
  true
end

#tag_parameters ⇒ Object



209
210
211
212
213
214
215
216
217
218
219
220
221
222
# File 'lib/nscript/lexer/lexer.rb', line 209

# Retags a just-closed parenthesised list as a parameter list.
#
# Does nothing unless the previous token is ")". Otherwise walks the token
# stream backwards, turning :IDENTIFIER into :PARAM and ")" into
# :PARAM_END, and stops at the first "(" encountered, which becomes
# :PARAM_START.
#
# @return [Symbol, nil] :PARAM_START when an opening "(" was found, nil
#   otherwise
def tag_parameters
  return unless last_tag == ')'

  @tokens.reverse_each do |tok|
    case tok[0]
    when :IDENTIFIER
      tok[0] = :PARAM
    when ')'
      tok[0] = :PARAM_END
    when '('
      return tok[0] = :PARAM_START
    end
  end
  nil
end

#token(tag, value) ⇒ Object



197
198
199
# File 'lib/nscript/lexer/lexer.rb', line 197

# Appends a [tag, value] pair to @tokens, wrapping the value in a Value
# built from the value and the current line number.
#
# @param tag [Object] the token tag
# @param value [Object] the raw token value
# @return [Array] the @tokens array
def token(tag, value)
  wrapped = Value.new(value, @line)
  @tokens.push([tag, wrapped])
end

#tokenize(code) ⇒ Object



45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# File 'lib/nscript/lexer/lexer.rb', line 45

# Turns source text into a token stream.
#
# Resets all lexer state, repeatedly lexes the next token from the
# unconsumed remainder of the code, closes any open indentation and passes
# the tokens through Rewriter#rewrite. When the VERBOSE environment
# variable is set, the raw token stream is printed before that.
#
# @param code [String] the source to lex
# @return [Object] the result of Rewriter#rewrite on the collected tokens
def tokenize(code)
  @code    = code.chomp # Source with one trailing line break removed.
  @tokens  = []         # Lexed tokens so far, each of the form [tag, value].
  @indents = []         # Stack of the indentation steps currently open.
  @indent  = 0          # Current indentation width.
  @line    = 1          # Current line number.
  @i       = 0          # Position of the next unconsumed character.
  @spaced  = nil        # Last token value that was followed by whitespace.

  until @i >= @code.length
    @chunk = @code[@i..-1]
    extract_next_token
  end

  puts "original stream: #{@tokens.inspect}" if ENV['VERBOSE']
  close_indentation
  Rewriter.new.rewrite(@tokens)
end

#whitespace_token ⇒ Object



168
169
170
171
172
# File 'lib/nscript/lexer/lexer.rb', line 168

# Skips a run of spaces/tabs at the start of @chunk without emitting a
# token, and records the previous token's value in @spaced so later
# scanners can tell that whitespace followed it.
#
# @return [false, Integer] false when @chunk does not start with
#   whitespace, otherwise the advanced position @i
def whitespace_token
  whitespace = @chunk[WHITESPACE, 1]
  return false if whitespace.nil?

  @spaced = last_value
  @i += whitespace.length
end