Class: NScript::Lexer

Inherits:

Object

Object
NScript::Lexer

show all

Defined in: lib/nscript/lexer/lexer.rb

Constant Summary collapse

# Reserved words of the language. identifier_token tags an identifier that
# appears in this list with its own upcased name as a symbol (for example
# "if" becomes :IF) instead of the generic :IDENTIFIER tag.
KEYWORDS =

["if", "else", "then", "unless",
"true", "false", "yes", "no", "on", "off",
"and", "or", "is", "isnt", "not",
"new", "return",
"try", "catch", "finally", "throw",
"break", "continue",
"for", "in", "of", "by", "where", "while",
"delete", "instanceof", "typeof",
"switch", "when",
"super", "extends"]

# Token-matching patterns. Each is anchored with \A, so it only matches at
# the very start of the chunk it is applied to, and capture group 1 holds
# the full text of the token.

# A letter, "$" or "_" followed by any number of word characters or "$".
IDENTIFIER =

/\A([a-zA-Z$_](\w|\$)*)/

# A hexadecimal literal (0x...) or a decimal literal with an optional
# fractional part and optional exponent; matched case-insensitively.
NUMBER =

/\A(\b((0(x|X)[0-9a-fA-F]+)|([0-9]+(\.[0-9]+)?(e[+\-]?[0-9]+)?)))\b/i

# An empty or non-empty single- or double-quoted string. The /m flag lets
# the body span lines; the character right before the closing quote must be
# a non-backslash or an escaped backslash.
STRING =

/\A(""|''|"(.*?)([^\\]|\\\\)"|'(.*?)([^\\]|\\\\)')/m

# A triple-quoted heredoc. Group 2 (for """) or group 4 (for ''') holds the
# body; the six-quote alternatives match an empty heredoc and set neither.
HEREDOC =

/\A("{6}|'{6}|"{3}\n?(.*?)\n?([ \t]*)"{3}|'{3}\n?(.*?)\n?([ \t]*)'{3})/m

# A backtick-delimited span, emitted by js_token as a :JS token.
JS =

/\A(``|`(.*?)([^\\]|\\\\)`)/m

# A run of one or more operator characters.
OPERATOR =

/\A([+\*&|\/\-%=<>:!?]+)/

# A run of spaces and/or tabs (no newlines).
WHITESPACE =

/\A([ \t]+)/

# One or more consecutive "#" comment lines, each optionally preceded by a
# newline and indentation.
COMMENT =

/\A(((\n?[ \t]*)?#.*$)+)/

# The "->" and "=>" arrows; literal_token calls tag_parameters when the
# operator it read matches this.
CODE =

/\A((-|=)>)/

# A slash-delimited regular expression literal followed by up to four flag
# characters out of i, m, g and y.
REGEX =

/\A(\/(.*?)([^\\]|\\\\)\/[imgy]{0,4})/

# One or more newlines, each followed by its indentation (group 1), plus an
# optional "." directly after the indentation (group 4). indent_token uses
# group 4 to detect a line that starts with a dot.
MULTI_DENT =

/\A((\n([ \t]*))+)(\.)?/

# A newline and the indentation after it. indent_token scans with this and
# takes the last match to obtain the indentation of the final line.
LAST_DENT =

/\n([ \t]*)/

# A value that is exactly ":" or "="; literal_token tags it :ASSIGN.
ASSIGNMENT =

/\A(:|=)\Z/

# The leading and trailing backtick of a JS span, removed by js_token.
JS_CLEANER =

/(\A`|`\Z)/

# A single newline; used for counting lines and splitting comment text.
MULTILINER =

/\n/

# A newline together with the indentation that follows it; string_token
# replaces every occurrence inside a string literal.
STRING_NEWLINES =

/\n[ \t]*/

# Either the indentation and "#" that open a comment line, or a newline and
# whitespace at the end of a line; comment_token strips these from the text.
COMMENT_CLEANER =

/(^[ \t]*#|\n[ \t]*$)/

# Token values (operators, "." and "\", plus a few word operators) after
# which indent_token may suppress a newline instead of emitting tokens.
NO_NEWLINE =

/\A([+\*&|\/\-%=<>:!.\\][<>=&|]*|and|or|is|isnt|not|delete|typeof|instanceof)\Z/

# The leading spaces/tabs of a line; heredoc_token scans a heredoc body
# with this to decide which indentation prefix to strip.
HEREDOC_INDENT =

/^[ \t]+/

# Tags of a preceding token after which regex_token refuses to read a
# regular expression literal and returns false instead.
NOT_REGEX =

[
  :IDENTIFIER, :NUMBER, :REGEX, :STRING,
  ')', '++', '--', ']', '}',
  :FALSE, :NULL, :TRUE
]

# Tags of a preceding token after which literal_token may retag "(" as
# :CALL_START and "[" as :INDEX_START.
CALLABLE =

[:IDENTIFIER, :SUPER, ')', ']', '}', :STRING]

Instance Method Summary collapse

Instance Method Details

#close_indentation ⇒ `Object`



224
225
226

# File 'lib/nscript/lexer/lexer.rb', line 224

# Closes whatever indentation is still open by asking outdent_token to move
# out by the full current indentation width (@indent). Returns whatever
# outdent_token returns.
def close_indentation
  remaining = @indent
  outdent_token(remaining)
end

#comment_token ⇒ `Object`

# File 'lib/nscript/lexer/lexer.rb', line 133

# Lexes a run of consecutive "#" comment lines at the start of @chunk.
#
# Emits one :COMMENT token whose value is the array of comment lines with
# the comment markers stripped, followed by a newline token. @line is
# advanced before the tokens are emitted.
#
# Returns false when @chunk does not start with a comment, otherwise the
# updated @i.
def comment_token
  comment = @chunk[COMMENT, 1]
  return false if comment.nil?

  @line += comment.scan(MULTILINER).length
  lines = comment.gsub(COMMENT_CLEANER, '').split(MULTILINER)
  token(:COMMENT, lines)
  token("\n", "\n")
  @i += comment.length
end

#extract_next_token ⇒ `Object`

# File 'lib/nscript/lexer/lexer.rb', line 62

# Tries each specialised token reader in a fixed order and stops at the
# first one that reports success (returns a truthy value); in that case the
# method returns nil. When none of them succeeds, falls back to
# literal_token and returns its result.
def extract_next_token
  readers = %w[identifier number heredoc string js regex indent comment whitespace]
  readers.each do |kind|
    return if send("#{kind}_token")
  end
  literal_token
end

#heredoc_token ⇒ `Object`

# File 'lib/nscript/lexer/lexer.rb', line 108

# Lexes a triple-quoted heredoc at the start of @chunk and emits it as a
# single :STRING token.
#
# The smallest leading-whitespace run found by HEREDOC_INDENT is stripped
# from the start of every line, each newline is replaced by the two
# characters backslash + "n", and double quotes are backslash-escaped; the
# result is wrapped in double quotes.
#
# An empty heredoc (six consecutive quotes) has no body capture at all; it
# is treated as an empty body and yields the token value "" rather than
# raising on nil.
#
# Returns false when @chunk does not start with a heredoc, otherwise the
# updated @i. @line is advanced after the token has been emitted.
def heredoc_token
  match = @chunk.match(HEREDOC)
  return false unless match

  # Group 2 is the body of a """ heredoc, group 4 the body of a ''' heredoc.
  # Both are nil for the six-quote (empty) form.
  doc = match[2] || match[4] || ''
  indent = doc.scan(HEREDOC_INDENT).min
  doc = doc.gsub(/^#{indent}/, "").gsub("\n", "\\n").gsub('"', '\\"')
  token(:STRING, "\"#{doc}\"")
  @line += match[1].count("\n")
  @i += match[1].length
end

#identifier_token ⇒ `Object`

# File 'lib/nscript/lexer/lexer.rb', line 75

# Lexes an identifier or keyword at the start of @chunk.
#
# Keywords are special identifiers tagged with their own name: 'if' results
# in an [:IF, "if"] token. A "when" that directly follows an :OUTDENT,
# :INDENT or newline token is tagged :LEADING_WHEN.
#
# For plain identifiers the previous token may be retagged:
#   * "::"  becomes :PROTOTYPE_ACCESS
#   * "."   becomes :PROPERTY_ACCESS, or :SOAK_ACCESS when the token before
#           it is tagged "?" (that "?" token is then removed); a "." whose
#           predecessor is also "." is left untouched.
#
# Returns false when @chunk does not start with an identifier, otherwise
# the updated @i.
def identifier_token
  identifier = @chunk[IDENTIFIER, 1]
  return false unless identifier

  tag = KEYWORDS.include?(identifier) ? identifier.upcase.to_sym : :IDENTIFIER
  tag = :LEADING_WHEN if tag == :WHEN && [:OUTDENT, :INDENT, "\n"].include?(last_tag)

  if tag == :IDENTIFIER
    @tokens[-1][0] = :PROTOTYPE_ACCESS if last_value == '::'

    # The token before the dot may not exist (source starting with ".name"),
    # so every look at it is nil-guarded.
    before_dot = @tokens[-2]
    if last_value == '.' && !(before_dot && before_dot[1] == '.')
      if before_dot && before_dot[0] == "?"
        @tokens[-1][0] = :SOAK_ACCESS
        @tokens.delete_at(-2)
      else
        @tokens[-1][0] = :PROPERTY_ACCESS
      end
    end
  end

  token(tag, identifier)
  @i += identifier.length
end

#indent_token ⇒ `Object`

# File 'lib/nscript/lexer/lexer.rb', line 141

# Lexes a run of newlines plus the indentation that follows them.
#
# @line and @i are advanced first. The newline is then suppressed (via
# suppress_newlines) when the next line starts with "." or when the last
# token value matches NO_NEWLINE, is not itself preceded by a token tagged
# ".", and is not a "->"/"=>" arrow. A missing second-to-last token counts
# as "not a dot", so a source whose very first token is such an operator no
# longer raises on nil.
#
# Otherwise the indentation of the last line is compared with @indent:
#   * equal   -> newline_token
#   * larger  -> an :INDENT token for the difference, pushed onto @indents
#   * smaller -> outdent_token for the difference
# and @indent is updated in the last two cases.
#
# Returns false when @chunk does not start with a newline.
def indent_token
  indent = @chunk[MULTI_DENT, 1]
  return false unless indent

  @line += indent.scan(MULTILINER).size
  @i += indent.size
  next_character = @chunk[MULTI_DENT, 4]
  before_last = @tokens[-2]
  no_newlines = next_character == '.' ||
                (last_value.to_s.match(NO_NEWLINE) &&
                 (before_last.nil? || before_last[0] != '.') &&
                 !last_value.match(CODE))
  return suppress_newlines(indent) if no_newlines

  # Only the indentation after the final newline of the run matters.
  size = indent.scan(LAST_DENT).last.last.length
  return newline_token(indent) if size == @indent

  if size > @indent
    token(:INDENT, size - @indent)
    @indents << (size - @indent)
  else
    outdent_token(@indent - size)
  end
  @indent = size
end

#js_token ⇒ `Object`

# File 'lib/nscript/lexer/lexer.rb', line 120

# Lexes a backtick-delimited span at the start of @chunk and emits it as a
# :JS token with the surrounding backticks removed.
#
# Returns false when @chunk does not start with such a span, otherwise the
# updated @i.
def js_token
  script = @chunk[JS, 1]
  return false if script.nil?

  body = script.gsub(JS_CLEANER, '')
  token(:JS, body)
  @i += script.length
end

#last_tag ⇒ `Object`



205
206
207

# File 'lib/nscript/lexer/lexer.rb', line 205

# Returns the tag (first element) of the most recently emitted token, or
# nil when no token has been emitted yet.
def last_tag
  previous = @tokens.last
  previous && previous[0]
end

#last_value ⇒ `Object`



201
202
203

# File 'lib/nscript/lexer/lexer.rb', line 201

# Returns the value (second element) of the most recently emitted token, or
# nil when no token has been emitted yet.
def last_value
  previous = @tokens.last
  previous && previous[1]
end

#literal_token ⇒ `Object`

# File 'lib/nscript/lexer/lexer.rb', line 184

# Fallback reader: lexes either a run of operator characters or, failing
# that, the single next character of @chunk.
#
# When the operator is an arrow ("->" or "=>") tag_parameters is invoked
# first. A value of exactly ":" or "=" is tagged :ASSIGN; every other value
# is used as its own tag. "(" and "[" are retagged :CALL_START and
# :INDEX_START when the previous token's tag is in CALLABLE and the
# previous value is not the object remembered in @spaced (an identity
# check, not an equality check).
#
# Always emits a token and returns the updated @i.
def literal_token
  operator = @chunk[OPERATOR, 1]
  tag_parameters if operator && operator.match(CODE)
  value = operator || @chunk[0, 1]

  tag = value.match(ASSIGNMENT) ? :ASSIGN : value
  unless @spaced.equal?(last_value)
    if CALLABLE.include?(last_tag)
      case value
      when '(' then tag = :CALL_START
      when '[' then tag = :INDEX_START
      end
    end
  end

  token(tag, value)
  @i += value.length
end

#newline_token(newlines) ⇒ `Object`

# File 'lib/nscript/lexer/lexer.rb', line 174

# Emits a newline token unless the previous token value already is a
# newline, so consecutive newlines collapse into one. The newlines argument
# is not used. Always returns true.
def newline_token(newlines)
  already_terminated = last_value == "\n"
  token("\n", "\n") unless already_terminated
  true
end

#number_token ⇒ `Object`

# File 'lib/nscript/lexer/lexer.rb', line 94

# Lexes a numeric literal at the start of @chunk and emits it as a :NUMBER
# token carrying the literal's source text.
#
# Returns false when @chunk does not start with a number, otherwise the
# updated @i.
def number_token
  number = @chunk[NUMBER, 1]
  return false if number.nil?

  token(:NUMBER, number)
  @i += number.length
end

#outdent_token(move_out) ⇒ `Object`

# File 'lib/nscript/lexer/lexer.rb', line 159

# Pops indentation levels off @indents, emitting one :OUTDENT token per
# popped level, until move_out has been used up or no level is left; then
# emits a newline token.
#
# move_out - the indentation width to close.
def outdent_token(move_out)
  remaining = move_out
  until remaining <= 0 || @indents.empty?
    width = @indents.pop
    token(:OUTDENT, width)
    remaining -= width
  end
  token("\n", "\n")
end

#regex_token ⇒ `Object`

# File 'lib/nscript/lexer/lexer.rb', line 126

# Lexes a slash-delimited regular expression literal at the start of
# @chunk and emits it as a :REGEX token.
#
# Returns false when @chunk does not start with such a literal, or when the
# previous token's tag is listed in NOT_REGEX; otherwise returns the
# updated @i.
def regex_token
  regex = @chunk[REGEX, 1]
  return false if regex.nil? || NOT_REGEX.include?(last_tag)

  token(:REGEX, regex)
  @i += regex.length
end

#string_token ⇒ `Object`

# File 'lib/nscript/lexer/lexer.rb', line 100

def string_token
  return false unless string = @chunk[STRING, 1]
  escaped = string.gsub(STRING_NEWLINES, " \\\n")
  token(:STRING, escaped)
  @line += string.count("\n")
  @i += string.length
end

#suppress_newlines(newlines) ⇒ `Object`

# File 'lib/nscript/lexer/lexer.rb', line 179

# Swallows a newline without emitting any token. If the previous token
# value is a single backslash, that token is removed as well. The newlines
# argument is not used. Always returns true.
def suppress_newlines(newlines)
  trailing_backslash = last_value == "\\"
  @tokens.pop if trailing_backslash
  true
end

#tag_parameters ⇒ `Object`

# File 'lib/nscript/lexer/lexer.rb', line 209

# Retags the tokens of a parameter list that has just been closed.
#
# Does nothing unless the previous token is tagged ")". Otherwise walks the
# token list backwards from the end, turning :IDENTIFIER tags into :PARAM
# and ")" tags into :PARAM_END, and stops at the first "(" it meets, which
# becomes :PARAM_START.
#
# Returns :PARAM_START when an opening parenthesis was found, nil otherwise.
def tag_parameters
  return unless last_tag == ')'

  @tokens.reverse_each do |tok|
    case tok[0]
    when :IDENTIFIER then tok[0] = :PARAM
    when ')'         then tok[0] = :PARAM_END
    when '('         then return tok[0] = :PARAM_START
    end
  end
  nil
end

#token(tag, value) ⇒ `Object`



197
198
199

# File 'lib/nscript/lexer/lexer.rb', line 197

# Appends a [tag, value] pair to @tokens, wrapping the raw value in a Value
# built from it and the current @line. Returns @tokens.
def token(tag, value)
  wrapped = Value.new(value, @line)
  @tokens.push([tag, wrapped])
end

#tokenize(code) ⇒ `Object`

# File 'lib/nscript/lexer/lexer.rb', line 45

# Turns source text into a token stream.
#
# Resets all lexer state, then repeatedly hands the unread remainder of the
# source to extract_next_token until the whole text has been consumed.
# Afterwards any open indentation is closed and the tokens are passed
# through Rewriter#rewrite, whose result is returned.
#
# When the VERBOSE environment variable is set, the token stream is printed
# before indentation is closed.
#
# code - the source text; one trailing line terminator is removed (chomp).
def tokenize(code)
  @code    = code.chomp
  @i       = 0    # index of the next unread character in @code
  @line    = 1    # current line number
  @indent  = 0    # current indentation width
  @indents = []   # stack of the indentation steps currently open
  @tokens  = []   # emitted tokens, each of the form [tag, value]
  @spaced  = nil  # last token value that was followed by whitespace

  until @i >= @code.length
    @chunk = @code[@i..-1]
    extract_next_token
  end

  puts "original stream: #{@tokens.inspect}" if ENV['VERBOSE']
  close_indentation
  Rewriter.new.rewrite(@tokens)
end

#whitespace_token ⇒ `Object`

# File 'lib/nscript/lexer/lexer.rb', line 168

# Skips a run of spaces/tabs at the start of @chunk without emitting a
# token, and remembers the previous token value in @spaced so later readers
# can tell that it was followed by whitespace.
#
# Returns false when @chunk does not start with whitespace, otherwise the
# updated @i.
def whitespace_token
  whitespace = @chunk[WHITESPACE, 1]
  return false if whitespace.nil?

  @spaced = last_value
  @i += whitespace.length
end

Class: NScript::Lexer

Constant Summary collapse

Instance Method Summary collapse

Instance Method Details

#close_indentation ⇒ Object

#comment_token ⇒ Object

#extract_next_token ⇒ Object

#heredoc_token ⇒ Object

#identifier_token ⇒ Object

#indent_token ⇒ Object

#js_token ⇒ Object

#last_tag ⇒ Object

#last_value ⇒ Object

#literal_token ⇒ Object

#newline_token(newlines) ⇒ Object

#number_token ⇒ Object

#outdent_token(move_out) ⇒ Object

#regex_token ⇒ Object

#string_token ⇒ Object

#suppress_newlines(newlines) ⇒ Object

#tag_parameters ⇒ Object

#token(tag, value) ⇒ Object

#tokenize(code) ⇒ Object

#whitespace_token ⇒ Object