Class: Riml::Lexer

Inherits:

Object

Object
Riml::Lexer

show all

Includes: Constants

Defined in: lib/lexer.rb

Constant Summary collapse

# Optional leading blanks, a double quote, then the rest of the line
# (captured in group 1). Used by #tokenize_chunk for comment lines.
SINGLE_LINE_COMMENT_REGEX =

/\A[ \t\f]*"(.*)$/

# Anchored alternation of the listed multi-character operators plus every
# entry of COMPARISON_OPERATORS.
OPERATOR_REGEX =

/\A#{Regexp.union(['||', '&&', '===', '+=', '-=', '.='] + COMPARISON_OPERATORS)}/

# A double-quoted string containing one `#{...}` section; the text before,
# the section itself, its inner expression, and the text after are captured.
INTERPOLATION_REGEX =

/"([^"]*?)(\#\{([^"]*?)\})([^"]*?)"/m

# INTERPOLATION_REGEX anchored to the start of the string.
ANCHORED_INTERPOLATION_REGEX =

/\A#{INTERPOLATION_REGEX}/m

# Captures each `#{...}` section; used with String#split so the sections
# are kept in the resulting parts.
INTERPOLATION_SPLIT_REGEX =

/(\#\{.*?\})/m

Constants included from Constants

Constants::BUILTIN_COMMANDS, Constants::BUILTIN_FUNCTIONS, Constants::COMPARISON_OPERATORS, Constants::DEFINE_KEYWORDS, Constants::END_KEYWORDS, Constants::IGNORECASE_CAPABLE_OPERATORS, Constants::KEYWORDS, Constants::REGISTERS, Constants::RIML_COMMANDS, Constants::RIML_END_KEYWORDS, Constants::RIML_KEYWORDS, Constants::SPECIAL_VARIABLE_PREFIXES, Constants::SPLAT_LITERAL, Constants::VIML_COMMANDS, Constants::VIML_END_KEYWORDS, Constants::VIML_KEYWORDS

Instance Attribute Summary collapse

#chunk ⇒ Object readonly

Returns the value of attribute chunk.
#current_indent ⇒ Object readonly

Returns the value of attribute current_indent.
#ignore_indentation_check ⇒ Object

Flag intended for the REPL: when set, #next_token skips the indentation check.
#invalid_keyword ⇒ Object readonly

Returns the value of attribute invalid_keyword.
#lineno ⇒ Object readonly

Returns the value of attribute lineno.
#prev_token ⇒ Object readonly

Returns the value of attribute prev_token.
#tokens ⇒ Object readonly

Returns the value of attribute tokens.

Instance Method Summary collapse

#initialize(code) ⇒ Lexer constructor

A new instance of Lexer.
#next_token ⇒ Object
#prev_token_is_keyword? ⇒ Boolean
#set_start_state! ⇒ Object
#tokenize ⇒ Object
#tokenize_chunk(chunk) ⇒ Object

Constructor Details

#initialize(code) ⇒ `Lexer`

Returns a new instance of Lexer.

# File 'lib/lexer.rb', line 18

# Builds a lexer for the given source text and resets all lexing state.
#
# A single trailing line terminator is removed from a private copy of
# +code+, so the caller's string is never mutated and frozen strings
# (e.g. under `# frozen_string_literal: true`) are accepted.
#
# @param code [String] the source text to tokenize
def initialize(code)
  @code = code.chomp
  set_start_state!
end

Instance Attribute Details

#chunk ⇒ `Object` (readonly)

Returns the value of attribute chunk.



14
15
16

# File 'lib/lexer.rb', line 14

# Reader for @chunk, which #tokenize_chunk sets to the chunk it was last given.
def chunk
  @chunk
end

#current_indent ⇒ `Object` (readonly)

Returns the value of attribute current_indent.



14
15
16

# File 'lib/lexer.rb', line 14

# Reader for @current_indent, which #set_start_state! initialises to 0.
def current_indent
  @current_indent
end

#ignore_indentation_check ⇒ `Object`

Flag intended for the REPL: when set, #next_token skips the indentation check.



16
17
18

# File 'lib/lexer.rb', line 16

# Reader for @ignore_indentation_check (documented as "for REPL").
# When truthy, #next_token does not call check_indentation once no
# tokens remain.
def ignore_indentation_check
  @ignore_indentation_check
end

#invalid_keyword ⇒ `Object` (readonly)

Returns the value of attribute invalid_keyword.



14
15
16

# File 'lib/lexer.rb', line 14

# Reader for @invalid_keyword: nil after #set_start_state!, otherwise the
# keyword value recorded by #prev_token_is_keyword?.
def invalid_keyword
  @invalid_keyword
end

#lineno ⇒ `Object` (readonly)

Returns the value of attribute lineno.



14
15
16

# File 'lib/lexer.rb', line 14

# Reader for @lineno, the line counter that #set_start_state! starts at 1.
def lineno
  @lineno
end

#prev_token ⇒ `Object` (readonly)

Returns the value of attribute prev_token.



14
15
16

# File 'lib/lexer.rb', line 14

# Reader for @prev_token: the token most recently returned by #next_token,
# or nil right after #set_start_state!.
def prev_token
  @prev_token
end

#tokens ⇒ `Object` (readonly)

Returns the value of attribute tokens.



14
15
16

# File 'lib/lexer.rb', line 14

# Reader for @tokens, the array to which #next_token appends every token
# it hands out.
def tokens
  @tokens
end

Instance Method Details

#next_token ⇒ `Object`

# File 'lib/lexer.rb', line 47

# Hands out the next token, lexing further chunks on demand.
#
# Returns the token (a [name, value] pair) and records it in #tokens and
# @prev_token, or returns nil when nothing is left to emit.
def next_token
  # keep lexing until something is buffered or the input runs out
  tokenize_chunk(get_new_chunk) while @token_buf.empty? && more_code_to_tokenize?

  if @token_buf.empty?
    check_indentation unless ignore_indentation_check
    return nil
  end

  emitted = @token_buf.shift
  # a third element is a line-count delta: apply it and drop it from the token
  @lineno += emitted.pop if emitted.size == 3
  tokens << emitted
  @prev_token = emitted
end

#prev_token_is_keyword? ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/lexer.rb', line 236

# Checks whether the last token, or the one before it, carries a value
# listed in KEYWORDS.
#
# On a hit the keyword is stored in @invalid_keyword and returned (truthy);
# otherwise the result is nil.
def prev_token_is_keyword?
  last_value = prev_token && prev_token[1]
  return nil unless last_value
  return @invalid_keyword = last_value if KEYWORDS.include?(last_value)

  before_last = tokens[-2]
  return nil unless before_last && KEYWORDS.include?(before_last[1])

  @invalid_keyword = before_last[1]
end

#set_start_state! ⇒ `Object`

# File 'lib/lexer.rb', line 24

# Puts the lexer back into its initial, nothing-consumed-yet state.
def set_start_state!
  # characters consumed so far, line counter, indentation counter
  @i, @lineno, @current_indent = 0, 1, 0

  # @token_buf holds doubles and triples:
  #   [tokenname, tokenval, lineno_to_add(optional)]
  # ex: [[:NEWLINE, "\n"]] OR [[:NEWLINE, "\n", 1]]
  @token_buf = []
  @tokens = []
  @prev_token = nil

  # nothing is pending and no declaration is open at the start
  @indent_pending = @dedent_pending = false
  @one_line_conditional_end_pending = @in_function_declaration = false

  # assigned last so the method keeps evaluating to nil
  @invalid_keyword = nil
end

#tokenize ⇒ `Object`

# File 'lib/lexer.rb', line 41

# Lexes the whole input from scratch and returns the collected tokens.
def tokenize
  set_start_state!
  # drain the token stream; #next_token signals exhaustion with nil
  loop { break if next_token.nil? }
  @tokens
end

#tokenize_chunk(chunk) ⇒ `Object`

# File 'lib/lexer.rb', line 63

# Lexes the single construct found at the start of +chunk+.
#
# Works purely through side effects: @i is advanced by the number of
# characters consumed, zero or more tokens are appended to @token_buf, and
# @lineno is advanced for line breaks that are consumed here. The first
# matching branch wins, so the order of the branches below is significant.
#
# @param chunk [String] source text starting at the current offset
#   (#next_token passes the result of get_new_chunk)
# @return [Object] not meaningful; callers use the side effects only
def tokenize_chunk(chunk)
  @chunk = chunk
  # deal with line continuations
  if cont = chunk[/\A\r?\n*[ \t\f]*\\/m]
    @i += cont.size
    @lineno += cont.each_line.to_a.size - 1
    return
  end

  # all lines that start with ':' pass right through unmodified
  if (prev_token.nil? || prev_token[0] == :NEWLINE) && (ex_literal = chunk[/\A[ \t\f]*:(.*)?$/])
    @i += ex_literal.size
    @token_buf << [:EX_LITERAL, $1]
    return
  end

  # numbered argument such as a:1 -> scope modifier plus the number
  if splat_var = chunk[/\Aa:\d+/]
    @i += splat_var.size
    @token_buf << [:SCOPE_MODIFIER, 'a:'] << [:IDENTIFIER, splat_var[2..-1]]
  # the 'n' scope modifier is added by riml
  elsif scope_modifier = chunk[/\A([bwtglsavn]:)(\w|{)/, 1]
    # only the two-character modifier is consumed; the name follows later
    @i += 2
    @token_buf << [:SCOPE_MODIFIER, scope_modifier]
  elsif scope_modifier_literal = chunk[/\A([bwtglsavn]:)/]
    @i += scope_modifier_literal.size
    @token_buf << [:SCOPE_MODIFIER_LITERAL, scope_modifier_literal]
  elsif special_var_prefix = chunk[/\A(&(\w:)?(?!&)|\$|@)/]
    @token_buf << [:SPECIAL_VAR_PREFIX, special_var_prefix.strip]
    @i += special_var_prefix.size
    if special_var_prefix == '@'
      # '@' may be followed by a single register character
      new_chunk = get_new_chunk
      next_char = new_chunk[0]
      if REGISTERS.include?(next_char)
        @token_buf << [:IDENTIFIER, next_char]
        @i += 1
      end
    else
      @expecting_identifier = true
    end
  # `function(` is a call to the function named "function", not a keyword
  elsif function_method = chunk[/\A(function)\(/, 1]
    @token_buf << [:IDENTIFIER, function_method]
    @i += function_method.size
  elsif identifier = chunk[/\A[a-zA-Z_][\w#]*(\?|!)?/]
    # keyword identifiers
    if KEYWORDS.include?(identifier)
      if identifier.match(/\Afunction/)
        # rewrite `function` to `def`; account here for the characters the
        # shorter replacement no longer covers
        old_identifier = identifier.dup
        identifier.sub!(/function/, "def")
        @i += (old_identifier.size - identifier.size)
      end

      if DEFINE_KEYWORDS.include?(identifier)
        @in_function_declaration = true
      end

      # strip '?' out of token names and replace '!' with '_bang'
      token_name = identifier.sub(/\?\Z/, "").sub(/!\Z/, "_bang").upcase
      track_indent_level(chunk, identifier)

      if VIML_END_KEYWORDS.include?(identifier)
        token_name = :END
      end

      @token_buf << [token_name.intern, identifier]

    elsif BUILTIN_COMMANDS.include?(identifier) && !chunk[/\A#{Regexp.escape(identifier)}\(/]
      @token_buf << [:BUILTIN_COMMAND, identifier]
    elsif RIML_COMMANDS.include? identifier
      @token_buf << [:RIML_COMMAND, identifier]
    elsif VIML_COMMANDS.include?(identifier) && (prev_token.nil? || prev_token[0] == :NEWLINE)
      # a VimL command at the start of a line: pass the rest of the line through
      @i += identifier.size
      new_chunk = get_new_chunk
      until_eol = new_chunk[/.*$/].to_s
      @token_buf << [:EX_LITERAL, identifier << until_eol]
      @i += until_eol.size
      return
    # method names and variable names
    else
      @token_buf << [:IDENTIFIER, identifier]
    end

    @i += identifier.size

    parse_dict_vals!

    if @in_function_declaration
      @in_function_declaration = false unless DEFINE_KEYWORDS.include?(identifier) && @token_buf.size == 1
    end
  elsif splat = chunk[/\A(\.{3}|\*[a-zA-Z_]\w*)/]
    @token_buf << [:SPLAT, splat]
    @i += splat.size
  # integer (octal)
  elsif octal = chunk[/\A0[0-7]+/]
    @token_buf << [:NUMBER, octal]
    @i += octal.size
  # integer (hex)
  elsif hex = chunk[/\A0[xX]\h+/]
    @token_buf << [:NUMBER, hex]
    @i += hex.size
  # integer or float (decimal)
  elsif decimal = chunk[/\A[0-9]+(\.[0-9]+([eE][+-]?[0-9]+)?)?/]
    @token_buf << [:NUMBER, decimal]
    @i += decimal.size
  elsif interpolation = chunk[ANCHORED_INTERPOLATION_REGEX]
    # "hey there, #{name}" = "hey there, " . name
    parts = interpolation[1...-1].split(INTERPOLATION_SPLIT_REGEX)
    handle_interpolation(*parts)
    @i += interpolation.size
  elsif (single_line_comment = chunk[SINGLE_LINE_COMMENT_REGEX]) && (prev_token.nil? || prev_token[0] == :NEWLINE)
    @i += single_line_comment.size + 1 # consume next newline character
    @lineno += single_line_comment.each_line.to_a.size
  elsif inline_comment = chunk[/\A[ \t\f]*"[^"]*?$/]
    @i += inline_comment.size # inline comment, don't consume newline character
    @lineno += inline_comment.each_line.to_a.size - 1
  elsif string_double = chunk[/\A"(.*?)(?<!\\)"/, 1]
    @token_buf << [:STRING_D, string_double]
    @i += string_double.size + 2 # + 2 for the surrounding quotes
  elsif string_single = chunk[/\A'(([^']|'')*)'/, 1]
    @token_buf << [:STRING_S, string_single]
    @i += string_single.size + 2 # + 2 for the surrounding quotes
  elsif newlines = chunk[/\A([\r\n]+)/, 1]
    # push only 1 newline
    @token_buf << [:NEWLINE, "\n"] unless prev_token && prev_token[0] == :NEWLINE

    # pending indents/dedents
    if @one_line_conditional_end_pending
      @one_line_conditional_end_pending = false
    elsif @indent_pending
      @indent_pending = false
    elsif @dedent_pending
      @dedent_pending = false
    end

    @i += newlines.size
    # count line terminators, not characters, so that a CRLF pair advances
    # the line number by one instead of two
    @lineno += newlines.scan(/\r\n|\r|\n/).size
  elsif heredoc_pattern = chunk[%r{\A<<(.+?)\r?\n}]
    pattern = $1
    @i += heredoc_pattern.size
    new_chunk = get_new_chunk
    # everything up to (not including) the terminating pattern
    heredoc_string = new_chunk[%r|(.+?\r?\n)(#{Regexp.escape(pattern)})|m, 1]
    @i += heredoc_string.size + pattern.size
    heredoc_string.chomp!
    if heredoc_string =~ INTERPOLATION_REGEX || %Q("#{heredoc_string}") =~ INTERPOLATION_REGEX
      parts = heredoc_string.split(INTERPOLATION_SPLIT_REGEX)
      handle_interpolation(*parts)
    else
      @token_buf << [:STRING_D, escape_chars!(heredoc_string)]
    end
    @lineno += heredoc_string.each_line.to_a.size
  # operators of more than 1 char
  elsif operator = chunk[OPERATOR_REGEX]
    @token_buf << [operator, operator]
    @i += operator.size
  elsif regexp = chunk[%r{\A/.*?[^\\]/}]
    @token_buf << [:REGEXP, regexp]
    @i += regexp.size
  elsif whitespaces = chunk[/\A[ \t\f]+/]
    @i += whitespaces.size
  # operators and tokens of single chars, one of: ( ) , . [ ] ! + - = < > /
  else
    value = chunk[0, 1]
    if value == '|'
      # '|' is emitted as a NEWLINE token
      @token_buf << [:NEWLINE, "\n"]
    else
      @token_buf << [value, value]
    end
    @i += 1
    # NOTE(review): `&&` binds tighter than `||`, so ']' always triggers
    # parse_dict_vals! while ')' does so only under the dotted-access
    # condition. Confirm this is intended.
    if value == ']' || value == ')' && (chunk[1, 1] == '.' && chunk[3, 1] != ':')
      parse_dict_vals!
    end
  end
end

Class: Riml::Lexer

Constant Summary collapse

Constants included from Constants

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(code) ⇒ Lexer

Instance Attribute Details

#chunk ⇒ Object (readonly)

#current_indent ⇒ Object (readonly)

#ignore_indentation_check ⇒ Object

#invalid_keyword ⇒ Object (readonly)

#lineno ⇒ Object (readonly)

#prev_token ⇒ Object (readonly)

#tokens ⇒ Object (readonly)