Class: Nasl::Tokenizer

Inherits:

Object

Object
Nasl::Tokenizer

Defined in: lib/nasl/tokenizer.rb

Constant Summary collapse

# One-time initialization guard: initialize! returns immediately when this is
# true, and sets it to true after it has built the operator tables.
@@initialized =

false

# Maps reserved words to their token types. get_identifier first scans a word
# as an identifier and then looks it up here; a word that is not a key stays
# an :IDENT token. The lookup is by exact string, so it is case-sensitive.
@@keywords =

{
  'break'      => :BREAK,
  'continue'   => :CONTINUE,
  'else'       => :ELSE,
  'export'     => :EXPORT,
  'for'        => :FOR,
  'foreach'    => :FOREACH,
  'function'   => :FUNCTION,
  'global_var' => :GLOBAL,
  'if'         => :IF,
  'import'     => :IMPORT,
  'in'         => :IN,
  'include'    => :INCLUDE,
  'local_var'  => :LOCAL,
  'repeat'     => :REPEAT,
  'return'     => :RETURN,
  'until'      => :UNTIL,
  'x'          => :REP,
  'while'      => :WHILE,
  'do'         => :DO,
  'namespace'  => :NAMESPACE,
  'object'     => :OBJECT,
#      'new'        => :NEW,
  'var'        => :VAR,
  'public'     => :PUBLIC,
  'private'    => :PRIVATE,
  'switch'    => :SWITCH,
  'case'    => :CASE,
  'default'    => :DEFAULT,
   'FALSE'      => :FALSE,
  'NULL'       => :UNDEF,
  'TRUE'       => :TRUE
}

# The distinct lengths of all operator strings. Starts empty; initialize!
# fills it in, and get_operator uses it to slice candidate prefixes off the
# current line.
@@operator_lengths =

[]

# NOTE: initialize! replaces this array with a hash keyed by operator string,
# recording each entry's position in the list. get_operator then picks, among
# all operators that prefix the current line, the one with the lowest position.
# Every operator must therefore be listed before any operator that is a prefix
# of it (e.g. ">>>=" before ">>>" before ">>" before ">").
@@operators = These are all of the operators defined in NASL. Their order is vitally important.

[
  ["><",   :SUBSTR_EQ],
  [">!<",  :SUBSTR_NE],
   ["=~",   :REGEX_EQ],
  ["!~",   :REGEX_NE],
   ["==",   :CMP_EQ],
  ["!=",   :CMP_NE],
  ["<=",   :CMP_LE],
  [">=",   :CMP_GE],
   ["=",    :ASS_EQ],
  ["+=",   :ADD_EQ],
  ["-=",   :SUB_EQ],
  ["*=",   :MUL_EQ],
  ["/=",   :DIV_EQ],
  ["%=",   :MOD_EQ],
  [">>=",  :SRL_EQ],
  [">>>=", :SRA_EQ],
  ["<<=",  :SLL_EQ],
   ["||",   :OR],
  ["&&",   :AND],
  ["!",    :NOT],
   ["|",    :BIT_OR],
  ["^",    :BIT_XOR],
  [">>>",  :BIT_SRA],
  [">>",   :BIT_SRL],
  ["<<",   :BIT_SLL],
   ["<",    :CMP_LT],
  [">",    :CMP_GT],
   ["++",   :INCR],
  ["--",   :DECR],
   ["**",   :EXP],
   ["+",    :ADD],
  ["-",    :SUB],
  ["*",    :MUL],
  ["/",    :DIV],
  ["%",    :MOD],
   ["~",    :BIT_NOT],
   [".",    :PERIOD],
  [",",    :COMMA],
  [":",    :COLON],
  [";",    :SEMICOLON],
  ["(",    :LPAREN],
  [")",    :RPAREN],
  ["[",    :LBRACK],
  ["]",    :RBRACK],
  ["{",    :LBRACE],
  ["}",    :RBRACE],
   ["&",    :AMPERSAND],
  ["@",    :AT_SIGN]
]

# Token types that make a preceding comment significant: get_token emits a
# held-back :COMMENT token only when the token following it has one of these
# types.
@@annotated =

[
  :EXPORT,
  :FUNCTION,
  :GLOBAL,
  :PUBLIC,
  :PRIVATE
]

Instance Method Summary collapse

Constructor Details

#initialize(code, path) ⇒ `Tokenizer`

Returns a new instance of Tokenizer.

# File 'lib/nasl/tokenizer.rb', line 160

# Builds a tokenizer over one piece of source text and positions it at the
# first non-whitespace character.
#
# @param code the source text to tokenize (indexed and measured like a String)
# @param path passed through to Context together with the code
def initialize(code, path)
  # The class-wide operator tables are built on first use only.
  initialize!

  @code = code

  # A single context object is shared by every token produced from this code.
  @ctx = Context.new(code, path)

  reset
end

Instance Method Details

#consume(num = 1) ⇒ `Object`

# File 'lib/nasl/tokenizer.rb', line 173

# Advances the point by +num+ characters and refreshes the cached state that
# is derived from it (@char, @eof and @line).
#
# @param num [Integer] how many characters to move forward
# @return the new value of @line
def consume(num=1)
  # Move to the character we are now looking at.
  @point += num

  # Cache the character under the point.
  @char = @code[@point]

  # We are at end-of-file once the point is at or past the end of the code.
  @eof = @point >= @code.length

  # Cache the text from the point up to the end-of-line index reported by the
  # context.
  @line = @code[@point..@ctx.eol(@point)]
end

#die(msg) ⇒ `Object`

Raises:

(TokenException)

# File 'lib/nasl/tokenizer.rb', line 211

# Aborts tokenization with an error that carries source context.
#
# @param msg [String] the error message
# @raise [TokenException] always; its backtrace is the context produced by
#   @ctx.context for the region between the mark and the point
def die(msg)
  # By default the displayed context is every line touched by the offending
  # region: from the start of the mark's line to the end of the point's line.
  surrounding = @ctx.bol(@mark)..@ctx.eol(@point)

  # The context lines stand in for the exception's backtrace.
  raise TokenException, msg, @ctx.context(@mark..@point + 1, surrounding)
end

#get_comment ⇒ `Object`

# File 'lib/nasl/tokenizer.rb', line 315

# Reads a block of '#' comments starting at the point. Comment lines are merged
# into one token for as long as each following comment starts on the very next
# row and in the same column as the first.
#
# @return [Array] the pair [:COMMENT, text], with the lines joined by "\n"
def get_comment
  # Every line of the block must begin in this column.
  col = @ctx.col(@point)

  lines = []
  contiguous = true
  while contiguous
    start_row = @ctx.row(@point)
    text = @line[/^#.*$/]
    break if text.nil?

    lines << text
    consume(text.length)
    skip

    # Keep going only if we landed on the next row, in the same column.
    next_row = @ctx.row(@point)
    contiguous = @ctx.col(@point) == col && next_row == start_row + 1
  end

  [:COMMENT, lines.join("\n")]
end

#get_comment_c_style ⇒ `Object`

# File 'lib/nasl/tokenizer.rb', line 334

# Reads a C-style comment starting at the point: either "// ..." up to the end
# of the line, or "/* ... */", which may span several lines.
#
# @return [Array] the pair [:COMMENT, text], delimiters included
# @raise [TokenException] via die when a "/*" comment is never closed
def get_comment_c_style
  comment =
    if @code[@point+1] == '/'
      # Single-line: // comment here
      @line[/^\/\/.*$/]
    else
      # Multi-line: /* comment here */
      # Search everything from the point onwards, not just the current line.
      found = @code[@point..-1][/^\/\*.*?\*\//m]
      die("Unterminated multiline comment") if found.nil?
      found
    end

  consume(comment.length)
  skip

  [:COMMENT, comment]
end

#get_identifier ⇒ `Object`

# File 'lib/nasl/tokenizer.rb', line 221

# Reads an identifier or keyword starting at the point.
#
# @return [Array] the pair [type, text]; type is the keyword's token type when
#   the text is a key of @@keywords, and :IDENT otherwise
def get_identifier
  # Identifiers are made of letters, digits and underscores, may contain "::"
  # separators, and may begin with "::".
  ident = @line[/^(::|[_a-z])([_a-z0-9]*::[_a-z0-9]+)*[_a-z0-9]*/i]
  consume(ident.length)

  # A keyword can be the prefix of a valid identifier ("break_", for example),
  # so the whole word is scanned first and only an exact keyword match is
  # reclassified; anything else stays an identifier.
  [@@keywords.fetch(ident, :IDENT), ident]
end

#get_integer ⇒ `Object`

# File 'lib/nasl/tokenizer.rb', line 240

# Reads an integer literal (hex, octal or decimal) starting at the point.
#
# @return [Array] the pair [type, text], where type is :INT_HEX, :INT_OCT or
#   :INT_DEC
# @raise [TokenException] via die when the literal is malformed
def get_integer
  # Choose the base from the literal's prefix. Each base has a deliberately
  # loose pattern and a strict one.
  type, name, loose, strict =
    case @line
    when /^0x/i
      # Hex integers start with "0x".
      [:INT_HEX, "hex", /^0x\w+/i, /^0x[a-f0-9]+/i]
    when /^0\w+/
      # Octal integers start with "0".
      [:INT_OCT, "octal", /^0\w+/, /^0[0-7]+/]
    else
      # Anything else is a decimal integer.
      [:INT_DEC, "decimal", /^\w*/, /^[0-9]+/]
    end

  # If the loose and strict matches differ, the input contains characters
  # that do not belong to a literal of the chosen base.
  permissive = @line[loose]
  restrictive = @line[strict]
  mismatch = lambda { permissive.nil? || restrictive.nil? || permissive != restrictive }

  if mismatch.call
    # NASL interprets integers with a leading zero as octal if they only
    # contain octal digits, and treats them as decimal otherwise, so retry the
    # strict match as decimal.
    type = :INT_DEC
    restrictive = @line[/^[0-9]+/]
  end

  die("Invalid #{name} literal") if mismatch.call

  # With no disagreement left, the strict match is the literal's text.
  consume(restrictive.length)

  [type, restrictive]
end

#get_operator ⇒ `Object`

# File 'lib/nasl/tokenizer.rb', line 350

# Reads the operator starting at the point, if there is one.
#
# @return [Array, nil] the pair [type, text], or nil when no operator matches
def get_operator
  # Try a prefix of the line for every known operator length and keep the
  # table entries that exist. Each entry is [operator, type, position].
  candidates = @@operator_lengths.map { |len| @@operators[@line[0, len]] }.compact
  return nil if candidates.empty?

  # Among all matches, the entry listed earliest in the operator table wins.
  op, type = candidates.min_by { |entry| entry[2] }
  consume(op.length)

  [type, op]
end

#get_string ⇒ `Object`

# File 'lib/nasl/tokenizer.rb', line 289

# Reads a quoted string starting at the point. Single-quoted strings produce
# :DATA tokens and allow backslash escapes; double-quoted strings produce
# :STRING tokens and run up to the next double quote.
#
# @return [Array] the pair [type, contents], with the bounding quotes removed
# @raise [TokenException] via die when the closing quote is missing
def get_string
  type, pattern, complaint =
    if @char == "'"
      # A single quote may only appear inside the string after a backslash.
      [:DATA, /\A'(\\.|[^'\\])*'/m, "Unterminated single-quoted string"]
    else
      # A double quote may not appear inside the string at all.
      [:STRING, /\A"[^"]*"/m, "Unterminated double-quoted string"]
    end

  # Strings can span lines, so match against everything from the point on.
  quoted = @code[@point..-1][pattern]
  die(complaint) if quoted.nil?

  # Step over the whole literal, quotes included.
  consume(quoted.length)

  [type, quoted[1..-2]]
end

#get_token ⇒ `Object`

# File 'lib/nasl/tokenizer.rb', line 360

# Produces the next token from the code.
#
# Returns a two-element array whose second element is a Token. The first
# element is the token's type, or false for the end-of-file token.
#
# Comment tokens are filtered: a comment is emitted only when it is the first
# token seen since reset, or when the token that follows it has a type listed
# in @@annotated. In the latter case the comment is returned first and the
# following token is held in @deferred until the next call.
#
# Raises TokenException (via die) when no token can be parsed at the point.
def get_token
  # If we deferred a token, emit it now.
  unless @deferred.nil?
    token = @deferred
    @deferred = nil
    return token
  end

  # Make sure we're not at the end of the file.
  return [false, Token.new(:EOF, "$", @point...@point, @ctx)] if @eof

  # Save our starting point, which to use Emacs terminology is called the
  # 'mark'.
  @mark = @point

  # Try to parse token at the point. The first character (or a leading "::")
  # selects which reader is used; anything unrecognized is tried as an
  # operator.
  token = if @char =~ /[_a-z]/i or @line =~ /^::/
    get_identifier
  elsif @char =~ /['"]/
    get_string
  elsif @char =~ /[0-9]/
    get_integer
  elsif @char == '#'
    get_comment
  elsif (@char == '/') && ["/", "*"].include?(@code[@point+1])
    get_comment_c_style
  else
    get_operator
  end

  # Everything in the language is enumerated by the above functions, so if
  # we get here without a token parsed, the input file is invalid.
  die("Invalid character ('#@char')") if token.nil?

  # Consume all whitespace after the token, and create an object with
  # context. The token's region runs from the mark up to the point reached
  # after that whitespace has been skipped.
  skip
  token = [token.first, Token.new(*token, @mark...@point, @ctx)]

  # If a comment is the first token in a file, or is followed by certain
  # tokens, then it is considered significant. Such tokens will appear in
  # the grammar so that it can be made visible to nasldoc.
  if token.first == :COMMENT
    if @previous.nil?
      # First token since reset: return the comment as-is. A dummy entry is
      # recorded so that this comment is not treated as pending later on.
      @previous = [:DUMMY, ""]
    else
      # Hold the comment back and return whatever follows it instead; the
      # recursive call decides whether the comment gets emitted or dropped.
      @previous = token
      token = get_token
    end
  elsif !@previous.nil? && @previous.first == :COMMENT && @@annotated.include?(token.first)
    # The held-back comment precedes an annotated token: return the comment
    # now and hand out the current token on the next call.
    @deferred = token
    token = @previous
    @previous = @deferred       
  else
    # Ordinary token; any held-back comment is discarded by being overwritten.
    @previous = token
  end

  return token
end

#get_tokens ⇒ `Object`

# File 'lib/nasl/tokenizer.rb', line 420

# Tokenizes from the current position to the end of the code.
#
# @return [Array] every pair returned by get_token, in order, up to and
#   including the :EOF token
def get_tokens
  # At least one token is always fetched; stop once the newest one is EOF.
  tokens = [get_token]
  tokens << get_token until tokens.last.last.type == :EOF
  tokens
end

#initialize! ⇒ `Object`

# File 'lib/nasl/tokenizer.rb', line 144

# Builds the class-wide operator lookup tables. Only the first call does any
# work; later calls return immediately.
#
# @return [true, nil] true after building the tables, nil when already built
def initialize!
  return if @@initialized

  # Record each distinct operator length once.
  @@operator_lengths = @@operators.map { |op, _type| op.length }.uniq

  # Re-key the operator list by operator string for fast lookup, remembering
  # each operator's position in the original list.
  @@operators = @@operators.each_with_index.each_with_object({}) do |((op, type), index), table|
    table[op] = [op, type, index]
  end

  @@initialized = true
end

#reset ⇒ `Object`

# File 'lib/nasl/tokenizer.rb', line 187

# Rewinds the tokenizer to the beginning of the code.
#
# @return [self] the tokenizer, so that calls can be chained
def reset
  # Forget the token history that decides which comments are emitted.
  @previous = @deferred = nil

  # Go back to the first character, refresh the cached state for that
  # position, and step over any leading whitespace.
  @point = 0
  consume(0)
  skip

  self
end

#skip ⇒ `Object`

# File 'lib/nasl/tokenizer.rb', line 203

# Moves the point past any whitespace at the start of the current line,
# repeating until the text at the point no longer begins with whitespace.
#
# @return [nil]
def skip
  while (blank = @line[/^\s+/])
    consume(blank.length)
  end
end

Class: Nasl::Tokenizer

Constant Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(code, path) ⇒ Tokenizer

Instance Method Details

#consume(num = 1) ⇒ Object

#die(msg) ⇒ Object

#get_comment ⇒ Object

#get_comment_c_style ⇒ Object

#get_identifier ⇒ Object

#get_integer ⇒ Object

#get_operator ⇒ Object

#get_string ⇒ Object

#get_token ⇒ Object

#get_tokens ⇒ Object

#initialize! ⇒ Object

#reset ⇒ Object

#skip ⇒ Object