Class: Puppet::Pops::Parser::Lexer

Inherits:

Object

Object
Puppet::Pops::Parser::Lexer

show all

Extended by:: Forwardable

Defined in:: lib/puppet/pops/parser/lexer.rb

Defined Under Namespace

Modules: Contextual Classes: Locator, Token, TokenList

Constant Summary collapse

TOKENS =

TokenList.new

DQ_initial_token_types =

{'$' => :DQPRE,'"' => :STRING}

DQ_continuation_token_types =

{'$' => :DQMID,'"' => :DQPOST}

KEYWORDS =

TokenList.new

@@pairs =

{
  "{"   => "}",
  "("   => ")",
  "["   => "]",
  "<|"  => "|>",
  "<<|" => "|>>",
  "|"   => "|"
}

Instance Attribute Summary collapse

#file ⇒ Object
#indefine ⇒ Object (also: #indefine?)
#lexing_context ⇒ Object readonly
#locator ⇒ Object readonly
#token_queue ⇒ Object readonly

Instance Method Summary collapse

#assert_numeric(value) ⇒ Object
#clear ⇒ Object
#expected ⇒ Object
#find_regex_token ⇒ Object

Find the next token that matches a regex.
#find_string_token ⇒ Object
#find_token ⇒ Object

Find the next token, returning the string and the token.
#followed_by ⇒ Object

Returns “<eof>” if at end of input, else the following 5 characters with \n, \r and \t escaped.
#format_quote(q) ⇒ Object
#fullscan ⇒ Object

Scans the whole file; basically just used for testing.
#init_multibyte ⇒ Object

Returns true if ruby version >= 1.9.3 since regexp supports multi-byte matches and expanded character categories like [[:blank:]].
#initialize ⇒ Lexer constructor

A new instance of Lexer.
#initvars ⇒ Object
#lex_error(msg) ⇒ Object
#line ⇒ Object

Returns the line number (starting from 1) for the current position in the scanned text (at the end of the last produced, but not necessarily consumed, token).
#match?(r) ⇒ Boolean
#multibyte? ⇒ Boolean
#munge_token(token, value) ⇒ Object

Make any necessary changes to the token and/or value.
#namespace ⇒ Object

Collect the current namespace.
#pos ⇒ Object
#position_in_source ⇒ Object

Returns a hash with the current position in source based on the current lexing context.
#positioned_message(msg) ⇒ Object

Formats given message by appending file, line and position if available.
#replace_false_start_with_text(appendix) ⇒ Object
#scan {|[false,false]| ... } ⇒ Object

this is the heart of the lexer.
#skip ⇒ Object

Skip any skipchars in our remaining string.
#slurpstring(terminators, escapes = %w{ \\ $ ' " r n t s }+["\n"], ignore_invalid_escapes = false) ⇒ Object

we’ve encountered the start of a string…
#string=(string) ⇒ Object

just parse a string, not a whole file.
#tokenize_interpolated_string(token_type, preamble = '') ⇒ Object
#warn_if_variable_has_hyphen(var_name) ⇒ Object

Constructor Details

#initialize ⇒ `Lexer`

Returns a new instance of Lexer.

# File 'lib/puppet/pops/parser/lexer.rb', line 464

# Creates a lexer with freshly reset state and no input attached (initvars sets @scanner to nil).
def initialize
  # Must be set before initvars runs: initvars calls multibyte? to choose the skip pattern.
  @multibyte = init_multibyte
  initvars
end

Instance Attribute Details

#file ⇒ `Object`



15
16
17

# File 'lib/puppet/pops/parser/lexer.rb', line 15

# Returns the value of @file. initvars resets it to nil.
def file
  @file
end

#indefine ⇒ `Object` Also known as: indefine?



19
20
21

# File 'lib/puppet/pops/parser/lexer.rb', line 19

# Returns @indefine: false after initvars, otherwise the value scan recorded for the
# token that followed a DEFINE token.
def indefine
  @indefine
end

#lexing_context ⇒ `Object` (readonly)



15
16
17

# File 'lib/puppet/pops/parser/lexer.rb', line 15

# Returns the hash of lexing state created by initvars
# (:after, :start_of_line, :offset, :end_offset, :brace_count, :interpolation_stack).
def lexing_context
  @lexing_context
end

#locator ⇒ `Object` (readonly)



17
18
19

# File 'lib/puppet/pops/parser/lexer.rb', line 17

# Returns @locator, the Locator that string= builds for the text being lexed.
def locator
  @locator
end

#token_queue ⇒ `Object` (readonly)



15
16
17

# File 'lib/puppet/pops/parser/lexer.rb', line 15

# Returns the array of queued [token, value] pairs. initvars resets it to an empty array.
def token_queue
  @token_queue
end

Instance Method Details

#assert_numeric(value) ⇒ `Object`

# File 'lib/puppet/pops/parser/lexer.rb', line 469

# Verifies that +value+ is a well-formed numeric literal and reports a lex error otherwise.
#
# Classification follows the leading characters:
# * "0x"/"0X" prefix              - hexadecimal, one or more hex digits must follow
# * "0" followed by anything but "." - octal, only the digits 0-7 are allowed
# * everything else               - decimal with optional fraction and optional
#                                   exponent (which may carry a minus sign)
#
# @param value [String] the candidate numeric text
# @return [nil] when the value is valid
# @raise whatever lex_error raises, with a positioned message, when the value is malformed
def assert_numeric(value)
  if value =~ /\A0[xX]/
    lex_error(positioned_message("Not a valid hex number #{value}")) unless value =~ /\A0[xX][0-9A-Fa-f]+\z/
  elsif value =~ /\A0[^.]/
    lex_error(positioned_message("Not a valid octal number #{value}")) unless value =~ /\A0[0-7]+\z/
  else
    # The pattern must be anchored at both ends; unanchored it accepts any value that
    # merely contains a digit (for example "1abc" or "12.3.4").
    lex_error(positioned_message("Not a valid decimal number #{value}")) unless value =~ /\A0?\d+(?:\.\d+)?(?:[eE]-?\d+)?\z/
  end
end

#clear ⇒ `Object`



396
397
398

# File 'lib/puppet/pops/parser/lexer.rb', line 396

# Resets the lexer to its initial state by re-running initvars
# (which also drops the current scanner and file).
def clear
  initvars
end

#expected ⇒ `Object`

# File 'lib/puppet/pops/parser/lexer.rb', line 400

# Looks up the token for the most recently pushed entry of @expected
# (scan pushes the closing counterpart of every opening delimiter it sees).
#
# @return [Object, nil] the TOKENS entry for the pending delimiter, or nil when nothing is pending
# Calls lex_error when the pending entry is unknown to TOKENS.
def expected
  return nil if @expected.empty?

  pending = @expected.last
  found = TOKENS.lookup(pending)
  return found if found

  lex_error("Internal Lexer Error: Could not find expected token #{pending}")
end

#find_regex_token ⇒ `Object`

Find the next token that matches a regex. We look for these first.

# File 'lib/puppet/pops/parser/lexer.rb', line 440

# Finds the regex-based token with the longest match at the current scanner position.
#
# Every token in TOKENS.regex_tokens is tried; a token only competes when its regex
# matches here and it is acceptable in the current lexing context. On equal lengths
# the earlier token wins, and zero-length matches never win.
# (A first-character pre-filter was tried by the original author and found to be
# slightly slower and more complicated.)
#
# @return [Array, nil] [token, matched_text] after consuming the text, or nil when nothing matched
def find_regex_token
  winner = nil
  winner_length = 0

  TOKENS.regex_tokens.each do |candidate|
    matched = @scanner.match?(candidate.regex)
    next unless matched && candidate.acceptable?(lexing_context)
    next unless matched > winner_length

    winner = candidate
    winner_length = matched
  end

  return nil unless winner

  [winner, @scanner.scan(winner.regex)]
end

#find_string_token ⇒ `Object`

# File 'lib/puppet/pops/parser/lexer.rb', line 428

# Finds a fixed-string token at the current scanner position.
#
# The longest string token is three characters, so the next three characters are
# peeked and looked up at widths 3, 2 and 1 in turn; the first hit wins. This caps
# the work at three table lookups instead of one regex scan per string token.
#
# @return [Array] [token, matched_text] with the text consumed, or [nil, nil] when nothing matched
def find_string_token
  lookahead = @scanner.peek(3)
  hit = nil
  [3, 2, 1].each do |width|
    hit = TOKENS.lookup(lookahead[0, width])
    break if hit
  end
  [hit, hit && @scanner.scan(hit.regex)]
end

#find_token ⇒ `Object`

Find the next token, returning the string and the token.



460
461
462

# File 'lib/puppet/pops/parser/lexer.rb', line 460

# Produces the next token: an already queued one if shift_token returns one,
# otherwise the best regex token, otherwise whatever find_string_token returns.
#
# @return [Array] a [token, value] pair (find_string_token yields [nil, nil] when nothing matches)
def find_token
  queued = shift_token
  return queued if queued

  find_regex_token || find_string_token
end

#followed_by ⇒ `Object`

Returns “<eof>” if at end of input, else the following 5 characters with \n, \r and \t escaped.

# File 'lib/puppet/pops/parser/lexer.rb', line 699

# Describes what comes next in the input, for use in error messages.
#
# @return [String] "<eof>" at end of input, otherwise the next (up to) five characters
#   followed by "...", with tab, newline and carriage return shown as \t, \n and \r
def followed_by
  return "<eof>" if @scanner.eos?

  preview = @scanner.rest[0,5] + "..."
  preview.gsub("\t", '\t').gsub("\n", '\n').gsub("\r", '\r')
end

#format_quote(q) ⇒ `Object`

# File 'lib/puppet/pops/parser/lexer.rb', line 708

# Wraps a quote character for display in a message: a single quote is shown inside
# double quotes, anything else inside single quotes.
#
# @param q [Object] the quote character (interpolated into the result)
# @return [String] the quoted representation
def format_quote(q)
  q == "'" ? '"\'"' : "'#{q}'"
end

#fullscan ⇒ `Object`

Scans the whole file; basically just used for testing.

# File 'lib/puppet/pops/parser/lexer.rb', line 408

# Runs scan to completion and collects everything it yields.
#
# @indefine is forced back to false after every token so that definition-nesting
# problems are ignored while collecting.
#
# @return [Array<Array>] the yielded [token, value] pairs, including the final [false, false]
def fullscan
  collected = []
  scan do |token, str|
    @indefine = false
    collected << [token, str]
  end
  collected
end

#init_multibyte ⇒ `Object`

Returns true if ruby version >= 1.9.3 since regexp supports multi-byte matches and expanded character categories like [[:blank:]].

This implementation will fail if there are more than 255 minor or micro versions of ruby

# File 'lib/puppet/pops/parser/lexer.rb', line 484

# Reports whether the running Ruby is version 1.9.3 or newer.
#
# The version components are compared numerically as an array, so there is no
# limit on the size of any component and a version string with fewer than three
# components is handled as well.
#
# @return [Boolean] true when RUBY_VERSION >= 1.9.3, false otherwise
def init_multibyte
  version = RUBY_VERSION.split(".").map { |part| part.to_i }
  (version <=> [1, 9, 3]) >= 0
end

#initvars ⇒ `Object`

# File 'lib/puppet/pops/parser/lexer.rb', line 494

# Resets all per-run lexer state: scanner, file, previous token, name stack,
# token queue, definition flag, expected-delimiter stack and the lexing context.
def initvars
  @scanner = nil
  @file = nil
  @previous_token = nil

  # In Ruby "\n" =~ /\s/ is true, so \s cannot be used here: the skip pattern
  # covers blanks and CR but deliberately not newlines.
  @skip = multibyte? ? %r{[[:blank:]\r]+} : %r{[ \t\r]+}

  @namestack = []
  @token_queue = []
  @expected = []
  @indefine = false

  context = {}
  context[:after] = nil
  context[:start_of_line] = true
  context[:offset] = 0              # byte offset before where token starts
  context[:end_offset] = 0          # byte offset after scanned token
  context[:brace_count] = 0         # nested depth of braces
  context[:interpolation_stack] = [] # matching interpolation brace level
  @lexing_context = context
end

#lex_error(msg) ⇒ `Object`

Raises:

(Puppet::LexError)



22
23
24

# File 'lib/puppet/pops/parser/lexer.rb', line 22

# Aborts lexing by raising a Puppet::LexError carrying the given message.
#
# @param msg [String] the error text
# @raise [Puppet::LexError] always
def lex_error(msg)
  raise Puppet::LexError, msg
end

#line ⇒ `Object`

Returns the line number (starting from 1) for the current position in the scanned text (at the end of the last produced, but not necessarily consumed, token).

# File 'lib/puppet/pops/parser/lexer.rb', line 782

# Line number of the current end offset in the scanned text.
#
# @return [Object] what the locator reports for lexing_context[:end_offset], or 1 when
#   either the lexing context or the locator is missing
def line
  if lexing_context && locator
    locator.line_for_offset(lexing_context[:end_offset])
  else
    1
  end
end

#match?(r) ⇒ `Boolean`

Returns:

(Boolean)



656
657
658

# File 'lib/puppet/pops/parser/lexer.rb', line 656

# Tests a pattern against the input at the current scanner position without consuming it.
#
# @param r [Regexp] the pattern to test
# @return [Integer, nil] the result of StringScanner#match? (match length, or nil)
def match?(r)
  @scanner.match?(r)
end

#multibyte? ⇒ `Boolean`

Returns:

(Boolean)



490
491
492

# File 'lib/puppet/pops/parser/lexer.rb', line 490

# Returns @multibyte, the flag computed once by init_multibyte in the constructor.
# initvars uses it to pick the skip pattern and string= passes it on to the Locator.
def multibyte?
  @multibyte
end

#munge_token(token, value) ⇒ `Object`

Make any necessary changes to the token and/or value.

# File 'lib/puppet/pops/parser/lexer.rb', line 524

# Applies per-token post-processing and attaches source position to the value.
#
# @param token [Object] the matched token
# @param value [Object] the matched text, or a Hash when the token was already munged
# @return [Array, nil] [token, value_hash], or nil when the token is to be skipped
#   (or a conversion produced no token)
def munge_token(token, value)
  # Already munged (converted and positioned): pass through untouched.
  return [token, value] if value.is_a?(Hash)

  skip if token.skip_text
  return nil if token.skip

  if token.respond_to?(:convert)
    token, value = token.convert(self, value)
  end
  return nil if !token || token.skip

  # The conversion may have done the positioning itself.
  return [token, value] if value.is_a?(Hash)

  located = position_in_source
  located[:value] = value
  [token, located]
end

#namespace ⇒ `Object`

Collect the current namespace.



573
574
575

# File 'lib/puppet/pops/parser/lexer.rb', line 573

# Returns the entries of @namestack joined with "::" (an empty string when the stack is empty).
def namespace
  @namestack.join("::")
end

#pos ⇒ `Object`



560
561
562

# File 'lib/puppet/pops/parser/lexer.rb', line 560

# Position on its line of the current token start, as reported by the locator
# for lexing_context[:offset].
def pos
  token_start = lexing_context[:offset]
  @locator.pos_on_line(token_start)
end

#position_in_source ⇒ `Object`

Returns a hash with the current position in source based on the current lexing context

# File 'lib/puppet/pops/parser/lexer.rb', line 551

# Builds a position hash for the span recorded in the lexing context.
#
# @return [Hash] :line, :pos, :offset and :length, all obtained from the locator for
#   lexing_context[:offset] (and lexing_context[:end_offset] for the length)
def position_in_source
  from = lexing_context[:offset]
  upto = lexing_context[:end_offset]

  column      = @locator.pos_on_line(from)
  char_start  = @locator.char_offset(from)
  char_count  = @locator.char_length(from, upto)
  line_number = @locator.line_for_offset(from)

  { :line => line_number, :pos => column, :offset => char_start, :length => char_count }
end

#positioned_message(msg) ⇒ `Object`

Formats given message by appending file, line and position if available.

# File 'lib/puppet/pops/parser/lexer.rb', line 691

# Decorates a message with location details.
#
# "in file <file>" is appended when a file is set and "at line <line>:<pos>" when
# line returns a truthy value; the parts are joined with single spaces.
#
# @param msg [String] the base message
# @return [String] the message followed by the available location parts
def positioned_message(msg)
  location = []
  location.push("in file #{file}") if file
  location.push("at line #{line}:#{pos}") if line
  ([msg] + location).join(" ")
end

#replace_false_start_with_text(appendix) ⇒ `Object`

# File 'lib/puppet/pops/parser/lexer.rb', line 756

# Takes back the most recently queued token and returns its text with +appendix+ added.
#
# The last entry is removed from token_queue; its final element is either a value
# hash (the text is under :value) or the text itself.
#
# @param appendix [String] text to append
# @return [String] the removed token's text followed by +appendix+
def replace_false_start_with_text(appendix)
  payload = token_queue.pop.last
  text = payload.is_a?(Hash) ? payload[:value] : payload
  text + appendix
end

#scan {|[false,false]| ... } ⇒ `Object`

this is the heart of the lexer

Yields:

([false,false])

# File 'lib/puppet/pops/parser/lexer.rb', line 579

# The main lexing loop.
#
# Repeatedly finds the next token, records its offsets in lexing_context, munges it,
# and yields [token_name, value_hash] for every token that is not skipped. When the
# token queue is empty and the scanner is at end of input, [false, false] is yielded
# once to signal completion.
#
# Calls lex_error when no scanner has been set or when nothing matches the input.
# Raises Puppet::ParseError when a definition is started while @indefine is set.
def scan
  #Puppet.debug("entering scan")
  lex_error "Internal Error: No string or file given to lexer to process." unless @scanner

  # Skip any initial whitespace.
  skip

  until token_queue.empty? and @scanner.eos? do
    # Byte positions before and after the raw match.
    offset = @scanner.pos
    matched_token, value = find_token
    end_offset = @scanner.pos

    # error out if we didn't match anything at all
    lex_error "Could not match #{@scanner.rest[/^(\S+|\s+|.*)/]}" unless matched_token

    newline = matched_token.name == :RETURN

    lexing_context[:start_of_line] = newline
    lexing_context[:offset] = offset
    lexing_context[:end_offset] = end_offset

    final_token, token_value = munge_token(matched_token, value)
    # update end position since munging may have moved the end offset
    lexing_context[:end_offset] = @scanner.pos

    # munge_token returns nil for tokens that are to be skipped.
    unless final_token
      skip
      next
    end

    # :after is left unchanged by newline tokens.
    lexing_context[:after] = final_token.name unless newline
    # Remember the brace depth at which an interpolated string started, and forget
    # it again when the string ends.
    if final_token.name == :DQPRE
      lexing_context[:interpolation_stack] << lexing_context[:brace_count]
    elsif final_token.name == :DQPOST
      lexing_context[:interpolation_stack].pop
    end

    value = token_value[:value]

    # Track delimiter pairs: push the closing counterpart when an opener is seen,
    # pop it when the value equals the closer on top of the stack. Quote tokens are excluded.
    if match = @@pairs[value] and final_token.name != :DQUOTE and final_token.name != :SQUOTE
      @expected << match
    elsif exp = @expected[-1] and exp == value and final_token.name != :DQUOTE and final_token.name != :SQUOTE
      @expected.pop
    end

    yield [final_token.name, token_value]

    if @previous_token
      # NOTE(review): no namestack method appears among this class's documented
      # methods (only the @namestack array) - confirm it is defined elsewhere.
      namestack(value) if @previous_token.name == :CLASS and value != '{'

      if @previous_token.name == :DEFINE
        if indefine?
          msg = "Cannot nest definition #{value} inside #{@indefine}"
          # Reset the flag before raising.
          self.indefine = false
          raise Puppet::ParseError, msg
        end

        @indefine = value
      end
    end
    @previous_token = final_token
    skip
  end
  # Cannot reset @scanner to nil here - it is needed to answer questions about context after
  # completed parsing.
  # Seems meaningless to do this. Everything will be gc anyway.
  #@scanner = nil

  # This indicates that we're done parsing.
  yield [false,false]
end

#skip ⇒ `Object`

Skip any skipchars in our remaining string.



652
653
654

# File 'lib/puppet/pops/parser/lexer.rb', line 652

# Advances the scanner past any text matching @skip at the current position.
# initvars defines @skip as runs of blanks and CR (newlines are not included).
# Returns whatever the scanner's skip call returns.
def skip
  @scanner.skip(@skip)
end

#slurpstring(terminators, escapes = %w{ \\ $ ' " r n t s }+["\n"], ignore_invalid_escapes = false) ⇒ `Object`

we’ve encountered the start of a string… slurp in the rest of the string and return it

# File 'lib/puppet/pops/parser/lexer.rb', line 666

# Reads the remainder of a string whose opening has just been scanned.
#
# The input is consumed up to the first terminator character that is not escaped
# by a backslash, and backslash escapes in the consumed text are processed.
#
# @param terminators [String] characters that end the string (placed in a regex character class)
# @param escapes [Array<String>] characters that may follow a backslash; r, n, t and s
#   become CR, LF, TAB and space, an escaped newline is removed, others stand for themselves
# @param ignore_invalid_escapes [Boolean] when false, a warning is issued for each
#   unrecognized escape (which is kept verbatim in either case)
# @return [Array<String>] [text_without_terminator, terminator]
# Calls lex_error when no terminator is found.
def slurpstring(terminators,escapes=%w{ \\  $ ' " r n t s }+["\n"],ignore_invalid_escapes=false)
  # Capture the opening text first; scan_until below replaces the scanner's last match.
  opening = @scanner.matched

  # Next terminator not preceded by a backslash; the caret lets an empty string match.
  closing_pattern = /([^\\]|^|[^\\])([\\]{2})*[#{terminators}]/
  str = @scanner.scan_until(closing_pattern)
  str ||= lex_error(positioned_message("Unclosed quote after #{format_quote(opening)} followed by '#{followed_by}'"))

  translations = { 'r' => "\r", 'n' => "\n", 't' => "\t", 's' => " ", "\n" => '' }
  str.gsub!(/\\(.)/m) do
    ch = $1
    if escapes.include?(ch)
      translations.fetch(ch, ch)
    else
      Puppet.warning(positioned_message("Unrecognized escape sequence '\\#{ch}'")) unless ignore_invalid_escapes
      "\\#{ch}"
    end
  end

  [str[0..-2], str[-1,1]]
end

#string=(string) ⇒ `Object`

just parse a string, not a whole file

# File 'lib/puppet/pops/parser/lexer.rb', line 767

# Sets the text to lex: creates a StringScanner over it and a Locator for the
# same text (the Locator is told whether multibyte support is enabled).
def string=(string)
  @scanner = StringScanner.new(string)
  @locator = Locator.new(string, multibyte?)
end

#tokenize_interpolated_string(token_type, preamble = '') ⇒ `Object`

# File 'lib/puppet/pops/parser/lexer.rb', line 716

# Lexes one segment of a double-quoted string and queues the resulting tokens.
#
# A segment runs up to the closing quote or the next "$". For a "$" that is not
# followed by "{", a variable name is scanned and queued, and lexing continues
# recursively with the continuation token types. If no variable name follows,
# the "$" was a false start: the queued segment is taken back and re-lexed with
# its text plus the "$" as preamble.
#
# @param token_type [Hash] maps the terminator character ('$' or '"') to the name of the
#   token to emit for this segment (see DQ_initial_token_types / DQ_continuation_token_types)
# @param preamble [String] text to prepend to the segment's value
# @return [Array] the first queued [token, value] pair (shifted off token_queue)
def tokenize_interpolated_string(token_type,preamble='')
  # Expecting a (possibly empty) stretch of text terminated by end of string ", a variable $, or expression ${
  # The length of this part includes the start and terminating characters.
  value,terminator = slurpstring('"$')

  # Advanced after '{' if this is in expression ${} interpolation
  braced = terminator == '$' && @scanner.scan(/\{/)
  # make offset to end_offset be the length of the pre expression string including its start and terminating chars
  lexing_context[:end_offset] = @scanner.pos

  token_queue << [TOKENS[token_type[terminator]],position_in_source().merge!({:value => preamble+value})]
  # Chosen before the early return below, although only the "$name" path uses it.
  variable_regex = if Puppet[:allow_variables_with_dashes]
    TOKENS[:VARIABLE_WITH_DASH].regex
  else
    TOKENS[:VARIABLE].regex
  end
  # End of string, or start of a ${ expression: hand out the first queued token.
  if terminator != '$' or braced
    return token_queue.shift
  end

  tmp_offset = @scanner.pos
  if var_name = @scanner.scan(variable_regex)
    lexing_context[:offset] = tmp_offset
    lexing_context[:end_offset] = @scanner.pos
    warn_if_variable_has_hyphen(var_name)
    # If the varname after ${ is followed by (, it is a function call, and not a variable
    # reference.
    #
    # NOTE(review): braced is always falsy at this point because the braced case
    # returned above, so the NAME branch looks unreachable - confirm intent.
    if braced && @scanner.match?(%r{[ \t\r]*\(})
      token_queue << [TOKENS[:NAME], position_in_source().merge!({:value=>var_name})]
    else
      token_queue << [TOKENS[:VARIABLE],position_in_source().merge!({:value=>var_name})]
    end
    lexing_context[:offset] = @scanner.pos
    tokenize_interpolated_string(DQ_continuation_token_types)
  else
    # False start: "$" was not followed by a variable name, so treat it as text.
    tokenize_interpolated_string(token_type, replace_false_start_with_text(terminator))
  end
end

#warn_if_variable_has_hyphen(var_name) ⇒ `Object`

# File 'lib/puppet/pops/parser/lexer.rb', line 772

# Issues a deprecation warning when a variable name contains a hyphen.
#
# The warning names the current file (or '<string>' when no file is set) and line.
#
# @param var_name [String] the variable name to check
# @return [Object, nil] the result of Puppet.deprecation_warning, or nil when there is no hyphen
def warn_if_variable_has_hyphen(var_name)
  return unless var_name.include?('-')

  Puppet.deprecation_warning("Using `-` in variable names is deprecated at #{file || '<string>'}:#{line}. See http://links.puppetlabs.com/puppet-hyphenated-variable-deprecation")
end

Class: Puppet::Pops::Parser::Lexer

Defined Under Namespace

Constant Summary collapse

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ Lexer

Instance Attribute Details

#file ⇒ Object

#indefine ⇒ Object Also known as: indefine?

#lexing_context ⇒ Object (readonly)

#locator ⇒ Object (readonly)

#token_queue ⇒ Object (readonly)

Instance Method Details

#assert_numeric(value) ⇒ Object

#clear ⇒ Object

#expected ⇒ Object

#find_regex_token ⇒ Object

#find_string_token ⇒ Object

#find_token ⇒ Object

#followed_by ⇒ Object

#format_quote(q) ⇒ Object

#fullscan ⇒ Object

#init_multibyte ⇒ Object

#initvars ⇒ Object

#lex_error(msg) ⇒ Object

#line ⇒ Object

#match?(r) ⇒ Boolean

#multibyte? ⇒ Boolean

#munge_token(token, value) ⇒ Object

#namespace ⇒ Object

#pos ⇒ Object

#position_in_source ⇒ Object

#positioned_message(msg) ⇒ Object

#replace_false_start_with_text(appendix) ⇒ Object

#scan {|[false,false]| ... } ⇒ Object

#skip ⇒ Object

#slurpstring(terminators, escapes = %w{ \\ $ ' " r n t s }+["\n"], ignore_invalid_escapes = false) ⇒ Object

#string=(string) ⇒ Object

#tokenize_interpolated_string(token_type, preamble = '') ⇒ Object

#warn_if_variable_has_hyphen(var_name) ⇒ Object