Class: Puppet::Pops::Parser::Lexer

Inherits:

Object

Object
Puppet::Pops::Parser::Lexer

show all

Extended by:: Forwardable

Defined in:: lib/puppet/pops/parser/lexer.rb

Defined Under Namespace

Modules: Contextual. Classes: Token, TokenList

Constant Summary collapse

TOKENS =

TokenList.new

DQ_initial_token_types =

{'$' => :DQPRE,'"' => :STRING}

DQ_continuation_token_types =

{'$' => :DQMID,'"' => :DQPOST}

KEYWORDS =

TokenList.new

MULTIBYTE =

Puppet::Pops::Parser::Locator::MULTIBYTE

SKIPPATTERN =

MULTIBYTE ? %r{[[:blank:]\r]+} : %r{[ \t\r]+}

LBRACE_CHAR =

'{'

@@pairs =

{
  "{"   => "}",
  "("   => ")",
  "["   => "]",
  "<|"  => "|>",
  "<<|" => "|>>",
  "|"   => "|"
}

Instance Attribute Summary collapse

#file ⇒ Object
#indefine ⇒ Object (also: #indefine?)
#lexing_context ⇒ Object readonly
#locator ⇒ Object readonly
#token_queue ⇒ Object readonly

Instance Method Summary collapse

#assert_numeric(value) ⇒ Object
#clear ⇒ Object
#expected ⇒ Object
#find_regex_token ⇒ Object

Find the next token that matches a regex.
#find_string_token ⇒ Object
#find_token ⇒ Object

Find the next token, returning the string and the token.
#followed_by ⇒ Object

Returns “<eof>” if at end of input, else the following 5 characters with \n, \r and \t escaped.
#format_quote(q) ⇒ Object
#fullscan ⇒ Object

Scans the whole file; basically just used for testing.
#initialize ⇒ Lexer constructor

A new instance of Lexer.
#initvars ⇒ Object
#lex_error(msg) ⇒ Object
#line ⇒ Object

Returns the line number (starting from 1) for the current position in the scanned text (at the end of the last produced, but not necessarily consumed, token).
#match?(r) ⇒ Boolean
#munge_token(token, value) ⇒ Object

Make any necessary changes to the token and/or value.
#namespace ⇒ Object

Collect the current namespace.
#pos ⇒ Object
#positioned_message(msg) ⇒ Object

Formats given message by appending file, line and position if available.
#positioned_value(value) ⇒ Object

Returns a hash with the current position in source based on the current lexing context.
#replace_false_start_with_text(appendix) ⇒ Object
#scan {|[false,false]| ... } ⇒ Object

this is the heart of the lexer.
#slurpstring(terminators, escapes = %w{ \\ $ ' " r n t s }+["\n"], ignore_invalid_escapes = false) ⇒ Object

we’ve encountered the start of a string…
#string=(string, path = '') ⇒ Object

just parse a string, not a whole file.
#tokenize_interpolated_string(token_type, preamble = '') ⇒ Object
#warn_if_variable_has_hyphen(var_name) ⇒ Object

Constructor Details

#initialize ⇒ `Lexer`

Returns a new instance of Lexer.



453
454
455

# File 'lib/puppet/pops/parser/lexer.rb', line 453

# Creates a lexer with all scanning state reset; the work is delegated to
# #initvars. No input is attached here (#initvars sets @scanner to nil).
def initialize
  initvars
end

Instance Attribute Details

#file ⇒ `Object`



15
16
17

# File 'lib/puppet/pops/parser/lexer.rb', line 15

# Returns the current value of @file. #initvars resets it to nil, and
# #positioned_message includes it in messages when it is set.
def file
  @file
end

#indefine ⇒ `Object` Also known as: indefine?



19
20
21

# File 'lib/puppet/pops/parser/lexer.rb', line 19

# Returns the current value of @indefine. #initvars sets it to false; #scan
# stores the value of the token that follows a :DEFINE token here and uses it
# to reject nested definitions.
def indefine
  @indefine
end

#lexing_context ⇒ `Object` (readonly)



15
16
17

# File 'lib/puppet/pops/parser/lexer.rb', line 15

# Returns the lexing context hash built by #initvars. Its keys are :after,
# :start_of_line, :offset, :end_offset, :brace_count and :interpolation_stack.
def lexing_context
  @lexing_context
end

#locator ⇒ `Object` (readonly)



17
18
19

# File 'lib/puppet/pops/parser/lexer.rb', line 17

# Returns the locator assigned by #string=. It is nil until #string= has been
# called (which is why #line checks for it before using it).
def locator
  @locator
end

#token_queue ⇒ `Object` (readonly)



15
16
17

# File 'lib/puppet/pops/parser/lexer.rb', line 15

# Returns the array of queued [token, value] pairs. #initvars starts it empty
# and #tokenize_interpolated_string appends to it.
def token_queue
  @token_queue
end

Instance Method Details

#assert_numeric(value) ⇒ `Object`

# File 'lib/puppet/pops/parser/lexer.rb', line 457

# Validates that +value+ is a well-formed hexadecimal, octal or decimal
# numeric literal and reports a lexer error otherwise.
#
# The whole string must match: all patterns are anchored with \A and \z.
# (The decimal check used to be unanchored, so any text that merely
# contained a digit, such as "12abc", was accepted.)
#
# @param value [String] the candidate numeric text
# @return [nil] when the value is valid
# @raise whatever #lex_error raises, with a positioned
#   "Not a valid hex/octal/decimal number" message
def assert_numeric(value)
  if value =~ /\A0[xX]/
    # Leading 0x / 0X: must be followed by one or more hex digits only.
    lex_error(positioned_message("Not a valid hex number #{value}")) unless value =~ /\A0[xX][0-9A-Fa-f]+\z/
  elsif value =~ /\A0[^.]/
    # Leading 0 not followed by a decimal point: treated as octal.
    lex_error(positioned_message("Not a valid octal number #{value}")) unless value =~ /\A0[0-7]+\z/
  else
    # Integer or float, with an optional (possibly negative) exponent.
    lex_error(positioned_message("Not a valid decimal number #{value}")) unless value =~ /\A0?\d+(?:\.\d+)?(?:[eE]-?\d+)?\z/
  end
end

#clear ⇒ `Object`



376
377
378

# File 'lib/puppet/pops/parser/lexer.rb', line 376

# Resets the lexer state by delegating to #initvars (which, among other
# things, sets @scanner and @file back to nil).
def clear
  initvars
end

#expected ⇒ `Object`

# File 'lib/puppet/pops/parser/lexer.rb', line 380

# Looks up the token for the most recently pushed entry of @expected.
#
# @return [Object, nil] nil when nothing is expected, otherwise whatever
#   TOKENS.lookup returns for the last expected entry
# @raise whatever #lex_error raises when TOKENS has no such entry
def expected
  return nil if @expected.empty?

  closer = @expected.last
  found = TOKENS.lookup(closer)
  return found if found

  lex_error "Internal Lexer Error: Could not find expected token #{closer}"
end

#find_regex_token ⇒ `Object`

Find the next token that matches a regex. We look for these first.

# File 'lib/puppet/pops/parser/lexer.rb', line 424

# Finds the regex token with the longest match at the scanner position.
#
# Every entry of TOKENS.regex_tokens is tried. A candidate counts only if
# its regex matches at the current position and it reports itself
# acceptable for the current lexing context. Ties keep the earliest
# candidate, and zero-length matches never win.
#
# @return [Array, nil] [token, matched_text] after consuming the text, or
#   nil when no candidate qualifies
def find_regex_token
  context = @lexing_context
  scanner = @scanner
  winner = nil
  longest = 0

  # Optimizing on the first character was tried; it had a slightly negative
  # effect and was a good bit more complicated.
  TOKENS.regex_tokens.each do |candidate|
    matched = scanner.match?(candidate.regex)
    next unless matched && candidate.acceptable?(context)
    next unless matched > longest

    longest = matched
    winner = candidate
  end

  return nil unless winner

  [winner, scanner.scan(winner.regex)]
end

#find_string_token ⇒ `Object`

# File 'lib/puppet/pops/parser/lexer.rb', line 408

# Finds a fixed-string token at the scanner position.
#
# The longest string token is three characters, so the next three
# characters are peeked and looked up in TOKENS by decreasing prefix length
# (3, 2, 1). That bounds the work at three hash lookups instead of one
# regex scan per string token.
#
# @return [Array] [token, matched_text] after consuming the text, or
#   [nil, nil] when no prefix is a known token
def find_string_token
  scanner = @scanner
  lookahead = scanner.peek(3)

  found = nil
  [3, 2, 1].each do |width|
    found = TOKENS.lookup(lookahead[0, width])
    break if found
  end
  return [nil, nil] unless found

  [found, scanner.scan(found.regex)]
end

#find_token ⇒ `Object`

Find the next token, returning the string and the token.



446
447
448

# File 'lib/puppet/pops/parser/lexer.rb', line 446

# Finds the next token, trying the sources in priority order: shift_token
# first, then regex tokens, then fixed-string tokens.
#
# @return [Object] the first truthy result, or whatever #find_string_token
#   returns when the first two yield nothing
def find_token
  queued = shift_token
  return queued if queued

  by_regex = find_regex_token
  return by_regex if by_regex

  find_string_token
end

#followed_by ⇒ `Object`

Returns “<eof>” if at end of input, else the following 5 characters with \n, \r and \t escaped.

# File 'lib/puppet/pops/parser/lexer.rb', line 665

# Describes what follows the scanner position, for use in error messages.
#
# @return [String] "<eof>" at end of input; otherwise up to five characters
#   of the remaining input followed by "...", with tab, newline and carriage
#   return shown as the two-character sequences \t, \n and \r
def followed_by
  return "<eof>" if @scanner.eos?

  preview = "#{@scanner.rest[0, 5]}..."
  preview.gsub("\t", '\t').gsub("\n", '\n').gsub("\r", '\r')
end

#format_quote(q) ⇒ `Object`

# File 'lib/puppet/pops/parser/lexer.rb', line 674

# Wraps a quote character for display in a message.
#
# @param q [String] the text to show
# @return [String] a single quote is shown inside double quotes; anything
#   else is shown inside single quotes
def format_quote(q)
  q == "'" ? '"\'"' : "'#{q}'"
end

#fullscan ⇒ `Object`

Scans the whole file; basically just used for testing.

# File 'lib/puppet/pops/parser/lexer.rb', line 388

# Runs #scan to completion and collects everything it yields.
#
# @indefine is forced back to false after every yielded pair, so definition
# nesting problems are ignored while collecting.
#
# @return [Array<Array>] the yielded [token, value] pairs in order,
#   including the terminating pair yielded by #scan
def fullscan
  collected = []
  scan do |token, str|
    @indefine = false
    collected << [token, str]
  end
  collected
end

#initvars ⇒ `Object`

# File 'lib/puppet/pops/parser/lexer.rb', line 467

# Resets all per-scan state to its initial values.
#
# @return [Hash] the freshly created lexing context (the last assignment)
def initvars
  @scanner = @file = @previous_token = nil
  @indefine = false

  @namestack = []
  @token_queue = []
  @expected = []

  @lexing_context = {
    :after => nil,
    :start_of_line => true,
    :offset => 0,                # byte offset before where the token starts
    :end_offset => 0,            # byte offset after the scanned token
    :brace_count => 0,           # nested depth of braces
    :interpolation_stack => []   # matching interpolation brace level
  }
end

#lex_error(msg) ⇒ `Object`

Raises:

(Puppet::LexError)



22
23
24

# File 'lib/puppet/pops/parser/lexer.rb', line 22

# Aborts lexing with the given message.
#
# @param msg [String] the error text
# @raise [Puppet::LexError] always
def lex_error(msg)
  raise Puppet::LexError, msg
end

#line ⇒ `Object`

Returns the line number (starting from 1) for the current position in the scanned text (at the end of the last produced, but not necessarily consumed, token).

# File 'lib/puppet/pops/parser/lexer.rb', line 749

# Returns the line number for the end offset stored in the lexing context.
#
# @return [Object] 1 when there is no lexing context or no locator yet;
#   otherwise what the locator reports for the context's :end_offset
def line
  if @lexing_context && locator
    locator.line_for_offset(@lexing_context[:end_offset])
  else
    1
  end
end

#match?(r) ⇒ `Boolean`

Returns:

(Boolean)



622
623
624

# File 'lib/puppet/pops/parser/lexer.rb', line 622

# Tests the regex against the input at the scanner position without
# consuming anything; delegates to the scanner's match?.
#
# @param r [Regexp] the pattern to test
# @return [Integer, nil] whatever the scanner's match? returns
def match?(r)
  @scanner.match?(r)
end

#munge_token(token, value) ⇒ `Object`

Make any necessary changes to the token and/or value.

# File 'lib/puppet/pops/parser/lexer.rb', line 490

# Applies the token's own post-processing and attaches position data.
#
# @param token [Object] the matched token
# @param value [Object] the matched text, or a Hash if already positioned
# @return [Array, nil] [token, value], where value is a positioned Hash;
#   nil when the token (before or after conversion) is to be skipped or the
#   conversion produced no token
def munge_token(token, value)
  # A Hash value means the pair was already converted and positioned.
  return [token, value] if value.is_a?(Hash)

  @scanner.skip(SKIPPATTERN) if token.skip_text
  return nil if token.skip

  if token.respond_to?(:convert)
    token, value = token.convert(self, value)
  end
  return nil unless token
  return nil if token.skip

  # The conversion may itself have produced a positioned value.
  positioned = value.is_a?(Hash) ? value : positioned_value(value)
  [token, positioned]
end

#namespace ⇒ `Object`

Collect the current namespace.



535
536
537

# File 'lib/puppet/pops/parser/lexer.rb', line 535

# Collects the current namespace.
#
# @return [String] the entries of @namestack joined with "::" (an empty
#   string when the stack is empty)
def namespace
  @namestack.join("::")
end

#pos ⇒ `Object`



522
523
524

# File 'lib/puppet/pops/parser/lexer.rb', line 522

# Returns the position on the line of the current token start, as reported
# by the locator for the lexing context's :offset.
#
# Unlike #line, this has no fallback: it needs both a lexing context and a
# locator.
def pos
  start_offset = @lexing_context[:offset]
  @locator.pos_on_line(start_offset)
end

#positioned_message(msg) ⇒ `Object`

Formats given message by appending file, line and position if available.

# File 'lib/puppet/pops/parser/lexer.rb', line 657

# Formats the given message by appending file, line and position, each only
# if available.
#
# #line falls back to 1 when no locator has been set, but #pos needs the
# locator. The position is therefore appended only when @locator is
# present, so building a message before any input is attached no longer
# fails with a NoMethodError.
#
# @param msg [String] the base message
# @return [String] the message followed by "in file <file>" (when a file is
#   set) and "at line <line>:<pos>" (or just "at line <line>" when no
#   locator is available), separated by single spaces
def positioned_message(msg)
  parts = [msg]
  parts << "in file #{file}" if file

  current_line = line
  if current_line
    parts << (@locator ? "at line #{current_line}:#{pos}" : "at line #{current_line}")
  end

  parts.join(" ")
end

#positioned_value(value) ⇒ `Object`

Returns a hash with the current position in source based on the current lexing context

# File 'lib/puppet/pops/parser/lexer.rb', line 513

# Wraps a value together with the current source position.
#
# @param value [Object] the token value to wrap
# @return [Hash] with keys :value, :locator, :offset and :end_offset; the
#   offsets are taken from the current lexing context
def positioned_value(value)
  context = @lexing_context
  start_offset = context[:offset]
  stop_offset = context[:end_offset]

  { :value => value, :locator => @locator, :offset => start_offset, :end_offset => stop_offset }
end

#replace_false_start_with_text(appendix) ⇒ `Object`

# File 'lib/puppet/pops/parser/lexer.rb', line 723

# Removes the most recently queued token and returns its text with
# +appendix+ added to the end.
#
# @param appendix [String] text to append
# @return [String] the popped entry's text (its :value when the entry's
#   value is a positioned Hash) followed by +appendix+
def replace_false_start_with_text(appendix)
  payload = token_queue.pop.last
  text = payload.is_a?(Hash) ? payload[:value] : payload
  text + appendix
end

#scan {|[false,false]| ... } ⇒ `Object`

this is the heart of the lexer

Yields:

([false,false])

# File 'lib/puppet/pops/parser/lexer.rb', line 544

# The heart of the lexer: repeatedly finds the next token, records its
# position in the lexing context, post-processes it via #munge_token and
# yields it to the caller.
#
# Yields one [token_name, token_value] pair per produced token, where
# token_value is the positioned Hash returned by #munge_token, and finally
# yields [false, false] to signal the end of input.
#
# Raises (via #lex_error) when no input has been given or when nothing
# matches at the current position, and raises Puppet::ParseError when a
# definition name follows a :DEFINE token while another definition is
# already open.
def scan
  _scn = @scanner
  #Puppet.debug("entering scan")
  lex_error "Internal Error: No string or file given to lexer to process." unless _scn

  # Skip any initial whitespace.
  _scn.skip(SKIPPATTERN)
  _lbrace = '{'.freeze  # NOTE(review): never read below; the loop compares against LBRACE_CHAR instead

  # Keep going while there are queued tokens or unread input.
  until token_queue.empty? and _scn.eos? do
    # Remember where the token starts and ends, in scanner positions.
    offset = _scn.pos
    matched_token, value = find_token
    end_offset = _scn.pos

    # error out if we didn't match anything at all
    lex_error "Could not match #{_scn.rest[/^(\S+|\s+|.*)/]}" unless matched_token

    newline = matched_token.name == :RETURN

    # Publish the position before munging, since #munge_token positions the
    # value from the lexing context.
    _lxc = @lexing_context
    _lxc[:start_of_line] = newline
    _lxc[:offset] = offset
    _lxc[:end_offset] = end_offset

    final_token, token_value = munge_token(matched_token, value)
    # update end position since munging may have moved the end offset
    _lxc[:end_offset] = _scn.pos

    # Nothing to emit for this match (munge_token returned no token):
    # skip trailing whitespace and continue with the next token.
    unless final_token
      _scn.skip(SKIPPATTERN)
      next
    end

    # :after tracks the last emitted token name, except for newlines.
    _lxc[:after] = final_token.name unless newline
    # Record the brace depth when an interpolated string opens (DQPRE) and
    # drop it again when the string closes (DQPOST).
    if final_token.name == :DQPRE
      _lxc[:interpolation_stack] << _lxc[:brace_count]
    elsif final_token.name == :DQPOST
      _lxc[:interpolation_stack].pop
    end

    value = token_value[:value]

    # Track expected closers: push the counterpart from @@pairs when the
    # value is an opener, pop when the value equals the closer on top of the
    # stack. Quote tokens are excluded from both branches.
    _expected = @expected
    if match = @@pairs[value] and final_token.name != :DQUOTE and final_token.name != :SQUOTE
      _expected << match
    elsif exp = _expected[-1] and exp == value and final_token.name != :DQUOTE and final_token.name != :SQUOTE
      _expected.pop
    end

    yield [final_token.name, token_value]

    # Bookkeeping that depends on the token emitted before this one.
    _prv = @previous_token
    if _prv
      # The value after a :CLASS token is passed to namestack, unless it is
      # the opening brace.
      namestack(value) if _prv.name == :CLASS and value != LBRACE_CHAR

      # TODO: Lexer has no business dealing with this - it is semantic
      if _prv.name == :DEFINE
        if indefine?
          # Build the message first, because the state it reports is reset
          # on the next line.
          msg = "Cannot nest definition #{value} inside #{@indefine}"
          self.indefine = false
          raise Puppet::ParseError, msg
        end

        @indefine = value
      end
    end
    @previous_token = final_token
    _scn.skip(SKIPPATTERN)
  end
  # Cannot reset @scanner to nil here - it is needed to answer questions about context after
  # completed parsing.
  # Seems meaningless to do this. Everything will be gc anyway.
  #@scanner = nil

  # This indicates that we're done parsing.
  yield [false,false]
end

#slurpstring(terminators, escapes = %w{ \\ $ ' " r n t s }+["\n"], ignore_invalid_escapes = false) ⇒ `Object`

we’ve encountered the start of a string… slurp in the rest of the string and return it

# File 'lib/puppet/pops/parser/lexer.rb', line 632

# Reads the rest of a string whose opening delimiter was just scanned and
# processes its backslash escapes.
#
# @param terminators [String] characters that end the string; interpolated
#   into a regex character class
# @param escapes [Array<String>] characters recognized after a backslash
# @param ignore_invalid_escapes [Boolean] when true, no warning is logged
#   for unrecognized escapes
# @return [Array<String>] [text_without_terminator, terminator]
# @raise whatever #lex_error raises when no terminator is found
def slurpstring(terminators,escapes=%w{ \\  $ ' " r n t s }+["\n"],ignore_invalid_escapes=false)
  opener = @scanner.matched

  # Search for the next terminator that is not preceded by an odd number of
  # backslashes; the caret is there to match empty strings.
  text = @scanner.scan_until(/([^\\]|^|[^\\])([\\]{2})*[#{terminators}]/)
  if text.nil?
    text = lex_error(positioned_message("Unclosed quote after #{format_quote(opener)} followed by '#{followed_by}'"))
  end

  # Recognized escapes with a replacement other than the character itself.
  # An escaped newline is a line continuation and vanishes.
  translations = { 'r' => "\r", 'n' => "\n", 't' => "\t", 's' => " ", "\n" => '' }

  text.gsub!(/\\(.)/m) do
    ch = Regexp.last_match(1)
    if escapes.include?(ch)
      translations.fetch(ch, ch)
    else
      # Unrecognized escapes are kept verbatim, backslash included.
      Puppet.warning(positioned_message("Unrecognized escape sequence '\\#{ch}'")) unless ignore_invalid_escapes
      "\\#{ch}"
    end
  end

  [text[0..-2], text[-1, 1]]
end

#string=(string, path = '') ⇒ `Object`

just parse a string, not a whole file

# File 'lib/puppet/pops/parser/lexer.rb', line 734

# Attaches a string (rather than a whole file) as the lexer input.
#
# The string is frozen in place, then a scanner and a locator are created
# for it.
#
# @param string [String] the source text; frozen as a side effect
# @param path [String] passed on to the locator factory
# @return [Object] the newly created locator (the last assignment)
def string=(string, path='')
  source = string.freeze
  @scanner = StringScanner.new(source)
  @locator = Puppet::Pops::Parser::Locator.locator(source, path)
end

#tokenize_interpolated_string(token_type, preamble = '') ⇒ `Object`

# File 'lib/puppet/pops/parser/lexer.rb', line 682

# Tokenizes one segment of a double-quoted string and queues the result.
#
# Reads text up to the next '"' or '$', queues it as the token selected by
# token_type[terminator], and then either returns the head of the queue
# (string ended, or a '${' expression starts) or scans a '$variable',
# queues it and recurses for the rest of the string.
#
# token_type is indexed by the terminator character ('"' or '$') and must
# give a name that TOKENS can be indexed with; the recursive call for the
# continuation passes DQ_continuation_token_types.
# preamble is text that is prepended to the value of this segment.
#
# Returns the first queued [token, positioned_value] pair.
def tokenize_interpolated_string(token_type,preamble='')
  # Expecting a (possibly empty) stretch of text terminated by end of string ", a variable $, or expression ${
  # The length of this part includes the start and terminating characters.
  value,terminator = slurpstring('"$')

  # Advanced after '{' if this is in expression ${} interpolation
  braced = terminator == '$' && @scanner.scan(/\{/)
  # make offset to end_offset be the length of the pre expression string including its start and terminating chars
  lxc = @lexing_context
  lxc[:end_offset] = @scanner.pos

  token_queue << [TOKENS[token_type[terminator]],positioned_value(preamble+value)]
  # Which variable pattern applies depends on the allow_variables_with_dashes setting.
  variable_regex = if Puppet[:allow_variables_with_dashes]
    TOKENS[:VARIABLE_WITH_DASH].regex
  else
    TOKENS[:VARIABLE].regex
  end
  # Done with this segment when the string ended or a ${ expression begins;
  # hand back the oldest queued token.
  if terminator != '$' or braced
    return token_queue.shift
  end

  # From here on the terminator was a bare '$' (not followed by '{').
  tmp_offset = @scanner.pos
  if var_name = @scanner.scan(variable_regex)
    # Position the variable token on the variable name itself.
    lxc[:offset] = tmp_offset
    lxc[:end_offset] = @scanner.pos
    warn_if_variable_has_hyphen(var_name)
    # If the varname after ${ is followed by (, it is a function call, and not a variable
    # reference.
    #
    # NOTE(review): braced is always falsy at this point because the braced
    # case returned above, so the NAME branch looks unreachable - confirm.
    if braced && @scanner.match?(%r{[ \t\r]*\(})
      token_queue << [TOKENS[:NAME], positioned_value(var_name)]
    else
      token_queue << [TOKENS[:VARIABLE],positioned_value(var_name)]
    end
    lxc[:offset] = @scanner.pos
    tokenize_interpolated_string(DQ_continuation_token_types)
  else
    # The '$' did not start a variable: take back the segment just queued
    # and retry with its text plus the '$' as preamble.
    tokenize_interpolated_string(token_type, replace_false_start_with_text(terminator))
  end
end

#warn_if_variable_has_hyphen(var_name) ⇒ `Object`

# File 'lib/puppet/pops/parser/lexer.rb', line 739

# Logs a deprecation warning when a variable name contains a hyphen.
#
# @param var_name [String] the variable name to check
# @return [Object, nil] nil when there is no hyphen; otherwise whatever
#   Puppet.deprecation_warning returns
def warn_if_variable_has_hyphen(var_name)
  return unless var_name.include?('-')

  Puppet.deprecation_warning("Using `-` in variable names is deprecated at #{file || '<string>'}:#{line}. See http://links.puppetlabs.com/puppet-hyphenated-variable-deprecation")
end

Class: Puppet::Pops::Parser::Lexer

Defined Under Namespace

Constant Summary collapse

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ Lexer

Instance Attribute Details

#file ⇒ Object

#indefine ⇒ Object Also known as: indefine?

#lexing_context ⇒ Object (readonly)

#locator ⇒ Object (readonly)

#token_queue ⇒ Object (readonly)

Instance Method Details

#assert_numeric(value) ⇒ Object

#clear ⇒ Object

#expected ⇒ Object

#find_regex_token ⇒ Object

#find_string_token ⇒ Object

#find_token ⇒ Object

#followed_by ⇒ Object

#format_quote(q) ⇒ Object

#fullscan ⇒ Object

#initvars ⇒ Object

#lex_error(msg) ⇒ Object

#line ⇒ Object

#match?(r) ⇒ Boolean

#munge_token(token, value) ⇒ Object

#namespace ⇒ Object

#pos ⇒ Object

#positioned_message(msg) ⇒ Object

#positioned_value(value) ⇒ Object

#replace_false_start_with_text(appendix) ⇒ Object

#scan {|[false,false]| ... } ⇒ Object

#slurpstring(terminators, escapes = %w{ \\ $ ' " r n t s }+["\n"], ignore_invalid_escapes = false) ⇒ Object

#string=(string, path = '') ⇒ Object

#tokenize_interpolated_string(token_type, preamble = '') ⇒ Object

#warn_if_variable_has_hyphen(var_name) ⇒ Object