Class: Hocon::Impl::Tokenizer::TokenIterator
- Inherits: Object
- Defined in: lib/hocon/impl/tokenizer.rb
Defined Under Namespace
Classes: WhitespaceSaver
Constant Summary
- FIRST_NUMBER_CHARS = "0123456789-"
  (chars JSON allows a number to start with)
- NUMBER_CHARS = "0123456789eE+-."
  (chars JSON allows to be part of a number)
- NOT_IN_UNQUOTED_TEXT = "$\"{}[]:=,+#`^?!@*&\\"
  (chars that stop an unquoted string)
Class Method Summary
- .line_origin(base_origin, line_number) ⇒ Object
- .problem(origin, what, message, suggest_quotes, cause) ⇒ Object
- .simple_value?(t) ⇒ Boolean
- .whitespace?(c) ⇒ Boolean
- .whitespace_not_newline?(c) ⇒ Boolean
Instance Method Summary
- #append_triple_quoted_string(sb, sb_orig) ⇒ Object
- #each ⇒ Object
- #has_next? ⇒ Boolean
- #initialize(origin, input, allow_comments) ⇒ TokenIterator (constructor)
  A new instance of TokenIterator.
- #map ⇒ Object
- #next ⇒ Object
- #next_char_after_whitespace(saver) ⇒ Object
  Get the next char, skipping non-newline whitespace.
- #next_char_raw ⇒ Object
  This should ONLY be called from nextCharSkippingComments, from inside a quoted string, or when parsing a sequence like ${ or +=; everything else should use nextCharSkippingComments().
- #pull_comment(first_char) ⇒ Object
  ONE char has always been consumed, either the # or the first /, but not both slashes.
- #pull_escape_sequence(sb, sb_orig) ⇒ Object
- #pull_next_token(saver) ⇒ Object
- #pull_number(first_char) ⇒ Object
- #pull_plus_equals ⇒ Object
- #pull_quoted_string ⇒ Object
- #pull_substitution ⇒ Object
- #pull_unquoted_text ⇒ Object
  The rules here are intended to maximize convenience while avoiding confusion with real valid JSON.
- #put_back(c) ⇒ Object
- #queue_next_token ⇒ Object
- #remove ⇒ Object
- #start_of_comment?(c) ⇒ Boolean
- #to_list ⇒ Object
Constructor Details
#initialize(origin, input, allow_comments) ⇒ TokenIterator
Returns a new instance of TokenIterator.
# File 'lib/hocon/impl/tokenizer.rb', line 108

def initialize(origin, input, allow_comments)
  @origin = origin
  @input = input
  @allow_comments = allow_comments
  @buffer = []
  @line_number = 1
  @line_origin = @origin.with_line_number(@line_number)
  @tokens = []
  @tokens << Tokens::START
  @whitespace_saver = WhitespaceSaver.new
end
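For orientation, here is a minimal usage sketch of the constructor together with the Iterator-style interface. It is an assumption-laden example rather than something taken from the library's own documentation: the require paths and the Hocon::Impl::SimpleConfigOrigin.new_simple factory are assumptions (any origin object responding to with_line_number would do), and the input only needs to respond to readchar, so a StringIO works.

require 'stringio'
require 'hocon/impl/tokenizer'
require 'hocon/impl/simple_config_origin'

# Assumed origin factory; any origin responding to #with_line_number works here.
origin = Hocon::Impl::SimpleConfigOrigin.new_simple("example")
input  = StringIO.new("foo : 42\n")

iterator = Hocon::Impl::Tokenizer::TokenIterator.new(origin, input, true)

# Drain the token stream; the first token is always Tokens::START and the
# last one is Tokens::EOF.
iterator.each { |token| puts token.inspect }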
Class Method Details
.line_origin(base_origin, line_number) ⇒ Object
# File 'lib/hocon/impl/tokenizer.rb', line 197

def self.line_origin(base_origin, line_number)
  base_origin.with_line_number(line_number)
end
.problem(origin, what, message, suggest_quotes, cause) ⇒ Object
# File 'lib/hocon/impl/tokenizer.rb', line 190

def self.problem(origin, what, message, suggest_quotes, cause)
  if what.nil? || message.nil?
    raise ConfigBugOrBrokenError.new("internal error, creating bad TokenizerProblemError")
  end
  TokenizerProblemError.new(Tokens.new_problem(origin, what, message, suggest_quotes, cause))
end
.simple_value?(t) ⇒ Boolean
# File 'lib/hocon/impl/tokenizer.rb', line 562

def self.simple_value?(t)
  Tokens.substitution?(t) ||
    Tokens.unquoted_text?(t) ||
    Tokens.value?(t)
end
.whitespace?(c) ⇒ Boolean
# File 'lib/hocon/impl/tokenizer.rb', line 143

def self.whitespace?(c)
  Hocon::Impl::ConfigImplUtil.whitespace?(c)
end
.whitespace_not_newline?(c) ⇒ Boolean
# File 'lib/hocon/impl/tokenizer.rb', line 147

def self.whitespace_not_newline?(c)
  (c != "\n") and (Hocon::Impl::ConfigImplUtil.whitespace?(c))
end
Instance Method Details
#append_triple_quoted_string(sb, sb_orig) ⇒ Object
# File 'lib/hocon/impl/tokenizer.rb', line 380

def append_triple_quoted_string(sb, sb_orig)
  # we are after the opening triple quote and need to consume the
  # close triple
  consecutive_quotes = 0

  while true
    c = next_char_raw

    if c == '"'
      consecutive_quotes += 1
    elsif consecutive_quotes >= 3
      # the last three quotes end the string and the others are kept.
      sb.string = sb.string[0...-3]
      put_back c
      break
    else
      consecutive_quotes = 0
      if c == -1
        error_msg = "End of input but triple-quoted string was still open"
        raise self.class.problem(@line_origin, c, error_msg, false, nil)
      elsif c == "\n"
        # keep the line number accurate
        @line_number += 1
        @line_origin = @origin.with_line_number(@line_number)
      end
    end

    sb << c
    sb_orig << c
  end
end
#each ⇒ Object
# File 'lib/hocon/impl/tokenizer.rb', line 600

def each
  while has_next?
    # Have to use self.next instead of next because next is a reserved word
    yield self.next
  end
end
#has_next? ⇒ Boolean
# File 'lib/hocon/impl/tokenizer.rb', line 577

def has_next?
  !@tokens.empty?
end
#map ⇒ Object
# File 'lib/hocon/impl/tokenizer.rb', line 607

def map
  token_list = []
  each do |token|
    # yield token to calling method, append whatever is returned from the
    # map block to token_list
    token_list << yield(token)
  end
  token_list
end
#next ⇒ Object
# File 'lib/hocon/impl/tokenizer.rb', line 581

def next
  t = @tokens.shift
  if (@tokens.empty?) and (t != Tokens::EOF)
    begin
      queue_next_token
    rescue TokenizerProblemError => e
      @tokens.push(e.problem)
    end
    if @tokens.empty?
      raise ConfigBugOrBrokenError, "bug: tokens queue should not be empty here"
    end
  end
  t
end
#next_char_after_whitespace(saver) ⇒ Object
Get the next char, skipping non-newline whitespace.
# File 'lib/hocon/impl/tokenizer.rb', line 175

def next_char_after_whitespace(saver)
  while true
    c = next_char_raw
    if c == -1
      return -1
    else
      if self.class.whitespace_not_newline?(c)
        saver.add(c)
      else
        return c
      end
    end
  end
end
#next_char_raw ⇒ Object
This should ONLY be called from nextCharSkippingComments, from inside a quoted string, or when parsing a sequence like ${ or +=; everything else should use nextCharSkippingComments().
# File 'lib/hocon/impl/tokenizer.rb', line 124

def next_char_raw
  if @buffer.empty?
    begin
      @input.readchar.chr
    rescue EOFError
      -1
    end
  else
    @buffer.pop
  end
end
#pull_comment(first_char) ⇒ Object
ONE char has always been consumed, either the # or the first /, but not both slashes.
# File 'lib/hocon/impl/tokenizer.rb', line 203

def pull_comment(first_char)
  double_slash = false
  if first_char == '/'
    discard = next_char_raw
    if discard != '/'
      raise ConfigBugOrBrokenError, "called pullComment but // not seen"
    end
    double_slash = true
  end

  io = StringIO.new
  while true
    c = next_char_raw
    if (c == -1) || (c == "\n")
      put_back(c)
      if (double_slash)
        return Tokens.new_comment_double_slash(@line_origin, io.string)
      else
        return Tokens.new_comment_hash(@line_origin, io.string)
      end
    else
      io << c
    end
  end
end
#pull_escape_sequence(sb, sb_orig) ⇒ Object
# File 'lib/hocon/impl/tokenizer.rb', line 322

def pull_escape_sequence(sb, sb_orig)
  escaped = next_char_raw
  if escaped == -1
    error_msg = "End of input but backslash in string had nothing after it"
    raise self.class.problem(@line_origin, "", error_msg, false, nil)
  end

  # This is needed so we return the unescaped escape characters back out when rendering
  # the token
  sb_orig << "\\" << escaped

  case escaped
  when "\""
    sb << "\""
  when "\\"
    sb << "\\"
  when "/"
    sb << "/"
  when "b"
    sb << "\b"
  when "f"
    sb << "\f"
  when "n"
    sb << "\n"
  when "r"
    sb << "\r"
  when "t"
    sb << "\t"
  when "u"
    codepoint = ""
    # Grab the 4 hex chars for the unicode character
    4.times do
      c = next_char_raw
      if c == -1
        error_msg = "End of input but expecting 4 hex digits for \\uXXXX escape"
        raise self.class.problem(@line_origin, c, error_msg, false, nil)
      end
      codepoint << c
    end
    sb_orig << codepoint
    # Convert codepoint to a unicode character
    packed = [codepoint.hex].pack("U")

    if packed == "_"
      raise self.class.problem(@line_origin, codepoint,
                               "Malformed hex digits after \\u escape in string: '#{codepoint}'",
                               false, nil)
    end
    sb << packed
  else
    error_msg = "backslash followed by '#{escaped}', this is not a valid escape sequence (quoted strings use JSON escaping, so use double-backslash \\ for literal backslash)"
    raise self.class.problem(Hocon::Impl::Tokenizer.as_string(escaped), "", error_msg, false, nil)
  end
end
#pull_next_token(saver) ⇒ Object
# File 'lib/hocon/impl/tokenizer.rb', line 513

def pull_next_token(saver)
  c = next_char_after_whitespace(saver)
  if c == -1
    Tokens::EOF
  elsif c == "\n"
    # newline tokens have the just-ended line number
    line = Tokens.new_line(@line_origin)
    @line_number += 1
    @line_origin = @origin.with_line_number(@line_number)
    line
  else
    t = nil
    if start_of_comment?(c)
      t = pull_comment(c)
    else
      t = case c
            when '"' then pull_quoted_string
            when '$' then pull_substitution
            when ':' then Tokens::COLON
            when ',' then Tokens::COMMA
            when '=' then Tokens::EQUALS
            when '{' then Tokens::OPEN_CURLY
            when '}' then Tokens::CLOSE_CURLY
            when '[' then Tokens::OPEN_SQUARE
            when ']' then Tokens::CLOSE_SQUARE
            when '+' then pull_plus_equals
            else nil
          end

      if t.nil?
        if FIRST_NUMBER_CHARS.index(c)
          t = pull_number(c)
        elsif NOT_IN_UNQUOTED_TEXT.index(c)
          raise self.class.problem(@line_origin, c, "Reserved character '#{c}' is not allowed outside quotes", true, nil)
        else
          put_back(c)
          t = pull_unquoted_text
        end
      end
    end

    if t.nil?
      raise ConfigBugOrBrokenError, "bug: failed to generate next token"
    end

    t
  end
end
#pull_number(first_char) ⇒ Object
# File 'lib/hocon/impl/tokenizer.rb', line 279

def pull_number(first_char)
  sb = StringIO.new
  sb << first_char
  contained_decimal_or_e = false
  c = next_char_raw
  while (c != -1) && (NUMBER_CHARS.index(c))
    if (c == '.') || (c == 'e') || (c == 'E')
      contained_decimal_or_e = true
    end
    sb << c
    c = next_char_raw
  end
  # the last character we looked at wasn't part of the number, put it
  # back
  put_back(c)
  s = sb.string
  begin
    if contained_decimal_or_e
      # force floating point representation
      Tokens.new_double(@line_origin, Float(s), s)
    else
      Tokens.new_long(@line_origin, Integer(s), s)
    end
  rescue ArgumentError => e
    if e.message =~ /^invalid value for (Float|Integer)\(\)/
      # not a number after all, see if it's an unquoted string.
      s.each_char do |u|
        if NOT_IN_UNQUOTED_TEXT.index(u)
          raise self.class.problem(@line_origin, u, "Reserved character '#{u}' " +
              "is not allowed outside quotes", true, nil)
        end
      end
      # no evil chars so we just decide this was a string and
      # not a number.
      Tokens.new_unquoted_text(@line_origin, s)
    else
      raise e
    end
  end
end
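To make the number-or-string fallback concrete, here is a hedged sketch using the same assumed setup as the constructor example above. An input like 1.2.3 starts with a digit, fails Float(), contains no reserved characters, and therefore comes back as unquoted text instead of an error.

require 'stringio'
require 'hocon/impl/tokenizer'
require 'hocon/impl/simple_config_origin'

origin = Hocon::Impl::SimpleConfigOrigin.new_simple("number-example")

# "42"    -> a long token (no '.', 'e' or 'E' seen)
# "3.14"  -> a double token (contained_decimal_or_e is true)
# "1.2.3" -> unquoted text, via the ArgumentError rescue branch above
["42", "3.14", "1.2.3"].each do |text|
  it = Hocon::Impl::Tokenizer::TokenIterator.new(origin, StringIO.new(text), true)
  puts "#{text} -> #{it.to_list.map { |t| t.inspect }.join(', ')}"
end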
#pull_plus_equals ⇒ Object
# File 'lib/hocon/impl/tokenizer.rb', line 458

def pull_plus_equals
  # the initial '+' has already been consumed
  c = next_char_raw
  unless c == '='
    error_msg = "'+' not followed by =, '#{c}' not allowed after '+'"
    raise self.class.problem(@line_origin, c, error_msg, true, nil) # true = suggest quotes
  end
  Tokens::PLUS_EQUALS
end
#pull_quoted_string ⇒ Object
# File 'lib/hocon/impl/tokenizer.rb', line 412

def pull_quoted_string
  # the open quote has already been consumed
  sb = StringIO.new

  # We need a second StringIO to keep track of escape characters.
  # We want to return them exactly as they appeared in the original text,
  # which means we will need a new StringIO to escape escape characters
  # so we can also keep the actual value of the string. This is gross.
  sb_orig = StringIO.new
  sb_orig << '"'

  c = ""
  while c != '"'
    c = next_char_raw
    if c == -1
      raise self.class.problem(@line_origin, c, "End of input but string quote was still open", false, nil)
    end

    if c == "\\"
      pull_escape_sequence(sb, sb_orig)
    elsif c == '"'
      sb_orig << c
      # done!
    elsif c =~ /[[:cntrl:]]/
      raise self.class.problem(@line_origin, c, "JSON does not allow unescaped #{c}" +
          " in quoted strings, use a backslash escape", false, nil)
    else
      sb << c
      sb_orig << c
    end
  end

  # maybe switch to triple-quoted string, sort of hacky...
  if sb.length == 0
    third = next_char_raw
    if third == '"'
      sb_orig << third
      append_triple_quoted_string(sb, sb_orig)
    else
      put_back(third)
    end
  end

  Tokens.new_string(@line_origin, sb.string, sb_orig.string)
end
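The "maybe switch to triple-quoted string" branch is how triple-quoted blocks are detected: if the loop above finishes with an empty value, the input must have started with two quotes, so one more character is peeked to see whether a third quote follows. A hedged sketch of the observable effect, under the same assumed setup as the earlier examples:

require 'stringio'
require 'hocon/impl/tokenizer'
require 'hocon/impl/simple_config_origin'

origin = Hocon::Impl::SimpleConfigOrigin.new_simple("string-example")

# '"hi"'      -> an ordinary quoted string token
# '"""a"b"""' -> a triple-quoted string; quotes inside it need no escaping
['"hi"', '"""a"b"""'].each do |text|
  it = Hocon::Impl::Tokenizer::TokenIterator.new(origin, StringIO.new(text), true)
  puts "#{text} -> #{it.to_list.map { |t| t.inspect }.join(', ')}"
end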
#pull_substitution ⇒ Object
# File 'lib/hocon/impl/tokenizer.rb', line 470

def pull_substitution
  # the initial '$' has already been consumed
  c = next_char_raw
  if c != '{'
    error_msg = "'$' not followed by {, '#{c}' not allowed after '$'"
    raise self.class.problem(@line_origin, c, error_msg, true, nil) # true = suggest quotes
  end

  optional = false
  c = next_char_raw

  if c == '?'
    optional = true
  else
    put_back(c)
  end

  saver = WhitespaceSaver.new
  expression = []

  while true
    t = pull_next_token(saver)

    # note that we avoid validating the allowed tokens inside
    # the substitution here; we even allow nested substitutions
    # in the tokenizer. The parser sorts it out.
    if t == Tokens::CLOSE_CURLY
      # end the loop, done!
      break
    elsif t == Tokens::EOF
      raise self.class.problem(@line_origin, t, "Substitution ${ was not closed with a }", false, nil)
    else
      whitespace = saver.check(t, @line_origin, @line_number)
      unless whitespace.nil?
        expression << whitespace
      end
      expression << t
    end
  end

  Tokens.new_substitution(@line_origin, optional, expression)
end
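For orientation, a hedged sketch of how ordinary and optional substitutions come out of this method, under the same assumed setup as the earlier examples; Tokens.substitution? is the predicate already used by simple_value? above, and the require path for it is an assumption.

require 'stringio'
require 'hocon/impl/tokenizer'
require 'hocon/impl/simple_config_origin'
require 'hocon/impl/tokens'

origin = Hocon::Impl::SimpleConfigOrigin.new_simple("substitution-example")

# '${foo.bar}'  -> a substitution token wrapping the inner expression tokens
# '${?foo.bar}' -> the same, but with the optional flag set by the '?' branch
["${foo.bar}", "${?foo.bar}"].each do |text|
  it = Hocon::Impl::Tokenizer::TokenIterator.new(origin, StringIO.new(text), true)
  subs = it.to_list.select { |t| Hocon::Impl::Tokens.substitution?(t) }
  puts "#{text} -> #{subs.map { |t| t.inspect }.join(', ')}"
end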
#pull_unquoted_text ⇒ Object
The rules here are intended to maximize convenience while avoiding confusion with real valid JSON. Basically anything that parses as JSON is treated the JSON way and otherwise we assume it’s a string and let the parser sort it out.
# File 'lib/hocon/impl/tokenizer.rb', line 241

def pull_unquoted_text
  origin = @line_origin
  io = StringIO.new
  c = next_char_raw
  while true
    if (c == -1) or
        (NOT_IN_UNQUOTED_TEXT.index(c)) or
        (self.class.whitespace?(c)) or
        (start_of_comment?(c))
      break
    else
      io << c
    end

    # we parse true/false/null tokens as such no matter
    # what is after them, as long as they are at the
    # start of the unquoted token.
    if io.length == 4
      if io.string == "true"
        return Tokens.new_boolean(origin, true)
      elsif io.string == "null"
        return Tokens.new_null(origin)
      end
    elsif io.length == 5
      if io.string == "false"
        return Tokens.new_boolean(origin, false)
      end
    end

    c = next_char_raw
  end

  # put back the char that ended the unquoted text
  put_back(c)

  Tokens.new_unquoted_text(origin, io.string)
end
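One consequence of the early return for true/false/null is that those words become typed tokens no matter what follows them, as the comment in the source notes: "truefoo" tokenizes as a boolean followed by the unquoted text "foo", while "truthy" stays a single unquoted-text token. A hedged sketch, same assumed setup as before:

require 'stringio'
require 'hocon/impl/tokenizer'
require 'hocon/impl/simple_config_origin'

origin = Hocon::Impl::SimpleConfigOrigin.new_simple("unquoted-example")

# 'truefoo' -> boolean token for 'true', then unquoted text 'foo'
# 'truthy'  -> one unquoted-text token (first four chars are not 'true')
["truefoo", "truthy"].each do |text|
  it = Hocon::Impl::Tokenizer::TokenIterator.new(origin, StringIO.new(text), true)
  puts "#{text} -> #{it.to_list.map { |t| t.inspect }.join(', ')}"
end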
#put_back(c) ⇒ Object
# File 'lib/hocon/impl/tokenizer.rb', line 136

def put_back(c)
  if @buffer.length > 2
    raise ConfigBugOrBrokenError, "bug: putBack() three times, undesirable look-ahead"
  end
  @buffer.push(c)
end
#queue_next_token ⇒ Object
# File 'lib/hocon/impl/tokenizer.rb', line 568

def queue_next_token
  t = pull_next_token(@whitespace_saver)
  whitespace = @whitespace_saver.check(t, @origin, @line_number)
  if whitespace
    @tokens.push(whitespace)
  end
  @tokens.push(t)
end
#remove ⇒ Object
# File 'lib/hocon/impl/tokenizer.rb', line 596

def remove
  raise ConfigBugOrBrokenError, "Does not make sense to remove items from token stream"
end
#start_of_comment?(c) ⇒ Boolean
# File 'lib/hocon/impl/tokenizer.rb', line 151

def start_of_comment?(c)
  if c == -1
    false
  else
    if @allow_comments
      if c == '#'
        true
      elsif c == '/'
        maybe_second_slash = next_char_raw
        # we want to predictably NOT consume any chars
        put_back(maybe_second_slash)
        if maybe_second_slash == '/'
          true
        else
          false
        end
      end
    else
      false
    end
  end
end
#to_list ⇒ Object
# File 'lib/hocon/impl/tokenizer.rb', line 617

def to_list
  # Return array of tokens from the iterator
  self.map { |token| token }
end