Class: Hocon::Impl::Tokenizer::TokenIterator

Inherits:
Object
  • Object
show all
Defined in:
lib/hocon/impl/tokenizer.rb

Defined Under Namespace

Classes: WhitespaceSaver

Constant Summary collapse

FIRST_NUMBER_CHARS =

chars JSON allows a number to start with

"0123456789-"
NUMBER_CHARS =

chars JSON allows to be part of a number

"0123456789eE+-."
NOT_IN_UNQUOTED_TEXT =

chars that stop an unquoted string

"$\"{}[]:=,+#`^?!@*&\\"

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(origin, input, allow_comments) ⇒ TokenIterator

Returns a new instance of TokenIterator.



108
109
110
111
112
113
114
115
116
117
118
# File 'lib/hocon/impl/tokenizer.rb', line 108

# Sets up a token iterator over +input+.
#
# @param origin [#with_line_number] base origin; a per-line origin is derived
#   from it for line 1 here
# @param input [#readchar] character source consumed by next_char_raw
# @param allow_comments [Boolean] consulted by start_of_comment?
def initialize(origin, input, allow_comments)
  @origin, @input, @allow_comments = origin, input, allow_comments

  # characters handed back through put_back, re-read before @input
  @buffer = []

  @line_number = 1
  @line_origin = @origin.with_line_number(@line_number)

  # the queue is seeded with the START token
  @tokens = [Tokens::START]
  @whitespace_saver = WhitespaceSaver.new
end

Class Method Details

.line_origin(base_origin, line_number) ⇒ Object



197
198
199
# File 'lib/hocon/impl/tokenizer.rb', line 197

# Derives the origin for one particular line from a base origin.
#
# @param base_origin [#with_line_number] origin to derive from
# @param line_number [Integer] value forwarded to +with_line_number+
# @return [Object] whatever +base_origin.with_line_number+ returns
def self.line_origin(base_origin, line_number)
  base_origin.with_line_number(line_number)
end

.problem(origin, what, message, suggest_quotes, cause) ⇒ Object



190
191
192
193
194
195
# File 'lib/hocon/impl/tokenizer.rb', line 190

# Builds (without raising) a TokenizerProblemError wrapping a problem token
# created by Tokens.new_problem; callers raise the returned value themselves.
#
# @param origin [Object] forwarded to Tokens.new_problem
# @param what [Object] must not be nil
# @param message [String] must not be nil
# @param suggest_quotes [Boolean] forwarded to Tokens.new_problem
# @param cause [Object, nil] forwarded to Tokens.new_problem
# @return [TokenizerProblemError]
# @raise [ConfigBugOrBrokenError] if +what+ or +message+ is nil
def self.problem(origin, what, message, suggest_quotes, cause)
  if [what, message].any?(&:nil?)
    raise ConfigBugOrBrokenError, "internal error, creating bad TokenizerProblemError"
  end

  problem_token = Tokens.new_problem(origin, what, message, suggest_quotes, cause)
  TokenizerProblemError.new(problem_token)
end

.simple_value?(t) ⇒ Boolean

Returns:

  • (Boolean)


562
563
564
565
566
# File 'lib/hocon/impl/tokenizer.rb', line 562

# Tells whether +t+ is a substitution, unquoted text or a value token,
# according to the corresponding Tokens predicates (checked in that order,
# stopping at the first truthy answer).
#
# @param t [Object] token to classify
# @return [Boolean]
def self.simple_value?(t)
  verdict = Tokens.substitution?(t)
  verdict ||= Tokens.unquoted_text?(t)
  verdict ||= Tokens.value?(t)
  verdict
end

.whitespace?(c) ⇒ Boolean

Returns:

  • (Boolean)


143
144
145
# File 'lib/hocon/impl/tokenizer.rb', line 143

# Delegates the whitespace test for +c+ to ConfigImplUtil.
#
# @param c [String] a single character
# @return [Boolean] the answer given by ConfigImplUtil.whitespace?
def self.whitespace?(c)
  util = Hocon::Impl::ConfigImplUtil
  util.whitespace?(c)
end

.whitespace_not_newline?(c) ⇒ Boolean

Returns:

  • (Boolean)


147
148
149
# File 'lib/hocon/impl/tokenizer.rb', line 147

# Like whitespace?, except that a newline is never considered whitespace.
#
# @param c [String] a single character
# @return [Boolean] false for "\n", otherwise the answer given by
#   ConfigImplUtil.whitespace?
def self.whitespace_not_newline?(c)
  return false if c == "\n"

  Hocon::Impl::ConfigImplUtil.whitespace?(c)
end

Instance Method Details

#append_triple_quoted_string(sb, sb_orig) ⇒ Object



380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
# File 'lib/hocon/impl/tokenizer.rb', line 380

# Reads the body of a triple-quoted string; the opening triple quote has
# already been consumed. Every character read is appended to both buffers
# until a run of three or more quotes is followed by a non-quote character.
#
# @param sb [StringIO] receives the string value (closing quotes removed)
# @param sb_orig [StringIO] receives the text as written, closing quotes included
# @raise [TokenizerProblemError] if input ends before the closing quotes
def append_triple_quoted_string(sb, sb_orig)
  quote_run = 0

  loop do
    ch = next_char_raw

    if ch == '"'
      quote_run += 1
    else
      if quote_run >= 3
        # Only the final three quotes close the string; earlier quotes of
        # the run stay in the value. The terminating char is not ours.
        sb.string = sb.string[0...-3]
        put_back(ch)
        break
      end

      quote_run = 0
      if ch == -1
        raise self.class.problem(@line_origin, ch,
                                 "End of input but triple-quoted string was still open",
                                 false, nil)
      end

      if ch == "\n"
        # the string may span lines, so keep the line bookkeeping current
        @line_number += 1
        @line_origin = @origin.with_line_number(@line_number)
      end
    end

    sb << ch
    sb_orig << ch
  end
end

#each ⇒ Object



600
601
602
603
604
605
# File 'lib/hocon/impl/tokenizer.rb', line 600

# Yields every remaining token to the given block, in order, until
# has_next? reports that the queue is exhausted.
#
# @yield [token] each token produced by #next
def each
  # `next` is a Ruby keyword, so the method must be called with an explicit receiver
  yield self.next while has_next?
end

#has_next? ⇒ Boolean

Returns:

  • (Boolean)


577
578
579
# File 'lib/hocon/impl/tokenizer.rb', line 577

# Reports whether at least one token is still queued.
#
# @return [Boolean] false when the token queue is empty, true otherwise
def has_next?
  return false if @tokens.empty?

  true
end

#map ⇒ Object



607
608
609
610
611
612
613
614
615
# File 'lib/hocon/impl/tokenizer.rb', line 607

# Runs the given block over every remaining token (via #each) and collects
# the block's results.
#
# @yield [token] each remaining token
# @return [Array] the block results, in token order
def map
  [].tap do |results|
    each { |token| results << yield(token) }
  end
end

#next ⇒ Object



581
582
583
584
585
586
587
588
589
590
591
592
593
594
# File 'lib/hocon/impl/tokenizer.rb', line 581

# Removes and returns the token at the head of the queue. If that leaves the
# queue empty and the token is not EOF, the following token is queued right
# away; a tokenizer problem raised while doing so is queued as its problem
# token instead of propagating.
#
# @return [Object] the token that was at the head of the queue
# @raise [ConfigBugOrBrokenError] if refilling left the queue empty
def next
  head = @tokens.shift
  needs_refill = @tokens.empty? && head != Tokens::EOF

  if needs_refill
    begin
      queue_next_token
    rescue TokenizerProblemError => problem_error
      @tokens.push(problem_error.problem)
    end

    raise ConfigBugOrBrokenError, "bug: tokens queue should not be empty here" if @tokens.empty?
  end

  head
end

#next_char_after_whitespace(saver) ⇒ Object

get next char, skipping non-newline whitespace



175
176
177
178
179
180
181
182
183
184
185
186
187
188
# File 'lib/hocon/impl/tokenizer.rb', line 175

# Returns the next character that is not non-newline whitespace. Every
# skipped whitespace character is handed to +saver.add+.
#
# @param saver [#add] collector for the skipped whitespace
# @return [String, Integer] the character found, or -1 at end of input
def next_char_after_whitespace(saver)
  loop do
    ch = next_char_raw
    return -1 if ch == -1
    return ch unless self.class.whitespace_not_newline?(ch)

    saver.add(ch)
  end
end

#next_char_raw ⇒ Object

This should ONLY be called from nextCharSkippingComments, from inside a quoted string, or when parsing a sequence like ${ or +=; everything else should use nextCharSkippingComments().



124
125
126
127
128
129
130
131
132
133
134
# File 'lib/hocon/impl/tokenizer.rb', line 124

# Returns the next character without any interpretation. Characters handed
# back with put_back are served first (most recently put back first);
# otherwise one character is read from the input.
#
# @return [String, Integer] a one-character string, or -1 at end of input
def next_char_raw
  return @buffer.pop unless @buffer.empty?

  begin
    @input.readchar.chr
  rescue EOFError
    -1
  end
end

#pull_comment(first_char) ⇒ Object

ONE char has always been consumed, either the # or the first /, but not both slashes



203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
# File 'lib/hocon/impl/tokenizer.rb', line 203

# Reads a comment up to (not including) the end of the line or of the input.
# Exactly one character of the comment marker has been consumed by the
# caller: the '#', or the first '/' of '//'.
#
# @param first_char [String] the already consumed marker character
# @return [Object] a hash-comment or double-slash-comment token holding the
#   text that follows the marker
# @raise [ConfigBugOrBrokenError] if +first_char+ is '/' but no second '/' follows
def pull_comment(first_char)
  double_slash = first_char == '/'
  if double_slash
    unless next_char_raw == '/'
      raise ConfigBugOrBrokenError, "called pullComment but // not seen"
    end
  end

  text = StringIO.new
  ch = next_char_raw
  until ch == -1 || ch == "\n"
    text << ch
    ch = next_char_raw
  end
  # the newline / end-of-input marker is left for the caller to tokenize
  put_back(ch)

  if double_slash
    Tokens.new_comment_double_slash(@line_origin, text.string)
  else
    Tokens.new_comment_hash(@line_origin, text.string)
  end
end

#pull_escape_sequence(sb, sb_orig) ⇒ Object



322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
# File 'lib/hocon/impl/tokenizer.rb', line 322

# Consumes the remainder of a backslash escape inside a quoted string; the
# backslash itself has already been read by the caller.
#
# The decoded character is appended to +sb+, and the escape exactly as it was
# written (backslash included) is appended to +sb_orig+ so the token can be
# rendered back out unchanged.
#
# @param sb [StringIO] receives the unescaped value
# @param sb_orig [StringIO] receives the original, still escaped text
# @raise [TokenizerProblemError] on end of input, on \u not followed by four
#   hex digits, or on an unknown escape character
def pull_escape_sequence(sb, sb_orig)
  escaped = next_char_raw

  if escaped == -1
    error_msg = "End of input but backslash in string had nothing after it"
    raise self.class.problem(@line_origin, "", error_msg, false, nil)
  end

  # This is needed so we return the unescaped escape characters back out when rendering
  # the token
  sb_orig << "\\" << escaped

  case escaped
  when "\"" then sb << "\""
  when "\\" then sb << "\\"
  when "/" then sb << "/"
  when "b" then sb << "\b"
  when "f" then sb << "\f"
  when "n" then sb << "\n"
  when "r" then sb << "\r"
  when "t" then sb << "\t"
  when "u"
    codepoint = ""

    # Grab the 4 hex chars for the unicode character
    4.times do
      c = next_char_raw

      if c == -1
        error_msg = "End of input but expecting 4 hex digits for \\uXXXX escape"
        raise self.class.problem(@line_origin, c, error_msg, false, nil)
      end

      codepoint << c
    end
    sb_orig << codepoint

    # Validate the digits explicitly: String#hex never fails (it yields 0 or
    # the value of a valid prefix for junk input), so it cannot be used to
    # detect malformed escapes.
    unless codepoint =~ /\A[0-9a-fA-F]{4}\z/
      raise self.class.problem(@line_origin, codepoint,
                               "Malformed hex digits after \\u escape in string: '#{codepoint}'",
                               false, nil)
    end

    # Convert codepoint to a unicode character
    sb << [codepoint.hex].pack("U")
  else
    error_msg = "backslash followed by '#{escaped}', this is not a valid escape sequence (quoted strings use JSON escaping, so use double-backslash \\ for literal backslash)"
    # origin comes first, then the offending character as `what`
    raise self.class.problem(@line_origin, Hocon::Impl::Tokenizer.as_string(escaped), error_msg, false, nil)
  end
end

#pull_next_token(saver) ⇒ Object



513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
# File 'lib/hocon/impl/tokenizer.rb', line 513

# Produces the next token from the input, skipping non-newline whitespace
# first (the skipped whitespace is handed to +saver+).
#
# @param saver [#add] whitespace collector passed to next_char_after_whitespace
# @return [Object] the next token; Tokens::EOF at end of input
# @raise [TokenizerProblemError] for a reserved character outside quotes
# @raise [ConfigBugOrBrokenError] if no token could be produced
def pull_next_token(saver)
  ch = next_char_after_whitespace(saver)
  return Tokens::EOF if ch == -1

  if ch == "\n"
    # a newline token carries the number of the line that just ended
    finished_line = Tokens.new_line(@line_origin)
    @line_number += 1
    @line_origin = @origin.with_line_number(@line_number)
    return finished_line
  end

  token =
    if start_of_comment?(ch)
      pull_comment(ch)
    else
      punctuation =
        case ch
        when '"' then pull_quoted_string
        when '$' then pull_substitution
        when ':' then Tokens::COLON
        when ',' then Tokens::COMMA
        when '=' then Tokens::EQUALS
        when '{' then Tokens::OPEN_CURLY
        when '}' then Tokens::CLOSE_CURLY
        when '[' then Tokens::OPEN_SQUARE
        when ']' then Tokens::CLOSE_SQUARE
        when '+' then pull_plus_equals
        end

      if !punctuation.nil?
        punctuation
      elsif FIRST_NUMBER_CHARS.include?(ch)
        pull_number(ch)
      elsif NOT_IN_UNQUOTED_TEXT.include?(ch)
        raise self.class.problem(@line_origin, ch,
                                 "Reserved character '#{ch}' is not allowed outside quotes",
                                 true, nil)
      else
        # unquoted text re-reads its first character itself
        put_back(ch)
        pull_unquoted_text
      end
    end

  raise ConfigBugOrBrokenError, "bug: failed to generate next token" if token.nil?

  token
end

#pull_number(first_char) ⇒ Object



279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
# File 'lib/hocon/impl/tokenizer.rb', line 279

# Reads a number whose first character has already been consumed. All
# following NUMBER_CHARS are collected; the text is parsed as a float when
# it contains '.', 'e' or 'E', and as a base-10 integer otherwise. Text that
# does not parse is returned as unquoted text, unless it contains a
# character that is reserved outside quotes.
#
# @param first_char [String] the already consumed first character
# @return [Object] a double, long or unquoted-text token
# @raise [TokenizerProblemError] if the text is not a number and contains a
#   character from NOT_IN_UNQUOTED_TEXT
def pull_number(first_char)
  sb = StringIO.new
  sb << first_char
  contained_decimal_or_e = false
  c = next_char_raw
  while c != -1 && NUMBER_CHARS.index(c)
    contained_decimal_or_e = true if c == '.' || c == 'e' || c == 'E'
    sb << c
    c = next_char_raw
  end
  # the last character we looked at wasn't part of the number, put it
  # back
  put_back(c)
  s = sb.string

  # Only the conversion itself is guarded, so an ArgumentError raised while
  # building the token is never mistaken for a parse failure.
  value =
    begin
      if contained_decimal_or_e
        # force floating point representation
        Float(s)
      else
        # Explicit base 10: without it Integer() reads a leading "0" as an
        # octal prefix ("010" => 8, "08" => error).
        Integer(s, 10)
      end
    rescue ArgumentError
      nil
    end

  unless value.nil?
    return Tokens.new_double(@line_origin, value, s) if contained_decimal_or_e

    return Tokens.new_long(@line_origin, value, s)
  end

  # not a number after all, see if it's an unquoted string.
  s.each_char do |u|
    if NOT_IN_UNQUOTED_TEXT.index(u)
      raise self.class.problem(@line_origin, u,
                               "Reserved character '#{u}' is not allowed outside quotes",
                               true, nil)
    end
  end

  # no evil chars so we just decide this was a string and
  # not a number.
  Tokens.new_unquoted_text(@line_origin, s)
end

#pull_plus_equals ⇒ Object



458
459
460
461
462
463
464
465
466
467
468
# File 'lib/hocon/impl/tokenizer.rb', line 458

# Completes a '+=' token; the '+' has already been consumed by the caller.
#
# @return [Object] Tokens::PLUS_EQUALS
# @raise [TokenizerProblemError] if the next character is not '='
def pull_plus_equals
  follower = next_char_raw
  return Tokens::PLUS_EQUALS if follower == '='

  # the `true` asks the problem token to suggest quoting
  raise self.class.problem(@line_origin, follower,
                           "'+' not followed by =, '#{follower}' not allowed after '+'",
                           true, nil)
end

#pull_quoted_string ⇒ Object



412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
# File 'lib/hocon/impl/tokenizer.rb', line 412

# Reads a quoted string; the opening quote has already been consumed. Two
# buffers are filled in parallel: one with the unescaped value and one with
# the text exactly as written (quotes and escapes included). An empty string
# immediately followed by a third quote switches to triple-quoted parsing.
#
# @return [Object] a string token built from the value and the original text
# @raise [TokenizerProblemError] on end of input inside the string or on an
#   unescaped control character
def pull_quoted_string
  value = StringIO.new

  # The original text is tracked separately because escapes must be rendered
  # back out exactly as they appeared in the input.
  original = StringIO.new
  original << '"'

  loop do
    ch = next_char_raw
    if ch == -1
      raise self.class.problem(@line_origin, ch, "End of input but string quote was still open", false, nil)
    end

    if ch == "\\"
      pull_escape_sequence(value, original)
    elsif ch == '"'
      # closing quote: recorded in the original text only
      original << ch
      break
    elsif ch =~ /[[:cntrl:]]/
      raise self.class.problem(@line_origin, ch,
                               "JSON does not allow unescaped #{ch} in quoted strings, use a backslash escape",
                               false, nil)
    else
      value << ch
      original << ch
    end
  end

  # "" followed by another quote is really the start of a triple-quoted string
  if value.length == 0
    lookahead = next_char_raw
    if lookahead == '"'
      original << lookahead
      append_triple_quoted_string(value, original)
    else
      put_back(lookahead)
    end
  end

  Tokens.new_string(@line_origin, value.string, original.string)
end

#pull_substitution ⇒ Object



470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
# File 'lib/hocon/impl/tokenizer.rb', line 470

# Reads a substitution of the form ${...} or ${?...}; the '$' has already
# been consumed. Tokens up to the closing '}' are collected without being
# validated (nested substitutions included).
#
# @return [Object] a substitution token carrying the optional flag and the
#   collected expression tokens
# @raise [TokenizerProblemError] if '$' is not followed by '{', or if input
#   ends before the closing '}'
def pull_substitution
  opener = next_char_raw
  unless opener == '{'
    # the `true` asks the problem token to suggest quoting
    raise self.class.problem(@line_origin, opener,
                             "'$' not followed by {, '#{opener}' not allowed after '$'",
                             true, nil)
  end

  # a '?' right after the brace marks the substitution as optional
  marker = next_char_raw
  optional = marker == '?'
  put_back(marker) unless optional

  saver = WhitespaceSaver.new
  expression = []

  loop do
    token = pull_next_token(saver)

    # allowed tokens are deliberately not checked here
    break if token == Tokens::CLOSE_CURLY

    if token == Tokens::EOF
      raise self.class.problem(@line_origin, token, "Substitution ${ was not closed with a }", false, nil)
    end

    whitespace = saver.check(token, @line_origin, @line_number)
    expression << whitespace unless whitespace.nil?
    expression << token
  end

  Tokens.new_substitution(@line_origin, optional, expression)
end

#pull_unquoted_text ⇒ Object

The rules here are intended to maximize convenience while avoiding confusion with real valid JSON. Basically anything that parses as JSON is treated the JSON way and otherwise we assume it’s a string and let the parser sort it out.



241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
# File 'lib/hocon/impl/tokenizer.rb', line 241

# Reads unquoted text up to end of input, a reserved character, whitespace
# or the start of a comment. The literals true, false and null are turned
# into their own tokens as soon as the text read so far spells them,
# whatever follows.
#
# @return [Object] a boolean, null or unquoted-text token
def pull_unquoted_text
  origin = @line_origin
  text = StringIO.new
  ch = next_char_raw

  until ch == -1 ||
        NOT_IN_UNQUOTED_TEXT.index(ch) ||
        self.class.whitespace?(ch) ||
        start_of_comment?(ch)
    text << ch

    # keyword check only ever matches at the very start of the token,
    # because the buffer holds everything read so far
    case text.string
    when "true" then return Tokens.new_boolean(origin, true)
    when "null" then return Tokens.new_null(origin)
    when "false" then return Tokens.new_boolean(origin, false)
    end

    ch = next_char_raw
  end

  # the terminating character belongs to the next token
  put_back(ch)

  Tokens.new_unquoted_text(origin, text.string)
end

#put_back(c) ⇒ Object



136
137
138
139
140
141
# File 'lib/hocon/impl/tokenizer.rb', line 136

# Hands a character back so that the next next_char_raw call returns it.
#
# @param c [String, Integer] the character (or -1) to hand back
# @return [Array] the look-ahead buffer
# @raise [ConfigBugOrBrokenError] if the buffer already holds more than two
#   entries
def put_back(c)
  raise ConfigBugOrBrokenError, "bug: putBack() three times, undesirable look-ahead" if @buffer.length > 2

  @buffer << c
end

#queue_next_token ⇒ Object



568
569
570
571
572
573
574
575
# File 'lib/hocon/impl/tokenizer.rb', line 568

# Pulls the next token and appends it to the token queue, preceded by a
# whitespace token when the whitespace saver produces one for it.
#
# @return [Array] the token queue
def queue_next_token
  token = pull_next_token(@whitespace_saver)
  pending_whitespace = @whitespace_saver.check(token, @origin, @line_number)

  @tokens.push(pending_whitespace) if pending_whitespace
  @tokens.push(token)
end

#remove ⇒ Object



596
597
598
# File 'lib/hocon/impl/tokenizer.rb', line 596

# Removal is not supported on a token stream; calling this always fails.
#
# @raise [ConfigBugOrBrokenError] always
def remove
  message = "Does not make sense to remove items from token stream"
  raise ConfigBugOrBrokenError, message
end

#start_of_comment?(c) ⇒ Boolean

Returns:

  • (Boolean)


151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
# File 'lib/hocon/impl/tokenizer.rb', line 151

# Tells whether +c+ begins a comment: either '#', or a '/' that is
# immediately followed by a second '/'. Always false when comments are not
# allowed or at end of input.
#
# The character after a '/' is only peeked at; it is put back, so this
# method never consumes input.
#
# @param c [String, Integer] the character just read, or -1 at end of input
# @return [Boolean] strictly true or false
def start_of_comment?(c)
  return false if c == -1 || !@allow_comments
  return true if c == '#'
  # any other character cannot start a comment (explicit false, not nil)
  return false unless c == '/'

  maybe_second_slash = next_char_raw
  # we want to predictably NOT consume any chars
  put_back(maybe_second_slash)
  maybe_second_slash == '/'
end

#to_list ⇒ Object



617
618
619
620
# File 'lib/hocon/impl/tokenizer.rb', line 617

# Drains the iterator into an array.
#
# @return [Array] every remaining token, in order
def to_list
  map { |remaining_token| remaining_token }
end