Class: TSJSON::Lexer

Inherits:
Object
  • Object
show all
Defined in:
lib/language/lexer/lexer.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(source) ⇒ Lexer

Returns a new instance of Lexer.



11
12
13
14
15
16
17
18
19
# File 'lib/language/lexer/lexer.rb', line 11

# Builds a lexer positioned at the very beginning of +source+.
#
# A synthetic start-of-file token is created and installed as both the
# current token and the last token. Line counting starts at 1 and the first
# line is recorded as beginning at offset 0.
#
# @param source [Object] the source to tokenize; other methods of this class
#   read its text through +source.body+
def initialize(source)
  sof_token = Token.new(TokenKind::SOF, 0, 0, 0, 0, nil)

  self.source = source
  self.line = 1
  self.line_start = 0
  self.token = sof_token
  self.last_token = sof_token
end

Instance Attribute Details

#last_token ⇒ Object

Returns the value of attribute last_token.



9
10
11
# File 'lib/language/lexer/lexer.rb', line 9

# Reader for the +@last_token+ instance variable.
#
# @return [Object] the stored token; +initialize+ seeds it with the
#   start-of-file token and +advance+ copies the current token into it
#   before moving on.
def last_token
  @last_token
end

#line ⇒ Object

Returns the value of attribute line.



9
10
11
# File 'lib/language/lexer/lexer.rb', line 9

# Reader for the +@line+ instance variable.
#
# @return [Object] the stored line counter; +initialize+ sets it to 1 and
#   +readToken+ adds 1 for every line terminator it skips.
def line
  @line
end

#line_start ⇒ Object

Returns the value of attribute line_start.



9
10
11
# File 'lib/language/lexer/lexer.rb', line 9

# Reader for the +@line_start+ instance variable.
#
# @return [Object] the stored offset; +initialize+ sets it to 0 and
#   +readToken+ moves it to the position just past each line terminator,
#   then uses it to compute a token's column as +1 + pos - line_start+.
def line_start
  @line_start
end

#source ⇒ Object

Returns the value of attribute source.



9
10
11
# File 'lib/language/lexer/lexer.rb', line 9

# Reader for the +@source+ instance variable.
#
# @return [Object] the source handed to +initialize+; +readToken+ reads the
#   text to tokenize from its +body+.
def source
  @source
end

#token ⇒ Object

Returns the value of attribute token.



9
10
11
# File 'lib/language/lexer/lexer.rb', line 9

# Reader for the +@token+ instance variable.
#
# @return [Object] the current token; +initialize+ sets it to the
#   start-of-file token and +advance+ replaces it with the result of
#   +lookahead+.
def token
  @token
end

Instance Method Details

#advance ⇒ Object



21
22
23
24
# File 'lib/language/lexer/lexer.rb', line 21

# Moves the lexer forward by one token.
#
# The current token is remembered as +last_token+, then the result of
# +lookahead+ becomes the new current token.
#
# @return [Object] the new current token
def advance
  consumed = token
  self.last_token = consumed
  self.token = lookahead
end

#char2hex(a) ⇒ Object



436
437
438
439
440
441
442
443
444
445
446
# File 'lib/language/lexer/lexer.rb', line 436

# Converts the character code of a hexadecimal digit into its numeric value.
#
# @param a [Numeric] character code to decode
# @return [Integer] 0..15 for the codes of 0-9, A-F and a-f; -1 for any
#   other code
def char2hex(a)
  return a - 48 if a >= 48 && a <= 57 # 0-9
  return a - 55 if a >= 65 && a <= 70 # A-F
  return a - 87 if a >= 97 && a <= 102 # a-f

  -1
end

#char_code(str) ⇒ Object



153
154
155
# File 'lib/language/lexer/lexer.rb', line 153

# Returns the character code of the first character of +str+.
#
# Thin wrapper that delegates to +char_code_at+ with position 0.
#
# @param str [String] text whose first character is inspected
# @return [Object] whatever +char_code_at(str, 0)+ returns
def char_code(str)
  char_code_at(str, 0)
end

#char_code_at(str, pos) ⇒ Object



147
148
149
150
151
# File 'lib/language/lexer/lexer.rb', line 147

# Returns the code point of the character at +pos+ in +str+.
#
# Reading past the end of the string yields NaN, which the rest of the
# lexer treats as "no character". Any StandardError raised while reading
# the character is also reported as NaN. Negative positions index from the
# end of the string, as String#[] does.
#
# @param str [String] text to read from
# @param pos [Integer, nil] character index; +nil+ is treated as 0
# @return [Integer, Float] the code point, or +Float::NAN+ when there is no
#   readable character at that position
def char_code_at(str, pos)
  index = pos || 0
  char = str[index]
  return Float::NAN if char.nil?

  char.ord
rescue StandardError
  Float::NAN
end

#is_name_start(code) ⇒ Object



423
424
425
426
427
# File 'lib/language/lexer/lexer.rb', line 423

# Tells whether +code+ may begin a name: an underscore or an ASCII letter.
#
# @param code [Numeric] character code to classify (NaN is never a name start)
# @return [Boolean] true for "_", "A".."Z" and "a".."z"
def is_name_start(code)
  return true if code == 95 # _

  uppercase = code >= 65 && code <= 90 # A-Z
  lowercase = code >= 97 && code <= 122 # a-z
  uppercase || lowercase
end

#is_nan?(val) ⇒ Boolean

Returns:

  • (Boolean)


448
449
450
# File 'lib/language/lexer/lexer.rb', line 448

# Tells whether +val+ is the floating point NaN value.
#
# Values that are not Floats are never NaN, so they can be passed safely.
#
# @param val [Object] value to test
# @return [Boolean] true only when +val+ is a Float and is NaN
def is_nan?(val)
  return false unless val.is_a?(Float)

  val.nan?
end

#lookahead ⇒ Object



26
27
28
29
30
31
32
33
34
35
36
37
# File 'lib/language/lexer/lexer.rb', line 26

# Finds the next significant token after the current one without making it
# the current token.
#
# Tokens are chained through +next+: a successor that is already linked is
# reused, otherwise one is produced by +readToken+ and stored on its
# predecessor. Comment tokens are stepped over. When the current token is
# already EOF, it is returned unchanged.
#
# @return [Object] the first following token whose kind is not COMMENT, or
#   the current token if it is EOF
def lookahead
  current = token
  return current if current.kind == TokenKind::EOF

  loop do
    current = (current.next ||= readToken(current))
    return current if current.kind != TokenKind::COMMENT
  end
end


171
172
173
174
175
176
177
178
179
180
181
182
183
184
# File 'lib/language/lexer/lexer.rb', line 171

# Renders a character code for use in error messages.
#
# Character codes in this lexer are full Unicode code points (they come from
# String#ord), so values above 0xFFFF are possible. Those are printed with
# all of their hex digits in "\u{...}" form instead of being cut down to
# their last four digits, which would name a different character.
#
# @param code [Integer, Float] a code point, or NaN for "no character"
# @return [Object] +TokenKind::EOF+ for NaN; otherwise a String: the
#   JSON-quoted character for codes below 0x7F, a quoted "\uXXXX" escape for
#   other codes up to 0xFFFF, and a quoted "\u{X...}" escape above that
def print_char_code(code)
  return TokenKind::EOF if is_nan?(code)
  return code.chr.to_json if code < 0x007f

  hex = code.to_s(16).upcase
  if code > 0xffff
    "\"\\u{#{hex}}\""
  else
    # Zero-pad to the conventional four digits.
    "\"\\u#{hex.rjust(4, '0')}\""
  end
end

#read_comment(source, start, line, col, prev) ⇒ Object



214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
# File 'lib/language/lexer/lexer.rb', line 214

# Reads a comment token beginning at +start+.
#
# Scanning begins at the character after +start+ and continues until the end
# of the input or the first control character other than a tab. The token
# value is the text following the first two characters at +start+.
#
# @param source [Object] source whose +body+ holds the text being lexed
# @param start [Integer] index where the comment begins
# @param line [Integer] line number recorded on the token
# @param col [Integer] column number recorded on the token
# @param prev [Object] previous token, recorded on the new token
# @return [Token] a COMMENT token spanning +start+ up to (not including) the
#   terminating character
def read_comment(source, start, line, col, prev)
  text = source.body
  finish = start + 1

  loop do
    code = char_code_at(text, finish)
    break if is_nan?(code)
    break unless code > 0x001f || code == 0x0009

    finish += 1
  end

  Token.new(
    TokenKind::COMMENT,
    start,
    finish,
    line,
    col,
    prev,
    text[(start + 2)...finish]
  )
end

#read_digits(source, start, firstCode) ⇒ Object



402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
# File 'lib/language/lexer/lexer.rb', line 402

# Consumes a run of one or more decimal digits.
#
# @param source [Object] source whose +body+ holds the text being lexed
# @param start [Integer] index of the first character of the run
# @param firstCode [Numeric] character code found at +start+
# @return [Integer] index of the first character after the digits
# @raise [TSJSONSyntaxError] when +firstCode+ is not a digit
def read_digits(source, start, firstCode)
  text = source.body

  unless firstCode >= 48 && firstCode <= 57 # 0 - 9
    raise TSJSONSyntaxError.syntax_error(
            source,
            start,
            "Invalid number, expected digit but got: #{print_char_code(firstCode)}."
          )
  end

  cursor = start
  loop do
    cursor += 1
    code = char_code_at(text, cursor)
    return cursor unless code >= 48 && code <= 57 # 0 - 9
  end
end

#read_name(source, start, line, col, prev) ⇒ Object



186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
# File 'lib/language/lexer/lexer.rb', line 186

# Reads a name token beginning at +start+.
#
# The character at +start+ is taken as given; scanning then continues over
# underscores, ASCII digits and ASCII letters until any other character or
# the end of the input is reached.
#
# @param source [Object] source whose +body+ holds the text being lexed
# @param start [Integer] index of the first character of the name
# @param line [Integer] line number recorded on the token
# @param col [Integer] column number recorded on the token
# @param prev [Object] previous token, recorded on the new token
# @return [Token] a NAME token whose value is the scanned text
def read_name(source, start, line, col, prev)
  text = source.body
  limit = text.length
  finish = start + 1

  until finish == limit
    code = char_code_at(text, finish)
    break if is_nan?(code)

    name_char =
      code == 95 || (code >= 48 && code <= 57) ||
        (code >= 65 && code <= 90) || (code >= 97 && code <= 122)
    break unless name_char

    finish += 1
  end

  Token.new(
    TokenKind::NAME,
    start,
    finish,
    line,
    col,
    prev,
    text[start...finish]
  )
end

#read_number(source, start, firstCode, line, col, prev) ⇒ Object



334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
# File 'lib/language/lexer/lexer.rb', line 334

# Reads an integer or floating point number token beginning at +start+.
#
# Accepted shape: an optional "-", then either a single "0" or a run of
# digits, then an optional fraction ("." followed by digits), then an
# optional exponent ("e" or "E", an optional sign, and digits). A fraction
# or an exponent makes the token a FLOAT; otherwise it is an INT.
#
# @param source [Object] source whose +body+ holds the text being lexed
# @param start [Integer] index of the first character of the number
# @param firstCode [Numeric] character code found at +start+
# @param line [Integer] line number recorded on the token
# @param col [Integer] column number recorded on the token
# @param prev [Object] previous token, recorded on the new token
# @return [Token] an INT or FLOAT token whose value is the scanned text
# @raise [TSJSONSyntaxError] when a digit follows a leading 0, when a digit
#   is missing where one is required, or when the number is directly
#   followed by "." or a name-start character
def read_number(source, start, firstCode, line, col, prev)
  text = source.body
  cursor = start
  code = firstCode
  float = false

  if code == 45 # -
    cursor += 1
    code = char_code_at(text, cursor)
  end

  if code == 48 # 0
    cursor += 1
    code = char_code_at(text, cursor)
    if code >= 48 && code <= 57
      raise TSJSONSyntaxError.syntax_error(
              source,
              cursor,
              "Invalid number, unexpected digit after 0: #{print_char_code(code)}."
            )
    end
  else
    cursor = read_digits(source, cursor, code)
    code = char_code_at(text, cursor)
  end

  if code == 46 # .
    float = true
    cursor += 1
    cursor = read_digits(source, cursor, char_code_at(text, cursor))
    code = char_code_at(text, cursor)
  end

  if code == 69 || code == 101 # E e
    float = true
    cursor += 1
    code = char_code_at(text, cursor)
    if code == 43 || code == 45 # + -
      cursor += 1
      code = char_code_at(text, cursor)
    end
    cursor = read_digits(source, cursor, code)
    code = char_code_at(text, cursor)
  end

  # Numbers cannot be followed by . or NameStart
  if code == 46 || is_name_start(code)
    raise TSJSONSyntaxError.syntax_error(
            source,
            cursor,
            "Invalid number, expected digit but got: #{print_char_code(code)}."
          )
  end

  kind = float ? TokenKind::FLOAT : TokenKind::INT
  Token.new(kind, start, cursor, line, col, prev, text[start...cursor])
end

#read_string(source, start, line, col, prev) ⇒ Object



237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
# File 'lib/language/lexer/lexer.rb', line 237

# Reads a double-quoted string token beginning at +start+.
#
# The token value is the decoded content between the quotes. Supported
# escapes are \" \/ \\ \b \f \n \r \t and \uXXXX (four hex digits).
#
# @param source [Object] source whose +body+ holds the text being lexed
# @param start [Integer] index of the opening quote
# @param line [Integer] line number recorded on the token
# @param col [Integer] column number recorded on the token
# @param prev [Object] previous token, recorded on the new token
# @return [Token] a STRING token ending just past the closing quote
# @raise [TSJSONSyntaxError] for an unterminated string, a control character
#   inside the string, or an invalid escape sequence
def read_string(source, start, line, col, prev)
  body = source.body
  position = start + 1
  chunk_start = position
  value = +''

  while position < body.length
    code = char_code_at(body, position)
    # A line terminator (or an unreadable character) ends the scan; the
    # string is then reported as unterminated below.
    break if is_nan?(code) || code == 0x000a || code == 0x000d

    # Closing Quote (")
    if code == 34
      value << body[chunk_start...position]
      return(
        Token.new(
          TokenKind::STRING,
          start,
          position + 1,
          line,
          col,
          prev,
          value
        )
      )
    end

    # SourceCharacter
    if code < 0x0020 && code != 0x0009
      raise TSJSONSyntaxError.syntax_error(
              source,
              position,
              "Invalid character within String: #{print_char_code(code)}."
            )
    end

    position += 1
    next unless code == 92 # \

    # Flush the literal text that precedes the backslash.
    value << body[chunk_start...(position - 1)]
    code = char_code_at(body, position)
    case code
    when 34
      value << '"'
    when 47
      value << '/'
    when 92
      value << '\\'
    # The control characters below must be double-quoted: in a single-quoted
    # Ruby literal '\n' is a backslash followed by "n", not a newline.
    when 98
      value << "\b"
    when 102
      value << "\f"
    when 110
      value << "\n"
    when 114
      value << "\r"
    when 116
      value << "\t"
    when 117 # u
      char_code =
        uniCharCode(
          char_code_at(body, position + 1),
          char_code_at(body, position + 2),
          char_code_at(body, position + 3),
          char_code_at(body, position + 4)
        )
      # A negative result means a non-hex digit. 0xD800..0xDFFF are surrogate
      # halves, which cannot be encoded as UTF-8 (Integer#chr would raise
      # RangeError), so they are rejected as invalid escapes as well.
      if char_code < 0 || (char_code >= 0xd800 && char_code <= 0xdfff)
        invalid_sequence = body[position + 1..position + 4]
        raise TSJSONSyntaxError.syntax_error(
                source,
                position,
                "Invalid character escape sequence: \\u#{invalid_sequence}."
              )
      end
      value << char_code.chr(Encoding::UTF_8)
      position += 4
    else
      # The input ended right after the backslash: there is no character to
      # report, so fall through to the unterminated-string error.
      break if is_nan?(code)

      raise TSJSONSyntaxError.syntax_error(
              source,
              position,
              "Invalid character escape sequence: \\#{code.chr(Encoding::UTF_8)}."
            )
    end
    position += 1
    chunk_start = position
  end

  raise TSJSONSyntaxError.syntax_error(
          source,
          position,
          'Unterminated string.'
        )
end

#readToken(prev) ⇒ Object



39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
# File 'lib/language/lexer/lexer.rb', line 39

# Reads the token that follows +prev+ in the source text.
#
# Whitespace (BOM, tab, space) and line terminators are skipped; every line
# terminator increments +line+ and moves +line_start+ to the offset just
# past it, so each token's column is computed as +1 + pos - line_start+.
#
# @param prev [Object] the preceding token; scanning starts at its +end_pos+
# @return [Token] the next token (comments included), or an EOF token when
#   only whitespace remains
# @raise [TSJSONSyntaxError] when a character cannot start any token,
#   including a "/" that is not followed by a second "/"
def readToken(prev)
  lexer = self
  source = lexer.source
  body = source.body
  body_length = body.length

  pos = prev.end_pos
  while pos < body_length
    code = char_code_at(body, pos)

    line = lexer.line
    col = 1 + pos - lexer.line_start

    # SourceCharacter
    case code
    when 0xfeff, 9, 32 # BOM, tab, space
      pos += 1
      next
    when 10 # \n
      pos += 1
      lexer.line += 1
      lexer.line_start = pos
      next
    when 13 # \r, optionally followed by \n
      pos += (char_code_at(body, pos + 1) == 10 ? 2 : 1)
      lexer.line += 1
      lexer.line_start = pos
      next
    when char_code('/')
      if char_code_at(body, pos + 1) == char_code('/')
        return read_comment(source, pos, line, col, prev)
      end
      # A lone "/" is not a token. Fall through to the syntax error below;
      # a `break` here would leave the loop and silently produce an EOF
      # token, discarding the rest of the input.
    when char_code(',')
      return Token.new(TokenKind::COMMA, pos, pos + 1, line, col, prev)
    when char_code('&')
      return Token.new(TokenKind::AMP, pos, pos + 1, line, col, prev)
    when char_code('(')
      return Token.new(TokenKind::PAREN_L, pos, pos + 1, line, col, prev)
    when char_code(')')
      return Token.new(TokenKind::PAREN_R, pos, pos + 1, line, col, prev)
    when char_code(':')
      return Token.new(TokenKind::COLON, pos, pos + 1, line, col, prev)
    when char_code(';')
      return Token.new(TokenKind::SEMICOLON, pos, pos + 1, line, col, prev)
    when char_code('=')
      return Token.new(TokenKind::EQUALS, pos, pos + 1, line, col, prev)
    when char_code('<')
      return Token.new(TokenKind::CHEVRON_L, pos, pos + 1, line, col, prev)
    when char_code('>')
      return Token.new(TokenKind::CHEVRON_R, pos, pos + 1, line, col, prev)
    when char_code('[')
      return Token.new(TokenKind::BRACKET_L, pos, pos + 1, line, col, prev)
    when char_code(']')
      return Token.new(TokenKind::BRACKET_R, pos, pos + 1, line, col, prev)
    when char_code('{')
      return Token.new(TokenKind::BRACE_L, pos, pos + 1, line, col, prev)
    when char_code('|')
      return Token.new(TokenKind::PIPE, pos, pos + 1, line, col, prev)
    when char_code('}')
      return Token.new(TokenKind::BRACE_R, pos, pos + 1, line, col, prev)
    when char_code('.')
      return Token.new(TokenKind::DOT, pos, pos + 1, line, col, prev)
    when char_code('?')
      return(
        Token.new(TokenKind::QUESTION_MARK, pos, pos + 1, line, col, prev)
      )
    when char_code('"')
      return read_string(source, pos, line, col, prev)
    when 45, 48..57 # - 0-9
      return read_number(source, pos, code, line, col, prev)
    when 65..90, 95, 97..122 # A-Z _ a-z
      return read_name(source, pos, line, col, prev)
    end

    raise TSJSONSyntaxError.syntax_error(
            source,
            pos,
            unexpectedCharacterMessage(code)
          )
  end

  line = lexer.line
  col = 1 + pos - lexer.line_start
  Token.new(TokenKind::EOF, body_length, body_length, line, col, prev)
end

#unexpectedCharacterMessage(code) ⇒ Object



157
158
159
160
161
162
163
164
165
166
167
168
169
# File 'lib/language/lexer/lexer.rb', line 157

# Builds the error message for a character that cannot start any token.
#
# Three cases are distinguished: control characters other than tab, line
# feed and carriage return; the single quote (with a hint to use double
# quotes); and every other character.
#
# @param code [Numeric] character code of the offending character
# @return [String] a human-readable message
def unexpectedCharacterMessage(code)
  if code < 0x0020 && code != 0x0009 && code != 0x000a && code != 0x000d
    "Cannot contain the invalid character #{print_char_code(code)}."
  elsif code == 39
    'Unexpected single quote character (\'), did you mean to use a double quote (")?'
  else
    "Cannot parse the unexpected character #{print_char_code(code)}."
  end
end

#uniCharCode(a, b, c, d) ⇒ Object



429
430
431
432
433
434
# File 'lib/language/lexer/lexer.rb', line 429

# Combines the character codes of four hexadecimal digits into one number,
# most significant digit first.
#
# Each digit is decoded with +char2hex+, which yields -1 for a character
# that is not a hex digit; any such digit makes the combined result
# negative.
#
# @param a [Numeric] code of the first (most significant) digit
# @param b [Numeric] code of the second digit
# @param c [Numeric] code of the third digit
# @param d [Numeric] code of the fourth (least significant) digit
# @return [Integer] the 16-bit value, or a negative number if any digit is
#   invalid
def uniCharCode(a, b, c, d)
  [a, b, c, d].reduce(0) { |acc, digit| (acc << 4) | char2hex(digit) }
end