Class: TSJSON::Lexer

Inherits:

Object

Object
TSJSON::Lexer

show all

Defined in: lib/language/lexer/lexer.rb

Instance Attribute Summary collapse

#last_token ⇒ Object

Returns the value of attribute last_token.
#line ⇒ Object

Returns the value of attribute line.
#line_start ⇒ Object

Returns the value of attribute line_start.
#source ⇒ Object

Returns the value of attribute source.
#token ⇒ Object

Returns the value of attribute token.

Instance Method Summary collapse

Constructor Details

#initialize(source) ⇒ `Lexer`

Returns a new instance of Lexer.

# File 'lib/language/lexer/lexer.rb', line 11

# Builds a lexer for +source+ that has not produced any real token yet.
#
# The current token and the previous token both start out as one shared
# start-of-file token (all positions zero, no predecessor). Line tracking
# starts at line 1, with that line beginning at offset 0.
#
# @param source [Object] the input to tokenize; other methods of this class
#   read its text through +source.body+
def initialize(source)
  sof_token = Token.new(TokenKind::SOF, 0, 0, 0, 0, nil)

  self.source = source
  # Same object on purpose: nothing has been consumed yet.
  self.last_token = self.token = sof_token
  self.line = 1
  self.line_start = 0
end

Instance Attribute Details

#last_token ⇒ `Object`

Returns the value of attribute last_token.



9
10
11

# File 'lib/language/lexer/lexer.rb', line 9

# Reader for the +last_token+ attribute.
#
# The constructor assigns the start-of-file token to this attribute and
# #advance assigns the token that was current just before it moved on.
#
# @return [Object] the stored value of +@last_token+
def last_token
  @last_token
end

#line ⇒ `Object`

Returns the value of attribute line.



9
10
11

# File 'lib/language/lexer/lexer.rb', line 9

# Reader for the +line+ attribute.
#
# The constructor assigns 1 to this attribute and #readToken adds one each
# time it steps over a line terminator (LF, CR or CRLF).
#
# @return [Object] the stored value of +@line+
def line
  @line
end

#line_start ⇒ `Object`

Returns the value of attribute line_start.



9
10
11

# File 'lib/language/lexer/lexer.rb', line 9

# Reader for the +line_start+ attribute.
#
# The constructor assigns 0 to this attribute; #readToken assigns the
# position right after each line terminator and subtracts it from a
# position to obtain a 1-based column.
#
# @return [Object] the stored value of +@line_start+
def line_start
  @line_start
end

#source ⇒ `Object`

Returns the value of attribute source.



9
10
11

# File 'lib/language/lexer/lexer.rb', line 9

# Reader for the +source+ attribute.
#
# The constructor assigns its argument to this attribute; #readToken reads
# the text to tokenize through +source.body+.
#
# @return [Object] the stored value of +@source+
def source
  @source
end

#token ⇒ `Object`

Returns the value of attribute token.



9
10
11

# File 'lib/language/lexer/lexer.rb', line 9

# Reader for the +token+ attribute.
#
# The constructor assigns the start-of-file token to this attribute and
# #advance replaces it with the result of #lookahead.
#
# @return [Object] the stored value of +@token+
def token
  @token
end

Instance Method Details

#advance ⇒ `Object`

# File 'lib/language/lexer/lexer.rb', line 21

# Moves the lexer forward by one token.
#
# The token that was current becomes +last_token+, and the result of
# #lookahead becomes the new current token.
#
# @return [Object] the new current token
def advance
  previous = token
  self.last_token = previous
  # lookahead still sees +previous+ as the current token at this point.
  self.token = lookahead
end

#char2hex(a) ⇒ `Object`

# File 'lib/language/lexer/lexer.rb', line 436

# Converts the character code of one hexadecimal digit into its value.
#
# @param a [Numeric] character code to convert
# @return [Numeric] 0..15 for the codes of "0"-"9", "A"-"F" and "a"-"f";
#   -1 for any other code (including NaN, for which every comparison fails)
def char2hex(a)
  return a - 48 if a >= 48 && a <= 57  # "0".."9"
  return a - 55 if a >= 65 && a <= 70  # "A".."F"
  return a - 87 if a >= 97 && a <= 102 # "a".."f"

  -1
end

#char_code(str) ⇒ `Object`



153
154
155

# File 'lib/language/lexer/lexer.rb', line 153

# Returns the character code at index 0 of +str+ by delegating to
# #char_code_at with position 0.
#
# @param str [String] string whose first character is inspected
# @return [Object] whatever #char_code_at returns for position 0
def char_code(str)
  char_code_at(str, 0)
end

#char_code_at(str, pos) ⇒ `Object`

# File 'lib/language/lexer/lexer.rb', line 147

# Returns the code point of the character at index +pos+ of +str+.
#
# Never raises for a StandardError: a position with no character, or any
# error while indexing or converting, produces +Float::NAN+ instead.
#
# @param str [String] text to index
# @param pos [Integer, nil] index into +str+; +nil+ is treated as 0
# @return [Integer, Float] the code point, or +Float::NAN+ when unavailable
def char_code_at(str, pos)
  char = str[pos || 0]
  # Past-the-end lookups yield nil; answer NaN directly instead of relying
  # on the rescue below.
  char ? char.ord : Float::NAN
rescue StandardError
  Float::NAN
end

#is_name_start(code) ⇒ `Object`

# File 'lib/language/lexer/lexer.rb', line 423

# Tells whether +code+ is the character code of "_", "A"-"Z" or "a"-"z".
#
# @param code [Numeric] character code to classify
# @return [Boolean] true for those codes, false otherwise (NaN is false
#   because every comparison with it fails)
def is_name_start(code)
  return true if code == 95 # _

  upper_case = code >= 65 && code <= 90
  upper_case || (code >= 97 && code <= 122)
end

#is_nan?(val) ⇒ `Boolean`

Returns:

(Boolean)



448
449
450

# File 'lib/language/lexer/lexer.rb', line 448

# Tells whether +val+ is a Float holding NaN.
#
# @param val [Object] any value
# @return [Boolean] true only for a Float whose +nan?+ is true; false for
#   every non-Float value
def is_nan?(val)
  return false unless val.is_a?(Float)

  val.nan?
end

#lookahead ⇒ `Object`

# File 'lib/language/lexer/lexer.rb', line 26

# Finds the next token after the current one that is not a comment,
# without changing the lexer's current token.
#
# When the current token is already EOF it is returned unchanged.
# Otherwise the chain of +next+ links is followed; a link that is missing
# is filled in by calling #readToken and stored on the token, so repeated
# calls do not lex the same input twice.
#
# @return [Object] the next non-comment token, or the current token at EOF
def lookahead
  current = token
  return current if current.kind == TokenKind::EOF

  loop do
    # Reuse a successor that was lexed earlier; read one only when missing.
    current = (current.next ||= readToken(current))
    break unless current.kind == TokenKind::COMMENT
  end
  current
end

#print_char_code(code) ⇒ `Object`

# File 'lib/language/lexer/lexer.rb', line 171

# Renders a character code for use in an error message.
#
# * NaN (what #char_code_at yields at end of input) gives +TokenKind::EOF+.
# * Codes below 0x7F give the JSON encoding of that single character.
# * Codes from 0x7F to 0xFFFF give a quoted +\uXXXX+ escape with four
#   upper-case hex digits.
# * Codes above 0xFFFF give a quoted +\u{X...}+ escape with all hex digits.
#   Four digits cannot hold such a code point; cutting it down to its last
#   four digits would name a different character.
#
# @param code [Integer, Float] character code, or NaN for end of input
# @return [Object] +TokenKind::EOF+ for NaN, otherwise the quoted String
def print_char_code(code)
  return TokenKind::EOF if is_nan?(code)
  return code.chr.to_json if code < 0x007f

  hex = code.to_s(16).upcase
  if code > 0xffff
    "\"\\u{#{hex}}\""
  else
    "\"\\u#{hex.rjust(4, '0')}\""
  end
end

#read_comment(source, start, line, col, prev) ⇒ `Object`

# File 'lib/language/lexer/lexer.rb', line 214

# Reads a comment token beginning at +start+.
#
# Scanning starts at the character after +start+ and continues while the
# character is a tab or has a code above 0x1F; it stops at the first other
# character or at end of input. The token's value is the text from
# +start + 2+ up to (not including) the stopping position, i.e. the two
# leading characters are left out.
#
# @param source [Object] source whose +body+ holds the text
# @param start [Integer] position where the comment begins
# @param line [Integer] line number recorded on the token
# @param col [Integer] column number recorded on the token
# @param prev [Object] previous token recorded on the token
# @return [Token] a token of kind +TokenKind::COMMENT+
def read_comment(source, start, line, col, prev)
  body = source.body
  in_comment = lambda do |code|
    !is_nan?(code) && (code > 0x001f || code == 0x0009)
  end

  finish = start + 1
  finish += 1 while in_comment.call(char_code_at(body, finish))

  Token.new(
    TokenKind::COMMENT,
    start,
    finish,
    line,
    col,
    prev,
    body[start + 2...finish]
  )
end

#read_digits(source, start, firstCode) ⇒ `Object`

# File 'lib/language/lexer/lexer.rb', line 402

# Skips a run of decimal digits that must begin at +start+.
#
# @param source [Object] source whose +body+ holds the text
# @param start [Integer] position of the first expected digit
# @param firstCode [Numeric] character code found at +start+
# @return [Integer] position of the first character after the digits
# @raise [Object] the error built by +TSJSONSyntaxError.syntax_error+ when
#   +firstCode+ is not the code of "0".."9"
def read_digits(source, start, firstCode)
  body = source.body
  digit = ->(code) { code >= 48 && code <= 57 } # 0 - 9

  unless digit.call(firstCode)
    raise TSJSONSyntaxError.syntax_error(
            source,
            start,
            "Invalid number, expected digit but got: #{print_char_code(firstCode)}."
          )
  end

  finish = start + 1
  finish += 1 while digit.call(char_code_at(body, finish))
  finish
end

#read_name(source, start, line, col, prev) ⇒ `Object`

# File 'lib/language/lexer/lexer.rb', line 186

# Reads a name token beginning at +start+.
#
# The character at +start+ is accepted as is; the name then extends over
# every following "_", "0"-"9", "A"-"Z" or "a"-"z" until another character
# or the end of the body is reached.
#
# @param source [Object] source whose +body+ holds the text
# @param start [Integer] position of the first character of the name
# @param line [Integer] line number recorded on the token
# @param col [Integer] column number recorded on the token
# @param prev [Object] previous token recorded on the token
# @return [Token] a token of kind +TokenKind::NAME+ whose value is the name
def read_name(source, start, line, col, prev)
  body = source.body
  limit = body.length
  finish = start + 1

  until finish == limit
    code = char_code_at(body, finish)
    break if is_nan?(code)

    name_char =
      code == 95 || (code >= 48 && code <= 57) ||
      (code >= 65 && code <= 90) || (code >= 97 && code <= 122)
    break unless name_char

    finish += 1
  end

  Token.new(
    TokenKind::NAME,
    start,
    finish,
    line,
    col,
    prev,
    body[start...finish]
  )
end

#read_number(source, start, firstCode, line, col, prev) ⇒ `Object`

# File 'lib/language/lexer/lexer.rb', line 334

# Reads an integer or float token beginning at +start+.
#
# Accepted shape, in order: an optional "-", then either a single "0" or a
# run of digits, then an optional "." followed by digits, then an optional
# "E"/"e" with an optional sign followed by digits. A fraction or exponent
# makes the token a float. The number may not be followed directly by "."
# or by a name-start character.
#
# @param source [Object] source whose +body+ holds the text
# @param start [Integer] position of the first character of the number
# @param firstCode [Numeric] character code found at +start+
# @param line [Integer] line number recorded on the token
# @param col [Integer] column number recorded on the token
# @param prev [Object] previous token recorded on the token
# @return [Token] a token of kind +TokenKind::FLOAT+ or +TokenKind::INT+
#   whose value is the number's source text
# @raise [Object] the error built by +TSJSONSyntaxError.syntax_error+ for a
#   digit after a leading 0, a missing digit, or a forbidden follower
def read_number(source, start, firstCode, line, col, prev)
  body = source.body
  cursor = start
  code = firstCode
  float = false

  # Optional leading minus sign.
  if code == 45
    cursor += 1
    code = char_code_at(body, cursor)
  end

  if code == 48
    # A leading zero must stand alone.
    cursor += 1
    code = char_code_at(body, cursor)
    if code >= 48 && code <= 57
      raise TSJSONSyntaxError.syntax_error(
              source,
              cursor,
              "Invalid number, unexpected digit after 0: #{print_char_code(code)}."
            )
    end
  else
    cursor = read_digits(source, cursor, code)
    code = char_code_at(body, cursor)
  end

  # Fraction: "." followed by at least one digit.
  if code == 46
    float = true
    cursor += 1
    cursor = read_digits(source, cursor, char_code_at(body, cursor))
    code = char_code_at(body, cursor)
  end

  # Exponent: "E" or "e", optional "+" or "-", at least one digit.
  if code == 69 || code == 101
    float = true
    cursor += 1
    code = char_code_at(body, cursor)
    if code == 43 || code == 45
      cursor += 1
      code = char_code_at(body, cursor)
    end
    cursor = read_digits(source, cursor, code)
    code = char_code_at(body, cursor)
  end

  # Numbers cannot be followed by . or NameStart
  if code == 46 || is_name_start(code)
    raise TSJSONSyntaxError.syntax_error(
            source,
            cursor,
            "Invalid number, expected digit but got: #{print_char_code(code)}."
          )
  end

  kind = float ? TokenKind::FLOAT : TokenKind::INT
  Token.new(kind, start, cursor, line, col, prev, body[start...cursor])
end

#read_string(source, start, line, col, prev) ⇒ `Object`

# File 'lib/language/lexer/lexer.rb', line 237

# Reads a double-quoted string token beginning at +start+ and decodes its
# escape sequences.
#
# Supported escapes: \" \/ \\ \b \f \n \r \t and \uXXXX (four hex digits).
# Literal text between escapes is copied in chunks: the pending chunk is
# appended whenever a backslash or the closing quote is reached.
#
# @param source [Object] source whose +body+ holds the text
# @param start [Integer] position of the opening quote
# @param line [Integer] line number recorded on the token
# @param col [Integer] column number recorded on the token
# @param prev [Object] previous token recorded on the token
# @return [Token] a token of kind +TokenKind::STRING+ whose value is the
#   decoded text and whose end position is just past the closing quote
# @raise [Object] the error built by +TSJSONSyntaxError.syntax_error+ for a
#   control character inside the string, an unknown or malformed escape, or
#   a string that is not closed before a line terminator or end of input
def read_string(source, start, line, col, prev)
  body = source.body
  position = start + 1
  chunk_start = position
  value = +''

  while position < body.length
    code = char_code_at(body, position)
    # A raw line terminator (or running out of input) leaves the string
    # unterminated; this is reported after the loop.
    break if is_nan?(code) || code == 0x000a || code == 0x000d

    # Closing Quote (")
    if code == 34
      value << body[chunk_start...position]
      return(
        Token.new(
          TokenKind::STRING,
          start,
          position + 1,
          line,
          col,
          prev,
          value
        )
      )
    end

    # SourceCharacter
    if code < 0x0020 && code != 0x0009
      raise TSJSONSyntaxError.syntax_error(
              source,
              position,
              "Invalid character within String: #{print_char_code(code)}."
            )
    end

    position += 1
    next unless code == 92 # backslash

    # Keep the literal text that precedes the backslash; without this the
    # part of the string before every escape would be lost.
    value << body[chunk_start...position - 1]

    code = char_code_at(body, position)
    case code
    when 34
      value << '"'
    when 47
      value << '/'
    when 92
      value << '\\'
    # The control characters need double quotes: in single quotes "\n" and
    # friends would be a backslash followed by a letter.
    when 98
      value << "\b"
    when 102
      value << "\f"
    when 110
      value << "\n"
    when 114
      value << "\r"
    when 116
      value << "\t"
    when 117
      char_code =
        uniCharCode(
          char_code_at(body, position + 1),
          char_code_at(body, position + 2),
          char_code_at(body, position + 3),
          char_code_at(body, position + 4)
        )
      # Negative means a non-hex digit. Surrogate code points cannot be
      # encoded as UTF-8, so they are rejected here rather than left to fail
      # in Integer#chr with a RangeError.
      if char_code < 0 || (char_code >= 0xd800 && char_code <= 0xdfff)
        invalid_sequence = body[position + 1..position + 4]
        raise TSJSONSyntaxError.syntax_error(
                source,
                position,
                "Invalid character escape sequence: \\u#{invalid_sequence}."
              )
      end
      value << char_code.chr(Encoding::UTF_8)
      position += 4
    else
      # The code may be NaN (backslash at end of input) or above 255; a
      # plain Integer#chr would raise for both instead of reporting a
      # syntax error.
      shown = is_nan?(code) ? '' : code.chr(Encoding::UTF_8)
      raise TSJSONSyntaxError.syntax_error(
              source,
              position,
              "Invalid character escape sequence: \\#{shown}."
            )
    end
    position += 1
    chunk_start = position
  end

  raise TSJSONSyntaxError.syntax_error(
          source,
          position,
          'Unterminated string.'
        )
end

#readToken(prev) ⇒ `Object`

# File 'lib/language/lexer/lexer.rb', line 39

# Reads the token that follows +prev+ in the source body.
#
# Byte-order marks, tabs and spaces are skipped. Line terminators (LF, CR,
# CRLF) are skipped too and update +line+ / +line_start+. The first other
# character decides the token: "//" starts a comment, a double quote a
# string, "-" or a digit a number, "_" or an ASCII letter a name, and the
# punctuators listed in #punctuator_kind a one-character token. Anything
# else, including a "/" that is not followed by a second "/", is a syntax
# error. When the body is exhausted an EOF token is returned.
#
# @param prev [Object] the preceding token; lexing resumes at +prev.end_pos+
# @return [Token] the next token (comments included)
# @raise [Object] the error built by +TSJSONSyntaxError.syntax_error+ for an
#   unexpected character
def readToken(prev)
  source = self.source
  body = source.body
  body_length = body.length
  pos = prev.end_pos

  while pos < body_length
    code = char_code_at(body, pos)

    # Insignificant characters: skip and keep line bookkeeping current.
    case code
    when 0xfeff, 9, 32 # BOM, tab, space
      pos += 1
      next
    when 10 # \n
      pos += 1
      self.line += 1
      self.line_start = pos
      next
    when 13 # \r, optionally followed by \n
      pos += char_code_at(body, pos + 1) == 10 ? 2 : 1
      self.line += 1
      self.line_start = pos
      next
    end

    line = self.line
    col = 1 + pos - line_start

    case code
    when 47 # /
      # Only "//" opens a comment. A lone "/" deliberately falls through to
      # the syntax error below; leaving the loop here would return EOF and
      # silently drop the rest of the input.
      if char_code_at(body, pos + 1) == 47
        return read_comment(source, pos, line, col, prev)
      end
    when 34 # "
      return read_string(source, pos, line, col, prev)
    when 45, 48..57 # - 0-9
      return read_number(source, pos, code, line, col, prev)
    when 95, 65..90, 97..122 # _ A-Z a-z
      return read_name(source, pos, line, col, prev)
    else
      kind = punctuator_kind(code)
      return Token.new(kind, pos, pos + 1, line, col, prev) if kind
    end

    raise TSJSONSyntaxError.syntax_error(
            source,
            pos,
            unexpectedCharacterMessage(code)
          )
  end

  Token.new(
    TokenKind::EOF,
    body_length,
    body_length,
    self.line,
    1 + pos - line_start,
    prev
  )
end

# Maps the character code of a one-character punctuator to its token kind.
#
# @param code [Numeric] character code to look up
# @return [Object, nil] the matching +TokenKind+ constant, or nil when the
#   code is not a punctuator
def punctuator_kind(code)
  case code
  when 38 then TokenKind::AMP # &
  when 40 then TokenKind::PAREN_L # (
  when 41 then TokenKind::PAREN_R # )
  when 44 then TokenKind::COMMA # ,
  when 46 then TokenKind::DOT # .
  when 58 then TokenKind::COLON # :
  when 59 then TokenKind::SEMICOLON # ;
  when 60 then TokenKind::CHEVRON_L # <
  when 61 then TokenKind::EQUALS # =
  when 62 then TokenKind::CHEVRON_R # >
  when 63 then TokenKind::QUESTION_MARK # ?
  when 91 then TokenKind::BRACKET_L # [
  when 93 then TokenKind::BRACKET_R # ]
  when 123 then TokenKind::BRACE_L # {
  when 124 then TokenKind::PIPE # |
  when 125 then TokenKind::BRACE_R # }
  end
end

#unexpectedCharacterMessage(code) ⇒ `Object`

# File 'lib/language/lexer/lexer.rb', line 157

# Builds the error text for a character that cannot start any token.
#
# Three cases: a control character other than tab, LF and CR; a single
# quote (probably meant as a double quote); and every other character.
#
# @param code [Numeric] character code of the offending character
# @return [String] the message text
def unexpectedCharacterMessage(code)
  forbidden_control =
    code < 0x0020 && ![0x0009, 0x000a, 0x000d].include?(code)

  if forbidden_control
    "Cannot contain the invalid character #{print_char_code(code)}."
  elsif code == 39
    'Unexpected single quote character (\'), did you mean to use a double quote (")?'
  else
    "Cannot parse the unexpected character #{print_char_code(code)}."
  end
end

#uniCharCode(a, b, c, d) ⇒ `Object`

# File 'lib/language/lexer/lexer.rb', line 429

# Combines the character codes of four hexadecimal digits into one number,
# most significant digit first.
#
# Each code is converted with #char2hex. Because #char2hex answers -1 for
# a non-hex code, the result is negative whenever any of the four codes is
# not a hex digit.
#
# @param a [Numeric] code of the digit worth 16**3
# @param b [Numeric] code of the digit worth 16**2
# @param c [Numeric] code of the digit worth 16
# @param d [Numeric] code of the digit worth 1
# @return [Integer] the combined value, negative for invalid input
def uniCharCode(a, b, c, d)
  [a, b, c, d].reduce(0) do |combined, digit_code|
    (combined << 4) | char2hex(digit_code)
  end
end

Class: TSJSON::Lexer

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(source) ⇒ Lexer

Instance Attribute Details

#last_token ⇒ Object

#line ⇒ Object

#line_start ⇒ Object

#source ⇒ Object

#token ⇒ Object

Instance Method Details

#advance ⇒ Object

#char2hex(a) ⇒ Object

#char_code(str) ⇒ Object

#char_code_at(str, pos) ⇒ Object

#is_name_start(code) ⇒ Object

#is_nan?(val) ⇒ Boolean

#lookahead ⇒ Object

#print_char_code(code) ⇒ Object

#read_comment(source, start, line, col, prev) ⇒ Object

#read_digits(source, start, firstCode) ⇒ Object

#read_name(source, start, line, col, prev) ⇒ Object

#read_number(source, start, firstCode, line, col, prev) ⇒ Object

#read_string(source, start, line, col, prev) ⇒ Object

#readToken(prev) ⇒ Object

#unexpectedCharacterMessage(code) ⇒ Object

#uniCharCode(a, b, c, d) ⇒ Object