Class: GraphQL::Language::Lexer

Inherits:

Object

Object
GraphQL::Language::Lexer

show all

Defined in:: lib/graphql/language/lexer.rb

Defined Under Namespace

Modules: ByteFor, Punctuation

Constant Summary collapse

ESCAPES =

/\\["\\\/bfnrt]/

# Maps the text of each simple backslash escape (the sequences ESCAPES matches)
# to the single character it stands for. replace_escaped_characters_in_place
# looks the matched text up here when the match is not a \u escape.
ESCAPES_REPLACE =

{
  '\\"' => '"',
  "\\\\" => "\\",
  "\\/" => '/',
  "\\b" => "\b",
  "\\f" => "\f",
  "\\n" => "\n",
  "\\r" => "\r",
  "\\t" => "\t",
}

UTF_8 =

/\\u(?:([\dAa-f]{4})|\{([\da-f]{4,})\})(?:\\u([\dAa-f]{4}))?/i

VALID_STRING =

/\A(?:[^\\]|#{ESCAPES}|#{UTF_8})*\z/o

ESCAPED =

/(?:#{ESCAPES}|#{UTF_8})/o

IGNORE_REGEXP =

%r{
  (?:
    [, \c\r\n\t]+ |
    \#.*$
  )*
}x

IDENTIFIER_REGEXP =

/[_A-Za-z][_0-9A-Za-z]*/

INT_REGEXP =

/-?(?:[0]|[1-9][0-9]*)/

FLOAT_DECIMAL_REGEXP =

/[.][0-9]+/

FLOAT_EXP_REGEXP =

/[eE][+-]?[0-9]+/

NUMERIC_REGEXP = TODO: FLOAT_EXP_REGEXP should not be allowed to follow INT_REGEXP, integers are not allowed to have exponent parts.

/#{INT_REGEXP}(#{FLOAT_DECIMAL_REGEXP}#{FLOAT_EXP_REGEXP}|#{FLOAT_DECIMAL_REGEXP}|#{FLOAT_EXP_REGEXP})?/

# Words that KEYWORD_REGEXP is built from. When `advance` matches one of them
# it returns a dedicated token name (:ON, :QUERY, ...) instead of :IDENTIFIER.
KEYWORDS =

[
  "on",
  "fragment",
  "true",
  "false",
  "null",
  "query",
  "mutation",
  "subscription",
  "schema",
  "scalar",
  "type",
  "extend",
  "implements",
  "interface",
  "union",
  "enum",
  "input",
  "directive",
  "repeatable"
].freeze

KEYWORD_REGEXP =

/#{Regexp.union(KEYWORDS.sort)}\b/

# Lookup table from `_hash(bytes)` to a keyword's token name, where `bytes`
# combines the second and third bytes of the keyword (see `advance`).
# `_hash` masks its result with 0x1f, so the table has 32 slots; nil marks a
# slot with no keyword assigned. :ON and :SUBSCRIPTION are not stored here
# because `advance` recognises them by their length.
# Frozen, like KEYWORDS, so the table cannot be mutated at runtime.
KEYWORD_BY_TWO_BYTES =

[
  :INTERFACE,
  :MUTATION,
  :EXTEND,
  :FALSE,
  :ENUM,
  :TRUE,
  :NULL,
  nil,
  nil,
  nil,
  nil,
  nil,
  nil,
  nil,
  :QUERY,
  nil,
  nil,
  :REPEATABLE,
  :IMPLEMENTS,
  :INPUT,
  :TYPE,
  :SCHEMA,
  nil,
  nil,
  nil,
  :DIRECTIVE,
  :UNION,
  nil,
  nil,
  :SCALAR,
  nil,
  :FRAGMENT
].freeze

PUNCTUATION_NAME_FOR_BYTE = A sparse array mapping the bytes for each punctuation to a symbol name for that punctuation

Punctuation.constants.each_with_object([]) { |name, arr|
  punct = Punctuation.const_get(name)
  arr[punct.ord] = name
}

QUOTE =

'"'

UNICODE_DIGIT =

/[0-9A-Za-z]/

FOUR_DIGIT_UNICODE =

/#{UNICODE_DIGIT}{4}/

N_DIGIT_UNICODE =

%r{#{Punctuation::LCURLY}#{UNICODE_DIGIT}{4,}#{Punctuation::RCURLY}}x

UNICODE_ESCAPE =

%r{\\u(?:#{FOUR_DIGIT_UNICODE}|#{N_DIGIT_UNICODE})}

STRING_ESCAPE =

%r{[\\][\\/bfnrt]}

BLOCK_QUOTE =

'"""'

ESCAPED_QUOTE =

/\\"/

STRING_CHAR =

/#{ESCAPED_QUOTE}|[^"\\\n\r]|#{UNICODE_ESCAPE}|#{STRING_ESCAPE}/

QUOTED_STRING_REGEXP =

%r{#{QUOTE} (?:#{STRING_CHAR})* #{QUOTE}}x

BLOCK_STRING_REGEXP =

%r{
  #{BLOCK_QUOTE}
  (?: [^"\\]               |  # Any characters that aren't a quote or slash
     (?<!") ["]{1,2} (?!") |  # Any quotes that don't have quotes next to them
     \\"{0,3}(?!")         |  # A slash followed by <= 3 quotes that aren't followed by a quote
     \\                    |  # A slash
     "{1,2}(?!")              # 1 or 2 " followed by something that isn't a quote
  )*
  (?:"")?
  #{BLOCK_QUOTE}
}xm

FIRST_BYTES = Use this array to look up, for a given byte that starts a token, what kind of token it might start.

Array.new(255)

Instance Attribute Summary collapse

#pos ⇒ Object readonly
Returns the value of attribute pos.

Class Method Summary collapse

.replace_escaped_characters_in_place(raw_string) ⇒ Object
Replace any escaped unicode or whitespace with the actual characters. To avoid allocating more strings, this modifies the string passed into it.
.tokenize(string) ⇒ Object
This is not used during parsing because the parser doesn't actually need tokens.

Instance Method Summary collapse

#_hash(key) ⇒ Object
This produces a unique integer for bytes 2 and 3 of each keyword string. See https://tenderlovemaking.com/2023/09/02/fast-tokenizers-with-stringscanner.html.
#advance ⇒ Object
#column_number ⇒ Object
#debug_token_value(token_name) ⇒ Object
#eos? ⇒ Boolean
#initialize(graphql_str, filename: nil, max_tokens: nil) ⇒ Lexer constructor
A new instance of Lexer.
#line_number ⇒ Object
#raise_parse_error(message, line = line_number, col = column_number) ⇒ Object
#string_value ⇒ Object
#token_value ⇒ Object

Constructor Details

#initialize(graphql_str, filename: nil, max_tokens: nil) ⇒ `Lexer`

Returns a new instance of Lexer.

# File 'lib/graphql/language/lexer.rb', line 6

# Set up a lexer over `graphql_str`.
#
# @param graphql_str [String] the text to lex. When it is neither UTF-8 encoded
#   nor ASCII-only, a copy relabelled as UTF-8 is lexed (the caller's string is not modified).
# @param filename [String, nil] stored and handed to GraphQL::ParseError by raise_parse_error
# @param max_tokens [Integer, nil] upper bound checked by `advance`; nil means no limit
def initialize(graphql_str, filename: nil, max_tokens: nil)
  utf8_compatible = graphql_str.encoding == Encoding::UTF_8 || graphql_str.ascii_only?
  graphql_str = graphql_str.dup.force_encoding(Encoding::UTF_8) unless utf8_compatible

  @string = graphql_str
  @scanner = StringScanner.new(graphql_str)
  @filename = filename
  @max_tokens = max_tokens || Float::INFINITY
  @tokens_count = 0
  # No token has been read yet
  @pos = nil
end

Instance Attribute Details

#pos ⇒ `Object` (readonly)

Returns the value of attribute pos.



22
23
24

# File 'lib/graphql/language/lexer.rb', line 22

# Reader for @pos: nil after construction, then the scanner position that
# `advance` recorded just before consuming the most recent token.
def pos
  @pos
end

Class Method Details

.replace_escaped_characters_in_place(raw_string) ⇒ `Object`

Replace any escaped unicode or whitespace with the actual characters. To avoid allocating more strings, this modifies the string passed into it.

# File 'lib/graphql/language/lexer.rb', line 318

# Rewrite every escape sequence matched by ESCAPED inside `raw_string` with the
# character(s) it denotes. The string is mutated so no copy is allocated.
#
# - Simple escapes are looked up in ESCAPES_REPLACE.
# - `\uXXXX` / `\u{X...}` become the code point they name.
# - A `\uXXXX\uXXXX` pair that forms a leading + trailing surrogate is merged
#   into one code point; any other pair becomes two code points.
#
# @param raw_string [String] modified in place
# @return [nil]
def self.replace_escaped_characters_in_place(raw_string)
  raw_string.gsub!(ESCAPED) do |escape_text|
    captures = Regexp.last_match
    first_hex = captures[1] || captures[2]
    # No hex digits captured: this is one of the simple backslash escapes
    next ESCAPES_REPLACE[escape_text] unless first_hex

    first_point = first_hex.to_i(16)
    second_hex = captures[3]
    next [first_point].pack('U') unless second_hex

    second_point = second_hex.to_i(16)
    leading_surrogate = first_point >= 0xD800 && first_point <= 0xDBFF
    trailing_surrogate = second_point >= 0xDC00 && second_point <= 0xDFFF
    if leading_surrogate && trailing_surrogate
      # Combine the surrogate pair into the single code point it encodes
      merged = ((first_point - 0xD800) * 0x400) + (second_point - 0xDC00) + 0x10000
      [merged].pack('U')
    else
      # Two unrelated code points that merely happened to be adjacent
      [first_point].pack('U') + [second_point].pack('U')
    end
  end
  nil
end

.tokenize(string) ⇒ `Object`

This is not used during parsing because the parser doesn't actually need tokens.

# File 'lib/graphql/language/lexer.rb', line 345

# Lex `string` completely and return every token as an array of
# `[name, line, column, debug value, previous token]`.
# The last element links each token to the one before it (nil for the first).
#
# @param string [String] GraphQL text
# @return [Array<Array>] one entry per token, in source order
def self.tokenize(string)
  lexer = GraphQL::Language::Lexer.new(string)
  collected = []
  while (name = lexer.advance)
    line = lexer.line_number
    column = lexer.column_number
    value = lexer.debug_token_value(name)
    # collected.last is still the previous token here (nil on the first pass)
    collected.push([name, line, column, value, collected.last])
  end
  collected
end

Instance Method Details

#_hash(key) ⇒ `Object`

This produces a unique integer for bytes 2 and 3 of each keyword string. See https://tenderlovemaking.com/2023/09/02/fast-tokenizers-with-stringscanner.html



239
240
241

# File 'lib/graphql/language/lexer.rb', line 239

# Reduce the integer built from a keyword's second and third bytes to a slot
# number in 0..31, used by `advance` to index KEYWORD_BY_TWO_BYTES.
#
# @param key [Integer]
# @return [Integer] value between 0 and 31
def _hash(key)
  scaled = key * 18592990
  # Keep five bits starting at bit 27 of the product
  (scaled >> 27) & 0x1f
end

#advance ⇒ `Object`

# File 'lib/graphql/language/lexer.rb', line 24

# Skip ignored text (IGNORE_REGEXP) and consume the next token.
#
# The first byte of the token selects, through FIRST_BYTES, which kind of
# token to scan. @pos is set to the byte offset where the token starts.
#
# @return [Symbol, false] the token's name, or false when the input is exhausted
# @raise [GraphQL::ParseError] (via raise_parse_error) when the token limit is
#   exceeded, a number or string or `...` is malformed, a name directly follows
#   a number while GraphQL.reject_numbers_followed_by_names is set, or the
#   input contains an invalid UTF-8 byte sequence
def advance
  @scanner.skip(IGNORE_REGEXP)
  return false if @scanner.eos?
  @tokens_count += 1
  if @tokens_count > @max_tokens
    raise_parse_error("This query is too large to execute.")
  end
  @pos = @scanner.pos
  next_byte = @string.getbyte(@pos)
  next_byte_is_for = FIRST_BYTES[next_byte]
  case next_byte_is_for
  when ByteFor::PUNCTUATION
    @scanner.pos += 1
    PUNCTUATION_NAME_FOR_BYTE[next_byte]
  when ByteFor::NAME
    if (len = @scanner.skip(KEYWORD_REGEXP))
      case len
      when 2
        :ON
      when 12
        :SUBSCRIPTION
      else
        pos = @pos

        # Use bytes 2 and 3 as a unique identifier for this keyword
        bytes = (@string.getbyte(pos + 2) << 8) | @string.getbyte(pos + 1)
        KEYWORD_BY_TWO_BYTES[_hash(bytes)]
      end
    else
      @scanner.skip(IDENTIFIER_REGEXP)
      :IDENTIFIER
    end
  when ByteFor::IDENTIFIER
    @scanner.skip(IDENTIFIER_REGEXP)
    :IDENTIFIER
  when ByteFor::NUMBER
    # If the byte table sent us here but no number actually matches (for
    # example a sign with no digits), the scanner would not move and the same
    # token would be returned forever; report it instead.
    unless @scanner.skip(NUMERIC_REGEXP)
      raise_parse_error("Expected type 'number', but it was malformed")
    end

    if GraphQL.reject_numbers_followed_by_names
      # getbyte returns nil when the number is the last thing in the input;
      # indexing FIRST_BYTES with nil would raise TypeError.
      peek_byte = @string.getbyte(@scanner.pos)
      next_first_byte = peek_byte && FIRST_BYTES[peek_byte]
      if next_first_byte == ByteFor::NAME || next_first_byte == ByteFor::IDENTIFIER
        number_part = token_value
        name_part = @scanner.scan(IDENTIFIER_REGEXP)
        raise_parse_error("Name after number is not allowed (in `#{number_part}#{name_part}`)")
      end
    end
    # Check for a matched decimal:
    @scanner[1] ? :FLOAT : :INT
  when ByteFor::ELLIPSIS
    if @string.getbyte(@pos + 1) != 46 || @string.getbyte(@pos + 2) != 46
      # @pos is a byte offset, so take the three bytes with byteslice
      # (String#[] would count characters and show the wrong text after
      # any multi-byte character).
      raise_parse_error("Expected `...`, actual: #{@string.byteslice(@pos, 3).inspect}")
    end
    @scanner.pos += 3
    :ELLIPSIS
  when ByteFor::STRING
    if @scanner.skip(BLOCK_STRING_REGEXP) || @scanner.skip(QUOTED_STRING_REGEXP)
      :STRING
    else
      raise_parse_error("Expected string or block string, but it was malformed")
    end
  else
    @scanner.pos += 1
    :UNKNOWN_CHAR
  end
rescue ArgumentError => err
  if err.message == "invalid byte sequence in UTF-8"
    # Line and column are passed as nil so they are not computed on the invalid string
    raise_parse_error("Parse error on bad Unicode escape sequence", nil, nil)
  end
  # Any other ArgumentError is unexpected. Returning nil here would look like
  # end-of-input to callers and silently truncate the token stream.
  raise
end

#column_number ⇒ `Object`



158
159
160

# File 'lib/graphql/language/lexer.rb', line 158

# Column of the current token, counted in characters from the start of its
# line (the first column is 1).
#
# @pos is a byte offset (it is assigned from StringScanner#pos), while
# String#[] counts characters, so the offset is converted to a character index
# first; otherwise any multi-byte character before the token shifts the result.
# When @pos is nil (no token read yet) the whole string is examined.
#
# @return [Integer]
def column_number
  source = @scanner.string
  last_char_index = @pos && source.byteslice(0, @pos).length
  source[0..last_char_index].split("\n").last.length
end

#debug_token_value(token_name) ⇒ `Object`

# File 'lib/graphql/language/lexer.rb', line 102

# Human-readable text for the token that was just returned by `advance`.
#
# - punctuation tokens: the string stored in Lexer::Punctuation under that name
# - :ELLIPSIS: "..."
# - :STRING: the decoded value from string_value
# - when the scanner has no current match: the next character (peek)
# - anything else: the raw matched text from token_value
#
# Constant lookups pass `inherit = false` so that only constants defined on
# Lexer::Punctuation itself count. With the default, a token name that happens
# to match a top-level constant would be mistaken for punctuation.
#
# @param token_name [Symbol, nil, false]
# @return [Object] normally a String
def debug_token_value(token_name)
  if token_name && Lexer::Punctuation.const_defined?(token_name, false)
    Lexer::Punctuation.const_get(token_name, false)
  elsif token_name == :ELLIPSIS
    "..."
  elsif token_name == :STRING
    string_value
  elsif @scanner.matched_size.nil?
    @scanner.peek(1)
  else
    token_value
  end
end

#eos? ⇒ `Boolean`

Returns:

(Boolean)



18
19
20

# File 'lib/graphql/language/lexer.rb', line 18

# Whether the underlying scanner has reached the end of the input.
# Delegates to the scanner's own eos? check.
#
# @return [Boolean]
def eos?
  @scanner.eos?
end

#line_number ⇒ `Object`



154
155
156

# File 'lib/graphql/language/lexer.rb', line 154

# Line of the current token (the first line is 1).
#
# @pos is a byte offset (it is assigned from StringScanner#pos), while
# String#[] counts characters, so the offset is converted to a character index
# first; otherwise multi-byte characters before the token make the slice run
# past the token and newlines after it get counted.
# When @pos is nil (no token read yet) the whole string is examined.
#
# @return [Integer]
def line_number
  source = @scanner.string
  last_char_index = @pos && source.byteslice(0, @pos).length
  source[0..last_char_index].count("\n") + 1
end

#raise_parse_error(message, line = line_number, col = column_number) ⇒ `Object`

Raises:

(GraphQL::ParseError)



162
163
164

# File 'lib/graphql/language/lexer.rb', line 162

# Raise a GraphQL::ParseError describing a problem in the text being lexed.
# The error is built with the message, position, the full source string and
# the filename given to the constructor.
#
# @param message [String]
# @param line [Integer, nil] defaults to the current token's line
# @param col [Integer, nil] defaults to the current token's column
# @raise [GraphQL::ParseError] always
def raise_parse_error(message, line = line_number, col = column_number)
  parse_error = GraphQL::ParseError.new(message, line, col, @string, filename: @filename)
  raise parse_error
end

#string_value ⇒ `Object`

# File 'lib/graphql/language/lexer.rb', line 131

# Decoded contents of the string token that was just lexed.
#
# Block strings (delimited by three quotes) have their delimiters removed and
# are passed through Language::BlockString.trim_whitespace. Ordinary strings
# have their quotes removed, are validated against VALID_STRING, and then have
# their escape sequences replaced in place.
#
# @return [String]
# @raise [GraphQL::ParseError] (via raise_parse_error) when the text is not
#   valid in its encoding, contains an unrecognised escape, or decoding the
#   escapes produces an invalid string
def string_value
  text = token_value

  if text.start_with?('"""')
    text.gsub!(/\A"""|"""\z/, '')
    return Language::BlockString.trim_whitespace(text)
  end

  text.gsub!(/\A"|"\z/, '')

  # Check the encoding first: matching a regexp against an invalid string is skipped
  unless text.valid_encoding? && text.match?(VALID_STRING)
    raise_parse_error("Bad unicode escape in #{text.inspect}")
  end

  Lexer.replace_escaped_characters_in_place(text)

  # Decoding \u escapes can itself yield an invalid string, so check again
  raise_parse_error("Bad unicode escape in #{text.inspect}") unless text.valid_encoding?

  text
end

#token_value ⇒ `Object`

# File 'lib/graphql/language/lexer.rb', line 96

# Raw text of the scanner's most recent match, cut out of @string by byte
# position (scanner position minus the match size, for the match size).
#
# @return [String]
# @raise [GraphQL::Error] wrapping any StandardError raised while slicing,
#   for example when the scanner has no current match
def token_value
  match_bytes = @scanner.matched_size
  match_start = @scanner.pos - match_bytes
  @string.byteslice(match_start, match_bytes)
rescue StandardError => err
  raise GraphQL::Error, "(token_value failed: #{err.class}: #{err.message})"
end

Class: GraphQL::Language::Lexer

Defined Under Namespace

Constant Summary collapse

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(graphql_str, filename: nil, max_tokens: nil) ⇒ Lexer

Instance Attribute Details

#pos ⇒ Object (readonly)

Class Method Details

.replace_escaped_characters_in_place(raw_string) ⇒ Object

.tokenize(string) ⇒ Object

Instance Method Details

#_hash(key) ⇒ Object

#advance ⇒ Object

#column_number ⇒ Object

#debug_token_value(token_name) ⇒ Object

#eos? ⇒ Boolean

#line_number ⇒ Object

#raise_parse_error(message, line = line_number, col = column_number) ⇒ Object

#string_value ⇒ Object

#token_value ⇒ Object