Class: Moxml::XPath::Lexer

Inherits:
Object
Defined in:
lib/moxml/xpath/lexer.rb

Overview

XPath expression lexer/tokenizer

Converts XPath expressions into a stream of tokens for parsing. Each token is represented as [type, value, position].

Examples:

lexer = Lexer.new("//book[@id='123']")
tokens = lexer.tokenize
# => [[:dslash, "//", 0], [:name, "book", 2], ...]

Constant Summary collapse

AXIS_NAMES =

XPath axis names for recognition

%w[
  ancestor ancestor-or-self attribute child descendant
  descendant-or-self following following-sibling namespace
  parent preceding preceding-sibling self
].freeze
NODE_TYPES =

XPath node type names

%w[
  comment text processing-instruction node
].freeze
KEYWORDS =

Reserved keywords

%w[and or mod div].freeze

Instance Method Summary collapse

Constructor Details

#initialize(expression) ⇒ Lexer

Initialize lexer with XPath expression

Parameters:

  • expression (String)

    XPath expression to tokenize



33
34
35
36
37
38
# File 'lib/moxml/xpath/lexer.rb', line 33

# Initialize the lexer with an XPath expression.
#
# Non-string inputs are coerced with #to_s before scanning.
#
# @param expression [String] XPath expression to tokenize
def initialize(expression)
  @expression = expression.to_s
  @length = @expression.length
  @tokens = []
  @position = 0
end

Instance Method Details

#tokenize ⇒ Array<Array>

Tokenize the XPath expression

Returns:

  • (Array<Array>)

    Array of [type, value, position] tuples

Raises:

  • a syntax error (via #raise_syntax_error — exact error class defined elsewhere in lib/moxml/xpath) when the expression contains a character that cannot start a token, or a bare "!" not followed by "="

44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# File 'lib/moxml/xpath/lexer.rb', line 44

# Tokenize the XPath expression.
#
# Scans the expression left to right, emitting one
# [type, value, position] triple per token. Two-character operators
# ("//", "!=", "<=", ">=", "::", "..") are recognized with a single
# character of lookahead before their one-character counterparts, and
# a "." followed by a digit is treated as the start of a number
# (e.g. ".5") rather than an abbreviated step.
#
# @return [Array<Array>] array of [type, value, position] tuples
# @raise via #raise_syntax_error on a bare "!" or any character that
#   cannot start a token
def tokenize
  @tokens = []
  @position = 0

  # Characters that always map to exactly one token type, with no
  # lookahead required.
  simple = {
    "|" => :pipe, "+" => :plus, "-" => :minus, "*" => :star,
    "=" => :eq, "(" => :lparen, ")" => :rparen,
    "[" => :lbracket, "]" => :rbracket, "," => :comma,
    "@" => :at, "$" => :dollar,
  }

  until @position >= @length
    skip_whitespace
    break if @position >= @length

    start = @position
    ch = current_char

    if (type = simple[ch])
      add_token(type, ch, start)
      advance
    elsif ch == "/"
      # "//" (descendant-or-self shorthand) vs a plain step separator.
      if peek_char == "/"
        add_token(:dslash, "//", start)
        advance(2)
      else
        add_token(:slash, "/", start)
        advance
      end
    elsif ch == "!"
      # "!" is only legal as the first half of "!=".
      raise_syntax_error("Unexpected '!' at position #{@position}") unless peek_char == "="
      add_token(:neq, "!=", start)
      advance(2)
    elsif ch == "<"
      if peek_char == "="
        add_token(:lte, "<=", start)
        advance(2)
      else
        add_token(:lt, "<", start)
        advance
      end
    elsif ch == ">"
      if peek_char == "="
        add_token(:gte, ">=", start)
        advance(2)
      else
        add_token(:gt, ">", start)
        advance
      end
    elsif ch == ":"
      # "::" separates an axis from a node test; ":" separates a
      # namespace prefix from a local name.
      if peek_char == ":"
        add_token(:dcolon, "::", start)
        advance(2)
      else
        add_token(:colon, ":", start)
        advance
      end
    elsif ch == "."
      if peek_char == "."
        add_token(:ddot, "..", start)
        advance(2)
      elsif /\d/.match?(peek_char)
        # Leading-dot number literal such as ".5".
        scan_number(start)
      else
        add_token(:dot, ".", start)
        advance
      end
    elsif ch == '"' || ch == "'"
      scan_string(start)
    elsif /\d/.match?(ch)
      scan_number(start)
    elsif /[a-zA-Z_]/.match?(ch)
      scan_name_or_keyword(start)
    else
      raise_syntax_error(
        "Unexpected character '#{current_char}' at position #{@position}",
      )
    end
  end

  @tokens
end