Class: Moxml::XPath::Lexer

Inherits:
Object
Defined in:
lib/moxml/xpath/lexer.rb

Overview

XPath expression lexer/tokenizer

Converts XPath expressions into a stream of tokens for parsing. Each token is represented as [type, value, position].

Examples:

lexer = Lexer.new("//book[@id='123']")
tokens = lexer.tokenize
# => [[:dslash, "//", 0], [:name, "book", 2], ...]

Constant Summary collapse

AXIS_NAMES =

XPath axis names for recognition

%w[
  ancestor ancestor-or-self attribute child descendant
  descendant-or-self following following-sibling namespace
  parent preceding preceding-sibling self
].freeze
NODE_TYPES =

XPath node type names

%w[
  comment text processing-instruction node
].freeze
KEYWORDS =

Reserved keywords

%w[and or mod div].freeze

Instance Method Summary collapse

Constructor Details

#initialize(expression) ⇒ Lexer

Initialize lexer with XPath expression

Parameters:

  • expression (String)

    XPath expression to tokenize



33
34
35
36
37
38
# File 'lib/moxml/xpath/lexer.rb', line 33

# Initialize the lexer with an XPath expression.
#
# Non-string inputs are coerced with #to_s before scanning.
#
# @param expression [String] XPath expression to tokenize
def initialize(expression)
  @expression = expression.to_s
  @length = @expression.length
  @tokens = []
  @position = 0
end

Instance Method Details

#tokenize ⇒ Array<Array>

Tokenize the XPath expression

Returns:

  • (Array<Array>)

    Array of [type, value, position] tuples

Raises:

  • a syntax error (via #raise_syntax_error — exact error class defined elsewhere in lib/moxml/xpath) when the expression contains a character that cannot start a token, or a bare "!" not followed by "="

44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# File 'lib/moxml/xpath/lexer.rb', line 44

# Tokenize the XPath expression.
#
# Scans the expression left to right, emitting one
# [type, value, position] triple per token. Two-character operators
# ("//", "!=", "<=", ">=", "::", "..") are recognized with a single
# character of lookahead before their one-character counterparts, and
# a "." followed by a digit is treated as the start of a number
# (e.g. ".5") rather than an abbreviated step.
#
# @return [Array<Array>] array of [type, value, position] tuples
# @raise via #raise_syntax_error on a bare "!" or any character that
#   cannot start a token
def tokenize
  @tokens = []
  @position = 0

  # Characters that always map to exactly one token type, with no
  # lookahead required.
  simple = {
    "|" => :pipe, "+" => :plus, "-" => :minus, "*" => :star,
    "=" => :eq, "(" => :lparen, ")" => :rparen,
    "[" => :lbracket, "]" => :rbracket, "," => :comma,
    "@" => :at, "$" => :dollar,
  }

  until @position >= @length
    skip_whitespace
    break if @position >= @length

    start = @position
    ch = current_char

    if (type = simple[ch])
      add_token(type, ch, start)
      advance
    elsif ch == "/"
      # "//" (descendant-or-self shorthand) vs a plain step separator.
      if peek_char == "/"
        add_token(:dslash, "//", start)
        advance(2)
      else
        add_token(:slash, "/", start)
        advance
      end
    elsif ch == "!"
      # "!" is only legal as the first half of "!=".
      raise_syntax_error("Unexpected '!' at position #{@position}") unless peek_char == "="
      add_token(:neq, "!=", start)
      advance(2)
    elsif ch == "<"
      if peek_char == "="
        add_token(:lte, "<=", start)
        advance(2)
      else
        add_token(:lt, "<", start)
        advance
      end
    elsif ch == ">"
      if peek_char == "="
        add_token(:gte, ">=", start)
        advance(2)
      else
        add_token(:gt, ">", start)
        advance
      end
    elsif ch == ":"
      # "::" separates an axis from a node test; ":" separates a
      # namespace prefix from a local name.
      if peek_char == ":"
        add_token(:dcolon, "::", start)
        advance(2)
      else
        add_token(:colon, ":", start)
        advance
      end
    elsif ch == "."
      if peek_char == "."
        add_token(:ddot, "..", start)
        advance(2)
      elsif /\d/.match?(peek_char)
        # Leading-dot number literal such as ".5".
        scan_number(start)
      else
        add_token(:dot, ".", start)
        advance
      end
    elsif ch == '"' || ch == "'"
      scan_string(start)
    elsif /\d/.match?(ch)
      scan_number(start)
    elsif /[a-zA-Z_]/.match?(ch)
      scan_name_or_keyword(start)
    else
      raise_syntax_error(
        "Unexpected character '#{current_char}' at position #{@position}",
      )
    end
  end

  @tokens
end