Class: AppQuery::Tokenizer

Inherits: Object
Defined in:
lib/app_query/tokenizer.rb

Defined Under Namespace

Classes: LexError

Instance Attribute Summary

Class Method Summary

Instance Method Summary

Constructor Details

#initialize(input, state: nil, start: nil, pos: nil) ⇒ Tokenizer

Returns a new instance of Tokenizer.



# File 'lib/app_query/tokenizer.rb', line 13

def initialize(input, state: nil, start: nil, pos: nil)
  @input = input
  @tokens = []
  @start = start || 0
  @pos = pos || @start
  @return = Array(state || :lex_sql)
end
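
A minimal construction sketch (the SQL string is illustrative); by default the tokenizer starts in the :lex_sql state:

  t = AppQuery::Tokenizer.new("select 1")
  t.tokens #=> []
  t.pos    #=> 0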

Instance Attribute Details

#input ⇒ Object (readonly)

Returns the value of attribute input.



# File 'lib/app_query/tokenizer.rb', line 7

def input
  @input
end

#pos ⇒ Object (readonly)

Returns the value of attribute pos.



# File 'lib/app_query/tokenizer.rb', line 7

def pos
  @pos
end

#start ⇒ Object (readonly)

Returns the value of attribute start.



# File 'lib/app_query/tokenizer.rb', line 7

def start
  @start
end

#tokens ⇒ Object (readonly)

Returns the value of attribute tokens.



# File 'lib/app_query/tokenizer.rb', line 7

def tokens
  @tokens
end

Class Method Details

.tokenize(...) ⇒ Object



# File 'lib/app_query/tokenizer.rb', line 9

def self.tokenize(...)
  new(...).run
end
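
A usage sketch; the query and the resulting token are illustrative, derived from the source above:

  AppQuery::Tokenizer.tokenize("select 1")
  #=> [{v: "select 1", t: "SELECT", start: 0, end: 8}]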

Instance Method Details

#chars_read ⇒ Object



# File 'lib/app_query/tokenizer.rb', line 36

def chars_read
  input[start...pos]
end
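
An illustrative sketch of the characters read since the last emitted token:

  t = AppQuery::Tokenizer.new("select 1")
  t.read_char(6)
  t.chars_read #=> "select"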

#emit_token(t, v: nil) ⇒ Object



# File 'lib/app_query/tokenizer.rb', line 53

def emit_token(t, v: nil)
  @tokens << {v: v || chars_read, t: t, start: start, end: pos}
  @start = @pos
  self
end
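
Emitting a token consumes the characters read so far and moves start up to pos (illustrative):

  t = AppQuery::Tokenizer.new("with ")
  t.read_char(4)
  t.emit_token("WITH")
  t.tokens.last #=> {v: "with", t: "WITH", start: 0, end: 4}
  t.start       #=> 4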

#eos? ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/app_query/tokenizer.rb', line 32

def eos?
  pos == input.size
end

#err(msg) ⇒ Object

Raises:

  • (LexError)


# File 'lib/app_query/tokenizer.rb', line 21

def err(msg)
  linepos = linepos_by_pos[pos] || linepos_by_pos[pos.pred]

  msg += <<~ERR

    #{input}
    #{" " * linepos}^
  ERR
  raise LexError, msg
end
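
A sketch of the failure mode; the exact message layout depends on linepos_by_pos, which is not shown in this listing:

  t = AppQuery::Tokenizer.new("select 1")
  begin
    t.err("Unexpected token")
  rescue AppQuery::Tokenizer::LexError => e
    e.message # includes the input with a caret under the current position
  end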

#last_emitted(ignore:) ⇒ Object



# File 'lib/app_query/tokenizer.rb', line 122

def last_emitted(ignore:)
  if ignore.none?
    @tokens.last
  else
    t = @tokens.dup
    while (result = t.pop)
      break if !ignore.include?(result[:t])
    end
    result
  end
end
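
Illustrative:

  t = AppQuery::Tokenizer.new("with ")
  t.read_char(4).emit_token("WITH")
  t.read_char.emit_token("WHITESPACE")
  t.last_emitted(ignore: [])             #=> {v: " ", t: "WHITESPACE", start: 4, end: 5}
  t.last_emitted(ignore: %w[WHITESPACE]) #=> {v: "with", t: "WITH", start: 0, end: 4}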

#last_emitted?(ignore_whitespace: true, ignore: [], **kws) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/app_query/tokenizer.rb', line 134

def last_emitted?(ignore_whitespace: true, ignore: [], **kws)
  ignore = if ignore.any?
    ignore
  elsif ignore_whitespace
    %w[COMMENT WHITESPACE]
  else
    []
  end
  last_emitted(ignore:)&.slice(*kws.keys) == kws
end
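
Illustrative:

  t = AppQuery::Tokenizer.new("with ")
  t.read_char(4).emit_token("WITH")
  t.read_char.emit_token("WHITESPACE")
  t.last_emitted?(t: "WITH")                                 #=> true  (COMMENT/WHITESPACE ignored by default)
  t.last_emitted?(t: "WHITESPACE", ignore_whitespace: false) #=> true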

#lex_append_cte ⇒ Object



# File 'lib/app_query/tokenizer.rb', line 104

def lex_append_cte
  emit_token "COMMA", v: ","
  emit_token "WHITESPACE", v: "\n  "
  push_return :lex_recursive_cte
end

#lex_comment ⇒ Object



# File 'lib/app_query/tokenizer.rb', line 304

def lex_comment
  err "Expected comment, i.e. '--' or '/*'" unless match_comment?

  if match?("--")
    read_until(/\n/)
  else
    read_until %r{\*/}
    err "Expected comment close '*/'." if eos?
    read_char 2
  end

  emit_token "COMMENT"
  push_return :lex_whitespace
end
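
Both '--' and '/* ... */' comments end up as COMMENT tokens; an illustrative run:

  AppQuery::Tokenizer.tokenize("-- my query\nselect 1").map { |token| token[:t] }
  #=> ["COMMENT", "WHITESPACE", "SELECT"]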

#lex_cte ⇒ Object



# File 'lib/app_query/tokenizer.rb', line 145

def lex_cte
  if match_comment?
    push_return :lex_cte, :lex_comment
  elsif last_emitted? t: "CTE_IDENTIFIER", ignore_whitespace: true
    if match?(/AS(\s|\()/i)
      read_char 2
      emit_token "AS"

      push_return :lex_cte, :lex_cte_select, :lex_maybe_materialized, :lex_whitespace
    elsif match?(%r{\(})
      # "foo " "(id)"
      push_return :lex_cte, :lex_cte_columns
    else
      err "Expected 'AS' or CTE columns following CTE-identifier, e.g. 'foo AS' 'foo()'"
    end
  elsif last_emitted? t: "CTE_COLUMNS_CLOSE", ignore_whitespace: true
    if match?(/AS(\s|\()/i)
      read_char 2
      emit_token "AS"

      push_return :lex_cte, :lex_cte_select, :lex_maybe_materialized, :lex_whitespace
    else
      err "Expected 'AS' following CTE-columns"
    end
  elsif last_emitted? t: "CTE_SELECT", ignore_whitespace: true
    if match?(/,/)
      # but wait, there's more!
      read_char
      emit_token "CTE_COMMA"
      push_return :lex_cte, :lex_whitespace
    end
  else
    push_return :lex_cte, :lex_cte_identifier
  end
end
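
An illustrative run over a single-CTE query, showing the token types this state (together with the states it dispatches to) produces:

  AppQuery::Tokenizer.tokenize("with foo as (select 1) select * from foo").map { |token| token[:t] }
  #=> ["WITH", "CTE_IDENTIFIER", "WHITESPACE", "AS", "WHITESPACE", "CTE_SELECT", "WHITESPACE", "SELECT"]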

#lex_cte_columns ⇒ Object



# File 'lib/app_query/tokenizer.rb', line 203

def lex_cte_columns
  err "Expected CTE columns, e.g. '(id, other)'" unless match? %r{\(}

  read_char
  read_until(/\S/)
  emit_token "CTE_COLUMNS_OPEN"

  loop do
    if match?(/\)/)
      err "Expected a column name" unless last_emitted? t: "CTE_COLUMN"

      read_char
      emit_token "CTE_COLUMNS_CLOSE"
      break
    elsif match?(/,/)
      # "( " ","
      err "Expected a column name" unless last_emitted? t: "CTE_COLUMN"
      read_char # ','

      read_until(/\S/)
      emit_token "CTE_COLUMN_DIV"
    elsif match?(/"/)
      unless last_emitted? t: "CTE_COLUMNS_OPEN"
        err "Expected comma" unless last_emitted? t: "CTE_COLUMN_DIV"
      end

      read_char
      read_until(/"/)
      read_char

      emit_token "CTE_COLUMN"
    elsif match?(/[_A-Za-z]/)
      unless last_emitted? t: "CTE_COLUMNS_OPEN"
        err "Expected comma" unless last_emitted? t: "CTE_COLUMN_DIV"
      end

      read_until %r{,|\s|\)}

      emit_token "CTE_COLUMN"
    elsif match?(/\s/)
      read_until(/\S/)
    else
      # e.g. "(id," "1)" or eos?
      err "Expected valid column name"
    end
  end

  push_return :lex_whitespace
end
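
Illustrative token types for a CTE with a column list:

  AppQuery::Tokenizer.tokenize("with foo(id, n) as (select 1, 2) select * from foo").map { |token| token[:t] }
  #=> ["WITH", "CTE_IDENTIFIER", "CTE_COLUMNS_OPEN", "CTE_COLUMN", "CTE_COLUMN_DIV", "CTE_COLUMN",
  #    "CTE_COLUMNS_CLOSE", "WHITESPACE", "AS", "WHITESPACE", "CTE_SELECT", "WHITESPACE", "SELECT"]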

#lex_cte_identifier ⇒ Object



# File 'lib/app_query/tokenizer.rb', line 278

def lex_cte_identifier
  err "Expected CTE identifier, e.g. 'foo', '\"foo bar\"' " unless match? %r{[_"A-Za-z]}

  if match?(/"/)
    read_char
    read_until(/"/)
    read_char
  else
    read_until %r{\s|\(}
  end
  emit_token "CTE_IDENTIFIER"

  push_return :lex_whitespace
end
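
Quoted identifiers are kept verbatim, quotes included (illustrative):

  tokens = AppQuery::Tokenizer.tokenize(%{with "foo bar" as (select 1) select * from "foo bar"})
  tokens.find { |token| token[:t] == "CTE_IDENTIFIER" }[:v]
  #=> "\"foo bar\""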

#lex_cte_select ⇒ Object



# File 'lib/app_query/tokenizer.rb', line 253

def lex_cte_select
  err "Expected CTE select, e.g. '(select 1)'" unless match? %r{\(}
  read_char

  level = 1
  loop do
    read_until(/\)|\(/)
    if eos?
      err "CTE select ended prematurely"
    elsif match?(/\(/)
      level += 1
    elsif match?(/\)/)
      level -= 1
      break if level.zero?
    end
    read_char
  end

  err "Expected non-empty CTE select, e.g. '(select 1)'" if chars_read.strip == "("
  read_char
  emit_token "CTE_SELECT"

  push_return :lex_whitespace
end
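
The parenthesis counting keeps nested parentheses inside the CTE body (illustrative):

  tokens = AppQuery::Tokenizer.tokenize("with foo as (select (1 + 2)) select * from foo")
  tokens.find { |token| token[:t] == "CTE_SELECT" }[:v]
  #=> "(select (1 + 2))"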

#lex_maybe_materialized ⇒ Object



# File 'lib/app_query/tokenizer.rb', line 181

def lex_maybe_materialized
  if match?(/materialized/i)
    read_until(/\(/)
    emit_token "MATERIALIZED"
  elsif match?(%r{\(})
    # done
  elsif match?(/not\s/i)
    read_char 3
    read_until(/\S/)
    emit_token "NOT_MATERIALIZED"
    err "Expected 'MATERIALIZED'" unless match?(/materialized/i)

    push_return :lex_maybe_materialized
  else
    err "Expected CTE select or NOT? MATERIALIZED"
  end
end
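
Illustrative:

  AppQuery::Tokenizer.tokenize("with foo as materialized (select 1) select * from foo").map { |token| token[:t] }
  #=> ["WITH", "CTE_IDENTIFIER", "WHITESPACE", "AS", "WHITESPACE", "MATERIALIZED", "CTE_SELECT", "WHITESPACE", "SELECT"]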

#lex_prepend_cte ⇒ Object



# File 'lib/app_query/tokenizer.rb', line 94

def lex_prepend_cte
  if eos?
    emit_token "COMMA", v: ","
    emit_token "WHITESPACE", v: "\n"
  else
    # emit_token "WHITESPACE", v: " "
    push_return :lex_prepend_cte, :lex_recursive_cte
  end
end

#lex_recursive_cte ⇒ Object



# File 'lib/app_query/tokenizer.rb', line 110

def lex_recursive_cte
  if match?(/recursive\s/i)
    read_until(/\s/)
    # make trailing whitespace part of next token
    # this makes adding cte's easier
    read_until(/\S/)
    emit_token "RECURSIVE"
  end

  push_return :lex_cte
end
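
Illustrative:

  AppQuery::Tokenizer.tokenize("with recursive t as (select 1) select * from t").map { |token| token[:t] }
  #=> ["WITH", "RECURSIVE", "CTE_IDENTIFIER", "WHITESPACE", "AS", "WHITESPACE", "CTE_SELECT", "WHITESPACE", "SELECT"]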

#lex_select ⇒ Object

There should always be a SELECT.



# File 'lib/app_query/tokenizer.rb', line 294

def lex_select
  read_until(/\Z/)
  read_char

  if last_emitted? t: "COMMENT", ignore_whitespace: false
    emit_token "WHITESPACE", v: "\n"
  end
  emit_token "SELECT"
end

#lex_sql ⇒ Object



# File 'lib/app_query/tokenizer.rb', line 71

def lex_sql
  if last_emitted? t: "CTE_SELECT", ignore: %w[WHITESPACE COMMENT]
    push_return :lex_select
  elsif match?(/\s/)
    push_return :lex_sql, :lex_whitespace
  elsif match_comment?
    push_return :lex_sql, :lex_comment
  elsif match?(/with/i)
    push_return :lex_sql, :lex_with
  else
    push_return :lex_select
  end
end

#lex_whitespace ⇒ Object

Optional; emits a WHITESPACE token only when whitespace is present.



# File 'lib/app_query/tokenizer.rb', line 320

def lex_whitespace
  if match?(/\s/)
    read_until(/\S/)

    emit_token "WHITESPACE"
  end
end

#lex_with ⇒ Object



# File 'lib/app_query/tokenizer.rb', line 85

def lex_with
  err "Expected 'WITH'" unless match? %r{WITH\s}i
  read_until(/\s/)
  read_until(/\S/)
  emit_token "WITH"

  push_return :lex_recursive_cte
end

#match?(re) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/app_query/tokenizer.rb', line 49

def match?(re)
  rest[Regexp.new("\\A%s" % re)]
end
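
The pattern is anchored to the current position; the matched string (or nil) is returned (illustrative):

  t = AppQuery::Tokenizer.new("select 1")
  t.match?(/select/i) #=> "select"
  t.match?(/with/i)   #=> nil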

#match_comment? ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/app_query/tokenizer.rb', line 199

def match_comment?
  match?(%r{--|/\*})
end

#push_return(*steps) ⇒ Object



# File 'lib/app_query/tokenizer.rb', line 59

def push_return(*steps)
  (@return ||= []).push(*steps)
  self
end
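
States are popped LIFO by #step, so the last state pushed runs first (illustrative):

  t = AppQuery::Tokenizer.new("with foo as (select 1) select 1")
  t.push_return(:lex_cte, :lex_whitespace)
  t.step # runs :lex_whitespace; a later #step runs :lex_cte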

#read_char(n = 1) ⇒ Object



# File 'lib/app_query/tokenizer.rb', line 40

def read_char(n = 1)
  @pos = [pos + n, input.size].min
  self
end

#read_until(pattern) ⇒ Object



# File 'lib/app_query/tokenizer.rb', line 64

def read_until(pattern)
  loop do
    break if match?(pattern) || eos?
    read_char
  end
end
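
Illustrative:

  t = AppQuery::Tokenizer.new("foo  bar")
  t.read_until(/\s/) # advances pos up to (not past) the first whitespace
  t.chars_read       #=> "foo"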

#rest ⇒ Object



# File 'lib/app_query/tokenizer.rb', line 45

def rest
  input[pos...]
end

#run(pos: nil) ⇒ Object



# File 'lib/app_query/tokenizer.rb', line 328

def run(pos: nil)
  loop do
    break if step.nil?
  end
  eos? ? tokens : self
end
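
Running a tokenizer that was started in a specific state (illustrative):

  AppQuery::Tokenizer.new("  \n", state: :lex_whitespace).run
  #=> [{v: "  \n", t: "WHITESPACE", start: 0, end: 3}]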

#step ⇒ Object



# File 'lib/app_query/tokenizer.rb', line 335

def step
  if (state = @return.pop)
    method(state).call
    self
  end
end
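
Each call pops and runs one state; nil signals that the state stack is empty (illustrative):

  t = AppQuery::Tokenizer.new("select 1")
  t.step # runs :lex_sql, which queues :lex_select
  t.step # runs :lex_select and emits the SELECT token
  t.step #=> nil, nothing left to do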