Class: AppQuery::Tokenizer

- Inherits: Object
- Defined in: lib/app_query/tokenizer.rb
Defined Under Namespace
Classes: LexError
Instance Attribute Summary

- #input ⇒ Object (readonly): Returns the value of attribute input.
- #pos ⇒ Object (readonly): Returns the value of attribute pos.
- #start ⇒ Object (readonly): Returns the value of attribute start.
- #tokens ⇒ Object (readonly): Returns the value of attribute tokens.
Class Method Summary

- .tokenize ⇒ Object

Instance Method Summary
- #chars_read ⇒ Object
- #emit_token(t, v: nil) ⇒ Object
- #eos? ⇒ Boolean
- #err(msg) ⇒ Object
- #initialize(input, state: nil, start: nil, pos: nil) ⇒ Tokenizer (constructor): A new instance of Tokenizer.
- #last_emitted(ignore:) ⇒ Object
- #last_emitted?(ignore_whitespace: true, ignore: [], **kws) ⇒ Boolean
- #lex_append_cte ⇒ Object
- #lex_comment ⇒ Object
- #lex_cte ⇒ Object
- #lex_cte_columns ⇒ Object
- #lex_cte_identifier ⇒ Object
- #lex_cte_select ⇒ Object
- #lex_maybe_materialized ⇒ Object
- #lex_prepend_cte ⇒ Object
- #lex_recursive_cte ⇒ Object
- #lex_select ⇒ Object: There should always be a SELECT.
- #lex_sql ⇒ Object
- #lex_whitespace ⇒ Object: Optional.
- #lex_with ⇒ Object
- #match?(re) ⇒ Boolean
- #match_comment? ⇒ Boolean
- #push_return(*steps) ⇒ Object
- #read_char(n = 1) ⇒ Object
- #read_until(pattern) ⇒ Object
- #rest ⇒ Object
- #run(pos: nil) ⇒ Object
- #step ⇒ Object
Constructor Details
#initialize(input, state: nil, start: nil, pos: nil) ⇒ Tokenizer
Returns a new instance of Tokenizer.
# File 'lib/app_query/tokenizer.rb', line 13

def initialize(input, state: nil, start: nil, pos: nil)
  @input = input
  @tokens = []
  @start = start || 0
  @pos = pos || @start
  @return = Array(state || :lex_sql)
end
Instance Attribute Details
#input ⇒ Object (readonly)
Returns the value of attribute input.
# File 'lib/app_query/tokenizer.rb', line 7

def input
  @input
end
#pos ⇒ Object (readonly)
Returns the value of attribute pos.
# File 'lib/app_query/tokenizer.rb', line 7

def pos
  @pos
end
#start ⇒ Object (readonly)
Returns the value of attribute start.
# File 'lib/app_query/tokenizer.rb', line 7

def start
  @start
end
#tokens ⇒ Object (readonly)
Returns the value of attribute tokens.
# File 'lib/app_query/tokenizer.rb', line 7

def tokens
  @tokens
end
Class Method Details
.tokenize ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 9

def self.tokenize(...)
  new(...).run
end
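A minimal usage sketch (the require path and the exact token values are illustrative assumptions, not verified output; the token shape follows #emit_token below):

require "app_query"

tokens = AppQuery::Tokenizer.tokenize("WITH foo AS (select 1)\nselect * from foo")
tokens.first           # => {v: "WITH ", t: "WITH", start: 0, end: 5}
tokens.map { _1[:t] }  # => token-type strings such as "WITH", "CTE_IDENTIFIER", "CTE_SELECT", ...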
Instance Method Details
#chars_read ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 36

def chars_read
  input[start...pos]
end
#emit_token(t, v: nil) ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 53

def emit_token(t, v: nil)
  @tokens << {v: v || chars_read, t: t, start: start, end: pos}
  @start = @pos
  self
end
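A sketch of the resulting token shape (hypothetical input; the hash layout follows directly from the code above):

t = AppQuery::Tokenizer.new("WITH foo AS (select 1)")
t.read_char 4
t.emit_token "WITH"
t.tokens.last  # => {v: "WITH", t: "WITH", start: 0, end: 4}
# @start has been moved up to pos, so the next token begins at 4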
#eos? ⇒ Boolean
# File 'lib/app_query/tokenizer.rb', line 32

def eos?
  pos == input.size
end
#err(msg) ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 21

def err(msg)
  # linepos_by_pos presumably maps an input position to its column on that
  # line (private helper, not documented on this page)
  linepos = linepos_by_pos[pos] || linepos_by_pos[pos.pred]

  msg += <<~ERR

    #{input}
    #{" " * linepos}^
  ERR
  raise LexError, msg
end
#last_emitted(ignore:) ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 122

def last_emitted(ignore:)
  if ignore.none?
    @tokens.last
  else
    t = @tokens.dup
    while (result = t.pop)
      break if !ignore.include?(result[:t])
    end
    result
  end
end
#last_emitted?(ignore_whitespace: true, ignore: [], **kws) ⇒ Boolean
# File 'lib/app_query/tokenizer.rb', line 134

def last_emitted?(ignore_whitespace: true, ignore: [], **kws)
  ignore = if ignore.any?
    ignore
  elsif ignore_whitespace
    %w[COMMENT WHITESPACE]
  else
    []
  end
  last_emitted(ignore:)&.slice(*kws.keys) == kws
end
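A sketch of how the lex_* states use this predicate (hypothetical token stream):

# Suppose the tokens emitted so far end with: ..., CTE_IDENTIFIER, WHITESPACE
t.last_emitted? t: "CTE_IDENTIFIER"
# => true: COMMENT and WHITESPACE tokens are skipped by default
t.last_emitted? t: "CTE_IDENTIFIER", ignore_whitespace: false
# => false: the literal last token is WHITESPACE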
#lex_append_cte ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 104

def lex_append_cte
  emit_token "COMMA", v: ","
  emit_token "WHITESPACE", v: "\n "
  push_return :lex_recursive_cte
end
#lex_comment ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 304

def lex_comment
  err "Expected comment, i.e. '--' or '/*'" unless match_comment?

  if match?("--")
    read_until(/\n/)
  else
    read_until %r{\*/}
    err "Expected comment close '*/'." if eos?
    read_char 2
  end
  emit_token "COMMENT"
  push_return :lex_whitespace
end
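For example (a hypothetical run; note that read_until stops at, and does not consume, the newline):

AppQuery::Tokenizer.tokenize("-- hi\nselect 1").first
# => {v: "-- hi", t: "COMMENT", start: 0, end: 5}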
#lex_cte ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 145

def lex_cte
  if match_comment?
    push_return :lex_cte, :lex_comment
  elsif last_emitted? t: "CTE_IDENTIFIER", ignore_whitespace: true
    if match?(/AS(\s|\()/i)
      read_char 2
      emit_token "AS"
      push_return :lex_cte, :lex_cte_select, :lex_maybe_materialized, :lex_whitespace
    elsif match?(%r{\(}) # "foo " "(id)"
      push_return :lex_cte, :lex_cte_columns
    else
      err "Expected 'AS' or CTE columns following CTE-identifier, e.g. 'foo AS' 'foo()'"
    end
  elsif last_emitted? t: "CTE_COLUMNS_CLOSE", ignore_whitespace: true
    if match?(/AS(\s|\()/i)
      read_char 2
      emit_token "AS"
      push_return :lex_cte, :lex_cte_select, :lex_maybe_materialized, :lex_whitespace
    else
      err "Expected 'AS' following CTE-columns"
    end
  elsif last_emitted? t: "CTE_SELECT", ignore_whitespace: true
    if match?(/,/) # but wait, there's more!
      read_char
      emit_token "CTE_COMMA"
      push_return :lex_cte, :lex_whitespace
    end
  else
    push_return :lex_cte, :lex_cte_identifier
  end
end
#lex_cte_columns ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 203

def lex_cte_columns
  err "Expected CTE columns, e.g. '(id, other)'" unless match? %r{\(}
  read_char
  read_until(/\S/)
  emit_token "CTE_COLUMNS_OPEN"

  loop do
    if match?(/\)/)
      err "Expected a column name" unless last_emitted? t: "CTE_COLUMN"
      read_char
      emit_token "CTE_COLUMNS_CLOSE"
      break
    elsif match?(/,/) # "( " ","
      err "Expected a column name" unless last_emitted? t: "CTE_COLUMN"
      read_char # ','
      read_until(/\S/)
      emit_token "CTE_COLUMN_DIV"
    elsif match?(/"/)
      unless last_emitted? t: "CTE_COLUMNS_OPEN"
        err "Expected comma" unless last_emitted? t: "CTE_COLUMN_DIV"
      end
      read_char
      read_until(/"/)
      read_char
      emit_token "CTE_COLUMN"
    elsif match?(/[_A-Za-z]/)
      unless last_emitted? t: "CTE_COLUMNS_OPEN"
        err "Expected comma" unless last_emitted? t: "CTE_COLUMN_DIV"
      end
      read_until %r{,|\s|\)}
      emit_token "CTE_COLUMN"
    elsif match?(/\s/)
      read_until(/\S/)
    else # e.g. "(id," "1)" or eos?
      err "Expected valid column name"
    end
  end
  push_return :lex_whitespace
end
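As a hypothetical walk-through, a column list (id, "other col") would come out roughly as:

# Input: '(id, "other col")'
#   CTE_COLUMNS_OPEN   "("
#   CTE_COLUMN         "id"
#   CTE_COLUMN_DIV     ", "            (comma plus trailing whitespace)
#   CTE_COLUMN         "\"other col\"" (quoted names keep their quotes)
#   CTE_COLUMNS_CLOSE  ")"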
#lex_cte_identifier ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 278

def lex_cte_identifier
  err "Expected CTE identifier, e.g. 'foo', '\"foo bar\"' " unless match? %r{[_"A-Za-z]}

  if match?(/"/)
    read_char
    read_until(/"/)
    read_char
  else
    read_until %r{\s|\(}
  end
  emit_token "CTE_IDENTIFIER"
  push_return :lex_whitespace
end
#lex_cte_select ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 253

def lex_cte_select
  err "Expected CTE select, e.g. '(select 1)'" unless match? %r{\(}
  read_char
  level = 1

  loop do
    read_until(/\)|\(/)
    if eos?
      err "CTE select ended prematurely"
    elsif match?(/\(/)
      level += 1
    elsif match?(/\)/)
      level -= 1
      break if level.zero?
    end
    read_char
  end
  err "Expected non-empty CTE select, e.g. '(select 1)'" if chars_read.strip == "("
  read_char
  emit_token "CTE_SELECT"
  push_return :lex_whitespace
end
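A note on the nesting logic (illustrative, not verified output): the depth counter lets the select body contain balanced parentheses of its own.

# Hypothetical input: "(select (1))"
#   level: 1 at "(select", 2 at "(1", back to 1 after "1)", 0 at the final ")"
# so the CTE_SELECT token spans the whole "(select (1))" instead of
# stopping at the first ")".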
#lex_maybe_materialized ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 181

def lex_maybe_materialized
  if match?(/materialized/i)
    read_until(/\(/)
    emit_token "MATERIALIZED"
  elsif match?(%r{\(})
    # done
  elsif match?(/not\s/i)
    read_char 3
    read_until(/\S/)
    emit_token "NOT_MATERIALIZED"
    err "Expected 'MATERIALIZED'" unless match?(/materialized/i)
    push_return :lex_maybe_materialized
  else
    err "Expected CTE select or NOT? MATERIALIZED"
  end
end
#lex_prepend_cte ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 94

def lex_prepend_cte
  if eos?
    emit_token "COMMA", v: ","
    emit_token "WHITESPACE", v: "\n"
  else
    # emit_token "WHITESPACE", v: " "
    push_return :lex_prepend_cte, :lex_recursive_cte
  end
end
#lex_recursive_cte ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 110

def lex_recursive_cte
  if match?(/recursive\s/i)
    read_until(/\s/)
    # make trailing whitespace part of next token
    # this makes adding cte's easier
    read_until(/\S/)
    emit_token "RECURSIVE"
  end
  push_return :lex_cte
end
#lex_select ⇒ Object
There should always be a SELECT.

# File 'lib/app_query/tokenizer.rb', line 294

def lex_select
  read_until(/\Z/)
  read_char

  if last_emitted? t: "COMMENT", ignore_whitespace: false
    emit_token "WHITESPACE", v: "\n"
  end
  emit_token "SELECT"
end
#lex_sql ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 71

def lex_sql
  if last_emitted? t: "CTE_SELECT", ignore: %w[WHITESPACE COMMENT]
    push_return :lex_select
  elsif match?(/\s/)
    push_return :lex_sql, :lex_whitespace
  elsif match_comment?
    push_return :lex_sql, :lex_comment
  elsif match?(/with/i)
    push_return :lex_sql, :lex_with
  else
    push_return :lex_select
  end
end
#lex_whitespace ⇒ Object
Optional.

# File 'lib/app_query/tokenizer.rb', line 320

def lex_whitespace
  if match?(/\s/)
    read_until(/\S/)
    emit_token "WHITESPACE"
  end
end
#lex_with ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 85

def lex_with
  err "Expected 'WITH'" unless match? %r{WITH\s}i

  read_until(/\s/)
  read_until(/\S/)
  emit_token "WITH"
  push_return :lex_recursive_cte
end
#match?(re) ⇒ Boolean
# File 'lib/app_query/tokenizer.rb', line 49

def match?(re)
  rest[Regexp.new("\\A%s" % re)]
end
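Note that the pattern is anchored at the current position ("\A" is prepended) and that the return value is the matched string or nil, not a strict boolean. A hypothetical illustration:

t = AppQuery::Tokenizer.new("WITH foo AS (select 1)")
t.match?(/with/i)  # => "WITH" (truthy): matches at pos
t.match?(/foo/)    # => nil: "foo" does not start at pos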
#match_comment? ⇒ Boolean
# File 'lib/app_query/tokenizer.rb', line 199

def match_comment?
  match?(%r{--|/\*})
end
#push_return(*steps) ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 59

def push_return(*steps)
  (@return ||= []).push(*steps)
  self
end
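States are popped LIFO by #step, so the last state pushed runs first. A hypothetical illustration (the initial stack comes from the constructor's state keyword):

t = AppQuery::Tokenizer.new("-- hi\nselect 1", state: :lex_sql)
t.push_return(:lex_sql, :lex_comment)
# the return stack is now [:lex_sql, :lex_sql, :lex_comment];
# the next #step runs :lex_comment, later steps resume :lex_sql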
#read_char(n = 1) ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 40

def read_char(n = 1)
  @pos = [pos + n, input.size].min
  self
end
#read_until(pattern) ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 64

def read_until(pattern)
  loop do
    break if match?(pattern) || eos?
    read_char
  end
end
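The cursor stops at, and does not consume, the first match (or stops at end of input). A hypothetical illustration:

t = AppQuery::Tokenizer.new("foo  bar")
t.read_until(/\s/)
t.chars_read  # => "foo": pos now sits on the first space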
#rest ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 45

def rest
  input[pos...]
end
#run(pos: nil) ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 328

def run(pos: nil)
  loop do
    break if step.nil?
  end
  eos? ? tokens : self
end
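A usage sketch (the exact token shown is illustrative): run drives #step until the state stack is exhausted, returning the tokens when the whole input was consumed and the tokenizer itself otherwise.

t = AppQuery::Tokenizer.new("select 1")
t.run  # => [{v: "select 1", t: "SELECT", start: 0, end: 8}]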
#step ⇒ Object
# File 'lib/app_query/tokenizer.rb', line 335

def step
  if (state = @return.pop)
    method(state).call
    self
  end
end