Class: Tokn::Tokenizer

Inherits: Object

Defined in: lib/tokn/tokenizer.rb
Overview
Extracts tokens from a script, given a previously constructed DFA.
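The typical flow is to compile token definitions into a DFA once, then hand that DFA and the input text to a Tokenizer. A minimal sketch follows; the token-script syntax and the Tokn::DFA.from_script call are assumptions for illustration (only the Tokenizer API is documented on this page), and the later examples reuse this hypothetical dfa and its WS/NUM/ID definitions.

require 'tokn'

# Hypothetical token definitions; the script syntax and the
# DFA.from_script constructor are assumed, not documented here.
script = <<~'EOS'
  WS: \s+
  NUM: \d+
  ID: [a-zA-Z]+
EOS

dfa = Tokn::DFA.from_script(script)             # assumed DFA construction API
tok = Tokn::Tokenizer.new(dfa, "x 42 y", "WS")  # skip whitespace tokens

while tok.hasNext
  t = tok.read
  puts "#{tok.nameOf(t)} => #{t.text.inspect}"
end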
Instance Method Summary
- #hasNext ⇒ Object
  Determine if another token exists.
- #initialize(dfa, text, skipName = nil) ⇒ Tokenizer (constructor)
  Construct a tokenizer.
- #nameOf(token) ⇒ Object
  Get the name of a token (i.e., the name of the token definition, not its text).
- #peek ⇒ Object
  Determine the next token (without reading it).
- #read(tokenName = nil) ⇒ Object
  Read the next token.
- #readIf(tokenName) ⇒ Object
  Read the next token if it has a particular name.
- #readSequence(seq) ⇒ Object
  Read a sequence of tokens.
- #readSequenceIf(seq) ⇒ Object
  Read a sequence of tokens, if they have particular names.
- #unread(count = 1) ⇒ Object
  Unread one (or more) previously read tokens.
Constructor Details
#initialize(dfa, text, skipName = nil) ⇒ Tokenizer
Construct a tokenizer.
Parameters: dfa (the DFA to drive tokenization), text (the text to tokenize), and skipName (optional name of a token to skip, e.g. whitespace).
Raises ArgumentError if text is nil, or if skipName does not match any token definition.
# File 'lib/tokn/tokenizer.rb', line 16

def initialize(dfa, text, skipName = nil)
  @dfa = dfa
  @text = text
  if !text
    raise ArgumentError, "No text defined"
  end
  @skipTokenId = nil
  if skipName
    @skipTokenId = dfa.tokenId(skipName)
    if !@skipTokenId
      raise ArgumentError, "No token with name "+skipName+" found"
    end
  end
  @lineNumber = 0
  @column = 0
  @cursor = 0
  # History of tokens already scanned, supporting peek() and unread()
  @tokenHistory = []
  @historyPointer = 0
end
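A sketch of the argument validation visible above, reusing the hypothetical dfa and WS definition from the overview:

Tokn::Tokenizer.new(dfa, nil)                  # raises ArgumentError: No text defined
Tokn::Tokenizer.new(dfa, "x", "NO_SUCH")       # raises ArgumentError: unknown skip-token name
tok = Tokn::Tokenizer.new(dfa, "a 1 b", "WS")  # WS tokens will be silently skipped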
Instance Method Details
#hasNext ⇒ Object
Determine if another token exists.
Returns true if a token remains to be read, false otherwise.
# File 'lib/tokn/tokenizer.rb', line 219

def hasNext
  !peek().nil?
end
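Since hasNext simply tests whether peek finds anything, it reports false once only skipped tokens (or nothing) remain. Continuing the overview sketch:

tok = Tokn::Tokenizer.new(dfa, "   ", "WS")  # input holds only skipped whitespace
tok.hasNext   # => false: peek skips the WS token and finds no other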
#nameOf(token) ⇒ Object
Get the name of a token (i.e., the name of the token definition, not its text).
Parameters: token (a token read from this tokenizer).
# File 'lib/tokn/tokenizer.rb', line 228

def nameOf(token)
  @dfa.tokenName(token.id)
end
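The name is recovered from the token's numeric id via the DFA. Using the hypothetical definitions from the overview:

tok = Tokn::Tokenizer.new(dfa, "hello", "WS")
t = tok.peek
tok.nameOf(t)   # => "ID": the definition's name, not the matched text "hello"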
#peek ⇒ Object
Determine the next token (without reading it).
Returns the next Token, or nil if at end of input.
# File 'lib/tokn/tokenizer.rb', line 40

def peek
  # if !@text
  #   raise IllegalStateException, "No input text specified"
  # end

  db = false
  !db || warn("debug printing is on")
  !db || pr("peek, cursor=%d\n", @cursor)

  if @historyPointer == @tokenHistory.size
    while true  # repeat until we find a non-skipped token, or run out of text
      break if @cursor >= @text.length

      bestLength = 0
      bestId = ToknInternal::UNKNOWN_TOKEN
      charOffset = 0
      state = @dfa.startState
      while @cursor + charOffset <= @text.length
        ch = nil
        if @cursor + charOffset < @text.length
          ch = @text[@cursor + charOffset].ord()
          !db || pr(" offset=%d, ch=%d (%s)\n", charOffset, ch, ch.chr)
        end

        nextState = nil

        # Examine edges leaving this state.
        # If one is labelled with a token id, we don't need to match the character with it;
        # store as best token found if length is longer than previous, or equal to previous
        # with higher id.
        # If an edge is labelled with the current character, advance to that state.
        edges = state.edges
        edges.each do |lbl, dest|
          a = lbl.array
          !db || pr("  edge lbl=%s, dest=%s\n", d(lbl), d(dest))
          if a[0] < ToknInternal::EPSILON
            newTokenId = ToknInternal::edgeLabelToTokenId(a[0])
            !db || pr("  new token id=%d\n", newTokenId)
            if (bestLength < charOffset || newTokenId > bestId)
              bestLength, bestId = charOffset, newTokenId
              !db || pr("  making longest found so far\n")
            end
          end

          if ch && lbl.contains?(ch)
            !db || pr("  setting next state to %s\n", d(dest))
            nextState = dest
            break
          end
        end

        if !nextState
          break
        end

        state = nextState
        charOffset += 1
        !db || pr(" advanced to next state\n")
      end

      if bestId == @skipTokenId
        @cursor += bestLength
        next
      end

      peekToken = Token.new(bestId, @text[@cursor, bestLength], 1 + @lineNumber, 1 + @column)
      @tokenHistory.push(peekToken)
      break  # We found a token, so stop
    end
  end

  ret = nil
  if @historyPointer < @tokenHistory.size
    ret = @tokenHistory[@historyPointer]
  end
  ret
end
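As the source shows, peek caches the scanned token in an internal history and returns the same object until read advances the history pointer:

a = tok.peek
b = tok.peek
a.equal?(b)   # => true: peeking twice neither consumes nor rescans
tok.read      # read() is what actually consumes the peeked token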
#read(tokenName = nil) ⇒ Object
Read the next token.
Parameters: tokenName (if not nil, the name the token is expected to have).
Raises TokenizerException if no tokens remain, if the token is unknown, or if it has a different name than expected.
# File 'lib/tokn/tokenizer.rb', line 131

def read(tokenName = nil)
  token = peek()
  if !token
    raise TokenizerException, "No more tokens"
  end

  if token.id == ToknInternal::UNKNOWN_TOKEN
    raise TokenizerException, "Unknown token "+token.inspect
  end

  if tokenName && tokenName != nameOf(token)
    raise TokenizerException, "Unexpected token "+token.inspect
  end

  @historyPointer += 1

  # Advance cursor, line number, column
  tl = token.text.length
  @cursor += tl
  tl.times do |i|
    c = token.text[i]
    @column += 1
    if c == "\n"
      @lineNumber += 1
      @column = 0
    end
  end

  token
end
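Passing a name turns read into an assertion. A sketch against the hypothetical definitions from the overview:

tok = Tokn::Tokenizer.new(dfa, "x 42", "WS")
id  = tok.read("ID")    # ok: consumes "x"
num = tok.read("NUM")   # ok: consumes "42" (the intervening WS was skipped)
tok.read                # raises TokenizerException: No more tokens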
#readIf(tokenName) ⇒ Object
Read the next token if it has a particular name.
Parameters: tokenName (the name to look for).
Returns the token read, or nil if the next token has a different name.
# File 'lib/tokn/tokenizer.rb', line 167

def readIf(tokenName)
  ret = nil
  token = peek()
  if token && nameOf(token) == tokenName
    ret = read()
  end
  ret
end
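readIf is the non-raising counterpart of read(tokenName): it consumes the next token only when the name matches, which suits optional grammar elements. Continuing the overview sketch:

tok = Tokn::Tokenizer.new(dfa, "x 42", "WS")
tok.readIf("NUM")   # => nil: next token is an ID, nothing consumed
tok.readIf("ID")    # => the "x" token, now consumed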
#readSequence(seq) ⇒ Object
Read a sequence of tokens.
Parameters: seq (a string of space-delimited token names, where '_' matches a token of any name).
Returns an array of the tokens read.
# File 'lib/tokn/tokenizer.rb', line 181

def readSequence(seq)
  seqNames = seq.split(' ')
  ret = []
  seqNames.each do |name|
    # '_' reads the next token without checking its name
    tk = name != '_' ? read(name) : read
    ret.push(tk)
  end
  ret
end
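Each name is handed to read as an expectation, so a mismatch raises TokenizerException and any tokens consumed before the mismatch stay read. Sketch:

tok = Tokn::Tokenizer.new(dfa, "x 42 y", "WS")
id, num, any = tok.readSequence("ID NUM _")   # '_' accepts the trailing ID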
#readSequenceIf(seq) ⇒ Object
Read a sequence of tokens, but only if they have particular names.
Parameters: seq (a string of space-delimited token names, where '_' matches a token of any name).
Returns an array of the tokens read, or nil if some name failed to match; in that case, all tokens read by this call are unread.
# File 'lib/tokn/tokenizer.rb', line 197

def readSequenceIf(seq)
  ret = []
  seqNames = seq.split(' ')
  seqNames.each do |name|
    tk = peek
    break if !tk
    if name != '_' && nameOf(tk) != name
      break
    end
    ret.push(read)
  end

  # Did not match the full sequence: put back whatever was read
  if ret.size != seqNames.size
    unread(ret.size)
    ret = nil
  end
  ret
end
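Unlike readSequence, this variant is all-or-nothing: on the first mismatch (or end of input) it unreads everything it consumed and returns nil, leaving the tokenizer where it started. Sketch:

tok = Tokn::Tokenizer.new(dfa, "x 42", "WS")
tok.readSequenceIf("NUM NUM")   # => nil; position unchanged
tok.readSequenceIf("ID NUM")    # => [<ID token "x">, <NUM token "42">]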
#unread(count = 1) ⇒ Object
Unread one (or more) previously read tokens.
Raises TokenizerException if attempting to unread more tokens than have been read.
# File 'lib/tokn/tokenizer.rb', line 236

def unread(count = 1)
  if @historyPointer < count
    raise TokenizerException, "Cannot unread before start"
  end
  @historyPointer -= count
end
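Because read only advances a pointer into the token history, unread is cheap backtracking; the replayed tokens are the very same objects. Sketch:

tok = Tokn::Tokenizer.new(dfa, "x 42", "WS")
a = tok.read
b = tok.read
tok.unread(2)          # rewind both
tok.read.equal?(a)     # => true: replayed from the internal history
tok.unread(3)          # raises TokenizerException: Cannot unread before start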