Class: Tokn::Tokenizer

Inherits:
Object
  • Object
show all
Defined in:
lib/tokn/tokenizer.rb

Overview

Extracts tokens from a script, given a previously constructed DFA.

Instance Method Summary collapse

Constructor Details

#initialize(dfa, text, skipName = nil) ⇒ Tokenizer

Construct a tokenizer

Parameters:

  • dfa

    the DFA to use

  • text

    the text to extract tokens from

  • skipName (defaults to: nil)

    if not nil, tokens with this name will be skipped



16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# File 'lib/tokn/tokenizer.rb', line 16

# Construct a tokenizer.
#
# @param dfa the DFA to use
# @param text [String] the text to extract tokens from
# @param skipName [String, nil] if not nil, tokens with this name will be skipped
# @raise [ArgumentError] if text is nil, or skipName names no known token
def initialize(dfa, text, skipName = nil)
  @dfa = dfa
  @text = text
  raise ArgumentError, "No text defined" unless text

  # Resolve the skip token's id up front so peek can compare ids cheaply
  @skipTokenId = nil
  if skipName
    @skipTokenId = dfa.tokenId(skipName)
    raise ArgumentError, "No token with name #{skipName} found" unless @skipTokenId
  end

  # Position within the text (line/column are 0-based internally)
  @lineNumber = 0
  @column = 0
  @cursor = 0

  # Previously peeked/read tokens, for unread support
  @tokenHistory = []
  @historyPointer = 0
end

Instance Method Details

#hasNext ⇒ Object

Determine if another token exists



219
220
221
# File 'lib/tokn/tokenizer.rb', line 219

# Determine if another token exists.
#
# @return [Boolean] true if peek finds a token, false at end of input
def hasNext
  !peek.nil?
end

#nameOf(token) ⇒ Object

Get the name of a token (i.e., the name of the token definition, not its text)

Parameters:

  • token

    a token read from this tokenizer



228
229
230
# File 'lib/tokn/tokenizer.rb', line 228

# Get the name of a token (i.e., the name of the token definition,
# not its text).
#
# @param token a token read from this tokenizer
# @return the name of the token's definition, per the DFA
def nameOf(token)
  id = token.id
  @dfa.tokenName(id)
end

#peek ⇒ Object

Determine next token (without reading it)

Returns Token, or nil if end of input



40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# File 'lib/tokn/tokenizer.rb', line 40

# Determine the next token without consuming it.
#
# If the history pointer is at the end of the token history, runs the DFA
# from @cursor to find the longest match (a tie between matches of equal
# length is broken in favor of the higher token id), skipping any token
# whose id equals @skipTokenId.  The found token is cached in
# @tokenHistory, so repeated peeks (and unread) do not rescan the text.
#
# @return [Token] the next token, or nil if end of input
def peek
  # if !@text
    # raise IllegalStateException, "No input text specified"
  # end
  
  db = false   # set true to enable verbose debug printing below
  !db || warn("debug printing is on")
  !db || pr("peek, cursor=%d\n",@cursor)
  
  # Only scan for a new token when everything in the history has been consumed
  if @historyPointer == @tokenHistory.size
    while true # repeat until we find a non-skipped token, or run out of text
      break if @cursor >= @text.length
      
      # Best (longest / highest-id) token recognized so far in this scan
      bestLength = 0
      bestId = ToknInternal::UNKNOWN_TOKEN
      
      charOffset = 0
      state = @dfa.startState
      # Note <= : we iterate one step past the last character so that
      # token-id edges leaving the final state are still examined (ch stays nil)
      while @cursor + charOffset <= @text.length
        ch = nil
        if @cursor + charOffset < @text.length
          ch = @text[@cursor + charOffset].ord()
          !db || pr(" offset=%d, ch=%d (%s)\n",charOffset,ch,ch.chr)
        end
  
        nextState = nil
        
        # Examine edges leaving this state.
        # If one is labelled with a token id, we don't need to match the character with it;
        # store as best token found if length is longer than previous, or equal to previous
        # with higher id.
        
        # If an edge is labelled with the current character, advance to that state.
        
        edges = state.edges
        edges.each do |lbl,dest|
          a = lbl.array
          !db || pr("  edge lbl=%s, dest=%s\n",d(lbl),d(dest))
          if a[0] < ToknInternal::EPSILON
            newTokenId = ToknInternal::edgeLabelToTokenId(a[0])
            !db || pr("   new token id=%d\n",newTokenId)
          
            # charOffset only grows during the scan, so "bestLength < charOffset"
            # means strictly longer; equal length falls through to the id tie-break
            if (bestLength < charOffset || newTokenId > bestId)
              bestLength, bestId = charOffset, newTokenId
              !db || pr("     making longest found so far\n")
            end
          end
          
          if ch && lbl.contains?(ch)
            !db || pr("   setting next state to %s\n",d(dest))
            nextState = dest
            break
          end
        end 
        
        # No edge accepted the current character: the scan is complete
        if !nextState
          break
        end
        state = nextState
        charOffset += 1 
        !db || pr(" advanced to next state\n")
      end
    
      # Skip tokens of the skip type without recording them in history
      if bestId == @skipTokenId
        @cursor += bestLength
        next
      end
      
      # NOTE(review): if nothing matched at all, bestLength is 0 and this pushes
      # an UNKNOWN_TOKEN with empty text; read() raises on such a token.
      # Line and column are stored 1-based in the Token.
      peekToken = Token.new(bestId, @text[@cursor, bestLength], 1 + @lineNumber, 1 + @column)
      
      @tokenHistory.push(peekToken)
      break # We found a token, so stop
    end  
  end
  
  # Return the token at the history pointer, if one exists
  ret = nil
  if @historyPointer < @tokenHistory.size
    ret = @tokenHistory[@historyPointer]
  end
  
  ret 
end

#read(tokenName = nil) ⇒ Object

Read next token

Raises an exception if the token has a different name than expected.

Parameters:

  • tokenName (defaults to: nil)

    if not nil, the (string) name of the token expected

Raises:

  • TokenizerException if there are no more tokens, if the token is unrecognized, or if the token has a different name than expected



131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# File 'lib/tokn/tokenizer.rb', line 131

# Read (consume) the next token.
#
# @param tokenName [String, nil] if not nil, the name the token is expected to have
# @return [Token] the token read
# @raise [TokenizerException] if no more tokens, if the token is
#   unrecognized, or if its name differs from tokenName
def read(tokenName = nil)
  token = peek
  raise TokenizerException, "No more tokens" unless token
  raise TokenizerException, "Unknown token #{token.inspect}" if token.id == ToknInternal::UNKNOWN_TOKEN
  raise TokenizerException, "Unexpected token #{token.inspect}" if tokenName && tokenName != nameOf(token)

  @historyPointer += 1

  # Advance the cursor past the token's text, tracking line/column as we go
  @cursor += token.text.length
  token.text.each_char do |ch|
    if ch == "\n"
      @lineNumber += 1
      @column = 0
    else
      @column += 1
    end
  end

  token
end

#readIf(tokenName) ⇒ Object

Read next token if it has a particular name

Parameter: tokenName — name to look for. Returns: the token read, or nil if the next token has a different name.



167
168
169
170
171
172
173
174
# File 'lib/tokn/tokenizer.rb', line 167

# Read the next token only if it has a particular name.
#
# @param tokenName name to look for
# @return [Token] the token read, or nil if the next token has a
#   different name (or end of input was reached)
def readIf(tokenName)
  token = peek
  return nil unless token && nameOf(token) == tokenName
  read
end

#readSequence(seq) ⇒ Object

Read a sequence of tokens

Parameters:

  • seq

    string of space-delimited token names; if name is ‘_’, allows any token name in that position

Returns:

  • array of tokens read



181
182
183
184
185
186
187
188
189
# File 'lib/tokn/tokenizer.rb', line 181

# Read a sequence of tokens.
#
# @param seq string of space-delimited token names; a name of '_'
#   allows any token in that position
# @return [Array<Token>] the tokens read
def readSequence(seq)
  seq.split(' ').map do |name|
    name == '_' ? read : read(name)
  end
end

#readSequenceIf(seq) ⇒ Object

Read a sequence of tokens, if they have particular names

Parameters:

  • seq

    string of space-delimited token names; if name is ‘_’, allows any token name in that position

Returns:

  • array of tokens read, or nil if the tokens had different names (or an end of input was encountered)



197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
# File 'lib/tokn/tokenizer.rb', line 197

# Read a sequence of tokens, but only if they have particular names.
#
# Stops (and unreads anything consumed) as soon as a token's name
# differs from the sequence, or the input ends.
#
# @param seq string of space-delimited token names; a name of '_'
#   allows any token in that position
# @return [Array<Token>] the tokens read, or nil if the tokens had
#   different names (or end of input was encountered)
def readSequenceIf(seq)
  names = seq.split(' ')
  tokens = []
  names.each do |name|
    token = peek
    break unless token
    break unless name == '_' || nameOf(token) == name
    tokens.push(read)
  end

  return tokens if tokens.size == names.size

  # Mismatch partway through: put back whatever we consumed
  unread(tokens.size)
  nil
end

#unread(count = 1) ⇒ Object

Unread one (or more) previously read tokens

Raises:

  • TokenizerException if attempt to unread token that was never read



236
237
238
239
240
241
# File 'lib/tokn/tokenizer.rb', line 236

# Unread one (or more) previously read tokens.
#
# @param count [Integer] how many tokens to unread (default 1)
# @raise [TokenizerException] if attempting to unread a token that was never read
def unread(count = 1)
  raise TokenizerException, "Cannot unread before start" if @historyPointer < count
  @historyPointer -= count
end