Class: Lexer

Inherits:
TokenSource show all
Defined in:
lib/antlr4/Lexer.rb

Overview

A lexer is a recognizer that draws input symbols from a character stream.

Lexer grammars result in a subclass of this class. A Lexer object
uses simplified match() and error recovery mechanisms in the interest
of speed.

Constant Summary collapse

DEFAULT_MODE =
0
MORE =
-2
SKIP =
-3
DEFAULT_TOKEN_CHANNEL =
Token::DEFAULT_CHANNEL
HIDDEN =
Token::HIDDEN_CHANNEL
MIN_CHAR_VALUE =
"\u0000"
MAX_CHAR_VALUE =
"\uFFFE"

Instance Attribute Summary collapse

Attributes inherited from Recognizer

#interp, #listeners, #ruleIndexMapCache, #state, #tokenTypeMapCache

Instance Method Summary collapse

Methods inherited from Recognizer

#addErrorListener, #checkVersion, #extractVersion, #getErrorHeader, #getErrorListenerDispatch, #getRuleIndexMap, #getState, #getTokenErrorDisplay, #getTokenType, #getTokenTypeMap, #precpred, #sempred

Constructor Details

#initialize(_input) ⇒ Lexer

Returns a new instance of Lexer.



21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# File 'lib/antlr4/Lexer.rb', line 21

# Creates a lexer that reads from the given character stream.
#
# @param _input the character stream to tokenize; kept in @input and paired
#   with this lexer in @tokenFactorySourcePair
def initialize(_input)
    super()
    @input = _input
    @factory = CommonTokenFactory.DEFAULT
    @tokenFactorySourcePair = [self, _input]

    # Deliberately left empty here: child classes must populate this.
    @interp = nil

    # Every lexer rule ultimately produces a token object. It lives in an
    # instance variable because several rules may cooperate on one token;
    # nextToken hands this object back once the rule(s) have matched. A
    # subclass that emits several tokens should leave the last one (or any
    # non-nil value) here so that the automatic emit does not add another.
    @token = nil

    # Where the current token begins: character index in the stream (needed,
    # for example, to fetch the token's text), then the line and the column
    # of its first character. All three are recorded at the start of
    # nextToken and hold -1 until then.
    @tokenStartColumn = @tokenStartLine = @tokenStartCharIndex = -1

    # After EOF is seen on the char stream the next token will be EOF.
    # With a rule such as DONE : EOF ; you therefore get DONE followed by EOF.
    @hitEOF = false

    # Channel number of the current token.
    @channel = Token::DEFAULT_CHANNEL

    # Token type of the current token.
    @type = Token::INVALID_TYPE

    @modeStack = []
    @mode = Lexer::DEFAULT_MODE

    # Optional replacement for the text found in the input char buffer.
    # Assign it through setText() or by writing this instance variable.
    @text = nil
end

Instance Attribute Details

#channelObject

Returns the value of attribute channel.



19
20
21
# File 'lib/antlr4/Lexer.rb', line 19

# Reader for @channel, the channel number of the token currently being
# matched. Both initialize and nextToken set it to Token::DEFAULT_CHANNEL.
def channel
  @channel
end

#factoryObject

Returns the value of attribute factory.



17
18
19
# File 'lib/antlr4/Lexer.rb', line 17

# Reader for @factory, the token factory on which emit and emitEOF call
# `create`. initialize sets it to CommonTokenFactory.DEFAULT.
def factory
  @factory
end

#hitEOFObject

Returns the value of attribute hitEOF.



19
20
21
# File 'lib/antlr4/Lexer.rb', line 19

# Reader for @hitEOF. nextToken sets the flag when the next symbol on the
# char stream is Token::EOF; while it is set, nextToken emits an EOF token.
def hitEOF
  @hitEOF
end

#inputObject

Returns the value of attribute input.



17
18
19
# File 'lib/antlr4/Lexer.rb', line 17

# Reader for @input, the character stream given to the constructor (or
# assigned later through inputStream=).
def input
  @input
end

#modeObject

Returns the value of attribute mode.



19
20
21
# File 'lib/antlr4/Lexer.rb', line 19

# Reader for @mode, the lexer mode that nextToken passes to interp.match.
# Starts as Lexer::DEFAULT_MODE and is changed by pushMode and popMode.
def mode
  @mode
end

#modeStackObject

Returns the value of attribute modeStack.



19
20
21
# File 'lib/antlr4/Lexer.rb', line 19

# Reader for @modeStack, the Array of saved modes that pushMode appends to
# and popMode pops from.
def modeStack
  @modeStack
end

#textObject

Return the text matched so far for the current token or any

text override.


252
253
254
# File 'lib/antlr4/Lexer.rb', line 252

# Reader for @text, the text override for the current token. It is nil
# unless an override has been assigned; nextToken clears it before matching
# each token.
# NOTE(review): as shown, this returns only the override and does not fall
# back to the text matched in the char stream - confirm that is intended.
def text
  @text
end

#tokenObject

Returns the value of attribute token.



18
19
20
# File 'lib/antlr4/Lexer.rb', line 18

# Reader for @token, the token produced during the current nextToken call.
# nextToken resets it to nil before matching and emitToken stores the result.
def token
  @token
end

#tokenFactorySourcePairObject

Returns the value of attribute tokenFactorySourcePair.



17
18
19
# File 'lib/antlr4/Lexer.rb', line 17

# Reader for @tokenFactorySourcePair, the [lexer, input] pair that emit and
# emitEOF pass as the first argument to factory.create.
def tokenFactorySourcePair
  @tokenFactorySourcePair
end

#tokenStartCharIndexObject

Returns the value of attribute tokenStartCharIndex.



18
19
20
# File 'lib/antlr4/Lexer.rb', line 18

# Reader for @tokenStartCharIndex, the char-stream index at which the
# current token starts. Holds -1 until nextToken records input.index.
def tokenStartCharIndex
  @tokenStartCharIndex
end

#tokenStartColumnObject

Returns the value of attribute tokenStartColumn.



18
19
20
# File 'lib/antlr4/Lexer.rb', line 18

# Reader for @tokenStartColumn, the column of the current token's first
# character. Holds -1 until nextToken records interp.column.
def tokenStartColumn
  @tokenStartColumn
end

#tokenStartLineObject

Returns the value of attribute tokenStartLine.



18
19
20
# File 'lib/antlr4/Lexer.rb', line 18

# Reader for @tokenStartLine, the line on which the current token's first
# character resides. Holds -1 until nextToken records interp.line.
def tokenStartLine
  @tokenStartLine
end

#typeObject

Returns the value of attribute type.



19
20
21
# File 'lib/antlr4/Lexer.rb', line 19

# Reader for @type, the token type of the token being matched. nextToken
# resets it to Token::INVALID_TYPE before each match attempt; skip and more
# set it to Lexer::SKIP and Lexer::MORE respectively.
def type
  @type
end

Instance Method Details

#columnObject



237
238
239
# File 'lib/antlr4/Lexer.rb', line 237

# Column position as currently tracked by the interpreter.
#
# @return the value of interp.column
def column
    interp.column
end

#column=(column) ⇒ Object



241
242
243
# File 'lib/antlr4/Lexer.rb', line 241

# Overwrites the column position tracked by the interpreter.
#
# @param column the new column value, forwarded to interp.column=
def column=(column)
    interp.column = column
end

#emitObject

The standard method called to automatically emit a token at the

outermost lexical rule.  The token object should point into the
char buffer start..stop.  If there is a text override in 'text',
use that to set the token's text.  Override this method to emit
custom Token objects or provide a new factory.





208
209
210
211
212
213
# File 'lib/antlr4/Lexer.rb', line 208

# Builds a token for the text matched since the token start and stores it
# through emitToken.
#
# The token spans tokenStartCharIndex up to the character just before the
# current char index, and carries the current type, text override, channel
# and start line/column.
#
# @return the token created by the factory
def emit
    created = factory.create(tokenFactorySourcePair, type, text, channel,
                             tokenStartCharIndex, getCharIndex - 1,
                             tokenStartLine, tokenStartColumn)
    emitToken(created)
    created
end

#emitEOFObject



215
216
217
218
219
220
221
222
223
224
225
226
227
# File 'lib/antlr4/Lexer.rb', line 215

# Creates the EOF token, stores it through emitToken and returns it.
#
# The EOF token sits at input.index, has stop index input.index - 1, uses the
# current line and Token::DEFAULT_CHANNEL. Its column is the interpreter's
# column, unless a previous token is still stored, in which case it is placed
# one past that token's last character.
#
# @return the EOF token created by the factory
def emitEOF()
    eof_column = column
    previous = token
    unless previous.nil?
        width = previous.stop - previous.start + 1
        eof_column = previous.column + width
    end
    stream = input
    eof = factory.create(tokenFactorySourcePair, Token::EOF, nil, Token::DEFAULT_CHANNEL,
                         stream.index, stream.index - 1, line, eof_column)
    emitToken(eof)
    eof
end

#emitToken(token) ⇒ Object

By default does not support multiple emits per nextToken invocation

for efficiency reasons.  Subclass and override this method, nextToken,
and getToken (to push tokens into a list and pull from that list
rather than a single variable as this implementation does).





198
199
200
# File 'lib/antlr4/Lexer.rb', line 198

# Stores the given token as the lexer's current token through the token=
# writer. nextToken returns whatever token is stored this way, so only a
# single token is kept per call.
#
# @param token the token to hand back from nextToken
def emitToken(token)
    self.token = token
end

#getAllTokensObject

Return a list of all Token objects in input char stream.

Forces load of all tokens. Does not include EOF token.





269
270
271
272
273
274
275
276
277
# File 'lib/antlr4/Lexer.rb', line 269

# Drains the lexer: calls nextToken repeatedly and collects every token
# until one of type Token::EOF appears. The EOF token itself is not included.
#
# @return [Array] the tokens read, in order
def getAllTokens
    collected = []
    current = nextToken
    until current.type == Token::EOF
        collected << current
        current = nextToken
    end
    collected
end

#getCharErrorDisplay(c) ⇒ Object



311
312
313
# File 'lib/antlr4/Lexer.rb', line 311

# Wraps the error-display form of a character in single quotes.
#
# @param c the character to display, passed on to getErrorDisplayForChar
# @return [String] the quoted display text
def getCharErrorDisplay(c)
    shown = getErrorDisplayForChar(c)
    "'" + shown + "'"
end

#getCharIndexObject

Returns the index of the current lookahead character in the input stream.



246
247
248
# File 'lib/antlr4/Lexer.rb', line 246

# Index of the current lookahead character.
#
# @return the value of input.index
def getCharIndex
    input.index
end

#getErrorDisplay(s) ⇒ Object



287
288
289
290
291
292
# File 'lib/antlr4/Lexer.rb', line 287

# Converts a string into its error-message form by passing every character
# through getErrorDisplayForChar and concatenating the results.
#
# @param s [String] the text to display
# @return [String] the display text
def getErrorDisplay(s)
    out = StringIO.new
    s.each_char { |ch| out.write(getErrorDisplayForChar(ch)) }
    out.string
end

#getErrorDisplayForChar(c) ⇒ Object



293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
# File 'lib/antlr4/Lexer.rb', line 293

# Renders a single character for use in an error message.
#
# @param c [String, Integer] a one-character string, or an integer code point
#   (which may be Token::EOF)
# @return [String] "<EOF>" for Token::EOF, a backslash escape for newline,
#   tab and carriage return, otherwise the character itself
def getErrorDisplayForChar(c)
    if c.is_a?(Integer)
        # Integer#[] is a bit lookup, so the previous c[0].ord test could
        # never equal Token::EOF and the raw integer was returned instead of
        # a string. Handle code points explicitly.
        return "<EOF>" if c == Token::EOF
        begin
            c = [c].pack("U")
        rescue RangeError
            # Not a representable code point: show the replacement character.
            return "\ufffd"
        end
    end
    # String input (including "" and strings with invalid bytes) needs no
    # code point lookup; only the three control characters are escaped.
    case c
    when "\n" then "\\n"
    when "\t" then "\\t"
    when "\r" then "\\r"
    else c
    end
end

#getRuleNamesObject



331
332
333
# File 'lib/antlr4/Lexer.rb', line 331

# Accessor-style alias for the lexer's rule names.
#
# @return the value of ruleNames
def getRuleNames
    ruleNames
end

#inputStreamObject

Returns the current char stream (the same object as #input). Assigning through #inputStream= sets the char stream and resets the lexer.



177
178
179
# File 'lib/antlr4/Lexer.rb', line 177

# The character stream this lexer currently reads from.
#
# @return the value of input
def inputStream
    input
end

#inputStream=(input) ⇒ Object



181
182
183
184
185
186
187
# File 'lib/antlr4/Lexer.rb', line 181

# Replaces the char stream and resets the lexer.
#
# @param input the new character stream
def inputStream=(input)
    # Detach the old stream first: reset rewinds a non-nil input, and the
    # stream being replaced should not be touched.
    self.input = nil
    self.tokenFactorySourcePair = [self, nil]
    self.reset()
    # Attach the new stream and rebuild the pair handed to the token factory.
    self.input = input
    self.tokenFactorySourcePair = [self, self.input]
end

#lineObject



229
230
231
# File 'lib/antlr4/Lexer.rb', line 229

# Line number as currently tracked by the interpreter.
#
# @return the value of interp.line
def line
    interp.line
end

#line=(line) ⇒ Object



233
234
235
# File 'lib/antlr4/Lexer.rb', line 233

# Overwrites the line number tracked by the interpreter.
#
# @param line the new line value, forwarded to interp.line=
def line=(line)
    interp.line = line
end

#moreObject



155
156
157
# File 'lib/antlr4/Lexer.rb', line 155

# Marks the current match as Lexer::MORE. While the type is MORE, nextToken
# keeps matching instead of finishing the token.
def more
    self.type = Lexer::MORE
end

#nextTokenObject

Return a token from self source; i.e., match a token on the char

stream.


90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# File 'lib/antlr4/Lexer.rb', line 90

# Matches and returns the next token from the char stream.
#
# Returns the token stored by emit/emitToken (or the EOF token once the end
# of input has been reached). Raises IllegalStateException when no input
# stream is set.
def nextToken
    if self.input.nil? 
        raise IllegalStateException.new("nextToken requires a non-null input stream.")
    end

    # Mark start location in char stream so unbuffered streams are
    # guaranteed at least have text of current token
    tokenStartMarker = self.input.mark()
    begin
        # Outer loop: one iteration per token attempt; skipped matches start over.
        while true do 
            if self.hitEOF then
                self.emitEOF()
                return self.token
            end
            # Reset the per-token state and record where this token starts.
            self.token = nil
            self.channel = Token::DEFAULT_CHANNEL
            self.tokenStartCharIndex = self.input.index
            self.tokenStartColumn = self.interp.column
            self.tokenStartLine = self.interp.line
            self.text = nil
            continueOuter = false
            # Inner loop: repeats while the type is MORE.
            while true do 
                self.type = Token::INVALID_TYPE
                # Defaults to SKIP so that a failed match (rescued below) is
                # treated like a skipped token.
                ttype = Lexer::SKIP
                begin
                    ttype = self.interp.match(self.input, self.mode)
                rescue LexerNoViableAltException => e
                    self.notifyListeners(e)		# report error
                    self.recover(e)
                end
                if self.input.LA(1)==Token::EOF then
                    self.hitEOF = true
                end
                # Use the matched type only if nothing assigned a type during the match.
                if self.type == Token::INVALID_TYPE
                    self.type = ttype
              
                end
                if self.type == Lexer::SKIP
                    continueOuter = true
                    break
                end
                if self.type!= Lexer::MORE
                    break
                end
            end
            next if continueOuter
            # Emit automatically unless a token has already been stored.
            self.emit() if self.token.nil?
            return self.token
        end
    ensure  
        # make sure we release marker after match or
        # unbuffered char stream will keep buffering
        self.input.release(tokenStartMarker)
    end
end

#notifyListeners(e) ⇒ Object

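Reports a token recognition error to the error listener dispatch. The parameter e is the LexerNoViableAltException that was raised.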



278
279
280
281
282
283
284
285
# File 'lib/antlr4/Lexer.rb', line 278

# Reports a token recognition error to the error listener dispatch.
#
# The message quotes the input text from the start of the current token up
# to the current stream index, rendered with getErrorDisplay.
#
# @param e the LexerNoViableAltException being reported
# @return whatever the listener's syntaxError call returns
def notifyListeners(e) # :LexerNoViableAltException):
    first = tokenStartCharIndex
    last = input.index
    offending = input.getText(first, last)
    message = "token recognition error at: '#{getErrorDisplay(offending)}'"
    dispatch = getErrorListenerDispatch
    dispatch.syntaxError(self, nil, tokenStartLine, tokenStartColumn, message, e)
end

#popModeObject



165
166
167
168
169
170
171
172
173
174
# File 'lib/antlr4/Lexer.rb', line 165

# Leaves the current lexer mode and returns to the one saved by the matching
# pushMode.
#
# @return the mode that is now current (the value popped from modeStack)
# @raise [Exception] with message "Empty Stack" when no mode has been pushed;
#   the broad Exception class is kept so existing rescuers keep working
def popMode
    if self.modeStack.empty? then
        raise Exception.new("Empty Stack")
    end
    if self.interp.debug then
        # Report the mode being restored, i.e. the top of the stack. The
        # previous message printed the stack minus its top element, which is
        # everything except the mode actually returned to.
        puts "popMode back to #{self.modeStack.last}"
    end
    self.mode = self.modeStack.pop()
    return self.mode
end

#pushMode(m) ⇒ Object



158
159
160
161
162
163
164
# File 'lib/antlr4/Lexer.rb', line 158

# Switches the lexer to mode m, saving the current mode on modeStack so that
# popMode can restore it. Prints a trace line when interp.debug is set.
#
# @param m the mode to enter
# @return the new mode
def pushMode(m)
    puts "pushMode #{m}" if interp.debug
    modeStack.push(mode)
    self.mode = m
end

#recover(re) ⇒ Object

Lexers can normally match any char in its vocabulary after matching

a token, so do the easy thing and just kill a character and hope
it all works out.  You can instead use the rule invocation stack
to do sophisticated error recovery if you are in a fragment rule.





320
321
322
323
324
325
326
327
328
329
330
# File 'lib/antlr4/Lexer.rb', line 320

# Recovers from a recognition error by dropping one character, unless the
# stream is already at EOF (then nothing is consumed).
#
# @param re the exception being recovered from; a LexerNoViableAltException
#   is consumed through the interpreter, anything else directly on the input
def recover(re) # :RecognitionException):
    return if input.LA(1) == Token::EOF

    if re.kind_of?(LexerNoViableAltException)
        # skip a char and try again
        interp.consume(input)
    else
        # TODO: Do we lose character or line position information?
        input.consume
    end
end

#resetObject



68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# File 'lib/antlr4/Lexer.rb', line 68

# Puts the lexer back into its initial state: rewinds the input (when one is
# set), clears all per-token state, returns to the default mode with an empty
# mode stack, and finally resets the interpreter.
def reset
    # rewind the input
    input.seek(0) unless input.nil?

    # wipe the per-token state
    self.token = nil
    self.type = Token::INVALID_TYPE
    self.channel = Token::DEFAULT_CHANNEL
    self.tokenStartLine = self.tokenStartColumn = self.tokenStartCharIndex = -1
    self.text = nil

    self.hitEOF = false
    self.mode = Lexer::DEFAULT_MODE
    self.modeStack = []

    interp.reset
end

#skipObject

Instruct the lexer to skip creating a token for current lexer rule

and look for another token.  nextToken() knows to keep looking when
a lexer rule finishes with the type set to SKIP.  Recall that
if token is nil at the end of any token rule, it creates one for you
and emits it.





152
153
154
# File 'lib/antlr4/Lexer.rb', line 152

# Marks the current match as Lexer::SKIP. nextToken then discards the match
# and starts looking for another token.
def skip
    self.type = Lexer::SKIP
end

#sourceNameObject



189
190
191
# File 'lib/antlr4/Lexer.rb', line 189

# Name of the source the lexer reads from.
#
# @return the value of input.sourceName
def sourceName
    input.sourceName
end