Class: SimpleLexer::Lexer

Inherits: Object
Defined in:
lib/simple_lexer.rb

Overview

An object configured with a set of rules that takes text as input and outputs Tokens according to those rules.
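For example, a minimal sketch of a working Lexer (assuming the gem is required as simple_lexer, matching the file path below):

require 'simple_lexer'

# Two token rules; whitespace is consumed without emitting tokens.
lexer = SimpleLexer::Lexer.new do
  tok(/-?\d+(\.\d+)?/, :number) { |text| text.to_f }
  tok(/\w+/, :identifier)
  ign :whitespace
end

lexer.load = "width 42"
lexer.all_tokens
# => [{:token=>:identifier, :text=>"width", :value=>nil},
#     {:token=>:number, :text=>"42", :value=>42.0}]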

Instance Attribute Summary

  • #pos ⇒ Fixnum

    The current position of the input pointer.

  • #rules ⇒ Array<Hash> (readonly)

    A list of the rules for the Lexer.

Instance Method Summary

  • #all_tokens ⇒ Array<Hash>
  • #finished? ⇒ Boolean
  • #ign(rule) ⇒ Object
  • #initialize { ... } ⇒ Lexer (constructor)
  • #load ⇒ String
  • #load=(input) ⇒ Object
  • #next_token ⇒ Hash{Symbol=>Values}
  • #tok(rule, token) {|text| ... } ⇒ Object

Constructor Details

#initialize { ... } ⇒ Lexer

Creates a new instance of Lexer.

Yields:

  • A block defining the Lexer's rules (calls to #tok and #ign), evaluated with instance_eval.

See Also:

  • #tok

    An example of a number Lexer using tok.


# File 'lib/simple_lexer.rb', line 30

def initialize(&rules)
  @rules = [] # list of {:rule => Regexp, :token => :token_id}
  @ignore = [] # list of Regexp
  @pos = 0 # position in input
  instance_eval &rules
end

Instance Attribute Details

#pos ⇒ Fixnum

Returns The current position of the input pointer.

Returns:

  • (Fixnum)

    The current position of the input pointer.



# File 'lib/simple_lexer.rb', line 22

class Lexer

  attr_reader :rules
  attr_accessor :pos

  # Creates a new instance of Lexer.
  # @yield [] Some rules passed to instance_eval.
  # @see #tok An example of a number Lexer using <code>tok</code>. 
  def initialize(&rules)
    @rules = [] # list of {:rule => Regexp, :token => :token_id}
    @ignore = [] # list of Regexp
    @pos = 0 # position in input
    instance_eval &rules
  end

  # Defines a new Token rule for the Lexer to match.
  # @param [Regexp] rule Regular expression that defines the token 
  # @param [Symbol] token Token class 
  # @yield [text] The expression will give the Token its value. 
  # @example Rule for numbers
  #   my_lexer = SimpleLexer::Lexer.new do
  #     tok /-?\d+(\.\d+)?/, :number do |text| text.to_f end
  #   end
  #   my_lexer.load = "-435.234"
  #   puts my_lexer.next_token[:value] # -435.234
  def tok(rule, token, &action)
    @rules << {:rule => Regexp.new('\A' + rule.source), :token => token, :action => action}
  end

  # Defines rules of input classes to ignore (consume and not output any
  # tokens.)
  # @param [Regexp, Symbol] rule Regular expression that defines ignored
  #   characters.
  # @note You can set _rule_ to <code>:whitespace</code> to ignore whitespace
  #     characters.
  # @example Ignoring parentheses
  #   my_lexer = SimpleLexer::Lexer.new do
  #     tok /\w+/, :identifier
  #     ign /[\(\)]/
  #   end
  # @example Ignoring whitespace
  #   my_lexer = SimpleLexer::Lexer.new do
  #     tok /\w+/, :identifier
  #     ign :whitespace
  #   end
  def ign(rule) 
    if rule == :whitespace
      rule = /\s+/
    end
    
    @ignore << Regexp.new('\A' + rule.source)
  end

  # Give the Lexer some text to tokenize.
  # @param [String] input Text for the Lexer to tokenize.
  def load=(input)
    @load = input 
    @pos = 0 
  end
 
  # What still remains to be processed.
  # @return [String] Substring of the input starting from input pointer.
  def load 
    @load[@pos..-1]
  end

  # Gets the next Token in the input and advances the input pointer.
  # @return [Hash{Symbol=>Values}]
  #   - <code>:token</code> Token class
  #   - <code>:text</code> Matched text
  #   - <code>:value</code> Value as defined by passed block, if applicable.
  # @raise [NoMatchError] If load contains a sequence for which the Lexer has
  #   no rule.
  def next_token
    # get the next token
    # my_lexer.next_token -> [ :token => :token_id, :text => matched ]
    for rule in @ignore
      if match = load[rule]
        @pos += match.length
      end
    end

    if @pos >= @load.length
      raise EndOfStreamException, "Finished lexing, no more tokens left."
    end

    for rule in @rules
      if match = load[rule[:rule]]
        @pos += match.length
        return {:token => rule[:token], :text => match, 
                :value => (!rule[:action].nil? ? rule[:action].call(match) : nil) } 
      end
    end

    raise NoMatchError, "Unable to match, unexpected characters: '#{load[0..10]}...'"
  end

  # Tokenize the entire input stream.
  # @return [Array<Hash>] An Array of Tokens processed by the Lexer
  def all_tokens  
    tokens = []  
    loop do
      tokens << next_token
    end
  rescue EndOfStreamException => e
    tokens 
  end

  # Checks if the Lexer has finished Tokenizing the entire input stream.
  # @return [Boolean] Whether Lexer has reached the end of input.
  def finished?
    return @pos >= @load.length
  end

end
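Because pos is exposed through attr_accessor, it can also be written, which allows rewinding. A small sketch, reusing the lexer from the Overview example:

lexer.load = "abc def"
lexer.next_token         # consumes "abc"; lexer.pos is now 3
lexer.pos = 0            # rewind the input pointer
lexer.next_token[:text]  # => "abc" again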

#rules ⇒ Array<Hash> (readonly)

Returns A list of the rules for the Lexer.

Returns:

  • (Array<Hash>)

    A list of the rules for the Lexer.
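Each element is a Hash of the form {:rule => Regexp, :token => Symbol, :action => Proc} (where :action is nil if no block was given), so the registered rules can be inspected directly; a quick sketch:

lexer = SimpleLexer::Lexer.new do
  tok(/\d+/, :number)
end
lexer.rules.first[:token]        # => :number
lexer.rules.first[:rule].source  # => "\\A\\d+" (anchored by #tok)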




Instance Method Details

#all_tokens ⇒ Array<Hash>

Tokenize the entire input stream.

Returns:

  • (Array<Hash>)

    An Array of Tokens processed by the Lexer.



# File 'lib/simple_lexer.rb', line 121

def all_tokens  
  tokens = []  
  loop do
    tokens << next_token
  end
rescue EndOfStreamException => e
  tokens 
end
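For instance, with a number Lexer like the one in #tok's example (whitespace ignored):

lexer = SimpleLexer::Lexer.new do
  tok(/-?\d+(\.\d+)?/, :number) { |text| text.to_f }
  ign :whitespace
end
lexer.load = "1 2.5 -3"
lexer.all_tokens.map { |t| t[:value] }  # => [1.0, 2.5, -3.0]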

#finished? ⇒ Boolean

Checks if the Lexer has finished tokenizing the entire input stream.

Returns:

  • (Boolean)

    Whether Lexer has reached the end of input.



# File 'lib/simple_lexer.rb', line 132

def finished?
  return @pos >= @load.length
end
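A sketch of driving the Lexer by hand with finished?, assuming an identifier rule with whitespace ignored (as in #ign's example):

lexer.load = "a b c"
until lexer.finished?
  puts lexer.next_token[:text]
end

Note that trailing ignored characters still count toward the input length, so with trailing whitespace the final next_token call raises EndOfStreamException; the rescue-based loop in #all_tokens is the safer pattern for such input.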

#ign(rule) ⇒ Object

Note:

You can set rule to :whitespace to ignore whitespace characters.

Defines rules for input the Lexer should ignore (consumed without emitting any tokens).

Examples:

Ignoring parentheses

my_lexer = SimpleLexer::Lexer.new do
  tok /\w+/, :identifier
  ign /[\(\)]/
end

Ignoring whitespace

my_lexer = SimpleLexer::Lexer.new do
  tok /\w+/, :identifier
  ign :whitespace
end

Parameters:

  • rule (Regexp, Symbol)

    Regular expression that defines ignored characters.



# File 'lib/simple_lexer.rb', line 67

def ign(rule) 
  if rule == :whitespace
    rule = /\s+/
  end
  
  @ignore << Regexp.new('\A' + rule.source)
end

#load ⇒ String

What still remains to be processed.

Returns:

  • (String)

    Substring of the input starting from input pointer.



# File 'lib/simple_lexer.rb', line 84

def load 
  @load[@pos..-1]
end

#load=(input) ⇒ Object

Give the Lexer some text to tokenize.

Parameters:

  • input (String)

    Text for the Lexer to tokenize.



# File 'lib/simple_lexer.rb', line 77

def load=(input)
  @load = input 
  @pos = 0 
end
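Together with #load, this makes it easy to watch the input pointer advance; a small sketch using the identifier Lexer from #ign's whitespace example:

lexer.load = "foo bar"
lexer.load        # => "foo bar"
lexer.next_token  # consumes "foo"
lexer.load        # => " bar"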

#next_token ⇒ Hash{Symbol=>Values}

Gets the next Token in the input and advances the input pointer.

Returns:

  • (Hash{Symbol=>Values})
    • :token Token class

    • :text Matched text

    • :value Value as defined by passed block, if applicable.

Raises:

  • (NoMatchError)

    If load contains a sequence for which the Lexer has no rule.



# File 'lib/simple_lexer.rb', line 95

def next_token
  # get the next token
  # my_lexer.next_token -> [ :token => :token_id, :text => matched ]
  for rule in @ignore
    if match = load[rule]
      @pos += match.length
    end
  end

  if @pos >= @load.length
    raise EndOfStreamException, "Finished lexing, no more tokens left."
  end

  for rule in @rules
    if match = load[rule[:rule]]
      @pos += match.length
      return {:token => rule[:token], :text => match, 
              :value => (!rule[:action].nil? ? rule[:action].call(match) : nil) } 
    end
  end

  raise NoMatchError, "Unable to match, unexpected characters: '#{load[0..10]}...'"
end
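A sketch of pulling tokens one at a time while handling both errors next_token can raise, assuming the error classes live in the SimpleLexer namespace and reusing the identifier Lexer from #ign's whitespace example:

lexer.load = "foo $"
begin
  loop { p lexer.next_token }
rescue SimpleLexer::EndOfStreamException
  # all input consumed
rescue SimpleLexer::NoMatchError => e
  warn e.message  # Unable to match, unexpected characters: '$...'
end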

#tok(rule, token) {|text| ... } ⇒ Object

Defines a new Token rule for the Lexer to match.

Examples:

Rule for numbers

my_lexer = SimpleLexer::Lexer.new do
  tok /-?\d+(\.\d+)?/, :number do |text| text.to_f end
end
my_lexer.load = "-435.234"
puts my_lexer.next_token[:value] # -435.234

Parameters:

  • rule (Regexp)

    Regular expression that defines the token

  • token (Symbol)

    Token class

Yields:

  • (text)

    The block's return value gives the Token its value.



# File 'lib/simple_lexer.rb', line 47

def tok(rule, token, &action)
  @rules << {:rule => Regexp.new('\A' + rule.source), :token => token, :action => action}
end