Class: Rhetor::LexicalAnalyser

Inherits:
Object
  • Object
show all
Defined in:
lib/rhetor/lexical_analyser.rb

Overview

LexicalAnalyser is a class that performs lexical analysis of strings using a set of predefined rules.

Author:

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(&block) ⇒ void

Creates a new lexical analyser and evaluates the passed block within it

Examples:

Creating a simple HQ9+ parser

lexer = Rhetor::LexicalAnalyser.new {
  rule 'H', :hello_world
  rule 'Q', :quine
  rule '9', :ninety_nine_bottles
  rule '+', :increment
  ignore /\s+/
}

Parameters:

  • block (Block)

    the block to be executed



30
31
32
33
34
35
36
37
38
39
# File 'lib/rhetor/lexical_analyser.rb', line 30

# Builds an analyser with empty rule tables and no string loaded, then
# runs the optional configuration block.
#
# A block that declares exactly one parameter is called with the analyser
# as its argument; any other block is evaluated in the analyser's own
# context via +instance_eval+.
#
# @param block [Block] the block to be executed
# @return [void]
def initialize(&block)
  @string_patterns = {}
  @regexp_patterns = {}
  @ignored = []
  @used_names = []
  @evaluator = {}
  @string = nil
  @position = nil
  return unless block_given?

  if block.arity == 1
    block.call(self)
  else
    instance_eval(&block)
  end
end

Instance Attribute Details

#position ⇒ Integer (readonly)

Returns the current position of the analyser.

Returns:

  • (Integer)

    the current position of the analyser



15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# File 'lib/rhetor/lexical_analyser.rb', line 15

class LexicalAnalyser
  attr_reader :string, :position

  # Creates a new lexical analyser and evaluates the passed block within it.
  # A block that declares exactly one parameter is called with the analyser
  # as its argument; any other block is run through +instance_eval+.
  # @param block [Block] the block to be executed
  # @example Creating a simple HQ9+ parser
  #   lexer = Rhetor::LexicalAnalyser.new {
  #     rule 'H', :hello_world
  #     rule 'Q', :quine
  #     rule '9', :ninety_nine_bottles
  #     rule '+', :increment
  #     ignore /\s+/
  #   }
  # @return [void]
  #
  def initialize(&block)
    @string_patterns = {}
    @regexp_patterns = {}
    @ignored = []
    @used_names = []
    @evaluator = {}
    @string = nil
    @position = nil
    (block.arity == 1) ? block[self] : instance_eval(&block) if block_given?
  end

  # Makes the analyser recognize a pattern
  # @param pattern [String, Regexp] the pattern
  # @param name [Symbol] the name of the rule
  # @param evaluator [Proc,nil] a proc. This proc will be called
  #   if the pattern is encountered. It receives a matched substring
  #   and calculates the value of the corresponding token. If this
  #   argument is omitted, the value of the token will coincide
  #   with the matched substring.
  # @raise [InvalidPattern] if the pattern is not valid
  # @raise [InvalidRuleName] unless the name of the rule is a symbol
  # @raise [RuleAlreadyExists] if the rule with the same name already exists
  # @return [void]
  #
  def rule(pattern, name, &evaluator)
    check_rule(pattern, name)
    @used_names.push name
    @evaluator[name] = evaluator
    # check_rule guarantees the class is exactly String or Regexp, so this
    # resolves to either @string_patterns or @regexp_patterns.
    array_name = "@#{pattern.class.name.downcase}_patterns".intern
    instance_variable_get(array_name)[name] = pattern
  end

  # Makes the analyser ignore a pattern. Registering the same pattern
  # twice has no further effect.
  # @param pattern [String, Regexp] the pattern to be ignored
  # @raise [InvalidPattern] unless the pattern is a String or a Regexp
  # @return [void]
  #
  def ignore(pattern)
    fail InvalidPattern unless [String, Regexp].include? pattern.class
    @ignored.push pattern unless @ignored.include? pattern
  end

  # Initiates the analysis of the string and rewinds the position to 0
  # @param string [String] the string to be analyzed
  # @raise [InvalidString] unless the argument is a String
  # @return [void]
  #
  def begin_analysis(string)
    fail InvalidString unless string.is_a? String
    @string = string
    @position = 0
    @size = string.size
  end

  # Skips ignored input, then matches string rules first and falls back to
  # regexp rules only when no string rule matched at the current position.
  # @return [Token] the next token found in the string
  # @raise [NoStringLoaded] if no string is being analyzed
  # @raise [UnmatchedString] if the analyser is unable to get the next token
  #
  def next_token
    fail NoStringLoaded unless @string
    @position = skip_ignored(@string, @position)
    return EOF_TOKEN if @position >= @size
    name, length = string_pattern(@string, @position)
    name, length = regexp_pattern(@string, @position) if length == 0
    fail UnmatchedString, "at position #{@position}" if length == 0
    token = make_token(name, @position, length)
    @position += length
    token
  end

  # Analyzes the given string
  # @param string [String] the string to be analyzed
  # @yieldparam token [Token] every encountered token
  # @return [Array<Token>] the array of encountered tokens
  #
  def analyse(string, &block)
    begin_analysis(string)
    tokens = []
    loop do
      last_token = next_token
      (last_token == EOF_TOKEN) ? break : tokens << last_token
      block.call(last_token) if block_given?
    end
    tokens
  end

  private

  # Builds the token for the rule +name+ covering +size+ characters from
  # +position+. The token value is the evaluator's result when the rule
  # has an evaluator, otherwise the matched substring itself.
  def make_token(name, position, size)
    substring = @string[position, size]
    value = @evaluator[name] ? @evaluator[name].call(substring) : substring
    Rhetor::Token.new(value, name, position, size)
  end

  # Validates the arguments of #rule, raising on the first violation.
  def check_rule(pattern, name)
    fail InvalidPattern unless [String, Regexp].include? pattern.class
    fail InvalidRuleName unless name.is_a? Symbol
    fail RuleAlreadyExists if @used_names.include? name
  end

  # Returns [name, size] for the string rule with the longest match at
  # +position+. The size is 0 when nothing matches; [nil, 0] is returned
  # when no string rules are registered.
  def string_pattern(string, position)
    results = @string_patterns.map do |name, pattern|
      [name, matched_size(pattern, string, position)]
    end
    results.max_by(&:last) || [nil, 0]
  end

  # Returns [name, size] for a regexp rule matching at +position+, or
  # [nil, 0] when none matches.
  # NOTE(review): the ascending sort followed by the first positive size
  # selects the SHORTEST matching rule, whereas string_pattern selects the
  # longest. The earlier max_by variant is kept below for reference;
  # confirm which behavior is intended.
  def regexp_pattern(string, position)
    results = @regexp_patterns.map do |name, pattern|
      [name, matched_size(pattern, string, position)]
    end
    # results.max_by(&:last) || [nil, 0]
    results.sort_by(&:last).find { |_name, size| size > 0 } || [nil, 0]
  end

  # Returns the first position at or after +position+ where no ignored
  # pattern matches. The scan repeats after every skip, so a run that
  # mixes several ignored patterns (e.g. spaces followed by a newline or
  # a comment) is consumed in full instead of only its first piece.
  def skip_ignored(string, position)
    loop do
      skipped = @ignored.map { |p| matched_size(p, string, position) }.max
      # nil means no ignored patterns are registered; 0 means none matched.
      return position if skipped.nil? || skipped == 0
      position += skipped
    end
  end

  # Returns how many characters +pattern+ matches starting exactly at
  # +position+, or 0 when it does not match there.
  def matched_size(pattern, string, position)
    if pattern.is_a? String
      (string[position, pattern.size] == pattern) ? pattern.size : 0
    elsif pattern.is_a? Regexp
      md = string.match(pattern, position)
      return 0 unless md
      # The search may find a later occurrence; only a match that begins
      # right at +position+ counts.
      md.begin(0) == position ? md[0].size : 0
    end
  end
end

#string ⇒ String (readonly)

Returns the string being analyzed.

Returns:

  • (String)

    the string being analyzed



15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# File 'lib/rhetor/lexical_analyser.rb', line 15

class LexicalAnalyser
  attr_reader :string, :position

  # Creates a new lexical analyser and evaluates the passed block within it.
  # A block that declares exactly one parameter is called with the analyser
  # as its argument; any other block is run through +instance_eval+.
  # @param block [Block] the block to be executed
  # @example Creating a simple HQ9+ parser
  #   lexer = Rhetor::LexicalAnalyser.new {
  #     rule 'H', :hello_world
  #     rule 'Q', :quine
  #     rule '9', :ninety_nine_bottles
  #     rule '+', :increment
  #     ignore /\s+/
  #   }
  # @return [void]
  #
  def initialize(&block)
    @string_patterns = {}
    @regexp_patterns = {}
    @ignored = []
    @used_names = []
    @evaluator = {}
    @string = nil
    @position = nil
    (block.arity == 1) ? block[self] : instance_eval(&block) if block_given?
  end

  # Makes the analyser recognize a pattern
  # @param pattern [String, Regexp] the pattern
  # @param name [Symbol] the name of the rule
  # @param evaluator [Proc,nil] a proc. This proc will be called
  #   if the pattern is encountered. It receives a matched substring
  #   and calculates the value of the corresponding token. If this
  #   argument is omitted, the value of the token will coincide
  #   with the matched substring.
  # @raise [InvalidPattern] if the pattern is not valid
  # @raise [InvalidRuleName] unless the name of the rule is a symbol
  # @raise [RuleAlreadyExists] if the rule with the same name already exists
  # @return [void]
  #
  def rule(pattern, name, &evaluator)
    check_rule(pattern, name)
    @used_names.push name
    @evaluator[name] = evaluator
    # check_rule guarantees the class is exactly String or Regexp, so this
    # resolves to either @string_patterns or @regexp_patterns.
    array_name = "@#{pattern.class.name.downcase}_patterns".intern
    instance_variable_get(array_name)[name] = pattern
  end

  # Makes the analyser ignore a pattern. Registering the same pattern
  # twice has no further effect.
  # @param pattern [String, Regexp] the pattern to be ignored
  # @raise [InvalidPattern] unless the pattern is a String or a Regexp
  # @return [void]
  #
  def ignore(pattern)
    fail InvalidPattern unless [String, Regexp].include? pattern.class
    @ignored.push pattern unless @ignored.include? pattern
  end

  # Initiates the analysis of the string and rewinds the position to 0
  # @param string [String] the string to be analyzed
  # @raise [InvalidString] unless the argument is a String
  # @return [void]
  #
  def begin_analysis(string)
    fail InvalidString unless string.is_a? String
    @string = string
    @position = 0
    @size = string.size
  end

  # Skips ignored input, then matches string rules first and falls back to
  # regexp rules only when no string rule matched at the current position.
  # @return [Token] the next token found in the string
  # @raise [NoStringLoaded] if no string is being analyzed
  # @raise [UnmatchedString] if the analyser is unable to get the next token
  #
  def next_token
    fail NoStringLoaded unless @string
    @position = skip_ignored(@string, @position)
    return EOF_TOKEN if @position >= @size
    name, length = string_pattern(@string, @position)
    name, length = regexp_pattern(@string, @position) if length == 0
    fail UnmatchedString, "at position #{@position}" if length == 0
    token = make_token(name, @position, length)
    @position += length
    token
  end

  # Analyzes the given string
  # @param string [String] the string to be analyzed
  # @yieldparam token [Token] every encountered token
  # @return [Array<Token>] the array of encountered tokens
  #
  def analyse(string, &block)
    begin_analysis(string)
    tokens = []
    loop do
      last_token = next_token
      (last_token == EOF_TOKEN) ? break : tokens << last_token
      block.call(last_token) if block_given?
    end
    tokens
  end

  private

  # Builds the token for the rule +name+ covering +size+ characters from
  # +position+. The token value is the evaluator's result when the rule
  # has an evaluator, otherwise the matched substring itself.
  def make_token(name, position, size)
    substring = @string[position, size]
    value = @evaluator[name] ? @evaluator[name].call(substring) : substring
    Rhetor::Token.new(value, name, position, size)
  end

  # Validates the arguments of #rule, raising on the first violation.
  def check_rule(pattern, name)
    fail InvalidPattern unless [String, Regexp].include? pattern.class
    fail InvalidRuleName unless name.is_a? Symbol
    fail RuleAlreadyExists if @used_names.include? name
  end

  # Returns [name, size] for the string rule with the longest match at
  # +position+. The size is 0 when nothing matches; [nil, 0] is returned
  # when no string rules are registered.
  def string_pattern(string, position)
    results = @string_patterns.map do |name, pattern|
      [name, matched_size(pattern, string, position)]
    end
    results.max_by(&:last) || [nil, 0]
  end

  # Returns [name, size] for a regexp rule matching at +position+, or
  # [nil, 0] when none matches.
  # NOTE(review): the ascending sort followed by the first positive size
  # selects the SHORTEST matching rule, whereas string_pattern selects the
  # longest. The earlier max_by variant is kept below for reference;
  # confirm which behavior is intended.
  def regexp_pattern(string, position)
    results = @regexp_patterns.map do |name, pattern|
      [name, matched_size(pattern, string, position)]
    end
    # results.max_by(&:last) || [nil, 0]
    results.sort_by(&:last).find { |_name, size| size > 0 } || [nil, 0]
  end

  # Returns the first position at or after +position+ where no ignored
  # pattern matches. The scan repeats after every skip, so a run that
  # mixes several ignored patterns (e.g. spaces followed by a newline or
  # a comment) is consumed in full instead of only its first piece.
  def skip_ignored(string, position)
    loop do
      skipped = @ignored.map { |p| matched_size(p, string, position) }.max
      # nil means no ignored patterns are registered; 0 means none matched.
      return position if skipped.nil? || skipped == 0
      position += skipped
    end
  end

  # Returns how many characters +pattern+ matches starting exactly at
  # +position+, or 0 when it does not match there.
  def matched_size(pattern, string, position)
    if pattern.is_a? String
      (string[position, pattern.size] == pattern) ? pattern.size : 0
    elsif pattern.is_a? Regexp
      md = string.match(pattern, position)
      return 0 unless md
      # The search may find a later occurrence; only a match that begins
      # right at +position+ counts.
      md.begin(0) == position ? md[0].size : 0
    end
  end
end

Instance Method Details

#analyse(string) {|token| ... } ⇒ Array<Token>

Analyzes the given string

Parameters:

  • string (String)

    the string to be analyzed

Yield Parameters:

  • token (Token)

    every encountered token

Returns:

  • (Array<Token>)

    the array of encountered tokens



103
104
105
106
107
108
109
110
111
112
# File 'lib/rhetor/lexical_analyser.rb', line 103

# Tokenizes +string+ from start to finish.
#
# Loads the string, then pulls tokens one at a time until the end-of-input
# token appears. Each real token is collected and, when a block was given,
# also passed to that block; the end-of-input token is neither collected
# nor yielded.
#
# @param string [String] the string to be analyzed
# @yieldparam token [Token] every encountered token
# @return [Array<Token>] the array of encountered tokens
def analyse(string, &block)
  begin_analysis(string)
  collected = []
  loop do
    current = next_token
    break if current == EOF_TOKEN

    collected << current
    block.call(current) if block
  end
  collected
end

#begin_analysis(string) ⇒ void

This method returns an undefined value.

Initiates the analysis of the string

Parameters:

  • string (String)

    the string to be analyzed



75
76
77
78
79
80
# File 'lib/rhetor/lexical_analyser.rb', line 75

# Loads +string+ as the text to analyse and rewinds the analyser to its
# first character. The string's size is cached for the end-of-input check.
#
# @param string [String] the string to be analyzed
# @raise [InvalidString] unless the argument is a String
# @return [void]
def begin_analysis(string)
  raise InvalidString unless string.is_a?(String)

  @string = string
  @position = 0
  @size = @string.size
end

#ignore(pattern) ⇒ void

This method returns an undefined value.

Makes the analyser ignore a pattern

Parameters:

  • pattern (String, Regexp)

    the pattern to be ignored



66
67
68
69
# File 'lib/rhetor/lexical_analyser.rb', line 66

# Registers +pattern+ as input to be skipped between tokens.
# Only objects whose class is exactly String or Regexp are accepted, and
# a pattern that is already registered is not added a second time.
#
# @param pattern [String, Regexp] the pattern to be ignored
# @raise [InvalidPattern] if the pattern is neither a String nor a Regexp
# @return [void]
def ignore(pattern)
  accepted = pattern.instance_of?(String) || pattern.instance_of?(Regexp)
  raise InvalidPattern unless accepted
  return if @ignored.include?(pattern)

  @ignored << pattern
end

#next_token ⇒ Token

Returns the next token found in the string.

Returns:

  • (Token)

    the next token found in the string

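Raises:

  • (NoStringLoaded)

    if no string is being analyzed

  • (UnmatchedString)

    if the analyser is unable to get the next token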



86
87
88
89
90
91
92
93
94
95
96
# File 'lib/rhetor/lexical_analyser.rb', line 86

# Produces the next token of the loaded string.
#
# Ignored input at the current position is skipped first. String rules
# are then tried, and regexp rules are consulted only when no string rule
# matched. After a successful match the position moves past the token.
#
# @return [Token] the next token found in the string
# @raise [NoStringLoaded] if no string is being analyzed
# @raise [UnmatchedString] if the analyser is unable to get the next token
def next_token
  raise NoStringLoaded unless @string

  @position = skip_ignored(@string, @position)
  return EOF_TOKEN if @position >= @size

  rule_name, width = string_pattern(@string, @position)
  rule_name, width = regexp_pattern(@string, @position) if width == 0
  raise UnmatchedString, "at position #{@position}" if width == 0

  # Build the token first, then advance past it.
  make_token(rule_name, @position, width).tap { @position += width }
end

#rule(pattern, name, &evaluator) ⇒ void

This method returns an undefined value.

Makes the analyser recognize a pattern

Parameters:

  • pattern (String, Regexp)

    the pattern

  • name (Symbol)

    the name of the rule

  • evaluator (Proc, nil)

    a proc. This proc will be called if the pattern is encountered. It receives a matched substring and calculates the value of the corresponding token. If this argument is omitted, the value of the token will coincide with the matched substring.

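Raises:

  • (InvalidPattern)

    if the pattern is not valid

  • (InvalidRuleName)

    unless the name of the rule is a symbol

  • (RuleAlreadyExists)

    if the rule with the same name already exists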



54
55
56
57
58
59
60
# File 'lib/rhetor/lexical_analyser.rb', line 54

# Registers a named rule that turns text matching +pattern+ into a token.
#
# After validation the name is recorded as used, the optional evaluator
# block is stored under it (nil when no block is given), and the pattern
# is filed in the table matching its class: @string_patterns for a String,
# @regexp_patterns for a Regexp.
#
# @param pattern [String, Regexp] the pattern
# @param name [Symbol] the name of the rule
# @param evaluator [Proc,nil] a proc that receives the matched substring
#   and calculates the value of the corresponding token
# @return [void]
def rule(pattern, name, &evaluator)
  check_rule(pattern, name)
  @used_names << name
  @evaluator.store(name, evaluator)
  table_ivar = :"@#{pattern.class.name.downcase}_patterns"
  table = instance_variable_get(table_ivar)
  table[name] = pattern
end