Class: List::Matcher

Inherits:
Object
  • Object
show all
Defined in:
lib/list_matcher.rb

Defined Under Namespace

Classes: Alternate, CharClass, Error, Leaf, Node, Sequence, Special, SpecialPattern

Constant Summary collapse

QRX =

Used to build a replacement for Regexp.quote that leaves unescaped those characters that only need quoting inside character classes.

# Single-character capture class holding every character with code 1..255
# that Regexp.quote would change, with '-' removed from the set.
Regexp.new "([" + ( (1..255).map(&:chr).select{ |c| Regexp.quote(c) != c } - %w(-) ).map{ |c| Regexp.quote c }.join + "])"

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(atomic: true, backtracking: true, bound: false, strip: false, case_insensitive: false, multiline: false, not_extended: false, normalize_whitespace: false, symbols: {}, name: false, vet: false) ⇒ Matcher

Returns a new instance of Matcher.



25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
# File 'lib/list_matcher.rb', line 25

def initialize(
      atomic:               true,
      backtracking:         true,
      bound:                false,
      strip:                false,
      case_insensitive:     false,
      multiline:            false,
      not_extended:         false,
      normalize_whitespace: false,
      symbols:              {},
      name:                 false,
      vet:                  false
    )
  @atomic               = atomic
  @backtracking         = backtracking
  @strip                = strip || normalize_whitespace
  @case_insensitive     = case_insensitive
  @multiline            = multiline
  @not_extended         = not_extended
  @symbols              = deep_dup symbols
  @_bound               = bound
  @bound                = !!bound
  @normalize_whitespace = normalize_whitespace
  @vet                  = vet
  if name
    raise Error, "name must be a string or symbol" unless name.is_a?(String) || name.is_a?(Symbol)
    begin
      Regexp.new "(?<#{name}>.*)"   # stir up any errors that might arise from using this name in a named capture
      @name = name
    rescue
      raise Error, "#{name} does not work as the name of a named group"
    end
  end
  case bound
  when TrueClass
    @word_test   = /\w/
    @left_bound  = '\b'
    @right_bound = '\b'
  when FalseClass
  when Symbol
    case bound
    when :string, :string_left, :string_right
      @word_test   = /./
      @left_bound  = '\A'
      @right_bound = '\z'
    when :line, :line_left, :line_right
      @word_test   = /./
      @left_bound  = '^'
      @right_bound = '$'
    when :word, :word_left, :word_right
      @word_test   = /\w/
      @left_bound  = '\b'
      @right_bound = '\b'
    else
      raise Error, "unfamiliar value for :bound option: #{bound.inspect}"
    end
    if /_left/ === bound.to_s
      @right_bound = nil
    elsif /_right/ === bound.to_s
      @left_bound = nil
    end
  when Hash
    @word_test   = bound[:test] || raise( Error, 'no boundary test provided' )
    @left_bound  = bound[:left]
    @right_bound = bound[:right]
    raise Error, 'neither bound provided' unless @left_bound || @right_bound
    raise Error, 'test must be Regexp or String' unless @word_test.is_a?(Regexp) || @word_test.is_a?(String)
    @word_test = Regexp.new @word_test unless @word_test.is_a?(Regexp)
    [ @left_bound, @right_bound ].compact.each do |b|
      raise Error, 'bounds must be strings' unless b.is_a?(String)
      begin
        Regexp.new b
      rescue
        raise Error, "bad boundary pattern: #{b}"
      end
    end
  else
    raise Error, "unfamiliar value for :bound option: #{bound.inspect}"
  end
  symbols.keys.each do |k|
    raise Error, "symbols variable #{k.inspect} is neither a string, a symbol, nor a regex" unless k.is_a?(String) || k.is_a?(Symbol) || k.is_a?(Regexp)
  end
  if normalize_whitespace
    @symbols[' '] = { pattern: '\s++' }
  elsif not_extended
    @symbols[' '] = { pattern: ' ' }
  end
  if not_extended
    @symbols['#'] = { pattern: '#' }
  end
  if vet
    Special.new( self, @symbols, [] ).verify
  end
end

Instance Attribute Details

#atomicObject (readonly)

Returns the value of attribute atomic.



5
6
7
# File 'lib/list_matcher.rb', line 5

# Reader for the +atomic+ option exactly as it was given to #initialize
# (default true). #pattern consults it when deciding whether to pass the
# generated expression through #wrap.
def atomic
  @atomic
end

#backtrackingObject (readonly)

Returns the value of attribute backtracking.



5
6
7
# File 'lib/list_matcher.rb', line 5

# Reader for the +backtracking+ option exactly as it was given to
# #initialize (default true). #pfx and #qmark pick their string from it,
# and #pattern uses it to decide whether an already grouped expression can
# be returned as is.
def backtracking
  @backtracking
end

#boundObject (readonly)

Returns the value of attribute bound.



5
6
7
# File 'lib/list_matcher.rb', line 5

# Boolean form of the +bound+ option: #initialize stores +!!bound+ here,
# so the result is true for any truthy specification and false otherwise.
# The specification itself is kept separately in @_bound.
def bound
  @bound
end

#case_insensitiveObject (readonly)

Returns the value of attribute case_insensitive.



5
6
7
# File 'lib/list_matcher.rb', line 5

# Reader for the +case_insensitive+ option exactly as it was given to
# #initialize (default false). When truthy, #modifiers includes 'i'.
def case_insensitive
  @case_insensitive
end

#left_boundObject (readonly)

Returns the value of attribute left_bound.



5
6
7
# File 'lib/list_matcher.rb', line 5

# Left-hand boundary pattern chosen by #initialize from the +bound+ option:
# '\b', '\A' or '^' for the built-in kinds, or the :left entry of a Hash
# specification. It is nil when no bound was requested or when a *_right
# symbol cleared it.
def left_bound
  @left_bound
end

#multilineObject (readonly)

Returns the value of attribute multiline.



5
6
7
# File 'lib/list_matcher.rb', line 5

# Reader for the +multiline+ option exactly as it was given to #initialize
# (default false). When truthy, #modifiers includes 'm'.
def multiline
  @multiline
end

#nameObject (readonly)

Returns the value of attribute name.



5
6
7
# File 'lib/list_matcher.rb', line 5

# Group name accepted by #initialize, or nil when no name was supplied
# (@name is only assigned after the name passed validation). When set,
# #pattern encloses its result in a named group of this name.
def name
  @name
end

#normalize_whitespaceObject (readonly)

Returns the value of attribute normalize_whitespace.



5
6
7
# File 'lib/list_matcher.rb', line 5

# Reader for the +normalize_whitespace+ option exactly as it was given to
# #initialize (default false). When truthy, #pattern replaces each run of
# whitespace in the list items with a single space.
def normalize_whitespace
  @normalize_whitespace
end

#not_extendedObject (readonly)

Returns the value of attribute not_extended.



5
6
7
# File 'lib/list_matcher.rb', line 5

# Reader for the +not_extended+ option exactly as it was given to
# #initialize (default false). When truthy, #modifiers includes '-x'.
def not_extended
  @not_extended
end

#right_boundObject (readonly)

Returns the value of attribute right_bound.



5
6
7
# File 'lib/list_matcher.rb', line 5

# Right-hand boundary pattern chosen by #initialize from the +bound+ option:
# '\b', '\z' or '$' for the built-in kinds, or the :right entry of a Hash
# specification. It is nil when no bound was requested or when a *_left
# symbol cleared it.
def right_bound
  @right_bound
end

#stripObject (readonly)

Returns the value of attribute strip.



5
6
7
# File 'lib/list_matcher.rb', line 5

# Effective strip setting: #initialize stores +strip || normalize_whitespace+,
# so this is truthy whenever either option was. When truthy, #pattern strips
# every list item and drops the ones left empty.
def strip
  @strip
end

#vetObject (readonly)

Returns the value of attribute vet.



5
6
7
# File 'lib/list_matcher.rb', line 5

# Reader for the +vet+ option exactly as it was given to #initialize
# (default false). When truthy, #initialize runs Special#verify over the
# symbols table.
def vet
  @vet
end

#word_testObject (readonly)

Returns the value of attribute word_test.



5
6
7
# File 'lib/list_matcher.rb', line 5

# Regexp chosen by #initialize alongside the boundary patterns: /\w/ for
# word bounds, /./ for string and line bounds, or the :test entry of a Hash
# specification (compiled to a Regexp if it was a String). It is nil when
# no bound was requested.
def word_test
  @word_test
end

Class Method Details

.pattern(list, opts = {}) ⇒ Object

convenience method for one-off regexen where there’s no point in keeping around a pattern generator



13
14
15
# File 'lib/list_matcher.rb', line 13

# One-shot helper: builds a throwaway matcher from +opts+ and asks it for
# the pattern string for +list+.
#
# @param list the items to match, handed to the instance-level #pattern
# @param opts [Hash] keyword options for the constructor
# @return whatever the instance-level #pattern returns for +list+
def self.pattern(list, opts={})
  matcher = new(**opts)
  matcher.pattern(list)
end

.quote(s) ⇒ Object



245
246
247
# File 'lib/list_matcher.rb', line 245

# Escapes +s+ for use in a regex, touching only the characters QRX matches.
#
# @param s [String] text to escape
# @return [String] a copy of +s+ with each QRX match replaced by its
#   Regexp-escaped form
def self.quote(s)
  s.gsub(QRX) { |special| Regexp.escape(special) }
end

.rx(list, opts = {}) ⇒ Object

like self.pattern, but returns a regex rather than a string



18
19
20
# File 'lib/list_matcher.rb', line 18

# One-shot helper: builds a throwaway matcher from +opts+ and asks it for
# the regex for +list+.
#
# @param list the items to match, handed to the instance-level #rx
# @param opts [Hash] keyword options for the constructor
# @return whatever the instance-level #rx returns for +list+
def self.rx(list, opts={})
  matcher = new(**opts)
  matcher.rx(list)
end

Instance Method Details

#bud(opts = {}) ⇒ Object

returns a new pattern matcher differing from the original only in the options specified



121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# File 'lib/list_matcher.rb', line 121

# Creates a sibling matcher: a new instance of the same class built from
# this matcher's stored settings, with the entries of +opts+ taking
# precedence.
#
# The inherited :vet value is +@vet && opts[:symbols]+, so vetting only
# carries over when new symbols are supplied; an explicit :vet in +opts+
# still wins through the merge.
#
# @param opts [Hash] constructor options to override
# @return a new instance of this object's class
def bud(opts={})
  inherited = {
    atomic:               @atomic,
    backtracking:         @backtracking,
    bound:                @_bound,   # the original specification, not the boolean in @bound
    strip:                @strip,
    case_insensitive:     @case_insensitive,
    multiline:            @multiline,
    not_extended:         @not_extended,
    normalize_whitespace: @normalize_whitespace,
    symbols:              @symbols,
    name:                 @name,
    vet:                  @vet && opts[:symbols]
  }
  settings = inherited.merge(opts)
  self.class.new(**settings)
end

#modifiersObject



168
169
170
171
172
173
174
# File 'lib/list_matcher.rb', line 168

# Inline flag string for the generated group: some combination of 'i', 'm'
# and '-x' in that order, or nil when none of case_insensitive, multiline
# and not_extended is set.
#
# The answer is cached inside a one-element array so that a nil result is
# remembered as well.
#
# @return [String, nil]
def modifiers
  @modifiers ||= begin
    switches = { 'i' => case_insensitive, 'm' => multiline, '-x' => not_extended }
    active   = switches.select { |_flag, enabled| enabled }.keys
    [ active.empty? ? nil : active.join ]
  end
  @modifiers[0]
end

#pattern(list, opts = {}) ⇒ Object

converts list into a string representing a regex pattern suitable for inclusion in a larger regex



139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
# File 'lib/list_matcher.rb', line 139

# Turns +list+ into the source string of a regex matching its items.
#
# When +opts+ is non-empty the work is delegated to a matcher made by #bud
# with those options. Otherwise nil entries are dropped, the rest are
# converted with to_s, empty strings are discarded, and the strip and
# normalize_whitespace settings are applied before the tree is built.
#
# @param list [Array] items to match
# @param opts [Hash] options overriding this matcher's settings
# @return [String, nil] the pattern source, or nil if nothing is left to match
def pattern( list, opts={} )
  return bud(opts).pattern(list) unless opts.empty?

  words = list.compact.map(&:to_s).reject(&:empty?)
  words = words.map(&:strip).reject(&:empty?) if strip
  words = words.map { |word| word.gsub(/\s++/, ' ') } if normalize_whitespace
  return nil if words.empty?

  specializer = Special.new self, @symbols, words
  root = tree specializer.normalize, specializer
  root.root = true
  root.flatten
  expression = root.convert

  # Each wrapper below already forms a group around the expression.
  grouped = false
  if (flags = modifiers)
    expression = "(?#{flags}:#{expression})"
    grouped = true
  end
  if name
    expression = "(?<#{name}>#{expression})"
    grouped = true
  end
  return expression if grouped && backtracking

  atomic && !root.atomic? ? wrap(expression) : expression
end

#pfxObject



181
182
183
# File 'lib/list_matcher.rb', line 181

# Opening token used by #wrap: '(?:' when backtracking is allowed, '(?>'
# otherwise. The choice is computed once and cached in @pfx.
#
# @return [String]
def pfx
  @pfx ||= if backtracking
    '(?:'
  else
    '(?>'
  end
end

#qmarkObject



185
186
187
# File 'lib/list_matcher.rb', line 185

# Optional-quantifier token: '?' when backtracking is allowed, '?+'
# otherwise. The choice is computed once and cached in @qmark.
#
# @return [String]
def qmark
  @qmark ||= if backtracking
    '?'
  else
    '?+'
  end
end

#quote(s) ⇒ Object



249
250
251
# File 'lib/list_matcher.rb', line 249

# Instance-level shortcut for the class-level quote method of this
# object's class.
#
# @param s [String] text to escape
# @return the result of the class-level quote for +s+
def quote(s)
  quoter = self.class
  quoter.quote(s)
end

#rx(list, opts = {}) ⇒ Object

like pattern but it returns a regex instead of a string



177
178
179
# File 'lib/list_matcher.rb', line 177

# Same as #pattern, but compiles the resulting source into a Regexp.
#
# @param list [Array] items to match
# @param opts [Hash] options forwarded to #pattern
# @return [Regexp]
def rx(list, opts={})
  source = pattern(list, opts)
  Regexp.new(source)
end

#tree(list, symbols) ⇒ Object



197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
# File 'lib/list_matcher.rb', line 197

# Recursively turns +list+ into a tree of pattern nodes (Leaf, Sequence,
# CharClass, Alternate, or whatever symbols.symbols returns).
#
# list    - array of strings still to be represented
# symbols - object answering symbols(str); #pattern passes its Special
#           instance. NOTE(review): the code relies on symbols.symbols
#           returning a node for a recognised string and a falsy value
#           otherwise -- confirm against Special.
#
# Returns the node standing for +list+. An empty list falls through to the
# single-character branch and yields nil.
def tree(list, symbols)
  if list.size == 1
    # One string: a node per character, chained in a Sequence unless there
    # is only a single character.
    leaves = list[0].chars.map do |c|
      symbols.symbols(c) || Leaf.new( self, c )
    end
    if leaves.length == 1
      leaves.first
    else
      Sequence.new self, *leaves
    end
  elsif list.all?{ |w| w.length == 1 }
    # Only single-character items: those symbols.symbols does not recognise
    # are gathered into one CharClass when there are at least two of them;
    # whatever remains in the list becomes an Alternate.
    chars = list.select{ |w| !symbols.symbols(w) }
    if chars.size > 1
      list -= chars
      c = CharClass.new self, chars
    end
    a = Alternate.new self, symbols, list unless list.empty?
    # the character class goes in front of the alternate's other children
    a.children.unshift c if a && c
    a || c
  elsif c = best_prefix(list)   # found a fixed-width prefix pattern
    # c is used as a pair of lists, c[0] built first and c[1] second.
    # NOTE(review): presumably the shared prefixes and the remainders;
    # best_prefix is not visible here. An empty string in c[1] is removed
    # and instead passed to optionalize on the second subtree.
    if optional = c[1].include?('')
      c[1].reject!{ |w| w == '' }
    end
    c1 = tree c[0], symbols
    c2 = tree c[1], symbols
    c2 = c2.optionalize optional
    Sequence.new self, c1, c2
  elsif c = best_suffix(list)   # found a fixed-width suffix pattern
    # Mirror image of the prefix case: here an empty string in c[0] is
    # removed and passed to optionalize on the first subtree.
    if optional = c[0].include?('')
      c[0].reject!{ |w| w == '' }   # TODO make this faster with index
    end
    c1 = tree c[0], symbols
    c1 = c1.optionalize optional
    c2 = tree c[1], symbols
    Sequence.new self, c1, c2
  else
    # No usable prefix or suffix. Group the items by first character;
    # single-character items that are alone in their group and are not
    # recognised by symbols.symbols are gathered into a CharClass (when
    # there are at least two), and the rest of the list becomes an Alternate.
    grouped = list.group_by{ |w| w[0] }
    chars = grouped.select{ |_, w| w.size == 1 && w[0].size == 1 && !symbols.symbols(w[0]) }.map{ |v, _| v }
    if chars.size > 1
      list -= chars
      c = CharClass.new self, chars
    end
    a = Alternate.new self, symbols, list
    # c is still falsy here unless a CharClass was just built
    a.children.unshift c if c
    a
  end
end

#wrap(s) ⇒ Object



189
190
191
# File 'lib/list_matcher.rb', line 189

# Encloses +s+ in a group: the opening token from #pfx, then +s+, then a
# closing parenthesis.
#
# @param s [String] pattern source to enclose
# @return [String] a new string
def wrap(s)
  opener = pfx
  opener + s + ')'
end

#wrap_sizeObject



193
194
195
# File 'lib/list_matcher.rb', line 193

# Number of characters #wrap adds around its argument: the length of #pfx
# plus one for the closing parenthesis. Computed once and cached in
# @wrap_size.
#
# @return [Integer]
def wrap_size
  @wrap_size ||= pfx.length + ')'.length
end