Class: List::Matcher

Inherits:
Object
  • Object
show all
Defined in:
lib/list_matcher.rb

Defined Under Namespace

Classes: Alternate, CharClass, Leaf, Node, Sequence, Special, SpecialPattern

Constant Summary collapse

QRX =

Used to build a replacement for Regexp.quote that leaves alone characters which only need quoting inside character classes.

# Capturing character class of every character with code 1..255 that Regexp.quote would escape, minus '-'.
Regexp.new "([" + ( (1..255).map(&:chr).select{ |c| Regexp.quote(c) != c } - %w(-) ).map{ |c| Regexp.quote c }.join + "])"

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(atomic: true, backtracking: true, bound: false, strip: false, case_insensitive: false, multiline: false, normalize_whitespace: false, symbols: {}, name: false, vet: false) ⇒ Matcher

Returns a new instance of Matcher.



21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# File 'lib/list_matcher.rb', line 21

# Builds a matcher configured by the given options.
#
# @param atomic [Boolean] stored in @atomic
# @param backtracking [Boolean] stored in @backtracking
# @param bound [false, true, Symbol, Hash] boundary mode: false for none,
#   true or :word for '\b' on both sides, :line for '^' and '$', :string for
#   '\A' and '\z', or a Hash supplying :test, :left and :right
# @param strip [Boolean] @strip is set when this or +normalize_whitespace+ is truthy
# @param case_insensitive [Boolean] stored in @case_insensitive
# @param multiline [Boolean] stored in @multiline
# @param normalize_whitespace [Boolean] when truthy, also registers the
#   symbol ' ' with pattern '\s++'
# @param symbols [Hash] keys must be Strings, Symbols or Regexps; a copy made
#   by deep_dup is kept in @symbols
# @param name [false, String, Symbol] named-capture label; kept in @name once validated
# @param vet [Boolean] when truthy, runs Special#verify over the symbols
# @raise [RuntimeError] if +name+ is neither a String nor a Symbol, if +bound+
#   is not a recognized value, or if a symbols key has an unsupported type
# @raise [RegexpError] if +name+ cannot be used in a named capture group
# @raise [SyntaxError] if a Hash +bound+ lacks :test, :left or :right
def initialize(
      atomic:               true,
      backtracking:         true,
      bound:                false,
      strip:                false,
      case_insensitive:     false,
      multiline:            false,
      normalize_whitespace: false,
      symbols:              {},
      name:                 false,
      vet:                  false
    )
  @atomic               = atomic
  @backtracking         = backtracking
  @strip                = strip || normalize_whitespace
  @case_insensitive     = case_insensitive
  @multiline            = multiline
  @symbols              = deep_dup symbols
  @_bound               = bound   # original option value, kept so bud can pass it on unchanged
  @bound                = !!bound
  @normalize_whitespace = normalize_whitespace
  @vet                  = vet
  if name
    unless name.is_a?(String) || name.is_a?(Symbol)
      raise "name must be a string or a symbol; received #{name.inspect}"
    end
    Regexp.new "(?<#{name}>.*)"   # stir up any errors that might arise from using this name in a named capture
    @name = name
  end
  if bound == :string
    @word_test   = /./
    @left_bound  = '\A'
    @right_bound = '\z'
  elsif bound == :line
    @word_test   = /./
    @left_bound  = '^'
    @right_bound = '$'
  elsif bound.is_a? Hash
    @word_test   = bound[:test]  || raise(SyntaxError.new('no boundary test provided'))
    @left_bound  = bound[:left]  || raise(SyntaxError.new('no left boundary expression provided'))
    @right_bound = bound[:right] || raise(SyntaxError.new('no right boundary expression provided'))
  elsif bound === true || bound == :word
    @word_test   = /\w/
    @left_bound  = '\b'
    @right_bound = '\b'
  elsif !( bound === false )
    raise "unfamiliar value for :bound option: #{bound.inspect}"
  end
  if normalize_whitespace
    @symbols[' '] = { pattern: '\s++' }
  end
  symbols.each_key do |k|
    raise "symbols variable #{k} is neither a string, a symbol, nor a regex" unless k.is_a?(String) || k.is_a?(Symbol) || k.is_a?(Regexp)
  end
  if vet
    Special.new( self, @symbols, [] ).verify
  end
end

Instance Attribute Details

#atomic ⇒ Object (readonly)

Returns the value of attribute atomic.



5
6
7
# File 'lib/list_matcher.rb', line 5

# Reader for @atomic, which the constructor fills from the +atomic:+ option.
def atomic; @atomic; end

#backtracking ⇒ Object (readonly)

Returns the value of attribute backtracking.



5
6
7
# File 'lib/list_matcher.rb', line 5

# Reader for @backtracking, which the constructor fills from the +backtracking:+ option.
def backtracking; @backtracking; end

#bound ⇒ Object (readonly)

Returns the value of attribute bound.



5
6
7
# File 'lib/list_matcher.rb', line 5

# Reader for @bound, which the constructor sets to the +bound:+ option coerced to true/false.
def bound; @bound; end

#case_insensitive ⇒ Object (readonly)

Returns the value of attribute case_insensitive.



5
6
7
# File 'lib/list_matcher.rb', line 5

# Reader for @case_insensitive, which the constructor fills from the +case_insensitive:+ option.
def case_insensitive; @case_insensitive; end

#left_bound ⇒ Object (readonly)

Returns the value of attribute left_bound.



5
6
7
# File 'lib/list_matcher.rb', line 5

# Reader for @left_bound, the left boundary expression the constructor derives
# from the +bound:+ option; it is left unset when no boundary was requested.
def left_bound; @left_bound; end

#multiline ⇒ Object (readonly)

Returns the value of attribute multiline.



5
6
7
# File 'lib/list_matcher.rb', line 5

# Reader for @multiline, which the constructor fills from the +multiline:+ option.
def multiline; @multiline; end

#name ⇒ Object (readonly)

Returns the value of attribute name.



5
6
7
# File 'lib/list_matcher.rb', line 5

# Reader for @name, which the constructor sets only when a +name:+ option was given and accepted.
def name; @name; end

#normalize_whitespace ⇒ Object (readonly)

Returns the value of attribute normalize_whitespace.



5
6
7
# File 'lib/list_matcher.rb', line 5

# Reader for @normalize_whitespace, which the constructor fills from the +normalize_whitespace:+ option.
def normalize_whitespace; @normalize_whitespace; end

#right_bound ⇒ Object (readonly)

Returns the value of attribute right_bound.



5
6
7
# File 'lib/list_matcher.rb', line 5

# Reader for @right_bound, the right boundary expression the constructor derives
# from the +bound:+ option; it is left unset when no boundary was requested.
def right_bound; @right_bound; end

#strip ⇒ Object (readonly)

Returns the value of attribute strip.



5
6
7
# File 'lib/list_matcher.rb', line 5

# Reader for @strip, which the constructor sets from +strip:+ or, failing that, +normalize_whitespace:+.
def strip; @strip; end

#vet ⇒ Object (readonly)

Returns the value of attribute vet.



5
6
7
# File 'lib/list_matcher.rb', line 5

# Reader for @vet, which the constructor fills from the +vet:+ option.
def vet; @vet; end

#word_test ⇒ Object (readonly)

Returns the value of attribute word_test.



5
6
7
# File 'lib/list_matcher.rb', line 5

# Reader for @word_test, the boundary test the constructor derives from the
# +bound:+ option; it is left unset when no boundary was requested.
def word_test; @word_test; end

Class Method Details

.pattern(list, opts = {}) ⇒ Object

convenience method for one-off regexen where there’s no point in keeping around a pattern generator



9
10
11
# File 'lib/list_matcher.rb', line 9

# One-shot helper: builds a throwaway matcher from +opts+ and returns the
# pattern it generates for +list+.
def self.pattern(list, opts={})
  matcher = new(**opts)
  matcher.pattern(list)
end

.quote(s) ⇒ Object



203
204
205
# File 'lib/list_matcher.rb', line 203

# Escapes every substring of +s+ matched by QRX with Regexp.escape and
# returns the resulting copy.
def self.quote(s)
  s.gsub(QRX) { |hit| Regexp.escape(hit) }
end

.rx(list, opts = {}) ⇒ Object

like self.pattern, but returns a regex rather than a string



14
15
16
# File 'lib/list_matcher.rb', line 14

# One-shot helper like self.pattern, except that it returns whatever the
# throwaway matcher's #rx produces for +list+.
def self.rx(list, opts={})
  matcher = new(**opts)
  matcher.rx(list)
end

Instance Method Details

#bud(opts = {}) ⇒ Object

returns a new pattern matcher differing from the original only in the options specified



80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# File 'lib/list_matcher.rb', line 80

# Creates another matcher of the same class whose options are this matcher's
# current settings overridden by whatever +opts+ supplies.
def bud(opts={})
  inherited = {
    atomic:               @atomic,
    backtracking:         @backtracking,
    bound:                @_bound,   # the bound option as originally given, not the boolean in @bound
    strip:                @strip,
    case_insensitive:     @case_insensitive,
    multiline:            @multiline,
    normalize_whitespace: @normalize_whitespace,
    symbols:              @symbols,
    name:                 @name,
    # vet is carried over only when this matcher vets and new symbols are supplied
    vet:                  @vet && opts[:symbols]
  }
  self.class.new(**inherited.merge(opts))
end

#modifiers ⇒ Object



126
127
128
129
130
131
132
# File 'lib/list_matcher.rb', line 126

# Returns the inline regex option letters implied by this matcher's settings:
# 'i' for case_insensitive, 'm' for multiline, 'im' for both, or nil when
# neither is set.
#
# The result is memoized inside a one-element array so that a nil answer is
# cached too. Indexing the bare flag string instead would truncate 'im' to 'i'.
def modifiers
  @modifiers ||= begin
    flags = [ ( 'i' if case_insensitive ), ( 'm' if multiline ) ].compact.join
    [ flags.empty? ? nil : flags ]
  end
  @modifiers[0]
end

#pattern(list, opts = {}) ⇒ Object

converts list into a string representing a regex pattern suitable for inclusion in a larger regex



97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# File 'lib/list_matcher.rb', line 97

# Turns +list+ into a regex source string.
#
# When +opts+ is non-empty the work is delegated to a matcher budded with
# those options. Otherwise nil entries and empty strings are dropped (after
# optional stripping and whitespace normalization), and nil is returned when
# nothing is left. The surviving words are converted through Special and tree,
# then wrapped in modifier, named-capture and grouping syntax as configured.
def pattern( list, opts={} )
  return bud(opts).pattern(list) unless opts.empty?

  words = list.compact.map(&:to_s).reject { |w| w.length.zero? }
  if strip
    words.map!(&:strip)
    words.reject! { |w| w.length.zero? }
  end
  words.map! { |w| w.gsub(/\s++/, ' ') } if normalize_whitespace
  return nil if words.empty?

  specializer = Special.new(self, @symbols, words)
  root = tree(specializer.normalize, specializer)
  root.root = true
  root.flatten
  source = root.convert

  flags = modifiers
  source = "(?#{flags}:#{source})" if flags
  label = name
  source = "(?<#{label}>#{source})" if label

  # a modifier or capture group already encloses the expression
  if (flags || label) && backtracking
    source
  elsif atomic && !root.atomic?
    wrap source
  else
    source
  end
end

#pfx ⇒ Object



139
140
141
# File 'lib/list_matcher.rb', line 139

# Opening token used by wrap: '(?:' when backtracking is enabled, '(?>'
# otherwise. Computed on first use and cached in @pfx.
def pfx
  return @pfx if @pfx

  @pfx = if backtracking
    '(?:'
  else
    '(?>'
  end
end

#qmark ⇒ Object



143
144
145
# File 'lib/list_matcher.rb', line 143

# Optional-quantifier token: '?' when backtracking is enabled, '?+'
# otherwise. Computed on first use and cached in @qmark.
def qmark
  return @qmark if @qmark

  @qmark = if backtracking
    '?'
  else
    '?+'
  end
end

#quote(s) ⇒ Object



207
208
209
# File 'lib/list_matcher.rb', line 207

# Instance-level shortcut that hands +s+ to the class-level quote method.
def quote(s)
  owner = self.class
  owner.quote(s)
end

#rx(list, opts = {}) ⇒ Object

like pattern but it returns a regex instead of a string



135
136
137
# File 'lib/list_matcher.rb', line 135

# Same as #pattern, but compiles the resulting source string into a Regexp.
def rx(list, opts={})
  source = pattern(list, opts)
  Regexp.new(source)
end

#tree(list, symbols) ⇒ Object



155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
# File 'lib/list_matcher.rb', line 155

# Recursively converts +list+ into a tree of nodes and returns its root.
#
# @param list [Array<String>] the words to match
# @param symbols [#symbols] object whose #symbols(str) returns a node for
#   special strings and a falsy value otherwise; #pattern passes a Special here
# @return a Leaf, Sequence, CharClass, Alternate, or a node returned by
#   symbols.symbols
def tree(list, symbols)
  if list.size == 1
    # a single word: one node per character, symbol nodes taking precedence over plain leaves
    leaves = list[0].chars.map do |c|
      symbols.symbols(c) || Leaf.new( self, c )
    end
    if leaves.length == 1
      leaves.first
    else
      Sequence.new self, *leaves
    end
  elsif list.all?{ |w| w.length == 1 }
    # only one-character words: the non-symbol ones form a character class when there are at least two
    chars = list.select{ |w| !symbols.symbols(w) }
    if chars.size > 1
      list -= chars
      c = CharClass.new self, chars
    end
    # whatever is left becomes an alternation, with the character class placed first
    a = Alternate.new self, symbols, list unless list.empty?
    a.children.unshift c if a && c
    a || c
  elsif c = best_prefix(list)   # found a fixed-width prefix pattern
    # NOTE(review): best_prefix is not visible here; c appears to be a pair of word lists (prefixes, remainders) -- confirm
    # an empty remainder marks the second part as optional
    if optional = c[1].include?('')
      c[1].reject!{ |w| w == '' }
    end
    c1 = tree c[0], symbols
    c2 = tree c[1], symbols
    c2.optional = optional
    Sequence.new self, c1, c2
  elsif c = best_suffix(list)   # found a fixed-width suffix pattern
    # NOTE(review): best_suffix is not visible here; c appears to be a pair of word lists (remainders, suffixes) -- confirm
    # an empty remainder marks the first part as optional
    if optional = c[0].include?('')
      c[0].reject!{ |w| w == '' }
    end
    c1 = tree c[0], symbols
    c1.optional = optional
    c2 = tree c[1], symbols
    Sequence.new self, c1, c2
  else
    # no shared prefix or suffix: group words by first character
    grouped = list.group_by{ |w| w[0] }
    # one-character, non-symbol words that are alone in their group can share a character class
    chars = grouped.select{ |_, w| w.size == 1 && w[0].size == 1 && !symbols.symbols(w[0]) }.map{ |v, _| v }
    if chars.size > 1
      list -= chars
      c = CharClass.new self, chars
    end
    a = Alternate.new self, symbols, list
    a.children.unshift c if c
    a
  end
end

#wrap(s) ⇒ Object



147
148
149
# File 'lib/list_matcher.rb', line 147

# Encloses +s+ in a group: the opening token comes from pfx and a closing
# parenthesis is appended.
def wrap(s)
  opened = pfx + s
  opened << ')'
end

#wrap_size ⇒ Object



151
152
153
# File 'lib/list_matcher.rb', line 151

# Number of characters wrap adds around an expression: the length of pfx
# plus one for the closing parenthesis. Cached in @wrap_size.
def wrap_size
  return @wrap_size if @wrap_size

  @wrap_size = pfx.length + 1
end