Class: String

Inherits:
Object
  • Object
show all
Defined in:
lib/extensions/string.rb

Constant Summary collapse

# Words that #tokenize drops from a normalized string (English and French
# articles, conjunctions and prepositions). Entries are lowercase because
# they are compared against already-normalized tokens.
#
# Frozen so the shared list cannot be mutated in place by a caller.
IgnoredWords = %w[
  a
  an
  and
  but
  de
  des
  for
  from
  le
  les
  of
  on
  or
  the
  to
].freeze

Instance Method Summary collapse

Instance Method Details

#normalize ⇒ Object



21
22
23
24
25
26
27
# File 'lib/extensions/string.rb', line 21

# Reduces the string to a canonical form for comparison: lowercase ASCII
# letters and digits, with single spaces between words.
#
# Steps, in order:
#   1. lowercase the receiver;
#   2. pass it through `unaccent` (defined elsewhere; presumably folds
#      accented letters to their plain forms -- confirm against that helper);
#   3. delete straight and curly quote characters outright, so "don't"
#      becomes "dont" rather than "don t";
#   4. replace every remaining run of characters outside a-z / 0-9 with a space;
#   5. trim the ends and collapse repeated spaces.
#
# @return [String] a new normalized string; the receiver is not modified
def normalize
  folded     = downcase.unaccent
  quote_free = folded.delete(%q{'"‘’“”})
  spaced     = quote_free.gsub(/[^a-z0-9]+/, ' ')
  spaced.strip.squeeze(' ')
end

#tokenize ⇒ Object



29
30
31
32
33
# File 'lib/extensions/string.rb', line 29

# Splits the normalized form of the string into words and drops every word
# listed in IgnoredWords.
#
# If dropping ignored words would leave nothing (e.g. 'The The'), the
# unfiltered word list is returned instead so the result is not emptied
# by the filter alone.
#
# @return [Array<String>] the remaining words, in their original order
def tokenize
  tokens = normalize.split(/\s+/)
  significant = tokens - IgnoredWords
  return tokens if significant.empty?

  significant
end