Module: ClassifierReborn::Hasher

Extended by:
Hasher
Included in:
Hasher
Defined in:
lib/classifier-reborn/extensions/hasher.rb

Constant Summary collapse

# Directories that are searched, in order, for per-language stopword files.
# Deliberately left unfrozen (NOTE(review): callers may append their own
# directories — confirm before freezing).
STOPWORDS_PATH = [
  File.expand_path('../../../data/stopwords', File.dirname(__FILE__))
]
STOPWORDS =

Create a lazily-loaded hash of stopword data

Hash.new do |hash, language|
  # Lazily resolves and caches the stopword list for +language+ the first time
  # that key is looked up.
  #
  # The file name is normalized once so that String and Symbol keys behave the
  # same: File.join raises TypeError when handed a Symbol, which previously made
  # the existence check fail before the (already string-converted) read was
  # ever reached.
  filename = language.to_s

  # The first directory in STOPWORDS_PATH that contains the file wins.
  directory = STOPWORDS_PATH.find { |path| File.exist?(File.join(path, filename)) }

  # Unknown languages are cached as an empty list so the lookup is not repeated.
  # The assignment's value is also the value returned to the caller.
  hash[language] =
    if directory
      Set.new(File.read(File.join(directory, filename)).split)
    else
      []
    end
end

Instance Method Summary collapse

Instance Method Details

#clean_word_hash(str, language = 'en') ⇒ Object

Return a word hash without extra punctuation or short symbols, just stemmed words



23
24
25
# File 'lib/classifier-reborn/extensions/hasher.rb', line 23

# Builds a word-frequency hash from +str+ after discarding punctuation.
#
# Every character that is neither a word character nor whitespace is removed,
# the remainder is lowercased and split on whitespace, and the resulting
# tokens are passed to word_hash_for_words together with +language+.
#
# @param str [String] the text to tokenize
# @param language [String] forwarded to word_hash_for_words
# @return [Object] whatever word_hash_for_words returns for the tokens
def clean_word_hash(str, language = 'en')
  without_punctuation = str.gsub(/[^\p{WORD}\s]/, '')
  tokens = without_punctuation.downcase.split
  word_hash_for_words(tokens, language)
end

#word_hash(str, language = 'en') ⇒ Object

Return a Hash of symbols => ints. Each word in the string is stemmed, interned, and mapped to its frequency in the document.



16
17
18
19
20
# File 'lib/classifier-reborn/extensions/hasher.rb', line 16

# Combines the counts from clean_word_hash with counts of the individual
# non-word, non-whitespace characters found in +str+.
#
# The result is the clean_word_hash result merged with the
# word_hash_for_symbols result; on a key collision the latter's value wins.
#
# @param str [String] the text to analyse
# @param language [String] forwarded to clean_word_hash
# @return [Hash] the merged counts
def word_hash(str, language = 'en')
  clean_word_hash(str, language).merge(
    word_hash_for_symbols(str.scan(/[^\s\p{WORD}]/))
  )
end

#word_hash_for_symbols(words) ⇒ Object



37
38
39
40
41
42
43
# File 'lib/classifier-reborn/extensions/hasher.rb', line 37

# Counts how many times each element of +words+ occurs, keyed by the
# element's interned (Symbol) form.
#
# @param words [Array<String>] the tokens to count
# @return [Hash{Symbol => Integer}] occurrence counts; missing keys read as 0
def word_hash_for_symbols(words)
  words.each_with_object(Hash.new(0)) do |token, counts|
    counts[token.intern] += 1
  end
end

#word_hash_for_words(words, language = 'en') ⇒ Object



27
28
29
30
31
32
33
34
35
# File 'lib/classifier-reborn/extensions/hasher.rb', line 27

# Counts the stems of the significant words in +words+.
#
# A word is skipped when it is two characters or shorter, or when it appears
# in STOPWORDS[language]. Every remaining word is stemmed, interned, and
# tallied. String#stem is not defined in the visible part of this file and is
# expected to be provided elsewhere.
#
# @param words [Array<String>] the tokens to count
# @param language [String] key used to look up the stopword list
# @return [Hash{Symbol => Integer}] stem counts; missing keys read as 0
def word_hash_for_words(words, language = 'en')
  words.each_with_object(Hash.new(0)) do |token, counts|
    # Length is checked first so the stopword list is not consulted for short words.
    next if token.length <= 2
    next if STOPWORDS[language].include?(token)

    counts[token.stem.intern] += 1
  end
end