Module: ClassifierReborn::Hasher
Constant Summary collapse
- STOPWORDS_PATH =
[File.expand_path(File.dirname(__FILE__) + '/../../../data/stopwords')]
- STOPWORDS =
Create a lazily-loaded hash of stopword data
Hash.new do |hash, language|
  # Default to an empty list so lookups for languages without a stopword
  # file still answer include? and are cached after the first miss.
  hash[language] = []

  # Convert once and use the same name for both the check and the read:
  # File.join raises TypeError when handed a Symbol such as :en.
  file_name = language.to_s

  STOPWORDS_PATH.each do |path|
    candidate = File.join(path, file_name)
    # File.file? (rather than File.exist?) skips directories, which
    # File.read cannot open.
    next unless File.file?(candidate)

    # The first directory that provides the file wins.
    hash[language] = Set.new(File.read(candidate).split)
    break
  end

  hash[language]
end
Instance Method Summary collapse
-
#clean_word_hash(str, language = 'en') ⇒ Object
Return a word hash without extra punctuation or short symbols, just stemmed words.
-
#word_hash(str, language = 'en') ⇒ Object
Return a Hash of symbols => ints.
- #word_hash_for_symbols(words) ⇒ Object
- #word_hash_for_words(words, language = 'en') ⇒ Object
Instance Method Details
#clean_word_hash(str, language = 'en') ⇒ Object
Return a word hash without extra punctuation or short symbols, just stemmed words
23 24 25 |
# File 'lib/classifier-reborn/extensions/hasher.rb', line 23
# Strips every character of +str+ that is neither a word character nor
# whitespace, downcases the remainder, splits it on whitespace, and hands
# the resulting tokens together with +language+ to #word_hash_for_words.
def clean_word_hash(str, language = 'en')
  stripped = str.gsub(/[^\p{WORD}\s]/, '')
  tokens = stripped.downcase.split
  word_hash_for_words(tokens, language)
end
#word_hash(str, language = 'en') ⇒ Object
Return a Hash of symbols => ints. Each word in the string is stemmed, interned, and mapped to its frequency in the document.
16 17 18 19 20 |
# File 'lib/classifier-reborn/extensions/hasher.rb', line 16
# Merges two hashes built from +str+: the result of #clean_word_hash, and
# the result of #word_hash_for_symbols applied to every single character of
# +str+ that is neither whitespace nor a word character.
def word_hash(str, language = 'en')
  # The receiver is evaluated before the argument, so the word hash is
  # still built before the symbol hash.
  clean_word_hash(str, language).merge(
    word_hash_for_symbols(str.scan(/[^\s\p{WORD}]/))
  )
end
#word_hash_for_symbols(words) ⇒ Object
37 38 39 40 41 42 43 |
# File 'lib/classifier-reborn/extensions/hasher.rb', line 37
# Counts how many times each entry of +words+ occurs.
#
# @param words [Enumerable<#intern>] tokens to count; each one is interned
#   before being used as a key
# @return [Hash{Symbol => Integer}] occurrence counts keyed by interned
#   token; the hash is built with a default of 0, so reading an absent key
#   yields 0 rather than nil
def word_hash_for_symbols(words)
  words.each_with_object(Hash.new(0)) do |word, counts|
    counts[word.intern] += 1
  end
end
#word_hash_for_words(words, language = 'en') ⇒ Object
27 28 29 30 31 32 33 34 35 |
# File 'lib/classifier-reborn/extensions/hasher.rb', line 27
# Counts the entries of +words+ by the interned result of calling +stem+ on
# each one. Words of two characters or fewer are skipped, as are words
# contained in STOPWORDS[language]. The returned hash is built with a
# default of 0 for absent keys.
def word_hash_for_words(words, language = 'en')
  words.each_with_object(Hash.new(0)) do |word, counts|
    # The length test comes first so STOPWORDS is only consulted for words
    # long enough to be counted.
    next if word.length <= 2 || STOPWORDS[language].include?(word)

    counts[word.stem.intern] += 1
  end
end