Class: Classifier::Base

Inherits:
Object
  • Object
show all
Defined in:
lib/classifier/base.rb

Direct Known Subclasses

Bayes, LSI

Instance Method Summary collapse

Constructor Details

#initialize(options = {}) ⇒ Base

Returns a new instance of Base.



9
10
11
12
13
14
# File 'lib/classifier/base.rb', line 9

# Builds the classifier base and stores the effective settings in @options.
#
# options - Hash of settings. Any of the following keys that are missing
#           are filled in with a default:
#             :language => 'en'
#             :encoding => 'UTF_8'
#
# The caller's hash is left untouched (so a frozen hash is accepted): the
# defaults are merged into a fresh Hash rather than into the argument.
def initialize(options = {})
  defaults = { :language => 'en', :encoding => 'UTF_8' }

  # Caller-supplied values take precedence over the defaults.
  @options = defaults.merge(options)
end

Instance Method Details

#clean_word_hash(str) ⇒ Object

Return a word hash without extra punctuation or short symbols, just stemmed words



35
36
37
# File 'lib/classifier/base.rb', line 35

# Strips every character that is neither a word character nor whitespace
# from +str+, splits what is left on whitespace, and hands the resulting
# word list to word_hash_for_words (defined elsewhere), returning its result.
def clean_word_hash(str)
  stripped = str.gsub(/[^\w\s]/, "")
  word_hash_for_words(stripped.split)
end

#prepare_category_name(val) ⇒ Object



16
17
18
# File 'lib/classifier/base.rb', line 16

# Normalises a category name into a Symbol: +val+ is converted to a String,
# underscores become spaces, the result is capitalized (first character
# upper-cased, the rest lower-cased), and it is then interned.
#
#   prepare_category_name(:not_interesting)  # => :"Not interesting"
def prepare_category_name(val)
  readable = val.to_s.tr("_", " ")
  readable.capitalize.to_sym
end

#without_punctuation(str) ⇒ Object

Removes common punctuation symbols, returning a new string. E.g.,

without_punctuation("Hello (greeting's), with {braces} < >...?")
=> "Hello  greetings   with  braces         "


24
25
26
# File 'lib/classifier/base.rb', line 24

# Returns a copy of +str+ with common punctuation neutralised in two passes:
# first each listed punctuation symbol is replaced by a single space, then
# apostrophes and hyphens are removed outright (no space is left behind).
def without_punctuation(str)
  spaced = str.tr(',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " ")
  # Deleting is what tr does when its replacement set is empty.
  spaced.delete("'\-")
end

#word_hash(str) ⇒ Object

Returns a Hash of strings => ints. Each word in the string is stemmed and interned, and maps to its frequency in the document.



30
31
32
# File 'lib/classifier/base.rb', line 30

# Tokenises +str+ into two lists and passes their concatenation to
# word_hash_for_words (defined elsewhere), returning its result:
#   * the whitespace-separated words left after removing every character
#     that is neither a word character nor whitespace, followed by
#   * the whitespace-separated runs of symbols left after blanking out every
#     word character.
def word_hash(str)
  words   = str.gsub(/[^\w\s]/, "").split
  symbols = str.gsub(/[\w]/, " ").split
  word_hash_for_words(words + symbols)
end