Class: Splam::Ngram
- Inherits:
- Object
- Inheritance chain: Object → Splam::Ngram
- Defined in:
- lib/splam/ngram.rb
Class Method Summary collapse
- .trigram(text) ⇒ Object
Instance Method Summary collapse
- #compare(text) ⇒ Object
-
#initialize(site_id = nil) ⇒ Ngram
constructor
A new instance of Ngram.
-
#train(words, spam = false, retrain = false) ⇒ Object
Train the temporary corpus with your data.
Constructor Details
#initialize(site_id = nil) ⇒ Ngram
Returns a new instance of Ngram.
26 27 28 |
# File 'lib/splam/ngram.rb', line 26
#
# Creates a new Ngram and stores the optional site identifier on the
# instance as @site_id.
#
# @param site_id [Object, nil] identifier kept in @site_id; defaults to nil.
def initialize(site_id = nil)
  @site_id = site_id
end
Class Method Details
.trigram(text) ⇒ Object
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 |
# File 'lib/splam/ngram.rb', line 3
#
# Counts the word trigrams that occur in +text+.
#
# Apostrophes are removed first (so "don't" becomes "dont"); the text is then
# split on runs of non-word characters. Every run of three consecutive words
# is joined with single spaces and tallied.
#
# Words are delimited with the Unicode-aware [[:word:]] class rather than the
# ASCII-only \W, so accented and non-Latin letters stay inside their word.
# ASCII-only input yields the same words as before. Note that trigram keys for
# non-ASCII text differ from those produced by an ASCII-only split.
#
# @param text [String] the text to tokenize.
# @return [Hash{String => Integer}] trigram => number of occurrences; the
#   hash has a default value of 0 for unknown trigrams.
def self.trigram(text)
  # Empty tokens are discarded up front. Letting a window start on an empty
  # token would re-emit the trigram that begins at the next real word and
  # count it more than once.
  words = text.gsub("'", "").split(/[^[:word:]]+/).reject(&:empty?)

  counts = Hash.new(0)
  words.each_cons(3) { |triple| counts[triple.join(' ')] += 1 }
  counts
end
Instance Method Details
#compare(text) ⇒ Object
48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
# File 'lib/splam/ngram.rb', line 48
#
# Scores +text+ against the ham and spam trigram counts stored in REDIS.
#
# The text is broken into trigrams with the class-level trigram method. For
# each trigram the stored ham and spam counts are read from the "ham"/"spam"
# hashes, or from "ham-<site_id>"/"spam-<site_id>" when @site_id is set.
# A trigram present in both corpora is ignored; otherwise the stored count
# plus the trigram's count in +text+ is added to the matching total.
#
# @param text [String] the text to score.
# @return [Array(Integer, Integer)] [ham total, spam total].
def compare(text)
  trigrams = self.class.trigram(text)

  if @site_id
    ham_key = "ham-#{@site_id}"
    spam_key = "spam-#{@site_id}"
  else
    ham_key = "ham"
    spam_key = "spam"
  end

  # Reset on every call; nothing in this method reads them afterwards.
  @ham_tri = Hash.new(0)
  @spam_tri = Hash.new(0)

  ham_total = 0
  spam_total = 0

  trigrams.each do |phrase, occurrences|
    next if phrase.nil? || phrase.strip.empty?

    ham_hits = REDIS.hget(ham_key, phrase).to_i
    spam_hits = REDIS.hget(spam_key, phrase).to_i

    # Only a trigram seen in exactly one corpus contributes to a total.
    if ham_hits > 0 && spam_hits <= 0
      ham_total += ham_hits + occurrences
    elsif spam_hits > 0 && ham_hits <= 0
      spam_total += spam_hits + occurrences
    end
  end

  [ham_total, spam_total]
end
#train(words, spam = false, retrain = false) ⇒ Object
Train the temporary corpus with your data.
31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 |
# File 'lib/splam/ngram.rb', line 31
#
# Adds trigram counts to the ham or spam corpus held in REDIS.
#
# Each count is added to the global hash ("spam" or "ham") and, when @site_id
# is set, to the per-site hash ("spam-<site_id>" or "ham-<site_id>") as well.
# With +retrain+ the same count is also subtracted from the opposite corpus.
#
# @param words [String, #each] raw text (converted with the class-level
#   trigram method) or a collection yielding trigram/count pairs.
# @param spam [Boolean] true to train the spam corpus, false for ham.
# @param retrain [Boolean] true to also remove the counts from the opposite
#   corpus.
# @return [Object] whatever +words.each+ returns.
def train(words, spam = false, retrain = false)
  words = self.class.trigram(words) if words.is_a?(String)

  if spam
    target = "spam"
    opposite = "ham"
  else
    target = "ham"
    opposite = "spam"
  end

  words.each do |phrase, count|
    REDIS.hincrby target, phrase, count
    REDIS.hincrby "#{target}-#{@site_id}", phrase, count if @site_id

    next unless retrain

    # Remove phrases from the existing (opposite) corpus.
    REDIS.hincrby opposite, phrase, -count
    REDIS.hincrby "#{opposite}-#{@site_id}", phrase, -count if @site_id
  end
end