Class: Splam::Ngram

Inherits:
Object
  • Object
show all
Defined in:
lib/splam/ngram.rb

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(site_id = nil) ⇒ Ngram

Returns a new instance of Ngram.



26
27
28
# File 'lib/splam/ngram.rb', line 26

# Builds an Ngram instance.
#
# site_id - optional identifier stored in @site_id; the other instance
#           methods use it to build per-site corpus keys. Defaults to nil.
def initialize(site_id = nil)
  @site_id = site_id
end

Class Method Details

.trigram(text) ⇒ Object



3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
# File 'lib/splam/ngram.rb', line 3

# Counts the word trigrams (runs of three consecutive words) in +text+.
#
# Apostrophes are removed before tokenising, so "don't" becomes "dont".
# Words are then split on runs of non-word characters.
#
# text - the String to analyse; any other object is converted with #to_s,
#        so nil produces an empty result.
#
# Returns a Hash (default value 0) mapping each trigram, joined with single
# spaces, to the number of times it occurs in +text+.
def self.trigram(text)
  # this won't be utf-8 happy: \W is ASCII-based in Ruby's regexp engine, so
  # accented or non-Latin letters behave as separators. Oh well!
  #
  # Empty tokens (from a leading separator) are dropped before windowing.
  # Leaving them in makes the trigram that follows a separator run get
  # counted once per empty token in addition to its real occurrence.
  words = text.to_s.delete("'").split(/\W+/).reject(&:empty?)

  words.each_cons(3).each_with_object(Hash.new(0)) do |tri, counts|
    counts[tri.join(' ')] += 1
  end
end

Instance Method Details

#compare(text) ⇒ Object



48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# File 'lib/splam/ngram.rb', line 48

# Scores +text+ against the stored ham and spam trigram counts.
#
# Each trigram of +text+ is looked up in the ham and spam hashes through
# REDIS.hget (presumably a Redis client defined elsewhere in the project).
# When @site_id is set, the "ham-<site_id>" / "spam-<site_id>" hashes are
# consulted instead of the plain "ham" / "spam" ones.
#
# A trigram with a positive count in both hashes is ignored. A trigram with
# a positive count in only one hash adds (stored count + occurrences in
# +text+) to that side's total.
#
# Returns a two-element Array: [ham total, spam total].
def compare(text)
  counts = self.class.trigram(text)

  ham_key, spam_key =
    if @site_id
      ["ham-#{@site_id}", "spam-#{@site_id}"]
    else
      ["ham", "spam"]
    end

  # Reset on every call; nothing in this method reads them afterwards.
  @ham_tri = Hash.new(0)
  @spam_tri = Hash.new(0)

  ham_total = 0
  spam_total = 0

  counts.each do |phrase, occurrences|
    next if phrase.nil? || phrase.strip == ""

    ham_hits = REDIS.hget(ham_key, phrase).to_i
    spam_hits = REDIS.hget(spam_key, phrase).to_i

    if ham_hits > 0
      # Seen in both corpora => carries no signal, so it is skipped.
      ham_total += ham_hits + occurrences unless spam_hits > 0
    elsif spam_hits > 0
      spam_total += spam_hits + occurrences
    end
  end

  [ham_total, spam_total]
end

#train(words, spam = false, retrain = false) ⇒ Object

Train the temporary corpus with your data



31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# File 'lib/splam/ngram.rb', line 31

# Adds trigram counts to the ham or spam corpus through REDIS.hincrby.
#
# words   - either a String, which is first converted with
#           self.class.trigram, or an already-built collection yielding
#           (phrase, count) pairs.
# spam    - when true the counts go to "spam", otherwise to "ham".
# retrain - when true the same counts are also subtracted from the opposite
#           corpus, to undo an earlier classification.
#
# Whenever @site_id is set, every update is mirrored into the matching
# "<corpus>-<site_id>" hash as well.
#
# Returns the collection that was iterated.
def train(words, spam = false, retrain = false)
  words = self.class.trigram(words) if words.is_a?(String)

  target   = spam ? "spam" : "ham"
  opposite = spam ? "ham" : "spam"

  words.each do |phrase, count|
    REDIS.hincrby target, phrase, count
    REDIS.hincrby "#{target}-#{@site_id}", phrase, count if @site_id

    next unless retrain

    # Remove phrases from the corpus they were previously filed under
    REDIS.hincrby opposite, phrase, -count
    REDIS.hincrby "#{opposite}-#{@site_id}", phrase, -count if @site_id
  end
end