Class: RBayes

Inherits:
Object
  • Object
show all
Defined in:
lib/rbayes.rb

Overview

Dan Peterson <[email protected]> you can do whatever you want with this file but i appreciate credit

Refactored by Eric Hodel <[email protected]>

Constant Summary collapse

VERSION =

The version of RBayes you are using.

'1.0.0'
COUNT_BLAND =

:stopdoc:

" count_bland "
COUNT_TASTY =
" count_tasty "

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(token_file, case_sensitive = false, test = false, debug = false) ⇒ RBayes

Creates a new RBayes object using the database token_file. If test is true, no writes to the database are performed. If debug is true, log messages are written to $stderr. If case_sensitive is false (the default), message lines are downcased before they are tokenized.



42
43
44
45
46
47
48
49
50
51
52
53
54
# File 'lib/rbayes.rb', line 42

# Opens the token database and loads the two master counters from it.
#
# token_file::     handed to BDB1::Hash.open together with mode 'a+'
# case_sensitive:: when false, read_tokens_in downcases each line
# test::           when true, update_db_with leaves the database untouched
# debug::          when true, log prints its argument to $stderr
def initialize(token_file, case_sensitive = false, test = false,
               debug = false)
  @case_sensitive = case_sensitive
  @test = test
  @debug = debug

  @database = BDB1::Hash.open token_file, 'a+'

  # to_i always yields an Integer (nil.to_i is 0), so a trailing `|| 0`
  # fallback could never fire and has been dropped.
  @count_tasty = @database[COUNT_TASTY].to_i
  @count_bland = @database[COUNT_BLAND].to_i

  log "ham tokens: #{@count_tasty} bland tokens: #{@count_bland}"
end

Instance Attribute Details

#count_blandObject (readonly)

Bland tokens



25
26
27
# File 'lib/rbayes.rb', line 25

# Reader for the value currently held in @count_bland.
def count_bland; @count_bland; end

#count_tastyObject (readonly)

Tasty tokens



30
31
32
# File 'lib/rbayes.rb', line 30

# Reader for the value currently held in @count_tasty.
def count_tasty; @count_tasty; end

#databaseObject (readonly)

The BDB1 DB holding the token information.



35
36
37
# File 'lib/rbayes.rb', line 35

# Reader for the object currently held in @database.
def database; @database; end

Instance Method Details

#count_tokens_in(message) ⇒ Object

Returns a Hash mapping tokens to the number of occurrences in message.



96
97
98
99
100
101
102
103
104
# File 'lib/rbayes.rb', line 96

# Tallies the tokens that read_tokens_in yields for +message+.
#
# Returns a Hash of token => number of times it was yielded. The Hash has a
# default of 0, so looking up a token that never appeared gives 0.
def count_tokens_in(message)
  tally = Hash.new(0)
  read_tokens_in(message) { |token| tally[token] += 1 }
  tally
end

#log(s) ⇒ Object

Logs s to $stderr if debugging is on.



59
60
61
# File 'lib/rbayes.rb', line 59

# Prints +s+ to $stderr, but only while @debug is set; otherwise does nothing.
def log(s)
  return unless @debug

  $stderr.puts s
end

#rate(message) ⇒ Object

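Rates message as tasty or bland. Returns a probability between 0 and 1: values near 1 mean the message looks bland, values near 0 mean it looks tasty.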



109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# File 'lib/rbayes.rb', line 109

# Rates +message+ as tasty or bland.
#
# Each distinct token is scored once with rate_token. The (at most) 15 scores
# lying furthest from the neutral value 0.5 are then combined as
# prod / (prod + comp), where prod is the product of the scores and comp the
# product of their complements. A message that yields no tokens rates 0.5.
def rate(message)
  scores = {}

  # Keep the score itself (not just its distance from 0.5) so that rate_token
  # is consulted only once per distinct token.
  read_tokens_in message do |tok|
    scores[tok] = rate_token(tok) unless scores.has_key? tok
  end

  # Most decisive tokens first: largest distance from 0.5.
  inttok = scores.sort_by { |_tok, y| -(0.5 - y).abs }[0..14]

  prod = 1.0
  comp = 1.0

  inttok.each do |tok, y|
    log format("token %s is %0.2f bland", tok, y)
    prod *= y
    comp *= 1.0 - y
  end

  return prod / (prod + comp)
end

#rate_token(tok) ⇒ Object

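Rates token tok for blandness. Returns a probability between 0.01 and 0.99; higher values mean the token is more likely to be bland. A token with no recorded counts rates 0.4.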



200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
# File 'lib/rbayes.rb', line 200

# Scores +tok+ from the "tasty bland" count pair stored for it in @database.
#
# Returns 0.4 when both counts are zero (or nothing is stored), 0.99 when
# only the bland count is positive and 0.01 when only the tasty count is
# positive. Otherwise returns bland / (tasty + bland), where tasty is twice
# the tasty count divided by @count_tasty, bland is the bland count divided
# by @count_bland, each capped at 1.0, and the result is kept within
# 0.01..0.99.
def rate_token(tok)
  fields = (@database[tok] || "0 0").split(/\s+/)
  tasty_hits = fields[0].to_i
  bland_hits = fields[1].to_i

  # Tokens seen on one side only (or not at all) get fixed scores.
  return 0.99 if tasty_hits == 0 && bland_hits > 0
  return 0.01 if bland_hits == 0 && tasty_hits > 0
  return 0.4 if tasty_hits == 0 && bland_hits == 0

  # Tasty occurrences are weighted double.
  tasty_share = (2.0 * tasty_hits) / @count_tasty.to_f
  tasty_share = tasty_share > 1.0 ? 1.0 : tasty_share

  bland_share = bland_hits.to_f / @count_bland.to_f
  bland_share = bland_share > 1.0 ? 1.0 : bland_share

  score = bland_share / (tasty_share + bland_share)

  if score > 0.99
    0.99
  elsif score < 0.01
    0.01
  else
    score
  end
end

#read_tokens_in(message) ⇒ Object

Yields tokens in message ignoring the boring headers and such.



66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# File 'lib/rbayes.rb', line 66

# Yields each token found in +message+, one at a time.
#
# The message is split into lines on $/. Lines matching any of the skip
# patterns below (certain headers, lines made up solely of letters, digits,
# "/" and "+", SMTP id lines, MIME boundary/name/filename parameters and
# "--" delimiter lines) are ignored. Remaining lines are downcased unless
# @case_sensitive is set, then split into tokens. Tokens shorter than three
# characters and tokens consisting only of digits are dropped.
def read_tokens_in(message)
  skip_patterns = [
    /^\.?Date:/i,
    /^\.?Message-ID:/i,
    /^\.?In-Reply-To:/i,
    /^\.?References:/i,
    /^\.?[A-Za-z0-9\/\+]+$/,
    /SMTP id/i,
    /boundary=/,
    /name=\"/,
    /filename=\"/,
    /^--[^\s\n]*$/
  ]

  message.split($/).each do |raw|
    # Splitting on "\n" leaves a lone "\r" at the end of every line of a CRLF
    # message. chomp("\r\n") only strips a literal "\r\n", so that "\r"
    # survived and kept the $-anchored patterns above from matching.
    # chomp("\n") strips a trailing "\r\n", "\n" or "\r".
    line = raw.chomp "\n"

    next if skip_patterns.any? { |pattern| line =~ pattern }

    line = line.downcase unless @case_sensitive

    line.split(/(?:[^\w.?'@:$\/+-]+)/).each do |token|
      next if token.length < 3
      next if token =~ /^\d+$/

      yield token
    end
  end
end

#update_db_with(message, mode) ⇒ Object

Updates the database with tokens from message.

mode may be:

:add_bland

increases blandness of found tokens

:add_tasty

increases tastiness of found tokens

:remove_bland

decreases blandness of found tokens

:remove_tasty

decreases tastiness of found tokens



143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
# File 'lib/rbayes.rb', line 143

# Applies the token counts of +message+ to the database according to +mode+.
#
# mode must be one of :add_tasty, :add_bland, :remove_tasty or :remove_bland;
# anything else raises ArgumentError. Adding raises and removing lowers the
# matching per-token count by the number of times the token occurs in the
# message, never going below zero. A token whose two counts both end up at
# zero is deleted. The matching master counter moves by one per call.
#
# While @test is set nothing is written to @database and per-token counts are
# left as read.
# NOTE(review): the in-memory master counters still change in test mode;
# confirm whether that is intended.
def update_db_with(message, mode)
  known_modes = [:add_bland, :remove_bland, :add_tasty, :remove_tasty]
  raise ArgumentError, 'invalid mode' unless known_modes.include? mode

  log "updating db: #{mode}"

  # +1 when adding, -1 when removing; applied to the tasty or the bland side.
  step = (mode == :add_tasty || mode == :add_bland) ? 1 : -1
  tasty_side = (mode == :add_tasty || mode == :remove_tasty)

  count_tokens_in(message).each do |tok, cnt|
    fields = (@database[tok] || "0 0").split(/\s+/)
    tasty_hits = fields[0].to_i
    bland_hits = fields[1].to_i
    log "found: #{tok} #{cnt} times, tasty: #{tasty_hits}, bland: #{bland_hits}"

    unless @test
      if tasty_side
        tasty_hits += step * cnt
      else
        bland_hits += step * cnt
      end
    end

    tasty_hits = 0 if tasty_hits < 0
    bland_hits = 0 if bland_hits < 0

    if tasty_hits == 0 && bland_hits == 0
      # Nothing left to remember about this token, so drop its record.
      @database.delete tok unless @test
      log "probs: #{tok} deleted"
    else
      @database[tok] = "#{tasty_hits} #{bland_hits}" unless @test
      log "update: #{tok}, tasty: #{tasty_hits}, bland: #{bland_hits}"
    end
  end

  # Master counters: one step per call, on the side selected by mode.
  if tasty_side
    @count_tasty += step
  else
    @count_bland += step
  end

  @count_tasty = 0 if @count_tasty < 0
  @count_bland = 0 if @count_bland < 0

  unless @test
    @database[COUNT_TASTY] = @count_tasty
    @database[COUNT_BLAND] = @count_bland
  end
end