Class: Bishop::Bayes

Inherits: Object
Defined in:
lib/bishop.rb

Instance Attribute Summary

Instance Method Summary

Constructor Details

#initialize(tokenizer = nil, data_class = BayesData, &combiner) ⇒ Bayes

Returns a new instance of Bayes.



# File 'lib/bishop.rb', line 56

def initialize( tokenizer = nil, data_class = BayesData, &combiner )
  @tokenizer = tokenizer || Tokenizer.new
  @combiner = combiner || Proc.new { |probs,ignore| Bishop.robinson( probs, ignore ) }
  @data_class = data_class
  @pools = {}
  @corpus = new_pool( '__Corpus__' )
  @pools['__Corpus__'] = @corpus
  @train_count = 0
  @dirty = true
end
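
A minimal construction sketch, assuming the gem is loaded with require 'bishop'. The averaging combiner block is purely illustrative; the default combiner is Bishop.robinson:

require 'bishop'

# Default setup: built-in Tokenizer, BayesData pools, Robinson combiner.
bayes = Bishop::Bayes.new

# A custom combiner block receives the [token, probability] pairs and the
# pool name, and must return a single combined probability.
averaging = Bishop::Bayes.new do |probs, pool_name|
  probs.map { |_token, p| p }.sum / probs.length
end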

Instance Attribute Details

#cache ⇒ Object

Returns the value of attribute cache.



# File 'lib/bishop.rb', line 53

def cache
  @cache
end

#combiner ⇒ Object

Returns the value of attribute combiner.



# File 'lib/bishop.rb', line 53

def combiner
  @combiner
end

#corpus ⇒ Object

Returns the value of attribute corpus.



# File 'lib/bishop.rb', line 53

def corpus
  @corpus
end

#data_class ⇒ Object

Returns the value of attribute data_class.



# File 'lib/bishop.rb', line 53

def data_class
  @data_class
end

#dirty ⇒ Object

Returns the value of attribute dirty.



# File 'lib/bishop.rb', line 53

def dirty
  @dirty
end

#pools ⇒ Object

Returns the value of attribute pools.



# File 'lib/bishop.rb', line 53

def pools
  @pools
end

#tokenizer ⇒ Object

Returns the value of attribute tokenizer.



# File 'lib/bishop.rb', line 53

def tokenizer
  @tokenizer
end

#train_count ⇒ Object

Returns the value of attribute train_count.



# File 'lib/bishop.rb', line 53

def train_count
  @train_count
end

Instance Method Details

#build_cache ⇒ Object

Create a cache of the metrics for each pool.



# File 'lib/bishop.rb', line 145

def build_cache
  self.cache = {}
  
  self.pools.each do |name,pool|
    unless name == '__Corpus__'
    
      pool_count = pool.token_count
      them_count = [ 1, self.corpus.token_count - pool_count ].max
      cache_dict = self.cache[ name ] ||= @data_class.new( name )
      
      self.corpus.data.each do |token,tot_count|
        this_count = pool.data[token]

        unless this_count == 0.0
          other_count = tot_count - this_count
          
          if pool_count > 0
            good_metric = [ 1.0, other_count / pool_count ].min
          else
            good_metric = 1.0
          end
        
          bad_metric = [ 1.0, this_count / them_count ].min
        
          f = bad_metric / ( good_metric + bad_metric )
          
          if ( f - 0.5 ).abs >= 0.1
            cache_dict.data[token] = [ 0.0001, [ 0.9999, f ].min ].max
          end  
        end
      end
    end
  end
end
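
A rough worked example of the per-token metric computed above; all counts are invented, with a corpus token_count of 50 assumed:

this_count  = 4.0                         # occurrences of the token in this pool
tot_count   = 5.0                         # occurrences of the token across the corpus
pool_count  = 20.0                        # tokens in this pool
them_count  = [1, 50 - pool_count].max    # 50 - 20 => 30.0

other_count = tot_count - this_count                # 1.0
good_metric = [1.0, other_count / pool_count].min   # 0.05
bad_metric  = [1.0, this_count / them_count].min    # ~0.133
f = bad_metric / (good_metric + bad_metric)         # ~0.727

# |f - 0.5| >= 0.1, so this token is cached with value ~0.727,
# clamped to the range 0.0001..0.9999.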

#commit ⇒ Object



# File 'lib/bishop.rb', line 67

def commit
  self.save
end

#dirty? ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/bishop.rb', line 71

def dirty?
  self.dirty
end

#export ⇒ Object



# File 'lib/bishop.rb', line 121

def export
  self.pools.to_yaml
end
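
Unlike #save, which writes to a file, #export returns the pools serialized to a YAML string. A small sketch (pool name and text invented):

bayes = Bishop::Bayes.new
bayes.train('greeting', 'hello there friend')

yaml_string = bayes.export   # effectively the same YAML that #save writes to disk
puts yaml_string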

#get_probs(pool, words) ⇒ Object

For each of the given words that has been trained in the pool, collect its occurrence data into a sorted array of [word, count] pairs.



# File 'lib/bishop.rb', line 196

def get_probs( pool, words )
  words.find_all { |word| pool.data.has_key? word }.map { |word| [word,pool.data[word]] }.sort
end

#get_tokens(input) ⇒ Object

Create a token array from the specified input.



# File 'lib/bishop.rb', line 191

def get_tokens( input )
  self.tokenizer.tokenize( input )
end

#guess(msg) ⇒ Object

Call this method to classify a “message”. The return value is an array of [pool, probability] pairs, one for each pool that is a likely match for the message, sorted by pool name.



# File 'lib/bishop.rb', line 264

def guess( msg )
  tokens = get_tokens( msg )
  res = {}
  
  pool_probs.each do |pool_name,pool|
    p = get_probs( pool, tokens )
    if p.length != 0
      res[pool_name] = self.combiner.call( p, pool_name )
    end    
  end
  
  res.sort
end
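
A classification sketch; the pool names, training text, and the probability in the comment are illustrative. Pools whose cached tokens do not overlap the message are omitted from the result:

bayes = Bishop::Bayes.new
bayes.train('spam', 'cheap pills, buy now')
bayes.train('ham',  'lunch meeting moved to tuesday')

bayes.guess('pills for sale, buy now')
# => something like [["spam", 0.99]]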

#load(file = 'bayesdata.yml') ⇒ Object



# File 'lib/bishop.rb', line 125

def load( file = 'bayesdata.yml' )
  begin
    File.open( file ) { |f| load_data( f ) }
  rescue Errno::ENOENT
    # File does not exist
  end
end
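
A persistence sketch using the default bayesdata.yml file name; the pool name and text are invented:

bayes = Bishop::Bayes.new
bayes.train('greeting', 'hello there friend')
bayes.save               # writes bayesdata.yml

fresh = Bishop::Bayes.new
fresh.load               # silently does nothing if the file is missing
fresh.pool_names         # => ["greeting"]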

#load_data(source) ⇒ Object



# File 'lib/bishop.rb', line 133

def load_data( source )
  self.pools = YAML.load( source )
  self.pools.each { |pool_name,pool| pool.data.default = 0.0 }
  self.corpus = self.pools['__Corpus__']
  self.dirty = true
end

#merge_pools(dest_name, source_name) ⇒ Object

Merge the contents of the source pool into the destination pool.



# File 'lib/bishop.rb', line 94

def merge_pools( dest_name, source_name )
  dest_pool = self.pools[dest_name]
  self.pools[source_name].data.each do |token,count|
    if dest_pool.data.has_key?( token )
      dest_pool.data[token] += count
    else
      dest_pool.data[token] = count
      dest_pool.token_count += 1
    end
  end
  self.dirty = true  
end
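
Note that merging does not remove the source pool; delete it explicitly if it is no longer wanted. A sketch with invented pool names and text:

bayes = Bishop::Bayes.new
bayes.train('spam', 'cheap pills')
bayes.train('junk', 'limited time offer')

bayes.merge_pools('spam', 'junk')   # 'junk' token counts are added into 'spam'
bayes.remove_pool('junk')           # drop the source pool once merged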

#new_pool(pool_name) ⇒ Object

Create a new, empty pool without training.



# File 'lib/bishop.rb', line 76

def new_pool( pool_name )
  self.dirty = true
  self.pools[ pool_name ] ||= @data_class.new( pool_name )
end

#pool_data(pool_name) ⇒ Object

Return an array of [token, count] pairs for the specified pool.



# File 'lib/bishop.rb', line 108

def pool_data( pool_name )
  self.pools[pool_name].data.to_a
end

#pool_names ⇒ Object



# File 'lib/bishop.rb', line 140

def pool_names
  self.pools.keys.sort.reject { |name| name == '__Corpus__' }
end

#pool_probs ⇒ Object

Get the probabilities for each pool, recreating the cached information if any token information for any of the pools has changed.



# File 'lib/bishop.rb', line 182

def pool_probs
  if self.dirty?
    self.build_cache
    self.dirty = false
  end    
  self.cache
end

#pool_tokens(pool_name) ⇒ Object

Return an array of tokens trained in the specified pool.



# File 'lib/bishop.rb', line 113

def pool_tokens( pool_name )
  self.pools[pool_name].data.keys
end
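
A sketch contrasting #pool_data and #pool_tokens; the exact tokens depend on the configured Tokenizer, so the returned values shown are only indicative:

bayes = Bishop::Bayes.new
bayes.train('spam', 'free prize inside')

bayes.pool_data('spam')     # => e.g. [["free", 1.0], ["prize", 1.0], ["inside", 1.0]]
bayes.pool_tokens('spam')   # => e.g. ["free", "prize", "inside"]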

#remove_pool(pool_name) ⇒ Object



# File 'lib/bishop.rb', line 81

def remove_pool( pool_name )
  self.pools.delete( pool_name ) 
end

#rename_pool(pool_name, new_name) ⇒ Object



# File 'lib/bishop.rb', line 85

def rename_pool( pool_name, new_name )
  self.pools[new_name] = self.pools[pool_name]
  self.pools[new_name].name = new_name
  self.pools.delete( pool_name )
  self.dirty = true
end

#save(file = 'bayesdata.yml') ⇒ Object



# File 'lib/bishop.rb', line 117

def save( file = 'bayesdata.yml' )
  File.open( file, 'w' ) { |f| YAML.dump( self.pools, f ) }
end

#train(pool_name, item, uid = nil) ⇒ Object



# File 'lib/bishop.rb', line 200

def train( pool_name, item, uid = nil )
  tokens = get_tokens( item )
  pool = new_pool( pool_name )
  train_( pool, tokens )
  self.corpus.train_count += 1
  pool.train_count += 1
  if uid
    pool.training.push( uid )
  end    
  self.dirty = true
end
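
A training sketch; the pool names, message text, and uid are invented. When a uid is given it is pushed onto the pool's training list:

bayes = Bishop::Bayes.new

bayes.train('spam', 'win a free prize today', 'msg-001')
bayes.train('ham',  'minutes from the planning meeting')

bayes.pool_names   # => ["ham", "spam"]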

#trained_on?(msg) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/bishop.rb', line 257

def trained_on?( msg )
  self.cache.values.any? { |v| v.trained_on? msg }
end

#untrain(pool_name, item, uid = nil) ⇒ Object



# File 'lib/bishop.rb', line 223

def untrain( pool_name, item, uid = nil )
  tokens = get_tokens( item )
  pool = new_pool( pool_name )
  untrain_( pool, tokens )
  self.corpus.train_count += 1
  pool.train_count += 1
  if uid
    pool.training.delete( uid )
  end    
  self.dirty = true  
end
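
A sketch of reversing an earlier training pass; it assumes the private untrain_ helper subtracts the token counts that train_ added, and the pool name, text, and uid are invented:

bayes = Bishop::Bayes.new
bayes.train('spam', 'free prize inside', 'msg-007')

bayes.untrain('spam', 'free prize inside', 'msg-007')
# the uid is removed from pool.training and the pool is marked dirty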