Class: Ankusa::CassandraStorage

Inherits:
Object
  • Object
show all
Defined in:
lib/ankusa/cassandra_storage.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(host = '127.0.0.1', port = 9160, keyspace = 'ankusa', max_classes = 100) ⇒ CassandraStorage

max_classes must be set because the current implementation of the Ruby Cassandra client doesn’t support table scans; the crufty get_range method is used instead for the moment.



21
22
23
24
25
26
27
28
# File 'lib/ankusa/cassandra_storage.rb', line 21

# Opens a Cassandra client against host:port (initially on the 'system'
# keyspace), starts with empty per-class count caches and then runs
# init_tables to select the ankusa keyspace, creating it when it is missing.
#
# max_classes is the row limit later passed to get_range by classnames.
def initialize(host='127.0.0.1', port=9160, keyspace = 'ankusa', max_classes = 100)
  @keyspace    = keyspace
  @max_classes = max_classes

  @klass_word_counts = {}
  @klass_doc_counts  = {}

  @cassandra = Cassandra.new('system', "#{host}:#{port}")
  init_tables
end

Instance Attribute Details

#cassandra ⇒ Object (readonly)

Returns the value of attribute cassandra.



14
15
16
# File 'lib/ankusa/cassandra_storage.rb', line 14

# Read-only accessor: returns the client object held in @cassandra.
def cassandra
  @cassandra
end

Instance Method Details

#classnames ⇒ Object

Fetch the names of the distinct classes used for classification, e.g. :spam, :good, etc.



34
35
36
37
38
# File 'lib/ankusa/cassandra_storage.rb', line 34

# Lists the known class names (e.g. :spam, :good).
#
# Asks get_range for at most @max_classes rows of the :totals column family
# and returns each row key converted to a Symbol, in the order received.
def classnames
  range_options = { :start => '', :finish => '', :count => @max_classes }
  @cassandra.get_range(:totals, range_options).map { |key_slice| key_slice.key.to_sym }
end

#close ⇒ Object

Doesn’t do anything



174
175
# File 'lib/ankusa/cassandra_storage.rb', line 174

# Deliberate no-op: nothing is released or flushed here. Always returns nil.
def close
  nil
end

#doc_count_totals ⇒ Object



167
168
169
# File 'lib/ankusa/cassandra_storage.rb', line 167

# Returns whatever get_summary produces for the "doc_count" column.
# NOTE(review): presumably a {class => document count} hash - confirm against get_summary.
def doc_count_totals
  get_summary('doc_count')
end

#drop_tables ⇒ Object

Drop ankusa keyspace, reset internal caches

FIXME: truncate doesn’t work with cassandra-beta2



50
51
52
53
54
55
56
# File 'lib/ankusa/cassandra_storage.rb', line 50

# Truncates the 'classes' and 'totals' column families, drops the keyspace
# named by @keyspace and replaces both in-memory count caches with empty hashes.
#
# FIXME: truncate doesn't work with cassandra-beta2
def drop_tables
  %w[classes totals].each { |column_family| @cassandra.truncate!(column_family) }
  @cassandra.drop_keyspace(@keyspace)

  # Forget every cached per-class total.
  @klass_word_counts = {}
  @klass_doc_counts  = {}
end

#get_doc_count(klass) ⇒ Object

Fetch total documents for a given class and cache it



109
110
111
# File 'lib/ankusa/cassandra_storage.rb', line 109

# Reads the "doc_count" column from the class's row in :totals, stores the
# value in @klass_doc_counts under klass and returns it as a Float
# (0.0 when no value comes back).
def get_doc_count(klass)
  columns = @cassandra.get(:totals, klass.to_s, "doc_count")
  @klass_doc_counts[klass] = columns.values.last.to_f
end

#get_total_word_count(klass) ⇒ Object

Fetch total word count for a given class and cache it



102
103
104
# File 'lib/ankusa/cassandra_storage.rb', line 102

# Reads the "wordcount" column from the class's row in :totals, stores the
# value in @klass_word_counts under klass and returns it as a Float
# (0.0 when no value comes back).
def get_total_word_count(klass)
  columns = @cassandra.get(:totals, klass.to_s, "wordcount")
  @klass_word_counts[klass] = columns.values.last.to_f
end

#get_vocabulary_sizes ⇒ Object

Does a table ‘scan’ of summary table pulling out the ‘vocabsize’ column from each row. Generates a hash of (class, vocab_size) key value pairs



95
96
97
# File 'lib/ankusa/cassandra_storage.rb', line 95

# Returns whatever get_summary produces for the "vocabsize" column.
# NOTE(review): presumably a {class => vocabulary size} hash - confirm against get_summary.
def get_vocabulary_sizes
  get_summary('vocabsize')
end

#get_word_counts(word) ⇒ Object

Fetch hash of word counts as a single row from cassandra. Here column_name is the class and column value is the count



84
85
86
87
88
89
# File 'lib/ankusa/cassandra_storage.rb', line 84

# Looks up the :classes row for word and returns a hash mapping each class
# name (as a Symbol) to its count.
#
# Column names are the classes and column values the counts. Values are
# converted with to_f, and anything below zero is reported as 0. An empty
# row is returned as-is via to_hash.
def get_word_counts(word)
  row = @cassandra.get(:classes, word.to_s)
  return row.to_hash if row.empty?

  row.each_with_object({}) do |(klass_name, raw_count), counts|
    counts[klass_name.to_sym] = [raw_count.to_f, 0].max
  end
end

#incr_doc_count(klass, count) ⇒ Object

Increment total document count for a given class by ‘count’



159
160
161
162
163
164
165
# File 'lib/ankusa/cassandra_storage.rb', line 159

# Adds count to the class's stored "doc_count" total.
#
# Reads the current value from :totals, writes the sum back as a String and
# records the new Integer total in @klass_doc_counts under the class name as
# a Symbol. Returns the new total.
def incr_doc_count(klass, count)
  row_key = klass.to_s
  current = @cassandra.get(:totals, row_key, "doc_count").values.last.to_i
  updated = current + count
  @cassandra.insert(:totals, row_key, { "doc_count" => updated.to_s })
  @klass_doc_counts[row_key.to_sym] = updated
end

#incr_total_word_count(klass, count) ⇒ Object

Increment total word count for a given class by ‘count’



148
149
150
151
152
153
154
# File 'lib/ankusa/cassandra_storage.rb', line 148

# Adds count to the class's stored "wordcount" total.
#
# Reads the current value from :totals, writes the sum back as a String and
# records the new Integer total in @klass_word_counts under the class name
# as a Symbol. Returns the new total.
def incr_total_word_count(klass, count)
  row_key = klass.to_s
  current = @cassandra.get(:totals, row_key, "wordcount").values.last.to_i
  updated = current + count
  @cassandra.insert(:totals, row_key, { "wordcount" => updated.to_s })
  @klass_word_counts[row_key.to_sym] = updated
end

#incr_word_count(klass, word, count) ⇒ Object

Increment the count for a given (word,class) pair. Evidently, cassandra does not support atomic increment/decrement. Psh. HBase uses ZooKeeper to implement atomic operations, ain’t it special?



118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
# File 'lib/ankusa/cassandra_storage.rb', line 118

# Adds count (negative when unlearning) to the stored count for the
# (word, klass) pair and keeps the class's "vocabsize" total in step.
#
# A word belongs to a class's vocabulary while its stored count is positive,
# which matches get_word_counts reporting non-positive counts as 0. So
# vocabsize is incremented when the count becomes positive and decremented
# when it stops being positive. Every other change leaves vocabsize
# untouched, including count == 0 and unlearning a word that was never seen.
#
# Each update is a separate get followed by an insert; there is no atomic
# increment here.
#
# klass - class name (anything responding to to_s)
# word  - the word (anything responding to to_s)
# count - Integer delta to apply
#
# Returns the new stored count as an Integer.
def incr_word_count(klass, word, count)
  # Only wants strings
  klass = klass.to_s
  word  = word.to_s

  prior_count = @cassandra.get(:classes, word, klass).values.last.to_i
  new_count   = prior_count + count
  @cassandra.insert(:classes, word, {klass => new_count.to_s})

  was_in_vocab = prior_count > 0
  now_in_vocab = new_count > 0

  # Only touch vocabsize when the word actually enters or leaves the vocabulary.
  if was_in_vocab != now_in_vocab
    vocab_size = @cassandra.get(:totals, klass, "vocabsize").values.last.to_i
    vocab_size += now_in_vocab ? 1 : -1
    @cassandra.insert(:totals, klass, {"vocabsize" => vocab_size.to_s})
  end
  new_count
end

#init_tables ⇒ Object

Create required keyspace and column families



62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# File 'lib/ankusa/cassandra_storage.rb', line 62

# Makes sure the keyspace named by @keyspace exists, then selects it on the client.
#
# When the keyspace is missing it is created with two column families:
#   "classes" - word  => {classname => count}
#   "totals"  - class => {wordcount => count}
# An existing keyspace is left untouched and simply selected.
def init_tables
  unless @cassandra.keyspaces.include?(@keyspace)
    column_families = %w[classes totals].map do |cf_name|
      Cassandra::ColumnFamily.new({ :keyspace => @keyspace, :name => cf_name })
    end

    keyspace_definition = Cassandra::Keyspace.new({
      :name               => @keyspace,
      :strategy_class     => 'org.apache.cassandra.locator.SimpleStrategy',
      :replication_factor => 1,
      :cf_defs            => column_families
    })
    @cassandra.add_keyspace(keyspace_definition)
  end

  @cassandra.keyspace = @keyspace
end

#reset ⇒ Object



40
41
42
43
# File 'lib/ankusa/cassandra_storage.rb', line 40

# Wipes all stored data and starts over: drop_tables removes the keyspace and
# clears the caches, then init_tables recreates and selects the keyspace.
def reset
  drop_tables
  init_tables
end