Class: SClust::LDA2::LDA2

Inherits:
Object
  • Object
show all
Defined in:
lib/sclust/lda/lda2.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ LDA2

Documents may be added after the LDA object is created, unlike k-means clustering.



74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# File 'lib/sclust/lda/lda2.rb', line 74

# Set up an empty model: no documents or words yet, a default of 3
# iterations, a Log4r logger named after the class, and three
# weighted moving averages used for progress reporting.
def initialize()
    # @word2doc runs parallel to @wordlist: @word2doc[i] is the document
    # object that produced @wordlist[i].
    @doclist, @wordlist, @word2doc = [], [], []
    @iterations = 3

    @logger = Log4r::Logger.new(self.class.to_s)
    @logger.add('default')

    # All three trackers start from the same (0.05, 0.0) arguments.
    @topic_change_rate, @word_prob_avg, @doc_prob_avg =
        Array.new(3) { SClust::Util::WeightedMovingAverage.new(0.05, 0.0) }

    # Used for inverse document frequency values.
    @document_collection = SClust::Util::DocumentCollection.new

    # Goes through the topics= writer (defined elsewhere in the class).
    self.topics = 10
end

Instance Attribute Details

#doclist ⇒ Object

Returns the value of attribute doclist.



70
71
72
# File 'lib/sclust/lda/lda2.rb', line 70

# Reader for @doclist, the list of documents appended through #<<.
def doclist; @doclist; end

#document_collection ⇒ Object (readonly)

Returns the value of attribute document_collection.



68
69
70
# File 'lib/sclust/lda/lda2.rb', line 68

# Reader for @document_collection, the collection every added document
# is also pushed into.
def document_collection; @document_collection; end

#iterations ⇒ Object

Returns the value of attribute iterations.



70
71
72
# File 'lib/sclust/lda/lda2.rb', line 70

# Reader for @iterations, the pass count #lda falls back to when the
# caller gives no :iterations option.
def iterations; @iterations; end

#logger ⇒ Object

Returns the value of attribute logger.



70
71
72
# File 'lib/sclust/lda/lda2.rb', line 70

# Reader for @logger, the logger this object writes progress messages to.
def logger; @logger; end

#topics ⇒ Object

Returns the value of attribute topics.



70
71
72
# File 'lib/sclust/lda/lda2.rb', line 70

# Reader for @topics, the indexable list of topic objects the sampler
# assigns words to.
def topics; @topics; end

Instance Method Details

#<<(document) ⇒ Object

Add a document to the collection backing this cluster. This must be a SClust::Util::Document.



104
105
106
107
108
109
110
111
112
# File 'lib/sclust/lda/lda2.rb', line 104

# Add a document to the collection backing this cluster.
#
# document:: expected to be a SClust::Util::Document. It must respond to
#            +words+ and is also pushed into @document_collection.
#
# Returns the number of words the document contributed (the same value
# the previous implementation returned).
def <<(document)
    @doclist             << document
    @document_collection << document

    words = document.words

    # Append in place. `@wordlist += words` rebuilt the whole word list on
    # every call, making repeated adds quadratic in the total word count.
    @wordlist.concat(words)

    # Keep @word2doc parallel to @wordlist: one entry per appended word.
    @word2doc.concat(Array.new(words.size, document))

    words.size
end

#build_randomized_index_into_words ⇒ Object

Build a wordlist index array. This is an array that contains indexes into @wordlist. However, instead of being simply 0,1,2,3… this array is randomized so that we index into @wordlist in a random order.



140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
# File 'lib/sclust/lda/lda2.rb', line 140

# Build @randomized_word_index: every index into @wordlist exactly once,
# in random order, so the sampler can visit the words in a shuffled
# sequence.
#
# Returns the freshly built index array.
def build_randomized_index_into_words()
    @logger.info("Randomizing words.")

    # Array#shuffle replaces the hand-rolled swap loop. That loop drew each
    # swap target from the whole array, which does not give every ordering
    # equal probability.
    @randomized_word_index = (0...@wordlist.length).to_a.shuffle
end

#each_radomized_word_index(&call) ⇒ Object



191
192
193
# File 'lib/sclust/lda/lda2.rb', line 191

# Hand each entry of @randomized_word_index to the given block, in the
# order the entries are stored.
def each_radomized_word_index(&call)
    @randomized_word_index.each(&call)
end

#each_topic(&topicproc) ⇒ Object

Takes {|topic| … }



291
292
293
# File 'lib/sclust/lda/lda2.rb', line 291

# Hand each element of @topics to the given block.
# Takes {|topic| ... }
def each_topic(&topicproc)
    @topics.each(&topicproc)
end

#get_max_terms(n = 3) ⇒ Object

Returns a list of lists: one list per topic, each holding that topic's top n words.

[ z, word, topic ], …


316
317
318
319
320
321
322
# File 'lib/sclust/lda/lda2.rb', line 316

# Collect, for every topic in turn, the result of
# get_top_words_for_topic(topic, n).
#
# n:: how many words to request per topic (default 3).
#
# Returns an array with one entry per topic, in each_topic order.
def get_max_terms(n=3)
    [].tap do |per_topic|
        each_topic { |topic| per_topic.push(get_top_words_for_topic(topic, n)) }
    end
end

#get_top_words_for_topic(topic, n = 3) ⇒ Object

Returns a list of the topic's n highest-weighted words; each entry is a SClust::Util::Word whose weight is z.



296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
# File 'lib/sclust/lda/lda2.rb', line 296

# Rank the words of one topic by p_of_z(topic, word) and keep the best n.
#
# topic:: topic object; its +words+ must respond to +each_key+.
# n::     number of entries to keep (default 3).
#
# Returns an array of SClust::Util::Word objects, highest weight first.
def get_top_words_for_topic(topic, n = 3)
    # One Word per key, weighted by p_of_z and tagged with its topic.
    scored = topic.words.each_key.map do |term|
        SClust::Util::Word.new(term, p_of_z(topic, term), { :topic=>topic } )
    end

    # Operands are swapped in the comparison so the order is descending.
    ranked = scored.sort { |left, right| right.weight <=> left.weight }

    ranked.slice(0...n)
end

#lda(opts = {}) ⇒ Object Also known as: cluster



276
277
278
279
280
281
282
283
284
285
286
287
288
# File 'lib/sclust/lda/lda2.rb', line 276

# Run LDA for a number of passes.
#
# opts:: option hash.
#        :iterations - number of lda_once passes; defaults to @iterations
#                      (the default is written back into opts).
#        :continue   - when truthy, skip lda_setup and carry on from the
#                      current state.
#
# Returns whatever Integer#times returns for the pass count.
def lda(opts={})
    passes = (opts[:iterations] ||= @iterations)

    if !opts[:continue]
        @logger.info("Setting up to run LDA.")
        lda_setup()
    end

    passes.times do |pass|
        @logger.info { "LDA Iteration #{pass+1} / #{opts[:iterations]}"}
        lda_once()
    end
end

#lda_once ⇒ Object

Perform 1 phase of lda



217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
# File 'lib/sclust/lda/lda2.rb', line 217

# Perform one pass of LDA: visit every word occurrence in randomized
# order, compute its weight under each topic with p_of_z, draw a topic in
# proportion to those weights and move the word there if the topic changed.
#
# Each visit also feeds @topic_change_rate (1.0 for a move, 0.0 for a
# stay), and the pass ends by logging the running averages.
def lda_once()
    each_radomized_word_index do |random_word_index|

        random_word = @wordlist[random_word_index]
        doc         = @word2doc[random_word_index]

        # Unnormalised distribution over topics for this word. The real
        # total is tracked in case the weights do not sum to 1.0.
        zdist  = @topics.map { |topic| p_of_z(topic, random_word, doc) }
        ztotal = zdist.reduce(0.0) { |sum, z| sum + z }

        r = rand * ztotal # Random value to pick topic with.

        # Fallback used only when no cumulative weight exceeds r, e.g. when
        # every weight is 0.
        topic_i = (rand() * @topics.size).to_i

        # Walk the cumulative distribution and take the first topic whose
        # running total passes r. The loop variable is deliberately NOT
        # named topic_i: block parameters are block-local, so reusing the
        # name would discard the pick and leave the random fallback in place.
        zacc = 0.0 # Accumulator of seen values of zdist[candidate].
        zdist.each_with_index do |z, candidate|
            zacc += z
            if r < zacc
                topic_i = candidate
                break
            end
        end

        topic          = @topics[topic_i]
        previous_topic = @topics[@word2topic[random_word_index]]

        if @word2topic[random_word_index] == topic_i

            # Source and destination topic are the same: nothing moves.
            @topic_change_rate.adjust(0.0)

        else

            # The topic change rate is how convergence is tracked: few
            # moves (comparatively) means we are done.
            @topic_change_rate.adjust(1.0)

            # Remove word from previous topic.
            previous_topic.remove(random_word, doc) if previous_topic.has_word_and_doc?(random_word, doc)

            # Record that this word goes to the chosen topic, then add it.
            @word2topic[random_word_index] = topic_i
            topic.add(random_word, doc)

        end
    end

    # Use the instance logger, like the rest of the class, not a global.
    @logger.info { "Topic change rate: #{@topic_change_rate.value} Doc% #{ @doc_prob_avg.value} Word% #{ @word_prob_avg.value}" }
end

#lda_setup ⇒ Object



195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
# File 'lib/sclust/lda/lda2.rb', line 195

# Prepare state for a fresh LDA run: set @alpha and @beta, build the
# randomized word index, and place every word occurrence in a randomly
# chosen topic.
#
# Returns the weight assigned to @topic_change_rate (1.0 / word count).
def lda_setup()
    @alpha = 1.0 #( @doclist.size / @topics.length ).to_f
    @beta  = 0.01

    build_randomized_index_into_words()

    @doc2topic  = []
    @word2topic = []

    each_radomized_word_index do |word_index|
        chosen = (rand * @topics.size).floor

        # Record the assignment, then let the topic count the word.
        @word2topic[word_index] = chosen
        @topics[chosen].add(@wordlist[word_index], @word2doc[word_index])
    end

    # One adjustment is made per word per pass, so weight each by 1/N.
    @topic_change_rate.weight = 1.0 / @wordlist.size
end

#p_of_z(topic, word, doc = nil) ⇒ Object

Compute p(z_i|theta) * p(w|z_i,B).



160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
# File 'lib/sclust/lda/lda2.rb', line 160

# Compute p(z_i|theta) * p(w|z_i,B) for one topic and word.
#
# topic:: object exposing +words+ (indexed by word), +docs+ and +wordcount+.
# word::  the word whose weight under +topic+ is wanted.
# doc::   when given, the doc term uses topic.docs[doc]; when nil, it uses
#         the sum of the second element of every entry in topic.docs.
#
# Both factors are also fed to the @word_prob_avg and @doc_prob_avg
# trackers. Returns the product of the two factors.
def p_of_z(topic, word, doc=nil)

    doc_word_count =
        if doc.nil?
            topic.docs.reduce(0.0) { |sum, entry| sum + entry[1] }
        else
            topic.docs[doc]
        end

    # Shared part of both denominators.
    others_in_topic = topic.wordcount - 1.0

    word_share = (topic.words[word] - 1.0 + @beta)  / (others_in_topic + @beta)
    doc_share  = (doc_word_count - 1.0 + @alpha) / (others_in_topic + @alpha)

    # Stop-gap protection for when the denominator gets wonky: anything
    # that is not >= 0.0 (negative values and NaN alike) becomes 0.0.
    doc_share  = 0.0 unless doc_share >= 0.0
    word_share = 0.0 unless word_share >= 0.0

    @word_prob_avg.adjust(word_share)
    @doc_prob_avg.adjust(doc_share)

    # Alternate formula, not in use. Denominator changed.
    #((topic.words[word] - 1.0 + beta)  / (topic.wordcount - 1.0 + beta ) ) *
    #((topic.docs.size - 1.0 + alpha) / (@doclist.size - topic.docs.size - 1.0 + alpha ))

    # Final result.
    doc_share * word_share
end

#rebuild_document_collection ⇒ Object

If you edit the document collection behind the scenes, you need to run this to avoid terms with 0 showing up.



116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
# File 'lib/sclust/lda/lda2.rb', line 116

# Rebuild the document list, word list and word-to-document map from the
# documents currently held by @document_collection. Run this after editing
# the document collection behind the scenes.
#
# The collection size is logged at debug level before and after.
def rebuild_document_collection()
    report_size = lambda do
        @logger.debug { "Collection now has #{@doclist.size} documents, #{@wordlist.size} words."}
    end

    report_size.call
    @logger.info("Rebuilding document collection and word list.")                

    # Grab the surviving documents before the collection is replaced.
    surviving_docs = @document_collection.doclist

    @doclist, @wordlist, @word2doc = [], [], []
    @document_collection = SClust::Util::DocumentCollection.new

    # Re-adding through #<< repopulates every structure consistently.
    surviving_docs.each { |doc| self << doc }

    report_size.call
end