Class: SClust::LDA::LDA

Inherits:
Object
  • Object
show all
Defined in:
lib/sclust/lda/lda.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(docCol = nil) ⇒ LDA

Documents may be added after the LDA object is created, unlike with k-means clustering.



49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# File 'lib/sclust/lda/lda.rb', line 49

# Set up an empty LDA clusterer and optionally load an initial batch of
# documents.
#
# docCol - optional collection responding to #each. Every element it yields
#          is appended with #<<. When nil (the default) the clusterer starts
#          with no documents.
def initialize(docCol=nil)
    @iterations = 3
    @wordlist   = []
    @doclist    = []
    @logger     = Log4r::Logger.new('Clusterer')

    # Parallel to @wordlist: slot i holds the document object that produced
    # @wordlist[i].
    @word2doc = []

    # Goes through the topics= writer rather than setting @topics directly.
    self.topics = 10

    docCol.each { |doc| self << doc } if docCol
end

Instance Attribute Details

#doclist ⇒ Object

Returns the value of attribute doclist.



45
46
47
# File 'lib/sclust/lda/lda.rb', line 45

# Reader for @doclist, the array that #<< appends every added document to.
def doclist
  @doclist
end

#iterations ⇒ Object

Returns the value of attribute iterations.



45
46
47
# File 'lib/sclust/lda/lda.rb', line 45

# Reader for @iterations. The constructor sets it to 3, and #lda falls back
# to it when no :iterations option is supplied.
def iterations
  @iterations
end

#logger ⇒ Object

Returns the value of attribute logger.



45
46
47
# File 'lib/sclust/lda/lda.rb', line 45

# Reader for @logger, the Log4r::Logger the constructor creates under the
# name 'Clusterer'.
def logger
  @logger
end

#topics ⇒ Object

Returns the value of attribute topics.



45
46
47
# File 'lib/sclust/lda/lda.rb', line 45

# Reader for @topics, the collection that #each_topic iterates and that the
# sampling code indexes by topic number.
# NOTE(review): @topics is populated by the topics= writer, which is not
# shown here; presumably it builds one topic object per requested topic.
def topics
  @topics
end

Instance Method Details

#<<(document) ⇒ Object



67
68
69
70
71
# File 'lib/sclust/lda/lda.rb', line 67

# Add a document to the model.
#
# Appends the document to @doclist, its words to @wordlist, and one reference
# to the document per word to @word2doc, so that @word2doc[i] is the document
# that produced @wordlist[i].
#
# document - object responding to #words (assumed to return an Array of
#            words -- confirm against the document class).
#
# Returns the number of words that were added.
def <<(document)
    # Read the words once so @wordlist and @word2doc are extended from the
    # same snapshot and stay the same length.
    words = document.words

    @doclist << document

    # Grow the arrays in place; rebuilding @wordlist with `+=` would copy
    # every previously added word again for each new document.
    @wordlist.concat(words)
    @word2doc.concat(Array.new(words.length, document))

    words.length
end

#build_randomized_index_into_words ⇒ Object

Build a wordlist index array. This is an array that contains indexes into @wordlist. However, instead of being simply 0,1,2,3… this array is randomized so that we index into @wordlist in a random order.



84
85
86
87
88
89
90
91
92
93
94
95
96
# File 'lib/sclust/lda/lda.rb', line 84

# Build @randomized_word_index: every index into @wordlist exactly once, in
# random order, so the words can be visited in a random sequence.
#
# Array#shuffle yields a uniformly distributed permutation. Swapping each
# slot with a position drawn from the *whole* array (the usual hand-rolled
# approach) makes some orderings more likely than others.
#
# An empty @wordlist produces an empty index.
def build_randomized_index_into_words()
    @randomized_word_index = (0...@wordlist.length).to_a.shuffle
end

#each_radomized_word_index(&call) ⇒ Object



109
110
111
# File 'lib/sclust/lda/lda.rb', line 109

# Yield every entry of @randomized_word_index, in its stored order, to the
# given block. The spelling "radomized" is the method's existing name.
def each_radomized_word_index(&call)
    @randomized_word_index.each(&call)
end

#each_topic(&topicproc) ⇒ Object

Takes {|topic| … }



206
207
208
# File 'lib/sclust/lda/lda.rb', line 206

# Yield each element of @topics to the given block: {|topic| ... }.
def each_topic(&topicproc)
    @topics.each(&topicproc)
end

#get_max_terms(n = 3) ⇒ Object

Returns a list of lists of lists: one list per topic, and each topic list contains that topic's top words.

[ z, word, topic ], …


231
232
233
234
235
236
237
# File 'lib/sclust/lda/lda.rb', line 231

# Collect the top words of every topic.
#
# n - how many words to request per topic (default 3); passed straight
#     through to get_top_words_for_topic.
#
# Returns an Array with one entry per topic, in each_topic order; each entry
# is whatever get_top_words_for_topic returned for that topic.
def get_max_terms(n=3)
    [].tap do |per_topic|
        each_topic { |t| per_topic << get_top_words_for_topic(t, n) }
    end
end

#get_top_words_for_topic(topic, n = 3) ⇒ Object

Returns a list of lists, each of the form [ z, word ].



211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
# File 'lib/sclust/lda/lda.rb', line 211

# Rank the words recorded in a topic by their p_of_z weight.
#
# topic - object whose #words is keyed by word (used as a Hash here).
# n     - maximum number of entries to return (default 3).
#
# Returns the first n SClust::Util::Word objects after sorting by descending
# weight. Each one is built from the word, its p_of_z value and
# { :topic=>topic }.
def get_top_words_for_topic(topic, n = 3)
    weighted = topic.words.each_key.map do |word|
        SClust::Util::Word.new(word, p_of_z(topic, word), { :topic=>topic } )
    end

    # Compare y against x (not x against y) so the heaviest words come first.
    ranked = weighted.sort { |x, y| y.weight <=> x.weight }

    ranked[0...n]
end

#lda(opts = {}) ⇒ Object Also known as: cluster



193
194
195
196
197
198
199
200
201
202
203
# File 'lib/sclust/lda/lda.rb', line 193

# Run the LDA sampler.
#
# opts - Hash of options:
#        :iterations - number of lda_once passes to run; falls back to
#                      @iterations when absent or nil.
#        :continue   - when truthy, skip lda_setup and keep sampling from
#                      the current state instead of starting over.
#
# The caller's hash is only read, never written to.
#
# Returns the number of iterations that were run.
def lda(opts={})
    # Resolve the default locally instead of storing it back into opts, so
    # a hash the caller reuses (or has frozen) is left untouched.
    iterations = opts[:iterations] || @iterations

    lda_setup() unless opts[:continue]

    iterations.times { lda_once() }
end

#lda_once ⇒ Object

Perform one pass of LDA over all words.



137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
# File 'lib/sclust/lda/lda.rb', line 137

# Perform one sampling pass over every word, visiting words in the order
# given by each_radomized_word_index.
#
# For each word: compute its weight under every topic with p_of_z, draw a
# topic at random in proportion to those weights and, if that differs from
# the word's current topic, move the word's counts (per-word count, total
# wordcount and per-document count) from the old topic to the new one.
def lda_once()
    each_radomized_word_index do |random_word_index|

        random_word = @wordlist[random_word_index]
        random_doc  = @word2doc[random_word_index]

        # Weight of this word under each topic. The weights are used as they
        # come, so track their actual total in case it isn't quite 1.0.
        zdist  = @topics.map { |topic| p_of_z(topic, random_word) }
        ztotal = zdist.inject(0.0) { |sum, z| sum + z }

        r    = rand * ztotal # Random value to pick topic with.
        zacc = 0.0           # Accumulator of the weights seen so far.

        # Fallback only: a uniformly random topic, kept if the scan below
        # never gets past r (for example when every weight is zero).
        topici = (rand() * @topics.size).to_i

        # Pick the first topic whose cumulative weight exceeds r. The block
        # variable must not be named `topici`: block parameters are local to
        # the block, so the pick would never reach the outer variable and
        # the uniform fallback above would always win.
        zdist.each_with_index do |z, candidate|
            zacc += z
            if r < zacc
                topici = candidate
                break
            end
        end

        previous_topici = @word2topic[random_word_index]

        # Skip if src and dst topic are the same
        next if previous_topici == topici

        previous_topic = @topics[previous_topici]
        topic          = @topics[topici]

        # Remove word from previous topic.
        if ( previous_topic.words[random_word] > 0 )
            previous_topic.words[random_word] -= 1    # One fewer occurrence of this word in the topic
            previous_topic.wordcount          -= 1    # Reduce sum of words
            previous_topic.docs[random_doc]   -= 1    # One fewer word from this doc in the topic

            # p_of_z counts entries with docs.size, so a document with no
            # words left in this topic has to be removed outright.
            previous_topic.docs.delete(random_doc) if previous_topic.docs[random_doc] <= 0

            # NOTE(review): a word whose count reaches 0 stays in
            # previous_topic.words. p_of_z only short-circuits on a missing
            # entry, so a zero count yields a negative weight there --
            # consider deleting the key here as is done for docs.
        end

        topic.words[random_word] ||= 0    # If word was not in the chosen topic yet, start it at zero.
        topic.docs[random_doc]   ||= 0    # If doc was not previously here.

        # Add word to chosen topic.
        @word2topic[random_word_index] = topici    # Record that this word goes to this topic.
        topic.words[random_word] += 1              # Record a new word in this topic
        topic.wordcount          += 1              # Total sum of words
        topic.docs[random_doc]   += 1              # Record this doc in this topic
    end
end

#lda_setup ⇒ Object



113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# File 'lib/sclust/lda/lda.rb', line 113

# Prepare a fresh sampling run: set the @beta / @alpha constants, rebuild the
# randomized word index, and give every word a uniformly random starting
# topic while recording the matching counts on that topic.
#
# NOTE(review): the per-topic counts are only ever incremented here; whether
# they are reset when this runs a second time cannot be seen from this
# method -- confirm against the topics= writer / topic class.
def lda_setup()
    @beta  = 0.01
    @alpha = 50.0 / @topics.length

    build_randomized_index_into_words()

    @word2topic = []
    @doc2topic  = []

    each_radomized_word_index do |word_index|
        topic_index = (@topics.size * rand).to_i
        chosen      = @topics[topic_index]
        word        = @wordlist[word_index]
        doc         = @word2doc[word_index]

        # Record that this word goes to this topic.
        @word2topic[word_index] = topic_index

        # Count the word, the topic's running total, and the word's document.
        chosen.words[word] = (chosen.words[word] || 0) + 1
        chosen.wordcount  += 1
        chosen.docs[doc]   = (chosen.docs[doc] || 0) + 1
    end

end

#p_of_z(topic, word) ⇒ Object

Compute $P(z_i = j \mid z_{-i}, w)$: the probability that the given word is assigned to topic $j$, given the topic assignments of all the other words ($z_{-i}$) and the words themselves ($w$).



100
101
102
103
104
105
106
107
# File 'lib/sclust/lda/lda.rb', line 100

# Weight of `word` under `topic`, used by the sampler to choose a topic.
#
# topic - object exposing #words (per-word counts), #wordcount and #docs.
# word  - the word to score.
#
# Returns 0 when the topic has no entry for the word; otherwise the product
# of a word term (smoothed with @beta) and a document term (smoothed with
# @alpha), computed from the counts as written below.
def p_of_z(topic, word)
    count = topic.words[word]
    return 0 unless count

    word_term = (count - 1 + @beta) / (topic.wordcount - count - 1 + @beta * @wordlist.length)
    doc_term  = (topic.docs.size - 1 + @alpha) / (@doclist.size - 1 + @alpha * @topics.size)

    word_term * doc_term
end