Class: SClust::LDA::LDA
- Inherits:
-
Object
- Object
- SClust::LDA::LDA
- Defined in:
- lib/sclust/lda/lda.rb
Instance Attribute Summary collapse
-
#doclist ⇒ Object
Returns the value of attribute doclist.
-
#iterations ⇒ Object
Returns the value of attribute iterations.
-
#logger ⇒ Object
Returns the value of attribute logger.
-
#topics ⇒ Object
Returns the value of attribute topics.
Instance Method Summary collapse
- #<<(document) ⇒ Object
-
#build_randomized_index_into_words ⇒ Object
Build a wordlist index array.
- #each_radomized_word_index(&call) ⇒ Object
-
#each_topic(&topicproc) ⇒ Object
Takes {|topic| … }.
-
#get_max_terms(n = 3) ⇒ Object
Returns a list of lists.
-
#get_top_words_for_topic(topic, n = 3) ⇒ Object
Returns a list of the top n words for a topic, conceptually [ z, word ] pairs.
-
#initialize(docCol = nil) ⇒ LDA
constructor
Documents may be added after the LDA object is created, unlike k-means clustering.
- #lda(opts = {}) ⇒ Object (also: #cluster)
-
#lda_once ⇒ Object
Perform 1 phase of lda.
- #lda_setup ⇒ Object
-
#p_of_z(topic, word) ⇒ Object
Compute P(z = j | z_{-i}, w).
Constructor Details
#initialize(docCol = nil) ⇒ LDA
Documents may be added after the LDA object is created, unlike k-means clustering.
49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
# File 'lib/sclust/lda/lda.rb', line 49
#
# Builds an LDA clusterer with 3 iterations and 10 topics by default.
#
# docCol - optional collection responding to #each; every element it yields
#          is added through #<<. When nil (or false) no documents are added.
def initialize(docCol = nil)
  @iterations = 3
  @wordlist   = []
  @doclist    = []
  @logger     = Log4r::Logger.new('Clusterer')

  # Parallel to @wordlist: @word2doc[i] is the document object that
  # produced @wordlist[i].
  @word2doc = []

  self.topics = 10

  return unless docCol

  docCol.each { |document| self << document }
end
Instance Attribute Details
#doclist ⇒ Object
Returns the value of attribute doclist.
45 46 47 |
# File 'lib/sclust/lda/lda.rb', line 45
#
# Reader for @doclist (the documents appended via #<<).
def doclist
  @doclist
end
#iterations ⇒ Object
Returns the value of attribute iterations.
45 46 47 |
# File 'lib/sclust/lda/lda.rb', line 45
#
# Reader for @iterations, the default number of passes #lda runs when the
# caller does not supply :iterations.
def iterations
  @iterations
end
#logger ⇒ Object
Returns the value of attribute logger.
45 46 47 |
# File 'lib/sclust/lda/lda.rb', line 45
#
# Reader for @logger.
def logger
  @logger
end
#topics ⇒ Object
Returns the value of attribute topics.
45 46 47 |
# File 'lib/sclust/lda/lda.rb', line 45
#
# Reader for @topics, the collection of topic objects iterated by
# #each_topic.
def topics
  @topics
end
Instance Method Details
#<<(document) ⇒ Object
67 68 69 70 71 |
# File 'lib/sclust/lda/lda.rb', line 67
#
# Registers a document with the clusterer.
#
# The document is appended to @doclist, its words are appended to @wordlist,
# and the document itself is recorded once per word in @word2doc so that
# @word2doc[i] is always the source document of @wordlist[i].
#
# document - any object responding to #words with an Array of words.
#
# Returns self so calls can be chained (lda << doc_a << doc_b).
def <<(document)
  # Read the words once: both parallel arrays must grow by exactly the same
  # amount even if #words recomputes its result on every call.
  words = document.words

  @doclist << document
  # concat appends in place instead of rebuilding the whole word list for
  # every added document, as `+=` would.
  @wordlist.concat(words)
  words.length.times { @word2doc << document }

  self
end
#build_randomized_index_into_words ⇒ Object
Build a wordlist index array. This is an array that contains indexes into @wordlist. However, instead of being simply 0,1,2,3… this array is randomized so that we index into @wordlist in a random order.
84 85 86 87 88 89 90 91 92 93 94 95 96 |
# File 'lib/sclust/lda/lda.rb', line 84
#
# Builds @randomized_word_index: an array holding every index of @wordlist
# exactly once, in random order, so the words can be visited in a random
# sequence.
#
# Array#shuffle is used because it yields every permutation with equal
# probability; swapping each slot with a position drawn from the whole
# array (the earlier hand-rolled approach) favours some orderings.
#
# Returns the freshly built index array.
def build_randomized_index_into_words
  @randomized_word_index = (0...@wordlist.length).to_a.shuffle
end
#each_radomized_word_index(&call) ⇒ Object
109 110 111 |
# File 'lib/sclust/lda/lda.rb', line 109
#
# Yields each entry of @randomized_word_index (an index into @wordlist) to
# the given block, in the stored order.
#
# NOTE: the method name is misspelled ("radomized"); it is kept as is
# because it is the name callers use.
#
# Returns whatever @randomized_word_index.each returns.
def each_radomized_word_index(&call)
  # Parentheses avoid Ruby's "`&' interpreted as argument prefix" warning.
  @randomized_word_index.each(&call)
end
#each_topic(&topicproc) ⇒ Object
Takes {|topic| … }
206 207 208 |
# File 'lib/sclust/lda/lda.rb', line 206
#
# Yields every element of @topics to the given block: {|topic| ... }.
#
# Returns whatever @topics.each returns.
def each_topic(&topicproc)
  # Parentheses avoid Ruby's "`&' interpreted as argument prefix" warning.
  @topics.each(&topicproc)
end
#get_max_terms(n = 3) ⇒ Object
Returns a list of lists. Each inner list corresponds to one topic and contains that topic's top n words.
- [ z, word, topic ], …
231 232 233 234 235 236 237 |
# File 'lib/sclust/lda/lda.rb', line 231
#
# Collects the top words of every topic.
#
# n - how many words to request per topic (default 3); passed straight to
#     #get_top_words_for_topic.
#
# Returns an Array with one entry per element of @topics, in the same order,
# each entry being the result of get_top_words_for_topic(topic, n).
def get_max_terms(n = 3)
  @topics.map { |topic| get_top_words_for_topic(topic, n) }
end
#get_top_words_for_topic(topic, n = 3) ⇒ Object
Returns a list of the top n words for the topic, conceptually [ z, word ] pairs, ordered from highest to lowest weight.
211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 |
# File 'lib/sclust/lda/lda.rb', line 211
#
# Ranks the words of a topic by p_of_z and returns the best n.
#
# topic - object whose #words responds to #each_key (the topic's words).
# n     - maximum number of entries to return (default 3).
#
# Returns an Array of SClust::Util::Word objects, each built from the word,
# its p_of_z(topic, word) weight and { :topic => topic }, ordered from the
# highest weight to the lowest. Fewer than n entries are returned when the
# topic has fewer words.
def get_top_words_for_topic(topic, n = 3)
  weighted = topic.words.each_key.map do |word|
    SClust::Util::Word.new(word, p_of_z(topic, word), { :topic => topic })
  end

  # Negate the weight so the ascending sort puts the heaviest words first.
  weighted.sort_by { |entry| -entry.weight }.first(n)
end
#lda(opts = {}) ⇒ Object Also known as: cluster
193 194 195 196 197 198 199 200 201 202 203 |
# File 'lib/sclust/lda/lda.rb', line 193
#
# Runs the clustering passes.
#
# opts - Hash of options (left untouched, so a frozen or shared hash is safe):
#        :iterations - number of lda_once passes; falls back to @iterations
#                      when missing or nil.
#        :continue   - when truthy, skip lda_setup and keep iterating on the
#                      current state.
#
# Returns the number of iterations that were run.
def lda(opts = {})
  # Resolve the count into a local rather than writing the default back
  # into the caller's hash.
  iterations = opts[:iterations] || @iterations

  lda_setup unless opts[:continue]

  iterations.times { lda_once }
end
#lda_once ⇒ Object
Perform 1 phase of lda
137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 |
# File 'lib/sclust/lda/lda.rb', line 137
#
# Performs one pass of LDA: every word position is visited in the order
# given by each_radomized_word_index and may be reassigned to a new topic.
#
# For each position the weights p_of_z(topic, word) are computed for all
# topics, one topic is drawn with probability proportional to its weight,
# and, if it differs from the current assignment in @word2topic, the word,
# word-count and document counters are moved from the old topic to the new.
#
# Returns whatever each_radomized_word_index returns.
def lda_once
  each_radomized_word_index do |word_index|
    word = @wordlist[word_index]
    doc  = @word2doc[word_index]

    # Unnormalised distribution over topics for this word. The real total is
    # tracked because the weights are not guaranteed to sum to 1.0.
    zdist  = @topics.map { |topic| p_of_z(topic, word) }
    ztotal = zdist.inject(0.0) { |sum, z| sum + z }

    # Draw a point in [0, ztotal) and take the first topic whose cumulative
    # weight exceeds it. The result is kept in a variable of the enclosing
    # scope: a block parameter named like an outer local is block-local on
    # Ruby >= 1.9, so it cannot carry the choice out of the loop.
    r    = rand * ztotal
    zacc = 0.0
    new_index = zdist.index { |z| r < (zacc += z) }

    # Nothing selected (e.g. every weight is zero): pick uniformly.
    new_index ||= (rand * @topics.size).to_i

    old_index = @word2topic[word_index]

    # Skip if source and destination topic are the same.
    next if old_index == new_index

    previous_topic = @topics[old_index]
    topic          = @topics[new_index]

    # Remove the word from its previous topic.
    if previous_topic.words[word] > 0
      previous_topic.words[word] -= 1
      previous_topic.wordcount   -= 1
      previous_topic.docs[doc]   -= 1
      # Drop the document entry once no word of it is left in the topic.
      previous_topic.docs.delete(doc) if previous_topic.docs[doc] <= 0
    end

    # Make sure the counters exist in the destination topic.
    topic.words[word] ||= 0
    topic.docs[doc]   ||= 0

    # Add the word to the chosen topic.
    @word2topic[word_index] = new_index
    topic.words[word] += 1
    topic.wordcount   += 1
    topic.docs[doc]   += 1
  end
end
#lda_setup ⇒ Object
113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 |
# File 'lib/sclust/lda/lda.rb', line 113
#
# Prepares the state for a fresh LDA run.
#
# Sets @beta to 0.01 and @alpha to 50.0 / number of topics, rebuilds the
# randomized word index, resets @word2topic and @doc2topic, then assigns
# every word position to a randomly chosen topic and bumps that topic's
# word, word-count and document counters.
#
# Returns whatever each_radomized_word_index returns.
def lda_setup
  @beta  = 0.01
  @alpha = 50.0 / @topics.length

  build_randomized_index_into_words

  @word2topic = []
  @doc2topic  = []

  each_radomized_word_index do |word_index|
    topic_index = (@topics.size * rand).to_i
    chosen      = @topics[topic_index]
    word        = @wordlist[word_index]
    doc         = @word2doc[word_index]

    # Remember which topic this word position belongs to.
    @word2topic[word_index] = topic_index

    # Create the counters on first use.
    chosen.words[word] ||= 0
    chosen.docs[doc]   ||= 0

    chosen.words[word] += 1  # occurrences of this word in the topic
    chosen.wordcount   += 1  # total words in the topic
    chosen.docs[doc]   += 1  # words of this document in the topic
  end
end
#p_of_z(topic, word) ⇒ Object
Compute P(z = j | z_{-i}, w): the probability that topic j is the topic of the given word, given that word and the topic assignments of all other words.
100 101 102 103 104 105 106 107 |
# File 'lib/sclust/lda/lda.rb', line 100
#
# Computes the (unnormalised) weight used for P(z = j | z_-i, w): how
# strongly the given topic is favoured for the given word.
#
# topic - object exposing #words (word => count), #wordcount and #docs.
# word  - the word to score.
#
# Returns 0 when the topic has no occurrence of the word: either the word
# was never counted there, or its count has dropped back to 0. A stored 0
# is truthy in Ruby, so it needs an explicit check; letting it through
# would yield a negative weight. Otherwise returns the product of the
# word term and the document term below.
#
# NOTE(review): the denominators can reach zero or go negative for very
# small counts (e.g. topic.wordcount - count - 1); the formula is kept as
# written - confirm against the intended Gibbs-sampling update.
def p_of_z(topic, word)
  count = topic.words[word]
  return 0 unless count && count > 0

  word_term = (count - 1 + @beta) /
              (topic.wordcount - count - 1 + @beta * @wordlist.length)
  doc_term  = (topic.docs.size - 1 + @alpha) /
              (@doclist.size - 1 + @alpha * @topics.size)

  word_term * doc_term
end