Class: SClust::LDA::LDA

Inherits:

Object

Object
SClust::LDA::LDA

show all

Defined in:: lib/sclust/lda/lda.rb

Instance Attribute Summary collapse

#doclist ⇒ Object

Returns the value of attribute doclist.
#iterations ⇒ Object

Returns the value of attribute iterations.
#logger ⇒ Object

Returns the value of attribute logger.
#topics ⇒ Object

Returns the value of attribute topics.

Instance Method Summary collapse

#<<(document) ⇒ Object
#build_randomized_index_into_words ⇒ Object

Build a wordlist index array.
#each_radomized_word_index(&call) ⇒ Object
#each_topic(&topicproc) ⇒ Object

Takes {|topic| … }.
#get_max_terms(n = 3) ⇒ Object

Returns a list of topic lists.
#get_top_words_for_topic(topic, n = 3) ⇒ Object

Returns a list of the topic's top n words, each carrying its weight z, highest z first.
#initialize(docCol = nil) ⇒ LDA constructor

Documents may be added after the LDA object is created, unlike k-means clustering.
#lda(opts = {}) ⇒ Object (also: #cluster)
#lda_once ⇒ Object

Perform 1 phase of lda.
#lda_setup ⇒ Object
#p_of_z(topic, word) ⇒ Object

Compute $P(z_i = j \mid \mathbf{z}_{-i}, \mathbf{w})$.

Constructor Details

#initialize(docCol = nil) ⇒ `LDA`

Documents may be added after the LDA object is created, unlike k-means clustering.

# File 'lib/sclust/lda/lda.rb', line 49

# Create an empty model and optionally load an initial batch of documents.
#
# docCol:: optional collection responding to +each+; every element is
#          appended through #<<. When nil (the default) the model starts empty.
def initialize(docCol=nil)
    @iterations = 3
    @wordlist   = []
    @doclist    = []

    # Parallel to @wordlist: slot i holds the document that produced @wordlist[i].
    @word2doc   = []

    @logger     = Log4r::Logger.new('Clusterer')

    # Goes through the topics= writer rather than assigning @topics directly.
    self.topics = 10

    docCol.each { |doc| self << doc } if docCol
end

Instance Attribute Details

#doclist ⇒ `Object`

Returns the value of attribute doclist.



45
46
47

# File 'lib/sclust/lda/lda.rb', line 45

# Reader for @doclist, the array that #<< appends each document to.
def doclist; @doclist; end

#iterations ⇒ `Object`

Returns the value of attribute iterations.



45
46
47

# File 'lib/sclust/lda/lda.rb', line 45

# Reader for @iterations, the pass count #lda falls back to when the caller
# supplies no :iterations option.
def iterations; @iterations; end

#logger ⇒ `Object`

Returns the value of attribute logger.



45
46
47

# File 'lib/sclust/lda/lda.rb', line 45

# Reader for @logger, the logger object created by the constructor.
def logger; @logger; end

#topics ⇒ `Object`

Returns the value of attribute topics.



45
46
47

# File 'lib/sclust/lda/lda.rb', line 45

# Reader for @topics, the collection of topic objects the sampler iterates over.
def topics; @topics; end

Instance Method Details

#<<(document) ⇒ `Object`

# File 'lib/sclust/lda/lda.rb', line 67

# Append a document to the model.
#
# The document is recorded in @doclist, its words are appended to @wordlist,
# and @word2doc receives one reference to the document per word so that
# @wordlist[i] and @word2doc[i] always describe the same word occurrence.
#
# document:: object expected to respond to +words+ with an Array of terms.
#
# Returns the number of words the document contributed.
def <<(document)
    # Read the words once so @wordlist and @word2doc grow from the same snapshot.
    terms = document.words

    @doclist << document

    # Extend in place: rebuilding the list with `+` would copy every word
    # already stored each time a document is added.
    @wordlist.concat(terms)

    terms.length.times { @word2doc << document }
end

#build_randomized_index_into_words ⇒ `Object`

Build a wordlist index array. This is an array that contains indexes into @wordlist. However, instead of being simply 0,1,2,3… this array is randomized so that we index into @wordlist in a random order.

# File 'lib/sclust/lda/lda.rb', line 84

# Build @randomized_word_index: every valid index into @wordlist, exactly
# once, in random order.
#
# Array#shuffle is used instead of a hand-written swap loop because swapping
# each slot with a partner drawn from the whole array does not give every
# ordering the same probability; shuffle does. It draws from the default
# random generator, so Kernel#srand still makes runs reproducible.
#
# Returns the newly built index array (empty when @wordlist is empty).
def build_randomized_index_into_words()
    @randomized_word_index = (0...@wordlist.length).to_a.shuffle
end

#each_radomized_word_index(&call) ⇒ `Object`



109
110
111

# File 'lib/sclust/lda/lda.rb', line 109

# Hand every entry of @randomized_word_index, in stored order, to the block.
def each_radomized_word_index(&call)
    @randomized_word_index.each(&call)
end

#each_topic(&topicproc) ⇒ `Object`

Takes {|topic| … }



206
207
208

# File 'lib/sclust/lda/lda.rb', line 206

# Hand every element of @topics, in order, to the block: {|topic| ... }.
def each_topic(&topicproc)
    @topics.each(&topicproc)
end

#get_max_terms(n = 3) ⇒ `Object`

Returns a list of topic lists. Each topic list contains the top n words for that topic.

[ z, word, topic ], …

# File 'lib/sclust/lda/lda.rb', line 231

# Collect the top words of every topic.
#
# n:: how many words to keep per topic (default 3).
#
# Returns an Array with one entry per topic, in #each_topic order; each entry
# is whatever #get_top_words_for_topic returns for that topic.
def get_max_terms(n=3)
    per_topic = []

    each_topic do |topic|
        per_topic.push(get_top_words_for_topic(topic, n))
    end

    per_topic
end

#get_top_words_for_topic(topic, n = 3) ⇒ `Object`

Returns a list of the topic's top n words, each carrying its weight z, highest z first.

# File 'lib/sclust/lda/lda.rb', line 211

# Rank the words of one topic by their p_of_z weight.
#
# topic:: topic object whose +words+ is keyed by word.
# n::     maximum number of entries to return (default 3).
#
# Returns the first n SClust::Util::Word objects after sorting by descending
# weight; each was built from the word, its p_of_z value and { :topic=>topic }.
def get_top_words_for_topic(topic, n = 3)
    scored = topic.words.keys.map do |term|
        SClust::Util::Word.new(term, p_of_z(topic, term), { :topic=>topic } )
    end

    # Operands are swapped in the comparison so the heaviest word comes first.
    ranked = scored.sort { |a, b| b.weight <=> a.weight }

    ranked[0...n]
end

#lda(opts = {}) ⇒ `Object` Also known as: cluster

# File 'lib/sclust/lda/lda.rb', line 193

# Run the sampler.
#
# opts:: option hash, read but never modified:
#        :iterations - number of lda_once passes; falls back to @iterations
#                      when absent or nil.
#        :continue   - when truthy, skip lda_setup and keep sampling from the
#                      current state.
#
# Returns the number of passes performed.
def lda(opts={})
    # Resolve the pass count into a local so the caller's hash is left
    # untouched (it may be frozen or reused for another call).
    iterations = opts[:iterations] || @iterations

    lda_setup() unless opts[:continue]

    iterations.times { lda_once() }
end

#lda_once ⇒ `Object`

Perform 1 phase of lda

# File 'lib/sclust/lda/lda.rb', line 137

# Perform one pass of topic reassignment over every word occurrence.
#
# For each index delivered by each_radomized_word_index:
# 1. compute p_of_z for the word against every topic,
# 2. draw a topic with probability proportional to those weights,
# 3. if it differs from the word's current topic, move the word: decrement
#    the old topic's word/doc counters and increment the new topic's.
#
# Reads @wordlist, @word2doc, @topics; updates @word2topic and the topics'
# words, wordcount and docs counters.
def lda_once()
    each_radomized_word_index do |random_word_index|

        random_word = @wordlist[random_word_index]
        random_doc  = @word2doc[random_word_index]

        # Weight of each topic for this word, plus their real sum in case
        # the weights do not add up to exactly 1.0.
        zdist  = @topics.map { |topic| p_of_z(topic, random_word) }
        ztotal = zdist.inject(0.0) { |sum, z| sum + z }

        r    = rand * ztotal  # Random point inside the cumulative weight.
        zacc = 0.0            # Running sum of the weights seen so far.

        # Fallback used only if the cumulative scan below never passes r.
        topici = (rand() * @topics.size).to_i

        # Pick the first topic whose cumulative weight exceeds r. The result
        # is assigned to topici explicitly so that the weighted choice, not
        # the fallback, decides where the word goes.
        picked = zdist.index { |z| r < (zacc += z) }
        topici = picked unless picked.nil?

        previous_topici = @word2topic[random_word_index]

        # Skip if source and destination topic are the same.
        next if previous_topici == topici

        topic          = @topics[topici]
        previous_topic = @topics[previous_topici]

        # Remove the word from its previous topic.
        if previous_topic.words[random_word] > 0
            previous_topic.words[random_word] -= 1   # One occurrence fewer of this word.
            previous_topic.wordcount          -= 1   # Reduce the topic's total word count.
            previous_topic.docs[random_doc]   -= 1   # One occurrence fewer from this document.

            # Drop the document entry once nothing from it remains in the
            # topic, so docs.size only counts documents still present.
            previous_topic.docs.delete(random_doc) if previous_topic.docs[random_doc] <= 0
        end

        topic.words[random_word] ||= 0   # First time this word enters the topic.
        topic.docs[random_doc]   ||= 0   # First time this document enters the topic.

        # Add the word to the chosen topic.
        @word2topic[random_word_index] = topici
        topic.words[random_word] += 1
        topic.wordcount          += 1
        topic.docs[random_doc]   += 1
    end
end

#lda_setup ⇒ `Object`

# File 'lib/sclust/lda/lda.rb', line 113

# Prepare sampler state before the first lda_once pass.
#
# Sets @beta and @alpha, rebuilds the randomized word index, clears
# @word2topic and @doc2topic, then assigns every word occurrence to a randomly
# drawn topic and bumps that topic's words, wordcount and docs counters.
def lda_setup()
    @beta  = 0.01
    @alpha = 50.0 / @topics.length

    build_randomized_index_into_words()

    @word2topic = []
    @doc2topic  = []

    each_radomized_word_index do |word_index|
        topic_index = (@topics.size * rand).to_i
        chosen      = @topics[topic_index]
        word        = @wordlist[word_index]
        doc         = @word2doc[word_index]

        # Remember which topic this word occurrence was given to.
        @word2topic[word_index] = topic_index

        # A missing counter starts at zero before being incremented.
        chosen.words[word] = (chosen.words[word] || 0) + 1
        chosen.wordcount  += 1
        chosen.docs[doc]   = (chosen.docs[doc] || 0) + 1
    end
end

#p_of_z(topic, word) ⇒ `Object`

Compute $P(z_i = j \mid \mathbf{z}_{-i}, \mathbf{w})$: the probability that the given word occurrence belongs to topic j, given the topic assignments of all other words and the words themselves.

# File 'lib/sclust/lda/lda.rb', line 100

# Weight of assigning +word+ to +topic+.
#
# topic:: topic object exposing +words+ (per-word counts), +wordcount+ and
#         +docs+.
# word::  the word to score.
#
# Returns 0 when the topic holds no occurrence of the word; otherwise the
# product of a word term and a document term, each smoothed by @beta and
# @alpha respectively.
#
# NOTE(review): for very small inputs either denominator can still reach zero
# or go negative (e.g. a topic whose only word is this one) — confirm the
# intended smoothing.
def p_of_z(topic, word)
    count = topic.words[word]

    # A word the topic never held (nil) and a word whose counter has dropped
    # back to zero are the same situation; without this guard a zero count
    # would produce a negative weight.
    return 0 unless count && count > 0

    word_term = (count - 1 + @beta) / (topic.wordcount - count - 1 + @beta * @wordlist.length)
    doc_term  = (topic.docs.size - 1 + @alpha) / (@doclist.size - 1 + @alpha * @topics.size)

    word_term * doc_term
end

Class: SClust::LDA::LDA

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(docCol = nil) ⇒ LDA

Instance Attribute Details

#doclist ⇒ Object

#iterations ⇒ Object

#logger ⇒ Object

#topics ⇒ Object

Instance Method Details

#<<(document) ⇒ Object

#build_randomized_index_into_words ⇒ Object

#each_radomized_word_index(&call) ⇒ Object

#each_topic(&topicproc) ⇒ Object

#get_max_terms(n = 3) ⇒ Object

#get_top_words_for_topic(topic, n = 3) ⇒ Object

#lda(opts = {}) ⇒ Object Also known as: cluster

#lda_once ⇒ Object

#lda_setup ⇒ Object

#p_of_z(topic, word) ⇒ Object