Class: SClust::LDA2::LDA2

Inherits:

Object

Object
SClust::LDA2::LDA2

show all

Defined in: lib/sclust/lda/lda2.rb

Instance Attribute Summary collapse

#doclist ⇒ Object

Returns the value of attribute doclist.
#document_collection ⇒ Object readonly

Returns the value of attribute document_collection.
#iterations ⇒ Object

Returns the value of attribute iterations.
#logger ⇒ Object

Returns the value of attribute logger.
#topics ⇒ Object

Returns the value of attribute topics.

Instance Method Summary collapse

#<<(document) ⇒ Object

Add a document to the collection backing this cluster.
#build_randomized_index_into_words ⇒ Object

Build a wordlist index array.
#each_radomized_word_index(&call) ⇒ Object
#each_topic(&topicproc) ⇒ Object

Takes {|topic| … }.
#get_max_terms(n = 3) ⇒ Object

Returns a list of topic lists, each holding that topic's top words.
#get_top_words_for_topic(topic, n = 3) ⇒ Object

Returns a list of [ z, word ] entries for the topic, highest z first.
#initialize ⇒ LDA2 constructor

Documents may be added after LDA is created, unlike k-means clustering.
#lda(opts = {}) ⇒ Object (also: #cluster)
#lda_once ⇒ Object

Perform one pass of LDA.
#lda_setup ⇒ Object
#p_of_z(topic, word, doc = nil) ⇒ Object

Compute p(z_i|theta) * p(w|z_i,B).
#rebuild_document_collection ⇒ Object

If you edit the document collection behind the scenes, you need to run this to avoid terms with 0 showing up.

Constructor Details

#initialize ⇒ `LDA2`

Documents may be added after LDA is created, unlike k-means clustering.

# File 'lib/sclust/lda/lda2.rb', line 74

# Build an empty LDA clusterer. Documents are added afterwards with #<<.
#
# Defaults set here: 3 iterations and 10 topics (the latter through the
# topics= writer, which is defined outside this block).
def initialize
    @iterations = 3

    # @wordlist[i] is a word; @word2doc[i] is the document object that
    # produced @wordlist[i], so the two arrays always have the same size.
    @wordlist = []
    @word2doc = []
    @doclist  = []

    @logger = Log4r::Logger.new(self.class.to_s)
    @logger.add('default')

    # Three moving averages, all created with the same (0.05, 0.0) arguments.
    @topic_change_rate, @word_prob_avg, @doc_prob_avg =
        Array.new(3) { SClust::Util::WeightedMovingAverage.new(0.05, 0.0) }

    # Used for inverse document frequency values.
    @document_collection = SClust::Util::DocumentCollection.new

    self.topics = 10
end

Instance Attribute Details

#doclist ⇒ `Object`

Returns the value of attribute doclist.



70
71
72

# File 'lib/sclust/lda/lda2.rb', line 70

# Reader for @doclist, the array that #<< appends each added document to.
def doclist
  @doclist
end

#document_collection ⇒ `Object` (readonly)

Returns the value of attribute document_collection.



68
69
70

# File 'lib/sclust/lda/lda2.rb', line 68

# Reader for @document_collection, the SClust::Util::DocumentCollection that
# receives every document passed to #<< (the constructor notes it is used for
# inverse document frequency values).
def document_collection
  @document_collection
end

#iterations ⇒ `Object`

Returns the value of attribute iterations.



70
71
72

# File 'lib/sclust/lda/lda2.rb', line 70

# Reader for @iterations, the number of passes #lda runs when the caller
# does not supply opts[:iterations]. The constructor sets it to 3.
def iterations
  @iterations
end

#logger ⇒ `Object`

Returns the value of attribute logger.



70
71
72

# File 'lib/sclust/lda/lda2.rb', line 70

# Reader for @logger, the logger object this instance writes its
# info/debug messages to (a Log4r::Logger unless it has been replaced).
def logger
  @logger
end

#topics ⇒ `Object`

Returns the value of attribute topics.



70
71
72

# File 'lib/sclust/lda/lda2.rb', line 70

# Reader for @topics, the collection of topic objects that #each_topic
# iterates and that #lda_once / #lda_setup index by topic number.
def topics
  @topics
end

Instance Method Details

#<<(document) ⇒ `Object`

Add a document to the collection backing this cluster. This must be a SClust::Util::Document.

# File 'lib/sclust/lda/lda2.rb', line 104

# Add a document to the collection backing this cluster.
#
# The document is appended to @doclist and @document_collection, its words
# are appended to @wordlist, and @word2doc gains one reference to the
# document per word so that @word2doc[i] is the source of @wordlist[i].
#
# document:: object responding to #words (documented as a
#            SClust::Util::Document).
#
# Returns the number of words that were appended, as before.
def <<(document)
    @doclist << document

    @document_collection << document

    words = document.words

    # Append in place. The previous `@wordlist += ...` copied the entire
    # word list on every call, making bulk loading quadratic.
    @wordlist.concat(words)
    @word2doc.concat(Array.new(words.size, document))

    words.size
end

#build_randomized_index_into_words ⇒ `Object`

Build a wordlist index array. This is an array that contains indexes into @wordlist. However, instead of being simply 0,1,2,3… this array is randomized so that we index into @wordlist in a random order.

# File 'lib/sclust/lda/lda2.rb', line 140

# Build @randomized_word_index: every index into @wordlist exactly once,
# in random order, so words can be visited in a random sequence.
#
# Uses Array#shuffle, which produces an unbiased permutation. The earlier
# hand-rolled loop swapped each slot with a position drawn from the whole
# array, which does not give every ordering the same probability.
#
# Returns the new index array.
def build_randomized_index_into_words()
    @logger.info("Randomizing words.")

    @randomized_word_index = (0...@wordlist.length).to_a.shuffle
end

#each_radomized_word_index(&call) ⇒ `Object`



191
192
193

# File 'lib/sclust/lda/lda2.rb', line 191

# Yield each entry of @randomized_word_index (indexes into @wordlist, in
# the order stored by #build_randomized_index_into_words) to the block.
def each_radomized_word_index(&call)
    @randomized_word_index.each(&call)
end

#each_topic(&topicproc) ⇒ `Object`

Takes {|topic| … }



291
292
293

# File 'lib/sclust/lda/lda2.rb', line 291

# Yield every entry of @topics, in order, to the block: {|topic| ... }.
def each_topic(&topicproc)
    @topics.each(&topicproc)
end

#get_max_terms(n = 3) ⇒ `Object`

Returns a list of topic lists. Each topic list contains the top n words for that topic.

[ z, word, topic ], …

# File 'lib/sclust/lda/lda2.rb', line 316

# Collect the top n words of every topic.
#
# n:: how many words to keep per topic (default 3).
#
# Returns an array with one element per entry of @topics, in topic order;
# each element is whatever #get_top_words_for_topic returns for that topic.
def get_max_terms(n=3)
    @topics.map { |topic| get_top_words_for_topic(topic, n) }
end

#get_top_words_for_topic(topic, n = 3) ⇒ `Object`

Returns a list of [ z, word ] entries for the topic, highest z first.

# File 'lib/sclust/lda/lda2.rb', line 296

# Rank the words of one topic by p_of_z and keep the best n.
#
# topic:: topic object; its #words must respond to #keys.
# n::     number of entries to return (default 3).
#
# Returns an array of SClust::Util::Word objects, each built from the word,
# its p_of_z(topic, word) value and { :topic=>topic }, ordered by descending
# weight and cut to the first n.
def get_top_words_for_topic(topic, n = 3)
    scored = topic.words.keys.map do |term|
        SClust::Util::Word.new(term, p_of_z(topic, term), { :topic=>topic } )
    end

    # Operands are swapped on purpose so that larger weights sort first.
    ranked = scored.sort { |a, b| b.weight <=> a.weight }

    ranked[0...n]
end

#lda(opts = {}) ⇒ `Object` Also known as: cluster

# File 'lib/sclust/lda/lda2.rb', line 276

# Run LDA.
#
# opts:: option hash. Recognised keys:
#        :iterations - number of #lda_once passes; falls back to
#                      @iterations when missing or nil.
#        :continue   - when truthy, skip #lda_setup and keep refining the
#                      existing state.
#
# The caller's hash is only read. Earlier versions wrote the default
# iteration count back into it with `||=`.
#
# Returns the number of iterations that were run.
def lda(opts={})
    iterations = opts[:iterations] || @iterations

    unless opts[:continue]
        @logger.info("Setting up to run LDA.")
        lda_setup
    end

    iterations.times do |i|
        @logger.info { "LDA Iteration #{i+1} / #{iterations}"}
        lda_once
    end
end

#lda_once ⇒ `Object`

Perform one pass of LDA.

# File 'lib/sclust/lda/lda2.rb', line 217

# Perform one pass of LDA: visit every word occurrence in randomized order
# and re-sample the topic it is assigned to.
#
# For each word index the method
# 1. evaluates p_of_z for every topic (an unnormalised distribution),
# 2. draws a topic index in proportion to those values,
# 3. moves the word from its previous topic to the drawn one when they
#    differ, and feeds 1.0 (moved) or 0.0 (stayed) into @topic_change_rate.
#
# Two defects of the earlier version are fixed here:
# * the drawn index is now stored explicitly. The old loop used
#   `each_index do |topic_i|`; on Ruby 1.9+ a block parameter never writes
#   to an outer local, so the draw was discarded and the random fallback
#   was always used.
# * the summary line is logged through @logger rather than the global
#   $logger.
def lda_once()
    each_radomized_word_index do |random_word_index|

        random_word = @wordlist[random_word_index]
        doc         = @word2doc[random_word_index]

        # Distribution over z for this word. The values need not sum to 1.0,
        # so the actual total is tracked and used to scale the random draw.
        zdist  = @topics.map { |topic| p_of_z(topic, random_word, doc) }
        ztotal = zdist.reduce(0.0) { |sum, z| sum + z }

        r = rand * ztotal # Random value to pick topic with.

        # Fallback, kept only when the scan below selects nothing
        # (for example when every entry of zdist is 0.0).
        topic_i = (rand() * @topics.size).to_i

        # Walk the cumulative distribution until it passes r.
        zacc = 0.0
        zdist.each_with_index do |z, candidate|
            zacc += z
            if r < zacc
                topic_i = candidate
                break
            end
        end

        topic          = @topics[topic_i]
        previous_topic = @topics[@word2topic[random_word_index]]

        if @word2topic[random_word_index] == topic_i

            # Source and destination are the same: nothing to move.
            @topic_change_rate.adjust(0.0)

        else

            # The topic change rate is how convergence is tracked:
            # comparatively few moves means we are done.
            @topic_change_rate.adjust(1.0)

            # Remove word from previous topic.
            previous_topic.remove(random_word, doc) if previous_topic.has_word_and_doc?(random_word, doc)

            # Record the new assignment and add the word to the chosen topic.
            @word2topic[random_word_index] = topic_i
            topic.add(random_word, doc)

        end
    end

    @logger.info { "Topic change rate: #{@topic_change_rate.value} Doc% #{ @doc_prob_avg.value} Word% #{ @word_prob_avg.value}" }
end

#lda_setup ⇒ `Object`

# File 'lib/sclust/lda/lda2.rb', line 195

# Prepare state for a fresh LDA run.
#
# Sets the smoothing constants (@alpha = 1.0, @beta = 0.01), rebuilds the
# randomized word index, clears @word2topic / @doc2topic, assigns every word
# occurrence to a uniformly random topic, and sets the weight of
# @topic_change_rate to 1.0 / @wordlist.size.
def lda_setup
    @alpha = 1.0 #( @doclist.size / @topics.length ).to_f
    @beta  = 0.01

    build_randomized_index_into_words

    @doc2topic  = []
    @word2topic = []

    each_radomized_word_index do |word_index|
        chosen = (rand * @topics.size).to_i

        # Record the assignment, then hand the word and its document to the topic.
        @word2topic[word_index] = chosen
        @topics[chosen].add(@wordlist[word_index], @word2doc[word_index])
    end

    @topic_change_rate.weight = 1.0 / @wordlist.size
end

#p_of_z(topic, word, doc = nil) ⇒ `Object`

Compute p(z_i|theta) * p(w|z_i,B).

# File 'lib/sclust/lda/lda2.rb', line 160

# Compute p(z_i|theta) * p(w|z_i,B) for one topic.
#
# topic:: topic object exposing #words, #docs and #wordcount.
# word::  the word to score; looked up in topic.words.
# doc::   the document to score against. When nil, the per-document
#         counts in topic.docs are summed instead.
#
# Side effect: both factors are fed into @word_prob_avg and @doc_prob_avg.
#
# Returns the product of the document factor and the word factor. A factor
# that comes out NaN or negative is replaced by 0.0 first.
def p_of_z(topic, word, doc=nil)

    doc_count =
        if doc.nil?
            topic.docs.reduce(0.0) { |total, entry| total + entry[1] }
        else
            topic.docs[doc]
        end

    word_factor = (topic.words[word] - 1.0 + @beta)  / (topic.wordcount - 1.0 + @beta)
    doc_factor  = (doc_count - 1.0 + @alpha) / (topic.wordcount - 1.0 + @alpha)

    # Stop-gap protection for when the denominator gets wonky.
    doc_factor  = 0.0 if doc_factor.nan? || doc_factor < 0.0
    word_factor = 0.0 if word_factor.nan? || word_factor < 0.0

    @word_prob_avg.adjust(word_factor)
    @doc_prob_avg.adjust(doc_factor)

    # Alternate formula, with a different denominator for the doc factor:
    #((topic.words[word] - 1.0 + beta)  / (topic.wordcount - 1.0 + beta ) ) *
    #((topic.docs.size - 1.0 + alpha) / (@doclist.size - topic.docs.size - 1.0 + alpha ))

    doc_factor * word_factor
end

#rebuild_document_collection ⇒ `Object`

If you edit the document collection behind the scenes, you need to run this to avoid terms with 0 showing up.

# File 'lib/sclust/lda/lda2.rb', line 116

# Rebuild all per-document state from the documents currently held by
# @document_collection.
#
# Run this after editing the document collection behind the scenes, to
# avoid terms with 0 showing up. @doclist, @wordlist, @word2doc and
# @document_collection are replaced with fresh empty objects and every
# document from the old collection's #doclist is added again through #<<.
def rebuild_document_collection
    @logger.debug { "Collection now has #{@doclist.size} documents, #{@wordlist.size} words."}
    @logger.info("Rebuilding document collection and word list.")

    # Take the document list before the collection is replaced below.
    previous_documents = @document_collection.doclist

    @doclist, @wordlist, @word2doc = [], [], []
    @document_collection = SClust::Util::DocumentCollection.new

    previous_documents.each { |doc| self << doc }

    @logger.debug { "Collection now has #{@doclist.size} documents, #{@wordlist.size} words."}
end

Class: SClust::LDA2::LDA2

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ LDA2

Instance Attribute Details

#doclist ⇒ Object

#document_collection ⇒ Object (readonly)

#iterations ⇒ Object

#logger ⇒ Object

#topics ⇒ Object

Instance Method Details

#<<(document) ⇒ Object

#build_randomized_index_into_words ⇒ Object

#each_radomized_word_index(&call) ⇒ Object

#each_topic(&topicproc) ⇒ Object

#get_max_terms(n = 3) ⇒ Object

#get_top_words_for_topic(topic, n = 3) ⇒ Object

#lda(opts = {}) ⇒ Object Also known as: cluster

#lda_once ⇒ Object

#lda_setup ⇒ Object

#p_of_z(topic, word, doc = nil) ⇒ Object

#rebuild_document_collection ⇒ Object