Class: Lda::Lda

Inherits:

Object

Object
Lda::Lda

show all

Defined in: lib/lda-ruby.rb,
ext/lda-ruby/lda-inference.c

Instance Attribute Summary collapse

#corpus ⇒ Object readonly

Returns the value of attribute corpus.
#vocab ⇒ Object readonly

Returns the value of attribute vocab.

Instance Method Summary collapse

#beta ⇒ Object

Get the beta matrix after the model has been run.
#compute_phi ⇒ Object

Compute the phi values by running inference after the initial EM run has been completed.
#compute_topic_document_probability ⇒ Object

Compute the average log probability for each topic for each document in the corpus.
#convergence ⇒ Object

Get the convergence setting.
#convergence=(converged) ⇒ Object

Set the convergence setting.
#corpus=(rcorpus) ⇒ Object readonly

Set the corpus.
#em(start) ⇒ Object

Run the EM algorithm with the loaded corpus and using the current configuration settings.
#em_convergence ⇒ Object

Get the convergence value for EM.
#em_convergence=(em_converged) ⇒ Object

Set the convergence value for EM.
#em_max_iter ⇒ Object

Get the max iterations for the EM algorithm.
#em_max_iter=(em_max_iter) ⇒ Object

Set the max iterations for the EM algorithm.
#est_alpha ⇒ Object

Get the estimate alpha value (fixed = 0).
#est_alpha=(est_alpha) ⇒ Object

Set the estimate alpha value (fixed = 0).
#fast_load_corpus_from_file(filename) ⇒ Object

Load the corpus from the given file.
#gamma ⇒ Object

Get the gamma values after the model has been run.
#init_alpha ⇒ Object

Get the initial alpha value.
#init_alpha=(initial_alpha) ⇒ Object

Set the initial value of alpha.
#initialize(corpus) ⇒ Lda constructor

A new instance of Lda.
#load_corpus(filename) ⇒ Object
#load_default_settings ⇒ Object
#load_settings(settings_file) ⇒ Object

Load settings from the given file.
#load_vocabulary(vocab) ⇒ Object
#max_iter ⇒ Object

Get the maximum iterations.
#max_iter=(max_iter) ⇒ Object

Set the maximum iterations.
#model ⇒ Object

Get the settings used for the model.
#num_topics ⇒ Object

Get the number of topics being clustered.
#num_topics=(ntopics) ⇒ Object

Set the number of topics to be clustered.
#phi(recompute = false) ⇒ Object

Get the phi matrix which can be used to assign probabilities to words belonging to a specific topic in each document.
#print_topics(words_per_topic = 10) ⇒ Object

Visualization method for printing out the top words_per_topic words for each topic.
#set_config(init_alpha, num_topics, max_iter, convergence, em_max_iter, em_convergence, est_alpha) ⇒ Object

Set all of the settings in one command.
#to_s ⇒ Object

String representation displaying current settings.
#top_word_indices(words_per_topic = 10) ⇒ Object

After the model has been run and a vocabulary has been loaded, return the words_per_topic top words chosen by the model for each topic.
#top_words(words_per_topic = 10) ⇒ Object
#verbose ⇒ Object

Get the verbosity setting.
#verbose=(verbosity) ⇒ Object

Set the verbosity level (true, false).

Constructor Details

#initialize(corpus) ⇒ `Lda`

Returns a new instance of Lda.

# File 'lib/lda-ruby.rb', line 17

# Builds a model around +corpus+ after applying the default settings.
#
# The corpus is handed to +corpus=+; when it exposes a vocabulary, that
# vocabulary is copied into an array and kept as the model's word list.
#
# @param corpus [Object] must respond to +vocabulary+
def initialize(corpus)
  load_default_settings

  # Start clean: no word list and no cached phi matrix yet.
  @vocab = nil
  @phi = nil

  self.corpus = corpus

  if corpus.vocabulary
    @vocab = corpus.vocabulary.to_a
  end
end

Instance Attribute Details

#corpus ⇒ `Object` (readonly)

Returns the value of attribute corpus.



15
16
17

# File 'lib/lda-ruby.rb', line 15

# Plain reader for the @corpus instance variable.
#
# @return [Object] whatever is currently stored in @corpus
def corpus
  @corpus
end

#vocab ⇒ `Object` (readonly)

Returns the value of attribute vocab.



15
16
17

# File 'lib/lda-ruby.rb', line 15

# Plain reader for the @vocab instance variable.
#
# @return [Object] whatever is currently stored in @vocab
def vocab
  @vocab
end

Instance Method Details

#beta ⇒ `Object`

Get the beta matrix after the model has been run.

# File 'ext/lda-ruby/lda-inference.c', line 918

/*
 * Copies last_model->log_prob_w into nested Ruby arrays.
 *
 * Returns nil when no model is loaded; otherwise an array with one row per
 * topic, each row holding one Float per term (result[topic][term]).
 */
static VALUE wrap_get_model_beta(VALUE self) {
	int topic = 0;
	int term = 0;
	VALUE matrix;

	if (!model_loaded) {
		return Qnil;
	}

	matrix = rb_ary_new2(last_model->num_topics);

	for (topic = 0; topic < last_model->num_topics; topic++) {
		/* One row per topic, filled term by term before it is attached. */
		VALUE row = rb_ary_new2(last_model->num_terms);

		for (term = 0; term < last_model->num_terms; term++) {
			VALUE value = rb_float_new(last_model->log_prob_w[topic][term]);
			rb_ary_store(row, term, value);
		}

		rb_ary_store(matrix, topic, row);
	}

	return matrix;
}

#compute_phi ⇒ `Object`

Compute the phi values by running inference after the initial EM run has been completed.

Returns a 3D matrix: num_docs x length x num_topics.

# File 'ext/lda-ruby/lda-inference.c', line 882

/*
 * Runs lda_inference for every document of last_corpus and copies the phi
 * values it leaves in last_phi into nested Ruby arrays:
 * result[document][position][topic].
 *
 * Returns nil when no model is loaded.
 *
 * NOTE(review): the error flag passed to lda_inference is never inspected
 * here; confirm whether a failed inference should be reported to the caller.
 */
static VALUE wrap_get_phi(VALUE self) {
    int doc_idx = 0;
    int pos = 0;
    int topic = 0;
    short error = 0;
    VALUE result;

    if (!model_loaded) {
        return Qnil;
    }

    result = rb_ary_new2(last_corpus->num_docs);

    for (doc_idx = 0; doc_idx < last_corpus->num_docs; doc_idx++) {
        VALUE per_doc = rb_ary_new2(last_corpus->docs[doc_idx].length);

        /* last_phi is overwritten on every call, so copy it out right away. */
        lda_inference(&(last_corpus->docs[doc_idx]), last_model, last_gamma[doc_idx], last_phi, &error);

        for (pos = 0; pos < last_corpus->docs[doc_idx].length; pos++) {
            VALUE per_position = rb_ary_new2(last_model->num_topics);

            for (topic = 0; topic < last_model->num_topics; topic++) {
                VALUE value = rb_float_new(last_phi[pos][topic]);
                rb_ary_store(per_position, topic, value);
            }

            rb_ary_store(per_doc, pos, per_position);
        }

        rb_ary_store(result, doc_idx, per_doc);
    }

    return result;
}

#compute_topic_document_probability ⇒ `Object`

Compute the average log probability for each topic for each document in the corpus. This method returns a matrix: num_docs x num_topics with the average log probability for the topic in the document.

# File 'lib/lda-ruby.rb', line 134

# Computes, for every document, the count-weighted average of log(phi) per topic.
#
# For each document the log of every phi entry is multiplied by the count at
# the same word position, summed per topic, and divided by the sum of the
# document's counts.
#
# @return [Array<Array<Float>>] one row per document, one entry per topic
def compute_topic_document_probability
  @corpus.documents.each_with_index.map do |document, doc_idx|
    log_sums = Array.new(num_topics, 0.0)
    total_count = document.counts.inject(0.0) { |acc, count| acc + count }

    phi[doc_idx].each_with_index do |topic_probs, word_idx|
      topic_probs.each_with_index do |prob, topic_idx|
        log_sums[topic_idx] += Math.log(prob) * document.counts[word_idx]
      end
    end

    log_sums.map { |log_sum| log_sum / total_count }
  end
end

#convergence ⇒ `Object`

Get the convergence setting.



649
650
651

# File 'ext/lda-ruby/lda-inference.c', line 649

/*
 * Returns the current value of the VAR_CONVERGED global wrapped as a Ruby Float.
 */
static VALUE wrap_get_converged(VALUE self) {
	return rb_float_new(VAR_CONVERGED);
}

#convergence=(converged) ⇒ `Object`

Set the convergence setting.

# File 'ext/lda-ruby/lda-inference.c', line 656

/*
 * Stores a new value in the VAR_CONVERGED global.
 *
 * The Ruby number is converted to a C double and then narrowed to float
 * before it is stored. Returns the argument unchanged.
 */
static VALUE wrap_set_converged(VALUE self, VALUE converged) {
	const float threshold = (float)NUM2DBL(converged);

	VAR_CONVERGED = threshold;
	return converged;
}

#corpus=(rcorpus) ⇒ `Object` (readonly)

Set the corpus.

# File 'ext/lda-ruby/lda-inference.c', line 814

static VALUE wrap_ruby_corpus(VALUE self, VALUE rcorpus) {
	corpus* c;
	int i = 0;
	int j = 0;

	c = malloc(sizeof(corpus));
	c->num_terms = NUM2INT(rb_iv_get(rcorpus, "@num_terms"));
	c->num_docs = NUM2INT(rb_iv_get(rcorpus, "@num_docs"));
	c->docs = (document*) malloc(sizeof(document) * c->num_docs);
	VALUE doc_ary = rb_iv_get(rcorpus, "@documents");
	for (i = 0; i < c->num_docs; i++) {
		VALUE one_doc = rb_ary_entry(doc_ary, i);
		VALUE words = rb_iv_get(one_doc, "@words");
		VALUE counts = rb_iv_get(one_doc, "@counts");

		c->docs[i].length = NUM2INT(rb_iv_get(one_doc, "@length"));
		c->docs[i].total = NUM2INT(rb_iv_get(one_doc, "@total"));
		c->docs[i].words = malloc(sizeof(int) * c->docs[i].length);
		c->docs[i].counts = malloc(sizeof(int) * c->docs[i].length);
		for (j = 0; j < c->docs[i].length; j++) {
			int one_word = NUM2INT(rb_ary_entry(words, j));
			int one_count = NUM2INT(rb_ary_entry(counts, j));
      if( one_word > c->num_terms ) {
        rb_raise(rb_eRuntimeError, "error term count(%d) less than word index(%d)", c->num_terms, one_word);
      }
			c->docs[i].words[j] = one_word;
			c->docs[i].counts[j] = one_count;
		}
	}

	last_corpus = c;
	corpus_loaded = TRUE;

	rb_iv_set(self, "@corpus", rcorpus);

	return Qtrue;
}

#em(start) ⇒ `Object`

Run the EM algorithm with the loaded corpus and using the current configuration settings. The start parameter can take the following values:

* random - starting alpha are randomized
* seeded - loaded based on the corpus values
* <filename> - path to the file containing the model

# File 'ext/lda-ruby/lda-inference.c', line 777

/*
 * Runs run_quiet_em on the active corpus.
 *
 * start is passed through as a C string; the documented values are "random",
 * "seeded" or a path to a model file.
 *
 * Returns nil in every case; when no corpus has been loaded nothing is run.
 * Raises TypeError when start is not string-like and ArgumentError when it
 * contains an embedded NUL byte.
 */
static VALUE wrap_em(VALUE self, VALUE start) {
	if (!corpus_loaded)
		return Qnil;

	/* StringValueCStr guarantees a NUL-terminated buffer; StringValuePtr does not. */
	run_quiet_em(StringValueCStr(start), last_corpus);

	return Qnil;
}

#em_convergence ⇒ `Object`

Get the convergence value for EM.



681
682
683

# File 'ext/lda-ruby/lda-inference.c', line 681

/*
 * Returns the current value of the EM_CONVERGED global wrapped as a Ruby Float.
 */
static VALUE wrap_get_em_converged(VALUE self) {
	return rb_float_new(EM_CONVERGED);
}

#em_convergence=(em_converged) ⇒ `Object`

Set the convergence value for EM.

# File 'ext/lda-ruby/lda-inference.c', line 688

/*
 * Stores a new value in the EM_CONVERGED global.
 *
 * The Ruby number is converted to a C double and then narrowed to float
 * before it is stored. Returns the argument unchanged.
 */
static VALUE wrap_set_em_converged(VALUE self, VALUE em_converged) {
	const float threshold = (float)NUM2DBL(em_converged);

	EM_CONVERGED = threshold;
	return em_converged;
}

#em_max_iter ⇒ `Object`

Get the max iterations for the EM algorithm.



665
666
667

# File 'ext/lda-ruby/lda-inference.c', line 665

/*
 * Returns the current value of the EM_MAX_ITER global wrapped as a Ruby Integer.
 */
static VALUE wrap_get_em_max_iter(VALUE self) {
	return rb_int_new(EM_MAX_ITER);
}

#em_max_iter=(em_max_iter) ⇒ `Object`

Set the max iterations for the EM algorithm.

# File 'ext/lda-ruby/lda-inference.c', line 672

/*
 * Stores a new value in the EM_MAX_ITER global after converting the Ruby
 * number to a C int. Returns the argument unchanged.
 */
static VALUE wrap_set_em_max_iter(VALUE self, VALUE em_max_iter) {
	const int limit = NUM2INT(em_max_iter);

	EM_MAX_ITER = limit;
	return em_max_iter;
}

#est_alpha ⇒ `Object`

Get the estimate alpha value (fixed = 0).



729
730
731

# File 'ext/lda-ruby/lda-inference.c', line 729

/*
 * Returns the current value of the ESTIMATE_ALPHA global wrapped as a Ruby Integer.
 */
static VALUE wrap_get_estimate_alpha(VALUE self) {
	return rb_int_new(ESTIMATE_ALPHA);
}

#est_alpha=(est_alpha) ⇒ `Object`

Set the estimate alpha value (fixed = 0).

# File 'ext/lda-ruby/lda-inference.c', line 736

/*
 * Stores a new value in the ESTIMATE_ALPHA global after converting the Ruby
 * number to a C int. Returns the argument unchanged.
 */
static VALUE wrap_set_estimate_alpha(VALUE self, VALUE est_alpha) {
	const int flag = NUM2INT(est_alpha);

	ESTIMATE_ALPHA = flag;
	return est_alpha;
}

#fast_load_corpus_from_file(filename) ⇒ `Object`

Load the corpus from the given file. This will not create a Corpus object that is accessible, but it will load the corpus much faster.

# File 'ext/lda-ruby/lda-inference.c', line 801

/*
 * Loads the corpus from filename via read_data and makes it the active corpus.
 *
 * When a corpus is already loaded (corpus_loaded is set) the file is NOT read
 * and the existing corpus is kept; true is returned in both cases.
 *
 * Raises TypeError when filename is not string-like and ArgumentError when it
 * contains an embedded NUL byte.
 */
static VALUE wrap_load_corpus(VALUE self, VALUE filename) {
	if (!corpus_loaded) {
		/* StringValueCStr guarantees a NUL-terminated path; StringValuePtr does not. */
		last_corpus = read_data(StringValueCStr(filename));
		corpus_loaded = TRUE;
	}

	return Qtrue;
}

#gamma ⇒ `Object`

Get the gamma values after the model has been run.

# File 'ext/lda-ruby/lda-inference.c', line 856

/*
 * Copies last_gamma into nested Ruby arrays.
 *
 * Returns nil when no model is loaded; otherwise an array with one row per
 * document of last_corpus, each row holding one Float per topic
 * (result[document][topic]).
 */
static VALUE wrap_get_gamma(VALUE self) {
	int doc_idx = 0;
	int topic = 0;
	VALUE result;

	if (!model_loaded) {
		return Qnil;
	}

	result = rb_ary_new2(last_corpus->num_docs);

	for (doc_idx = 0; doc_idx < last_corpus->num_docs; doc_idx++) {
		/* One row per document, filled topic by topic before it is attached. */
		VALUE row = rb_ary_new2(last_model->num_topics);

		for (topic = 0; topic < last_model->num_topics; topic++) {
			VALUE value = rb_float_new(last_gamma[doc_idx][topic]);
			rb_ary_store(row, topic, value);
		}

		rb_ary_store(result, doc_idx, row);
	}

	return result;
}

#init_alpha ⇒ `Object`

Get the initial alpha value.



697
698
699

# File 'ext/lda-ruby/lda-inference.c', line 697

/*
 * Returns the current value of the INITIAL_ALPHA global wrapped as a Ruby Float.
 */
static VALUE wrap_get_initial_alpha(VALUE self) {
	return rb_float_new(INITIAL_ALPHA);
}

#init_alpha=(initial_alpha) ⇒ `Object`

Set the initial value of alpha.

# File 'ext/lda-ruby/lda-inference.c', line 711

/*
 * Stores a new value in the INITIAL_ALPHA global.
 *
 * The Ruby number is converted to a C double and then narrowed to float
 * before it is stored. Returns the argument unchanged.
 *
 * NOTE(review): wrap_set_config assigns INITIAL_ALPHA without the float
 * cast; confirm which precision is intended.
 */
static VALUE wrap_set_initial_alpha(VALUE self, VALUE initial_alpha) {
	const float alpha = (float)NUM2DBL(initial_alpha);

	INITIAL_ALPHA = alpha;
	return initial_alpha;
}

#load_corpus(filename) ⇒ `Object`

# File 'lib/lda-ruby.rb', line 39

# Loads a corpus from +filename+ and makes it the model's active corpus.
#
# The corpus is installed through +corpus=+ (the same path +initialize+ uses)
# rather than by assigning @corpus directly, so the setter gets to process the
# newly loaded corpus. The cached phi matrix is dropped because it belongs to
# the previous corpus.
#
# @param filename [String] path handed to Corpus#load_from_file
# @return [true]
def load_corpus(filename)
  loaded = Corpus.new
  loaded.load_from_file(filename)

  self.corpus = loaded
  @phi = nil

  true
end

#load_default_settings ⇒ `Object`

# File 'lib/lda-ruby.rb', line 27

# Resets every tunable setting to its default value.
#
# The setters are invoked in this order: max_iter, convergence, em_max_iter,
# em_convergence, num_topics, init_alpha, est_alpha.
#
# @return [Array] the default values, in the order listed above
def load_default_settings
  defaults = [20, 1e-6, 100, 1e-4, 20, 0.3, 1]

  self.max_iter, self.convergence, self.em_max_iter, self.em_convergence,
    self.num_topics, self.init_alpha, self.est_alpha = defaults

  defaults
end

#load_settings(settings_file) ⇒ `Object`

Load settings from the given file.

# File 'ext/lda-ruby/lda-inference.c', line 790

/*
 * Passes settings_file to read_settings as a C string and returns true.
 *
 * Raises TypeError when settings_file is not string-like and ArgumentError
 * when it contains an embedded NUL byte.
 */
static VALUE wrap_load_settings(VALUE self, VALUE settings_file) {
	/* StringValueCStr guarantees a NUL-terminated path; StringValuePtr does not. */
	read_settings(StringValueCStr(settings_file));

	return Qtrue;
}

#load_vocabulary(vocab) ⇒ `Object`

# File 'lib/lda-ruby.rb', line 46

# Replaces the model's word list.
#
# Accepts one of:
# * an Array       - deep-copied, so later changes to the caller's array or
#                    its strings do not leak into the model
# * a Vocabulary   - converted with +to_a+
# * anything else  - treated as a file path; the file is read and split on
#                    runs of whitespace
#
# Leading whitespace in the file is ignored. Splitting on /\s+/ would produce
# an empty first entry in that case and shift every word index by one.
#
# @param vocab [Array, Vocabulary, String]
# @return [true]
def load_vocabulary(vocab)
  @vocab =
    if vocab.is_a?(Array)
      Marshal.load(Marshal.dump(vocab)) # deep clone of our own dump
    elsif vocab.is_a?(Vocabulary)
      vocab.to_a
    else
      File.read(vocab).split(' ')
    end

  true
end

#max_iter ⇒ `Object`

Get the maximum iterations.



633
634
635

# File 'ext/lda-ruby/lda-inference.c', line 633

/*
 * Returns the current value of the VAR_MAX_ITER global wrapped as a Ruby Integer.
 */
static VALUE wrap_get_max_iter(VALUE self) {
	return rb_int_new(VAR_MAX_ITER);
}

#max_iter=(max_iter) ⇒ `Object`

Set the maximum iterations.

# File 'ext/lda-ruby/lda-inference.c', line 640

/*
 * Stores a new value in the VAR_MAX_ITER global after converting the Ruby
 * number to a C int. Returns the argument unchanged.
 */
static VALUE wrap_set_max_iter(VALUE self, VALUE max_iter) {
	const int limit = NUM2INT(max_iter);

	VAR_MAX_ITER = limit;
	return max_iter;
}

#model ⇒ `Object`

Get the settings used for the model.

# File 'ext/lda-ruby/lda-inference.c', line 942

/*
 * Reports the settings stored in last_model.
 *
 * Returns nil when no model is loaded; otherwise a three-element array laid
 * out as [num_topics, num_terms, alpha].
 */
static VALUE wrap_get_model_settings(VALUE self) {
	VALUE settings;

	if (!model_loaded) {
		return Qnil;
	}

	settings = rb_ary_new();

	/* Order matters: callers index into [num_topics, num_terms, alpha]. */
	rb_ary_push(settings, rb_int_new(last_model->num_topics));
	rb_ary_push(settings, rb_int_new(last_model->num_terms));
	rb_ary_push(settings, rb_float_new(last_model->alpha));

	return settings;
}

#num_topics ⇒ `Object`

Get the number of topics being clustered.



704
705
706

# File 'ext/lda-ruby/lda-inference.c', line 704

/*
 * Returns the current value of the NTOPICS global wrapped as a Ruby Integer.
 */
static VALUE wrap_get_num_topics(VALUE self) {
	return rb_int_new(NTOPICS);
}

#num_topics=(ntopics) ⇒ `Object`

Set the number of topics to be clustered.

# File 'ext/lda-ruby/lda-inference.c', line 720

/*
 * Stores a new value in the NTOPICS global after converting the Ruby number
 * to a C int.
 *
 * Raises RuntimeError when the value is not greater than 0; NTOPICS is left
 * untouched in that case. Returns the argument unchanged.
 */
static VALUE wrap_set_num_topics(VALUE self, VALUE ntopics) {
	int requested = NUM2INT(ntopics);

	/* Validate before assigning so a rejected value never reaches the global. */
	if (requested <= 0) {
		rb_raise(rb_eRuntimeError, "NTOPICS must be greater than 0 - %d", requested);
	}

	NTOPICS = requested;

	return ntopics;
}

#phi(recompute = false) ⇒ `Object`

Get the phi matrix which can be used to assign probabilities to words belonging to a specific topic in each document. The return value is a 3D matrix: num_docs x doc_length x num_topics. The value is cached after the first call, so if it needs to be recomputed, set the recompute value to true.

# File 'lib/lda-ruby.rb', line 121

# Returns the phi matrix, computing it on demand.
#
# The result of +compute_phi+ is cached in @phi. It is computed again when no
# value is cached yet (including a previous nil result) or when +recompute+
# is truthy.
#
# @param recompute [Boolean] force a fresh +compute_phi+ call
# @return [Object] the cached result of +compute_phi+
def phi(recompute=false)
  @phi = compute_phi if recompute || @phi.nil?
  @phi
end

#print_topics(words_per_topic = 10) ⇒ `Object`

Visualization method for printing out the top words_per_topic words for each topic.

#set_config(init_alpha, num_topics, max_iter, convergence, em_max_iter, em_convergence, est_alpha) ⇒ `Object`

Set all of the settings in one command:

* init_alpha
* num_topics
* max_iter
* convergence
* em_max_iter
* em_convergence
* est_alpha

# File 'ext/lda-ruby/lda-inference.c', line 617

/*
 * Sets all tunable globals in one call: INITIAL_ALPHA, NTOPICS, VAR_MAX_ITER,
 * VAR_CONVERGED, EM_MAX_ITER, EM_CONVERGED and ESTIMATE_ALPHA.
 *
 * Every argument is converted and validated before any global is written, so
 * a conversion error or an invalid topic count leaves the current
 * configuration unchanged.
 *
 * Raises RuntimeError when num_topics is not greater than 0.
 * Returns true.
 */
static VALUE wrap_set_config(VALUE self, VALUE init_alpha, VALUE num_topics, VALUE max_iter, VALUE convergence, VALUE em_max_iter, VALUE em_convergence, VALUE est_alpha) {
	double new_initial_alpha = NUM2DBL(init_alpha);
	int new_num_topics = NUM2INT(num_topics);
	int new_var_max_iter = NUM2INT(max_iter);
	float new_var_converged = (float)NUM2DBL(convergence);
	int new_em_max_iter = NUM2INT(em_max_iter);
	float new_em_converged = (float)NUM2DBL(em_convergence);
	int new_estimate_alpha = NUM2INT(est_alpha);

	/* Zero topics is as invalid as a negative count, matching the message. */
	if (new_num_topics <= 0) {
		rb_raise(rb_eRuntimeError, "NTOPICS must be greater than 0 - %d", new_num_topics);
	}

	INITIAL_ALPHA = new_initial_alpha;
	NTOPICS = new_num_topics;
	VAR_MAX_ITER = new_var_max_iter;
	VAR_CONVERGED = new_var_converged;
	EM_MAX_ITER = new_em_max_iter;
	EM_CONVERGED = new_em_converged;
	ESTIMATE_ALPHA = new_estimate_alpha;

	return Qtrue;
}

#to_s ⇒ `Object`

String representation displaying current settings.

# File 'lib/lda-ruby.rb', line 155

# Renders the current settings as a multi-line, human-readable string.
#
# @return [String] a header line followed by one line per setting, joined
#   with newlines (no trailing newline)
def to_s
  [
    "LDA Settings:",
    format("    Initial alpha: %0.6f", init_alpha),
    format("      # of topics: %d", num_topics),
    format("   Max iterations: %d", max_iter),
    format("      Convergence: %0.6f", convergence),
    format("EM max iterations: %d", em_max_iter),
    format("   EM convergence: %0.6f", em_convergence),
    format("   Estimate alpha: %d", est_alpha)
  ].join("\n")
end

#top_word_indices(words_per_topic = 10) ⇒ `Object`

After the model has been run and a vocabulary has been loaded, return the words_per_topic top words chosen by the model for each topic. This is returned as a hash mapping the topic number to an array of top words (in descending order of importance).

topic_number => [w1, w2, ..., w_n]

#top_words(words_per_topic = 10) ⇒ `Object`

# File 'lib/lda-ruby.rb', line 103

# Maps the result of +top_word_indices+ to vocabulary entries.
#
# Every index returned for a topic is looked up in @vocab.
#
# @param words_per_topic [Integer] forwarded to +top_word_indices+
# @return [Hash] topic number => array of @vocab entries
def top_words(words_per_topic = 10)
  top_word_indices(words_per_topic).each_with_object({}) do |(topic_num, indices), by_topic|
    by_topic[topic_num] = indices.map { |index| @vocab[index] }
  end
end

#verbose ⇒ `Object`

Get the verbosity setting.

# File 'ext/lda-ruby/lda-inference.c', line 745

/*
 * Reports the VERBOSE flag as a Ruby boolean: true when VERBOSE is non-zero,
 * false otherwise.
 */
static VALUE wrap_get_verbosity(VALUE self) {
    return VERBOSE ? Qtrue : Qfalse;
}

#verbose=(verbosity) ⇒ `Object`

Set the verbosity level (true, false).

# File 'ext/lda-ruby/lda-inference.c', line 757

/*
 * Sets the VERBOSE flag.
 *
 * Only the Ruby value true switches verbosity on; every other value
 * (including other truthy objects) switches it off.
 * Returns the argument unchanged.
 */
static VALUE wrap_set_verbosity(VALUE self, VALUE verbosity) {
    VERBOSE = (verbosity == Qtrue) ? TRUE : FALSE;

    return verbosity;
}

Class: Lda::Lda

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(corpus) ⇒ Lda

Instance Attribute Details

#corpus ⇒ Object (readonly)

#vocab ⇒ Object (readonly)

Instance Method Details

#beta ⇒ Object

#compute_phi ⇒ Object

#compute_topic_document_probability ⇒ Object

#convergence ⇒ Object

#convergence=(converged) ⇒ Object

#corpus=(rcorpus) ⇒ Object (readonly)

#em(start) ⇒ Object

#em_convergence ⇒ Object

#em_convergence=(em_converged) ⇒ Object

#em_max_iter ⇒ Object

#em_max_iter=(em_max_iter) ⇒ Object

#est_alpha ⇒ Object

#est_alpha=(est_alpha) ⇒ Object

#fast_load_corpus_from_file(filename) ⇒ Object

#gamma ⇒ Object

#init_alpha ⇒ Object

#init_alpha=(initial_alpha) ⇒ Object

#load_corpus(filename) ⇒ Object

#load_default_settings ⇒ Object

#load_settings(settings_file) ⇒ Object

#load_vocabulary(vocab) ⇒ Object

#max_iter ⇒ Object

#max_iter=(max_iter) ⇒ Object

#model ⇒ Object

#num_topics ⇒ Object

#num_topics=(ntopics) ⇒ Object

#phi(recompute = false) ⇒ Object

#print_topics(words_per_topic = 10) ⇒ Object

#set_config(init_alpha, num_topics, max_iter, convergence, em_max_iter, em_convergence, est_alpha) ⇒ Object

#to_s ⇒ Object

#top_word_indices(words_per_topic = 10) ⇒ Object

#top_words(words_per_topic = 10) ⇒ Object

#verbose ⇒ Object

#verbose=(verbosity) ⇒ Object