Class: Lda::Lda

Inherits:
Object
  • Object
show all
Defined in:
lib/lda-ruby.rb,
ext/lda-ruby/lda-inference.c

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(corpus) ⇒ Lda

Returns a new instance of Lda.



17
18
19
20
21
22
23
24
25
# File 'lib/lda-ruby.rb', line 17

# Build an Lda instance around +corpus+.
#
# Applies the default settings, hands the corpus to the +corpus=+ writer, and
# copies the corpus vocabulary (as an array) into @vocab when the corpus
# provides one. The cached phi matrix starts out empty.
def initialize(corpus)
  load_default_settings

  @phi = nil
  @vocab = nil
  self.corpus = corpus

  if corpus.vocabulary
    @vocab = corpus.vocabulary.to_a
  end
end

Instance Attribute Details

#corpusObject (readonly)

Returns the value of attribute corpus.



15
16
17
# File 'lib/lda-ruby.rb', line 15

# Reader for the corpus object currently stored in @corpus (nil when unset).
def corpus; @corpus; end

#vocabObject (readonly)

Returns the value of attribute vocab.



15
16
17
# File 'lib/lda-ruby.rb', line 15

# Reader for the vocabulary currently stored in @vocab (nil when unset).
def vocab; @vocab; end

Instance Method Details

#betaObject

Get the beta matrix after the model has been run.



918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
# File 'ext/lda-ruby/lda-inference.c', line 918

/*
 * Copy last_model->log_prob_w into a Ruby array of arrays.
 *
 * Returns nil when no model has been loaded; otherwise an array with
 * num_topics rows, each row holding num_terms Ruby floats.
 */
static VALUE wrap_get_model_beta(VALUE self) {
	VALUE topics;
	int topic, term;

	if (!model_loaded)
		return Qnil;

	// log_prob_w is a double[num_topics][num_terms]
	topics = rb_ary_new2(last_model->num_topics);
	for (topic = 0; topic < last_model->num_topics; topic++) {
		VALUE row = rb_ary_new2(last_model->num_terms);

		for (term = 0; term < last_model->num_terms; term++)
			rb_ary_store(row, term, rb_float_new(last_model->log_prob_w[topic][term]));

		rb_ary_store(topics, topic, row);
	}

	return topics;
}

#compute_phiObject

Compute the phi values by running inference after the initial EM run has been completed.

Returns a 3D matrix: num_docs x length x num_topics.



882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
# File 'ext/lda-ruby/lda-inference.c', line 882

/*
 * Run lda_inference on every document of last_corpus and collect the
 * contents of last_phi after each run.
 *
 * Returns nil when no model has been loaded; otherwise a nested Ruby array
 * indexed as [document][word position][topic].
 *
 * NOTE(review): the error flag written by lda_inference is never inspected,
 * and last_phi is presumably sized for the longest document -- confirm.
 */
static VALUE wrap_get_phi(VALUE self) {
    VALUE result;
    int doc_idx, word_idx, topic_idx;
    short error = 0;

    if (!model_loaded)
        return Qnil;

    result = rb_ary_new2(last_corpus->num_docs);

    for (doc_idx = 0; doc_idx < last_corpus->num_docs; doc_idx++) {
        document* doc = &(last_corpus->docs[doc_idx]);
        VALUE doc_rows = rb_ary_new2(doc->length);

        /* last_phi is shared scratch space, so it is copied out before the
         * next document overwrites it. */
        lda_inference(doc, last_model, last_gamma[doc_idx], last_phi, &error);

        for (word_idx = 0; word_idx < doc->length; word_idx++) {
            VALUE topic_row = rb_ary_new2(last_model->num_topics);

            for (topic_idx = 0; topic_idx < last_model->num_topics; topic_idx++)
                rb_ary_store(topic_row, topic_idx, rb_float_new(last_phi[word_idx][topic_idx]));

            rb_ary_store(doc_rows, word_idx, topic_row);
        }

        rb_ary_store(result, doc_idx, doc_rows);
    }

    return result;
}

#compute_topic_document_probabilityObject

Compute the average log probability for each topic for each document in the corpus. This method returns a matrix: num_docs x num_topics with the average log probability for the topic in the document.



134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# File 'lib/lda-ruby.rb', line 134

# For every document, sum log(phi) weighted by the word count for each topic
# and divide by the document's total count.
#
# Returns an array with one row per document; each row has num_topics floats.
def compute_topic_document_probability
  @corpus.documents.each_with_index.map do |doc, doc_idx|
    weighted_logs = Array.new(self.num_topics, 0.0)
    total_count = doc.counts.inject(0.0) { |acc, count| acc + count }

    self.phi[doc_idx].each_with_index do |topic_probs, word_idx|
      topic_probs.each_with_index do |prob, topic_idx|
        weighted_logs[topic_idx] += Math.log(prob) * doc.counts[word_idx]
      end
    end

    weighted_logs.map { |weighted| weighted / total_count }
  end
end

#convergenceObject

Get the convergence setting.



649
650
651
# File 'ext/lda-ruby/lda-inference.c', line 649

/* Getter for the convergence setting: wraps VAR_CONVERGED in a Ruby Float. */
static VALUE wrap_get_converged(VALUE self) {
	VALUE converged = rb_float_new(VAR_CONVERGED);

	return converged;
}

#convergence=(converged) ⇒ Object

Set the convergence setting.



656
657
658
659
660
# File 'ext/lda-ruby/lda-inference.c', line 656

/*
 * Setter for the convergence setting.
 *
 * Converts the Ruby numeric to a double, narrows it to float, and stores it
 * in VAR_CONVERGED. Returns the argument unchanged.
 */
static VALUE wrap_set_converged(VALUE self, VALUE converged) {
	double requested = NUM2DBL(converged);

	VAR_CONVERGED = (float)requested;
	return converged;
}

#corpus=(rcorpus) ⇒ Object (readonly)

Set the corpus.



814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
# File 'ext/lda-ruby/lda-inference.c', line 814

static VALUE wrap_ruby_corpus(VALUE self, VALUE rcorpus) {
	corpus* c;
	int i = 0;
	int j = 0;

	c = malloc(sizeof(corpus));
	c->num_terms = NUM2INT(rb_iv_get(rcorpus, "@num_terms"));
	c->num_docs = NUM2INT(rb_iv_get(rcorpus, "@num_docs"));
	c->docs = (document*) malloc(sizeof(document) * c->num_docs);
	VALUE doc_ary = rb_iv_get(rcorpus, "@documents");
	for (i = 0; i < c->num_docs; i++) {
		VALUE one_doc = rb_ary_entry(doc_ary, i);
		VALUE words = rb_iv_get(one_doc, "@words");
		VALUE counts = rb_iv_get(one_doc, "@counts");

		c->docs[i].length = NUM2INT(rb_iv_get(one_doc, "@length"));
		c->docs[i].total = NUM2INT(rb_iv_get(one_doc, "@total"));
		c->docs[i].words = malloc(sizeof(int) * c->docs[i].length);
		c->docs[i].counts = malloc(sizeof(int) * c->docs[i].length);
		for (j = 0; j < c->docs[i].length; j++) {
			int one_word = NUM2INT(rb_ary_entry(words, j));
			int one_count = NUM2INT(rb_ary_entry(counts, j));
      if( one_word > c->num_terms ) {
        rb_raise(rb_eRuntimeError, "error term count(%d) less than word index(%d)", c->num_terms, one_word);
      }
			c->docs[i].words[j] = one_word;
			c->docs[i].counts[j] = one_count;
		}
	}

	last_corpus = c;
	corpus_loaded = TRUE;

	rb_iv_set(self, "@corpus", rcorpus);

	return Qtrue;
}

#em(start) ⇒ Object

Run the EM algorithm with the loaded corpus and using the current configuration settings. The start parameter can take the following values:

* random - the starting alpha values are randomized
* seeded - loaded based on the corpus values
* <filename> - path to the file containing the model


777
778
779
780
781
782
783
784
# File 'ext/lda-ruby/lda-inference.c', line 777

/*
 * Run run_quiet_em on last_corpus with the given start mode string.
 *
 * Does nothing when no corpus has been loaded. Always returns nil.
 */
static VALUE wrap_em(VALUE self, VALUE start) {
	if (corpus_loaded) {
		run_quiet_em(StringValuePtr(start), last_corpus);
	}

	return Qnil;
}

#em_convergenceObject

Get the convergence value for EM.



681
682
683
# File 'ext/lda-ruby/lda-inference.c', line 681

/* Getter for the EM convergence value: wraps EM_CONVERGED in a Ruby Float. */
static VALUE wrap_get_em_converged(VALUE self) {
	VALUE em_converged = rb_float_new(EM_CONVERGED);

	return em_converged;
}

#em_convergence=(em_converged) ⇒ Object

Set the convergence value for EM.



688
689
690
691
692
# File 'ext/lda-ruby/lda-inference.c', line 688

/*
 * Setter for the EM convergence value.
 *
 * Converts the Ruby numeric to a double, narrows it to float, and stores it
 * in EM_CONVERGED. Returns the argument unchanged.
 */
static VALUE wrap_set_em_converged(VALUE self, VALUE em_converged) {
	double requested = NUM2DBL(em_converged);

	EM_CONVERGED = (float)requested;
	return em_converged;
}

#em_max_iterObject

Get the max iterations for the EM algorithm.



665
666
667
# File 'ext/lda-ruby/lda-inference.c', line 665

/* Getter for the EM iteration limit: wraps EM_MAX_ITER in a Ruby Integer. */
static VALUE wrap_get_em_max_iter(VALUE self) {
	VALUE em_max_iter = rb_int_new(EM_MAX_ITER);

	return em_max_iter;
}

#em_max_iter=(em_max_iter) ⇒ Object

Set the max iterations for the EM algorithm.



672
673
674
675
676
# File 'ext/lda-ruby/lda-inference.c', line 672

/*
 * Setter for the EM iteration limit.
 *
 * Converts the Ruby numeric to an int and stores it in EM_MAX_ITER.
 * Returns the argument unchanged.
 */
static VALUE wrap_set_em_max_iter(VALUE self, VALUE em_max_iter) {
	int requested = NUM2INT(em_max_iter);

	EM_MAX_ITER = requested;
	return em_max_iter;
}

#est_alphaObject

Get the estimate alpha value (fixed = 0).



729
730
731
# File 'ext/lda-ruby/lda-inference.c', line 729

/* Getter for the estimate-alpha flag: wraps ESTIMATE_ALPHA in a Ruby Integer. */
static VALUE wrap_get_estimate_alpha(VALUE self) {
	VALUE est_alpha = rb_int_new(ESTIMATE_ALPHA);

	return est_alpha;
}

#est_alpha=(est_alpha) ⇒ Object

Set the estimate alpha value (fixed = 0).



736
737
738
739
740
# File 'ext/lda-ruby/lda-inference.c', line 736

/*
 * Setter for the estimate-alpha flag.
 *
 * Converts the Ruby numeric to an int and stores it in ESTIMATE_ALPHA.
 * Returns the argument unchanged.
 */
static VALUE wrap_set_estimate_alpha(VALUE self, VALUE est_alpha) {
	int requested = NUM2INT(est_alpha);

	ESTIMATE_ALPHA = requested;
	return est_alpha;
}

#fast_load_corpus_from_file(filename) ⇒ Object

Load the corpus from the given file. This will not create a Corpus object that is accessible, but it will load the corpus much faster.



801
802
803
804
805
806
807
808
809
# File 'ext/lda-ruby/lda-inference.c', line 801

/*
 * Load a corpus from the named file via read_data, unless one is already
 * loaded.
 *
 * When corpus_loaded is already set the filename is ignored and nothing is
 * read. Returns Qtrue in both cases.
 */
static VALUE wrap_load_corpus(VALUE self, VALUE filename) {
	if (corpus_loaded)
		return Qtrue;

	last_corpus = read_data(StringValuePtr(filename));
	corpus_loaded = TRUE;

	return Qtrue;
}

#gammaObject

Get the gamma values after the model has been run.



856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
# File 'ext/lda-ruby/lda-inference.c', line 856

/*
 * Copy last_gamma into a Ruby array of arrays.
 *
 * Returns nil when no model has been loaded; otherwise an array with one
 * row per document of last_corpus, each row holding num_topics Ruby floats.
 */
static VALUE wrap_get_gamma(VALUE self) {
	VALUE docs;
	int doc, topic;

	if (!model_loaded)
		return Qnil;

	// last_gamma is a double[num_docs][num_topics]
	docs = rb_ary_new2(last_corpus->num_docs);
	for (doc = 0; doc < last_corpus->num_docs; doc++) {
		VALUE row = rb_ary_new2(last_model->num_topics);

		for (topic = 0; topic < last_model->num_topics; topic++)
			rb_ary_store(row, topic, rb_float_new(last_gamma[doc][topic]));

		rb_ary_store(docs, doc, row);
	}

	return docs;
}

#init_alphaObject

Get the initial alpha value.



697
698
699
# File 'ext/lda-ruby/lda-inference.c', line 697

/* Getter for the initial alpha: wraps INITIAL_ALPHA in a Ruby Float. */
static VALUE wrap_get_initial_alpha(VALUE self) {
	VALUE initial_alpha = rb_float_new(INITIAL_ALPHA);

	return initial_alpha;
}

#init_alpha=(initial_alpha) ⇒ Object

Set the initial value of alpha.



711
712
713
714
715
# File 'ext/lda-ruby/lda-inference.c', line 711

/*
 * Setter for the initial alpha.
 *
 * Converts the Ruby numeric to a double, narrows it to float, and stores it
 * in INITIAL_ALPHA. Returns the argument unchanged.
 */
static VALUE wrap_set_initial_alpha(VALUE self, VALUE initial_alpha) {
	double requested = NUM2DBL(initial_alpha);

	INITIAL_ALPHA = (float)requested;
	return initial_alpha;
}

#load_corpus(filename) ⇒ Object



39
40
41
42
43
44
# File 'lib/lda-ruby.rb', line 39

# Load a corpus from +filename+ and make it the active corpus.
#
# The corpus is assigned through the +corpus=+ writer, the same path
# +initialize+ uses, rather than by setting @corpus directly. Assigning the
# instance variable alone would leave the writer's bookkeeping untouched, so
# the Ruby-side corpus and the one used for inference could diverge.
#
# Always returns true; errors from Corpus#load_from_file or the writer
# propagate to the caller.
def load_corpus(filename)
  loaded = Corpus.new
  loaded.load_from_file(filename)

  # Populate the corpus first, then hand the complete object to the writer.
  self.corpus = loaded

  true
end

#load_default_settingsObject



27
28
29
30
31
32
33
34
35
36
37
# File 'lib/lda-ruby.rb', line 27

# Apply the built-in default for every setting through its writer, in a fixed
# order, and return the list of default values in that same order:
# max_iter, convergence, em_max_iter, em_convergence, num_topics, init_alpha,
# est_alpha.
def load_default_settings
  defaults = [
    [:max_iter, 20],
    [:convergence, 1e-6],
    [:em_max_iter, 100],
    [:em_convergence, 1e-4],
    [:num_topics, 20],
    [:init_alpha, 0.3],
    [:est_alpha, 1]
  ]

  defaults.each { |setting, value| send("#{setting}=", value) }

  defaults.map { |_setting, value| value }
end

#load_settings(settings_file) ⇒ Object

Load settings from the given file.



790
791
792
793
794
# File 'ext/lda-ruby/lda-inference.c', line 790

/*
 * Pass the given settings file path to read_settings.
 *
 * Always returns Qtrue; the outcome of read_settings is not inspected.
 */
static VALUE wrap_load_settings(VALUE self, VALUE settings_file) {
	char* path = StringValuePtr(settings_file);

	read_settings(path);
	return Qtrue;
}

#load_vocabulary(vocab) ⇒ Object



46
47
48
49
50
51
52
53
54
55
56
# File 'lib/lda-ruby.rb', line 46

# Replace the stored vocabulary.
#
# +vocab+ may be:
# * an Array - stored as a deep copy made with a Marshal round trip;
# * a Vocabulary - stored as the result of its +to_a+;
# * anything else - treated as a file path whose contents are split on
#   whitespace.
#
# Always returns true.
def load_vocabulary(vocab)
  @vocab =
    case vocab
    when Array      then Marshal::load(Marshal::dump(vocab))
    when Vocabulary then vocab.to_a
    else File.open(vocab, 'r') { |f| f.read.split(/\s+/) }
    end

  true
end

#max_iterObject

Get the maximum iterations.



633
634
635
# File 'ext/lda-ruby/lda-inference.c', line 633

/* Getter for the maximum iterations: wraps VAR_MAX_ITER in a Ruby Integer. */
static VALUE wrap_get_max_iter(VALUE self) {
	VALUE max_iter = rb_int_new(VAR_MAX_ITER);

	return max_iter;
}

#max_iter=(max_iter) ⇒ Object

Set the maximum iterations.



640
641
642
643
644
# File 'ext/lda-ruby/lda-inference.c', line 640

/*
 * Setter for the maximum iterations.
 *
 * Converts the Ruby numeric to an int and stores it in VAR_MAX_ITER.
 * Returns the argument unchanged.
 */
static VALUE wrap_set_max_iter(VALUE self, VALUE max_iter) {
	int requested = NUM2INT(max_iter);

	VAR_MAX_ITER = requested;
	return max_iter;
}

#modelObject

Get the settings used for the model.



942
943
944
945
946
947
948
949
950
951
952
953
954
# File 'ext/lda-ruby/lda-inference.c', line 942

/*
 * Report the settings stored on last_model.
 *
 * Returns nil when no model has been loaded; otherwise the Ruby array
 * [num_topics, num_terms, alpha].
 */
static VALUE wrap_get_model_settings(VALUE self) {
	VALUE settings, num_topics, num_terms, alpha;

	if (!model_loaded)
		return Qnil;

	settings = rb_ary_new();

	num_topics = rb_int_new(last_model->num_topics);
	rb_ary_push(settings, num_topics);

	num_terms = rb_int_new(last_model->num_terms);
	rb_ary_push(settings, num_terms);

	alpha = rb_float_new(last_model->alpha);
	rb_ary_push(settings, alpha);

	return settings;
}

#num_topicsObject

Get the number of topics being clustered.



704
705
706
# File 'ext/lda-ruby/lda-inference.c', line 704

/* Getter for the number of topics: wraps NTOPICS in a Ruby Integer. */
static VALUE wrap_get_num_topics(VALUE self) {
	VALUE ntopics = rb_int_new(NTOPICS);

	return ntopics;
}

#num_topics=(ntopics) ⇒ Object

Set the number of topics to be clustered.



720
721
722
723
724
# File 'ext/lda-ruby/lda-inference.c', line 720

/*
 * Setter for the number of topics.
 *
 * Converts the Ruby numeric to an int and stores it in NTOPICS. The value is
 * validated before it is stored, mirroring the check in wrap_set_config, so
 * a rejected value leaves NTOPICS unchanged.
 *
 * Raises RuntimeError when the requested count is not greater than 0.
 * Returns the argument unchanged.
 */
static VALUE wrap_set_num_topics(VALUE self, VALUE ntopics) {
	int requested = NUM2INT(ntopics);

	if (requested <= 0) {
		rb_raise(rb_eRuntimeError, "NTOPICS must be greater than 0 - %d", requested);
	}

	NTOPICS = requested;

	return ntopics;
}

#phi(recompute = false) ⇒ Object

Get the phi matrix which can be used to assign probabilities to words belonging to a specific topic in each document. The return value is a 3D matrix: num_docs x doc_length x num_topics. The value is cached after the first call, so if it needs to be recomputed, set the recompute value to true.



121
122
123
124
125
126
127
# File 'lib/lda-ruby.rb', line 121

# Memoized access to the phi matrix.
#
# The matrix is computed with +compute_phi+ on the first call and whenever
# +recompute+ is truthy; otherwise the value cached in @phi is returned.
def phi(recompute=false)
  @phi = self.compute_phi if recompute || @phi.nil?
  @phi
end

#print_topics(words_per_topic = 10) ⇒ Object

Visualization method for printing out the top words_per_topic words for each topic.

See also top_words.



64
65
66
67
68
69
70
71
72
73
74
75
76
77
# File 'lib/lda-ruby.rb', line 64

# Print, for every topic in +beta+, the +words_per_topic+ vocabulary words
# with the highest scores, best first.
#
# Raises RuntimeError when no vocabulary has been loaded. Returns nil.
def print_topics(words_per_topic = 10)
  raise 'No vocabulary loaded.' unless @vocab

  self.beta.each_with_index do |scores, topic_num|
    # Pair each score with its vocabulary index, order by score, then keep
    # the indices of the highest scores.
    ascending = scores.zip((0...@vocab.size).to_a).sort { |a, b| a[0] <=> b[0] }
    best = ascending.map { |_score, idx| idx }.reverse[0...words_per_topic]
    words = best.map { |idx| @vocab[idx] }

    puts "Topic #{topic_num}"
    puts "\t#{words.join("\n\t")}"
    puts ""
  end

  nil
end

#set_config(init_alpha, num_topics, max_iter, convergence, em_max_iter, em_convergence, est_alpha) ⇒ Object

Set all of the settings in one command:

* init_alpha
* num_topics
* max_iter
* convergence
* em_max_iter
* em_convergence
* est_alpha


617
618
619
620
621
622
623
624
625
626
627
628
# File 'ext/lda-ruby/lda-inference.c', line 617

/*
 * Set every configuration value in one call.
 *
 * All arguments are converted and validated before any global is written, so
 * a conversion error or a rejected topic count leaves the previous
 * configuration intact.
 *
 * Raises RuntimeError when num_topics is not greater than 0; the check now
 * rejects 0 as well, matching its own error message.
 * Returns Qtrue on success.
 */
static VALUE wrap_set_config(VALUE self, VALUE init_alpha, VALUE num_topics, VALUE max_iter, VALUE convergence, VALUE em_max_iter, VALUE em_convergence, VALUE est_alpha) {
	/* The NUM2* conversions raise on non-numeric input, so they all run
	 * before the first assignment. */
	double new_init_alpha = NUM2DBL(init_alpha);
	int new_num_topics = NUM2INT(num_topics);
	int new_max_iter = NUM2INT(max_iter);
	double new_convergence = NUM2DBL(convergence);
	int new_em_max_iter = NUM2INT(em_max_iter);
	double new_em_convergence = NUM2DBL(em_convergence);
	int new_est_alpha = NUM2INT(est_alpha);

	if (new_num_topics <= 0) {
		rb_raise(rb_eRuntimeError, "NTOPICS must be greater than 0 - %d", new_num_topics);
	}

	INITIAL_ALPHA = new_init_alpha;
	NTOPICS = new_num_topics;
	VAR_MAX_ITER = new_max_iter;
	VAR_CONVERGED = (float)new_convergence;
	EM_MAX_ITER = new_em_max_iter;
	EM_CONVERGED = (float)new_em_convergence;
	ESTIMATE_ALPHA = new_est_alpha;

	return Qtrue;
}

#to_sObject

String representation displaying current settings.



155
156
157
158
159
160
161
162
163
164
165
166
# File 'lib/lda-ruby.rb', line 155

def to_s
  outp = ["LDA Settings:"]
  outp << "    Initial alpha: %0.6f" % self.init_alpha
  outp << "      # of topics: %d" % self.num_topics
  outp << "   Max iterations: %d" % self.max_iter
  outp << "      Convergence: %0.6f" % self.convergence
  outp << "EM max iterations: %d" % self.em_max_iter
  outp << "   EM convergence: %0.6f" % self.em_convergence
  outp << "   Estimate alpha: %d" % self.est_alpha

  outp.join("\n")
end

#top_word_indices(words_per_topic = 10) ⇒ Object

After the model has been run and a vocabulary has been loaded, return the vocabulary indices of the words_per_topic top words chosen by the model for each topic. This is returned as a hash mapping the topic number to an array of word indices (in descending order of importance). Use top_words to get the words themselves.

topic_number => [i1, i2, ..., i_n]

See also print_topics.



89
90
91
92
93
94
95
96
97
98
99
100
101
# File 'lib/lda-ruby.rb', line 89

# Find the highest-scoring vocabulary indices for every topic.
#
# @param words_per_topic [Integer] how many indices to keep per topic
# @return [Hash] topic number => array of vocabulary indices, best first
# @raise [RuntimeError] when no vocabulary has been loaded
def top_word_indices(words_per_topic = 10)
  raise 'No vocabulary loaded.' unless @vocab

  # The index list is the same for every topic, so build it once.
  indices = (0...@vocab.size).to_a
  topics = {}

  self.beta.each_with_index do |topic, topic_num|
    # Pair each score with its index, sort ascending by score, then reverse
    # so the best-scoring indices come first.
    ranked = topic.zip(indices).sort { |a, b| a[0] <=> b[0] }.map { |_score, idx| idx }.reverse
    topics[topic_num] = ranked[0...words_per_topic]
  end

  topics
end

#top_words(words_per_topic = 10) ⇒ Object



103
104
105
106
107
108
109
110
111
112
# File 'lib/lda-ruby.rb', line 103

# Translate the result of +top_word_indices+ into vocabulary words.
#
# Returns a hash mapping each topic number to the array of words found in
# @vocab at that topic's top indices, in the same order.
def top_words(words_per_topic = 10)
  top_word_indices(words_per_topic).each_with_object(Hash.new) do |(topic_num, word_indices), output|
    output[topic_num] = word_indices.map { |idx| @vocab[idx] }
  end
end

#verboseObject

Get the verbosity setting.



745
746
747
748
749
750
751
# File 'ext/lda-ruby/lda-inference.c', line 745

/* Getter for the verbosity setting: Qtrue when VERBOSE is non-zero, Qfalse otherwise. */
static VALUE wrap_get_verbosity(VALUE self) {
    return VERBOSE ? Qtrue : Qfalse;
}

#verbose=(verbosity) ⇒ Object

Set the verbosity level (true, false).



757
758
759
760
761
762
763
764
765
# File 'ext/lda-ruby/lda-inference.c', line 757

/*
 * Setter for the verbosity setting.
 *
 * VERBOSE becomes TRUE only when the argument is exactly Ruby's true; every
 * other value, including other truthy objects, sets it to FALSE.
 * Returns the argument unchanged.
 */
static VALUE wrap_set_verbosity(VALUE self, VALUE verbosity) {
    VERBOSE = (verbosity == Qtrue) ? TRUE : FALSE;

    return verbosity;
}