Class: Lda::Lda
- Inherits:
-
Object
- Object
- Lda::Lda
- Defined in:
- lib/lda-ruby.rb,
ext/lda-ruby/lda-inference.c
Instance Attribute Summary collapse
-
#corpus ⇒ Object
readonly
Returns the value of attribute corpus.
-
#vocab ⇒ Object
readonly
Returns the value of attribute vocab.
Instance Method Summary collapse
-
#beta ⇒ Object
Get the beta matrix after the model has been run.
-
#compute_phi ⇒ Object
Compute the phi values by running inference after the initial EM run has been completed.
-
#compute_topic_document_probability ⇒ Object
Compute the average log probability for each topic for each document in the corpus.
-
#convergence ⇒ Object
Get the convergence setting.
-
#convergence=(converged) ⇒ Object
Set the convergence setting.
-
#corpus=(rcorpus) ⇒ Object
readonly
Set the corpus.
-
#em(start) ⇒ Object
Run the EM algorithm with the loaded corpus and using the current configuration settings.
-
#em_convergence ⇒ Object
Get the convergence value for EM.
-
#em_convergence=(em_converged) ⇒ Object
Set the convergence value for EM.
-
#em_max_iter ⇒ Object
Get the max iterations for the EM algorithm.
-
#em_max_iter=(em_max_iter) ⇒ Object
Set the max iterations for the EM algorithm.
-
#est_alpha ⇒ Object
Get the estimate alpha value (fixed = 0).
-
#est_alpha=(est_alpha) ⇒ Object
Set the estimate alpha value (fixed = 0).
-
#fast_load_corpus_from_file(filename) ⇒ Object
Load the corpus from the given file.
-
#gamma ⇒ Object
Get the gamma values after the model has been run.
-
#init_alpha ⇒ Object
Get the initial alpha value.
-
#init_alpha=(initial_alpha) ⇒ Object
Set the initial value of alpha.
-
#initialize(corpus) ⇒ Lda
constructor
A new instance of Lda.
- #load_corpus(filename) ⇒ Object
- #load_default_settings ⇒ Object
-
#load_settings(settings_file) ⇒ Object
Load settings from the given file.
- #load_vocabulary(vocab) ⇒ Object
-
#max_iter ⇒ Object
Get the maximum iterations.
-
#max_iter=(max_iter) ⇒ Object
Set the maximum iterations.
-
#model ⇒ Object
Get the settings used for the model.
-
#num_topics ⇒ Object
Get the number of topics being clustered.
-
#num_topics=(ntopics) ⇒ Object
Set the number of topics to be clustered.
-
#phi(recompute = false) ⇒ Object
Get the phi matrix which can be used to assign probabilities to words belonging to a specific topic in each document.
-
#print_topics(words_per_topic = 10) ⇒ Object
Visualization method for printing out the top
words_per_topic
words for each topic. -
#set_config(init_alpha, num_topics, max_iter, convergence, em_max_iter, em_convergence, est_alpha) ⇒ Object
Set all of the settings in one command.
-
#to_s ⇒ Object
String representation displaying current settings.
-
#top_word_indices(words_per_topic = 10) ⇒ Object
After the model has been run and a vocabulary has been loaded, return the
words_per_topic
top words chosen by the model for each topic. - #top_words(words_per_topic = 10) ⇒ Object
-
#verbose ⇒ Object
Get the verbosity setting.
-
#verbose=(verbosity) ⇒ Object
Set the verbosity level (true, false).
Constructor Details
#initialize(corpus) ⇒ Lda
Returns a new instance of Lda.
17 18 19 20 21 22 23 24 25 |
# File 'lib/lda-ruby.rb', line 17 def initialize(corpus) load_default_settings @vocab = nil self.corpus = corpus @vocab = corpus.vocabulary.to_a if corpus.vocabulary @phi = nil end |
Instance Attribute Details
#corpus ⇒ Object (readonly)
Returns the value of attribute corpus.
15 16 17 |
# File 'lib/lda-ruby.rb', line 15 def corpus @corpus end |
#vocab ⇒ Object (readonly)
Returns the value of attribute vocab.
15 16 17 |
# File 'lib/lda-ruby.rb', line 15 def vocab @vocab end |
Instance Method Details
#beta ⇒ Object
Get the beta matrix after the model has been run.
918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 |
# File 'ext/lda-ruby/lda-inference.c', line 918
/*
 * Ruby method `beta`: copy the last model's log_prob_w table into a nested
 * Ruby array with one row per topic and one Float per term.
 *
 * Returns nil when no model has been loaded.
 */
static VALUE wrap_get_model_beta(VALUE self) {
    if (!model_loaded) {
        return Qnil;
    }

    /* log_prob_w is indexed [topic][term] */
    VALUE topics = rb_ary_new2(last_model->num_topics);

    for (int topic = 0; topic < last_model->num_topics; topic++) {
        VALUE row = rb_ary_new2(last_model->num_terms);

        for (int term = 0; term < last_model->num_terms; term++) {
            VALUE value = rb_float_new(last_model->log_prob_w[topic][term]);
            rb_ary_store(row, term, value);
        }
        rb_ary_store(topics, topic, row);
    }

    return topics;
}
|
#compute_phi ⇒ Object
Compute the phi values by running inference after the initial EM run has been completed.
Returns a 3D matrix: num_docs x length x num_topics
.
882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 |
# File 'ext/lda-ruby/lda-inference.c', line 882
/*
 * Ruby method `compute_phi`: re-run lda_inference on every document of the
 * last corpus and return the resulting phi values as a nested Ruby array
 * (documents x words in the document x topics).
 *
 * Each lda_inference call is handed last_gamma[doc] and the shared last_phi
 * buffer, so last_phi is copied out before the next document is processed.
 *
 * Returns nil when no model has been loaded.
 */
static VALUE wrap_get_phi(VALUE self) {
    if (!model_loaded) {
        return Qnil;
    }

    VALUE by_document = rb_ary_new2(last_corpus->num_docs);
    /* Handed to lda_inference; its value is not examined afterwards. */
    short error = 0;

    for (int doc = 0; doc < last_corpus->num_docs; doc++) {
        VALUE by_word = rb_ary_new2(last_corpus->docs[doc].length);

        lda_inference(&(last_corpus->docs[doc]), last_model, last_gamma[doc], last_phi, &error);

        for (int word = 0; word < last_corpus->docs[doc].length; word++) {
            VALUE by_topic = rb_ary_new2(last_model->num_topics);

            for (int topic = 0; topic < last_model->num_topics; topic++) {
                VALUE value = rb_float_new(last_phi[word][topic]);
                rb_ary_store(by_topic, topic, value);
            }
            rb_ary_store(by_word, word, by_topic);
        }
        rb_ary_store(by_document, doc, by_word);
    }

    return by_document;
}
|
#compute_topic_document_probability ⇒ Object
Compute the average log probability for each topic for each document in the corpus. This method returns a matrix: num_docs x num_topics with the average log probability for the topic in the document.
134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 |
# File 'lib/lda-ruby.rb', line 134 def compute_topic_document_probability outp = Array.new @corpus.documents.each_with_index do |doc, idx| tops = [0.0] * self.num_topics ttl = doc.counts.inject(0.0) {|sum, i| sum + i} self.phi[idx].each_with_index do |word_dist, word_idx| word_dist.each_with_index do |top_prob, top_idx| tops[top_idx] += Math.log(top_prob) * doc.counts[word_idx] end end tops = tops.map {|i| i / ttl} outp << tops end outp end |
#convergence ⇒ Object
Get the convergence setting.
649 650 651 |
# File 'ext/lda-ruby/lda-inference.c', line 649 static VALUE wrap_get_converged(VALUE self) { return rb_float_new(VAR_CONVERGED); } |
#convergence=(converged) ⇒ Object
Set the convergence setting.
656 657 658 659 660 |
# File 'ext/lda-ruby/lda-inference.c', line 656
/*
 * Ruby method `convergence=`: store the argument, narrowed to float, in
 * VAR_CONVERGED and return the Ruby object that was passed in.
 */
static VALUE wrap_set_converged(VALUE self, VALUE converged) {
    const double requested = NUM2DBL(converged);

    VAR_CONVERGED = (float)requested;
    return converged;
}
|
#corpus=(rcorpus) ⇒ Object (readonly)
Set the corpus.
814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 |
# File 'ext/lda-ruby/lda-inference.c', line 814
static VALUE wrap_ruby_corpus(VALUE self, VALUE rcorpus) {
corpus* c;
int i = 0;
int j = 0;
c = malloc(sizeof(corpus));
c->num_terms = NUM2INT(rb_iv_get(rcorpus, "@num_terms"));
c->num_docs = NUM2INT(rb_iv_get(rcorpus, "@num_docs"));
c->docs = (document*) malloc(sizeof(document) * c->num_docs);
VALUE doc_ary = rb_iv_get(rcorpus, "@documents");
for (i = 0; i < c->num_docs; i++) {
VALUE one_doc = rb_ary_entry(doc_ary, i);
VALUE words = rb_iv_get(one_doc, "@words");
VALUE counts = rb_iv_get(one_doc, "@counts");
c->docs[i].length = NUM2INT(rb_iv_get(one_doc, "@length"));
c->docs[i].total = NUM2INT(rb_iv_get(one_doc, "@total"));
c->docs[i].words = malloc(sizeof(int) * c->docs[i].length);
c->docs[i].counts = malloc(sizeof(int) * c->docs[i].length);
for (j = 0; j < c->docs[i].length; j++) {
int one_word = NUM2INT(rb_ary_entry(words, j));
int one_count = NUM2INT(rb_ary_entry(counts, j));
if( one_word > c->num_terms ) {
rb_raise(rb_eRuntimeError, "error term count(%d) less than word index(%d)", c->num_terms, one_word);
}
c->docs[i].words[j] = one_word;
c->docs[i].counts[j] = one_count;
}
}
last_corpus = c;
corpus_loaded = TRUE;
rb_iv_set(self, "@corpus", rcorpus);
return Qtrue;
}
|
#em(start) ⇒ Object
Run the EM algorithm with the loaded corpus and using the current configuration settings. The start
parameter can take the following values:
* random - starting alpha are randomized
* seeded - loaded based on the corpus values
* <filename> - path to the file containing the model
777 778 779 780 781 782 783 784 |
# File 'ext/lda-ruby/lda-inference.c', line 777
/*
 * Ruby method `em`: run run_quiet_em on the loaded corpus, passing `start`
 * through as a C string. Does nothing when no corpus has been loaded.
 *
 * Always returns nil.
 */
static VALUE wrap_em(VALUE self, VALUE start) {
    if (corpus_loaded) {
        run_quiet_em(StringValuePtr(start), last_corpus);
    }
    return Qnil;
}
|
#em_convergence ⇒ Object
Get the convergence value for EM.
681 682 683 |
# File 'ext/lda-ruby/lda-inference.c', line 681 static VALUE wrap_get_em_converged(VALUE self) { return rb_float_new(EM_CONVERGED); } |
#em_convergence=(em_converged) ⇒ Object
Set the convergence value for EM.
688 689 690 691 692 |
# File 'ext/lda-ruby/lda-inference.c', line 688
/*
 * Ruby method `em_convergence=`: store the argument, narrowed to float, in
 * EM_CONVERGED and return the Ruby object that was passed in.
 */
static VALUE wrap_set_em_converged(VALUE self, VALUE em_converged) {
    const double requested = NUM2DBL(em_converged);

    EM_CONVERGED = (float)requested;
    return em_converged;
}
|
#em_max_iter ⇒ Object
Get the max iterations for the EM algorithm.
665 666 667 |
# File 'ext/lda-ruby/lda-inference.c', line 665 static VALUE wrap_get_em_max_iter(VALUE self) { return rb_int_new(EM_MAX_ITER); } |
#em_max_iter=(em_max_iter) ⇒ Object
Set the max iterations for the EM algorithm.
672 673 674 675 676 |
# File 'ext/lda-ruby/lda-inference.c', line 672
/*
 * Ruby method `em_max_iter=`: convert the argument to a C int, store it in
 * EM_MAX_ITER and return the Ruby object that was passed in.
 */
static VALUE wrap_set_em_max_iter(VALUE self, VALUE em_max_iter) {
    const int requested = NUM2INT(em_max_iter);

    EM_MAX_ITER = requested;
    return em_max_iter;
}
|
#est_alpha ⇒ Object
Get the estimate alpha value (fixed = 0).
729 730 731 |
# File 'ext/lda-ruby/lda-inference.c', line 729 static VALUE wrap_get_estimate_alpha(VALUE self) { return rb_int_new(ESTIMATE_ALPHA); } |
#est_alpha=(est_alpha) ⇒ Object
Set the estimate alpha value (fixed = 0).
736 737 738 739 740 |
# File 'ext/lda-ruby/lda-inference.c', line 736
/*
 * Ruby method `est_alpha=`: convert the argument to a C int, store it in
 * ESTIMATE_ALPHA and return the Ruby object that was passed in.
 */
static VALUE wrap_set_estimate_alpha(VALUE self, VALUE est_alpha) {
    const int requested = NUM2INT(est_alpha);

    ESTIMATE_ALPHA = requested;
    return est_alpha;
}
|
#fast_load_corpus_from_file(filename) ⇒ Object
Load the corpus from the given file. This will not create a Corpus
object that is accessible, but it will load the corpus much faster.
801 802 803 804 805 806 807 808 809 |
# File 'ext/lda-ruby/lda-inference.c', line 801
/*
 * Ruby method `fast_load_corpus_from_file`: read the corpus from `filename`
 * with read_data and install it as last_corpus.
 *
 * The file is only read while no corpus is loaded; once corpus_loaded is set
 * the call is a no-op and `filename` is not even looked at.
 *
 * Always returns true.
 */
static VALUE wrap_load_corpus(VALUE self, VALUE filename) {
    if (corpus_loaded) {
        return Qtrue;
    }

    last_corpus = read_data(StringValuePtr(filename));
    corpus_loaded = TRUE;
    return Qtrue;
}
|
#gamma ⇒ Object
Get the gamma values after the model has been run.
856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 |
# File 'ext/lda-ruby/lda-inference.c', line 856
/*
 * Ruby method `gamma`: copy last_gamma into a nested Ruby array with one row
 * per document of the last corpus and one Float per topic of the last model.
 *
 * Returns nil when no model has been loaded.
 */
static VALUE wrap_get_gamma(VALUE self) {
    if (!model_loaded) {
        return Qnil;
    }

    /* last_gamma is indexed [document][topic] */
    VALUE documents = rb_ary_new2(last_corpus->num_docs);

    for (int doc = 0; doc < last_corpus->num_docs; doc++) {
        VALUE row = rb_ary_new2(last_model->num_topics);

        for (int topic = 0; topic < last_model->num_topics; topic++) {
            VALUE value = rb_float_new(last_gamma[doc][topic]);
            rb_ary_store(row, topic, value);
        }
        rb_ary_store(documents, doc, row);
    }

    return documents;
}
|
#init_alpha ⇒ Object
Get the initial alpha value.
697 698 699 |
# File 'ext/lda-ruby/lda-inference.c', line 697 static VALUE wrap_get_initial_alpha(VALUE self) { return rb_float_new(INITIAL_ALPHA); } |
#init_alpha=(initial_alpha) ⇒ Object
Set the initial value of alpha.
711 712 713 714 715 |
# File 'ext/lda-ruby/lda-inference.c', line 711
/*
 * Ruby method `init_alpha=`: store the argument, narrowed to float, in
 * INITIAL_ALPHA and return the Ruby object that was passed in.
 */
static VALUE wrap_set_initial_alpha(VALUE self, VALUE initial_alpha) {
    const double requested = NUM2DBL(initial_alpha);

    INITIAL_ALPHA = (float)requested;
    return initial_alpha;
}
|
#load_corpus(filename) ⇒ Object
39 40 41 42 43 44 |
# File 'lib/lda-ruby.rb', line 39 def load_corpus(filename) @corpus = Corpus.new @corpus.load_from_file(filename) true end |
#load_default_settings ⇒ Object
27 28 29 30 31 32 33 34 35 36 37 |
# File 'lib/lda-ruby.rb', line 27 def load_default_settings self.max_iter = 20 self.convergence = 1e-6 self.em_max_iter = 100 self.em_convergence = 1e-4 self.num_topics = 20 self.init_alpha = 0.3 self.est_alpha = 1 [20, 1e-6, 100, 1e-4, 20, 0.3, 1] end |
#load_settings(settings_file) ⇒ Object
Load settings from the given file.
790 791 792 793 794 |
# File 'ext/lda-ruby/lda-inference.c', line 790
/*
 * Ruby method `load_settings`: pass the given file name to read_settings.
 *
 * Always returns true.
 */
static VALUE wrap_load_settings(VALUE self, VALUE settings_file) {
    char *path = StringValuePtr(settings_file);

    read_settings(path);
    return Qtrue;
}
|
#load_vocabulary(vocab) ⇒ Object
46 47 48 49 50 51 52 53 54 55 56 |
# File 'lib/lda-ruby.rb', line 46 def load_vocabulary(vocab) if vocab.is_a?(Array) @vocab = Marshal::load(Marshal::dump(vocab)) # deep clone array elsif vocab.is_a?(Vocabulary) @vocab = vocab.to_a else @vocab = File.open(vocab, 'r') { |f| f.read.split(/\s+/) } end true end |
#max_iter ⇒ Object
Get the maximum iterations.
633 634 635 |
# File 'ext/lda-ruby/lda-inference.c', line 633 static VALUE wrap_get_max_iter(VALUE self) { return rb_int_new(VAR_MAX_ITER); } |
#max_iter=(max_iter) ⇒ Object
Set the maximum iterations.
640 641 642 643 644 |
# File 'ext/lda-ruby/lda-inference.c', line 640
/*
 * Ruby method `max_iter=`: convert the argument to a C int, store it in
 * VAR_MAX_ITER and return the Ruby object that was passed in.
 */
static VALUE wrap_set_max_iter(VALUE self, VALUE max_iter) {
    const int requested = NUM2INT(max_iter);

    VAR_MAX_ITER = requested;
    return max_iter;
}
|
#model ⇒ Object
Get the settings used for the model.
942 943 944 945 946 947 948 949 950 951 952 953 954 |
# File 'ext/lda-ruby/lda-inference.c', line 942
/*
 * Ruby method `model`: return [num_topics, num_terms, alpha] taken from the
 * last model, or nil when no model has been loaded.
 */
static VALUE wrap_get_model_settings(VALUE self) {
    if (!model_loaded) {
        return Qnil;
    }

    VALUE settings = rb_ary_new();

    /* Order matters: callers receive [num_topics, num_terms, alpha]. */
    VALUE num_topics = rb_int_new(last_model->num_topics);
    rb_ary_push(settings, num_topics);

    VALUE num_terms = rb_int_new(last_model->num_terms);
    rb_ary_push(settings, num_terms);

    VALUE alpha = rb_float_new(last_model->alpha);
    rb_ary_push(settings, alpha);

    return settings;
}
|
#num_topics ⇒ Object
Get the number of topics being clustered.
704 705 706 |
# File 'ext/lda-ruby/lda-inference.c', line 704 static VALUE wrap_get_num_topics(VALUE self) { return rb_int_new(NTOPICS); } |
#num_topics=(ntopics) ⇒ Object
Set the number of topics to be clustered.
720 721 722 723 724 |
# File 'ext/lda-ruby/lda-inference.c', line 720
/*
 * Ruby method `num_topics=`: convert the argument to a C int and store it in
 * NTOPICS.
 *
 * Raises RuntimeError, with the same message set_config uses, when the value
 * is not greater than 0; NTOPICS keeps its previous value in that case.
 *
 * Returns the Ruby object that was passed in.
 */
static VALUE wrap_set_num_topics(VALUE self, VALUE ntopics) {
    const int requested = NUM2INT(ntopics);

    /* Validate before assigning so a rejected value never reaches NTOPICS. */
    if (requested <= 0) {
        rb_raise(rb_eRuntimeError, "NTOPICS must be greater than 0 - %d", requested);
    }

    NTOPICS = requested;
    return ntopics;
}
|
#phi(recompute = false) ⇒ Object
Get the phi matrix which can be used to assign probabilities to words belonging to a specific topic in each document. The return value is a 3D matrix: num_docs x doc_length x num_topics. The value is cached after the first call, so if it needs to be recomputed, set the recompute
value to true.
121 122 123 124 125 126 127 |
# File 'lib/lda-ruby.rb', line 121 def phi(recompute=false) if @phi.nil? || recompute @phi = self.compute_phi end @phi end |
#print_topics(words_per_topic = 10) ⇒ Object
Visualization method for printing out the top words_per_topic
words for each topic.
See also top_words
.
64 65 66 67 68 69 70 71 72 73 74 75 76 77 |
# File 'lib/lda-ruby.rb', line 64 def print_topics(words_per_topic = 10) raise 'No vocabulary loaded.' unless @vocab self.beta.each_with_index do |topic, topic_num| # Sort the topic array and return the sorted indices of the best scores indices = (topic.zip((0...@vocab.size).to_a).sort { |i, j| i[0] <=> j[0] }.map { |i, j| j }.reverse)[0...words_per_topic] puts "Topic #{topic_num}" puts "\t#{indices.map {|i| @vocab[i]}.join("\n\t")}" puts "" end nil end |
#set_config(init_alpha, num_topics, max_iter, convergence, em_max_iter, em_convergence, est_alpha) ⇒ Object
Set all of the settings in one command:
* init_alpha
* num_topics
* max_iter
* convergence
* em_max_iter
* em_convergence
* est_alpha
617 618 619 620 621 622 623 624 625 626 627 628 |
# File 'ext/lda-ruby/lda-inference.c', line 617
/*
 * Ruby method `set_config`: set all seven settings in one call
 * (INITIAL_ALPHA, NTOPICS, VAR_MAX_ITER, VAR_CONVERGED, EM_MAX_ITER,
 * EM_CONVERGED, ESTIMATE_ALPHA).
 *
 * Every argument is converted, and the topic count validated, before any
 * global is written, so a call that raises leaves the existing settings
 * untouched.
 *
 * Raises RuntimeError when num_topics is not greater than 0.
 *
 * Returns true.
 */
static VALUE wrap_set_config(VALUE self, VALUE init_alpha, VALUE num_topics, VALUE max_iter, VALUE convergence, VALUE em_max_iter, VALUE em_convergence, VALUE est_alpha) {
    const double new_initial_alpha = NUM2DBL(init_alpha);
    const int new_ntopics = NUM2INT(num_topics);

    /* The message promises "greater than 0", so zero must be rejected too. */
    if (new_ntopics <= 0) {
        rb_raise(rb_eRuntimeError, "NTOPICS must be greater than 0 - %d", new_ntopics);
    }

    const int new_var_max_iter = NUM2INT(max_iter);
    const float new_var_converged = (float)NUM2DBL(convergence);
    const int new_em_max_iter = NUM2INT(em_max_iter);
    const float new_em_converged = (float)NUM2DBL(em_convergence);
    const int new_estimate_alpha = NUM2INT(est_alpha);

    /* Nothing below can raise: commit all settings together. */
    INITIAL_ALPHA = new_initial_alpha;
    NTOPICS = new_ntopics;
    VAR_MAX_ITER = new_var_max_iter;
    VAR_CONVERGED = new_var_converged;
    EM_MAX_ITER = new_em_max_iter;
    EM_CONVERGED = new_em_converged;
    ESTIMATE_ALPHA = new_estimate_alpha;

    return Qtrue;
}
|
#to_s ⇒ Object
String representation displaying current settings.
155 156 157 158 159 160 161 162 163 164 165 166 |
# File 'lib/lda-ruby.rb', line 155 def to_s outp = ["LDA Settings:"] outp << " Initial alpha: %0.6f" % self.init_alpha outp << " # of topics: %d" % self.num_topics outp << " Max iterations: %d" % self.max_iter outp << " Convergence: %0.6f" % self.convergence outp << "EM max iterations: %d" % self.em_max_iter outp << " EM convergence: %0.6f" % self.em_convergence outp << " Estimate alpha: %d" % self.est_alpha outp.join("\n") end |
#top_word_indices(words_per_topic = 10) ⇒ Object
After the model has been run and a vocabulary has been loaded, return the indices of the words_per_topic
top words chosen by the model for each topic. This is returned as a hash mapping the topic number to an array of vocabulary indices of the top words (in descending order of importance).
topic_number => [w1, w2, ..., w_n]
See also print_topics
.
89 90 91 92 93 94 95 96 97 98 99 100 101 |
# File 'lib/lda-ruby.rb', line 89 def top_word_indices(words_per_topic = 10) raise 'No vocabulary loaded.' unless @vocab # find the highest scoring words per topic topics = Hash.new indices = (0...@vocab.size).to_a self.beta.each_with_index do |topic, topic_num| topics[topic_num] = (topic.zip((0...@vocab.size).to_a).sort { |i, j| i[0] <=> j[0] }.map { |i, j| j }.reverse)[0...words_per_topic] end topics end |
#top_words(words_per_topic = 10) ⇒ Object
103 104 105 106 107 108 109 110 111 112 |
# File 'lib/lda-ruby.rb', line 103 def top_words(words_per_topic = 10) output = Hash.new topics = top_word_indices(words_per_topic) topics.each_pair do |topic_num, words| output[topic_num] = words.map { |w| @vocab[w] } end output end |
#verbose ⇒ Object
Get the verbosity setting.
745 746 747 748 749 750 751 |
# File 'ext/lda-ruby/lda-inference.c', line 745
/*
 * Ruby method `verbose`: return true when VERBOSE is non-zero, false
 * otherwise.
 */
static VALUE wrap_get_verbosity(VALUE self) {
    return VERBOSE ? Qtrue : Qfalse;
}
|
#verbose=(verbosity) ⇒ Object
Set the verbosity level (true, false).
757 758 759 760 761 762 763 764 765 |
# File 'ext/lda-ruby/lda-inference.c', line 757
/*
 * Ruby method `verbose=`: set VERBOSE to TRUE only when the argument is
 * exactly Ruby `true`; every other object switches it to FALSE.
 *
 * Returns the Ruby object that was passed in.
 */
static VALUE wrap_set_verbosity(VALUE self, VALUE verbosity) {
    VERBOSE = (verbosity == Qtrue) ? TRUE : FALSE;
    return verbosity;
}
|