Class: TfIdfSimilarity::TermCountModel
- Inherits:
- Object
  - Object
    - TfIdfSimilarity::TermCountModel
- Includes:
- MatrixMethods
- Defined in:
- lib/tf-idf-similarity/term_count_model.rb
Instance Attribute Summary
-
#average_document_size ⇒ Object
readonly
The average number of tokens in a document.
-
#documents ⇒ Object
readonly
The documents in the corpus.
-
#terms ⇒ Object
readonly
The set of terms in the corpus.
Instance Method Summary
-
#document_count(term) ⇒ Integer
The number of documents the term appears in.
-
#initialize(documents, opts = {}) ⇒ TermCountModel
constructor
A new instance of TermCountModel.
-
#term_count(term) ⇒ Integer
The number of times the term appears in the corpus.
Constructor Details
#initialize(documents, opts = {}) ⇒ TermCountModel
Returns a new instance of TermCountModel.
16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 |
# File 'lib/tf-idf-similarity/term_count_model.rb', line 16

# Builds the term-by-document count matrix for a corpus.
#
# @param documents the documents in the corpus; each one is sent `terms`
#   and `term_count(term)`
# @param opts [Hash] optional arguments
# @option opts :library the matrix backend name, converted with `to_sym`
#   (defaults to `:matrix`)
# @return [TermCountModel] a new instance of TermCountModel
def initialize(documents, opts = {})
  @documents = documents
  # Distinct terms across all documents, kept in first-seen order.
  @terms = documents.map(&:terms).flatten.uniq
  @library = (opts[:library] || :matrix).to_sym

  # One row per term, one column per document.
  counts = terms.map do |term|
    documents.map { |document| document.term_count(term) }
  end

  # NOTE(review): `initialize_matrix`, `sum` and `column_size` are not defined
  # in this listing; presumably they come from MatrixMethods — confirm.
  @matrix = initialize_matrix(counts)

  # Skip the division entirely for an empty corpus.
  @average_document_size = documents.empty? ? 0 : sum / column_size.to_f
end
Instance Attribute Details
#average_document_size ⇒ Object (readonly)
The average number of tokens in a document.
11 12 13 |
# File 'lib/tf-idf-similarity/term_count_model.rb', line 11

# The average number of tokens in a document.
#
# @return the value stored in `@average_document_size`
def average_document_size
  @average_document_size
end
#documents ⇒ Object (readonly)
The documents in the corpus.
7 8 9 |
# File 'lib/tf-idf-similarity/term_count_model.rb', line 7

# The documents in the corpus.
#
# @return the value stored in `@documents`
def documents
  @documents
end
#terms ⇒ Object (readonly)
The set of terms in the corpus.
9 10 11 |
# File 'lib/tf-idf-similarity/term_count_model.rb', line 9

# The set of terms in the corpus.
#
# @return the value stored in `@terms`
def terms
  @terms
end
Instance Method Details
#document_count(term) ⇒ Integer
34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 |
# File 'lib/tf-idf-similarity/term_count_model.rb', line 34

# Counts the documents in which a term appears, i.e. the nonzero entries in
# the term's matrix row.
#
# @param term the term to look up in `terms`
# @return [Integer] the number of documents the term appears in; 0 when the
#   term is not in the corpus
def document_count(term)
  index = terms.index(term)
  # A term missing from the corpus has no row to inspect.
  return 0 unless index

  case @library
  when :gsl, :narray
    row(index).where.size
  when :nmatrix
    row(index).each.count(&:nonzero?)
  else
    vector = row(index)
    # Fall back to an Array when the row object cannot count on its own.
    vector = vector.to_a unless vector.respond_to?(:count)
    vector.count(&:nonzero?)
  end
end
#term_count(term) ⇒ Integer
56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 |
# File 'lib/tf-idf-similarity/term_count_model.rb', line 56

# Totals a term's occurrences across the corpus by summing the term's matrix
# row.
#
# @param term the term to look up in `terms`
# @return [Integer] the number of times the term appears in the corpus; 0
#   when the term is not in the corpus
def term_count(term)
  index = terms.index(term)
  # A term missing from the corpus has no row to sum.
  return 0 unless index

  case @library
  when :gsl, :narray
    row(index).sum
  when :nmatrix
    row(index).each.reduce(0, :+) # NMatrix's `sum` method is slower
  else
    vector = row(index)
    # Fall back to an Array when the row object cannot reduce on its own.
    vector = vector.to_a unless vector.respond_to?(:reduce)
    vector.reduce(0, :+)
  end
end