Class: Epitome::Corpus

Inherits:
Object
  • Object
show all
Defined in:
lib/epitome/corpus.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(document_collection, lang = "en") ⇒ Corpus

Returns a new instance of Corpus.



8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# File 'lib/epitome/corpus.rb', line 8

# Builds a corpus from a collection of documents.
#
# @param document_collection [#each] yields objects that respond to +id+ and +text+
# @param lang [String] language used to initialize the stopword list
def initialize(document_collection, lang="en")
  # Language used to initialize the stopword list
  @lang = lang

  # Raw document texts, keyed by document id
  @original_corpus = {}
  document_collection.each do |doc|
    @original_corpus[doc.id] = doc.text
  end

  # Same keys as the original corpus, each text passed through #clean
  @clean_corpus = {}
  @original_corpus.each_pair do |doc_id, raw_text|
    @clean_corpus[doc_id] = clean(raw_text)
  end

  # Term-frequency dictionary for each word (starts empty),
  # kept to avoid unnecessary computations
  @word_tf_doc = {}

  # Flattened list of the original corpus values
  @sentences = @original_corpus.values.flatten

  # Number of documents in the corpus
  @n_docs = @original_corpus.size
end

Instance Attribute Details

#original_corpus ⇒ Object (readonly)

Returns the value of attribute original_corpus.



7
8
9
# File 'lib/epitome/corpus.rb', line 7

# Read-only accessor for the hash the constructor fills with each
# document's text, keyed by that document's id.
#
# The stored hash itself is returned (not a copy), so mutating the
# result also mutates the corpus's internal state.
#
# @return [Hash] document id => document text
def original_corpus
  @original_corpus
end

Instance Method Details

#summary(summary_length, threshold = 0.2) ⇒ Object



32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# File 'lib/epitome/corpus.rb', line 32

# Ranks the corpus sentences and returns the top-scoring ones.
#
# A 0/1 adjacency matrix is built over the flattened clean corpus: two
# entries are linked when their idf_modified_cosine exceeds +threshold+.
# Each row is then divided by its number of links and the resulting
# matrix is handed to power_method to obtain one score per entry.
#
# @param summary_length [Integer] maximum number of sentences to return
# @param threshold [Numeric] similarity above which two sentences count as linked
# @return [Array] up to +summary_length+ entries of @sentences, highest score first
def summary(summary_length, threshold=0.2)
  cleaned = @clean_corpus.values.flatten
  # Number of sentences in the whole corpus
  count = cleaned.size

  # Number of links found for each sentence, accumulated while the
  # adjacency matrix below is being built
  link_counts = Array.new(count, 0.0)

  # Square 0/1 matrix: 1.0 where the pair is more similar than the threshold
  adjacency = Matrix.build(count) do |row, col|
    linked = idf_modified_cosine(cleaned[row], cleaned[col]) > threshold
    link_counts[row] += 1.0 if linked
    linked ? 1.0 : 0.0
  end

  # Row-normalised similarity matrix; rows without any link stay all-zero
  transition = Matrix.build(count) do |row, col|
    if link_counts[row] == 0
      0.0
    else
      adjacency[row, col] / link_counts[row]
    end
  end

  # Random walk a la PageRank, computed with the power method
  scores = power_method(transition, count, 0.85)

  # Pair sentences with scores by position. Going through a Hash means
  # identical sentences collapse into one entry carrying the last score.
  # NOTE(review): assumes @sentences lines up one-to-one with the
  # flattened clean corpus - confirm against #clean.
  scored = @sentences.zip(scores).to_h
  ranked = scored.sort_by { |_sentence, score| score }.reverse
  ranked.first(summary_length).map(&:first)
end