Class: Document

Inherits:
Object
  • Object
show all
Defined in:
lib/similarity/document.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(hash_args) ⇒ Document

Returns a new instance of Document.



4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
# File 'lib/similarity/document.rb', line 4

# Builds a document from an options hash.
#
# @param hash_args [Hash] options; +:content+ holds the document text
#   (required) and +:id+ an optional identifier
# @raise [ArgumentError] if +:content+ is nil (or otherwise falsy) or empty
def initialize(hash_args)
  content = hash_args[:content]
  raise ArgumentError, "text cannot be nil or blank" if !content || content.empty?

  @content = content
  # Memoization slots, filled lazily by #term_frequencies and #terms.
  # The cache read by #term_frequencies is @term_frequencies (plural).
  @term_frequencies = nil
  @terms = nil

  # Fall back to this object's own object_id when no id is supplied.
  @id = hash_args[:id] || object_id
end

Instance Attribute Details

#content ⇒ Object (readonly)

Returns the value of attribute content.



2
3
4
# File 'lib/similarity/document.rb', line 2

# Read-only accessor for the content attribute.
#
# @return [Object] the current value of @content
def content
  @content
end

#id ⇒ Object (readonly)

Returns the value of attribute id.



2
3
4
# File 'lib/similarity/document.rb', line 2

# Read-only accessor for the id attribute.
#
# @return [Object] the current value of @id
def id
  @id
end

Instance Method Details

#calculate_term_frequencies ⇒ Object



32
33
34
35
36
37
38
39
40
41
42
43
# File 'lib/similarity/document.rb', line 32

# Computes the relative frequency of every term returned by #terms.
#
# @return [Hash] each distinct term mapped to its occurrence count divided
#   by the total number of terms (a Float)
def calculate_term_frequencies
  occurrences = Hash.new(0)
  terms.each { |word| occurrences[word] += 1 }

  term_total = terms.size.to_f

  # Build a plain hash (no default value) in first-seen key order.
  frequencies = {}
  occurrences.each { |word, count| frequencies[word] = count / term_total }
  frequencies
end

#has_term?(term) ⇒ Boolean

Returns:

  • (Boolean)


53
54
55
# File 'lib/similarity/document.rb', line 53

# Reports whether the given term is one of this document's terms.
# The comparison is exact: the argument is matched as given against the
# entries returned by #terms.
#
# @param term [Object] the term to look for
# @return [Boolean] true if #terms includes the term
def has_term?(term)
  terms.include?(term)
end

#term_frequencies ⇒ Object



28
29
30
# File 'lib/similarity/document.rb', line 28

# Memoized result of #calculate_term_frequencies.
#
# @return [Object] the cached value, computed on first call
def term_frequencies
  return @term_frequencies if @term_frequencies

  @term_frequencies = calculate_term_frequencies
end

#term_frequency(term) ⇒ Object



45
46
47
48
49
50
51
# File 'lib/similarity/document.rb', line 45

# Looks up the frequency of a single term.
#
# @param term [Object] key to look up in #term_frequencies
# @return [Object] the stored frequency, or 0 when the lookup yields nothing
def term_frequency(term)
  term_frequencies[term] || 0
end

#terms ⇒ Object



22
23
24
25
26
# File 'lib/similarity/document.rb', line 22

# Splits the content into lower-cased terms (memoized).
#
# Every run of digits and non-word characters (whitespace included) is
# collapsed to a single space before splitting, so only runs of letters and
# underscores survive as terms.
#
# @return [Array<String>] the lower-cased terms in order of appearance
def terms
  # split(' ') ignores leading whitespace, so content that starts with a
  # digit, space or punctuation does not yield a spurious "" term.
  @terms ||= @content.gsub(/[\d\W]+/, ' ').split(' ').map { |term| term.downcase }
end