Class: Document
- Inherits:
  Object
- Inheritance chain:
  Object → Document
- Defined in:
- lib/similarity/document.rb
Instance Attribute Summary
-
#content ⇒ Object
readonly
Returns the value of attribute content.
-
#id ⇒ Object
readonly
Returns the value of attribute id.
Instance Method Summary
- #calculate_term_frequencies ⇒ Object
- #has_term?(term) ⇒ Boolean
-
#initialize(hash_args) ⇒ Document
constructor
A new instance of Document.
- #term_frequencies ⇒ Object
- #term_frequency(term) ⇒ Object
- #terms ⇒ Object
Constructor Details
#initialize(hash_args) ⇒ Document
Returns a new instance of Document.
4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 |
# File 'lib/similarity/document.rb', line 4
#
# Builds a document from an options hash.
#
# @param hash_args [Hash] options; +:content+ and +:id+ are read from it
# @option hash_args [#empty?] :content document text; must be present and non-empty
# @option hash_args [Object] :id identifier; falls back to this object's +object_id+
# @raise [ArgumentError] when +:content+ is nil (or false) or empty
def initialize(hash_args)
  content = hash_args[:content]
  raise ArgumentError, "text cannot be nil or blank" if !content || content.empty?

  @content = content
  # Memoization slots, filled lazily by #terms and #term_frequencies.
  # The names must match the ones those methods use with `||=`.
  @terms = nil
  @term_frequencies = nil

  @id = hash_args[:id] || object_id
end
Instance Attribute Details
#content ⇒ Object (readonly)
Returns the value of attribute content.
2 3 4 |
# File 'lib/similarity/document.rb', line 2
#
# Reader for the content attribute.
#
# @return [Object] the current value of @content
def content
  @content
end
#id ⇒ Object (readonly)
Returns the value of attribute id.
2 3 4 |
# File 'lib/similarity/document.rb', line 2
#
# Reader for the id attribute.
#
# @return [Object] the current value of @id
def id
  @id
end
Instance Method Details
#calculate_term_frequencies ⇒ Object
32 33 34 35 36 37 38 39 40 41 42 43 |
# File 'lib/similarity/document.rb', line 32
#
# Computes the relative frequency of every term returned by #terms:
# the number of occurrences of the term divided by the total number of terms.
#
# @return [Hash{Object => Float}] term => frequency, keyed in order of first
#   appearance; empty when there are no terms. Looking up a term that does
#   not occur yields nil (the result has no default value).
def calculate_term_frequencies
  counts = Hash.new(0)
  terms.each { |term| counts[term] += 1 }

  total_number_of_terms = terms.size.to_f

  # Build a plain hash instead of rewriting `counts` while iterating it, so the
  # returned hash does not carry the counting default of 0.
  frequencies = {}
  counts.each { |term, count| frequencies[term] = count / total_number_of_terms }
  frequencies
end
#has_term?(term) ⇒ Boolean
53 54 55 |
# File 'lib/similarity/document.rb', line 53
#
# Tells whether the given term is among the values returned by #terms.
#
# @param term [Object] the term to look for
# @return [Boolean] true if #terms includes +term+, false otherwise
def has_term?(term)
  terms.include?(term)
end
#term_frequencies ⇒ Object
28 29 30 |
# File 'lib/similarity/document.rb', line 28
#
# Memoized result of #calculate_term_frequencies, cached in @term_frequencies.
#
# @return [Object] whatever #calculate_term_frequencies returned the first
#   time a truthy value was produced
def term_frequencies
  return @term_frequencies if @term_frequencies

  @term_frequencies = calculate_term_frequencies
end
#term_frequency(term) ⇒ Object
45 46 47 48 49 50 51 |
# File 'lib/similarity/document.rb', line 45
#
# Looks up a single term in #term_frequencies.
#
# @param term [Object] the term to look up
# @return [Object, Integer] the stored frequency for +term+, or 0 when the
#   lookup yields nil
def term_frequency(term)
  term_frequencies[term] || 0
end
#terms ⇒ Object
22 23 24 25 26 |
# File 'lib/similarity/document.rb', line 22
#
# Lower-cased tokens of @content, memoized in @terms. Every run of digits,
# whitespace and non-word characters is treated as a single separator.
#
# @return [Array<String>] tokens in order of appearance, duplicates kept;
#   never contains an empty string
def terms
  # split(' ') ignores leading whitespace. The previous split(/\s/) produced a
  # spurious "" term whenever the content began with a separator character.
  @terms ||= @content.gsub(/(\d|\s|\W)+/, ' ').split(' ').map(&:downcase)
end