Class: Reckon::CosineSimilarity
- Inherits:
-
Object
- Object
- Reckon::CosineSimilarity
- Defined in:
- lib/reckon/cosine_similarity.rb
Instance Method Summary collapse
- #add_document(account, doc) ⇒ Object
-
#find_similar(query) ⇒ Object
Finds the documents most similar to the query.
-
#initialize(options) ⇒ CosineSimilarity
constructor
A new instance of CosineSimilarity.
Constructor Details
#initialize(options) ⇒ CosineSimilarity
Returns a new instance of CosineSimilarity.
8 9 10 11 12 |
# File 'lib/reckon/cosine_similarity.rb', line 8

# Build an empty similarity index.
#
# @param options [Hash] caller-supplied settings, stored as-is in @options.
#   Optional, so zero-argument construction keeps working.
def initialize(options = {})
  @options = options
  # token => { account => count }. Kept as its own Hash so that recording
  # tokens can never leak entries into @options.
  @tokens = {}
  # account => total token count; accounts not yet seen read as 0.
  @accounts = Hash.new(0)
end
Instance Method Details
#add_document(account, doc) ⇒ Object
14 15 16 17 18 19 20 21 22 23 |
# File 'lib/reckon/cosine_similarity.rb', line 14

# Fold the tokens of +doc+ into the running counts kept for +account+.
#
# Each element yielded by tokenize(doc) is destructured as a
# (token, count) pair; the count is added both to the per-token,
# per-account tally in @tokens and to the account's total in @accounts.
#
# @param account the key the document's tokens are counted under
# @param doc the document, passed unchanged to tokenize
# @return whatever tokenize(doc) returned (the receiver of #each)
def add_document(account, doc)
  tokenize(doc).each do |token, count|
    per_account = (@tokens[token] ||= {})
    per_account[account] ||= 0
    per_account[account] += count
    @accounts[account] += count
  end
end
#find_similar(query) ⇒ Object
Finds the documents most similar to the query.
26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 |
# File 'lib/reckon/cosine_similarity.rb', line 26

# Find the accounts most similar to +query+, best match first.
#
# Similarity is the cosine of the angle between the query's score vector
# and each account's score vector:
#
#   cos(theta) = (A . B) / (||A|| ||B||)
#
# where A . B is the dot product and ||A|| is the magnitude of A. The
# value lies between -1 and 1, where 1 means the same direction.
# See https://en.wikipedia.org/wiki/Cosine_similarity
#
# @param query the query, passed unchanged to td_idf_scores_for
# @return [Array<Hash>] one { similarity:, account: } hash per account
#   whose similarity is greater than 0, sorted by descending similarity
def find_similar(query)
  # td_idf_scores_for is used here as returning a pair: the query's scores
  # and an enumerable of (account, scores) pairs for the corpus.
  query_scores, corpus_scores = td_idf_scores_for(query)

  # Ruby's 'matrix' library supplies the vector arithmetic.
  query_vector = Vector.elements(query_scores, false)
  # The query's magnitude is the same for every account, so compute it once.
  query_magnitude = query_vector.magnitude

  scored = corpus_scores.map do |account, scores|
    acct_vector = Vector.elements(scores, false)
    dot_product = acct_vector.inner_product(query_vector)

    {
      similarity: dot_product / (acct_vector.magnitude * query_magnitude),
      account: account,
    }
  end

  # Keep only positive matches. With Float magnitudes a zero-length vector
  # gives 0 / 0.0 = NaN, which fails the `> 0` test and is dropped here too.
  suggestions = scored.select { |s| s[:similarity] > 0 }
                      .sort_by { |s| -s[:similarity] }

  LOGGER.info "most similar accounts: #{suggestions}"

  suggestions
end