Module: Simhash
Constant Summary collapse
- HASHBITS =
64
- OPTIONS =
{ min_token_length: 1, unique: false, stemming: false, stop_words: [] }.freeze
- VERSION =
'0.0.2'.freeze
Instance Method Summary collapse
- #generate(str, options = {}) ⇒ Object
- #generate_from_tokens(tokens, options = {}) ⇒ Object
- #hamming_distance(simhash1, simhash2) ⇒ Object
Instance Method Details
#generate(str, options = {}) ⇒ Object
16 17 18 |
# File 'lib/simhash2.rb', line 16
#
# Computes a simhash for a string by splitting it on runs of whitespace and
# handing the resulting tokens to #generate_from_tokens.
#
# Note that a string with leading whitespace yields an empty first token,
# because String#split with a regexp keeps leading empty fields.
#
# @param str [String] text to fingerprint
# @param options [Hash] forwarded unchanged to #generate_from_tokens
# @return [Object] whatever #generate_from_tokens returns
def generate(str, options = {})
  generate_from_tokens(str.split(/\s+/), options)
end
#generate_from_tokens(tokens, options = {}) ⇒ Object
20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 |
# File 'lib/simhash2.rb', line 20
#
# Builds a HASHBITS-wide simhash from a list of tokens.
#
# Each token is hashed with simple_string_hash; for every bit position the
# method adds +1 when the token hash has that bit set and -1 when it does
# not. A bit of the result is set when its accumulated weight is zero or
# positive, so ties (and an empty token list) produce set bits.
#
# @param tokens [Array] tokens to fingerprint
# @param options [Hash] overrides merged on top of OPTIONS before filtering
# @return [Integer] the simhash, with bit i set when weight i is >= 0
def generate_from_tokens(tokens, options = {})
  # The return value is discarded, so filtering only takes effect if
  # filter_tokens mutates `tokens` in place.
  # NOTE(review): filter_tokens is not visible here -- confirm it is destructive.
  filter_tokens(tokens, OPTIONS.merge(options))

  v = [0] * HASHBITS
  # One single-bit mask per position, built once and reused for every token.
  masks = Array.new(HASHBITS) { |i| 1 << i }

  hashes = tokens.map { |token| simple_string_hash(token, HASHBITS) }
  hashes.each do |h|
    HASHBITS.times do |i|
      v[i] += (h & masks[i]).zero? ? -1 : 1
    end
  end

  simhash = 0
  HASHBITS.times { |i| simhash += masks[i] if v[i] >= 0 }
  simhash
end
#hamming_distance(simhash1, simhash2) ⇒ Object
41 42 43 |
# File 'lib/simhash2.rb', line 41
#
# Counts the bits that differ between two simhash values.
#
# Both arguments are coerced with #to_i, XORed, and the '1' characters in
# the binary string form of the result are counted.
#
# NOTE(review): for a negative XOR result Integer#to_s(2) renders a sign
# plus the magnitude, so the count then describes the magnitude only --
# presumably inputs are non-negative simhashes; confirm with callers.
#
# @param simhash1 [#to_i] first fingerprint
# @param simhash2 [#to_i] second fingerprint
# @return [Integer] number of '1' digits in the binary form of the XOR
def hamming_distance(simhash1, simhash2)
  (simhash1.to_i ^ simhash2.to_i).to_s(2).count('1')
end