Module: Simhash
Constant Summary collapse
- HASHBITS = 64
- OPTIONS = { min_token_length: 1, unique: false, stemming: false, stop_words: [] }.freeze
- VERSION = '0.0.4'.freeze
Instance Method Summary collapse
- #generate(str, options = {}) ⇒ Object
- #generate_from_tokens(tokens, options = {}) ⇒ Object
- #hamming_distance(simhash1, simhash2) ⇒ Object
- #hash_similarity(left, right) ⇒ Object
- #similarity(string1, string2, options = {}) ⇒ Object
Instance Method Details
#generate(str, options = {}) ⇒ Object
19 20 21 22 23 |
# File 'lib/simhash2.rb', line 19
# Builds a simhash for a string by splitting it on runs of whitespace and
# handing the resulting tokens to #generate_from_tokens.
#
# @param str [String] text to split into tokens
# @param options [Hash] passed through unchanged to #generate_from_tokens
# @return [Object] whatever #generate_from_tokens returns for the tokens
def generate(str, options = {})
  # The split is how we get our tokens (or shingles); adjust it here if
  # shingles are wanted instead of whitespace-delimited words.
  generate_from_tokens(str.split(/\s+/), options)
end
#generate_from_tokens(tokens, options = {}) ⇒ Object
25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 |
# File 'lib/simhash2.rb', line 25
# Combines the per-token hashes of +tokens+ into a single HASHBITS-wide
# integer fingerprint.
#
# Every token yielded by filter_tokens is hashed with simple_string_hash;
# for each bit position the tally goes up by one when the token's hash has
# that bit set and down by one when it does not. The result has bit i set
# wherever the final tally for i is non-negative.
#
# @param tokens [Array] tokens handed to filter_tokens
# @param options [Hash] overrides merged on top of OPTIONS before filtering
# @return [Integer] fingerprint built from the per-bit tallies
def generate_from_tokens(tokens, options = {})
  tallies = Array.new(HASHBITS, 0)

  filter_tokens(tokens, OPTIONS.merge(options)) do |token|
    token_hash = simple_string_hash(token, HASHBITS)
    # warn "simple_string_hash (for: #{token.inspect}): #{token_hash.inspect}"

    # Integer#[] reads bit i directly, so no precomputed mask table is needed.
    # NOTE(review): assumes simple_string_hash returns an Integer - confirm.
    HASHBITS.times { |i| tallies[i] += token_hash[i].zero? ? -1 : 1 }
  end

  # Ties count as "set" (>= 0), so an input that yields no tokens produces
  # a fingerprint with every bit set.
  HASHBITS.times.sum { |i| tallies[i] >= 0 ? 1 << i : 0 }
end
#hamming_distance(simhash1, simhash2) ⇒ Object
45 46 47 |
# File 'lib/simhash2.rb', line 45
# Counts the bit positions in which two simhashes differ.
#
# Both arguments are converted with #to_i before being XOR-ed; the '1'
# characters in the binary string form of the XOR result are then counted.
#
# @param simhash1 [#to_i] first fingerprint
# @param simhash2 [#to_i] second fingerprint
# @return [Integer] number of '1' digits in the XOR of the two values
def hamming_distance(simhash1, simhash2)
  differing_bits = simhash1.to_i ^ simhash2.to_i
  differing_bits.to_s(2).count('1')
end
#hash_similarity(left, right) ⇒ Object
49 50 51 |
# File 'lib/simhash2.rb', line 49
# Converts the Hamming distance between two simhashes into a similarity
# score: 1.0 minus the distance divided by HASHBITS.
#
# @param left [Object] first fingerprint, passed to #hamming_distance
# @param right [Object] second fingerprint, passed to #hamming_distance
# @return [Float] 1.0 when #hamming_distance is 0, lower as it grows
def hash_similarity(left, right)
  1.0 - (hamming_distance(left, right).to_f / HASHBITS)
end
#similarity(string1, string2, options = {}) ⇒ Object
15 16 17 |
# File 'lib/simhash2.rb', line 15
# Compares two strings by generating a simhash for each with the same
# options and scoring the pair with #hash_similarity.
#
# @param string1 [String] first text, passed to #generate
# @param string2 [String] second text, passed to #generate
# @param options [Hash] forwarded unchanged to both #generate calls
# @return [Object] the value returned by #hash_similarity
def similarity(string1, string2, options = {})
  hash_similarity(generate(string1, options), generate(string2, options))
end