Module: Simhash

Extended by:
Simhash
Included in:
Simhash
Defined in:
lib/simhash2.rb,
lib/simhash2/version.rb

Constant Summary

# Number of bit positions in a fingerprint: generate_from_tokens tallies votes
# for this many bits, and hash_similarity divides the Hamming distance by it.
HASHBITS =
64
# Default options. generate_from_tokens merges the caller's options over these
# (caller values win) before passing the result to filter_tokens.
OPTIONS =
{
  min_token_length: 1,
  unique: false,
  stemming: false,
  stop_words: []
}.freeze
# Version string (frozen).
VERSION =
'0.0.4'.freeze

Instance Method Summary

Instance Method Details

#generate(str, options = {}) ⇒ Object



19
20
21
22
23
# File 'lib/simhash2.rb', line 19

# Builds a simhash for a string.
#
# The string is cut into tokens on runs of whitespace and the tokens are
# handed, together with the options, to generate_from_tokens. To hash
# shingles instead of single words, change how the tokens are produced here.
#
# @param str [String] text to fingerprint
# @param options [Hash] passed through unchanged to generate_from_tokens
# @return [Object] whatever generate_from_tokens returns for the tokens
def generate(str, options = {})
  tokens = str.split(/\s+/)
  generate_from_tokens(tokens, options)
end

#generate_from_tokens(tokens, options = {}) ⇒ Object



25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
# File 'lib/simhash2.rb', line 25

# Builds a simhash from an already-tokenized input.
#
# Every token yielded by filter_tokens (called with OPTIONS merged with the
# caller's options) is hashed with simple_string_hash. For each of the
# HASHBITS bit positions the token votes +1 when that bit of its hash is set
# and -1 when it is clear. The result has bit i set wherever the vote total
# for position i is zero or positive, so an input that yields no tokens
# produces a value with all HASHBITS bits set.
#
# @param tokens [Object] token collection handed to filter_tokens
# @param options [Hash] overrides merged on top of OPTIONS
# @return [Integer] the resulting fingerprint
def generate_from_tokens(tokens, options = {})
  votes = Array.new(HASHBITS, 0)

  filter_tokens(tokens, OPTIONS.merge(options)) do |token|
    token_hash = simple_string_hash(token, HASHBITS)

    HASHBITS.times do |bit|
      # Integer#[] reads a single bit (0 or 1) of the token hash.
      votes[bit] += token_hash[bit].zero? ? -1 : 1
    end
  end

  # Ties (a total of exactly zero) count as a set bit.
  votes.each_with_index.sum { |total, bit| total >= 0 ? (1 << bit) : 0 }
end

#hamming_distance(simhash1, simhash2) ⇒ Object



45
46
47
# File 'lib/simhash2.rb', line 45

# Counts the bit positions in which two fingerprints differ.
#
# Both arguments are converted with to_i, XORed, and the '1' characters in
# the binary string form of the result are counted.
#
# @param simhash1 [#to_i] first fingerprint
# @param simhash2 [#to_i] second fingerprint
# @return [Integer] number of '1' digits in the XOR of the two values
def hamming_distance(simhash1, simhash2)
  differing = simhash1.to_i ^ simhash2.to_i
  differing.to_s(2).each_char.count('1')
end

#hash_similarity(left, right) ⇒ Object



49
50
51
# File 'lib/simhash2.rb', line 49

# Converts the Hamming distance between two fingerprints into a similarity
# score: 1.0 minus the distance divided by HASHBITS. Identical fingerprints
# score 1.0.
#
# @param left [Object] first fingerprint, passed to hamming_distance
# @param right [Object] second fingerprint, passed to hamming_distance
# @return [Float] 1.0 - distance / HASHBITS
def hash_similarity(left, right)
  distance = hamming_distance(left, right)
  1.0 - distance.to_f / HASHBITS
end

#similarity(string1, string2, options = {}) ⇒ Object



15
16
17
# File 'lib/simhash2.rb', line 15

# Compares two strings by fingerprint.
#
# Each string is fingerprinted with generate (using the same options, first
# string first) and the two fingerprints are scored with hash_similarity.
#
# @param string1 [String] first text
# @param string2 [String] second text
# @param options [Hash] passed to generate for both strings
# @return [Object] the result of hash_similarity for the two fingerprints
def similarity(string1, string2, options = {})
  first_hash = generate(string1, options)
  second_hash = generate(string2, options)
  hash_similarity(first_hash, second_hash)
end