Module: Simhash

Extended by:
Simhash
Included in:
Simhash
Defined in:
lib/simhash2.rb,
lib/simhash2/version.rb

Constant Summary collapse

# Width of the fingerprint in bits; also passed to simple_string_hash as the
# per-token hash width.
HASHBITS = 64

# Default options. Caller-supplied options are merged over these before being
# handed to filter_tokens.
# NOTE(review): the meaning of each key is decided by filter_tokens, which is
# not shown here -- confirm there before relying on the names alone.
OPTIONS = {
  min_token_length: 1,
  unique: false,
  stemming: false,
  stop_words: []
}.freeze

# Version string of the library.
VERSION = '0.0.2'.freeze

Instance Method Summary collapse

Instance Method Details

#generate(str, options = {}) ⇒ Object



16
17
18
# File 'lib/simhash2.rb', line 16

# Computes a simhash for a string by splitting it on runs of whitespace and
# delegating to generate_from_tokens.
#
# Because the split uses an explicit regexp, a string that starts with
# whitespace produces an empty first token.
#
# @param str [String] text to fingerprint
# @param options [Hash] passed through unchanged to generate_from_tokens
# @return [Object] whatever generate_from_tokens returns
def generate(str, options = {})
  words = str.split(/\s+/)
  generate_from_tokens(words, options)
end

#generate_from_tokens(tokens, options = {}) ⇒ Object



20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# File 'lib/simhash2.rb', line 20

# Builds a HASHBITS-wide simhash fingerprint from an array of tokens.
#
# Every token is hashed with simple_string_hash. For each bit position a
# signed counter is kept: a token hash with that bit set adds one, a token
# hash with it clear subtracts one. The fingerprint has a bit set wherever
# the counter ends up zero or positive, so ties count as set and an empty
# token list yields a value with all HASHBITS bits set.
#
# @param tokens [Array] tokens to fingerprint
# @param options [Hash] overrides merged on top of OPTIONS
# @return [Integer] the fingerprint
def generate_from_tokens(tokens, options = {})
  # The return value is ignored here, so filtering only has an effect if
  # filter_tokens modifies +tokens+ in place.
  # NOTE(review): filter_tokens is not visible from here -- confirm.
  filter_tokens(tokens, OPTIONS.merge(options))

  bit_masks = Array.new(HASHBITS) { |bit| 1 << bit }
  token_hashes = tokens.map { |token| simple_string_hash(token, HASHBITS) }

  # One signed vote counter per bit position.
  weights = Array.new(HASHBITS, 0)
  token_hashes.each do |token_hash|
    bit_masks.each_with_index do |mask, bit|
      weights[bit] += (token_hash & mask).zero? ? -1 : 1
    end
  end

  # Fold the counters back into a single integer, one bit per counter.
  weights.each_with_index.reduce(0) do |fingerprint, (weight, bit)|
    weight >= 0 ? fingerprint | bit_masks[bit] : fingerprint
  end
end

#hamming_distance(simhash1, simhash2) ⇒ Object



41
42
43
# File 'lib/simhash2.rb', line 41

# Counts the bit positions in which two simhash values differ.
#
# Both arguments are coerced with #to_i, so nil and non-numeric strings are
# treated as 0. If the XOR of the two values is negative, the bits of its
# absolute value are counted.
#
# @param simhash1 [#to_i] first fingerprint
# @param simhash2 [#to_i] second fingerprint
# @return [Integer] number of differing bits
def hamming_distance(simhash1, simhash2)
  remaining = (simhash1.to_i ^ simhash2.to_i).abs
  distance = 0

  # Each pass clears the lowest set bit, so the loop runs once per set bit.
  until remaining.zero?
    remaining &= remaining - 1
    distance += 1
  end

  distance
end