Class: Minwise::Minhash

Inherits:
Object
  • Object
show all
Defined in:
lib/minwise/minhash.rb,
ext/minwise/minwise.c

Overview

The classic minhash algorithm.

The minhash digests of two arrays will share approximately the same proportion of elements as the underlying arrays. This is useful for finding similar items in large datasets.

The interface is similar to classes in the Digest module of the standard library.

Minwise::Minhash.digest([1, 2, 3])
# => [1005141192, 713750329, 346603495, ...]

String inputs are first converted to an array of fixed width chunks of characters called “shingles”. For example ‘“Chunky”` with a shingle size of 3 would become `[“Chu”, “hun”, “unk”, “nky”]`, then that array is used to generate the minhash. The shingle_size option controls the chunk width used.

Minwise::Minhash.digest("Chunky bacon", shingle_size: 3)
# => [437974493, 147728091, 1185236492, ...]

The size of the output hash_size and seed for generating the hash can also be set in options. Larger minhashes will give more accurate estimates of similarity between items, but are slower to generate and take more space.

Minwise::Minhash.digest("Chunky bacon", hash_size: 900, seed: 84)
# => [355390344, 825885127, 262059926, ...]

When comparing two minhashes all the options used to generate the minhashes must be identical for the comparison to be meaningful.

Detailed information on how the minhash algorithm works and how minhashes can be used can be found in “Chapter 3: Finding Similar Items” of the book “Mining of Massive Datasets”, by Leskovec, Rajaraman, and Ulman, available for free at www.mmds.org/.

Constant Summary collapse

DEFAULT_OPTIONS =
{
  hash_size: 128,
  shingle_size: 5,
  seed: 3_141_592
}.freeze

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(data = [], options = {}) ⇒ Minhash

Returns a new instance of Minhash.



51
52
53
54
# File 'lib/minwise/minhash.rb', line 51

def initialize(data = [], options = {})
  @options = DEFAULT_OPTIONS.merge(options)
  @data = parse(data)
end

Class Method Details

.__hash(rb_set, rb_hash_len, rb_hash_seed) ⇒ Object



41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# File 'ext/minwise/minwise.c', line 41

static VALUE c_minhash(VALUE self, VALUE rb_set, VALUE rb_hash_len, VALUE rb_hash_seed) {
  Check_Type(rb_set, T_ARRAY);

  size_t rb_set_len = RARRAY_LEN(rb_set);
  size_t c_hash_len = NUM2SIZET(rb_hash_len);
  uint64_t c_hash_seed = NUM2ULONG(rb_hash_seed);

  uint32_t *c_set = (uint32_t *)malloc(rb_set_len * sizeof(uint32_t));
  for (size_t i = 0; i < rb_set_len; i++) {
    c_set[i] = NUM2UINT(rb_ary_entry(rb_set, i));
  }

  uint32_t *c_hash = (uint32_t *)malloc(c_hash_len * sizeof(uint32_t));
  minhash(c_set, rb_set_len, c_hash, c_hash_len, c_hash_seed);

  VALUE rb_hash = rb_ary_new_capa(c_hash_len);
  for (size_t i = 0; i < c_hash_len; i++) {
    rb_ary_store(rb_hash, i, UINT2NUM(c_hash[i]));
  }

  free(c_set);
  free(c_hash);

  return rb_hash;
}

.__tokenize(rb_string, rb_shingle_size) ⇒ Object



79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# File 'ext/minwise/minwise.c', line 79

static VALUE c_tokenize(VALUE self, VALUE rb_string, VALUE rb_shingle_size) {
  char *c_string = StringValueCStr(rb_string);
  size_t c_string_len = RSTRING_LEN(rb_string);
  size_t c_shingle_size = NUM2SIZET(rb_shingle_size);

  if (c_string_len <= c_shingle_size) {
    VALUE rb_tokens = rb_ary_new_capa(1);
    rb_ary_store(rb_tokens, 0, UINT2NUM(fnv1a(c_string)));

    return rb_tokens;
  }

  size_t rb_tokens_len = c_string_len - c_shingle_size + 1;
  VALUE rb_tokens = rb_ary_new_capa(rb_tokens_len);

  char buffer[c_shingle_size + 1];
  for (size_t i = 0; i < rb_tokens_len; i++) {
    for (size_t j = 0; j < c_shingle_size; j++) {
      buffer[j] = c_string[i + j];
    }
    buffer[c_shingle_size + 1] = 0;

    rb_ary_store(rb_tokens, i, UINT2NUM(fnv1a(buffer)));
  }

  return rb_tokens;
}

.digest(data, options = {}) ⇒ Object



47
48
49
# File 'lib/minwise/minhash.rb', line 47

def self.digest(data, options = {})
  new(data, options).digest
end

Instance Method Details

#digestObject

Raises:

  • (ArgumentError)


60
61
62
63
64
# File 'lib/minwise/minhash.rb', line 60

def digest
  raise ArgumentError, "input must not be empty" if @data.empty?

  self.class.__hash(@data, @options[:hash_size], @options[:seed])
end

#update(element) ⇒ Object



56
57
58
# File 'lib/minwise/minhash.rb', line 56

def update(element)
  @data << element
end