Class: Minwise::Minhash
- Inherits:
-
Object
- Object
- Minwise::Minhash
- Defined in:
- lib/minwise/minhash.rb,
ext/minwise/minwise.c
Overview
The classic minhash algorithm.
The minhash digests of two arrays will share approximately the same proportion of elements as the underlying arrays. This is useful for finding similar items in large datasets.
The interface is similar to classes in the Digest module of the standard library.
Minwise::Minhash.digest([1, 2, 3])
# => [1005141192, 713750329, 346603495, ...]
String inputs are first converted to an array of fixed width chunks of characters called “shingles”. For example ‘“Chunky”` with a shingle size of 3 would become `[“Chu”, “hun”, “unk”, “nky”]`, then that array is used to generate the minhash. The shingle_size option controls the chunk width used.
Minwise::Minhash.digest("Chunky bacon", shingle_size: 3)
# => [437974493, 147728091, 1185236492, ...]
The size of the output hash_size and seed for generating the hash can also be set in options. Larger minhashes will give more accurate estimates of similarity between items, but are slower to generate and take more space.
Minwise::Minhash.digest("Chunky bacon", hash_size: 900, seed: 84)
# => [355390344, 825885127, 262059926, ...]
When comparing two minhashes all the options used to generate the minhashes must be identical for the comparison to be meaningful.
Detailed information on how the minhash algorithm works and how minhashes can be used can be found in “Chapter 3: Finding Similar Items” of the book “Mining of Massive Datasets”, by Leskovec, Rajaraman, and Ulman, available for free at www.mmds.org/.
Constant Summary collapse
- DEFAULT_OPTIONS =
{ hash_size: 128, shingle_size: 5, seed: 3_141_592 }.freeze
Class Method Summary collapse
- .__hash(rb_set, rb_hash_len, rb_hash_seed) ⇒ Object
- .__tokenize(rb_string, rb_shingle_size) ⇒ Object
- .digest(data, options = {}) ⇒ Object
Instance Method Summary collapse
- #digest ⇒ Object
-
#initialize(data = [], options = {}) ⇒ Minhash
constructor
A new instance of Minhash.
- #update(element) ⇒ Object
Constructor Details
#initialize(data = [], options = {}) ⇒ Minhash
Returns a new instance of Minhash.
51 52 53 54 |
# File 'lib/minwise/minhash.rb', line 51 def initialize(data = [], = {}) @options = DEFAULT_OPTIONS.merge() @data = parse(data) end |
Class Method Details
.__hash(rb_set, rb_hash_len, rb_hash_seed) ⇒ Object
41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
# File 'ext/minwise/minwise.c', line 41
static VALUE c_minhash(VALUE self, VALUE rb_set, VALUE rb_hash_len, VALUE rb_hash_seed) {
Check_Type(rb_set, T_ARRAY);
size_t rb_set_len = RARRAY_LEN(rb_set);
size_t c_hash_len = NUM2SIZET(rb_hash_len);
uint64_t c_hash_seed = NUM2ULONG(rb_hash_seed);
uint32_t *c_set = (uint32_t *)malloc(rb_set_len * sizeof(uint32_t));
for (size_t i = 0; i < rb_set_len; i++) {
c_set[i] = NUM2UINT(rb_ary_entry(rb_set, i));
}
uint32_t *c_hash = (uint32_t *)malloc(c_hash_len * sizeof(uint32_t));
minhash(c_set, rb_set_len, c_hash, c_hash_len, c_hash_seed);
VALUE rb_hash = rb_ary_new_capa(c_hash_len);
for (size_t i = 0; i < c_hash_len; i++) {
rb_ary_store(rb_hash, i, UINT2NUM(c_hash[i]));
}
free(c_set);
free(c_hash);
return rb_hash;
}
|
.__tokenize(rb_string, rb_shingle_size) ⇒ Object
79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 |
# File 'ext/minwise/minwise.c', line 79
static VALUE c_tokenize(VALUE self, VALUE rb_string, VALUE rb_shingle_size) {
char *c_string = StringValueCStr(rb_string);
size_t c_string_len = RSTRING_LEN(rb_string);
size_t c_shingle_size = NUM2SIZET(rb_shingle_size);
if (c_string_len <= c_shingle_size) {
VALUE rb_tokens = rb_ary_new_capa(1);
rb_ary_store(rb_tokens, 0, UINT2NUM(fnv1a(c_string)));
return rb_tokens;
}
size_t rb_tokens_len = c_string_len - c_shingle_size + 1;
VALUE rb_tokens = rb_ary_new_capa(rb_tokens_len);
char buffer[c_shingle_size + 1];
for (size_t i = 0; i < rb_tokens_len; i++) {
for (size_t j = 0; j < c_shingle_size; j++) {
buffer[j] = c_string[i + j];
}
buffer[c_shingle_size + 1] = 0;
rb_ary_store(rb_tokens, i, UINT2NUM(fnv1a(buffer)));
}
return rb_tokens;
}
|
.digest(data, options = {}) ⇒ Object
47 48 49 |
# File 'lib/minwise/minhash.rb', line 47 def self.digest(data, = {}) new(data, ).digest end |
Instance Method Details
#digest ⇒ Object
60 61 62 63 64 |
# File 'lib/minwise/minhash.rb', line 60 def digest raise ArgumentError, "input must not be empty" if @data.empty? self.class.__hash(@data, @options[:hash_size], @options[:seed]) end |
#update(element) ⇒ Object
56 57 58 |
# File 'lib/minwise/minhash.rb', line 56 def update(element) @data << element end |