Class: AhoCorasick::KeywordTree

Inherits:
Object
  • Object
show all
Defined in:
lib/ahocorasick.rb,
ext/ahocorasick/ruby-ahocorasick.c

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ Object

Creates a new KeywordTree

require 'ahocorasick'
kwt = AhoCorasick::KeywordTree.new


79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# File 'ext/ahocorasick/ruby-ahocorasick.c', line 79

/*
 * KeywordTree#initialize
 *
 * Allocates the per-object bookkeeping struct and the underlying
 * Aho-Corasick tree, attaches the struct to +self+ and resets the
 * "@filter" instance variable to nil.
 *
 * Returns self.
 * Raises RuntimeError when the Aho-Corasick tree cannot be allocated.
 */
static VALUE 
rb_kwt_init(VALUE self)
{ 
  AC_STRUCT * tree;
  struct kwt_struct_data *kwt_data;

  kwt_data = ALLOC(struct kwt_struct_data);
  tree     = ac_alloc();
  if(tree == NULL) {
    // Release the wrapper first: nothing references it yet, so raising
    // without freeing it would leak the allocation.
    xfree(kwt_data);
    rb_raise(rb_eRuntimeError, "Cannot allocate the keyword tree");
  }

  kwt_data->tree            = tree;
  kwt_data->last_id         = 1;   // first auto-generated id
  kwt_data->dictionary_size = 0;
  kwt_data->is_frozen       = 0;
  // Publish the struct only once every field is initialised.
  DATA_PTR(self) = kwt_data;

  rb_iv_set( self, "@filter", Qnil );
  return self;
}

Class Method Details

._from_file ⇒ Object

Note: This method is not safe to call directly; use from_file instead.



319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
# File 'ext/ahocorasick/ruby-ahocorasick.c', line 319

/*
 * KeywordTree._from_file(filename)
 *
 * Creates a new instance of +klass+ and adds every line of the file
 * named +filename+ to its tree, one keyword per line.
 *
 * Ids are assigned from the running line counter, starting at 0, and
 * last_id is left at (number of lines read + 1).
 * NOTE(review): add_string only accepts ids >= 1, yet the first line
 * here gets id 0 -- confirm that the underlying library treats id 0
 * as a usable id.
 *
 * Lines are read through a 1024-byte buffer, so a longer line is split
 * into several keywords.
 *
 * Returns the new KeywordTree.
 * Raises RuntimeError when the file cannot be opened.
 */
static VALUE
rb_kwt_new_from_file(int argc, VALUE *argv, VALUE klass)
{ 

  // TODO: 
  //  * use rb_kwt_add_string
  //  * use rb_io* to handle the file

  struct kwt_struct_data *kwt_data;
  char word[1024];
  size_t length;
  int id = 0;
  VALUE self;
  VALUE filename;
  FILE *dictionary;

  rb_scan_args(argc, argv, "10", &filename);
  
  SafeStringValue(filename);
  self= rb_class_new_instance( 0, NULL, klass );
  KeywordTree( self, kwt_data );

  dictionary= fopen( StringValuePtr(filename), "r" );
  if(dictionary == NULL)
    rb_raise(rb_eRuntimeError, "Cannot open `%s\". No such file?", StringValuePtr(filename));

  while(fgets(word, sizeof word, dictionary) != NULL) {
    length = strlen(word);
    // Drop the line terminator only when it is really there: the last
    // line of a file (or an over-long line) may not end in '\n', and
    // blindly subtracting one would cut off a real character.
    if(length > 0 && word[length - 1] == '\n')
      length--;
    // Blank lines carry no keyword; count an entry only when the
    // library reports that it was actually stored (non-zero return).
    if(length > 0 && ac_add_string(kwt_data->tree, word, (int)length, id) != 0)
      kwt_data->dictionary_size++;
    // The id advances once per line read, so ids stay tied to line order.
    id++;
  }

  kwt_data->last_id= id+1;
  fclose(dictionary);
  return self;
}

.from_file(filename) ⇒ Object

Creates a new KeywordTree and loads the dictionary from a file

% cat dict0.txt
foo
bar
base

k= AhoCorasick::KeywordTree.from_file "dict0.txt"
k.find_all("basement").size # => 1


32
33
34
# File 'lib/ahocorasick.rb', line 32

# Builds a new KeywordTree from the dictionary file +filename+ by
# delegating to the native _from_file constructor.
def self.from_file(filename)
  _from_file(filename)
end

Instance Method Details

#add_string(string, id = nil) ⇒ Object Also known as: <<

Adds a sequence to this KeywordTree.

kwt.add_string("foo1$21^ 98N3 ba>Z")
kwt << "bar" # using the alias

Note: you can also specify the id, a number between 1 and k

kwt.add_string "bar", 123 # => 123

This id should be unique in the context of the current tree.

Returns the id of the inserted object.

kwt.add_string("test", 18) # => 18
kwt.add_string("baz") # => 19


241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
# File 'ext/ahocorasick/ruby-ahocorasick.c', line 241

/*
 * KeywordTree#add_string(string, id = nil)
 *
 * Adds +string+ to the tree under +id+. When +id+ is nil the next
 * auto-generated id (last_id) is used; otherwise +id+ must be a
 * Fixnum greater than zero.
 *
 * Returns the id the string was stored under.
 * Raises TypeError when +string+ is not a String.
 * Raises RuntimeError when the tree is frozen, when +id+ is not a
 * positive Fixnum, or when the underlying ac_add_string call fails.
 */
static VALUE
rb_kwt_add_string(int argc, VALUE *argv, VALUE self)
{ 
  VALUE v_string, v_id;
  struct kwt_struct_data *kwt_data;
  int id;

  rb_scan_args(argc, argv, "11", &v_string, &v_id);
 
  Check_Type(v_string, T_STRING);
  KeywordTree(self, kwt_data);

  if(kwt_data->is_frozen == 1)
    rb_raise(rb_eRuntimeError, "Cannot add `%s\" into a frozen tree.", StringValuePtr(v_string));

  if(v_id == Qnil) {
    id = kwt_data->last_id;
  } else if(TYPE(v_id) != T_FIXNUM) {
    // Convert the offending value with to_s before formatting it:
    // StringValuePtr on a non-String would raise an unrelated TypeError
    // and hide this message.
    VALUE v_given = rb_obj_as_string(v_id);
    rb_raise(rb_eRuntimeError, "Please use a number from 1 to K as id, or leave nil to auto-generate one. `%s\" given.", StringValuePtr(v_given));
  } else if(NUM2INT(v_id) <= 0) {
    rb_raise(rb_eRuntimeError, "Please use a number from 1 to K as id, or leave nil to auto-generate one. `%d\" given.", NUM2INT(v_id));
  } else {
    id= NUM2INT(v_id);
  }
  // Pass the byte length of the buffer: String#length counts characters,
  // which is shorter than the byte count for multibyte strings and would
  // truncate the keyword.
  if(ac_add_string(kwt_data->tree, StringValuePtr(v_string), (int)RSTRING_LEN(v_string), id) == 0)
    rb_raise(rb_eRuntimeError, "Failed to add `%s\", duplicate id `%d\"?", StringValuePtr(v_string), id);

  // Keep last_id above every id handed out so far; an explicit id lower
  // than last_id must not rewind the counter onto ids already in use.
  if(id >= kwt_data->last_id)
    kwt_data->last_id= id + 1;
  kwt_data->dictionary_size++;
  return INT2FIX(id);
}

#filter ⇒ Object

Returns the filter attached to this KeywordTree, or nil if none has been set.



303
304
305
306
307
308
309
310
311
# File 'ext/ahocorasick/ruby-ahocorasick.c', line 303

/*
 * KeywordTree#filter
 *
 * Returns the value of the "@filter" instance variable of +self+.
 */
static VALUE
rb_kwt_get_filter(VALUE self) {
  struct kwt_struct_data *data;

  // The struct itself is not used here; the lookup is kept as in the
  // other methods (presumably it validates self -- confirm against the
  // KeywordTree macro definition).
  KeywordTree( self, data );

  return rb_iv_get(self, "@filter");
}

#filter=(AhoCorasick::ResultFilter) ⇒ Object

Attach a filter to this KeywordTree.

A filter should extend AhoCorasick::ResultFilter and implement the valid? method.



281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
# File 'ext/ahocorasick/ruby-ahocorasick.c', line 281

/*
 * KeywordTree#filter=(filter)
 *
 * Stores +filter+ in the "@filter" instance variable of +self+.
 *
 * Returns the filter.
 * Raises TypeError unless +filter+ is a kind of the ResultFilter class
 * (so nil is rejected as well).
 */
static VALUE
rb_kwt_set_filter(int argc, VALUE *argv, VALUE self) {
  VALUE v_filter;
  struct kwt_struct_data *data;

  rb_scan_args(argc, argv, "10", &v_filter);

  // Guard: reject anything that does not descend from ResultFilter.
  if(!RTEST(rb_obj_is_kind_of(v_filter, rb_cResultFilter))) {
    rb_raise(rb_eTypeError, "Type mismatch: required %s, %s given.", rb_class2name(rb_cResultFilter), rb_class2name(CLASS_OF(v_filter)));
  }

  KeywordTree( self, data );
  rb_iv_set( self, "@filter", v_filter );
  return v_filter;
}

#find_all(string) ⇒ Object Also known as: search

Search the current tree.

It returns an array of hashes, e.g.

[ { :id => int, :value => string, :starts_at => int, :ends_at => int}, { ... } ]

Or an empty array if it did not find anything.

# assuming a valid KeywordTree kwt object:
kwt.add_string("one")
kwt.add_string("two")

kwt.find_all( "moved two times already" ).each  do | result |
  result[:id] # => 2
  result[:ends_at] # => 9
  result[:starts_at] # => 6
  result[:value] # => two
end # => 1


151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
# File 'ext/ahocorasick/ruby-ahocorasick.c', line 151

/*
 * KeywordTree#find_all(string)
 *
 * Searches +string+ for every keyword stored in the tree.
 *
 * Returns an array with one hash per match, holding the keys
 * :id, :starts_at, :ends_at and :value; the array is empty when
 * nothing matches or when the dictionary is empty. When a filter is
 * set, a match is kept only if filter.valid?(result, remaining_text)
 * does not return false.
 *
 * The tree is frozen (ac_prep) for the search if it was not already,
 * and is marked as not frozen again once the search is done.
 *
 * Raises TypeError when the argument is not a String.
 * Raises RuntimeError when the tree cannot be frozen.
 */
static VALUE
rb_kwt_find_all(int argc, VALUE *argv, VALUE self)
{
  char * remain;        // returned by ac_search, the remaining text to search
  int lgt, id, ends_at; // filled in by ac_search: the length of the result, the id, and the end position
  VALUE v_result;  // one result, as hash
  VALUE v_results; // all the results, an array
  VALUE filter; // filter to be applied to results
  ID id_valid;  // interned "valid?", looked up once for the whole search

  VALUE v_search;  // search string, function argument
  struct kwt_struct_data *kwt_data;
  
  // one mandatory argument.
  rb_scan_args(argc, argv, "1", &v_search);
  // it should be string.
  Check_Type(v_search, T_STRING);

  // get the structure
  KeywordTree(self, kwt_data);
  // prepare the return value
  v_results= rb_ary_new();
  // fail quickly and return the empty array. This must happen before
  // the tree gets frozen: returning afterwards would leave an empty
  // tree frozen and make the next add_string raise.
  if(kwt_data->dictionary_size == 0) 
    return v_results;
  // freeze the tree, if not already
  if(kwt_data->is_frozen == 0) {
    if(ac_prep( kwt_data->tree ) == 0) 
      rb_raise(rb_eRuntimeError, "Cannot freeze the tree!");
    kwt_data->is_frozen = 1;
  }
  // prepare the search; the length is the byte length of the buffer,
  // not String#length, which counts characters and would stop the scan
  // early on multibyte text
  ac_search_init(kwt_data->tree, StringValuePtr(v_search), (int)RSTRING_LEN(v_search));
  // get the filter
  filter= rb_iv_get(self, "@filter");
  id_valid= rb_intern("valid?");
  // loop through the results
  while((remain= ac_search(kwt_data->tree, &lgt, &id, &ends_at)) != NULL) {
    // this is an individual result as a hash
    v_result= rb_hash_new();
    rb_hash_aset( v_result, sym_id,        INT2NUM( (long)id ) );
    rb_hash_aset( v_result, sym_starts_at, INT2NUM( (long)(ends_at - lgt - 1) ) );
    rb_hash_aset( v_result, sym_ends_at,   INT2NUM( (long)(ends_at - 1) ) );
    rb_hash_aset( v_result, sym_value, rb_str_new(remain, (long)lgt) );
    // the filter also receives the text from the start of the match onwards
    if (filter == Qnil || rb_funcall( filter, id_valid, 2, v_result, rb_str_new(remain, (long)strlen(remain)) )!=Qfalse)
      rb_ary_push( v_results, v_result );
  }
  // reopen the tree
  // NOTE(review): this also unfreezes a tree the caller froze explicitly
  // with make() -- confirm that this is intended.
  kwt_data->is_frozen= 0;
  return v_results;
}

#from_file(file) ⇒ Object

Loads the contents of file into the KeywordTree

k= AhoCorasick::KeywordTree.new
k.from_file "dictionary.txt"


16
17
18
19
# File 'lib/ahocorasick.rb', line 16

# Loads the contents of +file+ into this KeywordTree, one keyword per line.
#
# Line terminators are stripped and blank lines are skipped, so each
# keyword is stored without a trailing newline.
#
# Returns self.
def from_file file
  # File.foreach streams the file line by line; String#each, used
  # previously, is not available on Ruby 1.9 and later.
  File.foreach(file) do |line|
    keyword = line.chomp
    add_string(keyword) unless keyword.empty?
  end
  self
end

#make ⇒ Object

It freezes the current KeywordTree.

Note: This method is called internally by search

require 'ahocorasick'

kwt = AhoCorasick::KeywordTree.new

kwt.add_string("one")
kwt.add_string("two")
kwt.make()


111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# File 'ext/ahocorasick/ruby-ahocorasick.c', line 111

/*
 * KeywordTree#make
 *
 * Freezes the tree by running ac_prep on it, unless it is already
 * marked as frozen.
 *
 * Returns true.
 * Raises RuntimeError when ac_prep does not report success (1).
 */
static VALUE 
rb_kwt_make(VALUE self)
{ 
  struct kwt_struct_data *data;
  KeywordTree(self, data);

  if(data->is_frozen != 1) {
    // Not prepared yet: anything but 1 from ac_prep is a failure.
    if(ac_prep( data->tree ) != 1)
      rb_raise(rb_eRuntimeError, "Cannot freeze the tree");
    data->is_frozen = 1;
  }

  return Qtrue;
}

#size ⇒ Object

Returns the size of this KeywordTree

kwt.add_string("foo")
kwt.add_string("bar")
kwt.size #=> 2


212
213
214
215
216
217
218
219
# File 'ext/ahocorasick/ruby-ahocorasick.c', line 212

/*
 * KeywordTree#size
 *
 * Returns the dictionary_size counter of this tree as a Fixnum.
 */
static VALUE 
rb_kwt_size(VALUE self)
{ 
  struct kwt_struct_data *data;
  VALUE v_count;

  KeywordTree(self, data);
  v_count = INT2FIX(data->dictionary_size);

  return v_count;
}