Class: Mittens::Stemmer

Inherits:
Object
  • Object
show all
Defined in:
ext/mittens/ext.c

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(*args) ⇒ Object



32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# File 'ext/mittens/ext.c', line 32

/*
 * Mittens::Stemmer#initialize(language: "english")
 *
 * Creates the underlying Snowball stemmer for the requested language and
 * stores it in the wrapped stemmer_t. The only recognised keyword is
 * :language; when it is absent or nil, "english" is used.
 *
 * Raises TypeError if :language is given but is not a String, and
 * ArgumentError if it contains a NUL byte or names an algorithm that
 * sb_stemmer_new() does not recognise.
 *
 * Returns self.
 */
static VALUE stemmer_initialize(int argc, VALUE* argv, VALUE self)
{
    VALUE opts;
    rb_scan_args(argc, argv, ":", &opts);

    const char *algorithm = "english";
    if (!NIL_P(opts)) {
        VALUE language = rb_hash_aref(opts, ID2SYM(rb_intern("language")));
        if (!NIL_P(language)) {
            Check_Type(language, T_STRING);
            // StringValueCStr guarantees a NUL-terminated buffer and rejects
            // embedded NUL bytes instead of silently truncating the name.
            algorithm = StringValueCStr(language);
        }
    }

    stemmer_t *stemmer;
    TypedData_Get_Struct(self, stemmer_t, &stemmer_data_type, stemmer);

    // Build the replacement first so that a failed (re-)initialize leaves any
    // previously created stemmer untouched rather than freed and set to NULL.
    // if adding support for encoding, may want to change encoding returned from stem
    struct sb_stemmer *replacement = sb_stemmer_new(algorithm, NULL);
    if (replacement == NULL) {
        rb_raise(rb_eArgError, "unknown language: %s", algorithm);
    }

    // in case called multiple times: release the previous handle only once
    // the new one is known to be valid
    sb_stemmer_delete(stemmer->stemmer);
    stemmer->stemmer = replacement;

    return self;
}

Class Method Details

.languages ⇒ Object



75
76
77
78
79
80
81
82
83
84
85
86
# File 'ext/mittens/ext.c', line 75

/*
 * Mittens::Stemmer.languages
 *
 * Walks the NULL-terminated list returned by sb_stemmer_list() and returns
 * a new Ruby Array holding one UTF-8 String per entry, in list order.
 */
static VALUE stemmer_languages(VALUE klass)
{
    VALUE names = rb_ary_new();

    // The list is terminated by a NULL entry, not by a count.
    for (const char **entry = sb_stemmer_list(); *entry != NULL; ++entry) {
        VALUE name = rb_utf8_str_new_cstr(*entry);
        rb_ary_push(names, name);
    }

    return names;
}

Instance Method Details

#stem(value) ⇒ Object



61
62
63
64
65
66
67
68
69
70
71
72
73
# File 'ext/mittens/ext.c', line 61

/*
 * Mittens::Stemmer#stem(value)
 *
 * Passes the bytes of +value+ to sb_stemmer_stem() and returns the result
 * as a new UTF-8 String.
 *
 * Raises TypeError if +value+ is not a String, RangeError if its byte
 * length does not fit in an int (the size type sb_stemmer_stem() takes),
 * RuntimeError if the receiver holds no stemmer handle, and NoMemoryError
 * if sb_stemmer_stem() returns NULL.
 */
static VALUE stemmer_stem(VALUE self, VALUE value)
{
    stemmer_t *stemmer;
    TypedData_Get_Struct(self, stemmer_t, &stemmer_data_type, stemmer);

    Check_Type(value, T_STRING);

    // Guard against a receiver whose handle was never set; handing NULL to
    // libstemmer would dereference it.
    if (stemmer->stemmer == NULL) {
        rb_raise(rb_eRuntimeError, "stemmer is not initialized");
    }

    const sb_symbol *word = (const sb_symbol *) RSTRING_PTR(value);
    // RSTRING_LENINT raises instead of silently truncating an oversized length.
    int size = RSTRING_LENINT(value);

    const sb_symbol *stemmed = sb_stemmer_stem(stemmer->stemmer, word, size);
    if (stemmed == NULL) {
        // sb_stemmer_stem() reports allocation failure by returning NULL.
        rb_raise(rb_eNoMemError, "failed to allocate memory for stem");
    }

    // Use the length reported by the stemmer rather than scanning for a NUL,
    // so the result is exactly the bytes it produced.
    return rb_utf8_str_new((const char *) stemmed, sb_stemmer_length(stemmer->stemmer));
}