Class: Word2vec::Model

Inherits:
Object
  • Object
show all
Defined in:
ext/word2vec/word2vec.c

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.build_vocab(rb_train_file_name, rb_vocab_file_name) ⇒ Object

Builds the vocabulary file from the training file.

Parameters:

  • rb_train_file_name (String)
  • rb_vocab_file_name (String)


135
136
137
138
139
140
141
142
# File 'ext/word2vec/word2vec.c', line 135

static VALUE build_vocab(VALUE mod, VALUE rb_train_file_name, VALUE rb_vocab_file_name) {
  /*
   * Passes the training-file and vocabulary-file names to
   * word2vec_build_vocab() and always returns Qtrue.
   *
   * StringValueCStr raises a Ruby exception when an argument is not
   * convertible to a String or contains an embedded NUL byte.
   */
  char *train_path = StringValueCStr(rb_train_file_name);
  char *vocab_path = StringValueCStr(rb_vocab_file_name);

  word2vec_build_vocab(train_path, vocab_path);
  return Qtrue;
}

.load(rb_filename) ⇒ Object

Loads the vectors.bin model file from disk.

Parameters:

  • rb_filename (String)


40
41
42
43
44
45
46
47
48
# File 'ext/word2vec/word2vec.c', line 40

static VALUE model_load(VALUE mod, VALUE rb_filename) {
  /*
   * Allocates a zeroed word2vec_model, fills it via word2vec_model_load()
   * from the file named by rb_filename, and returns it wrapped in a Ruby
   * object of class `mod` whose free function is model_deallocate.
   *
   * The file name is converted BEFORE the model is allocated:
   * StringValueCStr raises a Ruby exception for a non-String argument or a
   * string with an embedded NUL byte, and raising after ZALLOC would leak
   * the freshly allocated struct.
   */
  char *filename = StringValueCStr(rb_filename);
  word2vec_model *model = ZALLOC(word2vec_model);

  /* NOTE(review): any status reported by word2vec_model_load is not
   * inspected here -- confirm how load failures are meant to surface. */
  word2vec_model_load(model, filename);

  /* `filename` points into the Ruby string's buffer; keep the string
   * reachable for the GC until the buffer is no longer in use. */
  RB_GC_GUARD(rb_filename);

  return Data_Wrap_Struct(mod, NULL, model_deallocate, model);
}

.tokenize(rb_train_file_name, rb_vocab_file_name, rb_output_file_name) ⇒ Object

tokenize a file

Parameters:

  • rb_train_file_name (String)
  • rb_vocab_file_name (String)
  • rb_output_file_name (String)


150
151
152
153
154
155
156
157
158
# File 'ext/word2vec/word2vec.c', line 150

static VALUE tokenize(VALUE mod, VALUE rb_train_file_name, VALUE rb_vocab_file_name, VALUE rb_output_file_name) {
  /*
   * Passes the three file names (training, vocabulary, output) to
   * word2vec_tokenize() and always returns Qtrue.
   *
   * StringValueCStr raises a Ruby exception when an argument is not
   * convertible to a String or contains an embedded NUL byte.
   */
  char *train_path = StringValueCStr(rb_train_file_name);
  char *vocab_path = StringValueCStr(rb_vocab_file_name);
  char *output_path = StringValueCStr(rb_output_file_name);

  word2vec_tokenize(train_path, vocab_path, output_path);
  return Qtrue;
}

Instance Method Details

#accuracy(rb_file_name) ⇒ Object

Runs the model's accuracy evaluation against the given file.

Parameters:

  • rb_file_name (String)


120
121
122
123
124
125
126
127
128
# File 'ext/word2vec/word2vec.c', line 120

static VALUE model_accuracy(VALUE mod, VALUE rb_file_name) {
  /*
   * Unwraps the word2vec_model held by the receiver and passes it, together
   * with the given file name, to word2vec_model_accuracy().
   * Always returns Qtrue.
   */
  word2vec_model *self_model;
  char *path;

  Data_Get_Struct(mod, word2vec_model, self_model);
  path = StringValueCStr(rb_file_name);

  word2vec_model_accuracy(self_model, path);
  return Qtrue;
}

#analogy(rb_wordx1, rb_wordy1, rb_wordx2) ⇒ Hash<String, Float>

Finds the words analogous to the three given words.

Parameters:

  • rb_wordx1 (String)
  • rb_wordy1 (String)
  • rb_wordx2 (String)

Returns:

  • (Hash<String, Float>)


100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# File 'ext/word2vec/word2vec.c', line 100

static VALUE model_analogy(VALUE mod, VALUE rb_wordx1, VALUE rb_wordy1, VALUE rb_wordx2) {
  /*
   * Unwraps the receiver's word2vec_model, runs word2vec_model_analogy() on
   * the three given words, and converts the results to a Ruby value with
   * wordSimilarotyToHash().
   */
  word2vec_model *self_model;
  WordSimilarity matches[N]; /* result buffer; N is defined elsewhere in the file */
  char *x1;
  char *y1;
  char *x2;
  size_t match_count;

  Data_Get_Struct(mod, word2vec_model, self_model);
  x1 = StringValueCStr(rb_wordx1);
  y1 = StringValueCStr(rb_wordy1);
  x2 = StringValueCStr(rb_wordx2);

  /* The return value says how many entries of `matches` to convert. */
  match_count = word2vec_model_analogy(self_model, x1, y1, x2, matches);

  return wordSimilarotyToHash(self_model, matches, match_count);
}

#distance(rb_word) ⇒ Hash<String, Float>

Finds the words nearest to the given word.

Parameters:

  • rb_word (String)

Returns:

  • (Hash<String, Float>)


79
80
81
82
83
84
85
86
87
88
89
90
91
# File 'ext/word2vec/word2vec.c', line 79

static VALUE model_distance(VALUE mod, VALUE rb_word) {
  /*
   * Unwraps the receiver's word2vec_model, runs word2vec_model_distance()
   * for the given word, and converts the results to a Ruby value with
   * wordSimilarotyToHash().
   */
  word2vec_model *self_model;
  WordSimilarity matches[N]; /* result buffer; N is defined elsewhere in the file */
  char *query;
  size_t match_count;

  Data_Get_Struct(mod, word2vec_model, self_model);
  query = StringValueCStr(rb_word);

  /* The return value says how many entries of `matches` to convert. */
  match_count = word2vec_model_distance(self_model, query, matches);

  return wordSimilarotyToHash(self_model, matches, match_count);
}

#vector_dim ⇒ Integer

model vector dimensionality

Returns:

  • (Integer)


66
67
68
69
70
71
72
# File 'ext/word2vec/word2vec.c', line 66

static VALUE model_vector_dim(VALUE mod) {
  /* Returns the wrapped model's vector_dim field as a Ruby Integer. */
  word2vec_model *self_model;
  VALUE rb_dim;

  Data_Get_Struct(mod, word2vec_model, self_model);
  rb_dim = SIZET2NUM(self_model->vector_dim);

  return rb_dim;
}

#word_count ⇒ Integer

model vocabulary length

Returns:

  • (Integer)


54
55
56
57
58
59
60
# File 'ext/word2vec/word2vec.c', line 54

static VALUE model_word_count(VALUE mod) {
  /* Returns the wrapped model's word_count field as a Ruby Integer. */
  word2vec_model *self_model;
  VALUE rb_count;

  Data_Get_Struct(mod, word2vec_model, self_model);
  rb_count = SIZET2NUM(self_model->word_count);

  return rb_count;
}