Module: WordTree::Text
- Defined in:
- lib/wordtree/text.rb,
ext/wordtree.cc
Class Method Summary collapse
- ._add_ngrams_with_suffix(text, hash, upto_n_value, suffix, incr_existing_keys_only) ⇒ Object
- .add_ngrams_with_suffix(text, hash, upto_n = 4, suffix = nil, incr_existing_keys_only = false) ⇒ Object
- .clean(text) ⇒ Object
- .common_trigrams(text) ⇒ Object
- .incr_value(hash, key, suffix, incr_existing_keys_only) ⇒ Object
- .split_near(text, split_index) ⇒ Object
-
.word_wrap(input, wrap = 120) ⇒ Object
Remove punctuation and non-alphabetical characters from a text, and return a cleaned-up version wrapped at
wrap
characters per line.
Class Method Details
._add_ngrams_with_suffix(text, hash, upto_n_value, suffix, incr_existing_keys_only) ⇒ Object
165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 |
# File 'ext/wordtree.cc', line 165
/*
 * Scan `text` and hand every n-gram of 1..upto_n words to _incr_value(),
 * together with `hash`, `suffix` and `incr_existing_keys_only`.
 *
 * A word boundary is a single ' ' or '.' byte. Each n-gram is passed as a
 * freshly created Ruby string (rb_str_new) covering the bytes from the start
 * of its first word up to, but not including, the boundary that ends its
 * last word.
 *
 * text                     - Ruby String to scan (TypeError otherwise).
 * hash                     - passed through to _incr_value() untouched.
 * upto_n_value             - Ruby Integer: maximum number of words per n-gram.
 * suffix                   - passed through to _incr_value() untouched.
 * incr_existing_keys_only  - passed through to _incr_value() untouched.
 *
 * Returns self. An empty `text` is a no-op.
 *
 * NOTE(review): _incr_value() is defined outside this block; presumably it
 * bumps the counter stored under the given key - confirm against its
 * definition.
 */
VALUE text_add_ngrams_with_suffix(
    VALUE self,
    VALUE text,
    VALUE hash,
    VALUE upto_n_value,
    VALUE suffix,
    VALUE incr_existing_keys_only)
{
    /* RSTRING_PTR/RSTRING_LEN are only meaningful on a real String. */
    Check_Type(text, T_STRING);

    /* NUM2INT raises on non-numeric input instead of decoding garbage. */
    const int upto_n = NUM2INT(upto_n_value);

    /* Keep the length as long: RSTRING_LEN does not fit in int in general. */
    const long text_len = RSTRING_LEN(text);
    if (text_len == 0) return self;

    char *const start = RSTRING_PTR(text);
    /* One-past-the-end pointer. All scanning is bounded by this rather than
     * by a NUL byte, so strings with embedded NULs are handled and no byte
     * outside the string is ever read. */
    char *const end = start + text_len;

    char *head = start;       /* first byte of the n-gram being built        */
    char *tail = start;       /* scan cursor                                 */
    char *next_head = start;  /* where the following n-gram window starts    */
    char *next_tail = start;  /* where scanning resumes for that window      */
    int word_count = 0;       /* words seen so far in the current window     */

    while (tail < end) {
        if (*tail == ' ' || *tail == '.') {
            word_count++;
            if (word_count == 1 || upto_n == 1) {
                /* The next window begins right after the first word. */
                next_head = next_tail = tail + 1;
            } else if (word_count == 2) {
                /* Resume on the boundary that ends the second word so it is
                 * seen again as the first boundary of the next window. */
                next_tail = tail;
            }
            if (word_count <= upto_n) {
                _incr_value(hash, rb_str_new(head, tail - head), suffix, incr_existing_keys_only);
            }
            if (word_count == upto_n) {
                /* Window is full: slide it forward by one word. */
                head = next_head;
                tail = next_tail;
                word_count = 0;
            } else {
                tail++;
            }
        } else {
            tail++;
        }
    }

    /* The text does not have to end with a boundary, so the n-gram that runs
     * from the current window start to the end of the text is added here. */
    _incr_value(hash, rb_str_new(head, tail - head), suffix, incr_existing_keys_only);

    /* Add the shorter n-grams at the tail: for every boundary inside the
     * last window, the substring from just after it to the end of the text. */
    if (upto_n > 1) {
        while (head < end) {
            if (*head == ' ' || *head == '.') {
                _incr_value(hash, rb_str_new(head + 1, tail - head - 1), suffix, incr_existing_keys_only);
            }
            head++;
        }
    }
    return self;
}
|
.add_ngrams_with_suffix(text, hash, upto_n = 4, suffix = nil, incr_existing_keys_only = false) ⇒ Object
36 37 38 |
# File 'lib/wordtree/text.rb', line 36
# Forwards all five arguments, unchanged and in the same order, to
# _add_ngrams_with_suffix and returns whatever that call returns.
#
# Defaults supplied here: upto_n = 4, suffix = nil,
# incr_existing_keys_only = false.
def self.add_ngrams_with_suffix(text, hash, upto_n=4, suffix=nil, incr_existing_keys_only=false)
  _add_ngrams_with_suffix(
    text,
    hash,
    upto_n,
    suffix,
    incr_existing_keys_only
  )
end
.clean(text) ⇒ Object
104 105 106 107 108 109 110 111 112 113 |
# File 'ext/wordtree.cc', line 104
/*
 * Clean `text` in place and return that same String object.
 *
 * The character buffer is handed to text_clean_cstr(), whose return value is
 * then installed as the string's new length.
 *
 * Raises TypeError when `text` is not a String. StringValueCStr additionally
 * rejects strings that contain NUL bytes (see the Ruby C API documentation).
 *
 * NOTE(review): text_clean_cstr() is defined outside this block; presumably
 * it rewrites the buffer in place and returns the cleaned length in bytes,
 * never longer than the input - confirm against its definition.
 */
static VALUE text_clean(VALUE self, VALUE text) {
    /* Validate before rb_str_modify(), which assumes it is given a String. */
    Check_Type(text, T_STRING);

    /* Must precede any write through the raw pointer obtained below. */
    rb_str_modify(text);

    char *ctext = StringValueCStr(text);
    size_t new_length = text_clean_cstr(ctext);
    rb_str_set_len(text, (long)new_length);
    return text;
}
|
.common_trigrams(text) ⇒ Object
79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 |
# File 'ext/wordtree.cc', line 79
/*
 * Count the positions in `text` at which one of a fixed table of trigrams
 * starts. Every 3-byte window of the string is compared (byte-wise, so the
 * match is case-sensitive) against each table entry; a window counts at most
 * once.
 *
 * text - Ruby String to scan (TypeError otherwise).
 *
 * Returns the count as a Ruby Integer; 0 for strings shorter than 3 bytes.
 */
static VALUE text_common_trigrams(VALUE self, VALUE text) {
    enum { TRIGRAM_LEN = 3 };

    /* 28 most common English trigrams, all squished together */
    static const char common_trigrams[] = "allandedtentereforhashatherhisingionithmenncendeoftsthterthathethitiotisverwaswityou";
    /* sizeof counts the terminating NUL; leave it out so the table loop only
     * ever visits complete trigrams and never reads past the array. */
    const size_t table_len = sizeof(common_trigrams) - 1;

    Check_Type(text, T_STRING);
    const char *ptext = RSTRING_PTR(text);
    const long len = RSTRING_LEN(text);
    if (len < TRIGRAM_LEN) return INT2NUM(0);

    /* Last position at which a full 3-byte window still fits in the string;
     * stopping here keeps memcmp from reading beyond the string's bytes. */
    const char *last_start = ptext + (len - TRIGRAM_LEN);

    int common_matched = 0;
    for (const char *ptr = ptext; ptr <= last_start; ptr++) {
        for (size_t i = 0; i < table_len; i += TRIGRAM_LEN) {
            if (memcmp(ptr, common_trigrams + i, TRIGRAM_LEN) == 0) {
                common_matched++;
                break;
            }
        }
    }
    return INT2NUM(common_matched);
}
|
.incr_value(hash, key, suffix, incr_existing_keys_only) ⇒ Object
155 156 157 158 159 160 161 162 163 |
# File 'ext/wordtree.cc', line 155
/*
 * Type-checked front end for _incr_value().
 *
 * Requires `hash` to be a Hash and `key` to be a String; `suffix` must be a
 * Symbol unless it is nil. Check_Type raises on a mismatch. Once validated,
 * all four values are passed on to _incr_value() as-is.
 *
 * Returns self.
 */
VALUE text_incr_value(VALUE self, VALUE hash, VALUE key, VALUE suffix, VALUE incr_existing_keys_only)
{
    Check_Type(hash, T_HASH);
    Check_Type(key, T_STRING);

    /* nil means "no suffix"; anything else has to be a Symbol. */
    if (!NIL_P(suffix)) {
        Check_Type(suffix, T_SYMBOL);
    }

    _incr_value(hash, key, suffix, incr_existing_keys_only);
    return self;
}
|
.split_near(text, split_index) ⇒ Object
6 7 8 9 10 11 12 13 14 15 16 17 18 19 |
# File 'lib/wordtree/text.rb', line 6
# Split +text+ in two at the space nearest to, and not after, +split_index+.
#
# Returns a two-element array [left, right]:
# * split_index at or beyond the end of the text -> [text, ""]
# * a space exists at or before split_index       -> the text on either side
#   of the closest such space; the space itself is dropped
# * no such space                                 -> a hard cut, with
#   left = text[0...split_index] and right = text[split_index..-1]
def self.split_near(text, split_index)
  return [text, ""] if split_index >= text.size

  # Walk backwards from split_index looking for a space to break on.
  cursor = split_index
  until cursor < 0
    return [text[0...cursor], text[(cursor + 1)..-1]] if text[cursor] == ' '
    cursor -= 1
  end

  # No space found: cut exactly at split_index.
  [text[0...split_index], text[split_index..-1]]
end
.word_wrap(input, wrap = 120) ⇒ Object
Remove punctuation and non-alphabetical characters from a text, and return a cleaned-up version wrapped at wrap
characters per line.
23 24 25 26 27 28 29 30 31 32 33 34 |
# File 'lib/wordtree/text.rb', line 23
# Break +input+ into lines using split_near and return them joined into one
# String, each line terminated by "\n".
#
# input - the String to wrap.
# wrap  - the index handed to split_near for every cut (default 120).
#
# The text is split repeatedly: each pass peels one line off the front and
# continues with what is left, until the leftover is no longer than +wrap+.
# A non-empty leftover becomes the final line. An input that already fits
# yields that input followed by a single "\n".
#
# Raises ArgumentError when +wrap+ is smaller than 1, because no split could
# then shorten the text and the loop would never finish.
def self.word_wrap(input, wrap=120)
  raise ArgumentError, "wrap must be at least 1" if wrap < 1

  wrapped_output = String.new
  remainder = input
  begin
    # Feed the leftover of the previous pass back in, so every pass makes
    # progress through the text.
    output_line, remainder = split_near(remainder, wrap)
    wrapped_output << output_line << "\n"
  end while remainder.size > wrap

  wrapped_output << remainder << "\n" unless remainder.empty?
  wrapped_output
end