Module: WordTree::Text
- Defined in:
- lib/wordtree/text.rb,
ext/wordtree.cc
Class Method Summary collapse
- .clean(text) ⇒ Object
- .common_trigrams(text) ⇒ Object
- .split_near(text, split_index) ⇒ Object
-
.word_wrap(input, wrap = 120) ⇒ Object
Remove punctuation and non-alphabetical characters from a text, and return a cleaned-up version wrapped at
wrap characters per line.
Class Method Details
.clean(text) ⇒ Object
104 105 106 107 108 109 110 111 112 113 |
# File 'ext/wordtree.cc', line 104
/*
 * Ruby binding for WordTree::Text.clean(text).
 *
 * Hands the string's byte buffer to text_clean_cstr() and then sets the Ruby
 * string's length to the value that helper returns. The argument itself is
 * mutated and returned (no copy is made).
 *
 * NOTE(review): text_clean_cstr() is defined elsewhere; it presumably rewrites
 * the buffer in place and returns the cleaned length -- confirm that the
 * returned length never exceeds the original length.
 */
static VALUE text_clean(VALUE self, VALUE text) {
    /* Coerce to a String *before* touching it: rb_str_modify() assumes its
     * argument already is one, and a to_str conversion may hand back a
     * different object than the one originally passed in. */
    StringValue(text);

    /* Raises on frozen strings and un-shares the buffer so that the in-place
     * edit below cannot leak into other strings sharing the same storage. */
    rb_str_modify(text);

    /* Fetch the pointer only after rb_str_modify(), which may reallocate.
     * StringValueCStr() also rejects strings containing embedded NUL bytes. */
    char* ctext = StringValueCStr(text);

    size_t new_length = text_clean_cstr(ctext);
    rb_str_set_len(text, (long)new_length);
    return text;
}
|
.common_trigrams(text) ⇒ Object
79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 |
# File 'ext/wordtree.cc', line 79
/*
 * Ruby binding for WordTree::Text.common_trigrams(text).
 *
 * Slides a 3-byte window over `text` and counts how many window positions
 * exactly match one of the trigrams in the table below. The comparison is a
 * raw byte compare, so it is case-sensitive. Returns the count as a Ruby
 * Integer; strings shorter than 3 bytes yield 0.
 */
static VALUE text_common_trigrams(VALUE self, VALUE text) {
    /* Raise TypeError for non-strings instead of reading through a bogus pointer. */
    StringValue(text);

    const char* ptext = RSTRING_PTR(text);
    long len = RSTRING_LEN(text);
    if (len < 3) return INT2NUM(0);

    /* 28 most common English trigrams, all squished together */
    static const char common_trigrams[] = "allandedtentereforhashatherhisingionithmenncendeoftsthterthathethitiotisverwaswityou";
    /* Exclude the NUL terminator: counting it would add a bogus 29th entry
     * whose 3-byte compare reads past the end of the table. */
    const size_t table_len = sizeof(common_trigrams) - 1;

    /* Last position at which a full 3-byte window still fits inside the text. */
    const char* last_start = ptext + (len - 3);

    int common_matched = 0;
    for (const char* ptr = ptext; ptr <= last_start; ptr++) {
        for (size_t i = 0; i < table_len; i += 3) {
            if (memcmp(ptr, common_trigrams + i, 3) == 0) {
                common_matched++;
                break;
            }
        }
    }
    return INT2NUM(common_matched);
}
|
.split_near(text, split_index) ⇒ Object
6 7 8 9 10 11 12 13 14 15 16 17 18 19 |
# File 'lib/wordtree/text.rb', line 6
# Split +text+ in two at the space closest to +split_index+, scanning
# backwards from +split_index+ (inclusive) towards the start of the text.
#
# - If +split_index+ is at or beyond the end of +text+, returns [text, ""].
# - If a space is found, it is dropped and the text on either side of it is
#   returned.
# - If no space is found, the text is cut exactly at +split_index+.
#
# @return [Array] a two-element array: [head, tail]
def self.split_near(text, split_index)
  return [text, ""] if split_index >= text.size

  # Nearest space at or before split_index; nil when there is none.
  space_at = split_index.downto(0).find { |pos| text[pos] == ' ' }

  if space_at
    [text[0...space_at], text[(space_at + 1)..-1]]
  else
    [text[0...split_index], text[split_index..-1]]
  end
end
.word_wrap(input, wrap = 120) ⇒ Object
Remove punctuation and non-alphabetical characters from a text, and return a cleaned-up version wrapped at wrap characters per line.
23 24 25 26 27 28 29 30 31 32 33 34 |
# File 'lib/wordtree/text.rb', line 23
# Break +input+ into lines of at most +wrap+ characters, preferring to break
# at a space (via split_near) and falling back to a hard cut when a stretch
# of text contains no space. Every emitted line is terminated with "\n".
#
# @param input [String] the text to wrap
# @param wrap [Integer] maximum line width; must be at least 1
# @return [String] the wrapped text
# @raise [ArgumentError] if +wrap+ is less than 1
def self.word_wrap(input, wrap=120)
  # A width below 1 can never consume any text, so the loop could not finish.
  raise ArgumentError, "wrap must be at least 1" if wrap < 1

  wrapped_output = String.new
  remainder = input

  # Always peel off at least one line, then keep going while the leftover is
  # still too wide. Each pass must split the *leftover*, not the original
  # input, otherwise the loop never makes progress.
  loop do
    output_line, remainder = split_near(remainder, wrap)
    wrapped_output << output_line + "\n"
    break unless remainder.size > wrap
  end

  wrapped_output << remainder + "\n" unless remainder.empty?
  wrapped_output
end