Module: WordTree::Text

Defined in:
lib/wordtree/text.rb,
ext/wordtree.cc

Class Method Summary collapse

Class Method Details

._add_ngrams_with_suffix(text, hash, upto_n_value, suffix, incr_existing_keys_only) ⇒ Object



165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
# File 'ext/wordtree.cc', line 165

/*
 * Walk `text` and, for every word position, record the 1..upto_n word
 * n-grams starting there by calling _incr_value(hash, ngram, suffix, ...).
 * Words are delimited by a single ' ' or '.' byte.
 *
 * Parameters (all Ruby VALUEs):
 *   self                    - receiver; returned unchanged.
 *   text                    - read via RSTRING_PTR/RSTRING_LEN, so it is
 *                             assumed to be a String (no type check is
 *                             performed here -- TODO confirm callers).
 *   hash                    - passed through to _incr_value.
 *   upto_n_value            - Fixnum; maximum n-gram size (FIX2INT).
 *   suffix                  - passed through to _incr_value.
 *   incr_existing_keys_only - passed through to _incr_value.
 *
 * Returns self.
 *
 * NOTE(review): the exact effect of _incr_value is not visible from this
 * function; presumably it increments a counter in `hash` -- confirm.
 */
VALUE text_add_ngrams_with_suffix(
  VALUE self,
  VALUE text,
  VALUE hash,
  VALUE upto_n_value,
  VALUE suffix,
  VALUE incr_existing_keys_only)
{
  // head/tail delimit the n-gram currently being grown: [head, tail).
  char* head = RSTRING_PTR(text);
  char* tail = RSTRING_PTR(text);
  // Where head/tail jump to once the current window has produced upto_n words.
  char* next_head = head;
  char* next_tail = tail;
  // Number of word boundaries seen since head was last moved.
  int word_count = 0;
  // NOTE(review): RSTRING_LEN is stored in an int here -- confirm that very
  // long strings are not expected.
  int text_len = RSTRING_LEN(text);
  // NOTE(review): incr_existing is computed but not referenced again in this
  // function; the raw VALUE is what gets passed to _incr_value below.
  int incr_existing = RTEST(incr_existing_keys_only);
  int upto_n = FIX2INT(upto_n_value);

  if (text_len == 0) return self;

  do {
    // A word ends at a space, a period, or the computed end bound.
    // NOTE(review): the bound is head+text_len, and head advances as the
    // window slides, so after the first slide this bound lies beyond the
    // original end of the text -- confirm intent.
    if (*tail == ' ' || *tail == '.' || tail >= head+text_len) {
      word_count++;
      if (word_count == 1 || upto_n == 1) {
        // End of the first word: the next window starts right after this
        // delimiter.
        next_head = next_tail = tail + 1;
      } else if (word_count == 2) {
        // End of the second word: that is where the next window's first word
        // ends, so scanning can resume from here after the slide.
        next_tail = tail;
      }
      if (word_count <= upto_n) {
        // Record the n-gram made of the first word_count words at head.
        _incr_value(hash, rb_str_new(head, tail - head), suffix, incr_existing_keys_only);
      }
      if (word_count == upto_n) {
        // Window is full: slide forward by one word and start counting again.
        head = next_head;
        tail = next_tail;
        word_count = 0;
      } else {
        tail++;
      }
    } else {
      tail++;
    }
  // Scanning stops at the first zero byte reached by tail.
  } while(*tail);

  // add the last ngram of size upto_n
  _incr_value(hash, rb_str_new(head, tail - head), suffix, incr_existing_keys_only);

  // add the 1..(upto_n-1) sized ngrams at the tail
  if (upto_n > 1) {
    // Advance head to the end of the text; after each delimiter, record the
    // remaining span (head+1 .. tail) as a shorter trailing n-gram.
    while(head < RSTRING_PTR(text)+text_len) {
      if(*head == ' ' || *head == '.') {
        _incr_value(hash, rb_str_new(head + 1, tail - head - 1), suffix, incr_existing_keys_only);
      }
      head++;
    }
  }

  return self;
}

.add_ngrams_with_suffix(text, hash, upto_n = 4, suffix = nil, incr_existing_keys_only = false) ⇒ Object



36
37
38
# File 'lib/wordtree/text.rb', line 36

# Ruby-side entry point that supplies default arguments and forwards
# everything, unchanged and in the same order, to _add_ngrams_with_suffix.
#
# @param text [Object] forwarded as-is (presumably the String to scan)
# @param hash [Object] forwarded as-is (presumably the Hash of counts)
# @param upto_n [Integer] forwarded as-is; defaults to 4
# @param suffix [Object, nil] forwarded as-is; defaults to nil
# @param incr_existing_keys_only [Boolean] forwarded as-is; defaults to false
# @return [Object] whatever _add_ngrams_with_suffix returns
def self.add_ngrams_with_suffix(text, hash, upto_n = 4, suffix = nil, incr_existing_keys_only = false)
  _add_ngrams_with_suffix(
    text,
    hash,
    upto_n,
    suffix,
    incr_existing_keys_only
  )
end

.clean(text) ⇒ Object



104
105
106
107
108
109
110
111
112
113
# File 'ext/wordtree.cc', line 104

/*
 * Clean `text` in place and return it.
 *
 * The String's buffer is handed to text_clean_cstr(), and the String's length
 * is then set to the value that function returns.
 * NOTE(review): text_clean_cstr is defined elsewhere; presumably it rewrites
 * the buffer in place and returns the new (not larger) length -- confirm.
 *
 * Raises TypeError if `text` is not a String, an error if it is frozen
 * (rb_str_modify), and ArgumentError if it contains a NUL byte
 * (StringValueCStr).
 */
static VALUE text_clean(VALUE self, VALUE text) {
    /* Validate first: rb_str_modify must only ever see a real String. */
    Check_Type(text, T_STRING);

    /* Make the buffer private and writable before mutating it. */
    rb_str_modify(text);

    char* ctext = StringValueCStr(text);
    size_t new_length = text_clean_cstr(ctext);

    rb_str_set_len(text, (long)new_length);

    return text;
}

.common_trigrams(text) ⇒ Object



79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# File 'ext/wordtree.cc', line 79

/*
 * Count how many byte positions in `text` start one of the common English
 * trigrams listed in the table below. Overlapping matches are each counted;
 * a position counts at most once.
 *
 * Returns the count as a Ruby Integer (0 for strings shorter than 3 bytes).
 * Raises TypeError if `text` is not a String.
 */
static VALUE text_common_trigrams(VALUE self, VALUE text) {
  Check_Type(text, T_STRING);

  const char* ptext = RSTRING_PTR(text);
  long len = RSTRING_LEN(text);

  if (len < 3) return INT2NUM(0);

  /* 28 most common English trigrams, all squished together */
  static const char common_trigrams[] = "allandedtentereforhashatherhisingionithmenncendeoftsthterthathethitiotisverwaswityou";
  /* sizeof counts the trailing NUL; exclude it so only whole trigrams are read. */
  const size_t table_len = sizeof(common_trigrams) - 1;

  /* Last position at which a full 3-byte window still fits inside the text. */
  const char* last_start = ptext + len - 3;
  int common_matched = 0;

  for (const char* ptr = ptext; ptr <= last_start; ptr++) {
    for (size_t i = 0; i + 3 <= table_len; i += 3) {
      if (memcmp(ptr, common_trigrams + i, 3) == 0) {
        common_matched++;
        break;
      }
    }
  }

  return INT2NUM(common_matched);
}

.incr_value(hash, key, suffix, incr_existing_keys_only) ⇒ Object



155
156
157
158
159
160
161
162
163
# File 'ext/wordtree.cc', line 155

/*
 * Type-checked wrapper around _incr_value.
 *
 * Verifies that `hash` is a Hash, `key` is a String and `suffix` is either
 * nil or a Symbol (Check_Type raises TypeError otherwise), then forwards all
 * four values to _incr_value and returns self.
 * NOTE(review): _incr_value is defined elsewhere; presumably it bumps the
 * counter stored under `key` -- confirm.
 */
VALUE text_incr_value(VALUE self, VALUE hash, VALUE key, VALUE suffix, VALUE incr_existing_keys_only)
{
  Check_Type(hash, T_HASH);
  Check_Type(key, T_STRING);

  /* A nil suffix is allowed; anything else must be a Symbol. */
  if (!NIL_P(suffix)) {
    Check_Type(suffix, T_SYMBOL);
  }

  _incr_value(hash, key, suffix, incr_existing_keys_only);

  return self;
}

.split_near(text, split_index) ⇒ Object



6
7
8
9
10
11
12
13
14
15
16
17
18
19
# File 'lib/wordtree/text.rb', line 6

# Split +text+ in two at the space nearest to (at or before) +split_index+.
#
# * If +split_index+ is at or beyond the end of +text+, nothing is split and
#   the second piece is empty.
# * If a space is found, it is dropped and the text on either side of it is
#   returned.
# * If no space is found, the text is cut exactly at +split_index+.
#
# @param text [String] the text to split
# @param split_index [Integer] the position to split near
# @return [Array(String, String)] the leading and trailing pieces
def self.split_near(text, split_index)
  return [text, ""] if split_index >= text.size

  # Scan backwards from split_index for the closest space.
  space_at = split_index.downto(0).find { |pos| text[pos] == ' ' }
  return [text[0...space_at], text[(space_at + 1)..-1]] if space_at

  # No space available: hard cut at split_index.
  [text[0...split_index], text[split_index..-1]]
end

.word_wrap(input, wrap = 120) ⇒ Object

Remove punctuation and non-alphabetical characters from a text, and return a cleaned-up version wrapped at wrap characters per line.



23
24
25
26
27
28
29
30
31
32
33
34
# File 'lib/wordtree/text.rb', line 23

# Break +input+ into lines, splitting at spaces where possible (via
# split_near), and return the result with every line terminated by "\n".
#
# The text is split repeatedly: each pass peels one line off the front of
# what is left, until the leftover is no longer than +wrap+ characters.
# At least one line is always emitted, so an empty +input+ yields "\n".
#
# @param input [String] the text to wrap; it is not modified
# @param wrap [Integer] the position near which each line is split (must be > 0)
# @return [String] the wrapped text
# @raise [ArgumentError] if +wrap+ is not positive, since no progress could
#   be made and the loop would never finish
def self.word_wrap(input, wrap=120)
  raise ArgumentError, "wrap must be positive" unless wrap > 0

  wrapped_output = String.new
  remainder = input
  loop do
    # Feed the leftover back in so every pass consumes more of the text.
    output_line, remainder = split_near(remainder, wrap)
    wrapped_output << output_line << "\n"
    break if remainder.size <= wrap
  end
  wrapped_output << remainder << "\n" unless remainder.empty?

  wrapped_output
end