Module: WordTree::Text

Defined in:
lib/wordtree/text.rb,
ext/wordtree.cc

Class Method Summary collapse

Class Method Details

.clean(text) ⇒ Object



104
105
106
107
108
109
110
111
112
113
# File 'ext/wordtree.cc', line 104

/*
 * WordTree::Text.clean(text) -> text
 *
 * Cleans `text` in place: the string's C buffer is handed to
 * text_clean_cstr(), and the Ruby string is then shortened to the length
 * that helper returns. The (possibly converted) string object is returned.
 *
 * StringValueCStr() is used to obtain the buffer, so a string containing an
 * embedded NUL byte is rejected by the Ruby C API before any cleaning runs.
 */
static VALUE text_clean(VALUE self, VALUE text) {
    /* Make sure we really have a String before calling String-only APIs;
     * rb_str_modify() must not be handed an arbitrary object. */
    StringValue(text);

    /* Ensure the buffer is writable and unshared before editing it in place. */
    rb_str_modify(text);

    char* ctext = StringValueCStr(text);
    size_t new_length = text_clean_cstr(ctext);

    /* The helper rewrote the buffer; publish its reported length to Ruby. */
    rb_str_set_len(text, (long)new_length);

    return text;
}

.common_trigrams(text) ⇒ Object



79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# File 'ext/wordtree.cc', line 79

/*
 * WordTree::Text.common_trigrams(text) -> Integer
 *
 * Counts the byte positions in `text` at which one of the trigrams in the
 * table below begins. Comparison is byte-wise via memcmp(), so only exact
 * (lowercase) matches are counted. Overlapping matches each count once per
 * starting position. Returns 0 for strings shorter than three bytes.
 */
static VALUE text_common_trigrams(VALUE self, VALUE text) {
  /* 28 most common English trigrams, all squished together */
  static const char common_trigrams[] = "allandedtentereforhashatherhisingionithmenncendeoftsthterthathethitiotisverwaswityou";
  /* sizeof() counts the terminating NUL, which is not part of any trigram;
   * excluding it keeps the last memcmp() inside the table. */
  const size_t table_len = sizeof(common_trigrams) - 1;

  /* Reject (or convert) non-String arguments before reading RSTRING fields. */
  StringValue(text);

  const char* ptext = RSTRING_PTR(text);
  long len = RSTRING_LEN(text);

  if (len < 3) return INT2NUM(0);

  /* Last position at which a full 3-byte window still fits inside the text;
   * scanning beyond it would compare bytes past the end of the string. */
  const char* last = ptext + len - 3;
  long common_matched = 0;

  for (const char* ptr = ptext; ptr <= last; ptr++) {
    for (size_t i = 0; i < table_len; i += 3) {
      if (memcmp(ptr, common_trigrams + i, 3) == 0) {
        common_matched++;
        break; /* at most one match per starting position */
      }
    }
  }

  return LONG2NUM(common_matched);
}

.split_near(text, split_index) ⇒ Object



6
7
8
9
10
11
12
13
14
15
16
17
18
19
# File 'lib/wordtree/text.rb', line 6

# Splits +text+ into two parts at, or just before, +split_index+.
#
# * When +split_index+ is at or beyond the end of +text+, the whole text is
#   returned as the first part and the second part is empty.
# * Otherwise the nearest space at or before +split_index+ is used as the
#   split point; the space itself is dropped from both parts.
# * When no such space exists, the text is cut exactly at +split_index+.
#
# Returns a two-element array: [head, tail].
def self.split_near(text, split_index)
  return [text, ""] if split_index >= text.size

  # Scan backwards from split_index towards the start for a space.
  space_at = split_index.downto(0).find { |pos| text[pos] == ' ' }

  if space_at
    head = text[0...space_at]
    tail = text[(space_at + 1)..-1]
  else
    # No space to break on: fall back to a hard cut at split_index.
    head = text[0...split_index]
    tail = text[split_index..-1]
  end

  [head, tail]
end

.word_wrap(input, wrap = 120) ⇒ Object

Remove punctuation and non-alphabetical characters from a text, and return a cleaned-up version wrapped at `wrap` characters per line.



23
24
25
26
27
28
29
30
31
32
33
34
# File 'lib/wordtree/text.rb', line 23

# Wraps +input+ into lines of at most +wrap+ characters, breaking at spaces
# where possible (via split_near) and falling back to a hard cut when a line
# contains no space to break on.
#
# input - the String to wrap; it is not modified.
# wrap  - maximum line width in characters (default 120); must be positive.
#
# Returns a new String in which every line, including the last, is
# terminated by "\n". An empty +input+ yields "\n".
#
# Raises ArgumentError when +wrap+ is not positive, since no progress could
# be made on each split and the loop would never finish.
def self.word_wrap(input, wrap=120)
  raise ArgumentError, "wrap must be positive" unless wrap > 0

  wrapped_output = String.new
  remainder = input

  loop do
    # Feed the previous remainder back in; re-splitting the original input
    # on every pass would never advance and would loop forever.
    output_line, remainder = split_near(remainder, wrap)
    wrapped_output << output_line << "\n"
    break unless remainder.size > wrap
  end

  # Whatever is left fits on one line; emit it unless nothing remains.
  wrapped_output << remainder << "\n" unless remainder.empty?

  wrapped_output
end