Module: WordTree::Text

Defined in: lib/wordtree/text.rb,
ext/wordtree.cc

Class Method Summary collapse

._add_ngrams_with_suffix(text, hash, upto_n_value, suffix, incr_existing_keys_only) ⇒ Object
.add_ngrams_with_suffix(text, hash, upto_n = 4, suffix = nil, incr_existing_keys_only = false) ⇒ Object
.clean(text) ⇒ Object
.common_trigrams(text) ⇒ Object
.incr_value(hash, key, suffix, incr_existing_keys_only) ⇒ Object
.split_near(text, split_index) ⇒ Object
.word_wrap(input, wrap = 120) ⇒ Object

Remove punctuation and non-alphabetical characters from a text, and return a cleaned-up version wrapped at `wrap` characters per line.

Class Method Details

._add_ngrams_with_suffix(text, hash, upto_n_value, suffix, incr_existing_keys_only) ⇒ `Object`

# File 'ext/wordtree.cc', line 165

// Walks the bytes of `text`, treating ' ' and '.' as word delimiters, and
// hands every extracted n-gram (as a freshly created Ruby string) to
// _incr_value together with `hash`, `suffix` and `incr_existing_keys_only`.
//
// Parameters:
//   self                    - receiver; returned unchanged.
//   text                    - string to scan (read via RSTRING_PTR/RSTRING_LEN).
//   hash                    - forwarded to _incr_value; presumably the table of
//                             n-gram counts -- confirm against _incr_value.
//   upto_n_value            - Fixnum giving the largest n-gram size (FIX2INT).
//   suffix                  - forwarded to _incr_value unchanged.
//   incr_existing_keys_only - forwarded to _incr_value unchanged.
//
// Returns: self.
//
// NOTE(review): no Check_Type calls are made here (text_incr_value does make
// them), so argument types are not validated inside this function.
VALUE text_add_ngrams_with_suffix(
  VALUE self,
  VALUE text,
  VALUE hash,
  VALUE upto_n_value,
  VALUE suffix,
  VALUE incr_existing_keys_only)
{
  // head: first byte of the n-gram window currently being built.
  char* head = RSTRING_PTR(text);
  // tail: scan cursor; also the exclusive end of the n-gram being emitted.
  char* tail = RSTRING_PTR(text);
  // next_head / next_tail: where head and tail are moved to once the current
  // window has grown to upto_n words.
  char* next_head = head;
  char* next_tail = tail;
  // Number of delimiters (i.e. completed words) seen since `head`.
  int word_count = 0;
  // NOTE(review): the string length is stored in an int -- confirm that very
  // long strings are not a concern here.
  int text_len = RSTRING_LEN(text);
  // NOTE(review): computed but not referenced again in this function; the
  // original VALUE is what gets passed to _incr_value below.
  int incr_existing = RTEST(incr_existing_keys_only);
  int upto_n = FIX2INT(upto_n_value);

  // Nothing to scan.
  if (text_len == 0) return self;

  do {
    // A delimiter ends the current word.
    // NOTE(review): the bound `head+text_len` is measured from the moving
    // `head`, not from the start of the buffer -- confirm this is intended.
    if (*tail == ' ' || *tail == '.' || tail >= head+text_len) {
      word_count++;
      if (word_count == 1 || upto_n == 1) {
        // The next window begins right after the first word of this one.
        next_head = next_tail = tail + 1;
      } else if (word_count == 2) {
        // Remember the end of the second word: after the window slides,
        // scanning resumes from this delimiter.
        next_tail = tail;
      }
      if (word_count <= upto_n) {
        // Emit the n-gram spanning [head, tail).
        _incr_value(hash, rb_str_new(head, tail - head), suffix, incr_existing_keys_only);
      }
      if (word_count == upto_n) {
        // Window is full: slide it forward by one word and start counting again.
        head = next_head;
        tail = next_tail;
        word_count = 0;
      } else {
        tail++;
      }
    } else {
      tail++;
    }
  // Stops at the first NUL byte; assumes the buffer is NUL-terminated -- TODO confirm.
  } while(*tail);

  // add the last ngram of size upto_n
  _incr_value(hash, rb_str_new(head, tail - head), suffix, incr_existing_keys_only);

  // add the 1..(upto_n-1) sized ngrams at the tail
  if (upto_n > 1) {
    // For every delimiter between `head` and the end of the text, emit the
    // substring that starts just after that delimiter and ends at `tail`.
    while(head < RSTRING_PTR(text)+text_len) {
      if(*head == ' ' || *head == '.') {
        _incr_value(hash, rb_str_new(head + 1, tail - head - 1), suffix, incr_existing_keys_only);
      }
      head++;
    }
  }

  return self;
}

.add_ngrams_with_suffix(text, hash, upto_n = 4, suffix = nil, incr_existing_keys_only = false) ⇒ `Object`



36
37
38

# File 'lib/wordtree/text.rb', line 36

# Public entry point for n-gram extraction. Supplies the default arguments and
# forwards everything, unchanged and in the same order, to
# +_add_ngrams_with_suffix+.
#
# @param text [Object] forwarded as the text to scan
# @param hash [Object] forwarded as the table receiving the n-grams
# @param upto_n [Integer] largest n-gram size (defaults to 4)
# @param suffix [Object, nil] forwarded as-is (defaults to nil)
# @param incr_existing_keys_only [Boolean] forwarded as-is (defaults to false)
# @return [Object] whatever +_add_ngrams_with_suffix+ returns
def self.add_ngrams_with_suffix(text, hash, upto_n = 4, suffix = nil, incr_existing_keys_only = false)
  _add_ngrams_with_suffix(
    text,
    hash,
    upto_n,
    suffix,
    incr_existing_keys_only
  )
end

.clean(text) ⇒ `Object`

# File 'ext/wordtree.cc', line 104

// Cleans `text` in place and returns the same (now shorter or equal-length)
// string object.
//
// The buffer is handed to text_clean_cstr, whose return value is then used as
// the string's new length. What "cleaning" means is defined by
// text_clean_cstr -- presumably it rewrites the buffer in place; confirm there.
//
// Parameters:
//   self - receiver (unused).
//   text - String (or object convertible via to_str) to clean.
//
// Returns: the cleaned string.
// Raises:  whatever StringValue / rb_str_modify / StringValueCStr raise for
//          non-strings, unmodifiable strings, or strings with embedded NULs.
static VALUE text_clean(VALUE self, VALUE text) {
    // Coerce/validate first so rb_str_modify is only ever given a real String.
    StringValue(text);

    // Make the string writable before touching its buffer.
    rb_str_modify(text);

    // Fetch the pointer only after rb_str_modify, so it refers to the buffer
    // that will actually be edited.
    char* ctext = StringValueCStr(text);
    size_t new_length = text_clean_cstr(ctext);

    rb_str_set_len(text, (long)new_length);

    return text;
}

.common_trigrams(text) ⇒ `Object`

# File 'ext/wordtree.cc', line 79

// Counts the positions in `text` at which one of the 28 listed common English
// trigrams begins. The comparison is byte-wise (case-sensitive) and windows
// overlap: the scan advances one byte at a time.
//
// Parameters:
//   self - receiver (unused).
//   text - String to scan.
//
// Returns: the number of matching positions as a Ruby Integer (0 when the text
//          is shorter than 3 bytes).
// Raises:  TypeError (via Check_Type) when `text` is not a String.
static VALUE text_common_trigrams(VALUE self, VALUE text) {
  Check_Type(text, T_STRING);

  const char* ptext = RSTRING_PTR(text);
  long len = RSTRING_LEN(text);

  if (len < 3) return INT2NUM(0);

  /* 28 most common English trigrams, all squished together */
  static const char common_trigrams[] = "allandedtentereforhashatherhisingionithmenncendeoftsthterthathethitiotisverwaswityou";
  /* sizeof also counts the trailing NUL, which is not part of any trigram */
  const size_t trigram_bytes = sizeof(common_trigrams) - 1;

  /* last position at which a full 3-byte window still lies inside the text */
  const char* last_start = ptext + len - 3;
  int common_matched = 0;

  for (const char* ptr = ptext; ptr <= last_start; ptr++) {
    for (size_t i = 0; i < trigram_bytes; i += 3) {
      if (memcmp(ptr, common_trigrams + i, 3) == 0) {
        common_matched++;
        break; /* count each position at most once */
      }
    }
  }

  return INT2NUM(common_matched);
}

.incr_value(hash, key, suffix, incr_existing_keys_only) ⇒ `Object`

# File 'ext/wordtree.cc', line 155

// Ruby-facing wrapper around _incr_value: checks the argument types and then
// delegates.
//
// Parameters:
//   self                    - receiver; returned unchanged.
//   hash                    - must be a Hash.
//   key                     - must be a String.
//   suffix                  - nil, or a Symbol.
//   incr_existing_keys_only - forwarded to _incr_value unchanged.
//
// Returns: self.
// Raises:  TypeError (via Check_Type) when an argument has the wrong type.
VALUE text_incr_value(VALUE self, VALUE hash, VALUE key, VALUE suffix, VALUE incr_existing_keys_only)
{
  Check_Type(hash, T_HASH);
  Check_Type(key, T_STRING);

  // A nil suffix is allowed; anything else has to be a Symbol.
  const int has_suffix = (suffix != Qnil);
  if (has_suffix) {
    Check_Type(suffix, T_SYMBOL);
  }

  _incr_value(hash, key, suffix, incr_existing_keys_only);

  return self;
}

.split_near(text, split_index) ⇒ `Object`

# File 'lib/wordtree/text.rb', line 6

def self.split_near(text, split_index)
  if split_index >= text.size
    return [text, ""]
  else
    index = split_index
    while index >= 0
      if text[index] == ' '
        return [text[0...index], text[(index+1)..-1]]
      end
      index -= 1
    end
    return [text[0...split_index], text[split_index..-1]]
  end
end

.word_wrap(input, wrap = 120) ⇒ `Object`

Remove punctuation and non-alphabetical characters from a text, and return a cleaned-up version wrapped at `wrap` characters per line.

# File 'lib/wordtree/text.rb', line 23

# Breaks +input+ into lines using +split_near+ and returns them joined into a
# single string, each line terminated by "\n".
#
# Each pass splits the text that is still unprocessed, so every pass makes
# progress through the input. Splitting stops as soon as the remainder is no
# longer than +wrap+; a non-empty remainder becomes the final line.
#
# @param input [String] the text to wrap
# @param wrap [Integer] split position handed to +split_near+ for each line
#   (defaults to 120)
# @return [String] the wrapped text
# @raise [ArgumentError] if +wrap+ is less than 1, since no progress could be
#   made through the text
def self.word_wrap(input, wrap = 120)
  raise ArgumentError, "wrap must be at least 1" if wrap < 1

  wrapped_output = String.new
  remainder = input

  loop do
    # Split what is left, not the original input, so the loop terminates.
    output_line, remainder = split_near(remainder, wrap)
    wrapped_output << output_line << "\n"
    break unless remainder.size > wrap
  end

  wrapped_output << remainder << "\n" unless remainder.empty?
  wrapped_output
end

Module: WordTree::Text

Class Method Summary collapse

Class Method Details

._add_ngrams_with_suffix(text, hash, upto_n_value, suffix, incr_existing_keys_only) ⇒ Object

.add_ngrams_with_suffix(text, hash, upto_n = 4, suffix = nil, incr_existing_keys_only = false) ⇒ Object

.clean(text) ⇒ Object

.common_trigrams(text) ⇒ Object

.incr_value(hash, key, suffix, incr_existing_keys_only) ⇒ Object

.split_near(text, split_index) ⇒ Object

.word_wrap(input, wrap = 120) ⇒ Object

._add_ngrams_with_suffix(text, hash, upto_n_value, suffix, incr_existing_keys_only) ⇒ `Object`

.add_ngrams_with_suffix(text, hash, upto_n = 4, suffix = nil, incr_existing_keys_only = false) ⇒ `Object`

.clean(text) ⇒ `Object`

.common_trigrams(text) ⇒ `Object`

.incr_value(hash, key, suffix, incr_existing_keys_only) ⇒ `Object`

.split_near(text, split_index) ⇒ `Object`

.word_wrap(input, wrap = 120) ⇒ `Object`