Class: SplitSentence

Inherits:
Object
  • Object
show all
Defined in:
lib/markovite/splitter.rb

Overview

Class that takes a corpus and breaks it down into arrays. Each array is one sentence.

Constant Summary collapse

# Punctuation marks collected under the name ENDERS; presumably the
# characters that close a sentence (the lookup itself is not shown here).
# Frozen so the shared constant cannot be mutated by accident.
ENDERS =
['?', '.', '!'].freeze
# Lowercase abbreviations, each ending in a period. Presumably consulted so
# that a trailing period on one of these words is not mistaken for the end
# of a sentence (the lookup itself is not shown here).
#
# 'st.' and "ms." are each listed under two headings; the duplicates are
# harmless for membership checks and are kept so each group stays complete.
# NOTE(review): "rt. hon." contains a space, so it can only match if words
# are compared as multi-word strings -- confirm against the word splitter.
#
# Frozen so the shared constant cannot be mutated by accident.
ABBREVIATIONS =
[
  'ave.','blvd.','ln.','rd.','st.', #directional
  'tsp.','t.', 'tbs.', 'tbsp.','gal.','lb.','pt.','qt.', #cooking
  "ak.", "al.", "ar.", "az.", "ca.", "co.", "ct.", "dc.", "de.", "fl.",
  "ga.", "gu.", "hi.", "ia.", "id.", "il.", "in.", "ks.", "ky.", "la.",
  "ma.", "md.", "me.", "mh.", "mi.", "mn.", "mo.", "ms.", "mt.", "nc.",
  "nd.", "ne.", "nh.", "nj.", "nm.", "nv.", "ny.", "oh.", "ok.", "or.",
  "pa.", "pr.", "pw.", "ri.", "sc.", "sd.", "tn.", "tx.", "ut.", "va.",
  "vi.", "vt.", "wa.", "wi.", "wv.", "wy.",  "u.s.", "u.s.a.", #us locations
  "dr.", "esq.", "jr.", "mr.", "mrs.", "ms.", "mx.",
  "prof.", "rev.", "rt. hon.", "sr.", "st." #personal
].freeze

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(corpus) ⇒ SplitSentence



24
25
26
27
28
# File 'lib/markovite/splitter.rb', line 24

# Stores a duplicate of +corpus+ (so the caller's object is not shared),
# starts with an empty sentence list, and immediately runs #split_text.
#
# @param corpus [Object] source text; presumably a String -- here it only
#   needs to respond to #dup (TODO confirm against the word splitter)
def initialize(corpus)
  private_copy = corpus.dup
  self.sentences = []
  self.corpus = private_copy
  split_text
end

Instance Attribute Details

#corpus ⇒ Object

look into detecting abbreviations!



22
23
24
# File 'lib/markovite/splitter.rb', line 22

# Reader for the @corpus instance variable.
#
# @return [Object] whatever is currently stored in @corpus (nil if it was
#   never assigned); presumably the duplicate made by #initialize -- the
#   writer is not shown in this excerpt
def corpus
  @corpus
end

#sentences ⇒ Object

look into detecting abbreviations!



22
23
24
# File 'lib/markovite/splitter.rb', line 22

# Reader for the @sentences instance variable.
#
# @return [Object] whatever is currently stored in @sentences (nil if it was
#   never assigned); #initialize assigns an empty Array through the writer,
#   which is not shown in this excerpt
def sentences
  @sentences
end

Instance Method Details

#clear_sentences ⇒ Object



30
31
32
# File 'lib/markovite/splitter.rb', line 30

# Empties the collection returned by #sentences by calling #clear on it.
# The collection is emptied in place rather than replaced, so the same
# object keeps being returned by #sentences afterwards.
#
# @return [Object] whatever `sentences.clear` returns
def clear_sentences
  sentences.clear
end

#split_text(new_text = nil) ⇒ Object

Might be cool to count punctuation separately: we can point to punctuation as a way to indicate the end. If the sentences are delimited by \n, we can have nil be the value it points to instead. This way, we can impose grammatical rules by making the first word of the sentence capitalized, and the end of the sentence will end with some sort of punctuation.



43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# File 'lib/markovite/splitter.rb', line 43

# Rebuilds the sentence list from +new_text+, or from #corpus when no text
# is given.
#
# Previously stored sentences are cleared first. The text is broken into
# words by #split_words and the words are walked in order:
# * a word for which #is_end_of_sentence? is true is handed to
#   #add_sentence together with the words gathered so far;
# * otherwise, a word for which #has_newline? is true is split with
#   #split_newline -- the first piece closes the current sentence and the
#   second piece starts the next one (any further pieces are not used);
# * any other word is appended to the words gathered so far.
# Words left over after the last word are handed to #add_sentence with a
# nil terminator.
#
# @param new_text [Object, nil] text to split; falls back to #corpus when
#   nil or false
# @return [Object] the value of #sentences
def split_text(new_text = nil)
  clear_sentences
  text = new_text || corpus
  pending = []
  split_words(text).each do |token|
    if is_end_of_sentence?(token)
      # add_sentence's return value becomes the buffer for the next sentence.
      pending = add_sentence(pending, token)
      next
    end

    unless has_newline?(token)
      pending << token
      next
    end

    before_break, after_break = split_newline(token)
    pending = add_sentence(pending, before_break)
    pending << after_break
  end
  add_sentence(pending, nil) unless pending.empty?
  sentences
end