Class: Momblish::CorpusAnalyzer
- Inherits:
-
Object
- Object
- Momblish::CorpusAnalyzer
- Defined in:
- lib/momblish/corpus_analyzer.rb
Constant Summary collapse
# Single-character strings that mark a word as containing punctuation
# (ASCII punctuation plus the newline character). Frozen so the shared
# constant cannot be mutated by accident at runtime.
PUNCTUATION =
  "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~\n".chars.freeze
Instance Attribute Summary collapse
-
#corpus ⇒ Object
Returns the value of attribute corpus.
-
#words ⇒ Object
Returns the value of attribute words.
Instance Method Summary collapse
- #init_occurrences ⇒ Object
- #init_weighted_bigrams ⇒ Object
-
#initialize(corpus = []) ⇒ CorpusAnalyzer
constructor
A new instance of CorpusAnalyzer.
Constructor Details
#initialize(corpus = []) ⇒ CorpusAnalyzer
Returns a new instance of CorpusAnalyzer.
10 11 12 13 14 15 |
# File 'lib/momblish/corpus_analyzer.rb', line 10
# Creates an analyzer and immediately builds both corpus tables.
#
# @param corpus [#map] collection of words; each element must respond to
#   +rstrip+ (trailing whitespace is removed before analysis)
def initialize(corpus = [])
  # Start from an empty Corpus built from two empty hashes; the init_* calls
  # below fill it in.
  @corpus = Corpus.new({}, {})
  @words = corpus.map { |entry| entry.rstrip }

  init_weighted_bigrams
  init_occurrences
end
Instance Attribute Details
#corpus ⇒ Object
Returns the value of attribute corpus.
8 9 10 |
# File 'lib/momblish/corpus_analyzer.rb', line 8
# Reader for the +@corpus+ instance variable.
#
# @return [Object] the current value of +@corpus+
def corpus
  @corpus
end
#words ⇒ Object
Returns the value of attribute words.
8 9 10 |
# File 'lib/momblish/corpus_analyzer.rb', line 8
# Reader for the +@words+ instance variable.
#
# @return [Object] the current value of +@words+
def words
  @words
end
Instance Method Details
#init_occurrences ⇒ Object
36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 |
# File 'lib/momblish/corpus_analyzer.rb', line 36
# Fills +@corpus.occurrences+ with, for every two-letter prefix seen in the
# word list, the relative frequency of each letter that follows it.
#
# Each word is chomped and upcased; words containing any PUNCTUATION
# character are skipped entirely. Every run of three consecutive characters
# contributes one count to +[first two chars][third char]+, and the counts
# for each bigram are then normalised so they sum to 1.0.
#
# @return [Hash{String => Hash{String => Integer}}] the raw (un-normalised)
#   counts
def init_occurrences
  counts = Hash.new { |hash, bigram| hash[bigram] = Hash.new(0) }

  @words.each do |word|
    letters = word.chomp.upcase.chars
    next if (letters & PUNCTUATION).any?

    # Count directly while sliding the 3-character window; no intermediate
    # list of all trigrams and no separate grouping pass is needed.
    letters.each_cons(3) do |first, second, third|
      counts[first + second][third] += 1
    end
  end

  # Nested hash whose missing keys auto-create further nested hashes that
  # share the same default proc.
  @corpus.occurrences = Hash.new { |h, k| h[k] = Hash.new(&h.default_proc) }

  counts.each do |bigram, followers|
    total = followers.values.sum.to_f
    followers.each do |letter, count|
      @corpus.occurrences[bigram][letter] = count / total
    end
  end
end
#init_weighted_bigrams ⇒ Object
17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 |
# File 'lib/momblish/corpus_analyzer.rb', line 17
# Fills +@corpus.weighted_bigrams+ with the relative frequency of each
# upcased two-letter word prefix in the word list.
#
# A word counts only if it has more than two characters and contains no
# PUNCTUATION character anywhere. This is the same whole-word rule that
# init_occurrences applies, so every starting bigram recorded here comes from
# a word that also contributes at least one trigram there.
#
# @return [Hash{String => Integer}] the raw (un-normalised) prefix counts
def init_weighted_bigrams
  starting_bigrams = Hash.new(0)

  @words.each do |word|
    letters = word.chomp.chars
    next if letters.length < 3
    # Check the whole word, not just the prefix: a word such as "de-f" would
    # otherwise register "DE" here while being skipped by init_occurrences.
    next if (letters & PUNCTUATION).any?

    starting_bigrams[word[0..1].upcase] += 1
  end

  total = starting_bigrams.values.sum
  starting_bigrams.each do |bigram, count|
    @corpus.weighted_bigrams[bigram] = count.to_f / total
  end
end