Class: Momblish::CorpusAnalyzer

Inherits:
Object
  • Object
show all
Defined in:
lib/momblish/corpus_analyzer.rb

Constant Summary collapse

# Single-character strings treated as punctuation when filtering corpus
# words (ASCII punctuation plus the newline character). Frozen so the
# shared list cannot be mutated by accident at runtime.
PUNCTUATION =
"!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~\n".chars.freeze

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(corpus = []) ⇒ CorpusAnalyzer

Returns a new instance of CorpusAnalyzer.



10
11
12
13
14
15
# File 'lib/momblish/corpus_analyzer.rb', line 10

# Builds the analyzer and immediately computes both statistics tables.
#
# @param corpus [#map] collection of word strings; each entry has its
#   trailing whitespace stripped before analysis (defaults to no words)
def initialize(corpus = [])
  # Both init_* steps write into @corpus, so it must exist before they run.
  @corpus = Corpus.new({}, {})
  @words = corpus.map { |entry| entry.rstrip }

  init_weighted_bigrams
  init_occurrences
end

Instance Attribute Details

#corpus ⇒ Object

Returns the value of attribute corpus.



8
9
10
# File 'lib/momblish/corpus_analyzer.rb', line 8

# Reader for the @corpus instance variable.
#
# @return [Object] the current value of @corpus
def corpus
  @corpus
end

#words ⇒ Object

Returns the value of attribute words.



8
9
10
# File 'lib/momblish/corpus_analyzer.rb', line 8

# Reader for the @words instance variable.
#
# @return [Object] the current value of @words
def words
  @words
end

Instance Method Details

#init_occurrences ⇒ Object



36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# File 'lib/momblish/corpus_analyzer.rb', line 36

# Populates @corpus.occurrences with, for every bigram seen in the words,
# the relative frequency of each character that directly follows it.
#
# Words are chomped and upcased first; any word containing a PUNCTUATION
# character is skipped entirely. Words shorter than three characters
# contribute nothing because they contain no trigram.
#
# @return [Hash{String => Hash{String => Integer}}] the raw follow-up
#   counts per bigram (the normalized frequencies live in @corpus.occurrences)
def init_occurrences
  follow_counts = Hash.new { |hash, key| hash[key] = Hash.new(0) }

  @words.each do |word|
    letters = word.chomp.upcase.chars
    next unless (letters & PUNCTUATION).empty?

    # Every run of three consecutive letters is a bigram plus its successor.
    letters.each_cons(3) do |first, second, third|
      follow_counts[first + second][third] += 1
    end
  end

  # The stored table autovivifies nested hashes at every level on lookup.
  @corpus.occurrences = Hash.new { |hash, key| hash[key] = Hash.new(&hash.default_proc) }

  follow_counts.each_pair do |bigram, tally|
    denominator = tally.values.sum.to_f
    tally.each_pair do |letter, seen|
      @corpus.occurrences[bigram][letter] = seen / denominator
    end
  end
end

#init_weighted_bigrams ⇒ Object



17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# File 'lib/momblish/corpus_analyzer.rb', line 17

# Populates @corpus.weighted_bigrams with the relative frequency of each
# upcased two-character word prefix.
#
# Only words longer than two characters are counted, and only when neither
# of their first two characters is in PUNCTUATION (later characters are
# not checked here).
#
# @return [Hash{String => Integer}] the raw count per starting bigram
def init_weighted_bigrams
  prefix_counts = Hash.new(0)

  @words.each do |word|
    next if word.length <= 2

    prefix = word[0..1]
    next if prefix.chars.any? { |char| PUNCTUATION.include?(char) }

    prefix_counts[prefix.upcase] += 1
  end

  denominator = prefix_counts.values.sum

  # No division happens when there are no counted words, so a zero
  # denominator is never used.
  prefix_counts.each_pair do |bigram, seen|
    @corpus.weighted_bigrams[bigram] = seen.to_f / denominator
  end
end