Module: Markovfun::Trigram

Includes:
Util
Defined in:
lib/markovfun/trigram.rb

Class Method Summary collapse

Methods included from Util

get_lines, get_sentences

Class Method Details

.counts_to_probs(counts_hash) ⇒ Object

Generates a probability hash, given a counts hash. Similar to counts_hash, except containing the probability that a word follows two preceding words (as opposed to number of occurrences).



41
42
43
44
45
46
47
48
49
50
51
# File 'lib/markovfun/trigram.rb', line 41

# Converts a counts hash into a probability hash.
#
# @param counts_hash [Hash] maps a context key to a hash of { word => count }
# @return [Hash] same keys as counts_hash; each count is replaced by
#   count / (sum of all counts for that context), as a Float
def self.counts_to_probs(counts_hash)
  probs_hash = {}
  counts_hash.each do |prev, cur_freq|
    # The denominator is identical for every word in this context, so it is
    # computed once here rather than once per word.
    total = cur_freq.values.reduce(:+)
    probs_hash[prev] ||= {}
    cur_freq.each do |cur, freq|
      probs_hash[prev][cur] = freq.to_f / total
    end
  end
  probs_hash
end

.get_counts(sentences) ⇒ Object

Returns a counts hash, given a list of sentences. The keys to the hash are all observed combinations of [prev2, prev1], where prev2 and prev1 are the two previous words. The values are hashes, in which the keys are words (cur) that have followed prev2 and prev1, and the values are the number of occurrences.



21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# File 'lib/markovfun/trigram.rb', line 21

# Builds trigram counts from a list of tokenized sentences.
#
# Each sentence is padded with two leading nils and one trailing nil, which
# stand for the beginning and end of the sentence.
#
# Note: Array#zip pads the shorter arrays with nil, so the windows anchored on
# the last two elements of the padded sentence are counted as well (for a
# non-empty sentence: [last_word, nil] => nil and [nil, nil] => nil).
#
# @param sentences [Enumerable<Array>] sentences, each an array of words
# @return [Hash] maps [prev2, prev1] to a hash of { cur => occurrences }
def self.get_counts(sentences)
  sentences.each_with_object({}) do |words, table|
    padded = [nil, nil] + words + [nil]
    windows = padded.zip(padded[1..-1], padded[2..-1])
    windows.each do |prev2, prev1, cur|
      followers = (table[[prev2, prev1]] ||= {})
      followers[cur] = followers[cur] ? followers[cur] + 1 : 1
    end
  end
end

.sample_sentence(probs_hash) ⇒ Object

Generates a sample sentence, given a probability hash.



63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# File 'lib/markovfun/trigram.rb', line 63

# Samples one sentence word by word from a trigram probability hash.
#
# Starts from the [nil, nil] context, repeatedly asks sample_word for the next
# word given the two previous words, and stops when sample_word returns nil.
#
# @param probs_hash [Hash] maps [prev2, prev1] to a hash of { word => probability }
# @return [Array] the sampled words, not including the terminating nil
def self.sample_sentence(probs_hash)
  words = []
  context = [nil, nil]

  loop do
    next_word = sample_word(probs_hash[context])
    return words if next_word.nil?

    words << next_word
    # Slide the two-word window forward by one word.
    context = [context.last, next_word]
  end
end

.sample_word(probs_hash) ⇒ Object

Generates a sample word, given a hash that maps each candidate word to its probability.



54
55
56
57
58
59
60
# File 'lib/markovfun/trigram.rb', line 54

# Draws one word at random from a { word => probability } hash.
#
# A uniform random number in [0, 1) is compared against the running
# probability mass, in the hash's iteration order.
#
# @param probs_hash [Hash] maps each candidate word to its probability
# @return [Object, nil] the selected word (which may itself be nil when nil is
#   a key), or nil when probs_hash is empty
def self.sample_word(probs_hash)
  score = rand
  fallback = nil
  probs_hash.each do |word, prob|
    return word if score < prob
    score -= prob
    fallback = word
  end
  # Reached only when the probabilities add up to less than the draw (for
  # example through floating-point rounding). Hash#each returns the hash
  # itself, so without an explicit value here the whole hash would be returned
  # as if it were a word; give the leftover mass to the last word instead.
  fallback
end

.score_sentence(sent, probs) ⇒ Object

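Scores a sentence by its total surprisal: the sum of -log2 of the probability of each word given the two preceding words. Lower scores mean the sentence is more likely to occur within the corpus.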



92
93
94
95
96
97
98
99
100
101
102
# File 'lib/markovfun/trigram.rb', line 92

# Scores a sentence by its total surprisal in bits under a trigram
# probability hash (lower means more likely).
#
# @param sent [Array, String] an array of words, or a String; a String has its
#   final character dropped, is split on spaces, and gets "." appended as its
#   own token
# @param probs [Hash] maps [prev2, prev1] to a hash of { word => probability }
# @return [Numeric] the sum of -log2(probability) over every scored window, or
#   Float::INFINITY when any window has no recorded (or zero) probability
def self.score_sentence(sent, probs)
  sent = sent[0..-2].split(" ").push(".") if sent.is_a?(String)
  padded = [nil, nil] + sent + [nil]

  # Array#zip pads the shorter arrays with nil, so the windows anchored on the
  # last two elements of the padded sentence are scored as well.
  padded.zip(padded[1..-1], padded[2..-1]).inject(0) do |total_surprise, (prev2, prev1, cur)|
    followers = probs[[prev2, prev1]]
    prob = followers && followers[cur]
    # An unseen context or word has zero likelihood, i.e. infinite surprise.
    return Float::INFINITY if prob.nil? || prob.zero?

    total_surprise - Math.log(prob, 2)
  end
end

.sentence_from_file(filename, min_length) ⇒ Object

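Generates a sentence, given a file. The sentences read from the file are turned into trigram counts and then probabilities, from which a sentence of at least min_length words is sampled.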



9
10
11
12
13
14
# File 'lib/markovfun/trigram.rb', line 9

# Generates a sentence from the text in a file.
#
# Pipeline: get_sentences(filename) -> get_counts -> counts_to_probs ->
# sentence_from_probs_hash.
#
# @param filename [Object] passed unchanged to get_sentences
# @param min_length [Integer] passed unchanged to sentence_from_probs_hash
# @return [Object] whatever sentence_from_probs_hash returns
def self.sentence_from_file(filename, min_length)
  trigram_probs = counts_to_probs(get_counts(get_sentences(filename)))
  sentence_from_probs_hash(trigram_probs, min_length)
end

.sentence_from_probs_hash(probs, min_length) ⇒ Object

Generates a sentence from a probability hash.



81
82
83
84
85
86
87
88
# File 'lib/markovfun/trigram.rb', line 81

# Samples sentences until one is acceptable, then renders it as a String.
#
# A candidate is rejected while its score_sentence value is above 30 or it has
# fewer than min_length words. The initial candidate is the empty sentence.
# There is no retry limit, so this loops for as long as candidates keep being
# rejected.
#
# @param probs [Hash] maps [prev2, prev1] to a hash of { word => probability }
# @param min_length [Integer] minimum number of words in the sampled sentence
# @return [String] all words but the last joined with spaces, followed by "."
def self.sentence_from_probs_hash(probs, min_length)
  words = []
  loop do
    rejected = score_sentence(words, probs) > 30 || words.length < min_length
    break unless rejected

    words = sample_sentence(probs)
  end
  # Presumably the final word is the sentence-ending punctuation token; it is
  # dropped and replaced by a "." attached directly to the text.
  words[0..-2].join(" ") + "."
end