Class: Ebooks::MarkovModel

Inherits:
Object
  • Object
show all
Defined in:
lib/twitter_ebooks/markov.rb

Overview

This is an n-gram-based Markov model optimized to build directly from a tokenized sentence list without requiring much transformation of the input.

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.build(sentences) ⇒ Object



11
12
13
# File 'lib/twitter_ebooks/markov.rb', line 11

# Convenience constructor: creates a fresh MarkovModel and feeds it
# +sentences+ through #consume, returning whatever #consume returns.
def self.build(sentences)
  model = MarkovModel.new
  model.consume(sentences)
end

Instance Method Details

#chain(tokens) ⇒ Object



55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# File 'lib/twitter_ebooks/markov.rb', line 55

# Extends +tokens+ one randomly chosen token at a time until the model
# chooses to end the sentence (the chosen candidate resolves to INTERIM).
#
# Candidates are looked up by the trailing bigram first. When that bigram
# is unknown or offers fewer than two candidates, the lookup falls back to
# the unigram table for the last token.
#
# The caller's array is never mutated: each step builds a new array. If the
# sentence ends immediately, the array passed in is returned as-is.
#
# Implemented as a loop rather than self-recursion so that long chains do
# not grow the call stack.
#
# @param tokens [Array] the tokens generated so far (non-empty in practice)
# @return [Array] the completed token list
# @raise [ArgumentError] if no continuation is known for the last token
def chain(tokens)
  loop do
    # A bigram needs at least two tokens; either level of the bigram table
    # may be missing for input that was never seen during #consume.
    matches = (@bigrams[tokens[-2]] || {})[tokens[-1]] if tokens.length > 1
    matches = @unigrams[tokens[-1]] if matches.nil? || matches.length < 2

    if matches.nil? || matches.empty?
      # This should never happen unless a strange token is
      # supplied from outside the dataset
      raise ArgumentError, "Unable to continue chain for: #{tokens.inspect}"
    end

    next_token = find_token(matches.sample)
    return tokens if next_token == INTERIM # We chose to end the sentence

    tokens += [next_token]
  end
end

#consume(sentences) ⇒ Object



15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# File 'lib/twitter_ebooks/markov.rb', line 15

# Builds the unigram and bigram lookup tables from +sentences+ (a list of
# token lists), replacing any tables this model held before.
#
# Table shape: ngram => [[sentence_pos, token_pos] or INTERIM, ...], where
# each [sentence_pos, token_pos] pair points at the token that followed the
# ngram in the source sentences and INTERIM marks a sentence ending.
# Unigrams are kept alongside bigrams so that lookups can fall back to them
# when a bigram is unavailable, such as at the start of a sentence (where
# the "previous token" is INTERIM).
#
# @param sentences [Array<Array>] tokenized sentences
# @return [self]
def consume(sentences)
  @sentences = sentences
  @unigrams = {}
  @bigrams = {}

  sentences.each_with_index do |sentence, sentence_pos|
    final_pos = sentence.length - 1
    previous = INTERIM

    sentence.each_with_index do |current, token_pos|
      # `current` is what followed `previous` in this sentence.
      (@unigrams[previous] ||= []) << [sentence_pos, token_pos]

      followers = ((@bigrams[previous] ||= {})[current] ||= [])

      if token_pos == final_pos
        # Last token of the sentence: record the ending in both tables.
        (@unigrams[current] ||= []) << INTERIM
        followers << INTERIM
      else
        followers << [sentence_pos, token_pos + 1]
      end

      previous = current
    end
  end

  self
end

#find_token(index) ⇒ Object



47
48
49
50
51
52
53
# File 'lib/twitter_ebooks/markov.rb', line 47

# Resolves a table entry to a token. The INTERIM marker is passed through
# unchanged; any other entry is treated as a [sentence_pos, token_pos] pair
# and looked up in the stored sentences.
def find_token(index)
  return INTERIM if index == INTERIM

  sentence_pos = index[0]
  token_pos = index[1]
  @sentences[sentence_pos][token_pos]
end

#generate ⇒ Object



78
79
80
# File 'lib/twitter_ebooks/markov.rb', line 78

# Generates one sentence: runs #chain starting from the INTERIM marker alone
# and hands the resulting token list to NLP.reconstruct.
def generate
  tokens = chain([INTERIM])
  NLP.reconstruct(tokens)
end