Class: Ebooks::MarkovModel

Inherits:

Object

Object
Ebooks::MarkovModel

Defined in: lib/twitter_ebooks/markov.rb

Overview

This is an n-gram-based Markov model optimized to be built directly from a list of tokenized sentences, without requiring much transformation of the input.

Class Method Summary collapse

.build(sentences) ⇒ Object

Instance Method Summary collapse

Class Method Details

.build(sentences) ⇒ `Object`



11
12
13

# File 'lib/twitter_ebooks/markov.rb', line 11

# Convenience constructor: creates a fresh MarkovModel and feeds it +sentences+.
#
# @param sentences [Object] passed to #consume unchanged (a list of
#   tokenized sentences, judging by how #consume iterates it)
# @return [Object] whatever #consume returns
def self.build(sentences)
  model = MarkovModel.new
  model.consume(sentences)
end

Instance Method Details

#chain(tokens) ⇒ `Object`

# File 'lib/twitter_ebooks/markov.rb', line 55

# Extends +tokens+ one token at a time until the end-of-sentence marker
# (INTERIM) is sampled, then returns the accumulated sequence.
#
# Continuations are looked up by the trailing bigram first. When that bigram
# is unknown or offers fewer than two candidates, the lookup falls back to
# the trailing unigram.
#
# @param tokens [Array] seed sequence; it is not modified
# @return [Array] a new array holding the seed followed by the generated tokens
# @raise [ArgumentError] if no continuation is known for the trailing token(s),
#   e.g. a token from outside the dataset, an empty seed, or an empty model
def chain(tokens)
  sequence = tokens.dup

  # Iterating instead of recursing once per token keeps stack depth constant
  # and avoids copying the whole sequence on every step.
  loop do
    current = sequence[-1]
    matches = nil

    if sequence.length > 1
      followers = @bigrams[sequence[-2]]
      matches = followers[current] if followers
    end

    # Unknown bigram or too few bigram candidates: fall back to the unigram
    matches = @unigrams[current] if matches.nil? || matches.length < 2

    # Lookups yield nil (not an empty list) for tokens the model never saw,
    # so both cases must be treated as "no continuation".
    if matches.nil? || matches.empty?
      raise ArgumentError, "Unable to continue chain for: #{sequence.inspect}"
    end

    next_token = find_token(matches.sample)
    return sequence if next_token == INTERIM # We chose to end the sentence

    sequence << next_token
  end
end

#consume(sentences) ⇒ `Object`

# File 'lib/twitter_ebooks/markov.rb', line 15

# Indexes +sentences+ into the unigram and bigram lookup tables.
#
# Table layout:
#   @unigrams[prev]         => [[sentence_idx, token_idx] or INTERIM, ...]
#   @bigrams[prev][current] => [[sentence_idx, token_idx] or INTERIM, ...]
# Each entry points at the token that followed the key in some sentence, or
# is INTERIM when the sentence ended there. INTERIM is also the "previous
# token" of every sentence's first token. Both tables are kept so lookups can
# fall back to unigrams when a bigram is unavailable.
#
# @param sentences [Array<Array>] list of sentences, each a list of tokens
# @return [self]
def consume(sentences)
  @sentences = sentences
  @unigrams = {}
  @bigrams = {}

  sentences.each_with_index do |words, sentence_idx|
    final_idx = words.length - 1
    previous = INTERIM

    words.each_with_index do |word, word_idx|
      (@unigrams[previous] ||= []) << [sentence_idx, word_idx]
      followers = ((@bigrams[previous] ||= {})[word] ||= [])

      if word_idx < final_idx
        followers << [sentence_idx, word_idx + 1]
      else
        # Last token of the sentence: record the ending in both tables
        (@unigrams[word] ||= []) << INTERIM
        followers << INTERIM
      end

      previous = word
    end
  end

  self
end

#find_token(index) ⇒ `Object`

# File 'lib/twitter_ebooks/markov.rb', line 47

# Resolves a table entry to the token it refers to.
#
# @param index [Array, Object] either INTERIM or a [sentence_idx, token_idx] pair
# @return [Object] INTERIM when given INTERIM, otherwise the token stored at
#   that position in @sentences
def find_token(index)
  return INTERIM if index == INTERIM

  sentence = @sentences[index[0]]
  sentence[index[1]]
end

#generate ⇒ `Object`



78
79
80

# File 'lib/twitter_ebooks/markov.rb', line 78

# Produces one generated result: builds a chain seeded with the INTERIM
# marker and hands the resulting token list to NLP.reconstruct.
#
# @return [Object] whatever NLP.reconstruct returns for the chained tokens
def generate
  tokens = chain([INTERIM])
  NLP.reconstruct(tokens)
end

Class: Ebooks::MarkovModel

Overview

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.build(sentences) ⇒ Object

Instance Method Details

#chain(tokens) ⇒ Object

#consume(sentences) ⇒ Object

#find_token(index) ⇒ Object

#generate ⇒ Object

.build(sentences) ⇒ `Object`

#chain(tokens) ⇒ `Object`

#consume(sentences) ⇒ `Object`

#find_token(index) ⇒ `Object`

#generate ⇒ `Object`