Class: Ebooks::SuffixGenerator

Inherits:
Object
  • Object
show all
Defined in:
lib/twitter_ebooks/suffix.rb

Overview

This generator uses data identical to a Markov model, but instead of building a chain by looking up bigrams, it uses their recorded positions to randomly replace suffixes in one sentence with matching suffixes from another.

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(sentences) ⇒ SuffixGenerator

Returns a new instance of SuffixGenerator.



16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
# File 'lib/twitter_ebooks/suffix.rb', line 16

# Index a corpus of tikified sentences so suffixes can be looked up by the
# tiki (or pair of tikis) that precedes them.
#
# Sentences with fewer than two tikis are dropped. Two lookup tables are built:
#
# * @unigrams[prev]       => list of [sentence index, position] pairs, one for
#                            every place a tiki follows +prev+
# * @bigrams[prev][tiki]  => list of [sentence index, position] pairs pointing
#                            just past every occurrence of the pair prev, tiki
#
# INTERIM is used as the +prev+ key for the start of a sentence, and in place
# of a position to mark that a sentence ends there.
#
# @param sentences [Array<Array<Integer>>] tikified sentences
def initialize(sentences)
  @sentences = sentences.select { |sentence| sentence.length >= 2 }
  @unigrams = {}
  @bigrams = {}

  @sentences.each_with_index do |sentence, sentence_idx|
    final_pos = sentence.length - 1
    previous = INTERIM

    sentence.each_with_index do |current, pos|
      (@unigrams[previous] ||= []) << [sentence_idx, pos]

      followers = (@bigrams[previous] ||= {})
      sites = (followers[current] ||= [])

      if pos == final_pos
        # Mark sentence endings in both tables
        (@unigrams[current] ||= []) << [sentence_idx, INTERIM]
        sites << [sentence_idx, INTERIM]
      else
        sites << [sentence_idx, pos + 1]
      end

      previous = current
    end
  end

  self
end

Class Method Details

.build(sentences) ⇒ SuffixGenerator

Build a generator from a corpus of tikified sentences

Parameters:

  • sentences (Array<Array<Integer>>)

Returns:



12
13
14
# File 'lib/twitter_ebooks/suffix.rb', line 12

# Build a generator from a corpus of tikified sentences.
#
# Thin factory wrapper: forwards +sentences+ unchanged to SuffixGenerator.new.
#
# @param sentences [Array<Array<Integer>>] tikified sentences
# @return [SuffixGenerator] a new generator built from the given sentences
def self.build(sentences)
  SuffixGenerator.new(sentences)
end

Instance Method Details

#generate(passes = 5, n = :unigrams) ⇒ Array<Integer>

Generate a recombined sequence of tikis

Parameters:

  • passes (Integer) (defaults to: 5)

    number of times to recombine

  • n (Symbol) (defaults to: :unigrams)

    :unigrams or :bigrams (affects how conservative the model is)

Returns:

  • (Array<Integer>)


50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# File 'lib/twitter_ebooks/suffix.rb', line 50

# Generate a recombined sequence of tikis.
#
# Starts from a randomly chosen corpus sentence. On each pass it looks for a
# site where the current sequence can be cut and continued with the suffix of
# a not-yet-used sentence that follows the same tiki (:unigrams) or the same
# pair of tikis (:bigrams). A candidate is only accepted if it neither
# contains nor is contained in any sentence considered so far, so the result
# is not just a segment of an existing sentence. If no acceptable candidate
# exists on a pass, the sequence is left as it is.
#
# The lookup tables built by the constructor are only read here, never
# modified, so repeated calls all draw on the full model.
#
# @param passes [Integer] number of times to recombine
# @param n [Symbol] :unigrams or :bigrams (affects how conservative the model is)
# @return [Array<Integer>] the recombined tikis; empty if the corpus is empty
def generate(passes=5, n=:unigrams)
  # rand(0) yields a Float instead of an index, so an empty corpus needs a guard
  return [] if @sentences.empty?

  index = rand(@sentences.length)
  tikis = @sentences[index]
  used = [index] # Sentences we've already used
  verbatim = [tikis] # Verbatim sentences to avoid reproducing

  passes.times do
    varsites = {} # Map bigram start site => next tiki alternatives

    tikis.each_with_index do |tiki, i|
      next_tiki = tikis[i+1]
      break if next_tiki.nil?

      candidates = n == :unigrams ? @unigrams[next_tiki] : @bigrams[tiki][next_tiki]
      # Filter out sentence endings and suffixes from previous sentences.
      # Deliberately non-destructive: these arrays belong to the shared model.
      alternatives = candidates.reject { |a| a[1] == INTERIM || used.include?(a[0]) }
      varsites[i] = alternatives unless alternatives.empty?
    end

    variant = nil
    varsites.to_a.shuffle.each do |start, alternatives|
      # Try every alternative at this site exactly once, in random order
      alternatives.shuffle.each do |alt|
        source = @sentences[alt[0]]
        verbatim << source
        potential = tikis[0..start+1] + source[alt[1]..-1]

        # Ensure we're not just rebuilding some segment of another sentence
        next if verbatim.any? { |v| NLP.subseq?(v, potential) || NLP.subseq?(potential, v) }

        used << alt[0]
        variant = potential
        break
      end

      break if variant
    end

    tikis = variant if variant
  end

  tikis
end