Class: Ebooks::SuffixGenerator

Inherits:
Object
Defined in:
lib/twitter_ebooks/suffix.rb

Overview

This generator uses data similar to a Markov model, but instead of making a chain by looking up bigrams, it uses the stored positions to randomly replace token-array suffixes in one sentence with matching suffixes from another.
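
A minimal usage sketch, assuming the gem is installed and that sentences is an Array<Array<Integer>> of tikified sentences prepared elsewhere (the integer values below are arbitrary placeholders):

require 'twitter_ebooks'

sentences = [[0, 1, 2, 3], [4, 1, 2, 5], [0, 6, 2, 3]] # placeholder tikis
generator = Ebooks::SuffixGenerator.build(sentences)
tikis = generator.generate(5, :unigrams)
# => an Array<Integer> of recombined token indexes, to be mapped back to
#    words by whatever produced the original token list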

Class Method Summary

  • .build(sentences) ⇒ SuffixGenerator

Instance Method Summary

  • #initialize(sentences) ⇒ SuffixGenerator (constructor)
  • #generate(passes = 5, n = :unigrams) ⇒ Array<Integer>

Constructor Details

#initialize(sentences) ⇒ SuffixGenerator

Returns a new instance of SuffixGenerator.



# File 'lib/twitter_ebooks/suffix.rb', line 19

def initialize(sentences)
  @sentences = sentences.reject { |s| s.empty? }
  @unigrams = {}
  @bigrams = {}

  @sentences.each_with_index do |tikis, i|
    if i % 10000 == 0
      log("Building: sentence #{i} of #{sentences.length}")
    end
    last_tiki = INTERIM
    tikis.each_with_index do |tiki, j|
      @unigrams[last_tiki] ||= []
      @unigrams[last_tiki] << [i, j]

      @bigrams[last_tiki] ||= {}
      @bigrams[last_tiki][tiki] ||= []

      if j == tikis.length-1 # Mark sentence endings
        @unigrams[tiki] ||= []
        @unigrams[tiki] << [i, INTERIM]
        @bigrams[last_tiki][tiki] << [i, INTERIM]
      else
        @bigrams[last_tiki][tiki] << [i, j+1]
      end

      last_tiki = tiki
    end
  end

  self
end
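
To make the index layout concrete, here is roughly what the constructor builds for a one-sentence toy corpus (token values are arbitrary; INTERIM is the sentinel the class uses to mark sentence boundaries):

SuffixGenerator.new([[7, 8, 9]])
# @unigrams -- tiki => [sentence index, position of a token that follows it,
#                       or INTERIM when the sentence ends there]
# { INTERIM => [[0, 0]], 7 => [[0, 1]], 8 => [[0, 2]], 9 => [[0, INTERIM]] }
#
# @bigrams  -- first tiki => { second tiki => [sentence index, position just
#                              after the pair, or INTERIM at a sentence end] }
# { INTERIM => { 7 => [[0, 1]] }, 7 => { 8 => [[0, 2]] }, 8 => { 9 => [[0, INTERIM]] } }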

Class Method Details

.build(sentences) ⇒ SuffixGenerator

Build a generator from a corpus of tikified sentences. “Tikis” are token indexes: a way of representing words and punctuation as their integer positions in a big array of such tokens.

Parameters:

  • sentences (Array<Array<Integer>>)

Returns:

  • (SuffixGenerator)

# File 'lib/twitter_ebooks/suffix.rb', line 15

def self.build(sentences)
  SuffixGenerator.new(sentences)
end
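
For illustration only, a corpus of tikified sentences can be produced from plain word arrays with a small lookup table; tikify_corpus below is a hypothetical helper, not part of this class or the gem's public API:

def tikify_corpus(sentences_of_words)
  tokens = []
  lookup = Hash.new do |hash, word|
    tokens << word
    hash[word] = tokens.length - 1
  end
  tikis = sentences_of_words.map { |words| words.map { |w| lookup[w] } }
  [tikis, tokens]
end

tikis, tokens = tikify_corpus([%w[the cat sat], %w[the dog sat]])
# tikis  => [[0, 1, 2], [0, 3, 2]]
# tokens => ["the", "cat", "sat", "dog"]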

Instance Method Details

#generate(passes = 5, n = :unigrams) ⇒ Array<Integer>

Generate a recombined sequence of tikis

Parameters:

  • passes (Integer) (defaults to: 5)

    number of times to recombine

  • n (Symbol) (defaults to: :unigrams)

    :unigrams or :bigrams (affects how conservative the model is)

Returns:

  • (Array<Integer>)


# File 'lib/twitter_ebooks/suffix.rb', line 55

def generate(passes=5, n=:unigrams)
  index = rand(@sentences.length)
  tikis = @sentences[index]
  used = [index] # Sentences we've already used
  verbatim = [tikis] # Verbatim sentences to avoid reproducing

  0.upto(passes-1) do
    varsites = {} # Map bigram start site => next tiki alternatives

    tikis.each_with_index do |tiki, i|
      next_tiki = tikis[i+1]
      break if next_tiki.nil?

      alternatives = (n == :unigrams) ? @unigrams[next_tiki] : @bigrams[tiki][next_tiki]
      # Filter out suffixes from previous sentences
      alternatives.reject! { |a| a[1] == INTERIM || used.include?(a[0]) }
      varsites[i] = alternatives unless alternatives.empty?
    end

    variant = nil
    varsites.to_a.shuffle.each do |site|
      start = site[0]

      site[1].shuffle.each do |alt|
        verbatim << @sentences[alt[0]]
        suffix = @sentences[alt[0]][alt[1]..-1]
        potential = tikis[0..start+1] + suffix

        # Ensure we're not just rebuilding some segment of another sentence
        unless verbatim.find { |v| NLP.subseq?(v, potential) || NLP.subseq?(potential, v) }
          used << alt[0]
          variant = potential
          break
        end
      end

      break if variant
    end

    # If we failed to produce a variation from any alternative, there
    # is no use running additional passes-- they'll have the same result.
    break if variant.nil?

    tikis = variant
  end

  tikis
end
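
As a rough illustration of what a single pass can do (token values arbitrary), two sentences that share a bigram may exchange suffixes at that overlap:

require 'twitter_ebooks'

generator = Ebooks::SuffixGenerator.build([[0, 1, 2, 3], [4, 1, 2, 5]])
generator.generate(1, :bigrams)
# The output depends on the random choices; one possible result is
# [0, 1, 2, 5] -- the prefix [0, 1, 2] of the first sentence spliced, at
# the shared bigram (1, 2), onto the suffix [5] of the second.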