Class: Ebooks::SuffixGenerator

Inherits:
Object
Defined in:
lib/foxdear_ebooks/suffix.rb

Overview

This generator uses data similar to a Markov model, but instead of building a chain by looking up bigrams, it uses token positions to randomly replace the suffix of one sentence's token array with a matching suffix from another.
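In other words, two sentences that share a token can trade everything that follows that token. A minimal sketch of the idea (not part of the library), using hypothetical integer tikis:

a = [4, 9, 17, 22, 3]             # tikis for sentence A
b = [8, 17, 30, 12]               # tikis for sentence B

splice_at = a.index(17)           # the shared token's position in A
suffix = b[(b.index(17) + 1)..-1] # everything after it in B
recombined = a[0..splice_at] + suffix
# => [4, 9, 17, 30, 12]

The class below does the same thing at scale, using precomputed position indexes instead of scanning sentences on the fly.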

Class Method Summary

Instance Method Summary

Constructor Details

#initialize(sentences) ⇒ SuffixGenerator

Returns a new instance of SuffixGenerator.

# File 'lib/foxdear_ebooks/suffix.rb', line 19

def initialize(sentences)
  @sentences = sentences.reject { |s| s.empty? }
  @unigrams = {}
  @bigrams = {}

  @sentences.each_with_index do |tikis, i|
    if i % 10000 == 0
      log("Building: sentence #{i} of #{sentences.length}")
    end
    last_tiki = INTERIM
    tikis.each_with_index do |tiki, j|
      # Record [sentence index, position] of each token that follows last_tiki
      @unigrams[last_tiki] ||= []
      @unigrams[last_tiki] << [i, j]

      @bigrams[last_tiki] ||= {}
      @bigrams[last_tiki][tiki] ||= []

      if j == tikis.length-1 # Mark sentence endings
        @unigrams[tiki] ||= []
        @unigrams[tiki] << [i, INTERIM]
        @bigrams[last_tiki][tiki] << [i, INTERIM]
      else
        @bigrams[last_tiki][tiki] << [i, j+1]
      end

      last_tiki = tiki
    end
  end

  self
end
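
To make the index shapes concrete, here is a small worked example of what this constructor builds for a toy corpus of two tikified sentences (values shown as comments; INTERIM is the sentence-boundary sentinel):

sentences = [[5, 9, 2], [9, 4]]

# @unigrams maps a tiki (or INTERIM) to every [sentence_index, position]
# of a token that follows it somewhere in the corpus:
#   INTERIM => [[0, 0], [1, 0]]
#   5       => [[0, 1]]
#   9       => [[0, 2], [1, 1]]
#   2       => [[0, INTERIM]]   # 2 ends sentence 0
#   4       => [[1, INTERIM]]   # 4 ends sentence 1
#
# @bigrams keys on the following tiki as well:
#   @bigrams[5][9] => [[0, 2]]
#   @bigrams[9][2] => [[0, INTERIM]]
#   @bigrams[9][4] => [[1, INTERIM]]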

Class Method Details

.build(sentences) ⇒ SuffixGenerator

Build a generator from a corpus of tikified sentences. “Tikis” are token indexes: a way of representing words and punctuation as their integer positions in a big array of such tokens.
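
For illustration only (the actual tikification lives elsewhere in the gem), a sentence might map to tikis roughly like this, assuming a hypothetical corpus-wide token table:

tokens = ["the", "fox", "jumps", "."]   # hypothetical shared token table

# "the fox jumps ." as integer positions into that table:
tikis = ["the", "fox", "jumps", "."].map { |w| tokens.index(w) }
# => [0, 1, 2, 3]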



# File 'lib/foxdear_ebooks/suffix.rb', line 15

def self.build(sentences)
  SuffixGenerator.new(sentences)
end

Instance Method Details

#generate(passes = 5, n = :unigrams) ⇒ Array<Integer>

Generate a recombined sequence of tikis, starting from a random sentence and splicing in matching suffixes from other sentences over the given number of passes.



# File 'lib/foxdear_ebooks/suffix.rb', line 55

def generate(passes=5, n=:unigrams)
  index = rand(@sentences.length)
  tikis = @sentences[index]
  used = [index] # Sentences we've already used
  verbatim = [tikis] # Verbatim sentences to avoid reproducing

  0.upto(passes-1) do
    varsites = {} # Map bigram start site => next tiki alternatives

    tikis.each_with_index do |tiki, i|
      next_tiki = tikis[i+1]
      break if next_tiki.nil?

      alternatives = (n == :unigrams) ? @unigrams[next_tiki] : @bigrams[tiki][next_tiki]
      # Filter out suffixes from previous sentences
      alternatives.reject! { |a| a[1] == INTERIM || used.include?(a[0]) }
      varsites[i] = alternatives unless alternatives.empty?
    end

    variant = nil
    varsites.to_a.shuffle.each do |site|
      start = site[0]

      site[1].shuffle.each do |alt|
        verbatim << @sentences[alt[0]]
        suffix = @sentences[alt[0]][alt[1]..-1]
        potential = tikis[0..start+1] + suffix

        # Ensure we're not just rebuilding some segment of another sentence
        unless verbatim.find { |v| NLP.subseq?(v, potential) || NLP.subseq?(potential, v) }
          used << alt[0]
          variant = potential
          break
        end
      end

      break if variant
    end

    # If we failed to produce a variation from any alternative, there is
    # no point running additional passes; they would have the same result.
    break if variant.nil?

    tikis = variant
  end

  tikis
end
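
Putting it together, a typical usage sketch (assuming sentences is already an array of tikified sentences, and that mapping tikis back to words happens elsewhere):

model = Ebooks::SuffixGenerator.build(sentences)

tikis = model.generate               # 5 passes of unigram-level recombination
tikis = model.generate(10, :bigrams) # stricter bigram-level matching, more passes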