Class: Loremarkov

Inherits:
Object
  • Object
show all
Defined in:
lib/loremarkov.rb

Constant Summary collapse

# Delimiter characters used by .lex: each one both ends the current
# word and is emitted as a token of its own.
TOKENS =
["\n", "\t", ' ', "'", '"']

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(num_prefix_words) ⇒ Loremarkov

Returns a new instance of Loremarkov.



69
70
71
72
# File 'lib/loremarkov.rb', line 69

# Build a generator whose prefixes are +num_prefix_words+ words long.
# The markov table starts empty; feed it text via #analyze.
def initialize(num_prefix_words)
  @markov = {}
  @num_prefix_words = num_prefix_words
end

Instance Attribute Details

#markov ⇒ Object (readonly)

Returns the value of attribute markov.



67
68
69
# File 'lib/loremarkov.rb', line 67

# Read-only accessor for the accumulated markov table
# (prefix-word arrays => arrays of observed successor words).
def markov
  @markov
end

Class Method Details

.analyze(text, num_prefix_words) ⇒ Object

Generate a markov data structure Arrays of string for keys and values Keys are prefixes – ordered word sequence of constant length Values are an accumulation of the next word after the prefix, however many times it may occur. e.g. If a prefix occurs twice, then the value will be an array of two words – possibly the same word twice.



39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# File 'lib/loremarkov.rb', line 39

# Generate a markov data structure from +text+.
# Keys are prefixes -- ordered word sequences of constant length
# (+num_prefix_words+).  Values accumulate every word observed
# immediately after that prefix, duplicates included, so frequency
# is preserved.  The final prefix maps to a nil entry signifying EOF.
def self.analyze(text, num_prefix_words)
  markov = {}
  words = lex(text)

  # The last valid start position leaves exactly num_prefix_words
  # words for the prefix; its successor is nil (the EOF marker).
  last_start = words.length - num_prefix_words
  (0..last_start).each do |start|
    prefix = words[start, num_prefix_words]
    (markov[prefix] ||= []) << words[start + num_prefix_words]
  end
  markov
end

.lex(str, tokens = TOKENS) ⇒ Object

Decompose text into an array of tokens, including and delimited by TOKENS e.g. "Hello", he said. # => ['"', 'Hello', '"', ',', ' ', 'he', ' ', 'said.',] This operation can be losslessly reversed by calling #join on the resulting array. i.e. lex(str).join == str



11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# File 'lib/loremarkov.rb', line 11

# Decompose +str+ into an array of tokens, including and delimited by
# +tokens+.  Each delimiter character ends the current word and is
# appended as its own token.  The operation is lossless:
# lex(str).join == str.
#
# Iterates characters, not bytes: the original byte-wise version
# (each_byte / b.chr) raised Encoding::CompatibilityError as soon as a
# multibyte UTF-8 character was appended to the word buffer.  ASCII
# behavior is unchanged.
def self.lex(str, tokens = TOKENS)
  final_ary = []
  word = ''
  str.each_char { |c|
    # either a token (thereby ending the current word)
    # or part of the current word
    #
    if tokens.include?(c)
      final_ary << word unless word.empty?
      final_ary << c
      word = ''
    else
      word << c
    end
  }
  # flush a trailing word not terminated by a delimiter
  final_ary << word unless word.empty?
  final_ary
end

.start_prefix(text, num_prefix_words) ⇒ Object

given the entire text, use an extremely conservative heuristic to grab only the first chunk to pass to lex



63
64
65
# File 'lib/loremarkov.rb', line 63

# Given the entire text, use an extremely conservative heuristic to
# grab only the first chunk (999 chars per prefix word) to pass to
# .lex, then keep just the first +num_prefix_words+ tokens.
def self.start_prefix(text, num_prefix_words)
  limit = 999 * num_prefix_words
  lex(text[0, limit]).first(num_prefix_words)
end

Instance Method Details

#analyze(text) ⇒ Object

text should have a definite end, not just a convenient buffer split



76
77
78
# File 'lib/loremarkov.rb', line 76

# Fold the markov table for +text+ into this instance's table.
# text should have a definite end, not just a convenient buffer split.
def analyze(text)
  additions = self.class.analyze(text, @num_prefix_words)
  @markov.merge!(additions)
end

#destroy(text) ⇒ Object

do it, you know you want to



98
99
100
101
# File 'lib/loremarkov.rb', line 98

# do it, you know you want to -- analyze +text+, then regenerate a
# scrambled version of it starting from its own opening prefix.
def destroy(text)
  analyze(text)
  seed = self.class.start_prefix(text, @num_prefix_words)
  generate_all(seed)
end

#generate_all(start_prefix_words) ⇒ Object

given the start prefix, generate words until EOF



88
89
90
91
92
93
94
# File 'lib/loremarkov.rb', line 88

# Given the start prefix, generate words until EOF.
#
# Repeatedly asks #generate_one for the next word, using the last
# @num_prefix_words generated words as the lookup prefix, and stops
# when nil (the EOF marker recorded by .analyze) comes back.
#
# Works on a copy so the caller's +start_prefix_words+ array is left
# untouched -- the original appended to it in place.
def generate_all(start_prefix_words)
  words = start_prefix_words.dup
  loop do
    next_word = generate_one(words[-@num_prefix_words, @num_prefix_words])
    break if next_word.nil?
    words << next_word
  end
  words.join
end

#generate_one(prefix_words) ⇒ Object

given a prefix, give me the next word



82
83
84
# File 'lib/loremarkov.rb', line 82

# Given a prefix (array of words), return one randomly chosen word
# that followed it in the analyzed text.  Raises KeyError for an
# unknown prefix; returns nil at the EOF marker.
def generate_one(prefix_words)
  candidates = @markov.fetch(prefix_words)
  candidates.sample
end