Class: MarkovSentenceGenerator
- Inherits:
-
Object
- Object
- MarkovSentenceGenerator
- Defined in:
- lib/kusari/markov_sentence_generator.rb
Constant Summary collapse
- HEAD =
"[HEAD]"
- TAIL =
"[TAIL]"
Instance Method Summary collapse
- #add(string) ⇒ Object
- #generate(limit) ⇒ Object
-
#initialize(gram = 3, ipadic_path = "./ipadic") ⇒ MarkovSentenceGenerator
constructor
A new instance of MarkovSentenceGenerator.
- #load_table(path) ⇒ Object
- #save_table(path) ⇒ Object
- #tokenize(string) ⇒ Object
Constructor Details
#initialize(gram = 3, ipadic_path = "./ipadic") ⇒ MarkovSentenceGenerator
Returns a new instance of MarkovSentenceGenerator.
10 11 12 13 14 15 16 17 18 |
# File 'lib/kusari/markov_sentence_generator.rb', line 10

# Sets up a generator with an empty Markov table.
#
# @param gram [Integer] the N of the N-gram model; stored in @gram
# @param ipadic_path [String] path passed to Igo::Tagger.new
def initialize(gram = 3, ipadic_path = "./ipadic")
  @gram = gram

  # Japanese tokenizer
  @tagger = Igo::Tagger.new(ipadic_path)

  # Arrays of tokenized words (one per N-gram window) are collected here.
  @markov_table = []
end
Instance Method Details
#add(string) ⇒ Object
47 48 49 50 51 52 53 54 55 56 57 58 59 60 |
# File 'lib/kusari/markov_sentence_generator.rb', line 47

# Tokenizes +string+ and appends each window of @gram consecutive tokens
# to @markov_table, stopping after the first window that ends with TAIL.
#
# Input that yields fewer than @gram + 1 tokens is ignored.
#
# @param string [String] text to add to the table
# @return [nil]
def add(string)
  tokens = tokenize(string)

  # With at least N+1 tokens we can create both a HEAD-started and a
  # TAIL-ended array of words.
  return if tokens.size < @gram + 1

  # each_cons walks the windows without a manual index and always
  # terminates, even if no window happens to end with TAIL.
  tokens.each_cons(@gram) do |window|
    @markov_table << window
    break if window.last == TAIL
  end

  nil
end
#generate(limit) ⇒ Object
62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 |
# File 'lib/kusari/markov_sentence_generator.rb', line 62

# Builds a sentence by chaining entries of @markov_table.
#
# A random HEAD-started entry seeds the sentence. After that, an entry whose
# first token equals the last token of the previous entry is sampled
# repeatedly, until a TAIL-ended entry is reached, no entry can continue the
# chain, or appending the next fragment would exceed +limit+.
#
# The limit is only checked when appending a non-terminal fragment; the
# seed and the final TAIL-ended fragment are appended without a check.
#
# @param limit [Integer] maximum sentence length (in characters) used when
#   deciding whether to append another non-terminal fragment
# @return [String] the generated sentence, or "" when the table contains no
#   HEAD-started entry
def generate(limit)
  # Index the table once by leading token so each chain step is a lookup
  # instead of a full scan. group_by keeps the table order inside each group.
  arrays_by_first = @markov_table.group_by { |markov_array| markov_array[0] }

  # Without a HEAD-started entry there is nothing to start from.
  head_arrays = arrays_by_first[HEAD]
  return "" if head_arrays.nil?

  # Sample one HEAD-started array and create the initial sentence from it
  # (the HEAD marker itself is dropped).
  sampled_array = head_arrays.sample
  sentence = sampled_array[1..(@gram - 1)].join

  # Run the Markov chain until the TAIL flag is reached.
  loop do
    # Entries that can chain their head word to the current tail word.
    chain_arrays = arrays_by_first[sampled_array[@gram - 1]]

    # Finish here if we cannot continue to chain.
    break if chain_arrays.nil?

    sampled_array = chain_arrays.sample

    if sampled_array.last == TAIL
      # Drop both the overlapping first token and the TAIL marker.
      sentence << sampled_array[1..(@gram - 2)].join
      break
    end

    concat_string = sampled_array[1..(@gram - 1)].join
    break if sentence.length + concat_string.length > limit

    sentence << concat_string
  end

  sentence
end
#load_table(path) ⇒ Object
20 21 22 23 24 25 26 27 28 29 |
# File 'lib/kusari/markov_sentence_generator.rb', line 20

# Restores @gram and @markov_table from a MessagePack file.
#
# The file is expected to unpack to a hash with the keys "gram" and
# "table".
#
# @param path [String] location of the saved table
# @return [Object, false] the loaded table (value of the "table" key), or
#   false when +path+ does not exist
def load_table(path)
  # File.exists? was removed in Ruby 3.2; File.exist? works everywhere.
  return false unless File.exist?(path)

  # binread opens, reads and closes the file in one call, so no handle leaks.
  pack = MessagePack.unpack(File.binread(path))
  @gram = pack["gram"]
  @markov_table = pack["table"]
end
#save_table(path) ⇒ Object
31 32 33 34 35 36 37 38 |
# File 'lib/kusari/markov_sentence_generator.rb', line 31

# Serializes @gram and @markov_table with MessagePack and writes the result
# to +path+ in binary mode.
#
# @param path [String] destination file
# @return [Object] whatever the block's write call returns
def save_table(path)
  payload = { "gram" => @gram, "table" => @markov_table }

  File.open(path, "wb") { |io| io.write(payload.to_msgpack) }
end
#tokenize(string) ⇒ Object
40 41 42 43 44 45 |
# File 'lib/kusari/markov_sentence_generator.rb', line 40

# Splits +string+ into words with the tagger and wraps the result in the
# HEAD and TAIL markers.
#
# @param string [String] text to split
# @return [Array] HEAD, the tokens returned by @tagger.wakati, then TAIL
def tokenize(string)
  [HEAD] + @tagger.wakati(string) + [TAIL]
end