Class: MarkovSentenceGenerator

Inherits:
Object
  • Object
show all
Defined in:
lib/kusari/markov_sentence_generator.rb

Constant Summary collapse

HEAD =
"[HEAD]"
TAIL =
"[TAIL]"

Instance Method Summary collapse

Constructor Details

#initialize(gram = 3, ipadic_path = "./ipadic") ⇒ MarkovSentenceGenerator

Returns a new instance of MarkovSentenceGenerator.



10
11
12
13
14
15
16
17
18
# File 'lib/kusari/markov_sentence_generator.rb', line 10

# Sets up a generator with an empty Markov table.
#
# @param gram [Integer] N of the N-gram model (length of each stored word array)
# @param ipadic_path [String] dictionary path handed to Igo::Tagger
def initialize(gram = 3, ipadic_path = "./ipadic")
  # Japanese tokenizer
  @tagger = Igo::Tagger.new(ipadic_path)

  @gram = gram

  # arrays of tokenized words, collected according to the N-gram model
  @markov_table = []
end

Instance Method Details

#add(string) ⇒ Object



47
48
49
50
51
52
53
54
55
56
57
58
59
60
# File 'lib/kusari/markov_sentence_generator.rb', line 47

# Tokenizes +string+ and appends each consecutive window of @gram tokens
# to the Markov table, stopping at the window whose last token is TAIL.
#
# @param string [String] text to learn from
# @return [nil]
def add(string)
  words = tokenize(string)

  # at least N+1 tokens are required so that the HEAD-started window and the
  # TAIL-ended window are two different arrays
  return unless words.size >= @gram + 1

  # slide the window over the tokens and record every position
  offset = 0
  loop do
    stop = offset + @gram - 1
    @markov_table.push(words[offset..stop])
    break if words[stop] == TAIL
    offset += 1
  end
end

#generate(limit) ⇒ Object



62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# File 'lib/kusari/markov_sentence_generator.rb', line 62

# Generates a sentence by randomly chaining arrays stored in the Markov table.
#
# A HEAD-started array is sampled to seed the sentence. After that, arrays
# whose first word equals the last word of the previously sampled array are
# sampled repeatedly until one of them ends with TAIL, no array can continue
# the chain, or appending the next words would exceed +limit+.
#
# @param limit [Integer] maximum sentence length in characters; it is checked
#   only when growing the sentence with a non-final array, so the seed words
#   and the words of the TAIL-ended array are appended without this check
# @return [String] the generated sentence, or an empty string when the table
#   holds no HEAD-started array
def generate(limit)
  # index the table by first word once instead of rescanning it on every step;
  # group_by keeps table order, so each candidate list matches a linear scan
  arrays_by_first_word = @markov_table.group_by(&:first)

  # nothing to start from (e.g. an empty table): return an empty sentence
  head_arrays = arrays_by_first_word[HEAD]
  return "" if head_arrays.nil?

  # sample one HEAD-started array and create the initial sentence from it
  sampled_array = head_arrays.sample
  sentence = sampled_array[1..@gram - 1].join

  # run the Markov chain until the TAIL flag is reached
  loop do
    # arrays whose head word can chain onto the current tail of the sentence
    chain_arrays = arrays_by_first_word[sampled_array[@gram - 1]]

    # finish here if the chain cannot be continued
    break if chain_arrays.nil?

    sampled_array = chain_arrays.sample
    if sampled_array.last == TAIL
      # final array: append its words without the TAIL flag itself
      sentence << sampled_array[1..@gram - 2].join
      break
    end

    concat_string = sampled_array[1..@gram - 1].join
    break if sentence.length + concat_string.length > limit

    sentence << concat_string
  end
  sentence
end

#load_table(path) ⇒ Object



20
21
22
23
24
25
26
27
28
29
# File 'lib/kusari/markov_sentence_generator.rb', line 20

# Restores the gram size and the Markov table from a MessagePack file,
# reading the "gram" and "table" entries of the unpacked data.
#
# @param path [String] path of the file to read
# @return [Object, false] the value stored under "table", or false when
#   +path+ does not exist (the current state is left untouched in that case)
def load_table(path)
  # File.exists? was removed in Ruby 3.2; File.exist? is the supported spelling
  return false unless File.exist?(path)

  # binread opens the file in binary mode and closes it again after reading
  pack = MessagePack.unpack(File.binread(path))
  @gram = pack["gram"]
  @markov_table = pack["table"]
end

#save_table(path) ⇒ Object



31
32
33
34
35
36
37
38
# File 'lib/kusari/markov_sentence_generator.rb', line 31

# Writes the gram size and the Markov table to +path+ in MessagePack format,
# under the keys "gram" and "table".
#
# @param path [String] destination file; opened in "wb" mode
# @return [Integer] the value returned by the write call
def save_table(path)
  payload = { "gram" => @gram, "table" => @markov_table }
  File.open(path, "wb") { |file| file.write(payload.to_msgpack) }
end

#tokenize(string) ⇒ Object



40
41
42
43
44
45
# File 'lib/kusari/markov_sentence_generator.rb', line 40

# Splits +string+ with the tagger and wraps the tokens in the HEAD and TAIL
# markers.
#
# @param string [String] text to tokenize
# @return [Array] HEAD, then the tokens returned by the tagger's wakati, then TAIL
def tokenize(string)
  [HEAD].concat(@tagger.wakati(string)).push(TAIL)
end