Class: Maxixe::Trainer

Inherits:
Object
  • Object
show all
Defined in:
lib/maxixe.rb

Class Method Summary collapse

Class Method Details

.check_recognition(index, samples) ⇒ Object



128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
# File 'lib/maxixe.rb', line 128

# Scores every non-empty subset of the index keys against +samples+ for a
# range of thresholds.
#
# For each subset a Maxixe::Segmenter is built from the matching slice of
# +index+; each sample's unsplit text is segmented with that segmenter and
# compared to the expected split via Text::Levenshtein.distance. The
# distances are summed per threshold.
#
# index   - Hash whose keys select which parts of the index take part.
# samples - Enumerable of [not_split, split] pairs.
#
# Returns a Hash of the form { subset_of_keys => { threshold => total_distance } }.
def self.check_recognition(index, samples)
  keys = index.keys
  # All non-empty key subsets, ordered by subset size.
  subsets = (1..keys.size).flat_map { |size| keys.combination(size).to_a }

  subsets.each_with_object({}) do |subset, by_subset|
    sub_index = index.select { |key, _value| subset.include?(key) }
    segmenter = Maxixe::Segmenter.new(sub_index)

    # Thresholds come from a float range step, so some keys carry the usual
    # floating point noise (they are not exact decimal tenths).
    by_subset[subset] = ((0.1)..(1.0)).step(0.1).each_with_object({}) do |threshold, by_threshold|
      by_threshold[threshold] = samples.inject(0) do |total, (not_split, split)|
        segmented = segmenter.segment(not_split, threshold)
        total + Text::Levenshtein.distance(segmented, split)
      end
    end
  end
end

.generate_corpus_from_io(n, io) ⇒ Object



102
103
104
105
106
107
108
109
110
111
112
113
# File 'lib/maxixe.rb', line 102

# Builds character n-gram frequency tables from the lines of +io+.
#
# n  - Enumerable of n-gram sizes (Integers).
# io - Object responding to #each_line.
#
# N-grams are taken within each yielded line, never across lines. Whatever
# characters #each_line leaves in a line (for an IO that normally includes
# the line terminator) take part in the n-grams.
#
# Returns a Hash keyed by each size converted with #to_s; every value is a
# Hash mapping n-gram String => occurrence count. Looking up an unseen
# n-gram yields 0 without adding it to the table.
def self.generate_corpus_from_io(n , io)
  corpus = {}
  n.each { |size| corpus[size.to_s] = Hash.new { 0 } }

  io.each_line do |line|
    n.each do |size|
      counts = corpus[size.to_s]
      line.each_char.each_cons(size) do |chars|
        counts[chars.join] += 1
      end
    end
  end

  corpus
end

.optimize(index, samples) ⇒ Object



115
116
117
118
119
120
121
122
123
124
125
126
# File 'lib/maxixe.rb', line 115

# Picks the key subset and threshold with the lowest score reported by
# check_recognition for the given +index+ and +samples+.
#
# The result of check_recognition is walked as a nested Hash of the form
# { n => { t => score } }. On ties the first combination encountered wins,
# because a candidate only replaces the current best when its score is
# strictly lower.
#
# Returns a Hash { :n => subset, :t => threshold, :score => score }, or nil
# when check_recognition produced no candidates at all (for example for an
# empty index) instead of failing with a NoMethodError on nil.
def self.optimize(index, samples)
  best = nil
  check_recognition(index, samples).each do |n, ts|
    ts.each do |t, score|
      best = [n, t, score] if best.nil? || score < best[2]
    end
  end

  # Nothing was scored, so there is no optimum to report.
  return nil if best.nil?

  {:n => best[0], :t => best[1], :score => best[2]}
end