Class: VectorModel

Inherits:
Object
  • Object
show all
Defined in:
lib/rbbt/vector/model.rb,
lib/rbbt/vector/model/util.rb

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(directory = nil, extract_features = nil, train_model = nil, eval_model = nil, post_process = nil, names = nil, factor_levels = nil) ⇒ VectorModel

Returns a new instance of VectorModel.



130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
# File 'lib/rbbt/vector/model.rb', line 130

# Builds a model, optionally backed by a directory of persisted components.
#
# When +directory+ is given it is created if missing and the paths of all
# persisted artefacts are derived from it. Any component that is not passed
# explicitly (i.e. is nil) is restored from the matching file in that
# directory when the file exists; otherwise it is left nil.
#
# @param directory [String, nil] where the model and its components live
# @param extract_features [Proc, nil] turns an element into a feature vector
# @param train_model [Proc, String, nil] Ruby block or R script used by #train
# @param eval_model [Proc, String, nil] Ruby block or R script used by #eval
# @param post_process [Proc, String, nil] applied to evaluation results
# @param names [Array<String>, nil] feature names
# @param factor_levels [Hash, nil] levels for factor features
def initialize(directory = nil, extract_features = nil, train_model = nil, eval_model = nil, post_process = nil, names = nil, factor_levels = nil)
  @directory = directory
  if @directory
    # File.exist? rather than File.exists?: the latter was removed in Ruby 3.2.
    FileUtils.mkdir_p @directory unless File.exist?(@directory)

    @model_file = File.join(@directory, "model")
    @extract_features_file = File.join(@directory, "features")
    @train_model_file = File.join(@directory, "train_model")
    @eval_model_file = File.join(@directory, "eval_model")
    @post_process_file = File.join(@directory, "post_process")
    @train_model_file_R = File.join(@directory, "train_model.R")
    @eval_model_file_R = File.join(@directory, "eval_model.R")
    @post_process_file_R = File.join(@directory, "post_process.R")
    @names_file = File.join(@directory, "feature_names")
    @levels_file = File.join(@directory, "levels")
    @options_file = File.join(@directory, "options.json")

    if File.exist?(@options_file)
      @model_options = JSON.parse(Open.read(@options_file))
      IndiferentHash.setup(@model_options)
    end
  end

  # Explicit arguments win; otherwise fall back to whatever was saved.
  @extract_features = extract_features.nil? ? __load_stored(@extract_features_file) : extract_features
  @train_model = train_model.nil? ? __load_stored(@train_model_file, @train_model_file_R) : train_model
  @eval_model = eval_model.nil? ? __load_stored(@eval_model_file, @eval_model_file_R) : eval_model
  @post_process = post_process.nil? ? __load_stored(@post_process_file, @post_process_file_R) : post_process

  if names.nil?
    @names = Open.read(@names_file).split("\n") if @names_file && File.exist?(@names_file)
  else
    # Previously assigned to @extract_features, which discarded the names
    # and clobbered the feature extractor.
    @names = names
  end

  if factor_levels.nil?
    # NOTE(review): YAML.load deserializes whatever is in the levels file;
    # only point the model at directories you trust.
    @factor_levels = YAML.load(Open.read(@levels_file)) if @levels_file && File.exist?(@levels_file)
    # Levels written next to the model file take precedence over the YAML file.
    if @model_file && File.exist?(@model_file + '.factor_levels')
      @factor_levels = TSV.open(@model_file + '.factor_levels')
    end
  else
    @factor_levels = factor_levels
  end

  @features = []
  @labels = []
end

# Restores a persisted component: a saved Ruby block (loaded as a Proc) takes
# precedence over a saved R script (returned as a String). Returns nil when
# neither file exists or no path is configured.
def __load_stored(ruby_file, r_file = nil)
  if ruby_file && File.exist?(ruby_file)
    __load_method ruby_file
  elsif r_file && File.exist?(r_file)
    Open.read(r_file)
  end
end

Instance Attribute Details

#bar(max = nil, desc = nil) ⇒ Object

Returns the value of attribute bar.



2
3
4
# File 'lib/rbbt/vector/model/util.rb', line 2

# Reader for @bar.
# NOTE(review): nothing in the visible code assigns @bar; presumably it is
# set by the helpers in model/util.rb — confirm there.
def bar
  @bar
end

#directory ⇒ Object

Returns the value of attribute directory.



5
6
7
# File 'lib/rbbt/vector/model.rb', line 5

# Reader for @directory, the directory given to the constructor (nil when
# the model is not backed by files).
def directory
  @directory
end

#eval_model(&block) ⇒ Object

Returns the value of attribute eval_model.



5
6
7
# File 'lib/rbbt/vector/model.rb', line 5

# Reader for @eval_model: the Proc or R script String that #eval and
# #eval_list dispatch on.
def eval_model
  @eval_model
end

#extract_features(&block) ⇒ Object

Returns the value of attribute extract_features.



5
6
7
# File 'lib/rbbt/vector/model.rb', line 5

# Reader for @extract_features: the Proc (if any) that #add, #add_list,
# #eval and #eval_list run on raw elements to obtain feature vectors.
def extract_features
  @extract_features
end

#factor_levels ⇒ Object

Returns the value of attribute factor_levels.



6
7
8
# File 'lib/rbbt/vector/model.rb', line 6

# Reader for @factor_levels, which is handed to the train/eval procs and to
# the R helpers to fix the levels of factor features.
def factor_levels
  @factor_levels
end

#features ⇒ Object

Returns the value of attribute features.



6
7
8
# File 'lib/rbbt/vector/model.rb', line 6

# Reader for @features: the feature vectors accumulated through #add and
# #add_list (emptied by #clear).
def features
  @features
end

#labels ⇒ Object

Returns the value of attribute labels.



6
7
8
# File 'lib/rbbt/vector/model.rb', line 6

# Reader for @labels: the labels accumulated alongside @features through
# #add and #add_list (emptied by #clear).
def labels
  @labels
end

#model_file ⇒ Object

Returns the value of attribute model_file.



5
6
7
# File 'lib/rbbt/vector/model.rb', line 5

# Reader for @model_file: the "model" path inside the model directory, set
# by the constructor only when a directory was given.
def model_file
  @model_file
end

#model_options ⇒ Object

Returns the value of attribute model_options.



7
8
9
# File 'lib/rbbt/vector/model.rb', line 7

# Reader for @model_options: the options parsed from "options.json" by the
# constructor (when that file exists) and written back by #save_models.
def model_options
  @model_options
end

#names ⇒ Object

Returns the value of attribute names.



6
7
8
# File 'lib/rbbt/vector/model.rb', line 6

# Reader for @names: the feature names passed on to the train/eval procs
# and to the R helpers.
def names
  @names
end

#post_process(&block) ⇒ Object

Returns the value of attribute post_process.



5
6
7
# File 'lib/rbbt/vector/model.rb', line 5

# Reader for @post_process: when it is a Proc, #eval and #eval_list run it
# on the raw evaluation result.
def post_process
  @post_process
end

#train_model(&block) ⇒ Object

Returns the value of attribute train_model.



5
6
7
# File 'lib/rbbt/vector/model.rb', line 5

# Reader for @train_model: the Proc or R script String that #train
# dispatches on.
def train_model
  @train_model
end

Class Method Details

.f1_metrics(test, predicted, good_label = nil) ⇒ Object



334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
# File 'lib/rbbt/vector/model.rb', line 334

# Confusion counts plus precision, recall and F1 for a set of predictions.
#
# With exactly two distinct labels (or an explicit +good_label+) the metrics
# are computed for the positive label, which defaults to the label whose
# string form is "true", then "1", then the first label in sort order.
# Otherwise each label is scored in turn as the positive one and the seven
# values are averaged field by field.
#
# @param test [Array] gold-standard labels
# @param predicted [Array] predicted labels, aligned with +test+
# @param good_label [Object, nil] label to treat as positive
# @return [Array] [tp, tn, fp, fn, precision, recall, f1]
def self.f1_metrics(test, predicted, good_label = nil)
  labels = (test + predicted).uniq

  if labels.length != 2 && !good_label
    # Multi-class: one-vs-rest for every label, then average each field.
    per_label = labels.collect do |good_label|
      values = VectorModel.f1_metrics(test, predicted, good_label)
      tp, tn, fp, fn, pr, re, f1 = values
      Log.debug "Partial CV #{good_label} - P:#{"%.3f" % pr} R:#{"%.3f" % re} F1:#{"%.3f" % f1} - #{[tp.to_s, tn.to_s, fp.to_s, fn.to_s] * " "}"
      values
    end
    return Misc.zip_fields(per_label).collect{|s| Misc.mean(s)}
  end

  # Pick the positive label, falling through the defaults only while unset.
  good_label = labels.find { |l| l.to_s == "true" } if good_label.nil?
  good_label = labels.find { |l| l.to_s == "1" } if good_label.nil?
  good_label = labels.sort.first if good_label.nil?
  good_label = good_label.to_s

  tp = tn = fp = fn = 0
  test.zip(predicted).each do |gold, guess|
    gold_is_good = gold.to_s == good_label
    guess_is_good = guess.to_s == good_label

    if guess_is_good
      gold_is_good ? tp += 1 : fp += 1
    else
      gold_is_good ? fn += 1 : tn += 1
    end
  end

  # Float division: a zero denominator yields NaN rather than raising.
  precision = tp.to_f / (tp + fp)
  recall = tp.to_f / (tp + fn)
  f1 = (2.0 * tp) / (2.0 * tp + fp + fn)

  [tp, tn, fp, fn, precision, recall, f1]
end

.R_eval(model_file, features, list, code, names = nil, factor_levels = nil) ⇒ Object



91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# File 'lib/rbbt/vector/model.rb', line 91

# Evaluates features with a model previously saved from R.
#
# The features are written tab-separated to a temporary file, read into an
# R data frame called +features+ (optionally renamed with +names+ and with
# factor columns re-levelled from +factor_levels+), the model is restored
# with load(), and +code+ is run. +code+ must leave its predictions in an R
# variable named +label+, which is printed and parsed back.
#
# @param model_file [String] file that load() restores the model from
# @param features [Array] list of feature vectors when +list+ is true,
#   otherwise a single feature vector
# @param list [Boolean] whether +features+ holds several vectors
# @param code [String] R code that computes +label+
# @param names [Array<String>, nil] column names for the features
# @param factor_levels [Hash, nil] column name => levels
# @return [Array<String>, String] all predictions when +list+ is true,
#   otherwise just the first one
def self.R_eval(model_file, features, list, code, names = nil, factor_levels = nil)
  TmpFile.with_file do |feature_file|
    if list
      Open.write(feature_file, features.collect{|feat| feat * "\t"} * "\n" + "\n")
    else
      Open.write(feature_file, features * "\t" + "\n")
    end
    Open.write(feature_file + '.names', names * "\n" + "\n") if names

    TmpFile.with_file do |_results|

      io = R.run <<-EOF
features = read.table("#{ feature_file }", sep ="\\t", stringsAsFactors=TRUE);
#{"names(features) = make.names(readLines('#{feature_file + '.names'}'))" if names }
#{ factor_levels.collect do |name,levels|
  "features[['#{name}']] = factor(features[['#{name}']], levels=#{R.ruby2R levels})"
end * "\n" if factor_levels }
load(file="#{model_file}");
#{code}
cat(paste(label, sep="\\n", collapse="\\n"));
      EOF
      txt = io.read
      # Drop the first WARNING line from the output. No /s flag: in Ruby it
      # selects Windows-31J encoding (not dot-all) and makes the match raise
      # on non-ASCII UTF-8 output.
      res = txt.sub(/WARNING: .*?\n/,'').split(/\s+/)

      if list
        res
      else
        res.first
      end
    end
  end
end

.R_run(model_file, features, labels, code, names = nil, factor_levels = nil) ⇒ Object



30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# File 'lib/rbbt/vector/model.rb', line 30

# Runs arbitrary R code against the given features and labels.
#
# Features and labels are written to temporary files and loaded in R as a
# data frame called +features+ with the labels attached as column +label+
# (optionally renamed with +names+ and with factor columns re-levelled from
# +factor_levels+); then +code+ is executed.
#
# @param model_file [String] accepted for symmetry with the other R
#   helpers; not referenced by the generated script
# @param features [Array<Array>] feature vectors, one per row
# @param labels [Array] one label per row
# @param code [String] R code to run
# @param names [Array<String>, nil] column names for the features
# @param factor_levels [Hash, nil] column name => levels
# @return whatever R.run returns
def self.R_run(model_file, features, labels, code, names = nil, factor_levels = nil)
  TmpFile.with_file do |feature_file|
    Open.write(feature_file, features.collect{|feats| feats * "\t"} * "\n")
    Open.write(feature_file + '.label', labels * "\n" + "\n")
    Open.write(feature_file + '.names', names * "\n" + "\n") if names

    # The type of the first label decides how R's scan() reads them all.
    # Integer and Float are Numeric, so one class check covers them.
    what = case labels.first
           when Numeric
             'numeric()'
           else
             'character()'
           end

    R.run <<-EOF
features = read.table("#{ feature_file }", sep ="\\t", stringsAsFactors=TRUE);
#{"names(features) = make.names(readLines('#{feature_file + '.names'}'))" if names }
#{ factor_levels.collect do |name,levels|
  "features[['#{name}']] = factor(features[['#{name}']], levels=#{R.ruby2R levels})"
end * "\n" if factor_levels }
labels = scan("#{ feature_file }.label", what=#{what});
features = cbind(features, label = labels);
#{code}
    EOF
  end
end

.R_train(model_file, features, labels, code, names = nil, factor_levels = nil) ⇒ Object



57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# File 'lib/rbbt/vector/model.rb', line 57

# Trains a model in R and saves it to +model_file+.
#
# Features and labels are written to temporary files and loaded in R as a
# data frame called +features+ with the labels attached as column +label+
# (optionally renamed with +names+ and with factor columns re-levelled from
# +factor_levels+). +code+ is then run and must leave the trained model in
# an R variable named +model+, which is stored with save(). The levels of
# every factor column are also written to "<model_file>.factor_levels".
#
# @param model_file [String] destination for the saved model
# @param features [Array<Array>] feature vectors, one per row
# @param labels [Array] one label per row
# @param code [String] R code that defines +model+
# @param names [Array<String>, nil] column names for the features
# @param factor_levels [Hash, nil] column name => levels
# @return whatever R.run returns
def self.R_train(model_file, features, labels, code, names = nil, factor_levels = nil)
  TmpFile.with_file do |feature_file|
    Open.write(feature_file, features.collect{|feats| feats * "\t"} * "\n")
    Open.write(feature_file + '.label', labels * "\n" + "\n")
    Open.write(feature_file + '.names', names * "\n" + "\n") if names

    # The type of the first label decides how R's scan() reads them all.
    # Integer and Float are Numeric, so one class check covers them.
    what = case labels.first
           when Numeric
             'numeric()'
           else
             'character()'
           end

    R.run <<-EOF
features = read.table("#{ feature_file }", sep ="\\t", stringsAsFactors=TRUE);
labels = scan("#{ feature_file }.label", what=#{what});
#{"names(features) = make.names(readLines('#{feature_file + '.names'}'))" if names }
features = cbind(features, label = labels);
#{ factor_levels.collect do |name,levels|
  "features[['#{name}']] = factor(features[['#{name}']], levels=#{R.ruby2R levels})"
end * "\n" if factor_levels }
#{code}
# Save used factor levels
factor_levels = c()
for (c in names(features)){
if (is.factor(features[[c]]))
  factor_levels[c] = paste(levels(features[[c]]), collapse="\t")
}
rbbt.tsv.write("#{model_file}.factor_levels", factor_levels, names=c('Levels'), type='flat')
save(model, file='#{model_file}')
    EOF
  end
end

Instance Method Details

#__load_method(file) ⇒ Object



124
125
126
127
128
# File 'lib/rbbt/vector/model.rb', line 124

# Rebuilds a Proc from block source saved in +file+.
#
# Everything on the first matching line up to its last " do" or "{" is
# replaced with "Proc.new", so saved text such as "model.train do |x| ..."
# evaluates to a bare Proc.
#
# NOTE(review): the file content is evaluated as Ruby in the context of this
# object; only load files from directories you trust.
#
# @param file [String] path of the saved block source
# @return the result of evaluating the rewritten source
def __load_method(file)
  source = Open.read(file).sub(/.*(\sdo\b|{)/, 'Proc.new\1')
  instance_eval source, file
end

#add(element, label = nil) ⇒ Object



220
221
222
223
224
# File 'lib/rbbt/vector/model.rb', line 220

# Appends one training example.
#
# The element is passed through @extract_features (run in the context of
# this object) when one is set; otherwise it is stored as given.
#
# @param element [Object] raw element, or a ready feature vector
# @param label [Object, nil] label stored alongside it
# @return [Array] the updated @labels
def add(element, label = nil)
  extracted = if @extract_features
                instance_exec(element, &@extract_features)
              else
                element
              end

  @features.push(extracted)
  @labels.push(label)
end

#add_list(elements, labels = nil) ⇒ Object



226
227
228
229
230
231
232
233
234
235
236
# File 'lib/rbbt/vector/model.rb', line 226

# Appends several training examples at once.
#
# Without an extractor, or with one that takes a single argument, every
# element goes through #add with its label. An extractor of any other arity
# is called once with (nil, elements) and must return all feature vectors.
#
# @param elements [Array] raw elements
# @param labels [Array, nil] labels aligned with +elements+
def add_list(elements, labels = nil)
  batch_extractor = !@extract_features.nil? && @extract_features.arity != 1

  if batch_extractor
    extracted = instance_exec(nil, elements, &@extract_features)
    @features.concat(extracted)
    @labels.concat(labels) if labels
  else
    # Missing labels are padded with nil by zip.
    elements.zip(labels || [nil]).each { |elem, label| add(elem, label) }
  end
end

#clear ⇒ Object



215
216
217
218
# File 'lib/rbbt/vector/model.rb', line 215

# Discards every accumulated example by replacing @features and @labels
# with fresh empty arrays; the previous arrays are not mutated.
def clear
  @features = []
  @labels = []
end

#cross_validation(folds = 10, good_label = nil) ⇒ Object



377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
# File 'lib/rbbt/vector/model.rb', line 377

# Cross-validates the model over the accumulated @features / @labels.
#
# The data is divided into +folds+ parts with Misc.divide. For each part the
# model is trained on the remaining parts, the held-out part is predicted
# with #eval_list (feature extraction disabled, since the stored features
# are already extracted) and the predictions are scored with
# VectorModel.f1_metrics. The original @features / @labels are always
# restored afterwards, and the model is retrained on them unless folds == 1.
#
# @param folds [Integer] number of folds
# @param good_label [Object, nil] positive label, forwarded to f1_metrics
# @return the TSV-setup hash +res+: fold index => [P, R, F1] when @labels
#   holds more than two distinct values, otherwise
#   [TP, TN, FP, FN, P, R, F1]
# @raise [RuntimeError] when the number of predictions differs from the
#   number of test labels in a fold
def cross_validation(folds = 10, good_label = nil)

  # Kept so the ensure clause can put them back after the folds replace them.
  orig_features = @features
  orig_labels = @labels

  multiclass = @labels.uniq.length > 2

  if multiclass
    res = TSV.setup({}, "Fold~P,R,F1#:type=:list")
  else
    res = TSV.setup({}, "Fold~TP,TN,FP,FN,P,R,F1#:type=:list")
  end

  begin
    if folds == 1
      feature_folds = [@features]
      labels_folds = [@labels]
    else
      feature_folds = Misc.divide(@features, folds)
      labels_folds = Misc.divide(@labels, folds)
    end

    folds.times do |fix|

      # NOTE(review): with a single fold the same data is used for training
      # and testing.
      if folds == 1
        rest = [fix]
      else
        rest = (0..(folds-1)).to_a - [fix]
      end

      test_set = feature_folds[fix]
      # Concatenate the training folds one level deep, keeping each feature
      # vector intact.
      train_set = feature_folds.values_at(*rest).inject([]){|acc,e| acc += e; acc}

      test_labels = labels_folds[fix]
      # NOTE(review): flatten is fully recursive, so labels that are
      # themselves arrays would be flattened too.
      train_labels = labels_folds.values_at(*rest).flatten

      @features = train_set
      @labels = train_labels

      self.reset_model if self.respond_to? :reset_model
      self.train
      predictions = self.eval_list test_set, false

      raise "Number of predictions (#{predictions.length}) and test labels (#{test_labels.length}) do not match" if predictions.length != test_labels.length

      # NOTE(review): different_labels is never read.
      different_labels = test_labels.uniq

      Log.debug do "Accuracy Fold #{fix}: #{(100 * test_labels.zip(predictions).select{|t,p| t == p }.length.to_f / test_labels.length).round(2)}%"  end

      tp, tn, fp, fn, pr, re, f1 = VectorModel.f1_metrics(test_labels, predictions, good_label)

      if multiclass 
        Log.low "Multi-class CV Fold #{fix} - Average P:#{"%.3f" % pr} R:#{"%.3f" % re} F1:#{"%.3f" % f1}"
        res[fix] = [pr,re,f1]
      else
        Log.low "CV Fold #{fix} P:#{"%.3f" % pr} R:#{"%.3f" % re} F1:#{"%.3f" % f1} - #{[tp.to_s, tn.to_s, fp.to_s, fn.to_s] * " "}"
        res[fix] = [tp,tn,fp,fn,pr,re,f1]
      end

    end
  ensure
    # Runs even when a fold raises, so the caller's data is never lost.
    @features = orig_features
    @labels = orig_labels
  # The modifier applies to the whole begin/ensure block: folds == -1 skips
  # the validation entirely and only retrains below.
  end unless folds == -1
  self.reset_model if self.respond_to? :reset_model
  # Leave the model trained on the full data set.
  self.train unless folds == 1
  res
end

#eval(element) ⇒ Object



293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
# File 'lib/rbbt/vector/model.rb', line 293

# Evaluates a single element with the trained model.
#
# The element is passed through @extract_features when one is set. A Proc
# @eval_model is run in the context of this object with
# (model_file, features, false, nil, names, factor_levels); a String
# @eval_model is treated as R code and handed to VectorModel.R_eval. A Proc
# @post_process, if present, transforms the result.
#
# @param element [Object] raw element, or a ready feature vector
# @return the (optionally post-processed) prediction
# @raise [RuntimeError] when @eval_model is neither a Proc nor a String
def eval(element)
  features = if @extract_features.nil?
               element
             else
               instance_exec(element, &@extract_features)
             end

  result = case @eval_model
           when Proc
             instance_exec(@model_file, features, false, nil, @names, @factor_levels, &@eval_model)
           when String
             VectorModel.R_eval(@model_file, features, false, eval_model, @names, @factor_levels)
           else
             raise "No @eval_model function or R script"
           end

  return result unless Proc === @post_process

  instance_exec(result, &@post_process)
end

#eval_list(elements, extract = true) ⇒ Object



310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
# File 'lib/rbbt/vector/model.rb', line 310

# Evaluates several elements with the trained model in a single call.
#
# When +extract+ is true and a feature extractor is set, an extractor taking
# one argument is applied to each element, while any other arity is called
# once with (nil, elements). A Proc eval model is run in the context of this
# object with (model_file, features, true, nil, names, factor_levels); a
# String eval model is treated as R code and handed to VectorModel.R_eval.
# A Proc @post_process, if present, transforms the whole result.
#
# @param elements [Array] raw elements, or ready feature vectors
# @param extract [Boolean] pass false when +elements+ are already features
# @return the (optionally post-processed) predictions
# @raise [RuntimeError] when the eval model is neither a Proc nor a String
def eval_list(elements, extract = true)

  if extract && ! @extract_features.nil?
    features = if @extract_features.arity == 1
                 elements.collect{|element| self.instance_exec(element, &@extract_features) }
               else
                 self.instance_exec(nil, elements, &@extract_features)
               end
  else
    features = elements
  end

  result = case eval_model
           when Proc
             self.instance_exec(@model_file, features, true, nil, @names, @factor_levels, &@eval_model)
           when String
             VectorModel.R_eval(@model_file, features, true, eval_model, @names, @factor_levels)
           else
             # Fail here, as #eval does, instead of handing nil to
             # post-processing and to callers.
             raise "No @eval_model function or R script"
           end

  result = self.instance_exec(result, &@post_process) if Proc === @post_process

  result
end

#run(code) ⇒ Object



289
290
291
# File 'lib/rbbt/vector/model.rb', line 289

# Runs R +code+ against the accumulated features and labels by delegating
# to VectorModel.R_run with this model's file, names and factor levels.
#
# @param code [String] R code to execute
# @return whatever VectorModel.R_run returns
def run(code)
  VectorModel.R_run(
    @model_file, @features, @labels,
    code, @names, @factor_levels
  )
end

#save_models ⇒ Object



238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
# File 'lib/rbbt/vector/model.rb', line 238

# Persists the model's components into the model directory.
#
# Proc components are stored as their source text (obtained through the
# method_source gem) and String components as R scripts in the matching
# ".R" file. Factor levels (YAML), feature names (one per line) and model
# options (JSON) are written when present.
def save_models
  require 'method_source'

  trainer = train_model
  case trainer
  when Proc
    begin
      Open.write(@train_model_file, trainer.source)
    rescue
      # Best effort: any failure saving the source is deliberately ignored.
    end
  when String
    Open.write(@train_model_file_R, @train_model)
  end

  # Unlike the other components, a failure here is not rescued.
  Open.write(@extract_features_file, @extract_features.source) if @extract_features

  evaluator = eval_model
  case evaluator
  when Proc
    begin
      Open.write(@eval_model_file, evaluator.source)
    rescue
      # Best effort, as above.
    end
  when String
    Open.write(@eval_model_file_R, evaluator)
  end

  processor = post_process
  case processor
  when Proc
    begin
      Open.write(@post_process_file, processor.source)
    rescue
      # Best effort, as above.
    end
  when String
    Open.write(@post_process_file_R, processor)
  end

  Open.write(@levels_file, @factor_levels.to_yaml) if @factor_levels
  Open.write(@names_file, @names * "\n" + "\n") if @names
  Open.write(@options_file, @model_options.to_json) if @model_options
end

#train ⇒ Object



279
280
281
282
283
284
285
286
287
# File 'lib/rbbt/vector/model.rb', line 279

# Trains the model on the accumulated @features and @labels.
#
# A Proc @train_model is run in the context of this object with
# (model_file, features, labels, names, factor_levels); a String
# @train_model is treated as R code and handed to VectorModel.R_train.
# Anything else trains nothing. When the model is backed by a directory its
# components are saved afterwards.
def train
  case @train_model
  when Proc
    instance_exec(@model_file, @features, @labels, @names, @factor_levels, &@train_model)
  when String
    VectorModel.R_train(@model_file, @features, @labels, train_model, @names, @factor_levels)
  end

  save_models if @directory
end