Class: VectorModel

Inherits:
Object
  • Object
show all
Defined in:
lib/rbbt/vector/model.rb,
lib/rbbt/vector/model/util.rb

Direct Known Subclasses

PythonModel, RFModel, SVMModel, SpaCyModel, TensorFlowModel

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(directory = nil, model_options = {}) ⇒ VectorModel

Returns a new instance of VectorModel.



143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
# File 'lib/rbbt/vector/model.rb', line 143

# Builds a model, optionally backed by a directory holding persisted state.
#
# When +directory+ is given it is created if missing, the paths of every
# persisted artifact are derived from it, and options found in
# "options.json" are merged *under* the options passed in (explicitly passed
# options win on key conflicts).
#
# For each component (feature extractor, init/train/eval/post-process code,
# feature names, factor levels) a value already exposed by the matching
# reader takes precedence; otherwise the component is loaded from its file
# in the directory when that file exists.
#
# @param directory [String, nil] directory used to load/save the model
# @param model_options [Hash] options, wrapped with IndiferentHash.setup
def initialize(directory = nil, model_options = {})
  @directory = directory
  @model_options = IndiferentHash.setup(model_options)

  if @directory
    FileUtils.mkdir_p @directory unless File.exist?(@directory)

    @model_path            = File.join(@directory, "model")

    @extract_features_file = File.join(@directory, "features")
    @init_model_path       = File.join(@directory, "init_model")

    @train_model_path      = File.join(@directory, "train_model")
    @train_model_path_R    = File.join(@directory, "train_model.R")

    @eval_model_path       = File.join(@directory, "eval_model")
    @eval_model_path_R     = File.join(@directory, "eval_model.R")

    @post_process_file     = File.join(@directory, "post_process")
    @post_process_file_R   = File.join(@directory, "post_process.R")

    @names_file            = File.join(@directory, "feature_names")
    @levels_file           = File.join(@directory, "levels")
    @options_file          = File.join(@directory, "options.json")

    if File.exist?(@options_file)
      # Stored options are the base; options given to the constructor override them.
      @model_options = JSON.parse(Open.read(@options_file)).merge(@model_options || {})
      IndiferentHash.setup(@model_options)
    end
  end

  if extract_features.nil?
    if @extract_features_file && File.exist?(@extract_features_file)
      @extract_features = __load_method @extract_features_file
    end
  else
    @extract_features = extract_features
  end

  if init_model.nil?
    if @init_model_path && File.exist?(@init_model_path)
      @init_model = __load_method @init_model_path
    end
  else
    @init_model = init_model
  end

  # Train/eval/post-process code may be stored either as Ruby (loaded as a
  # Proc) or as an R script (kept as a String); the Ruby file is preferred.
  if train_model.nil?
    if @train_model_path && File.exist?(@train_model_path)
      @train_model = __load_method @train_model_path
    elsif @train_model_path_R && File.exist?(@train_model_path_R)
      @train_model = Open.read(@train_model_path_R)
    end
  else
    @train_model = train_model
  end

  if eval_model.nil?
    if @eval_model_path && File.exist?(@eval_model_path)
      @eval_model = __load_method @eval_model_path
    elsif @eval_model_path_R && File.exist?(@eval_model_path_R)
      @eval_model = Open.read(@eval_model_path_R)
    end
  else
    @eval_model = eval_model
  end

  if post_process.nil?
    if @post_process_file && File.exist?(@post_process_file)
      @post_process = __load_method @post_process_file
    elsif @post_process_file_R && File.exist?(@post_process_file_R)
      @post_process = Open.read(@post_process_file_R)
    end
  else
    @post_process = post_process
  end

  if names.nil?
    if @names_file && File.exist?(@names_file)
      @names = Open.read(@names_file).split("\n")
    end
  else
    # Keep the names in @names; this branch used to overwrite
    # @extract_features with the list of names.
    @names = names
  end

  if factor_levels.nil?
    if @levels_file && File.exist?(@levels_file)
      # NOTE(review): YAML.load deserializes arbitrary objects; consider
      # YAML.safe_load if the model directory can come from an untrusted source.
      @factor_levels = YAML.load(Open.read(@levels_file))
    end
    # Levels saved next to the R model take precedence over the YAML file.
    if @model_path && File.exist?(@model_path + '.factor_levels')
      @factor_levels = TSV.open(@model_path + '.factor_levels')
    end
  else
    @factor_levels = factor_levels
  end

  @features = []
  @labels = []
end

Instance Attribute Details

#balance ⇒ Object

Returns the value of attribute balance.



9
10
11
# File 'lib/rbbt/vector/model.rb', line 9

# Reader for @balance. When it is truthy, #train balances the labels
# (via #balance_labels) before training.
def balance
  @balance
end

#bar(max = nil, desc = nil) ⇒ Object

Returns the value of attribute bar.



2
3
4
# File 'lib/rbbt/vector/model/util.rb', line 2

# Reader for the @bar instance variable.
def bar
  @bar
end

#directory ⇒ Object

Returns the value of attribute directory.



9
10
11
# File 'lib/rbbt/vector/model.rb', line 9

# Reader for @directory, the directory given to the constructor (may be nil).
def directory
  @directory
end

#eval_model(&block) ⇒ Object

Returns the value of attribute eval_model.



9
10
11
# File 'lib/rbbt/vector/model.rb', line 9

# Reader for @eval_model. #eval and #eval_list treat a Proc as Ruby code to
# execute and a String as R code.
def eval_model
  @eval_model
end

#extract_features(&block) ⇒ Object

Returns the value of attribute extract_features.



9
10
11
# File 'lib/rbbt/vector/model.rb', line 9

# Reader for @extract_features, the optional block used to turn an element
# into its feature vector.
def extract_features
  @extract_features
end

#factor_levels ⇒ Object

Returns the value of attribute factor_levels.



10
11
12
# File 'lib/rbbt/vector/model.rb', line 10

# Reader for @factor_levels, which is passed along to the train and eval code.
def factor_levels
  @factor_levels
end

#features ⇒ Object

Returns the value of attribute features.



10
11
12
# File 'lib/rbbt/vector/model.rb', line 10

# Reader for @features, the list of feature vectors accumulated by #add and #add_list.
def features
  @features
end

#init_model(&block) ⇒ Object

Returns the value of attribute init_model.



9
10
11
# File 'lib/rbbt/vector/model.rb', line 9

# Reader for @init_model, the block that #init executes to build @model.
def init_model
  @init_model
end

#labels ⇒ Object

Returns the value of attribute labels.



10
11
12
# File 'lib/rbbt/vector/model.rb', line 10

# Reader for @labels, the list of labels accumulated alongside @features.
def labels
  @labels
end

#model ⇒ Object

Returns the value of attribute model.



11
12
13
# File 'lib/rbbt/vector/model.rb', line 11

# Reader for @model, which #init sets from the @init_model block.
def model
  @model
end

#model_options ⇒ Object

Returns the value of attribute model_options.



11
12
13
# File 'lib/rbbt/vector/model.rb', line 11

# Reader for @model_options, the options hash set up by the constructor.
def model_options
  @model_options
end

#model_path ⇒ Object

Returns the value of attribute model_path.



9
10
11
# File 'lib/rbbt/vector/model.rb', line 9

# Reader for @model_path, the "model" file path inside the model directory
# (nil when the model has no directory).
def model_path
  @model_path
end

#names ⇒ Object

Returns the value of attribute names.



10
11
12
# File 'lib/rbbt/vector/model.rb', line 10

# Reader for @names, the feature names passed along to the train and eval code.
def names
  @names
end

#post_process(&block) ⇒ Object

Returns the value of attribute post_process.



9
10
11
# File 'lib/rbbt/vector/model.rb', line 9

# Reader for @post_process. When it is a Proc, #eval and #eval_list run it
# over the raw evaluation result.
def post_process
  @post_process
end

#train_model(&block) ⇒ Object

Returns the value of attribute train_model.



9
10
11
# File 'lib/rbbt/vector/model.rb', line 9

# Reader for @train_model. #train treats a Proc as Ruby code to execute and
# a String as R code.
def train_model
  @train_model
end

Class Method Details

.f1_metrics(test, predicted, good_label = nil) ⇒ Object



377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
# File 'lib/rbbt/vector/model.rb', line 377

# Scores predictions against gold-standard labels.
#
# Binary case (exactly two distinct labels overall, or +good_label+ given):
# returns [tp, tn, fp, fn, precision, recall, f1] for the positive label.
# When +good_label+ is nil the positive label is the one whose string form
# is "true", else the one whose string form is "1", else the first label in
# sort order. Labels are compared by their string representation.
#
# Otherwise the metrics are computed once per label (one-vs-rest) and the
# seven columns are averaged with Misc.mean.
#
# @param test [Array] gold-standard labels
# @param predicted [Array] predicted labels, aligned with +test+
# @param good_label [Object, nil] label regarded as the positive class
# @return [Array] the seven metrics described above
def self.f1_metrics(test, predicted, good_label = nil)
  labels = (test + predicted).uniq

  unless labels.length == 2 || good_label
    per_label = labels.collect do |label|
      values = VectorModel.f1_metrics(test, predicted, label)
      tp, tn, fp, fn, pr, re, f1 = values
      Log.debug "Partial CV #{label} - P:#{"%.3f" % pr} R:#{"%.3f" % re} F1:#{"%.3f" % f1} - #{[tp.to_s, tn.to_s, fp.to_s, fn.to_s] * " "}"
      values
    end
    return Misc.zip_fields(per_label).collect { |column| Misc.mean(column) }
  end

  positive = good_label
  positive = labels.find { |l| l.to_s == "true" } if positive.nil?
  positive = labels.find { |l| l.to_s == "1" } if positive.nil?
  positive = labels.sort.first if positive.nil?
  positive = positive.to_s

  tp = tn = fp = fn = 0
  test.zip(predicted).each do |gold, guess|
    gold_is_positive = gold.to_s == positive
    if guess.to_s == positive
      if gold_is_positive
        tp += 1
      else
        fp += 1
      end
    elsif gold_is_positive
      fn += 1
    else
      tn += 1
    end
  end

  # Float division: an empty denominator yields NaN rather than raising.
  precision = tp.to_f / (tp + fp)
  recall = tp.to_f / (tp + fn)
  f1 = (2.0 * tp) / (2.0 * tp + fp + fn)

  [tp, tn, fp, fn, precision, recall, f1]
end

.R_eval(model_path, features, list, code, names = nil, factor_levels = nil) ⇒ Object



104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
# File 'lib/rbbt/vector/model.rb', line 104

# Evaluates features with a model saved from R and returns the predicted label(s).
#
# The features are written to a temporary tab-separated file and read back
# in R as the data frame +features+. Column names are applied when +names+
# is given and the columns listed in +factor_levels+ are converted to
# factors with those levels. The model file is then loaded, +code+ is run,
# and the R variable +label+ that it is expected to define is printed.
#
# @param model_path [String] file loaded in R with load()
# @param features [Array] array of rows when +list+ is true, else a single row
# @param list [Boolean] whether +features+ holds several rows
# @param code [String] R code that must assign +label+
# @param names [Array<String>, nil] column names for the feature table
# @param factor_levels [Hash, nil] column name => levels
# @return [Array<String>, String, nil] all whitespace-separated output
#   tokens when +list+ is true, otherwise only the first one
def self.R_eval(model_path, features, list, code, names = nil, factor_levels = nil)
  TmpFile.with_file do |feature_file|
    if list
      Open.write(feature_file, features.collect{|feat| feat * "\t"} * "\n" + "\n")
    else
      Open.write(feature_file, features * "\t" + "\n")
    end
    Open.write(feature_file + '.names', names * "\n" + "\n") if names

    TmpFile.with_file do |results|

      io = R.run <<-EOF
features = read.table("#{ feature_file }", sep ="\\t", stringsAsFactors=TRUE);
#{"names(features) = make.names(readLines('#{feature_file + '.names'}'))" if names }
#{ factor_levels.collect do |name,levels|
  "features[['#{name}']] = factor(features[['#{name}']], levels=#{R.ruby2R levels})"
end * "\n" if factor_levels }
load(file="#{model_path}");
#{code}
cat(paste(label, sep="\\n", collapse="\\n"));
      EOF
      txt = io.read
      # Drop the first "WARNING: ..." line from the output. No regexp flag is
      # needed: in Ruby a trailing `s` is the Windows-31J encoding modifier
      # (not dot-all), which made the pattern fixed-encoding and caused an
      # Encoding::CompatibilityError on non-ASCII UTF-8 output.
      res = txt.sub(/WARNING: .*?\n/,'').split(/\s+/)

      if list
        res
      else
        res.first
      end
    end
  end
end

.R_run(model_path, features, labels, code, names = nil, factor_levels = nil) ⇒ Object



43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# File 'lib/rbbt/vector/model.rb', line 43

# Runs arbitrary R +code+ against the given features and labels.
#
# Features, labels and (optionally) names are written to temporary files.
# In R they become the data frame +features+, whose extra column +label+
# holds the labels; columns listed in +factor_levels+ are converted to
# factors before the labels are attached.
#
# @param model_path [String] not used by this method
# @param features [Array<Array>] one row per sample
# @param labels [Array] one label per sample; read as numeric in R when the
#   first label is Numeric, as character otherwise
# @param code [String] R code to run
# @param names [Array<String>, nil] column names for the feature table
# @param factor_levels [Hash, nil] column name => levels
# @return whatever R.run returns
def self.R_run(model_path, features, labels, code, names = nil, factor_levels = nil)
  TmpFile.with_file do |feature_file|
    label_file = feature_file + '.label'
    names_file = feature_file + '.names'

    Open.write(feature_file, features.collect { |row| row * "\t" } * "\n")
    Open.write(label_file, labels * "\n" + "\n")
    Open.write(names_file, names * "\n" + "\n") if names

    what = labels.first.is_a?(Numeric) ? 'numeric()' : 'character()'

    # Optional script fragments; nil interpolates as an empty line.
    names_line = names ? "names(features) = make.names(readLines('#{names_file}'))" : nil
    factor_lines = if factor_levels
                     factor_levels.collect { |name, levels|
                       "features[['#{name}']] = factor(features[['#{name}']], levels=#{R.ruby2R levels})"
                     } * "\n"
                   end

    R.run <<-EOF
features = read.table("#{feature_file}", sep ="\\t", stringsAsFactors=TRUE);
#{names_line}
#{factor_lines}
labels = scan("#{label_file}", what=#{what});
features = cbind(features, label = labels);
#{code}
    EOF
  end
end

.R_train(model_path, features, labels, code, names = nil, factor_levels = nil) ⇒ Object



70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# File 'lib/rbbt/vector/model.rb', line 70

# Trains a model in R and saves it to +model_path+.
#
# Features, labels and (optionally) names are written to temporary files.
# In R they become the data frame +features+ with the labels attached as
# column +label+; columns listed in +factor_levels+ are converted to factors.
# After +code+ has run, the levels of every factor column are written to
# "<model_path>.factor_levels" and the R variable +model+, which +code+ is
# expected to define, is saved to +model_path+.
#
# @param model_path [String] destination of the saved R model
# @param features [Array<Array>] one row per sample
# @param labels [Array] one label per sample; read as numeric in R when the
#   first label is Numeric, as character otherwise
# @param code [String] R training code that must assign +model+
# @param names [Array<String>, nil] column names for the feature table
# @param factor_levels [Hash, nil] column name => levels
# @return whatever R.run returns
def self.R_train(model_path, features, labels, code, names = nil, factor_levels = nil)
  TmpFile.with_file do |feature_file|
    label_file = feature_file + '.label'
    names_file = feature_file + '.names'

    Open.write(feature_file, features.collect { |row| row * "\t" } * "\n")
    Open.write(label_file, labels * "\n" + "\n")
    Open.write(names_file, names * "\n" + "\n") if names

    what = labels.first.is_a?(Numeric) ? 'numeric()' : 'character()'

    # Optional script fragments; nil interpolates as an empty line.
    names_line = names ? "names(features) = make.names(readLines('#{names_file}'))" : nil
    factor_lines = if factor_levels
                     factor_levels.collect { |name, levels|
                       "features[['#{name}']] = factor(features[['#{name}']], levels=#{R.ruby2R levels})"
                     } * "\n"
                   end

    R.run <<-EOF
features = read.table("#{feature_file}", sep ="\\t", stringsAsFactors=TRUE);
labels = scan("#{label_file}", what=#{what});
#{names_line}
features = cbind(features, label = labels);
#{factor_lines}
#{code}
# Save used factor levels
factor_levels = c()
for (c in names(features)){
if (is.factor(features[[c]]))
  factor_levels[c] = paste(levels(features[[c]]), collapse="\t")
}
rbbt.tsv.write("#{model_path}.factor_levels", factor_levels, names=c('Levels'), type='flat')
save(model, file='#{model_path}')
    EOF
  end
end

Instance Method Details

#__load_method(file) ⇒ Object



137
138
139
140
141
# File 'lib/rbbt/vector/model.rb', line 137

# Loads a block that was saved as source text and returns it as a Proc.
#
# Everything on the matched line up to and including the block opener
# (" do" or "{") is replaced by "Proc.new" plus that opener, and the
# resulting code is evaluated in the context of this object.
#
# NOTE(review): this executes the contents of +file+; only load model
# directories that come from a trusted source.
#
# @param file [String] path of the saved source, also used as the file name
#   reported for the evaluated code
# @return [Object] the result of evaluating the rewritten source
def __load_method(file)
  saved_source = Open.read(file)
  proc_source = saved_source.sub(/.*(\sdo\b|{)/, 'Proc.new\1')
  instance_eval(proc_source, file)
end

#add(element, label = nil) ⇒ Object



249
250
251
252
253
# File 'lib/rbbt/vector/model.rb', line 249

# Appends one sample to the model's data.
#
# The element is converted with the @extract_features block when one is
# set (run in the context of this object); otherwise it is stored as is.
#
# @param element [Object] raw element, or a feature vector when there is no extractor
# @param label [Object, nil] label stored at the same position as the features
# @return [Array] the @labels array
def add(element, label = nil)
  vector = if @extract_features
             instance_exec(element, &@extract_features)
           else
             element
           end
  @features.push(vector)
  @labels.push(label)
end

#add_list(elements, labels = nil) ⇒ Object



255
256
257
258
259
260
261
262
263
264
265
# File 'lib/rbbt/vector/model.rb', line 255

# Appends several samples at once.
#
# Without an extractor, or with an extractor of arity 1, every element is
# added through #add, paired with its label (nil where +labels+ is missing
# or shorter). Any other extractor is called once as (nil, elements) and is
# expected to return the feature vectors for the whole list; in that case
# labels are appended only when +labels+ is given.
#
# @param elements [Array] raw elements
# @param labels [Array, nil] labels aligned with +elements+
def add_list(elements, labels = nil)
  batch_extractor = !@extract_features.nil? && @extract_features.arity != 1

  if batch_extractor
    vectors = instance_exec(nil, elements, &@extract_features)
    @features.concat(vectors)
    @labels.concat(labels) if labels
  else
    label_list = labels || [nil]
    elements.zip(label_list).each { |elem, label| add(elem, label) }
  end
end

#balance_labels ⇒ Object



13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
# File 'lib/rbbt/vector/model/util.rb', line 13

# Down-samples the stored data so that every label keeps the same number of
# samples, namely the count of the least frequent label.
#
# Samples are visited in random order, so which samples of the larger
# classes survive varies between calls. @labels and @features are replaced
# by the reduced, still aligned, lists.
#
# @return [Array] the new @features array
def balance_labels
  counts = Misc.counts(@labels)
  min = counts.values.min

  used = Hash.new(0)
  new_labels = []
  new_features = []
  @labels.zip(@features).shuffle.each do |label, features|
    # Stop at exactly `min` samples per label; the previous `>` comparison
    # let one extra sample through for every over-represented label.
    next if used[label] >= min
    used[label] += 1
    new_labels << label
    new_features << features
  end
  @labels = new_labels
  @features = new_features
end

#clear ⇒ Object



244
245
246
247
# File 'lib/rbbt/vector/model.rb', line 244

# Discards all accumulated samples by replacing @features and @labels with
# fresh empty arrays.
#
# @return [Array] the new, empty @labels array
def clear
  @features = Array.new
  @labels = Array.new
end

#cross_validation(folds = 10, good_label = nil) ⇒ Object



420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
# File 'lib/rbbt/vector/model.rb', line 420

# Runs a fold-based cross-validation over the features and labels currently
# stored in the model and returns the per-fold metrics.
#
# For every fold the model is trained on the remaining folds (after
# +reset_model+, when the object responds to it), the held-out fold is
# predicted with #eval_list and scored with VectorModel.f1_metrics. The
# original @features/@labels are restored afterwards and, unless +folds+ is
# 1, the model is trained once more.
#
# @param folds [Integer] number of folds; with 1 the single fold is used
#   both for training and for testing
# @param good_label [Object, nil] forwarded to VectorModel.f1_metrics
# @return [Hash] TSV-set-up hash keyed by fold index; rows hold P, R, F1
#   when there are more than two distinct labels, and TP, TN, FP, FN, P, R,
#   F1 otherwise
# @raise [RuntimeError] when the number of predictions differs from the
#   number of test labels in a fold
def cross_validation(folds = 10, good_label = nil)

  # Kept so the ensure clause can restore the full data set.
  orig_features = @features
  orig_labels = @labels

  multiclass = @labels.uniq.length > 2

  if multiclass
    res = TSV.setup({}, "Fold~P,R,F1#:type=:list")
  else
    res = TSV.setup({}, "Fold~TP,TN,FP,FN,P,R,F1#:type=:list")
  end

  begin
    if folds == 1
      feature_folds = [@features]
      labels_folds = [@labels]
    else
      # presumably splits each list into `folds` chunks — verify against Misc.divide
      feature_folds = Misc.divide(@features, folds)
      labels_folds = Misc.divide(@labels, folds)
    end

    folds.times do |fix|

      if folds == 1
        # Single fold: train on the same fold that is evaluated.
        rest = [fix]
      else
        rest = (0..(folds-1)).to_a - [fix]
      end

      test_set = feature_folds[fix]
      train_set = feature_folds.values_at(*rest).flatten(1)

      test_labels = labels_folds[fix]
      train_labels = labels_folds.values_at(*rest).flatten(1)

      # #train reads @features/@labels, so they are swapped for the training folds.
      @features = train_set
      @labels = train_labels

      self.reset_model if self.respond_to? :reset_model
      self.train
      # extract = false: the test set comes from @features, which already holds feature vectors.
      predictions = self.eval_list test_set, false

      raise "Number of predictions (#{predictions.length}) and test labels (#{test_labels.length}) do not match" if predictions.length != test_labels.length

      # NOTE(review): different_labels is assigned but never used.
      different_labels = test_labels.uniq

      Log.debug do "Accuracy Fold #{fix}: #{(100 * test_labels.zip(predictions).select{|t,p| t == p }.length.to_f / test_labels.length).round(2)}%"  end

      tp, tn, fp, fn, pr, re, f1 = VectorModel.f1_metrics(test_labels, predictions, good_label)

      if multiclass 
        Log.low "Multi-class CV Fold #{fix} - Average P:#{"%.3f" % pr} R:#{"%.3f" % re} F1:#{"%.3f" % f1}"
        res[fix] = [pr,re,f1]
      else
        Log.low "CV Fold #{fix} P:#{"%.3f" % pr} R:#{"%.3f" % re} F1:#{"%.3f" % f1} - #{[tp.to_s, tn.to_s, fp.to_s, fn.to_s] * " "}"
        res[fix] = [tp,tn,fp,fn,pr,re,f1]
      end

    end
  ensure
    @features = orig_features
    @labels = orig_labels
  end unless folds == -1 # the whole begin/ensure block is skipped when folds is -1

  # Leave the model trained on the complete data set.
  self.reset_model if self.respond_to? :reset_model
  self.train unless folds == 1
  res
end

#eval(element) ⇒ Object



336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
# File 'lib/rbbt/vector/model.rb', line 336

# Evaluates a single element and returns the model's prediction.
#
# The element is first converted with @extract_features when an extractor
# is set. A Proc in @eval_model is run in the context of this object as
# (features, false, nil, @names, @factor_levels); a String is treated as R
# code and handed to VectorModel.R_eval. The result goes through
# @post_process when that is a Proc.
#
# @param element [Object] raw element, or a feature vector when there is no extractor
# @return [Object] the (post-processed) prediction
# @raise [RuntimeError] when @eval_model is neither a Proc nor a String
def eval(element)
  features = element
  features = instance_exec(element, &@extract_features) unless @extract_features.nil?

  result = case @eval_model
           when Proc
             instance_exec(features, false, nil, @names, @factor_levels, &@eval_model)
           when String
             VectorModel.R_eval(@model_path, features, false, eval_model, @names, @factor_levels)
           else
             raise "No @eval_model function or R script"
           end

  result = instance_exec(result, false, &@post_process) if @post_process.is_a?(Proc)
  result
end

#eval_list(elements, extract = true) ⇒ Object



353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
# File 'lib/rbbt/vector/model.rb', line 353

# Evaluates a list of elements and returns the model's predictions.
#
# When +extract+ is true and an extractor is set, the elements are
# converted first: one by one for an extractor of arity 1, otherwise with a
# single call as (nil, elements). A Proc evaluator is run in the context of
# this object as (features, true, nil, @names, @factor_levels); a String is
# treated as R code and handed to VectorModel.R_eval. With any other
# evaluator the raw result is nil. The result goes through @post_process
# when that is a Proc.
#
# @param elements [Array] raw elements, or feature vectors when +extract+ is false
# @param extract [Boolean] whether to run the feature extractor
# @return [Object] the (post-processed) predictions
def eval_list(elements, extract = true)
  features =
    if !extract || @extract_features.nil?
      elements
    elsif @extract_features.arity == 1
      elements.collect { |element| instance_exec(element, &@extract_features) }
    else
      instance_exec(nil, elements, &@extract_features)
    end

  evaluator = eval_model
  result = case evaluator
           when Proc
             instance_exec(features, true, nil, @names, @factor_levels, &@eval_model)
           when String
             VectorModel.R_eval(@model_path, features, true, evaluator, @names, @factor_levels)
           end

  result = instance_exec(result, true, &@post_process) if @post_process.is_a?(Proc)
  result
end

#init ⇒ Object



33
34
35
# File 'lib/rbbt/vector/model.rb', line 33

# Returns the model object, building it on first use by running the
# @init_model block in the context of this object. A truthy @model is
# reused as is.
#
# @return [Object] the value of @model
def init
  return @model if @model

  @model = instance_exec(&@init_model)
end

#run(code) ⇒ Object



332
333
334
# File 'lib/rbbt/vector/model.rb', line 332

# Runs the given R code against the samples currently stored in the model
# by delegating to VectorModel.R_run with this model's path, features,
# labels, names and factor levels.
#
# @param code [String] R code to run
# @return whatever VectorModel.R_run returns
def run(code)
  VectorModel.R_run(@model_path,  @features, @labels, code, @names, @factor_levels)
end

#save_models ⇒ Object



267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
# File 'lib/rbbt/vector/model.rb', line 267

# Writes the model's code and metadata to the files derived from the model
# directory.
#
# Train, eval and post-process code is saved as Ruby source when it is a
# Proc (failures while doing so are ignored) and as an R script when it is
# a String. The feature extractor and init block are saved as Ruby source
# when set, followed by factor levels (YAML), feature names (one per line)
# and the options (JSON).
def save_models
  require 'method_source'

  case train_model
  when Proc
    begin
      Open.write(@train_model_path, train_model.source)
    rescue
      # Best effort: the training block is skipped when it cannot be saved.
    end
  when String
    Open.write(@train_model_path_R, @train_model)
  end

  Open.write(@extract_features_file, @extract_features.source) if @extract_features
  Open.write(@init_model_path, @init_model.source) if @init_model

  case eval_model
  when Proc
    begin
      Open.write(@eval_model_path, eval_model.source)
    rescue
      # Best effort: the evaluation block is skipped when it cannot be saved.
    end
  when String
    Open.write(@eval_model_path_R, eval_model)
  end

  case post_process
  when Proc
    begin
      Open.write(@post_process_file, post_process.source)
    rescue
      # Best effort: the post-process block is skipped when it cannot be saved.
    end
  when String
    Open.write(@post_process_file_R, post_process)
  end

  Open.write(@levels_file, @factor_levels.to_yaml) if @factor_levels
  Open.write(@names_file, @names * "\n" + "\n") if @names
  Open.write(@options_file, @model_options.to_json) if @model_options
end

#train ⇒ Object



308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
# File 'lib/rbbt/vector/model.rb', line 308

# Trains the model on the stored features and labels.
#
# When @balance is set the data is balanced with #balance_labels for the
# duration of the training and the original data is put back afterwards,
# even if training raises. A Proc in @train_model is run in the context of
# this object as (features, labels, names, factor_levels); a String is
# treated as R code and handed to VectorModel.R_train. The model files are
# saved afterwards when the model has a directory.
def train
  begin
    if @balance
      @original_features, @original_labels = @features, @labels
      balance_labels
    end

    case @train_model
    when Proc
      instance_exec(@features, @labels, @names, @factor_levels, &@train_model)
    when String
      VectorModel.R_train(@model_path, @features, @labels, train_model, @names, @factor_levels)
    end
  ensure
    # Undo the temporary balancing so the full data set stays available.
    @features, @labels = @original_features, @original_labels if @balance
  end

  save_models if @directory
end