Class: VectorModel

Inherits:
Object
  • Object
show all
Defined in:
lib/rbbt/vector/model.rb

Direct Known Subclasses

RFModel, SVMModel, SpaCyModel, TensorFlowModel

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(directory, extract_features = nil, train_model = nil, eval_model = nil, names = nil, factor_levels = nil) ⇒ VectorModel



100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
# File 'lib/rbbt/vector/model.rb', line 100

# Builds a model whose state lives in +directory+ (created when missing).
#
# Every optional argument falls back to a copy previously stored inside the
# directory when it is +nil+:
#
# * extract_features - callable turning an element into a feature vector;
#   loaded from the "features" file otherwise.
# * train_model / eval_model - a callable, or a String of R code; loaded from
#   "train_model"/"eval_model" (Ruby source) or "train_model.R"/"eval_model.R".
# * names - feature names; read from "feature_names" (one per line) otherwise.
# * factor_levels - loaded from the YAML file "levels" otherwise.
#
# The accumulated features and labels always start empty.
def initialize(directory, extract_features = nil, train_model = nil, eval_model = nil, names = nil, factor_levels = nil)
  @directory = directory
  # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
  FileUtils.mkdir_p @directory unless File.exist? @directory

  @model_file = File.join(@directory, "model")
  @extract_features_file = File.join(@directory, "features")
  @train_model_file = File.join(@directory, "train_model")
  @eval_model_file = File.join(@directory, "eval_model")
  @train_model_file_R = File.join(@directory, "train_model.R")
  @eval_model_file_R = File.join(@directory, "eval_model.R")
  @names_file = File.join(@directory, "feature_names")
  @levels_file = File.join(@directory, "levels")

  if extract_features.nil?
    if File.exist?(@extract_features_file)
      @extract_features = __load_method @extract_features_file
    end
  else
    @extract_features = extract_features
  end

  if train_model.nil?
    if File.exist?(@train_model_file)
      @train_model = __load_method @train_model_file
    elsif File.exist?(@train_model_file_R)
      @train_model = Open.read(@train_model_file_R)
    end
  else
    @train_model = train_model
  end

  if eval_model.nil?
    if File.exist?(@eval_model_file)
      @eval_model = __load_method @eval_model_file
    elsif File.exist?(@eval_model_file_R)
      @eval_model = Open.read(@eval_model_file_R)
    end
  else
    @eval_model = eval_model
  end

  if names.nil?
    if File.exist?(@names_file)
      @names = Open.read(@names_file).split("\n")
    end
  else
    # Explicit names belong in @names; they must not replace the extractor.
    @names = names
  end

  if factor_levels.nil?
    if File.exist?(@levels_file)
      # NOTE(review): YAML.load on a file from the model directory; consider
      # YAML.safe_load if that directory can hold untrusted content.
      @factor_levels = YAML.load(Open.read(@levels_file))
    end
  else
    @factor_levels = factor_levels
  end

  @features = []
  @labels = []
end

Instance Attribute Details

#directory ⇒ Object

Returns the value of attribute directory.



4
5
6
# File 'lib/rbbt/vector/model.rb', line 4

# Reader for the directory path stored by the constructor.
def directory
  @directory
end

#eval_model ⇒ Object

Returns the value of attribute eval_model.



4
5
6
# File 'lib/rbbt/vector/model.rb', line 4

# Reader for the evaluation model: either a Proc or a String of R code
# (both cases are handled by #eval_list).
def eval_model
  @eval_model
end

#extract_features ⇒ Object

Returns the value of attribute extract_features.



4
5
6
# File 'lib/rbbt/vector/model.rb', line 4

# Reader for the feature extractor callable; nil when elements are already
# feature vectors.
def extract_features
  @extract_features
end

#factor_levels ⇒ Object

Returns the value of attribute factor_levels.



5
6
7
# File 'lib/rbbt/vector/model.rb', line 5

# Reader for the factor levels; iterated as name/levels pairs when building
# the R code.
def factor_levels
  @factor_levels
end

#features ⇒ Object

Returns the value of attribute features.



5
6
7
# File 'lib/rbbt/vector/model.rb', line 5

# Reader for the feature vectors accumulated so far (see #add / #add_list).
def features
  @features
end

#labels ⇒ Object

Returns the value of attribute labels.



5
6
7
# File 'lib/rbbt/vector/model.rb', line 5

# Reader for the labels accumulated alongside the feature vectors.
def labels
  @labels
end

#model_file ⇒ Object

Returns the value of attribute model_file.



4
5
6
# File 'lib/rbbt/vector/model.rb', line 4

# Reader for the path of the "model" file inside the model directory.
def model_file
  @model_file
end

#names ⇒ Object

Returns the value of attribute names.



5
6
7
# File 'lib/rbbt/vector/model.rb', line 5

# Reader for the feature names (an Array of Strings when loaded from disk).
def names
  @names
end

#train_model ⇒ Object

Returns the value of attribute train_model.



4
5
6
# File 'lib/rbbt/vector/model.rb', line 4

# Reader for the training model: either a Proc or a String of R code
# (both cases are handled by #train).
def train_model
  @train_model
end

Class Method Details

.f1_metrics(test, predicted, good_label = nil) ⇒ Object

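Computes the confusion-matrix counts (TP, TN, FP, FN) together with precision, recall and F1 for the predicted labels against the test labels. With more than two distinct labels and no good_label, the metrics are computed once per label and averaged.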



288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
# File 'lib/rbbt/vector/model.rb', line 288

# Scores +predicted+ against the gold-standard +test+ labels.
#
# Returns [tp, tn, fp, fn, precision, recall, f1].
#
# Binary mode is used when exactly two distinct labels occur or when
# +good_label+ is supplied. Without an explicit +good_label+ the positive
# class is the label printing as "true", else the one printing as "1", else
# the smallest label. Labels are compared through their String form.
#
# Otherwise every label is scored in turn as the positive class and the seven
# values are averaged position by position.
#
# Precision, recall and F1 are plain float divisions; a zero denominator is
# not special-cased, so those values can come out as NaN.
def self.f1_metrics(test, predicted, good_label = nil)
  tp, tn, fp, fn, pr, re, f1 = [0, 0, 0, 0, nil, nil, nil]

  labels = (test + predicted).uniq

  unless labels.length == 2 || good_label
    # Multi-class: one-vs-rest for each label, then average the tuples.
    per_label = labels.collect do |label|
      values = VectorModel.f1_metrics(test, predicted, label)
      tp, tn, fp, fn, pr, re, f1 = values
      Log.debug "Partial CV #{label} - P:#{"%.3f" % pr} R:#{"%.3f" % re} F1:#{"%.3f" % f1} - #{[tp.to_s, tn.to_s, fp.to_s, fn.to_s] * " "}"
      values
    end
    return Misc.zip_fields(per_label).collect{|s| Misc.mean(s)}
  end

  if good_label.nil?
    good_label = labels.find { |l| l.to_s == "true" }
    good_label = labels.find { |l| l.to_s == "1" } if good_label.nil?
    good_label = labels.sort.first if good_label.nil?
  end
  positive = good_label.to_s

  test.zip(predicted).each do |gold, guess|
    gold_is_positive = gold.to_s == positive
    guess_is_positive = guess.to_s == positive

    if guess_is_positive
      if gold_is_positive
        tp += 1
      else
        fp += 1
      end
    elsif gold_is_positive
      fn += 1
    else
      tn += 1
    end
  end

  actual_positives = tp + fn
  predicted_positives = tp + fp

  pr = tp.to_f / predicted_positives
  re = tp.to_f / actual_positives

  f1 = (2.0 * tp) / (2.0 * tp + fp + fn)

  [tp, tn, fp, fn, pr, re, f1]
end

.R_eval(model_file, features, list, code, names = nil, factor_levels = nil) ⇒ Object



61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# File 'lib/rbbt/vector/model.rb', line 61

# Evaluates +features+ with the R model saved in +model_file+.
#
# The features are written tab-separated to a temporary file (one row per
# element when +list+ is true, a single row otherwise) and read into an R
# data frame named `features`. When +names+ is given it is applied as column
# names (through make.names); when +factor_levels+ is given each named column
# is turned into a factor with the listed levels. The model file is then
# loaded and +code+ runs; +code+ is expected to leave its predictions in an R
# variable called `label`, which is printed and parsed back.
#
# Returns the whitespace-separated predictions as an Array of Strings when
# +list+ is true, or just the first one otherwise.
def self.R_eval(model_file, features, list, code, names = nil, factor_levels = nil)
  TmpFile.with_file do |feature_file|
    if list
      Open.write(feature_file, features.collect{|feat| feat * "\t"} * "\n" + "\n")
    else
      Open.write(feature_file, features * "\t" + "\n")
    end
    Open.write(feature_file + '.names', names * "\n" + "\n") if names

    TmpFile.with_file do |_results|

      io = R.run <<-EOF
features = read.table("#{ feature_file }", sep ="\\t", stringsAsFactors=TRUE);
#{"names(features) = make.names(readLines('#{feature_file + '.names'}'))" if names }
#{ factor_levels.collect do |name,levels|
  "features[['#{name}']] = factor(features[['#{name}']], levels=#{R.ruby2R levels})"
end * "\n" if factor_levels }
load(file="#{model_file}");
#{code}
cat(paste(label, sep="\\n", collapse="\\n"));
      EOF
      txt = io.read
      # Drop the first "WARNING: ..." line, if any. No /s flag: in Ruby it
      # selects the Windows-31J encoding (it is not "dot matches newline") and
      # makes the match raise on non-ASCII UTF-8 output.
      res = txt.sub(/WARNING: .*?\n/,'').split(/\s+/)

      if list
        res
      else
        res.first
      end
    end
  end
end

.R_run(model_file, features, labels, code, names = nil, factor_levels = nil) ⇒ Object



7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# File 'lib/rbbt/vector/model.rb', line 7

# Runs arbitrary R +code+ against the given features and labels.
#
# Features are written tab-separated (one row per element) to a temporary
# file, labels one per line to a ".label" sibling and, when given, +names+ to
# a ".names" sibling. In R the features are read into the data frame
# `features`, optionally renamed and converted to factors according to
# +factor_levels+, and the labels are appended as the column `label` before
# +code+ runs. Labels are scanned as numeric when the first label is a
# Numeric, as character otherwise.
#
# +model_file+ is accepted for signature symmetry with R_train/R_eval but is
# not referenced by the generated script. Returns whatever R.run returns.
def self.R_run(model_file, features, labels, code, names = nil, factor_levels = nil)
  TmpFile.with_file do |feature_file|
    Open.write(feature_file, features.collect{|feats| feats * "\t"} * "\n")
    Open.write(feature_file + '.label', labels * "\n" + "\n")
    Open.write(feature_file + '.names', names * "\n" + "\n") if names

    # Integer and Float are both Numeric, so a single check covers them.
    what = case labels.first
           when Numeric
             'numeric()'
           else
             'character()'
           end

    R.run <<-EOF
features = read.table("#{ feature_file }", sep ="\\t", stringsAsFactors=TRUE);
#{"names(features) = make.names(readLines('#{feature_file + '.names'}'))" if names }
#{ factor_levels.collect do |name,levels|
  "features[['#{name}']] = factor(features[['#{name}']], levels=#{R.ruby2R levels})"
end * "\n" if factor_levels }
labels = scan("#{ feature_file }.label", what=#{what});
features = cbind(features, label = labels);
#{code}
    EOF
  end
end

.R_train(model_file, features, labels, code, names = nil, factor_levels = nil) ⇒ Object



34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# File 'lib/rbbt/vector/model.rb', line 34

# Trains a model in R and saves it to +model_file+.
#
# Features are written tab-separated (one row per element) to a temporary
# file, labels one per line to a ".label" sibling and, when given, +names+ to
# a ".names" sibling. In R the features are read into the data frame
# `features`, optionally renamed, extended with the labels as the column
# `label`, and converted to factors according to +factor_levels+. Then +code+
# runs; it is expected to leave the trained model in an R variable called
# `model`, which is saved to +model_file+. Labels are scanned as numeric when
# the first label is a Numeric, as character otherwise.
#
# Returns whatever R.run returns.
def self.R_train(model_file, features, labels, code, names = nil, factor_levels = nil)
  TmpFile.with_file do |feature_file|
    Open.write(feature_file, features.collect{|feats| feats * "\t"} * "\n")
    Open.write(feature_file + '.label', labels * "\n" + "\n")
    Open.write(feature_file + '.names', names * "\n" + "\n") if names

    # Integer and Float are both Numeric, so a single check covers them.
    what = case labels.first
           when Numeric
             'numeric()'
           else
             'character()'
           end

    R.run <<-EOF
features = read.table("#{ feature_file }", sep ="\\t", stringsAsFactors=TRUE);
labels = scan("#{ feature_file }.label", what=#{what});
#{"names(features) = make.names(readLines('#{feature_file + '.names'}'))" if names }
features = cbind(features, label = labels);
#{ factor_levels.collect do |name,levels|
  "features[['#{name}']] = factor(features[['#{name}']], levels=#{R.ruby2R levels})"
end * "\n" if factor_levels }
#{code}
save(model, file='#{model_file}')
    EOF
  end
end

Instance Method Details

#__load_method(file) ⇒ Object



94
95
96
97
98
# File 'lib/rbbt/vector/model.rb', line 94

# Rebuilds a callable from Ruby source stored in +file+.
#
# Whatever precedes "Proc.new" on the first line that contains it (for
# instance an assignment) is stripped, and the remaining source is evaluated
# in the context of this instance, with +file+ as the reported file name.
#
# NOTE(review): this executes the file's contents as Ruby code, so the model
# directory has to be trusted.
def __load_method(file)
  source = Open.read(file).sub(/.*Proc\.new/, "Proc.new")
  instance_eval source, file
end

#add(element, label = nil) ⇒ Object



166
167
168
169
170
# File 'lib/rbbt/vector/model.rb', line 166

# Appends one training example.
#
# The element goes through the feature extractor when one is configured;
# otherwise it is stored as given. The label (nil by default) is appended to
# the labels. Returns the labels array.
def add(element, label = nil)
  vector = element
  vector = extract_features.call(element) if @extract_features
  @features.push(vector)
  @labels.push(label)
end

#add_list(elements, labels = nil) ⇒ Object



172
173
174
175
176
177
178
179
180
181
182
# File 'lib/rbbt/vector/model.rb', line 172

# Appends several training examples at once.
#
# With no extractor, or an extractor of arity 1, each element is added through
# #add paired with its label (nil when labels are missing or shorter).
# Otherwise the extractor is called once as (nil, elements) and its result is
# concatenated to the features; labels are concatenated only when given.
def add_list(elements, labels = nil)
  batch_extractor = !@extract_features.nil? && @extract_features.arity != 1

  if batch_extractor
    extracted = @extract_features.call(nil, elements)
    @features.concat(extracted)
    return labels ? @labels.concat(labels) : nil
  end

  elements.zip(labels || [nil]).each { |elem, label| add(elem, label) }
end

#clear ⇒ Object



161
162
163
164
# File 'lib/rbbt/vector/model.rb', line 161

# Discards the accumulated training data by replacing both the features and
# the labels with fresh empty arrays. Returns the new (empty) labels array.
def clear
  @features = []
  @labels = []
end

#cross_validation(folds = 10, good_label = nil) ⇒ Object



331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
# File 'lib/rbbt/vector/model.rb', line 331

# Runs a +folds+-fold cross-validation over the accumulated features/labels.
#
# For each fold the model is trained on the remaining folds and evaluated on
# the held-out one, and the result of VectorModel.f1_metrics is recorded under
# the fold index. With +folds+ == 1 the single fold serves as both training
# and test set.
#
# The returned TSV has the fields P, R, F1 when the labels contain more than
# two distinct values, and TP, TN, FP, FN, P, R, F1 otherwise. +good_label+ is
# forwarded to f1_metrics as the positive class.
#
# The original features and labels are restored even if a fold raises, and
# the model is retrained on the full data afterwards (unless +folds+ == 1).
#
# Raises RuntimeError when the number of predictions differs from the number
# of test labels.
def cross_validation(folds = 10, good_label = nil)

  orig_features = @features
  orig_labels = @labels

  multiclass = @labels.uniq.length > 2

  header = multiclass ? "Fold~P,R,F1#:type=:list" : "Fold~TP,TN,FP,FN,P,R,F1#:type=:list"
  res = TSV.setup({}, header)

  begin
    if folds == 1
      feature_folds = [@features]
      labels_folds = [@labels]
    else
      feature_folds = Misc.divide(@features, folds)
      labels_folds = Misc.divide(@labels, folds)
    end

    folds.times do |fix|

      # Indices of the folds used for training in this round.
      rest = folds == 1 ? [fix] : (0..(folds-1)).to_a - [fix]

      test_set = feature_folds[fix]
      train_set = feature_folds.values_at(*rest).reduce([], :+)

      test_labels = labels_folds[fix]
      train_labels = labels_folds.values_at(*rest).flatten

      # train/eval_list operate on the instance state, so swap it in.
      @features = train_set
      @labels = train_labels

      self.train
      # The folds already hold feature vectors: skip extraction.
      predictions = self.eval_list test_set, false

      raise "Number of predictions (#{predictions.length}) and test labels (#{test_labels.length}) do not match" if predictions.length != test_labels.length

      Log.debug { "Accuracy Fold #{fix}: #{(100 * test_labels.zip(predictions).select{|t,p| t == p }.length.to_f / test_labels.length).round(2)}%" }

      tp, tn, fp, fn, pr, re, f1 = VectorModel.f1_metrics(test_labels, predictions, good_label)

      if multiclass
        Log.low "Multi-class CV Fold #{fix} - Average P:#{"%.3f" % pr} R:#{"%.3f" % re} F1:#{"%.3f" % f1}"
        res[fix] = [pr,re,f1]
      else
        Log.low "CV Fold #{fix} P:#{"%.3f" % pr} R:#{"%.3f" % re} F1:#{"%.3f" % f1} - #{[tp.to_s, tn.to_s, fp.to_s, fn.to_s] * " "}"
        res[fix] = [tp,tn,fp,fn,pr,re,f1]
      end

    end
  ensure
    @features = orig_features
    @labels = orig_labels
  end
  self.train unless folds == 1
  res
end

#eval(element) ⇒ Object



227
228
229
230
231
232
233
234
# File 'lib/rbbt/vector/model.rb', line 227

# Predicts the label for a single +element+.
#
# The element goes through the feature extractor when one is configured and
# is used as-is otherwise (the same rule #add and #eval_list apply), so a
# model without an extractor no longer fails with NoMethodError on nil.
#
# A Proc evaluation model is called as
# (model_file, features, false, nil, names, factor_levels); a String one is
# treated as R code and handed to VectorModel.R_eval. Returns nil when the
# evaluation model is neither.
def eval(element)
  return nil unless Proc === @eval_model || String === @eval_model

  features = @extract_features ? @extract_features.call(element) : element

  if Proc === @eval_model
    @eval_model.call(@model_file, features, false, nil, @names, @factor_levels)
  else
    VectorModel.R_eval(@model_file, features, false, @eval_model, @names, @factor_levels)
  end
end

#eval_list(elements, extract = true) ⇒ Object



236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
# File 'lib/rbbt/vector/model.rb', line 236

# Predicts labels for a list of +elements+.
#
# When +extract+ is truthy and an extractor is configured, features are
# obtained either element by element (extractor of arity 1) or with a single
# call as (nil, elements). Otherwise the elements are taken to be feature
# vectors already.
#
# A Proc evaluation model is called as
# (model_file, features, true, nil, names, factor_levels); a String one is
# treated as R code and handed to VectorModel.R_eval. Returns nil when the
# evaluation model is neither.
def eval_list(elements, extract = true)
  extractor = @extract_features

  features =
    if !extract || extractor.nil?
      elements
    elsif extractor.arity == 1
      elements.collect { |element| extractor.call(element) }
    else
      extractor.call(nil, elements)
    end

  evaluator = eval_model
  case evaluator
  when Proc
    evaluator.call(@model_file, features, true, nil, @names, @factor_levels)
  when String
    VectorModel.R_eval(@model_file, features, true, evaluator, @names, @factor_levels)
  end
end

#run(code) ⇒ Object



223
224
225
# File 'lib/rbbt/vector/model.rb', line 223

# Runs the given R +code+ through VectorModel.R_run, passing the accumulated
# features and labels together with the model file, names and factor levels.
def run(code)
  VectorModel.R_run(@model_file,  @features, @labels, code, @names, @factor_levels)
end

#save_models ⇒ Object



184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
# File 'lib/rbbt/vector/model.rb', line 184

# Persists the model definition into the model directory.
#
# Proc models are stored as their Ruby source (obtained through the
# method_source gem, loaded lazily here); String models are stored as R code
# in the ".R" files. The extractor source, factor levels (YAML) and feature
# names (one per line) are written when present.
def save_models
  require 'method_source'

  trainer = train_model
  if Proc === trainer
    begin
      Open.write(@train_model_file, trainer.source)
    rescue
      # Best effort: a failure while saving the source is deliberately ignored.
    end
  elsif String === trainer
    Open.write(@train_model_file_R, @train_model)
  end

  if @extract_features
    Open.write(@extract_features_file, @extract_features.source)
  end

  evaluator = eval_model
  if Proc === evaluator
    begin
      Open.write(@eval_model_file, evaluator.source)
    rescue
      # Best effort, as above.
    end
  elsif String === evaluator
    Open.write(@eval_model_file_R, evaluator)
  end

  Open.write(@levels_file, @factor_levels.to_yaml) if @factor_levels
  Open.write(@names_file, @names * "\n" + "\n") if @names
end

#train ⇒ Object



213
214
215
216
217
218
219
220
221
# File 'lib/rbbt/vector/model.rb', line 213

# Trains the model on the accumulated features and labels, then saves the
# model definition with #save_models.
#
# A Proc training model is called as
# (model_file, features, labels, names, factor_levels); a String one is
# treated as R code and handed to VectorModel.R_train. Any other value skips
# training but still saves.
def train
  trainer = train_model

  case trainer
  when Proc
    trainer.call(@model_file, @features, @labels, @names, @factor_levels)
  when String
    VectorModel.R_train(@model_file,  @features, @labels, trainer, @names, @factor_levels)
  end

  save_models
end