Class: OpenTox::Model::Lazar

Inherits:
Object
Includes:
Mongoid::Document, Mongoid::Timestamps, OpenTox
Defined in:
lib/model.rb

Direct Known Subclasses

LazarClassification, LazarRegression

Instance Attribute Summary

Class Method Summary

Instance Method Summary

Instance Attribute Details

#independent_variables ⇒ Array<Array>

Get independent variables

Returns:

  • (Array<Array>)


# File 'lib/model.rb', line 383

def independent_variables 
  @independent_variables ||= Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data
  @independent_variables
end

Class Method Details

.create(prediction_feature: nil, training_dataset:, algorithms: {}) ⇒ OpenTox::Model::Lazar

Create a lazar model
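
Examples:

A minimal, hypothetical usage sketch; the CSV file name is illustrative and Dataset.from_csv_file is assumed to be available for importing training data:

  training_dataset = OpenTox::Dataset.from_csv_file "training_data.csv"
  # the model class (LazarClassification or LazarRegression) is guessed
  # from the type of the first bioactivity feature in the training dataset
  model = OpenTox::Model::Lazar.create training_dataset: training_dataset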

Parameters:

  • training_dataset (OpenTox::Dataset)
  • prediction_feature (OpenTox::Feature, nil) (defaults to: nil)

    By default the first feature of the training dataset will be predicted; specify a prediction_feature if you want to predict another feature

  • algorithms (Hash, nil) (defaults to: {})

    If no algorithms parameter is provided, default algorithms will be used. The algorithms hash has the following keys: :descriptors (descriptors used for similarity calculations and local QSAR models), :similarity (similarity algorithm and thresholds for predictions with high and low confidence), :feature_selection (feature selection algorithm) and :prediction (local QSAR algorithm). Default parameters are used for unspecified keys.

Returns:

  • (OpenTox::Model::Lazar)

Raises:

  • (ArgumentError)


# File 'lib/model.rb', line 38

def self.create prediction_feature:nil, training_dataset:, algorithms:{}
  raise ArgumentError, "Please provide a training_dataset and an optional prediction_feature." unless prediction_feature or training_dataset
  prediction_feature ||= training_dataset.features.select{|f| f.is_a? NumericBioActivity or f.is_a? NominalBioActivity}.first unless prediction_feature

  # guess model type
  prediction_feature.is_a?(NumericBioActivity) ? model = LazarRegression.new : model = LazarClassification.new

  model.prediction_feature_id = prediction_feature.id
  model.training_dataset_id = training_dataset.id
  model.name = training_dataset.name
  
  # git or gem versioning
  dir = File.dirname(__FILE__)
  path = File.expand_path("../", File.expand_path(dir))
  if Dir.exists?(dir+"/.git")
    commit = `git rev-parse HEAD`.chomp
    branch = `git rev-parse --abbrev-ref HEAD`.chomp
    url = `git config --get remote.origin.url`.chomp
    model.version = {:url => url, :branch => branch, :commit => commit}
  else
    version = File.open(path+"/VERSION", &:gets).chomp
    url = "https://rubygems.org/gems/lazar/versions/"+version
    model.version = {:url => url, :branch => "gem", :commit => version}
  end

  # set defaults
  substance_classes = training_dataset.substances.collect{|s| s.class.to_s}.uniq
  raise ArgumentError, "Cannot create models for mixed substance classes '#{substance_classes.join ', '}'." unless substance_classes.size == 1

  if substance_classes.first == "OpenTox::Compound"

    model.algorithms = {
      :descriptors => {
        :method => "fingerprint",
        :type => "MP2D",
      },
      :feature_selection => nil
    }

    if model.class == LazarClassification
      model.algorithms[:prediction] = {
          :method => "Algorithm::Classification.weighted_majority_vote",
      }
      model.algorithms[:similarity] = {
        :method => "Algorithm::Similarity.tanimoto",
        :min => [0.5,0.2],
      }
    elsif model.class == LazarRegression
      model.algorithms[:prediction] = {
        :method => "Algorithm::Caret.rf",
      }
      model.algorithms[:similarity] = {
        :method => "Algorithm::Similarity.tanimoto",
        :min => [0.5,0.2],
      }
    end

  elsif substance_classes.first == "OpenTox::Nanoparticle"
    model.algorithms = {
      :descriptors => {
        :method => "properties",
        :categories => ["P-CHEM"],
      },
      :similarity => {
        :method => "Algorithm::Similarity.weighted_cosine",
        :min => [0.5,0.2],
      },
      :prediction => {
        :method => "Algorithm::Caret.rf",
      },
      :feature_selection => {
        :method => "Algorithm::FeatureSelection.correlation_filter",
      },
    }
  else
    raise ArgumentError, "Cannot create models for #{substance_classes.first}."
  end
  
  # overwrite defaults with explicit parameters
  algorithms.each do |type,parameters|
    if parameters and parameters.is_a? Hash
      parameters.each do |p,v|
        model.algorithms[type] ||= {}
        model.algorithms[type][p] = v
        model.algorithms[:descriptors].delete :categories if type == :descriptors and p == :type
      end
    else
      model.algorithms[type] = parameters
    end
  end if algorithms

  # parse dependent_variables from training dataset
  training_dataset.substances.each do |substance|
    values = training_dataset.values(substance,model.prediction_feature_id)
    values.each do |v|
      model.substance_ids << substance.id.to_s
      model.dependent_variables << v
    end if values
  end

  descriptor_method = model.algorithms[:descriptors][:method]
  model.independent_variables = []
  case descriptor_method
  # parse fingerprints
  when "fingerprint"
    type = model.algorithms[:descriptors][:type]
    model.substances.each_with_index do |s,i|
      model.fingerprints[i] ||= [] 
      model.fingerprints[i] += s.fingerprint(type)
      model.fingerprints[i].uniq!
    end
    model.descriptor_ids = model.fingerprints.flatten.uniq
    model.descriptor_ids.each do |d|
      model.independent_variables << model.substance_ids.collect_with_index{|s,i| model.fingerprints[i].include? d} if model.algorithms[:prediction][:method].match /Caret/
    end
  # calculate physchem properties
  when "calculate_properties"
    features = model.algorithms[:descriptors][:features]
    model.descriptor_ids = features.collect{|f| f.id.to_s}
    model.algorithms[:descriptors].delete(:features)
    model.algorithms[:descriptors].delete(:type)
    model.substances.each_with_index do |s,i|
      props = s.calculate_properties(features)
      props.each_with_index do |v,j|
        model.independent_variables[j] ||= []
        model.independent_variables[j][i] = v
      end if props and !props.empty?
    end
  # parse independent_variables
  when "properties"
    categories = model.algorithms[:descriptors][:categories]
    feature_ids = []
    categories.each do |category|
      Feature.where(category:category).each{|f| feature_ids << f.id.to_s}
    end
    properties = model.substances.collect { |s| s.properties  }
    property_ids = properties.collect{|p| p.keys}.flatten.uniq
    model.descriptor_ids = feature_ids & property_ids
    model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i] ? p[i].median : nil}}
  else
    raise ArgumentError, "Descriptor method '#{descriptor_method}' not implemented."
  end
  
  if model.algorithms[:feature_selection] and model.algorithms[:feature_selection][:method]
    model = Algorithm.run model.algorithms[:feature_selection][:method], model
  end

  # scale independent_variables
  unless model.fingerprints?
    model.independent_variables.each_with_index do |var,i|
      model.descriptor_means[i] = var.mean
      model.descriptor_sds[i] =  var.standard_deviation
      model.scaled_variables << var.collect{|v| v ? (v-model.descriptor_means[i])/model.descriptor_sds[i] : nil}
    end
  end
  model.save
  model
end
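
The parameter merge above overwrites defaults key by key: only the keys given in the algorithms hash are replaced, unspecified keys keep their default values. A hypothetical override, reusing the training_dataset from the example above:

  model = OpenTox::Model::Lazar.create(
    training_dataset: training_dataset,
    algorithms: {:similarity => {:min => [0.3, 0.1]}}  # lower confidence thresholds
  )
  model.algorithms[:similarity][:method] #=> "Algorithm::Similarity.tanimoto" (default kept)
  model.algorithms[:descriptors]         #=> {:method => "fingerprint", :type => "MP2D"} (default kept)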

Instance Method Details

#descriptors ⇒ Array<OpenTox::Feature>

Get training descriptors

Returns:

  • (Array<OpenTox::Feature>)


# File 'lib/model.rb', line 402

def descriptors
  descriptor_ids.collect{|id| Feature.find(id)}
end

#fingerprints? ⇒ TrueClass, FalseClass

Are fingerprints used as descriptors

Returns:

  • (TrueClass, FalseClass)


# File 'lib/model.rb', line 414

def fingerprints?
  algorithms[:descriptors][:method] == "fingerprint" ? true : false
end
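
A short inspection sketch for a fitted model; return values are illustrative and depend on the chosen descriptor algorithm (for property-based models #descriptors resolves the ids to Feature objects):

  model.fingerprints?        #=> true if :descriptors => {:method => "fingerprint"} was used
  model.descriptor_ids.size  #   number of unique descriptors collected from the training data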

#predict(object) ⇒ Hash, ...

Predict a substance (compound or nanoparticle), an array of substances or a dataset
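
Examples:

A hypothetical prediction for a single compound; Compound.from_smiles is assumed to be available, and the hash keys mirror the prediction assembled in the source below:

  compound = OpenTox::Compound.from_smiles "CC(=O)Oc1ccccc1C(=O)O"
  prediction = model.predict compound
  prediction[:value]         # predicted activity, nil if no prediction was possible
  prediction[:probabilities] # class probabilities (classification models only)
  prediction[:neighbors]     # neighbors sorted by descending similarity
  prediction[:confidence]    # applicability domain / confidence statement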



# File 'lib/model.rb', line 304

def predict object

  training_dataset = Dataset.find training_dataset_id

  # parse data
  substances = []
  if object.is_a? Substance
    substances = [object] 
  elsif object.is_a? Array
    substances = object
  elsif object.is_a? Dataset
    substances = object.substances
  else 
    raise ArgumentError, "Please provide an OpenTox::Compound, an Array of OpenTox::Substances or an OpenTox::Dataset as parameter."
  end

  # make predictions
  predictions = {}
  substances.each do |c|
    predictions[c.id.to_s] = predict_substance c
    if prediction_feature.is_a? NominalBioActivity and predictions[c.id.to_s][:value]
      prediction_feature.accept_values.each do |v|
        predictions[c.id.to_s][:probabilities][v] ||= 0.0 # use 0 instead of empty probabilities (happens if all neighbors have the same activity)
      end
    end
    predictions[c.id.to_s][:prediction_feature_id] = prediction_feature_id 
  end

  # serialize result
  if object.is_a? Substance
    prediction = predictions[substances.first.id.to_s]
    prediction[:neighbors].sort!{|a,b| b[1] <=> a[1]} if prediction[:neighbors]# sort according to similarity
    return prediction
  elsif object.is_a? Array
    return predictions
  elsif object.is_a? Dataset
    d = object.copy
    #warning_feature = Warnings.find_or_create_by(:dataset_id => d.id)
    confidence_feature = Confidence.find_or_create_by(:dataset_id => d.id)
    if prediction_feature.is_a? NominalBioActivity
      f = NominalLazarPrediction.find_or_create_by(:name => prediction_feature.name, :accept_values => prediction_feature.accept_values, :model_id => self.id, :training_feature_id => prediction_feature.id)
      probability_features = {}
      prediction_feature.accept_values.each do |v|
        probability_features[v] = LazarPredictionProbability.find_or_create_by(:name => v, :model_id => self.id, :training_feature_id => prediction_feature.id)
      end
    elsif prediction_feature.is_a? NumericBioActivity
      f = NumericLazarPrediction.find_or_create_by(:name => prediction_feature.name, :unit => prediction_feature.unit, :model_id => self.id, :training_feature_id => prediction_feature.id)
      prediction_interval = []
      ["lower","upper"].each do |v|
        prediction_interval << LazarPredictionInterval.find_or_create_by(:name => v, :model_id => self.id, :training_feature_id => prediction_feature.id)
      end
    end

    # add predictions to dataset
    predictions.each do |substance_id,p|
      substance_id = BSON::ObjectId.from_string(substance_id)
      d.add substance_id,confidence_feature,p[:confidence]
      unless p[:value].nil?
        d.add substance_id,f,p[:value]
        p[:probabilities].each {|name,p| d.add substance_id,probability_features[name],p} if p[:probabilities]
        p[:prediction_interval].each_with_index {|v,i| d.add substance_id, prediction_interval[i], v } if p[:prediction_interval]
      end
    end
    d.save
    return d
  end

end

#predict_substance(substance, threshold = self.algorithms[:similarity][:min].first, prediction = nil) ⇒ Hash

Predict a substance (compound or nanoparticle)
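
An illustrative direct call; predict_substance is normally invoked through #predict, and the threshold defaults to the high-confidence similarity minimum and is lowered automatically if necessary:

  prediction = model.predict_substance compound
  prediction[:warnings]   # e.g. notes about lowered similarity thresholds
  prediction[:neighbors]  # [{:id => ..., :measurement => ..., :similarity => ...}, ...]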

Parameters:

  • substance (OpenTox::Substance)
  • threshold (Float) (defaults to: self.algorithms[:similarity][:min].first)
  • prediction (Hash, nil) (defaults to: nil)

Returns:

  • (Hash)


# File 'lib/model.rb', line 200

def predict_substance substance, threshold = self.algorithms[:similarity][:min].first, prediction = nil
  
  @independent_variables = Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data
  case algorithms[:similarity][:method]
  when /tanimoto/ # binary features
    similarity_descriptors = substance.fingerprint algorithms[:descriptors][:type]
    # TODO this excludes descriptors only present in the query substance
    # use for applicability domain?
    query_descriptors = descriptor_ids.collect{|id| similarity_descriptors.include? id}
  when /euclid|cosine/ # quantitative features
    if algorithms[:descriptors][:method] == "calculate_properties" # calculate descriptors
      features = descriptor_ids.collect{|id| Feature.find(id)}
      query_descriptors = substance.calculate_properties(features)
      similarity_descriptors = query_descriptors.collect_with_index{|v,i| (v-descriptor_means[i])/descriptor_sds[i]}
    else
      similarity_descriptors = []
      query_descriptors = []
      descriptor_ids.each_with_index do |id,i|
        prop = substance.properties[id]
        prop = prop.median if prop.is_a? Array # measured
        if prop
          similarity_descriptors[i] = (prop-descriptor_means[i])/descriptor_sds[i]
          query_descriptors[i] = prop
        end
      end
    end
  else
    raise ArgumentError, "Unknown descriptor type '#{algorithms[:descriptors]}' for similarity method '#{algorithms[:similarity][:method]}'."
  end
  
  prediction ||= {:warnings => [], :measurements => []}
  prediction[:warnings] << "Similarity threshold #{threshold} < #{algorithms[:similarity][:min].first}, prediction may be out of applicability domain." if threshold < algorithms[:similarity][:min].first
  neighbor_ids = []
  neighbor_similarities = []
  neighbor_dependent_variables = []
  neighbor_independent_variables = []

  # find neighbors
  substance_ids.each_with_index do |s,i|
    # handle query substance
    if substance.id.to_s == s
      prediction[:measurements] << dependent_variables[i] unless threshold < algorithms[:similarity][:min].first # add measurements only once at first pass
      prediction[:info] = "Substance '#{substance.name}, id:#{substance.id}' has been excluded from neighbors, because it is identical with the query substance."
    else
      if fingerprints?
        neighbor_descriptors = fingerprints[i]
      else
        next if substance.is_a? Nanoparticle and substance.core != Nanoparticle.find(s).core # necessary for nanoparticle properties predictions
        neighbor_descriptors = scaled_variables.collect{|v| v[i]}
      end
      sim = Algorithm.run algorithms[:similarity][:method], [similarity_descriptors, neighbor_descriptors, descriptor_weights]
      if sim >= threshold
        neighbor_ids << s
        neighbor_similarities << sim
        neighbor_dependent_variables << dependent_variables[i]
        independent_variables.each_with_index do |c,j|
          neighbor_independent_variables[j] ||= []
          neighbor_independent_variables[j] << @independent_variables[j][i]
        end
      end
    end
  end

  measurements = nil
  
  if neighbor_similarities.empty?
    prediction[:value] = nil
    prediction[:warnings] << "Could not find similar substances for threshold #{threshold} with experimental data in the training dataset."
    if threshold == algorithms[:similarity][:min].last
      prediction[:confidence] = "Out of applicability domain: Could not find similar substances with experimental data in the training dataset (Threshold: #{algorithms[:similarity][:min].last})."
      return prediction
    end
  elsif neighbor_similarities.size == 1
    prediction[:value] = nil
    prediction[:warnings] << "Cannot create prediction: Only one similar compound for threshold #{threshold} in the training set (Threshold: #{algorithms[:similarity][:min].last})."
    prediction[:neighbors] = [{:id => neighbor_ids.first, :measurement => neighbor_dependent_variables[0], :similarity => neighbor_similarities.first}]
    if threshold == algorithms[:similarity][:min].last
      prediction[:confidence] = "Out of applicability domain: Only one similar compound in the training set."
      return prediction
    end
  else
    query_descriptors.collect!{|d| d ? 1 : 0} if algorithms[:feature_selection] and algorithms[:descriptors][:method] == "fingerprint"
    # call prediction algorithm
    result = Algorithm.run algorithms[:prediction][:method], dependent_variables:neighbor_dependent_variables,independent_variables:neighbor_independent_variables ,weights:neighbor_similarities, query_variables:query_descriptors
    prediction.merge! result
    prediction[:neighbors] = neighbor_ids.collect_with_index{|id,i| {:id => id, :measurement => neighbor_dependent_variables[i], :similarity => neighbor_similarities[i]}}
  end
  if threshold == algorithms[:similarity][:min].first
    if prediction[:warnings].empty? 
      prediction[:confidence] = "Similar to bioassay results"
      return prediction
    else # try again with a lower threshold
      prediction[:warnings] << "Lowering similarity threshold to #{algorithms[:similarity][:min].last}."
      predict_substance substance, algorithms[:similarity][:min].last, prediction
    end
  elsif threshold < algorithms[:similarity][:min].first
    prediction[:confidence] = "Lower than bioassay results"
    return prediction
  end
end

#prediction_feature ⇒ OpenTox::Feature

Get prediction feature

Returns:

  • (OpenTox::Feature)


# File 'lib/model.rb', line 396

def prediction_feature
  Feature.find(prediction_feature_id)
end

#save ⇒ Object

Save the model

Stores independent_variables in GridFS to avoid exceeding the MongoDB document size limit


# File 'lib/model.rb', line 375

def save
  file = Mongo::Grid::File.new(Marshal.dump(@independent_variables), :filename => "#{id}.independent_variables")
  self.independent_variables_id = $gridfs.insert_one(file)
  super
end
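
Sketch of the resulting GridFS round trip; identifiers are illustrative:

  model.save                      # dumps @independent_variables into GridFS
  model.independent_variables_id  # BSON::ObjectId of the stored "<id>.independent_variables" file
  model.independent_variables     # reloads and memoizes the array from GridFS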

#substances ⇒ Array<OpenTox::Substance>

Get training substances

Returns:

  • (Array<OpenTox::Substance>)

# File 'lib/model.rb', line 408

def substances
  substance_ids.collect{|id| Substance.find(id)}
end

#training_dataset ⇒ OpenTox::Dataset

Get training dataset

Returns:

  • (OpenTox::Dataset)


# File 'lib/model.rb', line 390

def training_dataset
  Dataset.find(training_dataset_id)
end