Class: OpenTox::Model::Lazar

Inherits:
Object
  • Object
show all
Includes:
Mongoid::Document, Mongoid::Timestamps, OpenTox
Defined in:
lib/model.rb

Direct Known Subclasses

LazarClassification, LazarRegression

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Instance Attribute Details

#independent_variables ⇒ Array<Array>

Get independent variables

Returns:



345
346
347
348
# File 'lib/model.rb', line 345

# Get independent variables
#
# The variables are read from the GridFS file referenced by
# independent_variables_id on first access, deserialized with Marshal and
# kept in @independent_variables for later calls.
# NOTE(review): Marshal.load is only safe on trusted data — confirm that the
# GridFS bucket can only be written by this application.
# @return [Array<Array>]
def independent_variables
  return @independent_variables if @independent_variables
  stored_file = $gridfs.find_one(_id: self.independent_variables_id)
  @independent_variables = Marshal.load(stored_file.data)
end

Class Method Details

.create(prediction_feature: nil, training_dataset:, algorithms: {}) ⇒ OpenTox::Model::Lazar

Create a lazar model

Parameters:

  • training_dataset (OpenTox::Dataset)
  • prediction_feature (OpenTox::Feature, nil) (defaults to: nil)

    By default the first feature of the training dataset will be predicted, specify a prediction_feature if you want to predict another feature

  • algorithms (Hash, nil) (defaults to: {})

    Default algorithms will be used, if no algorithms parameter is provided. The algorithms hash has the following keys: :descriptors (specifies the descriptors to be used for similarity calculations and local QSAR models), :similarity (similarity algorithm and threshold), :feature_selection (feature selection algorithm), :prediction (local QSAR algorithm). Default parameters are used for unspecified keys.

Returns:



38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
# File 'lib/model.rb', line 38

# Create a lazar model
#
# Instantiates a LazarRegression (numeric prediction feature) or a
# LazarClassification model, sets default algorithms for the substance class
# of the training dataset, overwrites them with the explicit +algorithms+,
# collects dependent and independent variables from the training dataset,
# optionally runs feature selection, scales non-fingerprint descriptors and
# saves the model.
#
# @param prediction_feature [OpenTox::Feature, nil] by default the first feature of the training dataset is predicted
# @param training_dataset [OpenTox::Dataset]
# @param algorithms [Hash, nil] keys :descriptors, :similarity, :feature_selection, :prediction; defaults are used for unspecified keys
# @return [OpenTox::Model::Lazar] the saved model
def self.create prediction_feature:nil, training_dataset:, algorithms:{}
  bad_request_error "Please provide a prediction_feature and/or a training_dataset." unless prediction_feature or training_dataset
  prediction_feature = training_dataset.features.first unless prediction_feature
  # TODO: prediction_feature without training_dataset: use all available data

  # guess model type
  model = prediction_feature.numeric? ? LazarRegression.new : LazarClassification.new

  model.prediction_feature_id = prediction_feature.id
  model.training_dataset_id = training_dataset.id
  model.name = "#{prediction_feature.name} (#{training_dataset.name})"
  # TODO: check if this works for gem version, add gem versioning?
  dir = File.dirname(__FILE__)
  commit = `cd #{dir}; git rev-parse HEAD`.chomp
  branch = `cd #{dir}; git rev-parse --abbrev-ref HEAD`.chomp
  url = `cd #{dir}; git config --get remote.origin.url`.chomp
  # Backticks always return a String (truthy even when empty), so the missing
  # git information has to be detected by checking for empty output.
  if branch.empty?
    model.version = {:warning => "git is not installed"}
  else
    model.version = {:url => url, :branch => branch, :commit => commit}
  end

  # set defaults
  substance_classes = training_dataset.substances.collect{|s| s.class.to_s}.uniq
  bad_request_error "Cannot create models for mixed substance classes '#{substance_classes.join ', '}'." unless substance_classes.size == 1

  if substance_classes.first == "OpenTox::Compound"

    model.algorithms = {
      :descriptors => {
        :method => "fingerprint",
        :type => "MP2D",
      },
      :feature_selection => nil
    }

    if model.class == LazarClassification
      model.algorithms[:prediction] = {
          :method => "Algorithm::Classification.weighted_majority_vote",
      }
      model.algorithms[:similarity] = {
        :method => "Algorithm::Similarity.tanimoto",
        :min => 0.1,
      }
    elsif model.class == LazarRegression
      model.algorithms[:prediction] = {
        :method => "Algorithm::Caret.rf",
      }
      model.algorithms[:similarity] = {
        :method => "Algorithm::Similarity.tanimoto",
        :min => 0.5,
      }
    end

  elsif substance_classes.first == "OpenTox::Nanoparticle"
    model.algorithms = {
      :descriptors => {
        :method => "properties",
        :categories => ["P-CHEM"],
      },
      :similarity => {
        :method => "Algorithm::Similarity.weighted_cosine",
        :min => 0.5,
      },
      :prediction => {
        :method => "Algorithm::Caret.rf",
      },
      :feature_selection => {
        :method => "Algorithm::FeatureSelection.correlation_filter",
      },
    }
  else
    bad_request_error "Cannot create models for #{substance_classes.first}."
  end

  # overwrite defaults with explicit parameters
  algorithms.each do |type,parameters|
    if parameters and parameters.is_a? Hash
      parameters.each do |p,v|
        model.algorithms[type] ||= {}
        model.algorithms[type][p] = v
        # an explicit descriptor type replaces the default property categories
        model.algorithms[:descriptors].delete :categories if type == :descriptors and p == :type
      end
    else
      model.algorithms[type] = parameters
    end
  end if algorithms

  # parse dependent_variables from training dataset
  # (a substance with several measurements is listed once per measurement)
  training_dataset.substances.each do |substance|
    values = training_dataset.values(substance,model.prediction_feature_id)
    values.each do |v|
      model.substance_ids << substance.id.to_s
      model.dependent_variables << v
    end if values
  end

  descriptor_method = model.algorithms[:descriptors][:method]
  model.independent_variables = []
  case descriptor_method
  # parse fingerprints
  when "fingerprint"
    type = model.algorithms[:descriptors][:type]
    model.substances.each_with_index do |s,i|
      model.fingerprints[i] ||= []
      model.fingerprints[i] += s.fingerprint(type)
      model.fingerprints[i].uniq!
    end
    model.descriptor_ids = model.fingerprints.flatten.uniq
    # a presence/absence matrix is only built for Caret prediction algorithms
    model.descriptor_ids.each do |d|
      model.independent_variables << model.substance_ids.collect_with_index{|s,i| model.fingerprints[i].include? d} if model.algorithms[:prediction][:method].match(/Caret/)
    end
  # calculate physchem properties
  when "calculate_properties"
    features = model.algorithms[:descriptors][:features]
    model.descriptor_ids = features.collect{|f| f.id.to_s}
    model.algorithms[:descriptors].delete(:features)
    model.algorithms[:descriptors].delete(:type)
    model.substances.each_with_index do |s,i|
      props = s.calculate_properties(features)
      props.each_with_index do |v,j|
        model.independent_variables[j] ||= []
        model.independent_variables[j][i] = v
      end if props and !props.empty?
    end
  # parse independent_variables
  when "properties"
    categories = model.algorithms[:descriptors][:categories]
    feature_ids = []
    categories.each do |category|
      Feature.where(category:category).each{|f| feature_ids << f.id.to_s}
    end
    properties = model.substances.collect { |s| s.properties  }
    property_ids = properties.collect{|p| p.keys}.flatten.uniq
    model.descriptor_ids = feature_ids & property_ids
    model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i] ? p[i].median : nil}}
  else
    bad_request_error "Descriptor method '#{descriptor_method}' not implemented."
  end

  if model.algorithms[:feature_selection] and model.algorithms[:feature_selection][:method]
    model = Algorithm.run model.algorithms[:feature_selection][:method], model
  end

  # scale independent_variables
  unless model.fingerprints?
    model.independent_variables.each_with_index do |var,i|
      model.descriptor_means[i] = var.mean
      model.descriptor_sds[i] =  var.standard_deviation
      model.scaled_variables << var.collect{|v| v ? (v-model.descriptor_means[i])/model.descriptor_sds[i] : nil}
    end
  end
  model.save
  model
end

Instance Method Details

#descriptors ⇒ Array<OpenTox::Feature>

Get training descriptors

Returns:



364
365
366
# File 'lib/model.rb', line 364

# Get training descriptors
#
# Looks up one Feature per entry of descriptor_ids, preserving their order.
# @return [Array<OpenTox::Feature>]
def descriptors
  descriptor_ids.map do |feature_id|
    Feature.find feature_id
  end
end

#fingerprints? ⇒ TrueClass, FalseClass

Are fingerprints used as descriptors

Returns:

  • (TrueClass, FalseClass)


376
377
378
# File 'lib/model.rb', line 376

# Are fingerprints used as descriptors
#
# True when the configured descriptor method is "fingerprint".
# @return [TrueClass, FalseClass]
def fingerprints?
  descriptor_method = algorithms[:descriptors][:method]
  descriptor_method == "fingerprint"
end

#predict(object) ⇒ Hash, ...

Predict a substance (compound or nanoparticle), an array of substances or a dataset



289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
# File 'lib/model.rb', line 289

# Predict a substance (compound or nanoparticle), an array of substances or a dataset
#
# @param object [OpenTox::Substance, Array<OpenTox::Substance>, OpenTox::Dataset]
# @return [Hash] for a single substance: its prediction, with neighbors sorted by descending similarity
# @return [Hash] for an Array: predictions keyed by substance id (String)
# @return [LazarPrediction] for a Dataset: a prediction dataset holding all predictions
def predict object

  # NOTE(review): the looked-up dataset is not used below; the call is kept in
  # case callers rely on it failing for a missing training dataset — confirm.
  Dataset.find training_dataset_id

  # parse data
  substances = []
  if object.is_a? Substance
    substances = [object]
  elsif object.is_a? Array
    substances = object
  elsif object.is_a? Dataset
    substances = object.substances
  else
    bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Substances or an OpenTox::Dataset as parameter."
  end

  # make predictions
  predictions = {}
  substances.each do |c|
    predictions[c.id.to_s] = predict_substance c
    predictions[c.id.to_s][:prediction_feature_id] = prediction_feature_id
  end

  # serialize result
  if object.is_a? Substance
    prediction = predictions[substances.first.id.to_s]
    # neighbors are Hashes with a :similarity key (see predict_substance),
    # so they have to be compared by that key; most similar neighbor first
    prediction[:neighbors].sort!{|a,b| b[:similarity] <=> a[:similarity]} if prediction[:neighbors]
    return prediction
  elsif object.is_a? Array
    return predictions
  elsif object.is_a? Dataset
    # prepare prediction dataset
    measurement_feature = Feature.find prediction_feature_id

    prediction_feature = NumericFeature.find_or_create_by( "name" => measurement_feature.name + " (Prediction)" )
    prediction_dataset = LazarPrediction.create(
      :name => "Lazar prediction for #{prediction_feature.name}",
      :creator =>  __FILE__,
      :prediction_feature_id => prediction_feature.id,
      :predictions => predictions
    )
    return prediction_dataset
  end

end

#predict_substance(substance, threshold = self.algorithms[:similarity][:min]) ⇒ Hash

Predict a substance (compound or nanoparticle)

Parameters:

Returns:

  • (Hash)


197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
# File 'lib/model.rb', line 197

# Predict a substance (compound or nanoparticle)
#
# Collects training substances whose similarity to the query is at least
# +threshold+ and runs the configured prediction algorithm on them. If the
# result carries warnings and the threshold was above 0.2 (and not already
# below the model's minimum), the prediction is repeated once with 0.2.
#
# @param substance [OpenTox::Substance] query substance
# @param threshold [Numeric] minimum neighbor similarity (defaults to algorithms[:similarity][:min])
# @return [Hash] always has :warnings and :measurements; :value and :neighbors
#   are set as far as neighbors were found; :info is set when the query
#   substance itself is part of the training data; further keys come from
#   the prediction algorithm
def predict_substance substance, threshold = self.algorithms[:similarity][:min]

  @independent_variables = Marshal.load($gridfs.find_one(_id: self.independent_variables_id).data)
  case algorithms[:similarity][:method]
  when /tanimoto/ # binary features
    similarity_descriptors = substance.fingerprint algorithms[:descriptors][:type]
    # TODO this excludes descriptors only present in the query substance
    # use for applicability domain?
    query_descriptors = descriptor_ids.collect{|id| similarity_descriptors.include? id}
  when /euclid|cosine/ # quantitative features
    if algorithms[:descriptors][:method] == "calculate_properties" # calculate descriptors
      features = descriptor_ids.collect{|id| Feature.find(id)}
      query_descriptors = substance.calculate_properties(features)
      similarity_descriptors = query_descriptors.collect_with_index{|v,i| (v-descriptor_means[i])/descriptor_sds[i]}
    else
      similarity_descriptors = []
      query_descriptors = []
      descriptor_ids.each_with_index do |id,i|
        prop = substance.properties[id]
        prop = prop.median if prop.is_a? Array # measured
        if prop
          similarity_descriptors[i] = (prop-descriptor_means[i])/descriptor_sds[i]
          query_descriptors[i] = prop
        end
      end
    end
  else
    # the similarity method lives in the algorithms hash; there is no
    # separate `similarity` accessor
    bad_request_error "Unknown descriptor type '#{descriptors}' for similarity method '#{algorithms[:similarity][:method]}'."
  end

  prediction = {:warnings => [], :measurements => []}
  prediction[:warnings] << "Similarity threshold #{threshold} < #{algorithms[:similarity][:min]}, prediction may be out of applicability domain." if threshold < algorithms[:similarity][:min]
  neighbor_ids = []
  neighbor_similarities = []
  neighbor_dependent_variables = []
  neighbor_independent_variables = []

  # find neighbors
  substance_ids.each_with_index do |s,i|
    # handle query substance
    if substance.id.to_s == s
      prediction[:measurements] << dependent_variables[i]
      prediction[:info] = "Substance '#{substance.name}, id:#{substance.id}' has been excluded from neighbors, because it is identical with the query substance."
    else
      if fingerprints?
        neighbor_descriptors = fingerprints[i]
      else
        next if substance.is_a? Nanoparticle and substance.core != Nanoparticle.find(s).core # necessary for nanoparticle properties predictions
        neighbor_descriptors = scaled_variables.collect{|v| v[i]}
      end
      sim = Algorithm.run algorithms[:similarity][:method], [similarity_descriptors, neighbor_descriptors, descriptor_weights]
      if sim >= threshold
        neighbor_ids << s
        neighbor_similarities << sim
        neighbor_dependent_variables << dependent_variables[i]
        # copy column i (this neighbor) of every independent variable row
        independent_variables.each_with_index do |_,j|
          neighbor_independent_variables[j] ||= []
          neighbor_independent_variables[j] << @independent_variables[j][i]
        end
      end
    end
  end

  if neighbor_similarities.empty?
    prediction[:value] = nil
    prediction[:warnings] << "Could not find similar substances with experimental data in the training dataset."
  elsif neighbor_similarities.size == 1
    prediction[:value] = nil
    prediction[:warnings] << "Cannot create prediction: Only one similar compound in the training set."
    prediction[:neighbors] = [{:id => neighbor_ids.first, :similarity => neighbor_similarities.first}]
  else
    query_descriptors.collect!{|d| d ? 1 : 0} if algorithms[:feature_selection] and algorithms[:descriptors][:method] == "fingerprint"
    # call prediction algorithm
    result = Algorithm.run algorithms[:prediction][:method], dependent_variables:neighbor_dependent_variables,independent_variables:neighbor_independent_variables ,weights:neighbor_similarities, query_variables:query_descriptors
    prediction.merge! result
    prediction[:neighbors] = neighbor_ids.collect_with_index{|id,i| {:id => id, :measurement => neighbor_dependent_variables[i], :similarity => neighbor_similarities[i]}}
    #if neighbor_similarities.max < algorithms[:similarity][:warn_min]
      #prediction[:warnings] << "Closest neighbor has similarity < #{algorithms[:similarity][:warn_min]}. Prediction may be out of applicability domain."
    #end
  end
  if prediction[:warnings].empty? or threshold < algorithms[:similarity][:min] or threshold <= 0.2
    prediction
  else # try again with a lower threshold
    predict_substance substance, 0.2
  end
end

#prediction_feature ⇒ OpenTox::Feature

Get prediction feature

Returns:



358
359
360
# File 'lib/model.rb', line 358

# Get prediction feature
#
# Looks up the Feature referenced by prediction_feature_id.
# @return [OpenTox::Feature]
def prediction_feature
  feature_id = prediction_feature_id
  Feature.find feature_id
end

#save ⇒ Object

Save the model

Stores independent_variables in GridFS to avoid Mongo database size limit problems


337
338
339
340
341
# File 'lib/model.rb', line 337

# Save the model
#
# Stores independent_variables in GridFS to avoid Mongo database size limit
# problems, records the id of the stored file in independent_variables_id
# and then delegates to the inherited save.
#
# The upload is skipped when no independent variables are loaded in memory
# and a stored copy already exists. Otherwise saving a model whose variables
# were never accessed would replace the stored variables with a serialized nil.
# NOTE(review): each upload inserts a new GridFS file; the previously
# referenced file is not removed here.
# @return [Object] the result of the inherited save
def save
  if @independent_variables || independent_variables_id.nil?
    file = Mongo::Grid::File.new(Marshal.dump(@independent_variables), :filename => "#{id}.independent_variables")
    self.independent_variables_id = $gridfs.insert_one(file)
  end
  super
end

#substances ⇒ Array<OpenTox::Substance>

Get training substances



370
371
372
# File 'lib/model.rb', line 370

# Get training substances
#
# Looks up one Substance per entry of substance_ids, preserving their order.
# @return [Array<OpenTox::Substance>]
def substances
  substance_ids.map do |substance_id|
    Substance.find substance_id
  end
end

#training_dataset ⇒ OpenTox::Dataset

Get training dataset

Returns:



352
353
354
# File 'lib/model.rb', line 352

# Get training dataset
#
# Looks up the Dataset referenced by training_dataset_id.
# @return [OpenTox::Dataset]
def training_dataset
  dataset_id = training_dataset_id
  Dataset.find dataset_id
end