Class: OpenTox::Model::Lazar
- Includes:
- Mongoid::Document, Mongoid::Timestamps, OpenTox
- Defined in:
- lib/model.rb
Direct Known Subclasses: LazarClassification, LazarRegression
Instance Attribute Summary collapse
-
#independent_variables ⇒ Array<Array>
Get independent variables.
Class Method Summary collapse
-
.create(prediction_feature: nil, training_dataset:, algorithms: {}) ⇒ OpenTox::Model::Lazar
Create a lazar model.
Instance Method Summary collapse
-
#descriptors ⇒ Array<OpenTox::Feature>
Get training descriptors.
-
#fingerprints? ⇒ TrueClass, FalseClass
Are fingerprints used as descriptors?
-
#predict(object) ⇒ Hash, ...
Predict a substance (compound or nanoparticle), an array of substances or a dataset.
-
#predict_substance(substance, threshold = self.algorithms[:similarity][:min].first, prediction = nil) ⇒ Hash
Predict a substance (compound or nanoparticle).
-
#prediction_feature ⇒ OpenTox::Feature
Get prediction feature.
-
#save ⇒ Object
Save the model. Stores independent_variables in GridFS to avoid Mongo database size limit problems.
-
#substances ⇒ Array<OpenTox::Substance>
Get training substances.
-
#training_dataset ⇒ OpenTox::Dataset
Get training dataset.
Instance Attribute Details
Class Method Details
.create(prediction_feature: nil, training_dataset:, algorithms: {}) ⇒ OpenTox::Model::Lazar
Create a lazar model
38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 |
# File 'lib/model.rb', line 38 def self.create prediction_feature:nil, training_dataset:, algorithms:{} raise ArgumentError, "Please provide a training_dataset and a optional prediction_feature." unless prediction_feature or training_dataset prediction_feature ||= training_dataset.features.select{|f| f.is_a? NumericBioActivity or f.is_a? NominalBioActivity}.first unless prediction_feature # guess model type prediction_feature.is_a?(NumericBioActivity) ? model = LazarRegression.new : model = LazarClassification.new model.prediction_feature_id = prediction_feature.id model.training_dataset_id = training_dataset.id model.name = training_dataset.name # git or gem versioning dir = File.dirname(__FILE__) path = File.expand_path("../", File.expand_path(dir)) if Dir.exists?(dir+"/.git") commit = `git rev-parse HEAD`.chomp branch = `git rev-parse --abbrev-ref HEAD`.chomp url = `git config --get remote.origin.url`.chomp model.version = {:url => url, :branch => branch, :commit => commit} else version = File.open(path+"/VERSION", &:gets).chomp url = "https://rubygems.org/gems/lazar/versions/"+version model.version = {:url => url, :branch => "gem", :commit => version} end # set defaults# substance_classes = training_dataset.substances.collect{|s| s.class.to_s}.uniq raise ArgumentError, "Cannot create models for mixed substance classes '#{substance_classes.join ', '}'." 
unless substance_classes.size == 1 if substance_classes.first == "OpenTox::Compound" model.algorithms = { :descriptors => { :method => "fingerprint", :type => "MP2D", }, :feature_selection => nil } if model.class == LazarClassification model.algorithms[:prediction] = { :method => "Algorithm::Classification.weighted_majority_vote", } model.algorithms[:similarity] = { :method => "Algorithm::Similarity.tanimoto", :min => [0.5,0.2], } elsif model.class == LazarRegression model.algorithms[:prediction] = { :method => "Algorithm::Caret.rf", } model.algorithms[:similarity] = { :method => "Algorithm::Similarity.tanimoto", :min => [0.5,0.2], } end elsif substance_classes.first == "OpenTox::Nanoparticle" model.algorithms = { :descriptors => { :method => "properties", :categories => ["P-CHEM"], }, :similarity => { :method => "Algorithm::Similarity.weighted_cosine", :min => [0.5,0.2], }, :prediction => { :method => "Algorithm::Caret.rf", }, :feature_selection => { :method => "Algorithm::FeatureSelection.correlation_filter", }, } else raise ArgumentError, "Cannot create models for #{substance_classes.first}." end # overwrite defaults with explicit parameters algorithms.each do |type,parameters| if parameters and parameters.is_a? 
Hash parameters.each do |p,v| model.algorithms[type] ||= {} model.algorithms[type][p] = v model.algorithms[:descriptors].delete :categories if type == :descriptors and p == :type end else model.algorithms[type] = parameters end end if algorithms # parse dependent_variables from training dataset training_dataset.substances.each do |substance| values = training_dataset.values(substance,model.prediction_feature_id) values.each do |v| model.substance_ids << substance.id.to_s model.dependent_variables << v end if values end descriptor_method = model.algorithms[:descriptors][:method] model.independent_variables = [] case descriptor_method # parse fingerprints when "fingerprint" type = model.algorithms[:descriptors][:type] model.substances.each_with_index do |s,i| model.fingerprints[i] ||= [] model.fingerprints[i] += s.fingerprint(type) model.fingerprints[i].uniq! end model.descriptor_ids = model.fingerprints.flatten.uniq model.descriptor_ids.each do |d| model.independent_variables << model.substance_ids.collect_with_index{|s,i| model.fingerprints[i].include? d} if model.algorithms[:prediction][:method].match /Caret/ end # calculate physchem properties when "calculate_properties" features = model.algorithms[:descriptors][:features] model.descriptor_ids = features.collect{|f| f.id.to_s} model.algorithms[:descriptors].delete(:features) model.algorithms[:descriptors].delete(:type) model.substances.each_with_index do |s,i| props = s.calculate_properties(features) props.each_with_index do |v,j| model.independent_variables[j] ||= [] model.independent_variables[j][i] = v end if props and !props.empty? 
end # parse independent_variables when "properties" categories = model.algorithms[:descriptors][:categories] feature_ids = [] categories.each do |category| Feature.where(category:category).each{|f| feature_ids << f.id.to_s} end properties = model.substances.collect { |s| s.properties } property_ids = properties.collect{|p| p.keys}.flatten.uniq model.descriptor_ids = feature_ids & property_ids model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i] ? p[i].median : nil}} else raise ArgumentError, "Descriptor method '#{descriptor_method}' not implemented." end if model.algorithms[:feature_selection] and model.algorithms[:feature_selection][:method] model = Algorithm.run model.algorithms[:feature_selection][:method], model end # scale independent_variables unless model.fingerprints? model.independent_variables.each_with_index do |var,i| model.descriptor_means[i] = var.mean model.descriptor_sds[i] = var.standard_deviation model.scaled_variables << var.collect{|v| v ? (v-model.descriptor_means[i])/model.descriptor_sds[i] : nil} end end model.save model end |
Instance Method Details
#descriptors ⇒ Array<OpenTox::Feature>
Get training descriptors
402 403 404 |
# File 'lib/model.rb', line 402 def descriptors descriptor_ids.collect{|id| Feature.find(id)} end |
#fingerprints? ⇒ TrueClass, FalseClass
Are fingerprints used as descriptors?
414 415 416 |
# File 'lib/model.rb', line 414 def fingerprints? algorithms[:descriptors][:method] == "fingerprint" ? true : false end |
#predict(object) ⇒ Hash, ...
Predict a substance (compound or nanoparticle), an array of substances or a dataset
304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 |
# File 'lib/model.rb', line 304 def predict object training_dataset = Dataset.find training_dataset_id # parse data substances = [] if object.is_a? Substance substances = [object] elsif object.is_a? Array substances = object elsif object.is_a? Dataset substances = object.substances else raise ArgumentError, "Please provide a OpenTox::Compound an Array of OpenTox::Substances or an OpenTox::Dataset as parameter." end # make predictions predictions = {} substances.each do |c| predictions[c.id.to_s] = predict_substance c if prediction_feature.is_a? NominalBioActivity and predictions[c.id.to_s][:value] prediction_feature.accept_values.each do |v| predictions[c.id.to_s][:probabilities][v] ||= 0.0 # use 0 instead of empty probabilities (happens if all neighbors have the same activity) end end predictions[c.id.to_s][:prediction_feature_id] = prediction_feature_id end # serialize result if object.is_a? Substance prediction = predictions[substances.first.id.to_s] prediction[:neighbors].sort!{|a,b| b[1] <=> a[1]} if prediction[:neighbors]# sort according to similarity return prediction elsif object.is_a? Array return predictions elsif object.is_a? Dataset d = object.copy #warning_feature = Warnings.find_or_create_by(:dataset_id => d.id) confidence_feature = Confidence.find_or_create_by(:dataset_id => d.id) if prediction_feature.is_a? NominalBioActivity f = NominalLazarPrediction.find_or_create_by(:name => prediction_feature.name, :accept_values => prediction_feature.accept_values, :model_id => self.id, :training_feature_id => prediction_feature.id) probability_features = {} prediction_feature.accept_values.each do |v| probability_features[v] = LazarPredictionProbability.find_or_create_by(:name => v, :model_id => self.id, :training_feature_id => prediction_feature.id) end elsif prediction_feature.is_a? 
NumericBioActivity f = NumericLazarPrediction.find_or_create_by(:name => prediction_feature.name, :unit => prediction_feature.unit, :model_id => self.id, :training_feature_id => prediction_feature.id) prediction_interval = [] ["lower","upper"].each do |v| prediction_interval << LazarPredictionInterval.find_or_create_by(:name => v, :model_id => self.id, :training_feature_id => prediction_feature.id) end end # add predictions to dataset predictions.each do |substance_id,p| substance_id = BSON::ObjectId.from_string(substance_id) d.add substance_id,confidence_feature,p[:confidence] unless p[:value].nil? d.add substance_id,f,p[:value] p[:probabilities].each {|name,p| d.add substance_id,probability_features[name],p} if p[:probabilities] p[:prediction_interval].each_with_index {|v,i| d.add substance_id, prediction_interval[i], v } if p[:prediction_interval] end end d.save return d end end |
#predict_substance(substance, threshold = self.algorithms[:similarity][:min].first, prediction = nil) ⇒ Hash
Predict a substance (compound or nanoparticle)
200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 |
# File 'lib/model.rb', line 200 def predict_substance substance, threshold = self.algorithms[:similarity][:min].first, prediction = nil @independent_variables = Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data case algorithms[:similarity][:method] when /tanimoto/ # binary features similarity_descriptors = substance.fingerprint algorithms[:descriptors][:type] # TODO this excludes descriptors only present in the query substance # use for applicability domain? query_descriptors = descriptor_ids.collect{|id| similarity_descriptors.include? id} when /euclid|cosine/ # quantitative features if algorithms[:descriptors][:method] == "calculate_properties" # calculate descriptors features = descriptor_ids.collect{|id| Feature.find(id)} query_descriptors = substance.calculate_properties(features) similarity_descriptors = query_descriptors.collect_with_index{|v,i| (v-descriptor_means[i])/descriptor_sds[i]} else similarity_descriptors = [] query_descriptors = [] descriptor_ids.each_with_index do |id,i| prop = substance.properties[id] prop = prop.median if prop.is_a? Array # measured if prop similarity_descriptors[i] = (prop-descriptor_means[i])/descriptor_sds[i] query_descriptors[i] = prop end end end else raise ArgumentError, "Unknown descriptor type '#{descriptors}' for similarity method '#{similarity[:method]}'." end prediction ||= {:warnings => [], :measurements => []} prediction[:warnings] << "Similarity threshold #{threshold} < #{algorithms[:similarity][:min].first}, prediction may be out of applicability domain." 
if threshold < algorithms[:similarity][:min].first neighbor_ids = [] neighbor_similarities = [] neighbor_dependent_variables = [] neighbor_independent_variables = [] # find neighbors substance_ids.each_with_index do |s,i| # handle query substance if substance.id.to_s == s prediction[:measurements] << dependent_variables[i] unless threshold < algorithms[:similarity][:min].first # add measurements only once at first pass prediction[:info] = "Substance '#{substance.name}, id:#{substance.id}' has been excluded from neighbors, because it is identical with the query substance." else if fingerprints? neighbor_descriptors = fingerprints[i] else next if substance.is_a? Nanoparticle and substance.core != Nanoparticle.find(s).core # necessary for nanoparticle properties predictions neighbor_descriptors = scaled_variables.collect{|v| v[i]} end sim = Algorithm.run algorithms[:similarity][:method], [similarity_descriptors, neighbor_descriptors, descriptor_weights] if sim >= threshold neighbor_ids << s neighbor_similarities << sim neighbor_dependent_variables << dependent_variables[i] independent_variables.each_with_index do |c,j| neighbor_independent_variables[j] ||= [] neighbor_independent_variables[j] << @independent_variables[j][i] end end end end measurements = nil if neighbor_similarities.empty? prediction[:value] = nil prediction[:warnings] << "Could not find similar substances for threshold #{threshold} with experimental data in the training dataset." if threshold == algorithms[:similarity][:min].last prediction[:confidence] = "Out of applicability domain: Could not find similar substances with experimental data in the training dataset (Threshold: #{algorithms[:similarity][:min].last})." return prediction end elsif neighbor_similarities.size == 1 prediction[:value] = nil prediction[:warnings] << "Cannot create prediction: Only one similar compound for threshold #{threshold} in the training set (Threshold: #{algorithms[:similarity][:min].last})." 
prediction[:neighbors] = [{:id => neighbor_ids.first, :measurement => neighbor_dependent_variables[0], :similarity => neighbor_similarities.first}] if threshold == algorithms[:similarity][:min].last prediction[:confidence] = "Out of applicability domain: Only one similar compound in the training set." return prediction end else query_descriptors.collect!{|d| d ? 1 : 0} if algorithms[:feature_selection] and algorithms[:descriptors][:method] == "fingerprint" # call prediction algorithm result = Algorithm.run algorithms[:prediction][:method], dependent_variables:neighbor_dependent_variables,independent_variables:neighbor_independent_variables ,weights:neighbor_similarities, query_variables:query_descriptors prediction.merge! result prediction[:neighbors] = neighbor_ids.collect_with_index{|id,i| {:id => id, :measurement => neighbor_dependent_variables[i], :similarity => neighbor_similarities[i]}} end if threshold == algorithms[:similarity][:min].first if prediction[:warnings].empty? prediction[:confidence] = "Similar to bioassay results" return prediction else # try again with a lower threshold prediction[:warnings] << "Lowering similarity threshold to #{algorithms[:similarity][:min].last}." predict_substance substance, algorithms[:similarity][:min].last, prediction end elsif threshold < algorithms[:similarity][:min].first prediction[:confidence] = "Lower than bioassay results" return prediction end end |
#prediction_feature ⇒ OpenTox::Feature
Get prediction feature
396 397 398 |
# File 'lib/model.rb', line 396 def prediction_feature Feature.find(prediction_feature_id) end |
#save ⇒ Object
Save the model
Stores independent_variables in GridFS to avoid Mongo database size limit problems
375 376 377 378 379 |
# File 'lib/model.rb', line 375 def save file = Mongo::Grid::File.new(Marshal.dump(@independent_variables), :filename => "#{id}.independent_variables") self.independent_variables_id = $gridfs.insert_one(file) super end |
#substances ⇒ Array<OpenTox::Substance>
Get training substances
408 409 410 |
# File 'lib/model.rb', line 408 def substances substance_ids.collect{|id| Substance.find(id)} end |
#training_dataset ⇒ OpenTox::Dataset
Get training dataset
390 391 392 |
# File 'lib/model.rb', line 390 def training_dataset Dataset.find(training_dataset_id) end |