Class: OpenTox::Model::Lazar
- Includes:
- Mongoid::Document, Mongoid::Timestamps, OpenTox
- Defined in:
- lib/model.rb
Direct Known Subclasses
Instance Attribute Summary collapse
-
#independent_variables ⇒ Array<Array>
Get independent variables.
Class Method Summary collapse
-
.create(prediction_feature: nil, training_dataset:, algorithms: {}) ⇒ OpenTox::Model::Lazar
Create a lazar model.
Instance Method Summary collapse
-
#descriptors ⇒ Array<OpenTox::Feature>
Get training descriptors.
-
#fingerprints? ⇒ TrueClass, FalseClass
Are fingerprints used as descriptors.
-
#predict(object) ⇒ Hash, ...
Predict a substance (compound or nanoparticle), an array of substances or a dataset.
-
#predict_substance(substance, threshold = algorithms[:similarity][:min]) ⇒ Hash
Predict a substance (compound or nanoparticle).
-
#prediction_feature ⇒ OpenTox::Feature
Get prediction feature.
-
#save ⇒ Object
Save the model. Stores independent_variables in GridFS to avoid Mongo database size limit problems.
-
#substances ⇒ Array<OpenTox::Substance>
Get training substances.
-
#training_dataset ⇒ OpenTox::Dataset
Get training dataset.
Instance Attribute Details
Class Method Details
.create(prediction_feature: nil, training_dataset:, algorithms: {}) ⇒ OpenTox::Model::Lazar
Create a lazar model
38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 |
# File 'lib/model.rb', line 38

# Create a lazar model from a training dataset.
#
# Workflow: pick the model class from the prediction feature type, record
# provenance (git commit of this library), set per-substance-class default
# algorithms, merge explicit algorithm parameters, extract dependent and
# independent variables from the training data, optionally run feature
# selection, scale quantitative descriptors, and persist the model.
#
# @param prediction_feature [OpenTox::Feature, nil] defaults to the first feature of the training dataset
# @param training_dataset [OpenTox::Dataset] dataset with substances and measurements (required)
# @param algorithms [Hash] explicit algorithm parameters, merged over the defaults
# @return [OpenTox::Model::Lazar] the saved model
def self.create prediction_feature:nil, training_dataset:, algorithms:{}
  bad_request_error "Please provide a prediction_feature and/or a training_dataset." unless prediction_feature or training_dataset
  prediction_feature = training_dataset.features.first unless prediction_feature
  # TODO: prediction_feature without training_dataset: use all available data

  # guess model type: numeric target => regression, otherwise classification
  prediction_feature.numeric? ? model = LazarRegression.new : model = LazarClassification.new

  model.prediction_feature_id = prediction_feature.id
  model.training_dataset_id = training_dataset.id
  model.name = "#{prediction_feature.name} (#{training_dataset.name})"

  # Record the git revision of this library for reproducibility.
  # TODO: check if this works for gem version, add gem versioning?
  dir = File.dirname(__FILE__)
  commit = `cd #{dir}; git rev-parse HEAD`.chomp
  branch = `cd #{dir}; git rev-parse --abbrev-ref HEAD`.chomp
  url = `cd #{dir}; git config --get remote.origin.url`.chomp
  # NOTE(review): backticks return "" (truthy) on failure, so the else branch
  # is likely unreachable; a missing git binary raises Errno::ENOENT instead —
  # verify the intended fallback.
  if branch
    model.version = {:url => url, :branch => branch, :commit => commit}
  else
    model.version = {:warning => "git is not installed"}
  end

  # set defaults depending on the (single) substance class of the training data
  substance_classes = training_dataset.substances.collect{|s| s.class.to_s}.uniq
  bad_request_error "Cannot create models for mixed substance classes '#{substance_classes.join ', '}'." unless substance_classes.size == 1

  if substance_classes.first == "OpenTox::Compound"
    # compounds: MP2D fingerprints + tanimoto similarity
    model.algorithms = {
      :descriptors => {
        :method => "fingerprint",
        :type => "MP2D",
      },
      :feature_selection => nil
    }
    if model.class == LazarClassification
      model.algorithms[:prediction] = {
        :method => "Algorithm::Classification.weighted_majority_vote",
      }
      model.algorithms[:similarity] = {
        :method => "Algorithm::Similarity.tanimoto",
        :min => 0.1,
      }
    elsif model.class == LazarRegression
      model.algorithms[:prediction] = {
        :method => "Algorithm::Caret.rf",
      }
      model.algorithms[:similarity] = {
        :method => "Algorithm::Similarity.tanimoto",
        :min => 0.5,
      }
    end
  elsif substance_classes.first == "OpenTox::Nanoparticle"
    # nanoparticles: P-CHEM properties + weighted cosine similarity + correlation filter
    model.algorithms = {
      :descriptors => {
        :method => "properties",
        :categories => ["P-CHEM"],
      },
      :similarity => {
        :method => "Algorithm::Similarity.weighted_cosine",
        :min => 0.5,
      },
      :prediction => {
        :method => "Algorithm::Caret.rf",
      },
      :feature_selection => {
        :method => "Algorithm::FeatureSelection.correlation_filter",
      },
    }
  else
    bad_request_error "Cannot create models for #{substance_classes.first}."
  end

  # overwrite defaults with explicit parameters
  algorithms.each do |type,parameters|
    if parameters and parameters.is_a? Hash
      parameters.each do |p,v|
        model.algorithms[type] ||= {}
        model.algorithms[type][p] = v
        # an explicit descriptor :type supersedes the default category filter
        model.algorithms[:descriptors].delete :categories if type == :descriptors and p == :type
      end
    else
      model.algorithms[type] = parameters
    end
  end if algorithms

  # parse dependent_variables from training dataset
  # (one entry per measurement, so a substance with several values appears several times)
  training_dataset.substances.each do |substance|
    values = training_dataset.values(substance,model.prediction_feature_id)
    values.each do |v|
      model.substance_ids << substance.id.to_s
      model.dependent_variables << v
    end if values
  end

  descriptor_method = model.algorithms[:descriptors][:method]
  model.independent_variables = []
  case descriptor_method
  # parse fingerprints
  when "fingerprint"
    type = model.algorithms[:descriptors][:type]
    model.substances.each_with_index do |s,i|
      model.fingerprints[i] ||= []
      model.fingerprints[i] += s.fingerprint(type)
      model.fingerprints[i].uniq!
    end
    model.descriptor_ids = model.fingerprints.flatten.uniq
    # binary presence matrix is only materialized for Caret-based predictions
    model.descriptor_ids.each do |d|
      model.independent_variables << model.substance_ids.collect_with_index{|s,i| model.fingerprints[i].include? d} if model.algorithms[:prediction][:method].match /Caret/
    end
  # calculate physchem properties
  when "calculate_properties"
    features = model.algorithms[:descriptors][:features]
    model.descriptor_ids = features.collect{|f| f.id.to_s}
    model.algorithms[:descriptors].delete(:features)
    model.algorithms[:descriptors].delete(:type)
    model.substances.each_with_index do |s,i|
      props = s.calculate_properties(features)
      # independent_variables is descriptor-major: [descriptor][substance]
      props.each_with_index do |v,j|
        model.independent_variables[j] ||= []
        model.independent_variables[j][i] = v
      end if props and !props.empty?
    end
  # parse independent_variables from measured properties
  when "properties"
    categories = model.algorithms[:descriptors][:categories]
    feature_ids = []
    categories.each do |category|
      Feature.where(category:category).each{|f| feature_ids << f.id.to_s}
    end
    properties = model.substances.collect { |s| s.properties }
    property_ids = properties.collect{|p| p.keys}.flatten.uniq
    # keep only descriptors that are both requested and actually measured
    model.descriptor_ids = feature_ids & property_ids
    # multiple measurements per property are reduced to their median
    model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i] ? p[i].median : nil}}
  else
    bad_request_error "Descriptor method '#{descriptor_method}' not implemented."
  end

  if model.algorithms[:feature_selection] and model.algorithms[:feature_selection][:method]
    model = Algorithm.run model.algorithms[:feature_selection][:method], model
  end

  # scale independent_variables to z-scores (fingerprints are already binary)
  unless model.fingerprints?
    model.independent_variables.each_with_index do |var,i|
      model.descriptor_means[i] = var.mean
      model.descriptor_sds[i] = var.standard_deviation
      model.scaled_variables << var.collect{|v| v ? (v-model.descriptor_means[i])/model.descriptor_sds[i] : nil}
    end
  end
  model.save
  model
end
Instance Method Details
#descriptors ⇒ Array<OpenTox::Feature>
Get training descriptors
364 365 366 |
# File 'lib/model.rb', line 364

# Resolve the stored descriptor ids into Feature objects.
# @return [Array<OpenTox::Feature>] training descriptors
def descriptors
  descriptor_ids.map { |fid| Feature.find(fid) }
end
#fingerprints? ⇒ TrueClass, FalseClass
Are fingerprints used as descriptors
376 377 378 |
# File 'lib/model.rb', line 376

# Are fingerprints used as descriptors?
# The comparison already yields a boolean, so the original
# `… ? true : false` ternary was redundant and has been removed.
# @return [TrueClass, FalseClass]
def fingerprints?
  algorithms[:descriptors][:method] == "fingerprint"
end
#predict(object) ⇒ Hash, ...
Predict a substance (compound or nanoparticle), an array of substances or a dataset
289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 |
# File 'lib/model.rb', line 289 def predict object training_dataset = Dataset.find training_dataset_id # parse data substances = [] if object.is_a? Substance substances = [object] elsif object.is_a? Array substances = object elsif object.is_a? Dataset substances = object.substances else bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Substances or an OpenTox::Dataset as parameter." end # make predictions predictions = {} substances.each do |c| predictions[c.id.to_s] = predict_substance c predictions[c.id.to_s][:prediction_feature_id] = prediction_feature_id end # serialize result if object.is_a? Substance prediction = predictions[substances.first.id.to_s] prediction[:neighbors].sort!{|a,b| b[1] <=> a[1]} if prediction[:neighbors]# sort according to similarity return prediction elsif object.is_a? Array return predictions elsif object.is_a? Dataset # prepare prediction dataset measurement_feature = Feature.find prediction_feature_id prediction_feature = NumericFeature.find_or_create_by( "name" => measurement_feature.name + " (Prediction)" ) prediction_dataset = LazarPrediction.create( :name => "Lazar prediction for #{prediction_feature.name}", :creator => __FILE__, :prediction_feature_id => prediction_feature.id, :predictions => predictions ) return prediction_dataset end end |
#predict_substance(substance, threshold = algorithms[:similarity][:min]) ⇒ Hash
Predict a substance (compound or nanoparticle)
197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 |
# File 'lib/model.rb', line 197

# Predict a substance (compound or nanoparticle) from its neighbors in the
# training set: compute query descriptors, collect training substances with
# similarity >= threshold, and run the configured prediction algorithm on
# them. If the prediction produced warnings at the default threshold, the
# method retries itself once with a relaxed threshold of 0.2.
#
# @param substance [OpenTox::Substance] query substance
# @param threshold [Numeric] minimum similarity for neighbors
#   (defaults to the model's configured similarity minimum)
# @return [Hash] prediction with :value, :warnings, :measurements, :neighbors, ...
def predict_substance substance, threshold = self.algorithms[:similarity][:min]
  # Lazily load the (large) training descriptor matrix from GridFS.
  # NOTE(review): Marshal.load on stored data — safe only as long as the
  # GridFS content is exclusively written by Lazar#save; never feed it
  # untrusted input.
  @independent_variables = Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data
  case algorithms[:similarity][:method]
  when /tanimoto/ # binary features
    similarity_descriptors = substance.fingerprint algorithms[:descriptors][:type]
    # TODO this excludes descriptors only present in the query substance
    # use for applicability domain?
    query_descriptors = descriptor_ids.collect{|id| similarity_descriptors.include? id}
  when /euclid|cosine/ # quantitative features
    if algorithms[:descriptors][:method] == "calculate_properties" # calculate descriptors
      features = descriptor_ids.collect{|id| Feature.find(id)}
      query_descriptors = substance.calculate_properties(features)
      # z-score scaling with the training means/sds, matching scaled_variables
      similarity_descriptors = query_descriptors.collect_with_index{|v,i| (v-descriptor_means[i])/descriptor_sds[i]}
    else
      similarity_descriptors = []
      query_descriptors = []
      descriptor_ids.each_with_index do |id,i|
        prop = substance.properties[id]
        prop = prop.median if prop.is_a? Array # measured
        if prop
          similarity_descriptors[i] = (prop-descriptor_means[i])/descriptor_sds[i]
          query_descriptors[i] = prop
        end
      end
    end
  else
    # NOTE(review): `similarity[:method]` looks like a bug — there is no local
    # `similarity`; this would raise NameError. Probably meant
    # algorithms[:similarity][:method]. Confirm before touching.
    bad_request_error "Unknown descriptor type '#{descriptors}' for similarity method '#{similarity[:method]}'."
  end

  prediction = {:warnings => [], :measurements => []}
  prediction[:warnings] << "Similarity threshold #{threshold} < #{algorithms[:similarity][:min]}, prediction may be out of applicability domain." if threshold < algorithms[:similarity][:min]
  neighbor_ids = []
  neighbor_similarities = []
  neighbor_dependent_variables = []
  neighbor_independent_variables = []

  # find neighbors among the training substances
  substance_ids.each_with_index do |s,i|
    # handle query substance: keep its measurement but exclude it as a neighbor
    if substance.id.to_s == s
      prediction[:measurements] << dependent_variables[i]
      prediction[:info] = "Substance '#{substance.name}, id:#{substance.id}' has been excluded from neighbors, because it is identical with the query substance."
    else
      if fingerprints?
        neighbor_descriptors = fingerprints[i]
      else
        next if substance.is_a? Nanoparticle and substance.core != Nanoparticle.find(s).core # necessary for nanoparticle properties predictions
        neighbor_descriptors = scaled_variables.collect{|v| v[i]}
      end
      sim = Algorithm.run algorithms[:similarity][:method], [similarity_descriptors, neighbor_descriptors, descriptor_weights]
      if sim >= threshold
        neighbor_ids << s
        neighbor_similarities << sim
        neighbor_dependent_variables << dependent_variables[i]
        # collect the neighbor's column of the descriptor matrix
        independent_variables.each_with_index do |c,j|
          neighbor_independent_variables[j] ||= []
          neighbor_independent_variables[j] << @independent_variables[j][i]
        end
      end
    end
  end

  measurements = nil

  if neighbor_similarities.empty?
    prediction[:value] = nil
    prediction[:warnings] << "Could not find similar substances with experimental data in the training dataset."
  elsif neighbor_similarities.size == 1
    prediction[:value] = nil
    prediction[:warnings] << "Cannot create prediction: Only one similar compound in the training set."
    prediction[:neighbors] = [{:id => neighbor_ids.first, :similarity => neighbor_similarities.first}]
  else
    # feature-selected fingerprint models expect 0/1 vectors, not booleans
    query_descriptors.collect!{|d| d ? 1 : 0} if algorithms[:feature_selection] and algorithms[:descriptors][:method] == "fingerprint"
    # call prediction algorithm
    result = Algorithm.run algorithms[:prediction][:method], dependent_variables:neighbor_dependent_variables,independent_variables:neighbor_independent_variables ,weights:neighbor_similarities, query_variables:query_descriptors
    prediction.merge! result
    prediction[:neighbors] = neighbor_ids.collect_with_index{|id,i| {:id => id, :measurement => neighbor_dependent_variables[i], :similarity => neighbor_similarities[i]}}
    #if neighbor_similarities.max < algorithms[:similarity][:warn_min]
    #prediction[:warnings] << "Closest neighbor has similarity < #{algorithms[:similarity][:warn_min]}. Prediction may be out of applicability domain."
    #end
  end

  # Retry once with threshold 0.2 when the first pass produced warnings;
  # the `threshold < algorithms[:similarity][:min] or threshold <= 0.2`
  # guards stop the recursion after that single retry.
  if prediction[:warnings].empty? or threshold < algorithms[:similarity][:min] or threshold <= 0.2
    prediction
  else # try again with a lower threshold
    predict_substance substance, 0.2
  end
end
#prediction_feature ⇒ OpenTox::Feature
Get prediction feature
358 359 360 |
# File 'lib/model.rb', line 358

# Look up the model's prediction feature by its stored id.
# @return [OpenTox::Feature]
def prediction_feature
  fid = prediction_feature_id
  Feature.find fid
end
#save ⇒ Object
Save the model
Stores independent_variables in GridFS to avoid Mongo database size limit problems
337 338 339 340 341 |
# File 'lib/model.rb', line 337

# Save the model.
# The (potentially huge) independent variables are serialized and stored in
# GridFS, keeping the Mongoid document itself below Mongo's size limit; only
# the GridFS id is kept on the document before delegating to the normal save.
def save
  blob = Marshal.dump(@independent_variables)
  gridfile = Mongo::Grid::File.new(blob, :filename => "#{id}.independent_variables")
  self.independent_variables_id = $gridfs.insert_one(gridfile)
  super
end
#substances ⇒ Array<OpenTox::Substance>
Get training substances
370 371 372 |
# File 'lib/model.rb', line 370

# Resolve the stored substance ids into Substance objects.
# @return [Array<OpenTox::Substance>] training substances
def substances
  substance_ids.map { |sid| Substance.find(sid) }
end
#training_dataset ⇒ OpenTox::Dataset
Get training dataset
352 353 354 |
# File 'lib/model.rb', line 352

# Look up the training dataset by its stored id.
# @return [OpenTox::Dataset]
def training_dataset
  did = training_dataset_id
  Dataset.find did
end