Module: OpenTox::Validation::RegressionStatistics

Included in: RegressionCrossValidation, RegressionLeaveOneOut, RegressionTrainTest

Defined in: lib/validation-statistics.rb

Overview

Statistical evaluation of regression validations

Instance Attribute Summary collapse

#x ⇒ Object

Returns the value of attribute x.
#y ⇒ Object

Returns the value of attribute y.

Instance Method Summary collapse

#correlation_plot(format: "png") ⇒ Blob

Plot predicted vs measured values.
#statistics ⇒ Hash

Get statistics.
#worst_predictions ⇒ Hash

Get predictions with measurements outside of the prediction interval.

Instance Attribute Details

#x ⇒ `Object`

Returns the value of attribute x.



115
116
117

# File 'lib/validation-statistics.rb', line 115

# Reader for the @x instance variable.
#
# #statistics resets @x to a Hash with the keys :all, :confidence_high and
# :confidence_low (each mapped to an Array) and later passes @x[key] to R
# under the name "measurement".
# NOTE(review): the arrays are presumably filled by insert_prediction, which
# is not shown here - confirm.
def x
  @x
end

#y ⇒ `Object`

Returns the value of attribute y.



115
116
117

# File 'lib/validation-statistics.rb', line 115

# Reader for the @y instance variable.
#
# #statistics resets @y to a Hash with the keys :all, :confidence_high and
# :confidence_low (each mapped to an Array) and later passes @y[key] to R
# under the name "prediction".
# NOTE(review): the arrays are presumably filled by insert_prediction, which
# is not shown here - confirm.
def y
  @y
end

Instance Method Details

#correlation_plot(format: "png") ⇒ `Blob`

Plot predicted vs measured values

Parameters:

format (String, nil) (defaults to: "png")

Returns:

(Blob)

# File 'lib/validation-statistics.rb', line 180

# Plot predicted vs measured values
#
# Builds a scatter plot in R (qplot plus a y = x reference line), writes it to
# a temporary file below /tmp, stores the image in GridFS and records the new
# file id in correlation_plot_id. The plot is regenerated on every call; the
# temporary file is removed afterwards, even when plotting fails.
#
# @param [String, nil] format used as the extension of the temporary file and
#   of the GridFS filename
# @return [Blob] data of the plot stored under correlation_plot_id
def correlation_plot format: "png"
  tmpdir = "/tmp"
  FileUtils.mkdir_p tmpdir
  tmpfile = File.join(tmpdir, "#{id}_correlation.#{format}")
  feature = Feature.find(predictions.first.last["prediction_feature_id"])

  measured = []
  predicted = []
  predictions.each do |_sid, prediction|
    measured << prediction["measurements"].median
    predicted << prediction["value"]
  end

  R.assign "measurement", measured
  R.assign "prediction", predicted
  # Use one common range for both axes so the plot stays square
  R.eval "all = c(measurement,prediction)"
  R.eval "range = c(min(all), max(all))"

  if feature.name.match(/Net cell association/) # ad hoc fix for awkward units
    title = "log2(Net cell association [mL/ug(Mg)])"
  else
    title = feature.name
    title += "-log10(#{feature.unit})" if feature.unit && !feature.unit.blank?
  end
  # Escape backslashes and single quotes: the title is embedded in a
  # single-quoted R string literal and must not terminate it early.
  r_title = title.gsub("\\") { "\\\\" }.gsub("'") { "\\'" }

  R.eval "image = qplot(prediction,measurement,main='#{r_title}',xlab='Prediction',ylab='Measurement',asp=1,xlim=range, ylim=range)"
  R.eval "image = image + geom_abline(intercept=0, slope=1)"
  R.eval "ggsave(file='#{tmpfile}', plot=image)"

  # The image is binary data, so read it without any encoding/newline translation
  file = Mongo::Grid::File.new(File.binread(tmpfile), :filename => "#{id}_correlation_plot.#{format}")
  plot_id = $gridfs.insert_one(file)
  update(:correlation_plot_id => plot_id)
  $gridfs.find_one(_id: correlation_plot_id).data
ensure
  # Do not leave the rendered image behind in /tmp
  FileUtils.rm_f(tmpfile) if tmpfile
end

#statistics ⇒ `Hash`

Get statistics

Returns:

(Hash)

# File 'lib/validation-statistics.rb', line 119

# Get statistics
#
# Resets all statistics fields, passes every usable prediction (one with a
# value and at least one measurement) to insert_prediction for the :all class
# and, depending on its confidence text, for :confidence_high (/Similar/i) or
# :confidence_low (/Low/i). Predictions that are not usable are recorded in
# warnings and logged. Afterwards r_squared is computed in R, mae and rmse are
# divided by the number of predictions of each class, and the record is saved.
# NOTE(review): insert_prediction is not shown here; it presumably accumulates
# error sums, counts and the @x/@y arrays - confirm.
#
# @return [Hash] with the keys :mae, :rmse, :r_squared,
#   :within_prediction_interval, :out_of_prediction_interval and
#   :nr_predictions; each value is a Hash keyed by :all, :confidence_high and
#   :confidence_low
def statistics
  self.warnings = []
  self.rmse = {:all => 0, :confidence_high => 0, :confidence_low => 0}
  self.r_squared = {:all => 0, :confidence_high => 0, :confidence_low => 0}
  self.mae = {:all => 0, :confidence_high => 0, :confidence_low => 0}
  self.within_prediction_interval = {:all => 0, :confidence_high => 0, :confidence_low => 0}
  self.out_of_prediction_interval = {:all => 0, :confidence_high => 0, :confidence_low => 0}
  @x = {:all => [], :confidence_high => [], :confidence_low => []}
  @y = {:all => [], :confidence_high => [], :confidence_low => []}
  self.nr_predictions = {:all => 0, :confidence_high => 0, :confidence_low => 0}

  predictions.each do |cid, pred|
    if pred[:value] && pred[:measurements] && !pred[:measurements].empty?
      insert_prediction pred, :all
      # to_s keeps a prediction without confidence text in :all only instead of raising
      confidence = pred[:confidence].to_s
      if confidence =~ /Similar/i
        insert_prediction pred, :confidence_high
      elsif confidence =~ /Low/i
        insert_prediction pred, :confidence_low
      end
    else
      trd_id = model.training_dataset_id
      smiles = Compound.find(cid).smiles
      message = "No training activities for #{smiles} in training dataset #{trd_id}."
      self.warnings << message
      $logger.debug message
    end
  end

  [:all, :confidence_high, :confidence_low].each do |a|
    # A correlation is only computed for more than two data points
    if @x[a].size > 2
      R.assign "measurement", @x[a]
      R.assign "prediction", @y[a]
      R.eval "r <- cor(measurement,prediction,use='pairwise')"
      self.r_squared[a] = R.eval("r").to_ruby**2
    else
      self.r_squared[a] = 0
    end
    if self.nr_predictions[a] > 0
      self.mae[a] = self.mae[a] / self.nr_predictions[a]
      self.rmse[a] = Math.sqrt(self.rmse[a] / self.nr_predictions[a])
    else
      # No predictions in this class: report the errors as undefined
      self.mae[a] = nil
      self.rmse[a] = nil
    end
  end

  $logger.debug "R^2 #{r_squared}"
  $logger.debug "RMSE #{rmse}"
  $logger.debug "MAE #{mae}"
  $logger.debug "Nr predictions #{nr_predictions}"
  $logger.debug "#{within_prediction_interval} measurements within prediction interval"
  save
  {
    :mae => mae,
    :rmse => rmse,
    :r_squared => r_squared,
    :within_prediction_interval => within_prediction_interval,
    :out_of_prediction_interval => out_of_prediction_interval,
    :nr_predictions => nr_predictions,
  }
end

#worst_predictions ⇒ `Hash`

Get predictions with measurements outside of the prediction interval

Returns:

(Hash)

# File 'lib/validation-statistics.rb', line 216

# Get predictions with measurements outside of the prediction interval
#
# A prediction is selected when it has a prediction interval, a value and at
# least one measurement, and all of its measurements lie below the lower or
# above the upper interval bound. Predictions without measurements are skipped.
# Each selected prediction hash is extended in place with
#   "error"                        - |value - median of the measurements|
#   "distance_prediction_interval" - distance of the nearest measurement to
#                                    the interval bound it violates
#
# @return [Hash] selected predictions sorted by ascending
#   "distance_prediction_interval"
def worst_predictions
  outliers = predictions.select do |_sid, prediction|
    interval = prediction["prediction_interval"]
    measurements = prediction["measurements"]
    # max/min of a missing or empty measurement list cannot be compared
    next false unless interval && prediction["value"] && measurements && !measurements.empty?

    measurements.max < interval[0] || measurements.min > interval[1]
  end.to_h

  outliers.each do |_sid, prediction|
    interval = prediction["prediction_interval"]
    measurements = prediction["measurements"]
    prediction["error"] = (prediction["value"] - measurements.median).abs
    prediction["distance_prediction_interval"] =
      if measurements.max < interval[0]
        (measurements.max - interval[0]).abs
      else
        # the selection above guarantees measurements.min > interval[1] here
        (measurements.min - interval[1]).abs
      end
  end

  outliers.sort_by { |_sid, prediction| prediction["distance_prediction_interval"] }.to_h
end

Module: OpenTox::Validation::RegressionStatistics

Overview

Instance Attribute Summary collapse

Instance Method Summary collapse

Instance Attribute Details

#x ⇒ Object

#y ⇒ Object

Instance Method Details

#correlation_plot(format: "png") ⇒ Blob

#statistics ⇒ Hash

#worst_predictions ⇒ Hash

#x ⇒ `Object`

#y ⇒ `Object`

#correlation_plot(format: "png") ⇒ `Blob`

#statistics ⇒ `Hash`

#worst_predictions ⇒ `Hash`