Class: OpenTox::Dataset

Inherits:
Object
Defined in:
lib/dataset.rb

Overview

Collection of substances and features

Class Method Summary

Instance Method Summary

Class Method Details

.from_csv_file(file) ⇒ OpenTox::Dataset

Create a dataset from a CSV file

Parameters:

  • file (File)

    Input file with the following format:

    • ID column (optional): header containing an “ID” string, arbitrary ID values

    • SMILES/InChI column: header indicating “SMILES” or “InChI”, SMILES or InChI strings

    • one or more property column(s): header with property name(s), property values. Files with a single property column are read as BioActivities (i.e. the dependent variable); files with multiple property columns are read as SubstanceProperties (i.e. independent variables).

Returns:

  • (OpenTox::Dataset)



# File 'lib/dataset.rb', line 137

def self.from_csv_file file
  md5 = Digest::MD5.hexdigest(File.read(file)) # use hash to identify identical files
  dataset = self.find_by(:md5 => md5)
  if dataset
    $logger.debug "Found #{file} in the database (id: #{dataset.id}, md5: #{dataset.md5}), skipping import."
  else
    $logger.debug "Parsing #{file}."
    table = nil
    sep = ","
    ["\t",";"].each do |s| # guess alternative CSV separator
      if File.readlines(file).first.match(/#{s}/)
        sep = s
        break
      end
    end
    table = CSV.read file, :col_sep => sep, :skip_blanks => true, :encoding => 'windows-1251:utf-8'
    if table
      dataset = self.new(:source => file, :name => File.basename(file,".*"), :md5 => md5)
      dataset.parse_table table
    else
      raise ArgumentError, "#{file} is not a valid CSV/TSV file. Could not find \",\", \";\" or TAB as column separator."
    end
  end
  dataset
end
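
A minimal usage sketch (not part of the library documentation; the file name and CSV layout below are assumptions):

# hypothetical input file hamster_carcinogenicity.csv:
#   ID,SMILES,Carcinogenicity
#   1,CC(=O)N,false
#   2,O=C(N)N,true
require 'lazar'   # assumption: the lazar gem provides OpenTox::Dataset
dataset = OpenTox::Dataset.from_csv_file "hamster_carcinogenicity.csv"
dataset.bioactivity_features.first.name   # => "Carcinogenicity" (a single property column is read as a BioActivity)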

.from_pubchem_aid(aid) ⇒ OpenTox::Dataset

Create a dataset from a PubChem assay

Parameters:

  • aid (Integer)

    PubChem AssayID (AID)

Returns:

  • (OpenTox::Dataset)



# File 'lib/dataset.rb', line 220

def self.from_pubchem_aid aid
  # TODO get regression data
  aid_url = File.join PUBCHEM_URI, "assay/aid/#{aid}"
  assay_metadata = JSON.parse(RestClientWrapper.get(File.join aid_url,"description/JSON").to_s)["PC_AssayContainer"][0]["assay"]["descr"]
  name = assay_metadata["name"].gsub(/\s+/,"_")
  dataset = self.new(:source => aid_url, :name => name) 
  # Get assay data in chunks
  # Assay record retrieval is limited to 10000 SIDs
  # https://pubchemdocs.ncbi.nlm.nih.gov/pug-rest-tutorial$_Toc458584435
  list = JSON.parse(RestClientWrapper.get(File.join aid_url, "sids/JSON?list_return=listkey").to_s)["IdentifierList"]
  listkey = list["ListKey"]
  size = list["Size"]
  start = 0
  csv = []
  while start < size
    url = File.join aid_url, "CSV?sid=listkey&listkey=#{listkey}&listkey_start=#{start}&listkey_count=10000"
    csv += CSV.parse(RestClientWrapper.get(url).to_s).select{|r| r[0].match /^\d/} # discard header rows
    start += 10000
  end
  table = [["SID","SMILES",name]]
  csv.each_slice(100) do |slice| # get SMILES in chunks
    cids = slice.collect{|s| s[2]}
    pubchem_cids = []
    JSON.parse(RestClientWrapper.get(File.join(PUBCHEM_URI,"compound/cid/#{cids.join(",")}/property/CanonicalSMILES/JSON")).to_s)["PropertyTable"]["Properties"].each do |prop|
      i = cids.index(prop["CID"].to_s)
      value = slice[i][3]
      if value == "Active" or value == "Inactive"
        table << [slice[i][1].to_s,prop["CanonicalSMILES"],slice[i][3].to_s]
        pubchem_cids << prop["CID"].to_s
      else
        dataset.warnings << "Ignoring CID #{prop["CID"]}/ SMILES #{prop["CanonicalSMILES"]}, because PubChem activity is #{value}."
      end
    end
    (cids-pubchem_cids).each { |cid| dataset.warnings << "Could not retrieve SMILES for CID #{cid}, all entries are ignored." }
  end
  dataset.parse_table table
  dataset
end
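
A hedged usage sketch (the AID below is an arbitrary example value):

dataset = OpenTox::Dataset.from_pubchem_aid 1205   # hypothetical PubChem AssayID
dataset.warnings.each { |w| puts w }               # CIDs without SMILES or with non-binary activities are reported here
dataset.compounds.size                             # number of successfully imported compounds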

.from_sdf_file(file) ⇒ OpenTox::Dataset

Create a dataset from an SDF file

Files with a single data field are read as BioActivities (i.e. the dependent variable); files with multiple data fields are read as SubstanceProperties (i.e. independent variables)

Parameters:

  • file (File)

    Input SDF file

Returns:

  • (OpenTox::Dataset)



# File 'lib/dataset.rb', line 168

def self.from_sdf_file file
  md5 = Digest::MD5.hexdigest(File.read(file)) # use hash to identify identical files
  dataset = self.find_by(:md5 => md5)
  if dataset
    $logger.debug "Found #{file} in the database (id: #{dataset.id}, md5: #{dataset.md5}), skipping import."
  else
    $logger.debug "Parsing #{file}."

    dataset = self.new(:source => file, :name => File.basename(file,".*"), :md5 => md5)
    original_id = OriginalId.find_or_create_by(:dataset_id => dataset.id,:name => dataset.name+".ID")

    read_result = false
    sdf = ""
    feature_name = ""
    compound = nil
    features = {}
    table = [["ID","SMILES"]]

    File.readlines(file).each do |line|
      if line.match %r{\$\$\$\$}
        sdf << line
        id = sdf.split("\n").first.chomp
        compound = Compound.from_sdf sdf
        row = [id,compound.smiles]
        features.each do |f,v|
          table[0] << f unless table[0].include? f
          row[table[0].index(f)] = v
        end
        table << row
        sdf = ""
        features = {}
      elsif line.match /^>\s+</
        feature_name = line.match(/^>\s+<(.*)>/)[1]
        read_result = true
      else
        if read_result
          value = line.chomp
          features[feature_name] = value
          read_result = false
        else
          sdf << line
        end
      end
    end
    dataset.parse_table table
  end
  dataset
end
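
A minimal sketch, assuming an SDF file that follows the data field convention above (the file name is hypothetical):

dataset = OpenTox::Dataset.from_sdf_file "mutagenicity.sdf"
# a single data field per record is imported as a BioActivity,
# several data fields per record as SubstanceProperties
dataset.bioactivity_features.each { |f| puts f.name }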

.merge(datasets:, features:, value_maps:, keep_original_features:, remove_duplicates:) ⇒ OpenTox::Dataset

Merge an array of datasets

Parameters:

  • datasets (Array<OpenTox::Dataset>)

    Datasets to be merged

  • features (Array<OpenTox::Feature>)

    Features to be merged (same size as datasets)

  • value_maps (Array<Hash>)

    Value transformations (use nil to keep original values; same size as datasets)

  • keep_original_features (Bool)

    Copy original features/values to the merged dataset

  • remove_duplicates (Bool)

    Delete duplicated values (assuming they come from the same experiment)

Returns:

  • (OpenTox::Dataset)



# File 'lib/dataset.rb', line 496

def self.merge datasets: , features: , value_maps: , keep_original_features: , remove_duplicates: 
  dataset = self.create(:source => datasets.collect{|d| d.id.to_s}.join(", "), :name => datasets.collect{|d| d.name}.uniq.join(", ")+" merged")

  datasets.each do |d|
    dataset.data_entries += d.data_entries
    dataset.warnings += d.warnings
  end if keep_original_features

  feature_classes = features.collect{|f| f.class}.uniq
  merged_feature = nil
  if feature_classes.size == 1
    if features.first.kind_of? NominalFeature
      merged_feature = MergedNominalBioActivity.find_or_create_by(:name => features.collect{|f| f.name}.uniq.join(" and ") + " merged", :original_feature_ids => features.collect{|f| f.id}, :transformations => value_maps)
    else
      merged_feature = MergedNumericBioActivity.find_or_create_by(:name => features.collect{|f| f.name}.uniq.join(" and ") + " merged", :original_feature_ids => features.collect{|f| f.id}) # TODO: regression transformations 
    end
  else
    raise ArgumentError, "Cannot merge features of different types (#{feature_classes})."
  end

  accept_values = []
  features.each_with_index do |f,i|
    dataset.data_entries += datasets[i].data_entries.select{|de| de[1] == f.id}.collect do |de|
      value_maps[i] ?  v = value_maps[i][de[2]] : v = de[2]
      accept_values << v
      [de[0],merged_feature.id,v]
    end
  end

  if merged_feature.is_a? MergedNominalBioActivity
    merged_feature.accept_values = accept_values.uniq.sort
    merged_feature.save
  end

  dataset.data_entries.uniq! if remove_duplicates
  dataset.save
  dataset
end
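
A usage sketch under assumptions (the dataset and feature variables as well as the value map are invented for illustration):

merged = OpenTox::Dataset.merge(
  :datasets => [mouse_dataset, rat_dataset],        # hypothetical datasets
  :features => [mouse_activity, rat_activity],      # one feature per dataset
  :value_maps => [nil, {"positive" => "active", "negative" => "inactive"}],
  :keep_original_features => false,
  :remove_duplicates => true
)
merged.features.first   # => the MergedNominalBioActivity (both input features are nominal)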

Instance Method Details

#add(substance, feature, value) ⇒ Object

Add a value for a given substance and feature

Parameters:

  • substance (OpenTox::Substance, BSON::ObjectId, String)

    substance object or substance id

  • feature (OpenTox::Feature, BSON::ObjectId, String)

    feature object or feature id

  • value (TrueClass, FalseClass, Float)


# File 'lib/dataset.rb', line 121

def add(substance,feature,value)
  substance = substance.id if substance.is_a? Substance
  feature = feature.id if feature.is_a? Feature
  data_entries << [substance,feature,value] if substance and feature and value
end
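
A short sketch (the dataset, substance and feature variables are hypothetical; both objects and ids are accepted):

dataset.add compound, nominal_feature, true        # substance/feature objects ...
dataset.add compound.id, numeric_feature.id, 1.23  # ... or their ids
dataset.save                                       # persist the new data_entries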

#bioactivity_features ⇒ Array<OpenTox::NominalBioActivity,OpenTox::NumericBioActivity>

Get nominal and numeric bioactivity features



# File 'lib/dataset.rb', line 81

def bioactivity_features
  features.select{|f| f._type.match(/BioActivity/)}
end

#compounds ⇒ Array<OpenTox::Compound>

Get all compounds

Returns:

  • (Array<OpenTox::Compound>)



# File 'lib/dataset.rb', line 19

def compounds
  substances.select{|s| s.is_a? Compound}
end

#confidence_feature ⇒ OpenTox::Confidence

Get Confidence feature

Returns:

  • (OpenTox::Confidence)



# File 'lib/dataset.rb', line 75

def confidence_feature
  features.select{|f| f.is_a?(Confidence)}.first
end

#copy ⇒ Object

Copy a dataset

Returns:

  • (OpenTox::Dataset)

    dataset copy



# File 'lib/dataset.rb', line 449

def copy
  dataset = Dataset.new
  dataset.data_entries = data_entries
  dataset.warnings = warnings
  dataset.name = name
  dataset.source = id.to_s
  dataset.save
  dataset
end

#features ⇒ Array<OpenTox::Feature>

Get all features

Returns:

  • (Array<OpenTox::Feature>)


# File 'lib/dataset.rb', line 38

def features
  @features ||= data_entries.collect{|row| OpenTox::Feature.find(row[1])}.uniq
  @features
end

#folds(n) ⇒ Array

Split a dataset into n folds

Parameters:

  • n (Integer)

    number of folds

Returns:

  • (Array)

    Array with folds [training_dataset,test_dataset]



# File 'lib/dataset.rb', line 462

def folds n
  $logger.debug "Creating #{n} folds for #{name}."
  len = self.substances.size
  indices = (0..len-1).to_a.shuffle
  mid = (len/n)
  chunks = []
  start = 0
  1.upto(n) do |i|
    last = start+mid
    last = last-1 unless len%n >= i
    test_idxs = indices[start..last] || []
    test_substances = test_idxs.collect{|i| substances[i].id}
    training_idxs = indices-test_idxs
    training_substances = training_idxs.collect{|i| substances[i].id}
    chunk = [training_substances,test_substances].collect do |substances|
      self.class.create(
        :name => "#{self.name} (Fold #{i-1})",
        :source => self.id,
        :data_entries => data_entries.select{|row| substances.include? row[0]}
      )
    end
    start = last+1
    chunks << chunk
  end
  chunks
end
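
A cross-validation style sketch (10 folds; the loop body is illustrative only):

dataset.folds(10).each do |training_dataset, test_dataset|
  # train a model on training_dataset and evaluate it on test_dataset ...
  puts "#{training_dataset.substances.size} training / #{test_dataset.substances.size} test substances"
end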

#merged_features ⇒ Array<OpenTox::MergedNominalBioActivity,OpenTox::MergedNumericBioActivity>

Get nominal and numeric merged features



# File 'lib/dataset.rb', line 111

def merged_features
  features.select{|f| f._type.match("Merged")}
end

#nanoparticles ⇒ Array<OpenTox::Nanoparticle>

Get all nanoparticles



# File 'lib/dataset.rb', line 25

def nanoparticles
  substances.select{|s| s.is_a? Nanoparticle}
end

#original_id_features ⇒ Array<OpenTox::OriginalId>

Get OriginalId features

Returns:

  • (Array<OpenTox::OriginalId>)



# File 'lib/dataset.rb', line 57

def original_id_features
  features.select{|f| f.is_a?(OriginalId)}
end

#original_smiles_features ⇒ Array<OpenTox::OriginalSmiles>

Get OriginalSmiles features

Returns:

  • (Array<OpenTox::OriginalSmiles>)



# File 'lib/dataset.rb', line 63

def original_smiles_features
  features.select{|f| f.is_a?(OriginalSmiles)}
end

#parse_table(table) ⇒ Object

Parse data in tabular format (e.g. from CSV)

Does a lot of guesswork in order to determine feature types

Parameters:

  • table (Array<Array>)

    Table of rows; the first row contains the feature names

Raises:

  • (ArgumentError)


# File 'lib/dataset.rb', line 262

def parse_table table

  # features
  feature_names = table.shift.collect{|f| f.strip}
  raise ArgumentError, "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size

  if feature_names[0] !~ /SMILES|InChI/i # check ID column
    original_id = OriginalId.find_or_create_by(:dataset_id => self.id,:name => feature_names.shift)
  else
    original_id = OriginalId.find_or_create_by(:dataset_id => self.id,:name => "LineID")
  end

  compound_format = feature_names.shift
  raise ArgumentError, "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i
  original_smiles = OriginalSmiles.find_or_create_by(:dataset_id => self.id) if compound_format.match(/SMILES/i)

  numeric = []
  features = []

  # guess feature types
  bioactivity = true if feature_names.size == 1

  feature_names.each_with_index do |f,i|
    original_id.name.match(/LineID$/) ? j = i+1 : j = i+2
    values = table.collect{|row| val=row[j].to_s.strip; val.blank? ? nil : val }.uniq.compact
    types = values.collect{|v| v.numeric? ? true : false}.uniq
    feature = nil
    if values.size == 0 # empty feature
    elsif  values.size > 5 and types.size == 1 and types.first == true # 5 max classes
      numeric[i] = true
      bioactivity ?  feature = NumericBioActivity.find_or_create_by(:name => f) : feature = NumericSubstanceProperty.find_or_create_by(:name => f)
    else
      numeric[i] = false
      bioactivity ?  feature = NominalBioActivity.find_or_create_by(:name => f, :accept_values => values.sort) : feature = NominalSubstanceProperty.find_or_create_by(:name => f, :accept_values => values.sort)
    end
    features << feature if feature
  end
  
  # substances and values

  all_substances = []
  table.each_with_index do |vals,i|
    original_id.name.match(/LineID$/) ? original_id_value = i+1 : original_id_value = vals.shift.to_s.strip
    identifier = vals.shift.strip
    begin
      case compound_format
      when /SMILES/i
        substance = Compound.from_smiles(identifier)
        add substance, original_smiles, identifier
      when /InChI/i
        substance = Compound.from_inchi(identifier)
      end
    rescue 
      substance = nil
    end

    if substance.nil? # compound parsers may return nil
      warnings << "Cannot parse #{compound_format} compound '#{identifier}' at line #{i+2} of #{source}, all entries are ignored."
      next
    end

    all_substances << substance
    add substance, original_id, original_id_value 

    vals.each_with_index do |v,j|
      if v.blank?
        warnings << "Empty value for compound '#{identifier}' (#{original_id_value}) and feature '#{feature_names[j]}'."
        next
      elsif numeric[j]
        v = v.to_f
      else
        v = v.strip
      end
      add substance, features[j], v
    end
  end

  warnings_feature = Warnings.find_or_create_by(:dataset_id => id)
  all_substances.duplicates.each do |substance|
    positions = []
    all_substances.each_with_index{|c,i| positions << i+1 if !c.blank? and c.smiles and c.smiles == substance.smiles}
    all_substances.select{|s| s.smiles == substance.smiles}.each do |s|
      add s, warnings_feature, "Duplicated compound #{substance.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments." 
    end
  end
  save
end
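
A minimal sketch of the expected table layout (header row first, all values as strings; names and values are hypothetical):

table = [
  ["ID", "SMILES",                "Mutagenicity"],
  ["1",  "CC(=O)N",               "non-mutagenic"],
  ["2",  "O=[N+]([O-])c1ccccc1",  "mutagenic"]
]
dataset = OpenTox::Dataset.new(:name => "example")
dataset.parse_table table   # two distinct values => a NominalBioActivity feature is created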

#prediction_feature ⇒ OpenTox::NominalLazarPrediction, OpenTox::NumericLazarPrediction

Get the nominal or numeric prediction feature



# File 'lib/dataset.rb', line 99

def prediction_feature
  features.select{|f| f._type.match(/Prediction$/)}.first
end

#prediction_supporting_features ⇒ Array<OpenTox::LazarPredictionProbability,OpenTox::LazarPredictionInterval>

Get supporting nominal and numeric prediction features (class probabilities, prediction interval)



# File 'lib/dataset.rb', line 105

def prediction_supporting_features
  features.select{|f| f.is_a?(LazarPredictionProbability) or f.is_a?(LazarPredictionInterval)}
end

#predictions ⇒ Hash

Get lazar predictions from a dataset

Returns:

  • (Hash)

    predictions



# File 'lib/dataset.rb', line 426

def predictions
  predictions = {}
  substances.each do |s| 
    predictions[s] ||= {}
    predictions[s][:value] = values(s,prediction_feature).first
    #predictions[s][:warnings] = []
    #warnings_features.each { |w| predictions[s][:warnings] += values(s,w) }
    predictions[s][:confidence] = values(s,confidence_feature).first
    if predictions[s][:value] and prediction_feature.is_a? NominalLazarPrediction
      prediction_feature.accept_values.each do |v|
        f = LazarPredictionProbability.find_by(:name => v, :model_id => prediction_feature.model_id, :training_feature_id => prediction_feature.training_feature_id)
        predictions[s][:probabilities] ||= {}
        predictions[s][:probabilities][v] = values(s,f).first
      end
    end
  end
  predictions
end
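
A sketch of iterating over the returned hash for a lazar prediction dataset of compounds (variable names are illustrative):

dataset.predictions.each do |substance, prediction|
  puts "#{substance.smiles}: #{prediction[:value]} (confidence: #{prediction[:confidence]})"
  # prediction[:probabilities] is only present for nominal predictions
end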

#substance_property_features ⇒ Array<OpenTox::NominalSubstanceProperty,OpenTox::NumericSubstanceProperty>

Get nominal and numeric substance property features



# File 'lib/dataset.rb', line 93

def substance_property_features
  features.select{|f| f._type.match("SubstanceProperty")}
end

#substances ⇒ Array<OpenTox::Substance>

Get all substances



# File 'lib/dataset.rb', line 31

def substances
  @substances ||= data_entries.collect{|row| OpenTox::Substance.find row[0]}.uniq
  @substances
end

#to_prediction_csv ⇒ String

Convert a lazar prediction dataset to CSV format

Returns:

  • (String)



# File 'lib/dataset.rb', line 354

def to_prediction_csv
  
  compound = substances.first.is_a? Compound
  header = ["ID"]
  header << "Original SMILES" if compound
  compound ? header << "Canonical SMILES" : header << "Name"
  header << "Prediction" if prediction_feature
  header << "Confidence" if confidence_feature
  header += prediction_supporting_features.collect{|f| f.name}
  header << "Measurements" 
  csv = [header]

  substances.each do |substance|
    row = original_id_features.collect{|f| values(substance,f).join(" ")}
    row += original_smiles_features.collect{|f| values(substance,f).join(" ")} if compound
    compound ? row << substance.smiles : row << substance.name
    row << values(substance,prediction_feature).join(" ")
    row << values(substance,confidence_feature).join(" ")
    row += prediction_supporting_features.collect{|f| values(substance,f).join(" ")}
    row << values(substance,bioactivity_features[0]).join(" ")
    csv << row
  end
  csv.collect{|r| r.join(",")}.join("\n")
end

#to_sdf ⇒ String

Convert dataset to SDF format

Returns:

  • (String)



# File 'lib/dataset.rb', line 405

def to_sdf
  sdf = ""
  compounds.each do |compound|
    sdf_lines = compound.sdf.sub(/\$\$\$\$\n/,"").split("\n")
    sdf_lines[0] = compound.smiles
    sdf += sdf_lines.join("\n")
    bioactivity_features.each do |f|
      v = values(compound,f)
      unless v.empty?
        sdf += "\n> <#{f.name}>\n"
        sdf += v.uniq.join ","
        sdf += "\n"
      end
    end
    sdf += "\n$$$$\n"
  end
  sdf
end
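
A short usage sketch (the output file name is an assumption):

File.write "predictions.sdf", dataset.to_sdf   # the first molfile line of each record is replaced by the canonical SMILES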

#to_training_csv ⇒ String

Convert dataset into CSV formatted training data

Returns:

  • (String)



# File 'lib/dataset.rb', line 381

def to_training_csv 
  
  export_features = merged_features
  export_features = transformed_bioactivity_features if export_features.empty? 
  export_features = bioactivity_features if export_features.empty? 
  export_feature = export_features.first

  header = ["Canonical SMILES"]
  header << bioactivity_features.first.name # use original bioactivity name instead of long merged name
  csv = [header]

  substances.each do |substance|
    nr_activities = values(substance,bioactivity_features.first).size
    (0..nr_activities-1).each do |n| # new row for each value
      row = [substance.smiles]
      row << values(substance,export_feature)[n] 
      csv << row
    end
  end
  csv.collect{|r| r.join(",")}.join("\n")
end

#transformed_bioactivity_features ⇒ Array<OpenTox::NominalBioActivity,OpenTox::NumericBioActivity>

Get transformed nominal and numeric bioactivity features



# File 'lib/dataset.rb', line 87

def transformed_bioactivity_features
  features.select{|f| f._type.match(/Transformed.*BioActivity/)}
end

#values(substance, feature) ⇒ Array<TrueClass,FalseClass,Float>

Get all values for a given substance and feature

Parameters:

  • substance (OpenTox::Substance, BSON::ObjectId, String)

    substance object or substance id

  • feature (OpenTox::Feature, BSON::ObjectId, String)

    feature object or feature id

Returns:

  • (Array<TrueClass,FalseClass,Float>)



# File 'lib/dataset.rb', line 47

def values substance,feature
  substance = substance.id if substance.is_a? Substance
  feature = feature.id if feature.is_a? Feature
  substance = BSON::ObjectId.from_string(substance) if substance.is_a? String
  feature = BSON::ObjectId.from_string(feature) if feature.is_a? String
  data_entries.select{|row| row[0] == substance and row[1] == feature}.collect{|row| row[2]}
end
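
A short sketch (objects, BSON ids and id strings are accepted, per the coercions above; variable names are hypothetical):

dataset.values(compound, bioactivity_feature)                   # => e.g. ["active", "inactive"] for replicate measurements
dataset.values(compound.id.to_s, bioactivity_feature.id.to_s)   # string ids are converted to BSON::ObjectIds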

#warnings_features ⇒ Array<OpenTox::Warnings>

Get Warnings features

Returns:

  • (Array<OpenTox::Warnings>)



# File 'lib/dataset.rb', line 69

def warnings_features
  features.select{|f| f.is_a?(Warnings)}
end