Class: OpenTox::Dataset

Inherits:
Object
Defined in:
lib/dataset.rb

Direct Known Subclasses

DescriptorDataset, LazarPrediction

Class Method Summary

Instance Method Summary

Class Method Details

.from_csv_file(file, source = nil, bioassay = true) ⇒ Object

Create a dataset from a CSV file. TODO: document the expected CSV structure.



# File 'lib/dataset.rb', line 147

def self.from_csv_file file, source=nil, bioassay=true#, layout={}
  source ||= file
  name = File.basename(file,".*")
  dataset = self.find_by(:source => source, :name => name)
  if dataset
    $logger.debug "Skipping import of #{file}, it is already in the database (id: #{dataset.id})."
  else
    $logger.debug "Parsing #{file}."
    table = CSV.read file, :skip_blanks => true, :encoding => 'windows-1251:utf-8'
    dataset = self.new(:source => source, :name => name)
    dataset.parse_table table, bioassay#, layout
  end
  dataset
end
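
A minimal usage sketch (the file name is a placeholder, not from the source). The first CSV column holds SMILES or InChI identifiers, the remaining columns hold feature values:

  # assumes the library providing OpenTox::Dataset has already been required
  dataset = OpenTox::Dataset.from_csv_file "hamster_carcinogenicity.csv"
  puts dataset.compounds.size
  puts dataset.features.collect{|f| f.name}.join(", ")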

Instance Method Details

#compounds ⇒ Object

Get all compounds



# File 'lib/dataset.rb', line 17

def compounds
  @compounds ||= self.compound_ids.collect{|id| OpenTox::Compound.find id}
  @compounds
end

#compounds=(compounds) ⇒ Object

Set compounds



# File 'lib/dataset.rb', line 41

def compounds=(compounds)
  self.compound_ids = compounds.collect{|c| c.id}
end
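
A hypothetical example of setting and reading the compound list (the SMILES strings are placeholders):

  dataset = OpenTox::Dataset.new
  dataset.compounds = ["CC(=O)Oc1ccccc1C(=O)O", "c1ccccc1"].collect{|smiles| OpenTox::Compound.from_smiles smiles}
  dataset.compounds.collect{|c| c.smiles}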

#correlation_plot(training_dataset) ⇒ Object



# File 'lib/dataset.rb', line 110

def correlation_plot training_dataset
  # TODO: create/store svg
  R.assign "features", data_entries
  R.assign "activities", training_dataset.data_entries.collect{|de| de.first}
  R.eval "featurePlot(features,activities)"
end
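
A sketch of the intended call (assumes this dataset holds predicted values, that training_dataset holds the corresponding measured activities, and that the R session provides featurePlot, e.g. via the caret package):

  prediction_dataset.correlation_plot training_dataset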

#density_plot ⇒ Object



# File 'lib/dataset.rb', line 117

def density_plot
  # TODO: create/store svg
  R.assign "acts", data_entries.collect{|r| r.first }#.compact
  R.eval "plot(density(-log(acts),na.rm= TRUE), main='-log(#{features.first.name})')"
end

#duplicates(feature = self.features.first) ⇒ Object

Diagnostics: collect the data entry values of compounds that occur more than once in the dataset.



# File 'lib/dataset.rb', line 99

def duplicates feature=self.features.first
  col = feature_ids.index feature.id
  dups = {}
  compound_ids.each_with_index do |cid,i|
    rows = compound_ids.each_index.select{|r| compound_ids[r] == cid }
    values = rows.collect{|row| data_entries[row][col]}
    dups[cid] = values if values.size > 1
  end
  dups
end
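
A hypothetical usage sketch:

  # list the values of the first feature for every compound that occurs more than once
  dataset.duplicates.each do |compound_id, values|
    puts "#{compound_id}: #{values.inspect}"
  end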

#features ⇒ Object

Get all features



# File 'lib/dataset.rb', line 23

def features
  @features ||= self.feature_ids.collect{|id| OpenTox::Feature.find(id)}
  @features
end

#features=(features) ⇒ Object

Set features



# File 'lib/dataset.rb', line 46

def features=(features)
  self.feature_ids = features.collect{|f| f.id}
end

#fill_nil_with(n) ⇒ Object

Fill unset data entries

Parameters:

  • n

    any value



# File 'lib/dataset.rb', line 278

def fill_nil_with n
  (0 .. compound_ids.size-1).each do |i|
    data_entries[i] ||= []
    (0 .. feature_ids.size-1).each do |j|
      data_entries[i][j] ||= n
    end
  end
end
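
A hypothetical usage sketch:

  # replace all unset data entries with 0, e.g. for a sparse descriptor matrix
  dataset.fill_nil_with 0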

#folds(n) ⇒ Array

Split a dataset into n folds

Parameters:

  • n (Integer)

    number of folds

Returns:

  • (Array)

    Array of folds, each of the form [training_dataset, test_dataset]



# File 'lib/dataset.rb', line 55

def folds n
  unique_compound_data = {}
  compound_ids.each_with_index do |cid,i|
    unique_compound_data[cid] ||= []
    unique_compound_data[cid] << data_entries[i]
  end
  unique_compound_ids = unique_compound_data.keys
  len = unique_compound_ids.size
  indices = (0..len-1).to_a.shuffle
  mid = (len/n)
  chunks = []
  start = 0
  1.upto(n) do |i|
    last = start+mid
    last = last-1 unless len%n >= i
    test_idxs = indices[start..last] || []
    test_cids = test_idxs.collect{|i| unique_compound_ids[i]}
    training_idxs = indices-test_idxs
    training_cids = training_idxs.collect{|i| unique_compound_ids[i]}
    chunk = [training_cids,test_cids].collect do |unique_cids|
      cids = []
      data_entries = []
      unique_cids.each do |cid| 
        unique_compound_data[cid].each do |de|
          cids << cid
          data_entries << de
        end
      end
      dataset = self.class.new(:compound_ids => cids, :feature_ids => self.feature_ids, :data_entries => data_entries, :source => self.id )
      dataset.compounds.each do |compound|
        compound.dataset_ids << dataset.id
        compound.save
      end
      dataset.save
      dataset
    end
    start = last+1
    chunks << chunk
  end
  chunks
end
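
A hypothetical cross-validation sketch (model training and prediction are assumptions, not part of this class):

  dataset.folds(10).each do |training_dataset, test_dataset|
    # train a model on training_dataset and predict test_dataset.compounds here
    puts "training: #{training_dataset.compounds.size}, test: #{test_dataset.compounds.size}"
  end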

#parse_table(table, bioassay = true) ⇒ Object

Parse data in tabular format (e.g. from CSV). Does a lot of guesswork in order to determine feature types.



# File 'lib/dataset.rb', line 164

def parse_table table, bioassay=true

  time = Time.now

  # features
  feature_names = table.shift.collect{|f| f.strip}
  warnings << "Duplicate features in table header." unless feature_names.size == feature_names.uniq.size
  compound_format = feature_names.shift.strip
  bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i

  numeric = []
  # guess feature types
  feature_names.each_with_index do |f,i|
    metadata = {:name => f}
    values = table.collect{|row| val=row[i+1].to_s.strip; val.blank? ? nil : val }.uniq.compact
    types = values.collect{|v| v.numeric? ? true : false}.uniq
    if values.size == 0 # empty feature
    elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes
      metadata["numeric"] = true
      numeric[i] = true
    else
      metadata["nominal"] = true
      metadata["accept_values"] = values
      numeric[i] = false
    end
    if bioassay
      if metadata["numeric"]
        feature = NumericBioAssay.find_or_create_by(metadata)
      elsif metadata["nominal"]
        feature = NominalBioAssay.find_or_create_by(metadata)
      end
    else
      metadata.merge!({:measured => false, :calculated => true}) # merge! so the flags are actually stored
      if metadata["numeric"]
        feature = NumericFeature.find_or_create_by(metadata)
      elsif metadata["nominal"]
        feature = NominalFeature.find_or_create_by(metadata)
      end
    end
    feature_ids << feature.id if feature
  end
  
  $logger.debug "Feature values: #{Time.now-time}"
  time = Time.now

  r = -1
  compound_time = 0
  value_time = 0

  # compounds and values
  self.data_entries = []

  table.each_with_index do |vals,i|
    ct = Time.now
    identifier = vals.shift.strip
    warnings << "No feature values for compound at position #{i+2}." if vals.compact.empty?
    begin
      case compound_format
      when /SMILES/i
        compound = OpenTox::Compound.from_smiles(identifier)
      when /InChI/i
        compound = OpenTox::Compound.from_inchi(identifier)
      end
    rescue 
      compound = nil
    end
    if compound.nil?
      # compound parsers may return nil
      warnings << "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored."
      next
    end
    compound.dataset_ids << self.id unless compound.dataset_ids.include? self.id
    compound_time += Time.now-ct
      
    r += 1
    unless vals.size == feature_ids.size # way cheaper than accessing features
      warnings << "Number of values at position #{i+2} is different than header size (#{vals.size} vs. #{features.size}), all entries are ignored."
      next
    end

    compound_ids << compound.id
    table.first.size == 0 ?  self.data_entries << Array.new(0) : self.data_entries << Array.new(table.first.size-1) 
    
    vals.each_with_index do |v,j|
      if v.blank?
        warnings << "Empty value for compound '#{identifier}' (row #{r+2}) and feature '#{feature_names[j]}' (column #{j+2})."
        next
      elsif numeric[j]
        v = v.to_f
      else
        v = v.strip
      end
      self.data_entries.last[j] = v
      #i = compound.feature_ids.index feature_ids[j]
      compound.features[feature_ids[j].to_s] ||= []
      compound.features[feature_ids[j].to_s] << v
      compound.save
    end
  end
  compounds.duplicates.each do |compound|
    positions = []
    compounds.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == compound.inchi}
    warnings << "Duplicate compound #{compound.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments." 
  end
  
  $logger.debug "Value parsing: #{Time.now-time} (Compound creation: #{compound_time})"
  time = Time.now
  save
  $logger.debug "Saving: #{Time.now-time}"

end
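
A hypothetical example of the expected table structure (a header row with a compound column followed by feature columns; all identifiers and values are placeholders):

  table = [
    ["SMILES", "LC50", "Class"],
    ["CC(=O)Oc1ccccc1C(=O)O", "3.2", "active"],
    ["c1ccccc1", "1.7", "inactive"]
  ]
  dataset = OpenTox::Dataset.new(:source => "example", :name => "example")
  dataset.parse_table table, true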

#to_csv(inchi = false) ⇒ String

Converts the dataset to CSV format. The first column contains compound SMILES (or InChI if requested); the remaining column headers are feature names.

Returns:

  • (String)

    CSV representation of the dataset

# File 'lib/dataset.rb', line 127

def to_csv(inchi=false)
  CSV.generate() do |csv| #{:force_quotes=>true}
    csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name}
    compounds.each_with_index do |c,i|
      csv << [inchi ? c.inchi : c.smiles] + data_entries[i]
    end
  end
end
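
A hypothetical usage sketch:

  File.write "dataset.csv", dataset.to_csv             # SMILES identifiers in the first column
  File.write "dataset_inchi.csv", dataset.to_csv(true) # InChI identifiers instead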

#values(compound, feature) ⇒ Array

Find data entry values for a given compound and feature

Parameters:

  • compound (OpenTox::Compound)

  • feature (OpenTox::Feature)

Returns:

  • (Array)

    Data entry values



# File 'lib/dataset.rb', line 32

def values(compound, feature)
  rows = compound_ids.each_index.select{|r| compound_ids[r] == compound.id }
  col = feature_ids.index feature.id
  rows.collect{|row| data_entries[row][col]}
end
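
A hypothetical usage sketch:

  # all values of the first feature for the first compound,
  # including repeated measurements of duplicated compounds
  dataset.values(dataset.compounds.first, dataset.features.first)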