Class: OpenTox::RUtil

Inherits:
Object
  • Object
show all
Defined in:
lib/r-util.rb

Constant Summary collapse

@@feats =
{}

Instance Method Summary collapse

Constructor Details

#initialize ⇒ RUtil

Returns a new instance of RUtil.



17
18
19
20
21
22
23
# File 'lib/r-util.rb', line 17

# Prepares the R session used by the other methods of this class.
#
# An R session already stored in @r is reused; otherwise a new RinRuby instance
# is created. Afterwards R's library path is set to PACKAGE_DIR, the names of
# the installed R packages are cached in @r_packages, the packages listed below
# are installed when missing, and the stratification.R script located in the
# lib directory of the loaded opentox-ruby gem is sourced.
def initialize
  @r ||= RinRuby.new(true,false)
  @r.eval ".libPaths('#{PACKAGE_DIR}')"
  @r_packages = @r.pull "installed.packages()[,1]"
  # "caret", "smacof", "TunePareto" are not part of this list
  %w[sampling gam vegan].each { |pkg| install_package(pkg) }
  gem_root = Gem.loaded_specs['opentox-ruby'].full_gem_path
  stratification_script = File.join(gem_root,'lib/stratification.R')
  @r.eval "source('#{stratification_script}')"
end

Instance Method Details

#boxplot(files, data, title = "") ⇒ Object

Example: files = ["/tmp/box.svg", "/tmp/box.png"]; data = [ [ :method, [4,4,5,5,4,3,2] ], [ :method2, [1,2,3,4,5,4,6] ], [ :asdf, [9,1,8,0,7,1,6] ] ]; boxplot(files, data, "comparison1")



69
70
71
72
73
74
75
# File 'lib/r-util.rb', line 69

# Draws a boxplot with one box per data series.
#
# files - handed to plot_to_files, which yields once per plot to render
#         (e.g. ["/tmp/box.svg","/tmp/box.png"])
# data  - array of [name, values] pairs, e.g. [[:method, [4,4,5]], [:method2, [1,2,3]]];
#         the value arrays are transposed into rows, so they must have equal length
# title - heading of the plot, inserted verbatim into the R command
#
# Returns whatever plot_to_files returns.
def boxplot(files, data, title="")
  LOGGER.debug("r-util> create boxplot")
  value_rows = data.map { |entry| entry[1] }.transpose
  series_names = data.map { |entry| entry[0].to_s }
  assign_dataframe("boxdata", value_rows, nil, series_names)
  last_color = data.size + 1 # colours 2..n+1, one per series
  plot_to_files(files) do |file|
    @r.eval "boxplot(boxdata,main='#{title}',col=rep(2:#{last_color}))"
  end
end

#dataframe_to_dataset(df, subjectid = nil) ⇒ Object

Converts a dataframe into a dataset (a new dataset is created at the dataset webservice). This is only possible if a superset of the dataframe was created by dataset_to_dataframe (metadata and URIs are required!).



267
268
269
# File 'lib/r-util.rb', line 267

# Converts the R dataframe named +df+ into a dataset.
#
# Thin convenience wrapper: forwards +df+ and +subjectid+ to
# dataframe_to_dataset_indices and passes nil as its third argument.
# Returns the result of that call.
def dataframe_to_dataset(df, subjectid = nil)
  dataframe_to_dataset_indices(df, subjectid, nil)
end

#dataset_to_dataframe(dataset, missing_value = "NA", subjectid = nil, features = nil) ⇒ Object

The dataset should be loaded completely (use Dataset.find). Takes duplicates into account, replaces missing values with the param <missing_value>, and returns the dataframe variable name in R.



191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
# File 'lib/r-util.rb', line 191

# Converts a dataset into an R dataframe and returns the name of the R variable.
#
# dataset       - dataset providing uri, compounds, features and data_entries
# missing_value - inserted wherever a compound has no (or an empty) value for a feature
# subjectid     - passed on to OpenTox::Feature.find
# features      - optional list of feature URIs to use as columns; NOTE: the given
#                 array is sorted in place. When nil, all dataset features are used.
#
# A compound occupies as many rows as the longest value list it has for any
# dataset feature (duplicates), but a single row if it has no data entries.
# Columns of nominal features are converted with as.character, all other
# columns with as.numeric. The feature information of the columns is stored in
# @@feats under the dataframe name.
def dataset_to_dataframe( dataset, missing_value="NA", subjectid=nil, features=nil )
  LOGGER.debug "r-util> convert dataset to dataframe #{dataset.uri}"

  # count duplicates: number of rows each compound needs
  num_compounds = {}
  dataset.features.keys.each do |f|
    dataset.compounds.each do |c|
      entries = dataset.data_entries[c]
      if entries
        val = entries[f]
        size = val.nil? ? 1 : val.size
        num_compounds[c] = [num_compounds[c] || size, size].max
      else
        num_compounds[c] = 1
      end
    end
  end

  # use either all, or the provided features, sorting is important as col-index := features
  if features
    features.sort!
  else
    features = dataset.features.keys.sort
  end

  # values into 2D array (one row per compound occurrence), then to dataframe
  compounds = []
  d_values = []
  dataset.compounds.each do |c|
    entries = dataset.data_entries[c]
    num_compounds[c].times do |i|
      compounds << c
      c_values = features.collect do |f|
        if entries
          val = entries[f]
          v = val.nil? ? "" : val[i].to_s
        else
          raise "wtf" if i>0
          v = ""
        end
        v.empty? ? missing_value : v
      end
      d_values << c_values
    end
  end
  df_name = "df_#{dataset.uri.split("/")[-1].split("?")[0]}"
  assign_dataframe(df_name,d_values,compounds,features)

  # set dataframe column types accordingly
  features.each_with_index do |f, index|
    column = index + 1 # R starts at 1
    feat = OpenTox::Feature.find(f,subjectid)
    # Array() accepts a single type, a list of types, or nil alike
    nominal = Array(feat.metadata[RDF.type]).flatten.include?(OT.NominalFeature)
    conversion = nominal ? "as.character" : "as.numeric"
    @r.eval "#{df_name}[,#{column}] <- #{conversion}(#{df_name}[,#{column}])"
  end
  #@r.eval "head(#{df_name})"

  # store features (including metainformation) for the way back to a dataset
  @@feats[df_name] = {}
  features.each do |f|
    @@feats[df_name][f] = dataset.features[f]
  end
  df_name
end

#double_hist_plot(files, data1, data2, is_numerical, log = false, name1 = "first", name2 = "second", title = "title", xaxis = "x-values") ⇒ Object

Plots a double histogram. data1 and data2 are arrays with values, either numerical or categorical (string values); is_numerical is a boolean flag indicating the value type; log (only valid for numerical values) plots the logarithm of the values.



123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
# File 'lib/r-util.rb', line 123

# Plots two value distributions into one chart.
#
# files        - handed to plot_to_files, which yields once per plot to render
# data1, data2 - arrays of values, numerical or categorical (strings)
# is_numerical - true: overlaid histograms drawn by the R function double_plot;
#                false: grouped barplot of the per-category counts
# log          - plot the logarithm of the values; raises for categorical data
# name1, name2 - labels of the two data series
# title, xaxis - plot heading and x-axis label, inserted verbatim into the R command
def double_hist_plot(files, data1, data2, is_numerical, log=false, name1="first", name2="second", title="title", xaxis="x-values")
  LOGGER.debug("r-util> create double hist plot")
  combined = data1 + data2
  if is_numerical
    # define the R helper that draws both histograms with shared breaks
    @r.eval "double_plot <- function(data1, data2, log=FALSE, names=c('data1','data2'), title='title', xlab='x-values')
    {
      if (log)
      {
        data1 <- log(data1)
        data2 <- log(data2)
        xlab = paste('logarithm of',xlab,sep=' ')
      }
      xlims <- round(c(min(c(min(data1),min(data2))),max(c(max(data1),max(data2)))))
      h <- hist(rbind(data1,data2),plot=F)
      h1 <- hist(data1,plot=F,breaks=h$breaks)
      h2 <- hist(data2,plot=F,breaks=h$breaks)
      xlims = c(min(h$breaks),max(h$breaks))
      ylims = c(0,max(h1$counts,h2$counts))
      xaxps = c(min(h$breaks),max(h$breaks),(length(h$breaks)-1))
      plot(h1, col=rgb(1,0,0,2/4), xlim=xlims, xaxp=xaxps, ylim=ylims,
        main=title, xlab=xlab, ylab='counts' )
      plot(h2, col=rgb(0,1,0,2/4), add=T )
      legend('topleft',names,lty=c(1,1),col=c('red','green'))
    }"
    @r.assign("data1",data1)
    @r.assign("data2",data2)
    @r.legend = [name1, name2]
    log_flag = log ? "T" : "F"
    plot_cmds = ["double_plot(data1,data2,log=#{log_flag},names=legend,title='#{title}',xlab='#{xaxis}')"]
  else
    raise "log not valid for categorial" if log
    categories = combined.uniq.sort
    first_counts = categories.map { |value| data1.count(value) }
    second_counts = categories.map { |value| data2.count(value) }
    @r.data1 = first_counts
    @r.data2 = second_counts
    @r.value_names = [name1, name2]
    @r.legend = categories
    @r.eval("data <- cbind(data1,data2)")
    plot_cmds = [
      "bp <- barplot(data, beside=T, names.arg=value_names, \n" \
      "        main='#{title}', col=sort(rep(2:3,length(legend))))", #legend.text=c(legend),
      "text(bp, 0, round(data, 1),cex=1,pos=3)"
    ]
  end

  plot_to_files(files) do |file|
    outcome = nil
    plot_cmds.each { |cmd| outcome = @r.eval(cmd) }
    outcome # the block evaluates to the result of the last R command
  end
end

#feature_value_plot(files, dataset_uri1, dataset_uri2, dataset_name1, dataset_name2, features = nil, fast_plot = true, subjectid = nil, waiting_task = nil) ⇒ Object

Embeds the feature values of two datasets into 2D and plots them. fast_plot = true -> PCA, fast_plot = false -> SMACOF (iterative optimisation method).



80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# File 'lib/r-util.rb', line 80

# Embeds the feature values of two datasets into 2D and plots both datasets
# into one chart.
#
# files                         - handed to plot_to_files, which yields once per plot to render
# dataset_uri1, dataset_uri2    - URIs of the datasets, loaded via OpenTox::Dataset.find
# dataset_name1, dataset_name2  - labels passed to the R function plot_split
# features     - feature URIs to use; every one of them must be part of both
#                datasets. When nil, both datasets must have identical features
#                and all of them are used.
# fast_plot    - truthy: PCA embedding; falsy: SMACOF embedding (needs R package smacof)
# subjectid    - passed on to the dataset lookup and the dataframe conversion
# waiting_task - optional object whose progress method is called with 25, 50 and 75
#
# Raises a RuntimeError if the smacof package is required but missing, if the
# feature sets do not match, or if fewer than two features would be embedded.
def feature_value_plot(files, dataset_uri1, dataset_uri2, dataset_name1, dataset_name2,
    features=nil, fast_plot=true, subjectid=nil, waiting_task=nil)

  # every falsy fast_plot selects SMACOF below, so the package is needed for all of them
  raise "r-package smacof missing" unless fast_plot || package_installed?("smacof")
  LOGGER.debug("r-util> create feature value plot")
  d1 = OpenTox::Dataset.find(dataset_uri1,subjectid)
  d2 = OpenTox::Dataset.find(dataset_uri2,subjectid)
  if features
    [d1, d2].each do |d|
      features.each { |f| raise "feature not included" unless d.features.keys.include?(f) }
    end
  else
    if d1.features.keys.sort != d2.features.keys.sort
      raise "different\n#{d1.features.keys.sort.to_yaml}\n#{d2.features.keys.sort.to_yaml}"
    end
    features = d1.features.keys
  end
  # the embedding works on the selected features, not on everything the dataset has
  raise "at least two features needed" if features.size<2
  waiting_task.progress(25) if waiting_task

  df1 = dataset_to_dataframe(d1,0,subjectid,features)
  df2 = dataset_to_dataframe(d2,0,subjectid,features)
  waiting_task.progress(50) if waiting_task

  # stack both dataframes; "split" marks which rows belong to which dataset
  @r.eval "df <- rbind(#{df1},#{df2})"
  @r.eval "split <- c(rep(0,nrow(#{df1})),rep(1,nrow(#{df2})))"
  @r.names = [dataset_name1, dataset_name2]
  LOGGER.debug("r-util> - convert data to 2d")
  @r.eval "df.2d <- plot_pre_process(df, method='#{(fast_plot ? "pca" : "smacof")}')"
  waiting_task.progress(75) if waiting_task

  if fast_plot
    info = "main='PCA-Embedding of #{features.size} features',xlab='PC1',ylab='PC2'"
  else
    info = "main='SMACOF-Embedding of #{features.size} features',xlab='x',ylab='y'"
  end
  LOGGER.debug("r-util> - plot data")
  plot_to_files(files) do |file|
    @r.eval "plot_split( df.2d, split, names, #{info})"
  end
end

#install_package(package) ⇒ Object



41
42
43
44
45
46
# File 'lib/r-util.rb', line 41

# Installs the R package +package+ from CRAN into PACKAGE_DIR, unless
# package_installed? already reports it as available.
#
# Returns nil when nothing had to be installed, otherwise the result of the
# R install command.
def install_package( package )
  return if package_installed?(package)

  LOGGER.debug "r-util> installing r-package #{package} to #{PACKAGE_DIR}"
  @r.eval "install.packages('#{package}', repos='http://cran.r-project.org', lib='#{PACKAGE_DIR}')"
end

#package_installed?(package) ⇒ Boolean

Returns:

  • (Boolean)


37
38
39
# File 'lib/r-util.rb', line 37

# Tells whether +package+ is contained in @r_packages, the package list that
# was read from R when this object was initialized.
def package_installed?(package)
  @r_packages.include?(package)
end

#paired_ttest(array1, array2, significance_level = 0.95) ⇒ Object

Paired t-test; returns the t statistic if the difference is significant and 0 otherwise: < 0 -> array1 << array2; 0 -> no significant difference; > 0 -> array1 >> array2



51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/r-util.rb', line 51

# Runs a paired t-test in R on the two arrays.
#
# array1, array2     - value lists, converted with as.numeric on the R side
# significance_level - the result counts as significant when the p-value is
#                      below 1 - significance_level
#
# Returns the t statistic of t.test(v1, v2, paired=T) when the difference is
# significant, and 0 otherwise.
def paired_ttest(array1, array2, significance_level=0.95)
  @r.assign "v1",array1
  @r.assign "v2",array2
  @r.eval "ttest = t.test(as.numeric(v1),as.numeric(v2),paired=T)"
  statistic = @r.pull "ttest$statistic"
  p_value = @r.pull "ttest$p.value"
  return 0 unless 1-significance_level > p_value

  statistic
end

#quit_r ⇒ Object



25
26
27
28
29
30
31
# File 'lib/r-util.rb', line 25

# Shuts down the R session and clears @r.
#
# Best effort: any StandardError raised while quitting is swallowed, in which
# case @r keeps its previous value. Always returns nil.
def quit_r
  @r.quit
  @r = nil
rescue
  # deliberately ignored, e.g. when no session exists
end

#r ⇒ Object



33
34
35
# File 'lib/r-util.rb', line 33

# Returns the R session object stored in @r (nil when none is set, e.g. after
# quit_r completed successfully).
def r
  @r
end

#stratified_split(dataset, missing_values = "NA", pct = 0.3, subjectid = nil, seed = 42, split_features = nil) ⇒ Object

Performs a stratified split of a dataset into two datasets, based on the feature values. All features are taken into account unless <split_features> is given.



175
176
177
178
179
180
181
182
183
184
185
# File 'lib/r-util.rb', line 175

# Splits a dataset into two datasets via the R function stratified_split.
#
# dataset        - a loaded OpenTox::Dataset with at least one compound and one feature
# missing_values - replacement for missing values in the intermediate dataframe
# pct            - passed to R as the ratio argument of stratified_split
# subjectid      - passed on to the dataframe conversion and to split_to_datasets
# seed           - seed for R's random number generator
# split_features - features used for the dataframe; nil means all features
#
# Raises a RuntimeError if +dataset+ is not a loaded dataset.
# Returns the result of split_to_datasets.
def stratified_split( dataset, missing_values="NA", pct=0.3, subjectid=nil, seed=42, split_features=nil )
  loaded = dataset.is_a?(OpenTox::Dataset) && dataset.compounds.size>0 && dataset.features.size>0
  raise "not a loaded ot-dataset" unless loaded
  LOGGER.debug("r-util> apply stratified split to #{dataset.uri}")

  df = dataset_to_dataframe( dataset, missing_values, subjectid, split_features )
  @r.eval "set.seed(#{seed})"
  @r.eval "split <- stratified_split(#{df}, ratio=#{pct})"
  selection = @r.pull 'split'
  # reverse 1s and 0s, as 1 means selected, but 0 will be first set
  inverted = selection.map { |flag| 1 - flag.to_i }
  split_to_datasets( df, inverted, subjectid )
end