Class: OpenTox::RUtil
- Inherits:
-
Object
- Object
- OpenTox::RUtil
- Defined in:
- lib/r-util.rb
Constant Summary collapse
- @@feats =
{}
Instance Method Summary collapse
-
#boxplot(files, data, title = "") ⇒ Object
example: files = ["/tmp/box.svg","/tmp/box.png"]; data = [ [ :method, [4,4,5,5,4,3,2] ], [ :method2, [1,2,3,4,5,4,6] ], [ :asdf, [9,1,8,0,7,1,6] ] ]; boxplot(files, data, "comparison1").
-
#dataframe_to_dataset(df, subjectid = nil) ⇒ Object
Converts a dataframe into a dataset (a new dataset is created at the dataset webservice). This is only possible if a superset of the dataframe was created by dataset_to_dataframe (metadata and URIs!).
-
#dataset_to_dataframe(dataset, missing_value = "NA", subjectid = nil, features = nil) ⇒ Object
The dataset should be loaded completely (use Dataset.find). Takes duplicates into account, replaces missing values with param <missing_value>, and returns the dataframe variable name in R.
-
#double_hist_plot(files, data1, data2, is_numerical, log = false, name1 = "first", name2 = "second", title = "title", xaxis = "x-values") ⇒ Object
Plots a double histogram. data1 and data2 are arrays with values, either numerical or categorical (string values); is_numerical is a boolean flag indicating the value type; log (only for numerical values) plots the logarithm of the values.
-
#feature_value_plot(files, dataset_uri1, dataset_uri2, dataset_name1, dataset_name2, features = nil, fast_plot = true, subjectid = nil, waiting_task = nil) ⇒ Object
Embeds the feature values of two datasets into 2D and plots them. fast_plot = true -> PCA, fast_plot = false -> SMACOF (iterative optimisation method).
-
#initialize ⇒ RUtil
constructor
A new instance of RUtil.
- #install_package(package) ⇒ Object
- #package_installed?(package) ⇒ Boolean
-
#paired_ttest(array1, array2, significance_level = 0.95) ⇒ Object
<0 -> array1 << array2; 0 -> no significant difference; >0 -> array1 >> array2.
- #quit_r ⇒ Object
- #r ⇒ Object
-
#stratified_split(dataset, missing_values = "NA", pct = 0.3, subjectid = nil, seed = 42, split_features = nil) ⇒ Object
Splits a dataset into two datasets, stratified by the feature values. All features are taken into account unless <split_features> is given.
Constructor Details
#initialize ⇒ RUtil
Returns a new instance of RUtil.
17 18 19 20 21 22 23 |
# File 'lib/r-util.rb', line 17
# Sets up the R session used by all other methods of this class:
# starts RinRuby (unless an interpreter handle is already present), points
# R's library path at PACKAGE_DIR, caches the names of the installed
# R packages, installs the required packages and sources the
# stratification helpers shipped with the opentox-ruby gem.
def initialize
  @r ||= RinRuby.new(true,false)
  @r.eval ".libPaths('#{PACKAGE_DIR}')"
  # cached once here; package_installed? answers from this list
  @r_packages = @r.pull "installed.packages()[,1]"
  %w[sampling gam vegan].each { |pkg| install_package(pkg) } #"caret", "smacof", "TunePareto"
  gem_root = Gem.loaded_specs['opentox-ruby'].full_gem_path
  @r.eval "source('#{File.join(gem_root,'lib/stratification.R')}')"
end
Instance Method Details
#boxplot(files, data, title = "") ⇒ Object
example: files = ["/tmp/box.svg","/tmp/box.png"]; data = [ [ :method, [4,4,5,5,4,3,2] ], [ :method2, [1,2,3,4,5,4,6] ], [ :asdf, [9,1,8,0,7,1,6] ] ]; boxplot(files, data, "comparison1")
69 70 71 72 73 74 75 |
# File 'lib/r-util.rb', line 69
# Draws one box per data series into each of the given files.
#
# files:: target file names, handed to plot_to_files
# data::  array of [name, values] pairs; the names become the dataframe
#         column names, the value arrays are transposed into rows
#         (Array#transpose requires all value arrays to have equal length)
# title:: plot title; backslashes and single quotes are escaped so the
#         title cannot terminate the R string literal it is embedded in
#
# example:
#   files = ["/tmp/box.svg","/tmp/box.png"]
#   data = [ [ :method, [4,4,5,5,4,3,2] ], [ :method2, [1,2,3,4,5,4,6] ], [ :asdf, [9,1,8,0,7,1,6] ] ]
#   boxplot(files, data, "comparison1" )
def boxplot(files, data, title="")
  LOGGER.debug("r-util> create boxplot")
  series_names = data.collect{|e| e[0].to_s}
  rows = data.collect{|e| e[1]}.transpose
  assign_dataframe("boxdata",rows,nil,series_names)
  # escape \ and ' for use inside an R single-quoted string literal
  r_title = title.to_s.gsub(/[\\']/) { |ch| "\\#{ch}" }
  plot_to_files(files) do |_file|
    # colours 2..n+1: one distinct colour per series
    @r.eval "boxplot(boxdata,main='#{r_title}',col=rep(2:#{data.size+1}))"
  end
end
#dataframe_to_dataset(df, subjectid = nil) ⇒ Object
Converts a dataframe into a dataset (a new dataset is created at the dataset webservice). This is only possible if a superset of the dataframe was created by dataset_to_dataframe (metadata and URIs!).
267 268 269 |
# File 'lib/r-util.rb', line 267
# Converts the R dataframe named +df+ into a dataset by delegating to
# dataframe_to_dataset_indices.
#
# df::        name of the dataframe variable in R
# subjectid:: passed through unchanged to dataframe_to_dataset_indices
#
# Returns whatever dataframe_to_dataset_indices returns.
def dataframe_to_dataset( df, subjectid=nil )
  # the third argument is always nil here
  # NOTE(review): presumably nil means "no row subset" - confirm against dataframe_to_dataset_indices
  indices = nil
  dataframe_to_dataset_indices( df, subjectid, indices )
end
#dataset_to_dataframe(dataset, missing_value = "NA", subjectid = nil, features = nil) ⇒ Object
The dataset should be loaded completely (use Dataset.find). Takes duplicates into account, replaces missing values with param <missing_value>, and returns the dataframe variable name in R.
191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 |
# File 'lib/r-util.rb', line 191
# Copies the values of a dataset into an R dataframe and returns the name
# of the dataframe variable in R.
#
# dataset::       dataset with compounds, features and data_entries
#                 available (should be loaded completely, use Dataset.find)
# missing_value:: value written to cells that have no (or an empty) value
# subjectid::     passed to OpenTox::Feature.find
# features::      optional list of feature URIs to export; defaults to all
#                 features of the dataset. The caller's array is not modified.
#
# A compound that has several values for one feature (duplicates) is
# exported as several rows. Columns are ordered by sorted feature URI.
# The metadata of the exported features is remembered in @@feats under
# the dataframe name.
def dataset_to_dataframe( dataset, missing_value="NA", subjectid=nil, features=nil )
  LOGGER.debug "r-util> convert dataset to dataframe #{dataset.uri}"

  # count duplicates: a compound needs as many rows as its longest value list
  num_compounds = {}
  dataset.features.keys.each do |f|
    dataset.compounds.each do |c|
      if dataset.data_entries[c]
        val = dataset.data_entries[c][f]
        size = val.nil? ? 1 : val.size
        num_compounds[c] = num_compounds[c].nil? ? size : [num_compounds[c],size].max
      else
        num_compounds[c] = 1
      end
    end
  end

  # use either all, or the provided features, sorting is important as col-index := features
  # (non-destructive sort: the array handed in by the caller stays untouched)
  features = features ? features.sort : dataset.features.keys.sort

  # row names: each compound repeated once per duplicate
  compounds = []
  dataset.compounds.each do |c|
    num_compounds[c].times { compounds << c }
  end

  # values into 2D array, then to dataframe
  d_values = []
  dataset.compounds.each do |c|
    num_compounds[c].times do |i|
      c_values = features.collect do |f|
        if dataset.data_entries[c]
          val = dataset.data_entries[c][f]
          v = val.nil? ? "" : val[i].to_s
        else
          # compounds without data entries were counted as exactly one row above
          raise "wtf" if i>0
          v = ""
        end
        v.size()==0 ? missing_value : v
      end
      d_values << c_values
    end
  end
  df_name = "df_#{dataset.uri.split("/")[-1].split("?")[0]}"
  assign_dataframe(df_name,d_values,compounds,features)

  # set dataframe column types accordingly
  features.each_with_index do |f, idx|
    col = idx + 1 #R starts at 1
    feat = OpenTox::Feature.find(f,subjectid)
    # NOTE(review): the listing read `feat.[RDF.type]`, which does not parse;
    # `metadata` is assumed to be the lost receiver attribute - confirm
    nominal = feat.metadata[RDF.type].to_a.flatten.include?(OT.NominalFeature)
    r_cast = nominal ? "as.character" : "as.numeric"
    @r.eval "#{df_name}[,#{col}] <- #{r_cast}(#{df_name}[,#{col}])"
  end
  #@r.eval "head(#{df_name})"

  # store compounds, and features (including metainformation)
  @@feats[df_name] = {}
  features.each { |f| @@feats[df_name][f] = dataset.features[f] }
  df_name
end
#double_hist_plot(files, data1, data2, is_numerical, log = false, name1 = "first", name2 = "second", title = "title", xaxis = "x-values") ⇒ Object
Plots a double histogram. data1 and data2 are arrays with values, either numerical or categorical (string values); is_numerical is a boolean flag indicating the value type; log (only for numerical values) plots the logarithm of the values.
123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 |
# File 'lib/r-util.rb', line 123
# Plots two value distributions into one chart.
#
# files::        target file names, handed to plot_to_files
# data1, data2:: arrays with values, either numerical or categorical (strings)
# is_numerical:: true  -> two overlaid histograms sharing the same breaks
#                false -> grouped bar plot of the per-value counts
# log::          numerical data only: plot the logarithm of the values
# name1, name2:: labels of the two series
# title, xaxis:: plot title and x-axis label; backslashes and single quotes
#                are escaped before they are embedded in the R command
#
# Raises RuntimeError if +log+ is requested for categorical data.
def double_hist_plot(files, data1, data2, is_numerical, log=false, name1="first", name2="second", title="title", xaxis="x-values")
  LOGGER.debug("r-util> create double hist plot")
  if is_numerical
    # R helper, one statement per line (R needs newlines or ';' between statements).
    # The common breaks are computed on c(data1,data2): rbind() would recycle
    # the shorter vector when the two series differ in length.
    @r.eval "double_plot <- function(data1, data2, log=FALSE, names=c('data1','data2'), title='title', xlab='x-values')
    {
      if (log)
      {
        data1 <- log(data1)
        data2 <- log(data2)
        xlab = paste('logarithm of',xlab,sep=' ')
      }
      xlims <- round(c(min(c(min(data1),min(data2))),max(c(max(data1),max(data2)))))
      h <- hist(c(data1,data2),plot=F)
      h1 <- hist(data1,plot=F,breaks=h$breaks)
      h2 <- hist(data2,plot=F,breaks=h$breaks)
      xlims = c(min(h$breaks),max(h$breaks))
      ylims = c(0,max(h1$counts,h2$counts))
      xaxps = c(min(h$breaks),max(h$breaks),(length(h$breaks)-1))
      plot(h1, col=rgb(1,0,0,2/4), xlim=xlims, xaxp=xaxps, ylim=ylims,
        main=title, xlab=xlab, ylab='counts' )
      plot(h2, col=rgb(0,1,0,2/4), add=T )
      legend('topleft',names,lty=c(1,1),col=c('red','green'))
    }"
    @r.assign("data1",data1)
    @r.assign("data2",data2)
    @r.legend = [name1, name2]
  else
    raise "log not valid for categorial" if log
    # count every distinct value once per series, in sorted value order
    vals = (data1 + data2).uniq.sort
    @r.data1 = vals.collect{|e| data1.count(e)}
    @r.data2 = vals.collect{|e| data2.count(e)}
    @r.value_names = [name1, name2]
    @r.legend = vals
    @r.eval("data <- cbind(data1,data2)")
  end

  # escape \ and ' for use inside R single-quoted string literals
  r_title = title.to_s.gsub(/[\\']/) { |ch| "\\#{ch}" }
  r_xaxis = xaxis.to_s.gsub(/[\\']/) { |ch| "\\#{ch}" }
  plot_to_files(files) do |_file|
    if is_numerical
      @r.eval "double_plot(data1,data2,log=#{log ? "T":"F"},names=legend,title='#{r_title}',xlab='#{r_xaxis}')"
    else
      @r.eval("bp <- barplot(data, beside=T, names.arg=value_names,
                main='#{r_title}', col=sort(rep(2:3,length(legend))))") #legend.text=c(legend),
      @r.eval "text(bp, 0, round(data, 1),cex=1,pos=3)"
    end
  end
end
#feature_value_plot(files, dataset_uri1, dataset_uri2, dataset_name1, dataset_name2, features = nil, fast_plot = true, subjectid = nil, waiting_task = nil) ⇒ Object
Embeds the feature values of two datasets into 2D and plots them. fast_plot = true -> PCA, fast_plot = false -> SMACOF (iterative optimisation method)
80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 |
# File 'lib/r-util.rb', line 80
# Embeds the feature values of two datasets into 2D and plots them,
# coloured by dataset.
#
# files::         target file names, handed to plot_to_files
# dataset_uri1, dataset_uri2::   datasets to load via OpenTox::Dataset.find
# dataset_name1, dataset_name2:: labels handed to the R plot function
# features::      feature URIs to use; must be present in both datasets.
#                 Defaults to all features, in which case both datasets
#                 must have identical feature sets.
# fast_plot::     true -> PCA, false -> SMACOF (iterative optimisation
#                 method, needs the R package smacof)
# subjectid::     passed to the dataset / feature lookups
# waiting_task::  optional task object; progress(25/50/75) is reported on it
#
# Raises RuntimeError if smacof is requested but not installed, if a
# requested feature is missing, if the feature sets differ, or if fewer
# than two features would be embedded.
def feature_value_plot(files, dataset_uri1, dataset_uri2, dataset_name1, dataset_name2,
    features=nil, fast_plot=true, subjectid=nil, waiting_task=nil)

  raise "r-package smacof missing" if fast_plot==false && !package_installed?("smacof")
  LOGGER.debug("r-util> create feature value plot")
  d1 = OpenTox::Dataset.find(dataset_uri1,subjectid)
  d2 = OpenTox::Dataset.find(dataset_uri2,subjectid)
  if features
    [d1, d2].each do |d|
      features.each{|f| raise "feature not included" unless d.features.keys.include?(f)}
    end
  else
    keys1 = d1.features.keys.sort
    keys2 = d2.features.keys.sort
    raise "different\n#{keys1.to_yaml}\n#{keys2.to_yaml}" if keys1 != keys2
    features = d1.features.keys
  end
  # check the features that are actually embedded, not everything d1 happens to contain
  raise "at least two features needed" if features.size<2
  waiting_task.progress(25) if waiting_task

  # missing values are exported as 0 so the embedding sees numeric data only
  df1 = dataset_to_dataframe(d1,0,subjectid,features)
  df2 = dataset_to_dataframe(d2,0,subjectid,features)
  waiting_task.progress(50) if waiting_task

  @r.eval "df <- rbind(#{df1},#{df2})"
  # 0/1 marker per row: which dataset the row came from
  @r.eval "split <- c(rep(0,nrow(#{df1})),rep(1,nrow(#{df2})))"
  @r.names = [dataset_name1, dataset_name2]
  LOGGER.debug("r-util> - convert data to 2d")
  @r.eval "df.2d <- plot_pre_process(df, method='#{(fast_plot ? "pca" : "smacof")}')"
  waiting_task.progress(75) if waiting_task

  if fast_plot
    info = "main='PCA-Embedding of #{features.size} features',xlab='PC1',ylab='PC2'"
  else
    info = "main='SMACOF-Embedding of #{features.size} features',xlab='x',ylab='y'"
  end
  LOGGER.debug("r-util> - plot data")
  plot_to_files(files) do |_file|
    @r.eval "plot_split( df.2d, split, names, #{info})"
  end
end
#install_package(package) ⇒ Object
41 42 43 44 45 46 |
# File 'lib/r-util.rb', line 41
# Installs an R package from CRAN into PACKAGE_DIR unless
# package_installed? already reports it.
#
# package:: name of the R package
#
# After an installation attempt the cached package list (@r_packages) is
# pulled from R again, so that package_installed? sees the new package and
# later calls do not reinstall it.
#
# Returns nil if nothing had to be done, otherwise the result of the
# install command's @r.eval call.
def install_package( package )
  return if package_installed?(package)
  LOGGER.debug "r-util> installing r-package #{package} to #{PACKAGE_DIR}"
  result = @r.eval "install.packages('#{package}', repos='http://cran.r-project.org', lib='#{PACKAGE_DIR}')"
  # refresh the cache; a failed install simply leaves the list unchanged
  @r_packages = @r.pull "installed.packages()[,1]"
  result
end
#package_installed?(package) ⇒ Boolean
37 38 39 |
# File 'lib/r-util.rb', line 37
# Tells whether +package+ is contained in the cached list of installed
# R packages (@r_packages). The answer comes from the cache only; R is not
# queried by this method.
#
# package:: name of the R package
def package_installed?( package )
  known_packages = @r_packages
  known_packages.include?(package)
end
#paired_ttest(array1, array2, significance_level = 0.95) ⇒ Object
<0 -> array1 << array2; 0 -> no significant difference; >0 -> array1 >> array2
51 52 53 54 55 56 57 58 59 60 61 62 |
# File 'lib/r-util.rb', line 51
# Runs a paired t-test in R on two arrays of values.
#
# array1, array2::     the paired samples (converted with as.numeric in R);
#                      both must have the same number of elements
# significance_level:: confidence level; the difference counts as
#                      significant if the p-value is below 1-significance_level
#
# Returns 0 if there is no significant difference, otherwise the t statistic
# reported by R's t.test(v1, v2, paired=T), which is based on v1 - v2:
#   <0 -> array1 << array2
#   >0 -> array1 >> array2
#
# Raises ArgumentError if the arrays differ in size.
def paired_ttest(array1, array2, significance_level=0.95)
  # fail early in Ruby instead of sending unpaired data to R
  # NOTE(review): presumably a failing t.test would leave `ttest` from an
  # earlier call in the R session - verify RinRuby's behaviour on R errors
  unless array1.size == array2.size
    raise ArgumentError, "paired t-test needs arrays of equal size (#{array1.size} vs #{array2.size})"
  end
  @r.assign "v1",array1
  @r.assign "v2",array2
  @r.eval "ttest = t.test(as.numeric(v1),as.numeric(v2),paired=T)"
  t = @r.pull "ttest$statistic"
  p = @r.pull "ttest$p.value"
  (1-significance_level > p) ? t : 0
end
#quit_r ⇒ Object
25 26 27 28 29 30 31 |
# File 'lib/r-util.rb', line 25
# Shuts down the R interpreter and drops the handle.
#
# Best effort: an error raised while quitting is swallowed on purpose, but
# the handle is cleared in every case, so that #r never returns an
# interpreter whose shutdown was already attempted. Calling this method when
# no interpreter is present is a no-op.
#
# Returns nil.
def quit_r
  begin
    @r.quit if @r
  rescue StandardError
    # deliberately ignored: the session is being discarded anyway
  ensure
    @r = nil
  end
  nil
end
#r ⇒ Object
33 34 35 |
# File 'lib/r-util.rb', line 33
# Read access to the interpreter handle held in @r, e.g. to run R commands
# that have no wrapper method in this class.
def r
  return @r
end
#stratified_split(dataset, missing_values = "NA", pct = 0.3, subjectid = nil, seed = 42, split_features = nil) ⇒ Object
Splits a dataset into two datasets, stratified by the feature values. All features are taken into account unless <split_features> is given
175 176 177 178 179 180 181 182 183 184 185 |
# File 'lib/r-util.rb', line 175
# Splits a dataset into two datasets via the R function stratified_split.
#
# dataset::        loaded OpenTox::Dataset with at least one compound and
#                  one feature
# missing_values:: replacement for missing values, see dataset_to_dataframe
# pct::            handed to R as the +ratio+ argument
# subjectid::      passed through to the dataset conversion helpers
# seed::           seed for R's random number generator (set.seed)
# split_features:: features handed to dataset_to_dataframe; nil -> all
#
# Returns the result of split_to_datasets.
# Raises RuntimeError if +dataset+ is not a loaded OpenTox::Dataset.
def stratified_split( dataset, missing_values="NA", pct=0.3, subjectid=nil, seed=42, split_features=nil )
  loaded = dataset.is_a?(OpenTox::Dataset) && dataset.compounds.size>0 && dataset.features.size>0
  raise "not a loaded ot-dataset" unless loaded
  LOGGER.debug("r-util> apply stratified split to #{dataset.uri}")
  df = dataset_to_dataframe( dataset, missing_values, subjectid, split_features )
  [
    "set.seed(#{seed})",
    "split <- stratified_split(#{df}, ratio=#{pct})"
  ].each { |command| @r.eval command }
  # reverse 1s and 0s, as 1 means selected, but 0 will be first set
  inverted = @r.pull('split').collect { |flag| 1-flag.to_i }
  split_to_datasets( df, inverted, subjectid )
end