Module: OpenTox::Algorithm::FeatureSelection
- Includes:
- OpenTox::Algorithm
- Defined in:
- lib/algorithm.rb
Instance Attribute Summary
Attributes included from OpenTox
Class Method Summary collapse
-
.rfe(params) ⇒ String
Recursive Feature Elimination using caret.
Methods included from OpenTox::Algorithm
effect, gauss, get_cdk_descriptors, get_jl_descriptors, get_ob_descriptors, isnull_or_singular?, load_ds_csv, min_frequency, numeric?, pc_descriptors, #run, sum_size, #to_rdfxml, zero_variance?
Methods included from OpenTox
#add_metadata, all, #delete, #initialize, #load_metadata, sign_in, text_to_html, #to_rdfxml
Class Method Details
.rfe(params) ⇒ String
Recursive Feature Elimination using caret
516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 |
# File 'lib/algorithm.rb', line 516

# Recursive Feature Elimination using the R package caret, driven through RinRuby.
#
# The embedded R script reads the activity CSV and the feature CSV, merges them
# on the "SMILES" column, drops feature columns that are all-NA, contain
# infinite values or have zero variance, scales/centers the rest (either
# removing rows with missing values or imputing them with knnImpute), runs
# caret's rfe with random-forest functions (rfFuncs) and writes the selected
# feature columns, together with SMILES, to the result file.
#
# @param params [Hash] options read by this method:
#   :ds_csv_file        - path of the activity CSV (must contain a SMILES column)
#   :prediction_feature - name of the target column in the activity CSV
#   :fds_csv_file       - path of the feature CSV (must contain a SMILES column)
#   :del_missing        - pass literal true to delete rows with missing values;
#                         any other value selects knn imputation
# @return [String] path of the CSV written by R: :fds_csv_file with the first
#   occurrence of "rfe_" replaced by "rfe_R_". If the name does not contain
#   "rfe_", the result path equals the input path and R overwrites that file.
# @raise [ArgumentError] if one of the three required parameters is missing or empty
def self.rfe(params)
  # Validate before starting R, so a bad call neither spawns an R process nor
  # hands an empty file name to read.csv.
  ds_csv_file, prediction_feature, fds_csv_file =
    [:ds_csv_file, :prediction_feature, :fds_csv_file].map do |key|
      value = params[key].to_s
      raise ArgumentError, "rfe: missing required parameter :#{key}" if value.empty?
      value
    end

  r_result_file = fds_csv_file.sub("rfe_", "rfe_R_")

  # Local session (not a module-level @r) so calls do not share state.
  # NOTE(review): the two false arguments are presumably RinRuby's echo and
  # interactive flags - confirm against the RinRuby version in use.
  r = RinRuby.new(false, false)
  r.ds_csv_file = ds_csv_file
  r.prediction_feature = prediction_feature
  r.fds_csv_file = fds_csv_file
  # Strict comparison kept on purpose: only a literal true enables row deletion.
  r.del_missing = params[:del_missing] == true ? 1 : 0
  r.f_fds_r = r_result_file

  # need packs 'randomForest', 'RANN'
  r.eval <<-EOR
    suppressPackageStartupMessages(library('caret'))
    suppressPackageStartupMessages(library('randomForest'))
    suppressPackageStartupMessages(library('RANN'))
    suppressPackageStartupMessages(library('doMC'))
    registerDoMC()
    set.seed(1)

    acts = read.csv(ds_csv_file, check.names=F)
    feats = read.csv(fds_csv_file, check.names=F)
    ds = merge(acts, feats, by="SMILES") # duplicates features for duplicate SMILES :-)

    features = ds[,(dim(acts)[2]+1):(dim(ds)[2])]
    y = ds[,which(names(ds) == prediction_feature)]

    # assumes a data matrix 'features' and a vector 'y' of target values
    row.names(features)=NULL

    # features with all values missing removed
    na_col = names ( which ( apply ( features, 2, function(x) all ( is.na ( x ) ) ) ) )
    features = features[,!names(features) %in% na_col]

    # features with infinite values removed
    inf_col = names ( which ( apply ( features, 2, function(x) any ( is.infinite ( x ) ) ) ) )
    features = features[,!names(features) %in% inf_col]

    # features with zero variance removed
    zero_var = names ( which ( apply ( features, 2, function(x) var(x, na.rm=T) ) == 0 ) )
    features = features[,!names(features) %in% zero_var]

    pp = NULL
    if (del_missing) {
      # needed if rows should be removed
      na_ids = apply ( features,1,function(x) any ( is.na ( x ) ) )
      features = features[!na_ids,]
      y = y[!na_ids]
      pp = preProcess(features, method=c("scale", "center"))
    } else {
      # Use imputation if NA's random (only then!)
      pp = preProcess(features, method=c("scale", "center", "knnImpute"))
    }
    features = predict(pp, features)

    # features with nan values removed (sometimes preProcess return NaN values)
    nan_col = names ( which ( apply ( features, 2, function(x) any ( is.nan ( x ) ) ) ) )
    features = features[,!names(features) %in% nan_col]

    # determine subsets
    subsets = dim(features)[2]*c(0.3, 0.32, 0.34, 0.36, 0.38, 0.4, 0.42, 0.44, 0.46, 0.48, 0.5, 0.52, 0.54, 0.56, 0.58, 0.6, 0.62, 0.64, 0.66, 0.68, 0.7)
    #subsets = dim(features)[2]*c(0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7)
    #subsets = c(2,3,4,5,7,10,subsets)
    #subsets = c(2,3,4,5,7,10,13,16,19,22,25,28,30)
    subsets = unique(sort(round(subsets)))
    subsets = subsets[subsets<=dim(features)[2]]
    subsets = subsets[subsets>1]

    # Recursive feature elimination
    rfProfile = rfe( x=features, y=y, rfeControl=rfeControl(functions=rfFuncs, number=150), sizes=subsets)

    # read existing dataset and select most useful features
    csv=feats[,c("SMILES", rfProfile$optVariables)]
    write.csv(x=csv,file=f_fds_r, row.names=F, quote=F, na='')
  EOR

  r_result_file
ensure
  # Always shut the R process down, even when the script raised; r is nil when
  # validation failed before the session was created.
  r.quit if r
end