Module: OpenTox::Algorithm::FeatureSelection

Includes:
OpenTox::Algorithm
Defined in:
lib/algorithm.rb

Instance Attribute Summary

Attributes included from OpenTox

#metadata, #uri

Class Method Summary collapse

Methods included from OpenTox::Algorithm

effect, gauss, get_pc_descriptors, isnull_or_singular?, load_ds_csv, min_frequency, numeric?, pc_descriptors, #run, sum_size, #to_rdfxml, zero_variance?

Methods included from OpenTox

#add_metadata, all, #delete, #initialize, #load_metadata, sign_in, text_to_html, #to_rdfxml

Class Method Details

.rfe(params) ⇒ String

Recursive Feature Elimination using caret

Parameters:

  • params (Hash)

    required keys: ds_csv_file (dataset CSV file), prediction_feature (name of the prediction feature column), fds_csv_file (feature dataset CSV file); optional key: del_missing (if true, rows with missing values are deleted; otherwise missing values are imputed).

Returns:

  • (String)

    path of the feature dataset CSV file containing only the selected features.



451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
# File 'lib/algorithm.rb', line 451

# Recursive Feature Elimination (RFE) using the R package caret.
#
# Starts a dedicated R session through RinRuby, passes it the input file
# names, runs the R script below and closes the session again. The R script
#   1. merges the activity CSV and the feature CSV on their "SMILES" column,
#   2. scales and centers the feature columns, either after dropping rows
#      that contain NA (del_missing) or with k-nearest-neighbour imputation,
#   3. runs caret's rfe() with random-forest helper functions (rfFuncs),
#   4. writes "SMILES" plus the selected feature columns to the result CSV.
#
# @param [Hash] params
#   required keys:
#     :ds_csv_file        - dataset (activity) CSV file
#     :prediction_feature - name of the prediction feature column
#     :fds_csv_file       - feature dataset CSV file
#   optional key:
#     :del_missing        - only the literal value +true+ deletes rows with
#                           missing values; anything else selects imputation
# @return [String] name of the result CSV file, i.e. :fds_csv_file with its
#   first occurrence of "rfe_" replaced by "rfe_R_"
def self.rfe(params)
  # Coerce once, so that String#sub below also works for non-String path objects.
  fds_csv_file = params[:fds_csv_file].to_s
  r_result_file = fds_csv_file.sub("rfe_", "rfe_R_")

  # A local (not shared module state) session, so that the ensure clause
  # always closes exactly the R session this call has opened.
  r = RinRuby.new(false, false)
  begin
    r.ds_csv_file = params[:ds_csv_file].to_s
    r.prediction_feature = params[:prediction_feature].to_s
    r.fds_csv_file = fds_csv_file
    # Strict comparison on purpose: only a literal true switches row deletion on.
    r.del_missing = params[:del_missing] == true ? 1 : 0
    r.f_fds_r = r_result_file

    # need packs 'randomForest', 'RANN'
    # Non-interpolating heredoc: '#', '$' and quotes reach R unchanged.
    r.eval <<-'EOR'
    set.seed(1)
    suppressPackageStartupMessages(library('caret'))
    suppressPackageStartupMessages(library('randomForest'))
    suppressPackageStartupMessages(library('RANN'))
    suppressPackageStartupMessages(library('doMC'))
    registerDoMC()

    acts = read.csv(ds_csv_file, check.names=F)
    feats = read.csv(fds_csv_file, check.names=F)
    ds = merge(acts, feats, by="SMILES") # duplicates features for duplicate SMILES :-)

    features = ds[,(dim(acts)[2]+1):(dim(ds)[2])]
    y = ds[,which(names(ds) == prediction_feature)]

    # assumes a data matrix 'features' and a vector 'y' of target values
    row.names(features)=NULL

    pp = NULL
    if (del_missing) {
      # needed if rows should be removed
      na_ids = apply(features,1,function(x)any(is.na(x)))
      features = features[!na_ids,]
      y = y[!na_ids]
      pp = preProcess(features, method=c("scale", "center"))
    } else {
      # Use imputation if NA's random (only then!)
      pp = preProcess(features, method=c("scale", "center", "knnImpute"))
    }
    features = predict(pp, features)

    # determine subsets
    subsets = dim(features)[2]*c(0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7)
    subsets = c(2,3,4,5,7,10,subsets)
    subsets = unique(sort(round(subsets)))
    subsets = subsets[subsets<=dim(features)[2]]
    subsets = subsets[subsets>1]

    # Recursive feature elimination
    rfProfile = rfe( x=features, y=y, rfeControl=rfeControl(functions=rfFuncs, number=50), sizes=subsets)

    # read existing dataset and select most useful features
    csv=feats[,c("SMILES", rfProfile$optVariables)]
    write.csv(x=csv,file=f_fds_r, row.names=F, quote=F, na='')
    EOR
  ensure
    # Free the R process even when the script raises.
    r.quit
  end

  r_result_file
end