Module: OpenTox::Algorithm::FeatureSelection

Includes:
OpenTox::Algorithm
Defined in:
lib/algorithm.rb

Instance Attribute Summary

Attributes included from OpenTox

#metadata, #uri

Class Method Summary collapse

Methods included from OpenTox::Algorithm

effect, gauss, get_cdk_descriptors, get_jl_descriptors, get_ob_descriptors, isnull_or_singular?, load_ds_csv, min_frequency, numeric?, pc_descriptors, #run, sum_size, #to_rdfxml, zero_variance?

Methods included from OpenTox

#add_metadata, all, #delete, #initialize, #load_metadata, sign_in, text_to_html, #to_rdfxml

Class Method Details

.rfe(params) ⇒ String

Recursive Feature Elimination using caret

Parameters:

  • params (Hash)

    required keys: ds_csv_file, prediction_feature, fds_csv_file (dataset CSV file, prediction feature column name, and feature dataset CSV file); optional key: del_missing (delete rows with missing values).

Returns:

  • (String)

    feature dataset CSV file composed of selected features.



516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
# File 'lib/algorithm.rb', line 516

# Recursive Feature Elimination using the R package caret.
#
# Starts a dedicated R session through RinRuby, exports the parameters as R
# variables, runs the R script below and always shuts the R session down
# again (also when the R call raises), so no R process is left behind per
# call.
#
# The R script merges the activity and feature CSV files on the "SMILES"
# column, drops unusable feature columns (all-NA, infinite, zero variance,
# NaN after preprocessing), scales/centers the rest, runs caret's rfe() and
# writes the "SMILES" column plus the selected features to the result file.
#
# @param [Hash] params required keys: :ds_csv_file, :prediction_feature,
#   :fds_csv_file (dataset CSV file, prediction feature column name, and
#   feature dataset CSV file), optional: :del_missing (rows with missing
#   values are deleted only if this is exactly +true+; otherwise missing
#   values are imputed with knnImpute).
# @return [String] result CSV file name: params[:fds_csv_file] with the first
#   occurrence of "rfe_" replaced by "rfe_R_".
def self.rfe(params)
  # Local session instead of a shared module-level variable: every call owns
  # its R process and is responsible for terminating it (see ensure).
  r = RinRuby.new(false,false)
  r.ds_csv_file = params[:ds_csv_file].to_s
  r.prediction_feature = params[:prediction_feature].to_s
  r.fds_csv_file = params[:fds_csv_file].to_s
  # R receives a numeric flag; only a literal true enables row deletion
  r.del_missing = params[:del_missing] == true ? 1 : 0
  r_result_file = params[:fds_csv_file].sub("rfe_", "rfe_R_")
  r.f_fds_r = r_result_file.to_s

  # need packs 'randomForest', 'RANN'
  # NOTE: every feature subsetting step uses drop=FALSE so that 'features'
  # stays a data frame even if only a single column (or row) survives.
  r.eval <<-EOR
    suppressPackageStartupMessages(library('caret'))
    suppressPackageStartupMessages(library('randomForest'))
    suppressPackageStartupMessages(library('RANN'))
    suppressPackageStartupMessages(library('doMC'))
    registerDoMC()
    set.seed(1)

    acts = read.csv(ds_csv_file, check.names=F)
    feats = read.csv(fds_csv_file, check.names=F)
    ds = merge(acts, feats, by="SMILES") # duplicates features for duplicate SMILES :-)

    features = ds[,(dim(acts)[2]+1):(dim(ds)[2]), drop=FALSE]
    y = ds[,which(names(ds) == prediction_feature)]

    # assumes a data matrix 'features' and a vector 'y' of target values
    row.names(features)=NULL

    # features with all values missing removed
    na_col = names ( which ( apply ( features, 2, function(x) all ( is.na ( x ) ) ) ) )
    features = features[,!names(features) %in% na_col, drop=FALSE]

    # features with infinite values removed
    inf_col = names ( which ( apply ( features, 2, function(x) any ( is.infinite ( x ) ) ) ) )
    features = features[,!names(features) %in% inf_col, drop=FALSE]

    # features with zero variance removed
    zero_var = names ( which ( apply ( features, 2, function(x) var(x, na.rm=T) ) == 0 ) )
    features = features[,!names(features) %in% zero_var, drop=FALSE]

    pp = NULL
    if (del_missing) {
      # needed if rows should be removed
      na_ids = apply ( features,1,function(x) any ( is.na ( x ) ) )
      features = features[!na_ids,, drop=FALSE]
      y = y[!na_ids]
      pp = preProcess(features, method=c("scale", "center"))
    } else {
      # Use imputation if NA's random (only then!)
      pp = preProcess(features, method=c("scale", "center", "knnImpute"))
    }
    features = predict(pp, features)

    # features with nan values removed (sometimes preProcess return NaN values)
    nan_col = names ( which ( apply ( features, 2, function(x) any ( is.nan ( x ) ) ) ) )
    features = features[,!names(features) %in% nan_col, drop=FALSE]

    # determine subsets
    subsets = dim(features)[2]*c(0.3, 0.32, 0.34, 0.36, 0.38, 0.4, 0.42, 0.44, 0.46, 0.48, 0.5, 0.52, 0.54, 0.56, 0.58, 0.6, 0.62, 0.64, 0.66, 0.68, 0.7)
    #subsets = dim(features)[2]*c(0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7)
    #subsets = c(2,3,4,5,7,10,subsets)
    #subsets = c(2,3,4,5,7,10,13,16,19,22,25,28,30)
    subsets = unique(sort(round(subsets)))
    subsets = subsets[subsets<=dim(features)[2]]
    subsets = subsets[subsets>1]

    # Recursive feature elimination
    rfProfile = rfe( x=features, y=y, rfeControl=rfeControl(functions=rfFuncs, number=150), sizes=subsets)

    # read existing dataset and select most useful features
    csv=feats[,c("SMILES", rfProfile$optVariables)]
    write.csv(x=csv,file=f_fds_r, row.names=F, quote=F, na='')
  EOR
  r_result_file
ensure
  # free the R process, whether or not the script succeeded
  r.quit if r
end