Class: OpenTox::Transform::PCA

Inherits:
Object
  • Object
show all
Defined in:
lib/transform.rb

Overview

Principal Components Analysis.

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(data_matrix, compression = 0.05, maxcols = (1.0/0.0)) ⇒ GSL::Matrix

Creates a transformed dataset as GSL::Matrix.

Parameters:

  • Data (GSL::Matrix)

    matrix.

  • Compression (Float)

    ratio from [0,1], default 0.05.



112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
# File 'lib/transform.rb', line 112

# Builds a PCA model for +data_matrix+ and stores the projected data.
#
# Works on a clone of +data_matrix+ and proceeds in four steps:
# 1. keeps only the columns for which Algorithm::zero_variance? is false;
#    their original column indices are recorded in @cols,
# 2. runs every kept column through OpenTox::Transform::AutoScale and
#    multiplies the scaled values by the column's stdev (the per-column means
#    and autoscalers are kept in @mean and @autoscaler),
# 3. runs Statsample::Factor::PCA on the correlation matrix of those columns,
# 4. picks eigenvectors by cumulative eigenvalue sum and projects the data
#    onto them (@eigenvector_matrix, @data_transformed_matrix).
#
# Failures do not propagate: any StandardError raised on the way (including
# the "needs at least two dimensions" and "PCA failed!" errors raised here)
# is only written to LOGGER at debug level, so the instance may be left
# partially initialised. Exceptions outside StandardError (Interrupt,
# SystemExit, ...) are intentionally not intercepted.
#
# @param data_matrix [GSL::Matrix] input data matrix; it is cloned, not modified
# @param compression [Float] ratio from [0,1], default 0.05; an eigenvector is
#   kept while the cumulative eigenvalue sum is <= (1 - compression) * number
#   of kept columns (the first eigenvector is always a candidate)
# @param maxcols [Numeric] upper bound on the number of eigenvectors kept;
#   defaults to infinity (no bound)
def initialize(data_matrix, compression=0.05, maxcols=(1.0/0.0))
  @data_matrix = data_matrix.clone
  @compression = compression.to_f
  @mean = []
  @autoscaler = []
  @cols = []
  @maxcols = maxcols

  # Objective Feature Selection
  raise "Error! PCA needs at least two dimensions." if data_matrix.size2 < 2
  row_count = @data_matrix.size1
  @data_matrix_selected = nil
  @data_matrix.size2.times do |i|
    column = @data_matrix.col(i)
    next if Algorithm::zero_variance?(column.to_a)
    # Every kept column becomes a row_count x 1 matrix that is appended on the right.
    single_column = GSL::Matrix.alloc(column.to_a, row_count, 1)
    @data_matrix_selected = @data_matrix_selected.nil? ? single_column : @data_matrix_selected.horzcat(single_column)
    @cols << i
  end
  raise "Error! PCA needs at least two dimensions." if (@data_matrix_selected.nil? || @data_matrix_selected.size2 < 2)

  # PCA uses internal centering on 0
  @data_matrix_scaled = GSL::Matrix.alloc(@data_matrix_selected.size1, @cols.size)
  @cols.size.times do |i|
    as = OpenTox::Transform::AutoScale.new(@data_matrix_selected.col(i))
    @data_matrix_scaled.col(i)[0..row_count-1] = as.vs * as.stdev # re-adjust by stdev
    @mean << as.mean
    @autoscaler << as
  end

  # PCA
  data_matrix_hash = {}
  @cols.size.times { |i| data_matrix_hash[i] = @data_matrix_scaled.col(i).to_scale }
  dataset_hash = data_matrix_hash.to_dataset # see http://goo.gl/7XcW9
  cor_matrix = Statsample::Bivariate.correlation_matrix(dataset_hash)
  pca = Statsample::Factor::PCA.new(cor_matrix)

  # Select best eigenvectors
  eigenvalues = pca.eigenvalues # read once instead of once per loop iteration
  raise "PCA failed!" if eigenvalues.any? { |ev| ev.nan? }
  # @eigenvalue_sums[i] is the sum of eigenvalues 0..i
  @eigenvalue_sums = (0..@cols.size-1).map { |i| eigenvalues[0..i].inject { |sum, ev| sum + ev } }
  threshold = (1.0 - @compression) * @cols.size
  eigenvectors_selected = []
  pca.eigenvectors.each_with_index do |ev, i|
    next unless @eigenvalue_sums[i] <= threshold || eigenvectors_selected.empty?
    eigenvectors_selected << ev.to_a unless @maxcols <= eigenvectors_selected.size
  end
  # Allocated with one eigenvector per row, then transposed.
  @eigenvector_matrix = GSL::Matrix.alloc(eigenvectors_selected.flatten, eigenvectors_selected.size, @cols.size).transpose
  @data_transformed_matrix = (@eigenvector_matrix.transpose * @data_matrix_scaled.transpose).transpose
rescue StandardError => e
  LOGGER.debug "#{e.class}: #{e.message}"
  LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
end

Instance Attribute Details

#autoscaler ⇒ Object

Returns the value of attribute autoscaler.



105
106
107
# File 'lib/transform.rb', line 105

# Reader for @autoscaler: the array to which the constructor appends one
# OpenTox::Transform::AutoScale instance per retained (non-zero-variance)
# column, in the same order as the retained columns.
def autoscaler
  @autoscaler
end

#data_matrix ⇒ Object

Returns the value of attribute data_matrix.



105
106
107
# File 'lib/transform.rb', line 105

# Reader for @data_matrix: the clone of the input matrix that the constructor
# made before any column selection or scaling.
def data_matrix
  @data_matrix
end

#data_transformed_matrix ⇒ Object

Returns the value of attribute data_transformed_matrix.



105
106
107
# File 'lib/transform.rb', line 105

# Reader for @data_transformed_matrix: the constructor's scaled data projected
# onto the selected eigenvectors. It is assigned in the constructor's last
# step, so it is nil when the constructor logged an error before reaching it.
def data_transformed_matrix
  @data_transformed_matrix
end

#eigenvalue_sums ⇒ Object

Returns the value of attribute eigenvalue_sums.



105
106
107
# File 'lib/transform.rb', line 105

# Reader for @eigenvalue_sums: an array built by the constructor whose i-th
# entry is the sum of the PCA eigenvalues 0..i (one entry per retained column).
def eigenvalue_sums
  @eigenvalue_sums
end

#eigenvector_matrix ⇒ Object

Returns the value of attribute eigenvector_matrix.



105
106
107
# File 'lib/transform.rb', line 105

# Reader for @eigenvector_matrix: the matrix the constructor builds from the
# selected eigenvectors (allocated as selected-count x retained-column-count,
# then transposed). It is used by both #transform and #restore.
def eigenvector_matrix
  @eigenvector_matrix
end

Instance Method Details

#restore ⇒ GSL::Matrix

Restores data in the original feature space (possibly with compression loss).

Parameters:

  • None. The method takes no arguments; it works on the transformed

    data matrix (GSL::Matrix) stored by the constructor.

Returns:

  • (GSL::Matrix)

    Data matrix.



200
201
202
203
204
205
206
207
208
209
210
211
212
# File 'lib/transform.rb', line 200

# Maps the stored transformed data back into the space of the retained
# columns (possibly with compression loss) by multiplying with
# @eigenvector_matrix and adding each column's stored mean back.
#
# Takes no arguments: it always works on @data_transformed_matrix as computed
# by the constructor. The result has one column per retained column (@cols),
# not per column of the original input.
#
# Any StandardError is written to LOGGER at debug level instead of being
# raised; in that case the return value is whatever the last LOGGER.debug call
# returns, not a matrix. Exceptions outside StandardError (Interrupt,
# SystemExit, ...) are intentionally not intercepted.
#
# @return [GSL::Matrix] restored data matrix
def restore
  data_matrix_restored = (@eigenvector_matrix * @data_transformed_matrix.transpose).transpose # reverse pca
  # reverse scaling: only the mean is added back, because the constructor
  # multiplied the autoscaled values by stdev again before running the PCA
  (0..@cols.size-1).each { |i|
    data_matrix_restored.col(i)[0..data_matrix_restored.size1-1] += @mean[i]
  }
  data_matrix_restored
rescue StandardError => e
  LOGGER.debug "#{e.class}: #{e.message}"
  LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
end

#transform(values) ⇒ GSL::Matrix

Transforms data to feature space found by PCA.

Parameters:

  • Data (GSL::Matrix)

    matrix.

Returns:

  • (GSL::Matrix)

    Transformed data matrix.



181
182
183
184
185
186
187
188
189
190
191
192
193
194
# File 'lib/transform.rb', line 181

# Projects +values+ into the feature space found by the constructor's PCA.
#
# Only the columns whose indices are stored in @cols are read. Each one is
# passed through the matching autoscaler's #transform and multiplied by that
# autoscaler's stdev (the same re-adjustment the constructor applies), then
# the result is multiplied with @eigenvector_matrix.
#
# Any StandardError (including the "Too few columns" error raised here) is
# written to LOGGER at debug level instead of being raised; in that case the
# return value is whatever the last LOGGER.debug call returns, not a matrix.
# Exceptions outside StandardError (Interrupt, SystemExit, ...) are
# intentionally not intercepted.
#
# @param values [GSL::Matrix] data matrix; needs more than @cols.max columns
# @return [GSL::Matrix] transformed data matrix
def transform(values)
  vs = values.clone
  # @cols holds 0-based column indices, so index @cols.max requires at least
  # @cols.max + 1 columns.
  raise "Error! Too few columns for transformation." if vs.size2 <= @cols.max
  data_matrix_scaled = GSL::Matrix.alloc(vs.size1, @cols.size)
  @cols.each_with_index { |i,j|
    data_matrix_scaled.col(j)[0..data_matrix_scaled.size1-1] = @autoscaler[j].transform(vs.col(i).to_a) * @autoscaler[j].stdev
  }
  (@eigenvector_matrix.transpose * data_matrix_scaled.transpose).transpose
rescue StandardError => e
  LOGGER.debug "#{e.class}: #{e.message}"
  LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
end