Class: ClusterKit::Dimensionality::PCA

Inherits:
Object
  • Object
show all
Defined in:
lib/clusterkit/dimensionality/pca.rb

Overview

Principal Component Analysis using SVD. PCA is a linear dimensionality reduction technique that finds the directions of maximum variance in the data.

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(n_components: 2) ⇒ PCA

Initialize PCA

Parameters:

  • n_components (Integer) (defaults to: 2)

    Number of principal components to keep



17
18
19
20
# File 'lib/clusterkit/dimensionality/pca.rb', line 17

# Build a PCA model that has not been fitted yet.
#
# @param n_components [Integer] number of principal components to keep
def initialize(n_components: 2)
  @n_components = n_components
  @fitted = false # flipped to true once #fit or #fit_transform completes
end

Instance Attribute Details

#components ⇒ Object (readonly)

Returns the value of attribute components.



13
14
15
# File 'lib/clusterkit/dimensionality/pca.rb', line 13

# Reader for the principal components stored by #fit / #fit_transform.
#
# @return [Object] the value held in @components (nil until a fit has run)
def components
  @components
end

#explained_variance ⇒ Array (readonly)

Get the amount of variance explained by each component

Returns:

  • (Array)

    Explained variance for each component

Raises:

  • (RuntimeError)


135
136
137
# File 'lib/clusterkit/dimensionality/pca.rb', line 135

# Get the amount of variance explained by each component.
#
# The documented contract is that this raises on an unfitted model; without
# the guard the reader silently returned nil before #fit had run.
#
# @return [Array] explained variance for each component
# @raise [RuntimeError] if the model has not been fitted yet
def explained_variance
  raise RuntimeError, "Model must be fitted first" unless fitted?

  @explained_variance
end

#explained_variance_ratio ⇒ Array (readonly)

Get the fraction of the total variance explained by each component

Returns:

  • (Array)

    Explained variance ratio for each component

Raises:

  • (RuntimeError)


142
143
144
# File 'lib/clusterkit/dimensionality/pca.rb', line 142

# Get the fraction of the total variance explained by each component.
#
# The documented contract is that this raises on an unfitted model; without
# the guard the reader silently returned nil before #fit had run.
#
# @return [Array] explained variance ratio for each component
# @raise [RuntimeError] if the model has not been fitted yet
def explained_variance_ratio
  raise RuntimeError, "Model must be fitted first" unless fitted?

  @explained_variance_ratio
end

#mean ⇒ Object (readonly)

Returns the value of attribute mean.



13
14
15
# File 'lib/clusterkit/dimensionality/pca.rb', line 13

# Reader for the mean stored by #fit / #fit_transform and reused when
# centering data in #transform.
#
# @return [Object] the value held in @mean (nil until a fit has run)
def mean
  @mean
end

#n_components ⇒ Object (readonly)

Returns the value of attribute n_components.



13
14
15
# File 'lib/clusterkit/dimensionality/pca.rb', line 13

# Reader for the component count given to the constructor.
#
# @return [Object] the value held in @n_components
def n_components
  @n_components
end

Instance Method Details

#cumulative_explained_variance_ratio ⇒ Array

Get cumulative explained variance ratio

Returns:

  • (Array)

    Cumulative sum of explained variance ratios

Raises:

  • (RuntimeError)


149
150
151
152
153
154
155
156
157
158
159
# File 'lib/clusterkit/dimensionality/pca.rb', line 149

# Running total of the explained variance ratios, one entry per component.
#
# @return [Array] cumulative sums of @explained_variance_ratio, in stored order
# @raise [RuntimeError] if the model has not been fitted yet
def cumulative_explained_variance_ratio
  raise "Model must be fitted first" unless fitted?

  # Plain left-to-right accumulation starting from 0.0; each block result is
  # the total so far, which becomes that component's entry.
  running_total = 0.0
  @explained_variance_ratio.map { |ratio| running_total += ratio }
end

#fit(data) ⇒ self

Fit the PCA model

Parameters:

  • data (Array)

    2D array of data points (n_samples × n_features)

Returns:

  • (self)

    Returns self for method chaining



25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# File 'lib/clusterkit/dimensionality/pca.rb', line 25

# Fit the PCA model.
#
# Centers the data on the computed mean, runs SVD on the centered data and
# stores the components, singular values and explained-variance figures on
# the instance.
#
# @param data [Array] 2D array of data points (n_samples × n_features)
# @return [self] the fitted model, for method chaining
def fit(data)
  validate_data(data)

  @mean = calculate_mean(data)
  centered = center_data(data, @mean)

  # Fitting only needs the singular values and the component rows; the first
  # element returned by perform_svd is unused here.
  _u, singular_values, vt = perform_svd(centered)
  @components = vt
  @singular_values = singular_values

  # Explained variance per component: singular_value² / (n_samples - 1).
  sample_count = data.size.to_f
  denominator = sample_count - 1
  @explained_variance = singular_values.map { |sv| (sv**2) / denominator }

  total = calculate_total_variance(centered, sample_count)
  @explained_variance_ratio = @explained_variance.map { |variance| variance / total }

  @fitted = true
  self
end

#fit_transform(data) ⇒ Array

Fit the model and transform the data in one step

Parameters:

  • data (Array)

    2D array of data points

Returns:

  • (Array)

    Transformed data



72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# File 'lib/clusterkit/dimensionality/pca.rb', line 72

# Fit the model and return the training data projected into component space.
#
# Performs the same fitting steps as #fit (mean, SVD, explained variance),
# then builds the projection directly from the SVD factors as U * S.
#
# @param data [Array] 2D array of data points
# @return [Array] transformed data, one row per input sample
def fit_transform(data)
  validate_data(data)

  @mean = calculate_mean(data)
  centered = center_data(data, @mean)

  u, singular_values, vt = perform_svd(centered)
  @components = vt
  @singular_values = singular_values

  # Explained variance per component: singular_value² / (n_samples - 1).
  sample_count = data.size.to_f
  denominator = sample_count - 1
  @explained_variance = singular_values.map { |sv| (sv**2) / denominator }

  total = calculate_total_variance(centered, sample_count)
  @explained_variance_ratio = @explained_variance.map { |variance| variance / total }

  @fitted = true

  # U * S: entry i of every row of U is scaled by singular value i.
  u.map do |row|
    row.map.with_index { |entry, idx| entry * singular_values[idx] }
  end
end

#fitted? ⇒ Boolean

Check if model has been fitted

Returns:

  • (Boolean)

    True if fitted



163
164
165
# File 'lib/clusterkit/dimensionality/pca.rb', line 163

# Report whether the model has been fitted.
#
# @return [Boolean] false after construction, true once #fit or
#   #fit_transform has completed
def fitted?
  @fitted
end

#inverse_transform(data) ⇒ Array

Inverse transform - reconstruct data from principal components

Parameters:

  • data (Array)

    Transformed data in PC space

Returns:

  • (Array)

    Reconstructed data in original space

Raises:

  • (RuntimeError)


111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# File 'lib/clusterkit/dimensionality/pca.rb', line 111

# Inverse transform - reconstruct data from principal components.
#
# Each sample is multiplied by the stored components and the stored mean is
# added back. A sample may carry fewer values than there are components; in
# that case only the leading components contribute.
#
# @param data [Array] transformed data in PC space, one array of scores per sample
# @return [Array] reconstructed data in original space
# @raise [RuntimeError] if the model has not been fitted yet
# @raise [ArgumentError] if a sample has more values than there are stored components
def inverse_transform(data)
  raise RuntimeError, "Model must be fitted before inverse_transform" unless fitted?

  component_count = @components.size

  data.map do |sample|
    # Without this check an oversized sample indexes past the last component
    # and fails with an opaque NoMethodError on nil.
    if sample.size > component_count
      raise ArgumentError,
            "Sample has #{sample.size} values but the model has only #{component_count} components"
    end

    # Reconstruct: sample × components, accumulated one component at a time.
    reconstructed_sample = Array.new(@mean.size, 0.0)
    sample.each_with_index do |score, i|
      @components[i].each_with_index do |weight, j|
        reconstructed_sample[j] += score * weight
      end
    end

    # Add back the mean
    reconstructed_sample.zip(@mean).map { |value, mu| value + mu }
  end
end

#transform(data) ⇒ Array

Transform data using the fitted PCA model

Parameters:

  • data (Array)

    2D array of data points

Returns:

  • (Array)

    Transformed data in principal component space

Raises:

  • (RuntimeError)


57
58
59
60
61
62
63
64
65
66
67
# File 'lib/clusterkit/dimensionality/pca.rb', line 57

# Transform data using the fitted PCA model.
#
# @param data [Array] 2D array of data points
# @return [Array] whatever project_data returns for the centered data and
#   the stored components (documented as data in principal component space)
# @raise [RuntimeError] if the model has not been fitted yet
def transform(data)
  raise "Model must be fitted before transform" unless fitted?

  validate_data(data)

  # Shift by the mean learned during fitting, then project onto the stored
  # components (centered_data × components.T).
  shifted = center_data(data, @mean)
  project_data(shifted, @components)
end