Class: ClusterKit::Clustering::HDBSCAN
- Inherits: Object
- Inheritance hierarchy:
  - Object
  - ClusterKit::Clustering::HDBSCAN
- Defined in:
  - lib/clusterkit/clustering/hdbscan.rb
  - lib/clusterkit/hdbscan_api_design.rb
Overview
HDBSCAN clustering algorithm - matching KMeans API pattern
Instance Attribute Summary collapse
-
#cluster_persistence ⇒ Object
readonly
Returns the value of attribute cluster_persistence.
-
#labels ⇒ Object
readonly
Returns the value of attribute labels.
-
#metric ⇒ Object
readonly
Returns the value of attribute metric.
-
#min_cluster_size ⇒ Object
readonly
Returns the value of attribute min_cluster_size.
-
#min_samples ⇒ Object
readonly
Returns the value of attribute min_samples.
-
#outlier_scores ⇒ Object
readonly
Returns the value of attribute outlier_scores.
-
#probabilities ⇒ Object
readonly
Returns the value of attribute probabilities.
Instance Method Summary collapse
-
#cluster_indices ⇒ Hash<Integer, Array<Integer>>
Get indices of points in each cluster.
-
#fit(data) ⇒ self
Fit the HDBSCAN model (matches KMeans.fit).
-
#fit_predict(data) ⇒ Array
Fit the model and return labels (matches KMeans.fit_predict).
-
#fitted? ⇒ Boolean
Check if model has been fitted (matches KMeans.fitted?).
-
#initialize(min_samples: 5, min_cluster_size: 5, metric: 'euclidean') ⇒ HDBSCAN
constructor
Initialize HDBSCAN clusterer (matches KMeans pattern).
-
#n_clusters ⇒ Integer
Get number of clusters found (similar to KMeans.k but discovered).
-
#n_noise_points ⇒ Integer
Get the number of noise points.
-
#noise_indices ⇒ Array<Integer>
Get indices of noise points.
-
#noise_ratio ⇒ Float
Get noise ratio (HDBSCAN-specific but follows naming pattern).
-
#predict(data) ⇒ Array
HDBSCAN doesn’t support predict for new points (unlike KMeans), but the method is kept for API consistency; it always raises NotImplementedError.
-
#summary ⇒ Hash
Get summary statistics.
Constructor Details
#initialize(min_samples: 5, min_cluster_size: 5, metric: 'euclidean') ⇒ HDBSCAN
Initialize HDBSCAN clusterer (matches KMeans pattern)
16 17 18 19 20 21 22 23 24 25 26 27 28 29 |
# File 'lib/clusterkit/clustering/hdbscan.rb', line 16
# Initialize HDBSCAN clusterer (matches KMeans pattern).
#
# @param min_samples [Numeric] must be greater than zero
# @param min_cluster_size [Numeric] must be greater than zero
# @param metric [String, Symbol] one of euclidean, l2, manhattan, l1, cosine;
#   a Symbol is converted to its String name before being stored
# @raise [ArgumentError] if either size is not a positive number, or the
#   metric is not in the supported list
def initialize(min_samples: 5, min_cluster_size: 5, metric: 'euclidean')
  # Check the type before comparing: `nil > 0` would otherwise surface as a
  # NoMethodError instead of the ArgumentError this constructor reports.
  unless min_samples.is_a?(Numeric) && min_samples > 0
    raise ArgumentError, "min_samples must be positive"
  end
  unless min_cluster_size.is_a?(Numeric) && min_cluster_size > 0
    raise ArgumentError, "min_cluster_size must be positive"
  end

  valid_metrics = %w[euclidean l2 manhattan l1 cosine]
  # Accept :cosine as well as 'cosine'; the stored value is always a String.
  metric_name = metric.is_a?(Symbol) ? metric.to_s : metric
  unless valid_metrics.include?(metric_name)
    raise ArgumentError, "metric must be one of: #{valid_metrics.join(', ')}"
  end

  @min_samples = min_samples
  @min_cluster_size = min_cluster_size
  @metric = metric_name
  @fitted = false
end
Instance Attribute Details
#cluster_persistence ⇒ Object (readonly)
Returns the value of attribute cluster_persistence.
9 10 11 |
# File 'lib/clusterkit/clustering/hdbscan.rb', line 9 def cluster_persistence @cluster_persistence end |
#labels ⇒ Object (readonly)
Returns the value of attribute labels.
9 10 11 |
# File 'lib/clusterkit/clustering/hdbscan.rb', line 9 def labels @labels end |
#metric ⇒ Object (readonly)
Returns the value of attribute metric.
9 10 11 |
# File 'lib/clusterkit/clustering/hdbscan.rb', line 9 def metric @metric end |
#min_cluster_size ⇒ Object (readonly)
Returns the value of attribute min_cluster_size.
9 10 11 |
# File 'lib/clusterkit/clustering/hdbscan.rb', line 9 def min_cluster_size @min_cluster_size end |
#min_samples ⇒ Object (readonly)
Returns the value of attribute min_samples.
9 10 11 |
# File 'lib/clusterkit/clustering/hdbscan.rb', line 9 def min_samples @min_samples end |
#outlier_scores ⇒ Object (readonly)
Returns the value of attribute outlier_scores.
9 10 11 |
# File 'lib/clusterkit/clustering/hdbscan.rb', line 9 def outlier_scores @outlier_scores end |
#probabilities ⇒ Object (readonly)
Returns the value of attribute probabilities.
9 10 11 |
# File 'lib/clusterkit/clustering/hdbscan.rb', line 9 def probabilities @probabilities end |
Instance Method Details
#cluster_indices ⇒ Hash<Integer, Array<Integer>>
Get indices of points in each cluster
104 105 106 107 108 109 110 111 112 113 114 |
# File 'lib/clusterkit/clustering/hdbscan.rb', line 104
# Group point positions by the cluster label assigned to them.
#
# Points labelled -1 (noise) are left out of the result.
#
# @return [Hash<Integer, Array<Integer>>] label => positions within @labels;
#   an empty Hash when the model has not been fitted
def cluster_indices
  return {} unless fitted?

  @labels.each_with_index.with_object({}) do |(label, position), members|
    next if label == -1 # noise points belong to no cluster

    (members[label] ||= []) << position
  end
end
#fit(data) ⇒ self
Fit the HDBSCAN model (matches KMeans.fit)
34 35 36 37 38 39 40 41 42 43 44 45 46 47 |
# File 'lib/clusterkit/clustering/hdbscan.rb', line 34
# Fit the HDBSCAN model (matches KMeans.fit).
#
# Validates the input, runs the Rust implementation with the configured
# min_samples, min_cluster_size and metric, stores the four result entries
# on the instance, and marks the model as fitted.
#
# @param data [Object] input accepted by validate_data and the Rust binding
#   (expected shape is not visible here — TODO confirm)
# @return [self]
def fit(data)
  validate_data(data)

  # Delegates to the Rust implementation (hdbscan crate).
  outcome = Clustering.hdbscan_rust(data, @min_samples, @min_cluster_size, @metric)

  @labels, @probabilities = outcome["labels"], outcome["probabilities"]
  @outlier_scores, @cluster_persistence = outcome["outlier_scores"], outcome["cluster_persistence"]
  @fitted = true

  self
end
#fit_predict(data) ⇒ Array
Fit the model and return labels (matches KMeans.fit_predict)
61 62 63 64 |
# File 'lib/clusterkit/clustering/hdbscan.rb', line 61
# Fit the model and return labels (matches KMeans.fit_predict).
#
# @param data [Object] forwarded unchanged to #fit
# @return [Array] the labels stored by #fit
def fit_predict(data)
  fit(data)
  labels
end
#fitted? ⇒ Boolean
Check if model has been fitted (matches KMeans.fitted?)
68 69 70 |
# File 'lib/clusterkit/clustering/hdbscan.rb', line 68 def fitted? @fitted end |
#n_clusters ⇒ Integer
Get number of clusters found (similar to KMeans.k but discovered)
74 75 76 77 78 79 |
# File 'lib/clusterkit/clustering/hdbscan.rb', line 74
# Get number of clusters found (similar to KMeans.k but discovered).
#
# @return [Integer] number of distinct labels other than -1 (noise);
#   0 when the model has not been fitted
def n_clusters
  return 0 unless fitted?

  # Distinct labels, with the noise marker left out of the tally.
  @labels.uniq.count { |label| label != -1 }
end
#n_noise_points ⇒ Integer
Get the number of noise points
90 91 92 93 |
# File 'lib/clusterkit/clustering/hdbscan.rb', line 90
# Get the number of noise points.
#
# @return [Integer] how many labels equal -1; 0 when the model has not been
#   fitted
def n_noise_points
  return 0 unless fitted?

  @labels.count { |label| label == -1 }
end
#noise_indices ⇒ Array<Integer>
Get indices of noise points
97 98 99 100 |
# File 'lib/clusterkit/clustering/hdbscan.rb', line 97
# Get indices of noise points.
#
# @return [Array<Integer>] positions within @labels whose label is -1, in
#   ascending order; an empty Array when the model has not been fitted
def noise_indices
  return [] unless fitted?

  positions = []
  @labels.each_with_index do |label, position|
    positions << position if label == -1
  end
  positions
end
#noise_ratio ⇒ Float
Get noise ratio (HDBSCAN-specific but follows naming pattern)
83 84 85 86 |
# File 'lib/clusterkit/clustering/hdbscan.rb', line 83
# Get noise ratio (HDBSCAN-specific but follows naming pattern).
#
# @return [Float] fraction of labels equal to -1; 0.0 when the model has not
#   been fitted or when there are no labels at all
def noise_ratio
  return 0.0 unless fitted?
  # With no labels the division below would be 0.0 / 0, which is NaN.
  return 0.0 if @labels.empty?

  @labels.count(-1).to_f / @labels.length
end
#predict(data) ⇒ Array
HDBSCAN doesn’t support predict for new points (unlike KMeans), but the method is kept for API consistency; it always raises NotImplementedError.
53 54 55 56 |
# File 'lib/clusterkit/clustering/hdbscan.rb', line 53
# HDBSCAN doesn't support predict for new points (unlike KMeans), but the
# method is kept for API consistency. It never returns.
#
# The message names only methods this class provides; the earlier text
# pointed callers to approximate_predict, which is not part of this class's
# documented API.
#
# @param data [Object] ignored
# @raise [NotImplementedError] always
def predict(data)
  raise NotImplementedError, "HDBSCAN does not support prediction on new data. " \
                             "Call fit or fit_predict on the full dataset instead"
end
#summary ⇒ Hash
Get summary statistics
118 119 120 121 122 123 124 125 126 127 128 |
# File 'lib/clusterkit/clustering/hdbscan.rb', line 118
# Get summary statistics.
#
# @return [Hash] with keys :n_clusters, :n_noise_points, :noise_ratio,
#   :cluster_sizes (label => member count) and :cluster_persistence;
#   an empty Hash when the model has not been fitted
def summary
  return {} unless fitted?

  stats = {}
  stats[:n_clusters] = n_clusters
  stats[:n_noise_points] = n_noise_points
  stats[:noise_ratio] = noise_ratio
  stats[:cluster_sizes] = cluster_indices.transform_values { |members| members.length }
  stats[:cluster_persistence] = @cluster_persistence
  stats
end