Class: ClusterKit::Clustering::HDBSCAN

Inherits:
Object
  • Object
show all
Defined in:
lib/clusterkit/clustering/hdbscan.rb,
lib/clusterkit/hdbscan_api_design.rb

Overview

HDBSCAN clustering algorithm - matching KMeans API pattern

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(min_samples: 5, min_cluster_size: 5, metric: 'euclidean') ⇒ HDBSCAN

Initialize HDBSCAN clusterer (matches KMeans pattern)

Parameters:

  • min_samples (Integer) (defaults to: 5)

    Minimum neighborhood size for a point to count as a core point (default: 5)

  • min_cluster_size (Integer) (defaults to: 5)

    Minimum size of clusters (default: 5)

  • metric (String) (defaults to: 'euclidean')

    Distance metric; one of 'euclidean', 'l2', 'manhattan', 'l1', 'cosine' (default: 'euclidean')

Raises:

  • (ArgumentError) — if min_samples or min_cluster_size is not positive, or metric is not one of the supported values


16
17
18
19
20
21
22
23
24
25
26
27
28
29
# File 'lib/clusterkit/clustering/hdbscan.rb', line 16

# Build an HDBSCAN clusterer. No clustering happens until #fit is called.
#
# @param min_samples [Integer] neighborhood size used to decide core points
# @param min_cluster_size [Integer] smallest group that counts as a cluster
# @param metric [String] distance metric: 'euclidean', 'l2', 'manhattan',
#   'l1' or 'cosine'
# @raise [ArgumentError] if either size is not a positive number, or the
#   metric is not in the supported list
def initialize(min_samples: 5, min_cluster_size: 5, metric: 'euclidean')
  # Check the type before comparing, so nil or a String is reported as the
  # documented ArgumentError rather than a NoMethodError or comparison error.
  { min_samples: min_samples, min_cluster_size: min_cluster_size }.each do |name, value|
    next if value.is_a?(Numeric) && value.real? && value > 0

    raise ArgumentError, "#{name} must be positive"
  end

  valid_metrics = %w[euclidean l2 manhattan l1 cosine]
  unless valid_metrics.include?(metric)
    raise ArgumentError, "metric must be one of: #{valid_metrics.join(', ')}"
  end

  @min_samples = min_samples
  @min_cluster_size = min_cluster_size
  @metric = metric
  @fitted = false
end

Instance Attribute Details

#cluster_persistence ⇒ Object (readonly)

Returns the value of attribute cluster_persistence.



9
10
11
# File 'lib/clusterkit/clustering/hdbscan.rb', line 9

# Cluster persistence data stored by the most recent #fit.
#
# @return [Object, nil] whatever the clustering backend returned under the
#   "cluster_persistence" key; nil until #fit has been called
def cluster_persistence
  @cluster_persistence
end

#labels ⇒ Object (readonly)

Returns the value of attribute labels.



9
10
11
# File 'lib/clusterkit/clustering/hdbscan.rb', line 9

# Cluster labels stored by the most recent #fit.
#
# @return [Array, nil] one label per fitted point, with -1 marking noise;
#   nil until #fit has been called
def labels
  @labels
end

#metric ⇒ Object (readonly)

Returns the value of attribute metric.



9
10
11
# File 'lib/clusterkit/clustering/hdbscan.rb', line 9

# Distance metric chosen at construction time.
#
# @return [String] the metric name that passed validation in the constructor
def metric
  @metric
end

#min_cluster_size ⇒ Object (readonly)

Returns the value of attribute min_cluster_size.



9
10
11
# File 'lib/clusterkit/clustering/hdbscan.rb', line 9

# Minimum cluster size chosen at construction time.
#
# @return [Integer] the value given to the constructor (5 if omitted)
def min_cluster_size
  @min_cluster_size
end

#min_samples ⇒ Object (readonly)

Returns the value of attribute min_samples.



9
10
11
# File 'lib/clusterkit/clustering/hdbscan.rb', line 9

# Core-point neighborhood size chosen at construction time.
#
# @return [Integer] the value given to the constructor (5 if omitted)
def min_samples
  @min_samples
end

#outlier_scores ⇒ Object (readonly)

Returns the value of attribute outlier_scores.



9
10
11
# File 'lib/clusterkit/clustering/hdbscan.rb', line 9

# Outlier scores stored by the most recent #fit.
#
# @return [Object, nil] whatever the clustering backend returned under the
#   "outlier_scores" key (presumably one score per point — TODO confirm);
#   nil until #fit has been called
def outlier_scores
  @outlier_scores
end

#probabilities ⇒ Object (readonly)

Returns the value of attribute probabilities.



9
10
11
# File 'lib/clusterkit/clustering/hdbscan.rb', line 9

# Membership probabilities stored by the most recent #fit.
#
# @return [Object, nil] whatever the clustering backend returned under the
#   "probabilities" key (presumably one value per point — TODO confirm);
#   nil until #fit has been called
def probabilities
  @probabilities
end

Instance Method Details

#cluster_indices ⇒ Hash<Integer, Array<Integer>>

Get indices of points in each cluster

Returns:

  • (Hash<Integer, Array<Integer>>)

    Mapping of cluster label to point indices



104
105
106
107
108
109
110
111
112
113
114
# File 'lib/clusterkit/clustering/hdbscan.rb', line 104

# Group the positions of clustered points by their label.
#
# Noise points (label -1) are left out. Clusters appear in the order in
# which their label is first seen in the fitted labels.
#
# @return [Hash<Integer, Array<Integer>>] label => point indices; an empty
#   hash when the model has not been fitted
def cluster_indices
  return {} unless fitted?

  @labels.each_with_index.each_with_object({}) do |(label, position), groups|
    next if label == -1 # noise belongs to no cluster

    (groups[label] ||= []) << position
  end
end

#fit(data) ⇒ self

Fit the HDBSCAN model (matches KMeans.fit)

Parameters:

  • data (Array)

    2D array of data points

Returns:

  • (self)

    Returns self for method chaining



34
35
36
37
38
39
40
41
42
43
44
45
46
47
# File 'lib/clusterkit/clustering/hdbscan.rb', line 34

# Cluster +data+ and keep the results on this instance.
#
# The data is validated first. The clustering itself is delegated to the
# Rust-backed Clustering.hdbscan_rust, and its labels, probabilities,
# outlier scores and cluster persistence are stored for the readers.
#
# @param data [Array] 2D array of data points
# @return [self] so calls can be chained
def fit(data)
  validate_data(data)

  output = Clustering.hdbscan_rust(data, @min_samples, @min_cluster_size, @metric)

  # Each result field is stored in the instance variable of the same name.
  %w[labels probabilities outlier_scores cluster_persistence].each do |field|
    instance_variable_set("@#{field}", output[field])
  end
  @fitted = true

  self
end

#fit_predict(data) ⇒ Array

Fit the model and return labels (matches KMeans.fit_predict)

Parameters:

  • data (Array)

    2D array of data points

Returns:

  • (Array)

    Cluster labels (-1 for noise)



61
62
63
64
# File 'lib/clusterkit/clustering/hdbscan.rb', line 61

# Fit the model on +data+ and hand back the resulting labels in one call.
#
# @param data [Array] 2D array of data points
# @return [Array] the labels stored by #fit (-1 marks noise)
def fit_predict(data)
  fit(data).then { @labels }
end

#fitted? ⇒ Boolean

Check if model has been fitted (matches KMeans.fitted?)

Returns:

  • (Boolean)

    True if fitted



68
69
70
# File 'lib/clusterkit/clustering/hdbscan.rb', line 68

# Reports whether #fit has completed on this instance.
#
# @return [Boolean] false after construction, true once #fit has stored
#   its results
def fitted?
  @fitted
end

#n_clusters ⇒ Integer

Get number of clusters found (similar to KMeans.k but discovered)

Returns:

  • (Integer)

    Number of clusters (excluding noise)



74
75
76
77
78
79
# File 'lib/clusterkit/clustering/hdbscan.rb', line 74

# Number of clusters discovered by the last fit.
#
# @return [Integer] count of distinct labels other than -1 (noise);
#   0 when the model has not been fitted
def n_clusters
  return 0 unless fitted?

  # Every distinct label except the noise marker is one cluster.
  @labels.uniq.count { |label| label != -1 }
end

#n_noise_points ⇒ Integer

Get the number of noise points

Returns:

  • (Integer)

    Number of points labeled as noise



90
91
92
93
# File 'lib/clusterkit/clustering/hdbscan.rb', line 90

# Number of points the last fit marked as noise.
#
# @return [Integer] how many labels equal -1; 0 when the model has not
#   been fitted
def n_noise_points
  fitted? ? @labels.count { |label| label == -1 } : 0
end

#noise_indices ⇒ Array<Integer>

Get indices of noise points

Returns:

  • (Array<Integer>)

    Indices of points labeled as noise



97
98
99
100
# File 'lib/clusterkit/clustering/hdbscan.rb', line 97

# Positions of the points the last fit marked as noise.
#
# @return [Array<Integer>] ascending indices whose label is -1; an empty
#   array when the model has not been fitted
def noise_indices
  return [] unless fitted?

  @labels.each_index.select { |position| @labels[position] == -1 }
end

#noise_ratio ⇒ Float

Get noise ratio (HDBSCAN-specific but follows naming pattern)

Returns:

  • (Float)

    Fraction of points labeled as noise



83
84
85
86
# File 'lib/clusterkit/clustering/hdbscan.rb', line 83

# Fraction of fitted points that were labeled as noise (-1).
#
# @return [Float] value between 0.0 and 1.0; 0.0 when the model has not
#   been fitted or holds no labels
def noise_ratio
  return 0.0 unless fitted?

  total = @labels.length
  # Without this guard an empty label set would compute 0.0 / 0, i.e. NaN.
  return 0.0 if total.zero?

  @labels.count(-1).to_f / total
end

#predict(data) ⇒ Array

HDBSCAN doesn't support predict for new points (unlike KMeans), but the method is kept for API consistency and always raises.

Parameters:

  • data (Array)

    2D array of data points

Returns:

  • (Array)

    Never returns normally; this method always raises NotImplementedError

Raises:

  • (NotImplementedError)


53
54
55
56
# File 'lib/clusterkit/clustering/hdbscan.rb', line 53

# Placeholder kept so the class exposes the same method set as KMeans.
# It does not look at +data+ and always raises.
#
# NotImplementedError descends from ScriptError, not StandardError, so a
# bare `rescue` will not catch it; callers must rescue it by name.
#
# @param data [Array] 2D array of data points (ignored)
# @raise [NotImplementedError] always
def predict(data)
  explanation = "HDBSCAN does not support prediction on new data. " \
                "Use approximate_predict for approximate membership"
  raise NotImplementedError, explanation
end

#summary ⇒ Hash

Get summary statistics

Returns:

  • (Hash)

    Summary of clustering results



118
119
120
121
122
123
124
125
126
127
128
# File 'lib/clusterkit/clustering/hdbscan.rb', line 118

# Collect the headline numbers of the last fit into one hash.
#
# @return [Hash] with keys :n_clusters, :n_noise_points, :noise_ratio,
#   :cluster_sizes (label => member count) and :cluster_persistence;
#   an empty hash when the model has not been fitted
def summary
  if fitted?
    {
      n_clusters: n_clusters,
      n_noise_points: n_noise_points,
      noise_ratio: noise_ratio,
      cluster_sizes: cluster_indices.transform_values { |members| members.length },
      cluster_persistence: @cluster_persistence
    }
  else
    {}
  end
end