Class: ClusterKit::Clustering::HDBSCAN

Inherits:

Object

Object
ClusterKit::Clustering::HDBSCAN

show all

Defined in: lib/clusterkit/clustering/hdbscan.rb,
lib/clusterkit/hdbscan_api_design.rb

Overview

HDBSCAN clustering algorithm - matching KMeans API pattern

Instance Attribute Summary collapse

#cluster_persistence ⇒ Object readonly

Returns the value of attribute cluster_persistence.
#labels ⇒ Object readonly

Returns the value of attribute labels.
#metric ⇒ Object readonly

Returns the value of attribute metric.
#min_cluster_size ⇒ Object readonly

Returns the value of attribute min_cluster_size.
#min_samples ⇒ Object readonly

Returns the value of attribute min_samples.
#outlier_scores ⇒ Object readonly

Returns the value of attribute outlier_scores.
#probabilities ⇒ Object readonly

Returns the value of attribute probabilities.

Instance Method Summary collapse

#cluster_indices ⇒ Hash<Integer, Array<Integer>>

Get indices of points in each cluster.
#fit(data) ⇒ self

Fit the HDBSCAN model (matches KMeans.fit).
#fit_predict(data) ⇒ Array

Fit the model and return labels (matches KMeans.fit_predict).
#fitted? ⇒ Boolean

Check if model has been fitted (matches KMeans.fitted?).
#initialize(min_samples: 5, min_cluster_size: 5, metric: 'euclidean') ⇒ HDBSCAN constructor

Initialize HDBSCAN clusterer (matches KMeans pattern).
#n_clusters ⇒ Integer

Get number of clusters found (similar to KMeans.k but discovered).
#n_noise_points ⇒ Integer

Get the number of noise points.
#noise_indices ⇒ Array<Integer>

Get indices of noise points.
#noise_ratio ⇒ Float

Get noise ratio (HDBSCAN-specific but follows naming pattern).
#predict(data) ⇒ Array

HDBSCAN doesn’t support predict for new points (unlike KMeans), but we keep the method for API consistency.
#summary ⇒ Hash

Get summary statistics.

Constructor Details

#initialize(min_samples: 5, min_cluster_size: 5, metric: 'euclidean') ⇒ `HDBSCAN`

Initialize HDBSCAN clusterer (matches KMeans pattern)

Parameters:

min_samples (Integer) (defaults to: 5) —

Min neighborhood size for core points (default: 5)
min_cluster_size (Integer) (defaults to: 5) —

Minimum size of clusters (default: 5)
metric (String) (defaults to: 'euclidean') —

Distance metric (default: ‘euclidean’)

Raises:

(ArgumentError)

# File 'lib/clusterkit/clustering/hdbscan.rb', line 16

# Configure an HDBSCAN clusterer. Nothing is clustered here; the model is
# marked as unfitted until #fit is called.
#
# @param min_samples [Integer] neighborhood size for core points; must be > 0
# @param min_cluster_size [Integer] minimum size of a cluster; must be > 0
# @param metric [String] one of 'euclidean', 'l2', 'manhattan', 'l1', 'cosine'
# @raise [ArgumentError] if either size is not positive or the metric is not
#   in the supported list
def initialize(min_samples: 5, min_cluster_size: 5, metric: 'euclidean')
  unless min_samples > 0
    raise ArgumentError, "min_samples must be positive"
  end
  unless min_cluster_size > 0
    raise ArgumentError, "min_cluster_size must be positive"
  end

  supported = %w[euclidean l2 manhattan l1 cosine]
  raise ArgumentError, "metric must be one of: #{supported.join(', ')}" unless supported.include?(metric)

  @metric = metric
  @min_cluster_size = min_cluster_size
  @min_samples = min_samples
  # A fresh instance has no results yet; #fit flips this flag.
  @fitted = false
end

Instance Attribute Details

#cluster_persistence ⇒ `Object` (readonly)

Returns the value of attribute cluster_persistence.



9
10
11

# File 'lib/clusterkit/clustering/hdbscan.rb', line 9

# Reader for @cluster_persistence, which #fit fills from the
# "cluster_persistence" entry of the native clustering result.
# Returns nil until #fit has run (the constructor does not set it).
# NOTE(review): element type/shape is not visible here — confirm against the Rust binding.
def cluster_persistence
  @cluster_persistence
end

#labels ⇒ `Object` (readonly)

Returns the value of attribute labels.



9
10
11

# File 'lib/clusterkit/clustering/hdbscan.rb', line 9

# Reader for @labels, the per-point cluster labels stored by #fit.
# A label of -1 marks a noise point (see #cluster_indices / #noise_indices).
# Returns nil until #fit has run (the constructor does not set it).
def labels
  @labels
end

#metric ⇒ `Object` (readonly)

Returns the value of attribute metric.



9
10
11

# File 'lib/clusterkit/clustering/hdbscan.rb', line 9

# Reader for the distance metric name validated and stored by the
# constructor ('euclidean', 'l2', 'manhattan', 'l1' or 'cosine').
def metric
  @metric
end

#min_cluster_size ⇒ `Object` (readonly)

Returns the value of attribute min_cluster_size.



9
10
11

# File 'lib/clusterkit/clustering/hdbscan.rb', line 9

# Reader for the minimum cluster size given to the constructor
# (validated there to be greater than zero).
def min_cluster_size
  @min_cluster_size
end

#min_samples ⇒ `Object` (readonly)

Returns the value of attribute min_samples.



9
10
11

# File 'lib/clusterkit/clustering/hdbscan.rb', line 9

# Reader for the min_samples setting given to the constructor
# (validated there to be greater than zero).
def min_samples
  @min_samples
end

#outlier_scores ⇒ `Object` (readonly)

Returns the value of attribute outlier_scores.



9
10
11

# File 'lib/clusterkit/clustering/hdbscan.rb', line 9

# Reader for @outlier_scores, which #fit fills from the "outlier_scores"
# entry of the native clustering result.
# Returns nil until #fit has run (the constructor does not set it).
# NOTE(review): presumably one score per input point — confirm against the Rust binding.
def outlier_scores
  @outlier_scores
end

#probabilities ⇒ `Object` (readonly)

Returns the value of attribute probabilities.



9
10
11

# File 'lib/clusterkit/clustering/hdbscan.rb', line 9

# Reader for @probabilities, which #fit fills from the "probabilities"
# entry of the native clustering result.
# Returns nil until #fit has run (the constructor does not set it).
# NOTE(review): presumably one membership probability per input point — confirm against the Rust binding.
def probabilities
  @probabilities
end

Instance Method Details

#cluster_indices ⇒ `Hash<Integer, Array<Integer>>`

Get indices of points in each cluster

Returns:

(Hash<Integer, Array<Integer>>) —

Mapping of cluster label to point indices

# File 'lib/clusterkit/clustering/hdbscan.rb', line 104

# Group point indices by the cluster label stored in @labels.
#
# Noise points (label -1) are left out. Keys appear in the order each label
# is first encountered; the indices inside each cluster are ascending.
#
# @return [Hash<Integer, Array<Integer>>] cluster label => point indices,
#   or an empty Hash when the model has not been fitted
def cluster_indices
  return {} unless fitted?

  @labels.each_with_index.each_with_object({}) do |(cluster, position), members|
    # -1 is the noise marker, not a cluster id.
    (members[cluster] ||= []) << position unless cluster == -1
  end
end

#fit(data) ⇒ `self`

Fit the HDBSCAN model (matches KMeans.fit)

Parameters:

data (Array) —

2D array of data points

Returns:

(self) —

Returns self for method chaining

# File 'lib/clusterkit/clustering/hdbscan.rb', line 34

# Run HDBSCAN on +data+ and store the results on this instance.
#
# The input is first passed to validate_data, then handed to the native
# (Rust) implementation together with the configured min_samples,
# min_cluster_size and metric. The "labels", "probabilities",
# "outlier_scores" and "cluster_persistence" entries of the result are kept
# in the matching instance variables and the model is marked as fitted.
#
# @param data [Array] 2D array of data points
# @return [self] the fitted model, so calls can be chained
def fit(data)
  validate_data(data)

  outcome = Clustering.hdbscan_rust(data, @min_samples, @min_cluster_size, @metric)

  # Pull the four result fields out in one step, in the same order they are stored.
  @labels, @probabilities, @outlier_scores, @cluster_persistence =
    %w[labels probabilities outlier_scores cluster_persistence].map { |key| outcome[key] }
  @fitted = true

  self
end

#fit_predict(data) ⇒ `Array`

Fit the model and return labels (matches KMeans.fit_predict)

Parameters:

data (Array) —

2D array of data points

Returns:

(Array) —

Cluster labels (-1 for noise)

# File 'lib/clusterkit/clustering/hdbscan.rb', line 61

# Fit the model on +data+ and return the resulting labels in one call.
#
# @param data [Array] 2D array of data points (passed straight to #fit)
# @return [Array] the labels stored by #fit; -1 marks noise
def fit_predict(data)
  fit(data)
  # #fit returns self, so the labels are read from the instance afterwards.
  @labels
end

#fitted? ⇒ `Boolean`

Check if model has been fitted (matches KMeans.fitted?)

Returns:

(Boolean) —

True if fitted



68
69
70

# File 'lib/clusterkit/clustering/hdbscan.rb', line 68

# Whether #fit has completed on this instance.
# The flag is set to false by the constructor and to true at the end of #fit.
#
# @return [Boolean] true once the model has been fitted
def fitted?
  @fitted
end

#n_clusters ⇒ `Integer`

Get number of clusters found (similar to KMeans.k but discovered)

Returns:

(Integer) —

Number of clusters (excluding noise)

# File 'lib/clusterkit/clustering/hdbscan.rb', line 74

# Number of distinct clusters discovered by the last #fit.
#
# @return [Integer] count of distinct labels other than -1 (noise);
#   0 when the model has not been fitted
def n_clusters
  return 0 unless fitted?

  # -1 is the noise marker, so it must not be tallied as a cluster.
  @labels.uniq.count { |label| label != -1 }
end

#n_noise_points ⇒ `Integer`

Get the number of noise points

Returns:

(Integer) —

Number of points labeled as noise

# File 'lib/clusterkit/clustering/hdbscan.rb', line 90

# How many points the last #fit labeled as noise (label -1).
#
# @return [Integer] number of -1 entries in the labels; 0 when not fitted
def n_noise_points
  fitted? ? @labels.count(-1) : 0
end

#noise_indices ⇒ `Array<Integer>`

Get indices of noise points

Returns:

(Array<Integer>) —

Indices of points labeled as noise

# File 'lib/clusterkit/clustering/hdbscan.rb', line 97

# Positions of the points that the last #fit labeled as noise (label -1).
#
# @return [Array<Integer>] ascending indices into the fitted data;
#   an empty Array when the model has not been fitted
def noise_indices
  return [] unless fitted?

  @labels.each_index.select { |position| @labels[position] == -1 }
end

#noise_ratio ⇒ `Float`

Get noise ratio (HDBSCAN-specific but follows naming pattern)

Returns:

(Float) —

Fraction of points labeled as noise

# File 'lib/clusterkit/clustering/hdbscan.rb', line 83

# Fraction of fitted points that were labeled as noise (label -1).
#
# @return [Float] value in 0.0..1.0; 0.0 when the model has not been fitted
#   or when there are no labels at all
def noise_ratio
  return 0.0 unless fitted?
  # With no labels the division below would be 0.0 / 0, i.e. NaN, which then
  # leaks into #summary. Report "no noise" instead.
  return 0.0 if @labels.empty?

  @labels.count(-1).fdiv(@labels.length)
end

#predict(data) ⇒ `Array`

HDBSCAN doesn’t support predict for new points (unlike KMeans), but we keep the method for API consistency.

Parameters:

data (Array) —

2D array of data points

Returns:

(Array) —

Never returns normally; this method always raises NotImplementedError

Raises:

(NotImplementedError)

# File 'lib/clusterkit/clustering/hdbscan.rb', line 53

# Present only for API parity with KMeans: HDBSCAN cannot assign new points
# to the fitted clusters, so this method always raises.
#
# NOTE(review): the message points callers at approximate_predict, which is
# not among the methods shown for this class — confirm it exists.
#
# @param data [Array] 2D array of data points (ignored)
# @raise [NotImplementedError] always
def predict(data)
  message = "HDBSCAN does not support prediction on new data. " +
            "Use approximate_predict for approximate membership"
  raise NotImplementedError, message
end

#summary ⇒ `Hash`

Get summary statistics

Returns:

(Hash) —

Summary of clustering results

# File 'lib/clusterkit/clustering/hdbscan.rb', line 118

# Collect the headline numbers of the last #fit into one Hash.
#
# @return [Hash] with keys :n_clusters, :n_noise_points, :noise_ratio,
#   :cluster_sizes (cluster label => number of points) and
#   :cluster_persistence; an empty Hash when the model has not been fitted
def summary
  stats = {}
  return stats unless fitted?

  stats[:n_clusters] = n_clusters
  stats[:n_noise_points] = n_noise_points
  stats[:noise_ratio] = noise_ratio
  # Only the size of each cluster is reported, not the member indices.
  stats[:cluster_sizes] = cluster_indices.transform_values { |members| members.length }
  stats[:cluster_persistence] = @cluster_persistence
  stats
end

Class: ClusterKit::Clustering::HDBSCAN

Overview

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(min_samples: 5, min_cluster_size: 5, metric: 'euclidean') ⇒ HDBSCAN

Instance Attribute Details

#cluster_persistence ⇒ Object (readonly)

#labels ⇒ Object (readonly)

#metric ⇒ Object (readonly)

#min_cluster_size ⇒ Object (readonly)

#min_samples ⇒ Object (readonly)

#outlier_scores ⇒ Object (readonly)

#probabilities ⇒ Object (readonly)

Instance Method Details

#cluster_indices ⇒ Hash<Integer, Array<Integer>>

#fit(data) ⇒ self

#fit_predict(data) ⇒ Array

#fitted? ⇒ Boolean

#n_clusters ⇒ Integer

#n_noise_points ⇒ Integer

#noise_indices ⇒ Array<Integer>

#noise_ratio ⇒ Float

#predict(data) ⇒ Array

#summary ⇒ Hash

#initialize(min_samples: 5, min_cluster_size: 5, metric: 'euclidean') ⇒ `HDBSCAN`

#cluster_persistence ⇒ `Object` (readonly)

#labels ⇒ `Object` (readonly)

#metric ⇒ `Object` (readonly)

#min_cluster_size ⇒ `Object` (readonly)

#min_samples ⇒ `Object` (readonly)

#outlier_scores ⇒ `Object` (readonly)

#probabilities ⇒ `Object` (readonly)

#cluster_indices ⇒ `Hash<Integer, Array<Integer>>`

#fit(data) ⇒ `self`

#fit_predict(data) ⇒ `Array`

#fitted? ⇒ `Boolean`

#n_clusters ⇒ `Integer`

#n_noise_points ⇒ `Integer`

#noise_indices ⇒ `Array<Integer>`

#noise_ratio ⇒ `Float`

#predict(data) ⇒ `Array`

#summary ⇒ `Hash`