Module: ClusterKit::Clustering
Defined in:
- lib/clusterkit/clustering.rb
- lib/clusterkit/clustering/hdbscan.rb
- lib/clusterkit/hdbscan_api_design.rb
Overview
Module for clustering algorithms
Defined Under Namespace
Class Method Summary
-
.hdbscan(data, min_samples: 5, min_cluster_size: 5, metric: 'euclidean') ⇒ Hash
Perform HDBSCAN clustering (matches Clustering.kmeans signature).
-
.silhouette_score(data, labels) ⇒ Float
Calculate silhouette score for any clustering result.
Class Method Details
.hdbscan(data, min_samples: 5, min_cluster_size: 5, metric: 'euclidean') ⇒ Hash
Perform HDBSCAN clustering (matches Clustering.kmeans signature)
146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 |
# File 'lib/clusterkit/clustering/hdbscan.rb', line 146
# Fit an HDBSCAN model to +data+ and return what it learned as a plain Hash.
#
# @param data the samples; handed unchanged to HDBSCAN#fit
# @param min_samples [Integer] forwarded to HDBSCAN.new
# @param min_cluster_size [Integer] forwarded to HDBSCAN.new
# @param metric [String] forwarded to HDBSCAN.new
# @return [Hash] the fitted model's :labels, :probabilities, :outlier_scores,
#   :n_clusters, :noise_ratio and :cluster_persistence (an empty Hash when
#   the model reports a falsy persistence value)
def hdbscan(data, min_samples: 5, min_cluster_size: 5, metric: 'euclidean')
  options = { min_samples: min_samples, min_cluster_size: min_cluster_size, metric: metric }
  model = HDBSCAN.new(**options)
  model.fit(data)

  # Read the result accessors in a fixed order so the Hash keys keep that order.
  readers = %i[labels probabilities outlier_scores n_clusters noise_ratio]
  summary = readers.each_with_object({}) { |reader, acc| acc[reader] = model.public_send(reader) }
  summary[:cluster_persistence] = model.cluster_persistence || {}
  summary
end
.silhouette_score(data, labels) ⇒ Float
Calculate silhouette score for any clustering result
143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 |
# File 'lib/clusterkit/clustering.rb', line 143
# Calculate the mean silhouette value of a clustering result.
#
# For every point, +a+ is its mean distance to the other members of its own
# cluster and +b+ is the smallest mean distance to the members of any other
# cluster; the point's silhouette is (b - a) / max(a, b). A point that is
# alone in its cluster contributes 0.0. Distances come from
# +euclidean_distance+.
#
# @param data [Array] the points, aligned by index with +labels+
# @param labels [Array] the cluster label assigned to each point
# @return [Float] the mean silhouette over all points; 0.0 when there are
#   fewer than two distinct labels or no points at all
def silhouette_score(data, labels)
  # Index the points by label once instead of rescanning +labels+ for every
  # (point, cluster) pair. Indices stay in ascending order within each group.
  members_by_label = labels.each_index.group_by { |j| labels[j] }

  # The score needs at least two clusters. This also covers empty input,
  # which would otherwise end in 0 / 0.0 (NaN).
  return 0.0 if members_by_label.size < 2

  mean_distance_to = lambda do |point, indices|
    indices.sum { |j| euclidean_distance(point, data[j]) } / indices.size.to_f
  end

  silhouettes = data.each_with_index.map do |point, i|
    own_label = labels[i]
    peers = members_by_label.fetch(own_label, []).reject { |j| j == i }
    next 0.0 if peers.empty? # singleton cluster

    a = mean_distance_to.call(point, peers)
    b = members_by_label
        .reject { |label, _| label == own_label }
        .map { |_, indices| mean_distance_to.call(point, indices) }
        .min

    # Both means are zero only when every compared point coincides with this
    # one; report 0.0 rather than dividing zero by zero.
    a.zero? && b.zero? ? 0.0 : (b - a) / [a, b].max
  end

  return 0.0 if silhouettes.empty?

  silhouettes.sum / silhouettes.size.to_f
end