scikit-learn-contrib / hdbscan

A high performance implementation of HDBSCAN clustering.
http://hdbscan.readthedocs.io/en/latest/
BSD 3-Clause "New" or "Revised" License
2.81k stars 507 forks source link

Getting Error while using HDBSCAN #609

Open shyamdthakkar opened 1 year ago

shyamdthakkar commented 1 year ago

import hdbscan
import numpy as np
import pandas as pd
from sklearn.metrics import adjusted_rand_score
import matplotlib.pyplot as plt

data = pd.read_csv('DBSCAN Data.csv')
X = data[['Feature 1', 'Feature 2']].values
true_labels = data['Cluster'].values

clusterer = hdbscan.HDBSCAN(min_cluster_size=5, gen_min_span_tree=True)
hdbscan_labels = clusterer.fit_predict(X)

Error message:

TypeError                                 Traceback (most recent call last)
Cell In[10], line 2
      1 clusterer = hdbscan.HDBSCAN(min_cluster_size=5, gen_min_span_tree=True)
----> 2 hdbscan_labels = clusterer.fit_predict(X)

File ~\anaconda3\Lib\site-packages\hdbscan\hdbscan_.py:1243, in HDBSCAN.fit_predict(self, X, y)
   1228 def fit_predict(self, X, y=None):
   1229     """Performs clustering on X and returns cluster labels.
   1230
   1231     Parameters
   (...)
   1241         cluster labels
   1242     """
-> 1243     self.fit(X)
   1244     return self.labels_

File ~\anaconda3\Lib\site-packages\hdbscan\hdbscan_.py:1205, in HDBSCAN.fit(self, X, y)
   1195     kwargs.pop("prediction_data", None)
   1196     kwargs.update(self._metric_kwargs)
   1198 (
   1199     self.labels_,
   1200     self.probabilities_,
   1201     self.cluster_persistence_,
   1202     self._condensed_tree,
   1203     self._single_linkage_tree,
   1204     self._min_spanning_tree,
-> 1205 ) = hdbscan(clean_data, **kwargs)
   1207 if self.metric != "precomputed" and not self._all_finite:
   1208     # remap indices to align with original data in the case of non-finite entries.
   1209     self._condensed_tree = remap_condensed_tree(
   1210         self._condensed_tree, internal_to_raw, outliers
   1211     )

File ~\anaconda3\Lib\site-packages\hdbscan\hdbscan_.py:884, in hdbscan(X, min_cluster_size, min_samples, alpha, cluster_selection_epsilon, max_cluster_size, metric, p, leaf_size, algorithm, memory, approx_min_span_tree, gen_min_span_tree, core_dist_n_jobs, cluster_selection_method, allow_single_cluster, match_reference_implementation, **kwargs)
    867 else:
    868     (single_linkage_tree, result_min_span_tree) = memory.cache(
    869         _hdbscan_boruvka_balltree
    870     )(
   (...)
    880         **kwargs
    881     )
    883 return (
--> 884     _tree_to_labels(
    885         X,
    886         single_linkage_tree,
    887         min_cluster_size,
    888         cluster_selection_method,
    889         allow_single_cluster,
    890         match_reference_implementation,
    891         cluster_selection_epsilon,
    892         max_cluster_size,
    893     )
    894     + (result_min_span_tree,)
    895 )

File ~\anaconda3\Lib\site-packages\hdbscan\hdbscan_.py:80, in _tree_to_labels(X, single_linkage_tree, min_cluster_size, cluster_selection_method, allow_single_cluster, match_reference_implementation, cluster_selection_epsilon, max_cluster_size)
     78 condensed_tree = condense_tree(single_linkage_tree, min_cluster_size)
     79 stability_dict = compute_stability(condensed_tree)
---> 80 labels, probabilities, stabilities = get_clusters(
     81     condensed_tree,
     82     stability_dict,
     83     cluster_selection_method,
     84     allow_single_cluster,
     85     match_reference_implementation,
     86     cluster_selection_epsilon,
     87     max_cluster_size,
     88 )
     90 return (labels, probabilities, stabilities, condensed_tree, single_linkage_tree)

File hdbscan\_hdbscan_tree.pyx:659, in hdbscan._hdbscan_tree.get_clusters()

File hdbscan\_hdbscan_tree.pyx:733, in hdbscan._hdbscan_tree.get_clusters()

TypeError: 'numpy.float64' object cannot be interpreted as an integer

Attachment: DBSCAN Data.csv

FinnHuelsbusch commented 1 year ago

This seems to be a duplicate of #607. Maybe the fix suggested there will help you.