I'm training a Top2Vec model using the Universal Sentence Encoder, and I'm getting a TypeError:
'numpy.float64' object cannot be interpreted as an integer
All of my input text is of string type (I've quadruple-checked), and this has worked before, so I'm at a loss as to what is causing this error or where it originates. Is it coming from hdbscan?
TypeError Traceback (most recent call last)
Cell In[12], line 2
1 # Train the top2vec model
----> 2 model = Top2Vec(df.Q10.values, embedding_model='universal-sentence-encoder') #, embedding_model='universal-sentence-encoder'
File ~/opt/anaconda3/envs/top2vec/lib/python3.10/site-packages/top2vec/Top2Vec.py:666, in Top2Vec.__init__(self, documents, min_count, topic_merge_delta, ngram_vocab, ngram_vocab_args, embedding_model, embedding_model_path, embedding_batch_size, split_documents, document_chunker, chunk_length, max_num_chunks, chunk_overlap_ratio, chunk_len_coverage_ratio, sentencizer, speed, use_corpus_file, document_ids, keep_documents, workers, tokenizer, use_embedding_model_tokenizer, umap_args, hdbscan_args, verbose)
663 else:
664 raise ValueError(f"{embedding_model} is an invalid embedding model.")
--> 666 self.compute_topics(umap_args=umap_args, hdbscan_args=hdbscan_args, topic_merge_delta=topic_merge_delta)
668 # initialize document indexing variables
669 self.document_index = None
File ~/opt/anaconda3/envs/top2vec/lib/python3.10/site-packages/top2vec/Top2Vec.py:1266, in Top2Vec.compute_topics(self, umap_args, hdbscan_args, topic_merge_delta)
1261 if hdbscan_args is None:
1262 hdbscan_args = {'min_cluster_size': 15,
1263 'metric': 'euclidean',
1264 'cluster_selection_method': 'eom'}
-> 1266 cluster = hdbscan.HDBSCAN(**hdbscan_args).fit(umap_model.embedding_)
1268 # calculate topic vectors from dense areas of documents
1269 logger.info('Finding topics')
File ~/opt/anaconda3/envs/top2vec/lib/python3.10/site-packages/hdbscan/hdbscan_.py:1205, in HDBSCAN.fit(self, X, y)
1195 kwargs.pop("prediction_data", None)
1196 kwargs.update(self._metric_kwargs)
1198 (
1199 self.labels_,
1200 self.probabilities_,
1201 self.cluster_persistence_,
1202 self._condensed_tree,
1203 self._single_linkage_tree,
1204 self._min_spanning_tree,
-> 1205 ) = hdbscan(clean_data, **kwargs)
1207 if self.metric != "precomputed" and not self._all_finite:
1208 # remap indices to align with original data in the case of non-finite entries.
1209 self._condensed_tree = remap_condensed_tree(
1210 self._condensed_tree, internal_to_raw, outliers
1211 )
File ~/opt/anaconda3/envs/top2vec/lib/python3.10/site-packages/hdbscan/hdbscan_.py:884, in hdbscan(X, min_cluster_size, min_samples, alpha, cluster_selection_epsilon, max_cluster_size, metric, p, leaf_size, algorithm, memory, approx_min_span_tree, gen_min_span_tree, core_dist_n_jobs, cluster_selection_method, allow_single_cluster, match_reference_implementation, **kwargs)
867 else:
868 (single_linkage_tree, result_min_span_tree) = memory.cache(
869 _hdbscan_boruvka_balltree
870 )(
(...)
880 **kwargs
881 )
883 return (
--> 884 _tree_to_labels(
885 X,
886 single_linkage_tree,
887 min_cluster_size,
888 cluster_selection_method,
889 allow_single_cluster,
890 match_reference_implementation,
891 cluster_selection_epsilon,
892 max_cluster_size,
893 )
894 + (result_min_span_tree,)
895 )
File ~/opt/anaconda3/envs/top2vec/lib/python3.10/site-packages/hdbscan/hdbscan_.py:80, in _tree_to_labels(X, single_linkage_tree, min_cluster_size, cluster_selection_method, allow_single_cluster, match_reference_implementation, cluster_selection_epsilon, max_cluster_size)
78 condensed_tree = condense_tree(single_linkage_tree, min_cluster_size)
79 stability_dict = compute_stability(condensed_tree)
---> 80 labels, probabilities, stabilities = get_clusters(
81 condensed_tree,
82 stability_dict,
83 cluster_selection_method,
84 allow_single_cluster,
85 match_reference_implementation,
86 cluster_selection_epsilon,
87 max_cluster_size,
88 )
90 return (labels, probabilities, stabilities, condensed_tree, single_linkage_tree)
File hdbscan/_hdbscan_tree.pyx:659, in hdbscan._hdbscan_tree.get_clusters()
File hdbscan/_hdbscan_tree.pyx:733, in hdbscan._hdbscan_tree.get_clusters()
TypeError: 'numpy.float64' object cannot be interpreted as an integer
I'm training a Top2Vec model using the Universal Sentence Encoder, and I'm getting a TypeError:
'numpy.float64' object cannot be interpreted as an integer
All of my input text is of string type (I've quadruple-checked), and this has worked before, so I'm at a loss as to what is causing this error or where it originates. Is it coming from hdbscan?