I am implementing HDBSCAN on batches of data. I concatenate the inputs from each batch into a list and fit HDBSCAN on the concatenation, which naturally gets slower as I go through more batches.
Hence I thought of a solution: I collect the resulting medoids and their labels as I go through the batches, so some of them can be reused in later batches, and I pop the oldest items from my input list every N iterations.
Below is a simple version of my original implementation:
```python
import os
import gzip
import pickle

import numpy as np
import torch
from tqdm import tqdm
from sklearn.cluster import HDBSCAN

# Update this with your arguments
args = {'traindir': '/path/to/your/data/folder', 'minclustersize': 5, 'n_jobs': -1,
        'ghost_freq': 5, 'ghost_coeff': 0.5, 'clustersavefreq': 10,
        'batch_size': 32, 'workers': 0}

def train(train_loader):
    resultpath = '/path/to/your/result/folder'  # Update this with your result folder path

    # Return a previously finished run if one exists
    if os.path.exists(os.path.join(resultpath, 'train_hdbscan_Fin.gzip')):
        with gzip.open(os.path.join(resultpath, 'train_hdbscan_Fin.gzip'), 'rb') as f:
            return pickle.load(f)

    train_checkpoint_queue = []
    model = HDBSCAN(min_cluster_size=args['minclustersize'], n_jobs=args['n_jobs'],
                    store_centers='medoid', copy=True)
    input_list, medoids_list = [], []

    for i, (inputs, path) in enumerate(tqdm(train_loader)):
        input_tensors = torch.stack([inp.cuda() for inp in inputs]).detach().cpu().clone().numpy()
        input_list.append(input_tensors)

        # Every ghost_freq batches, pop the oldest batches from the window
        if i % args['ghost_freq'] == 0:
            items_to_pop = max(1, int(i * args['ghost_coeff']))
            for _ in range(items_to_pop):
                if input_list:
                    input_list.pop(0)

        # Fit on whatever remains in the window, falling back to the current batch
        if input_list:
            input_data = np.concatenate(input_list, axis=0)
            model.fit(input_data)
        else:
            model.fit(input_tensors)

        # Keep only labels that were not seen before (this is the part I am unsure about)
        if model.labels_.size > 0:
            new_labels = model.labels_[~np.in1d(model.labels_, model.labels_[:-1])]
            if new_labels.size > 0:
                model.labels_ = np.concatenate((model.labels_.reshape(1, -1)), axis=0)

        # Boolean index reshaped to match the shape of model.medoids_
        num_rows, num_cols = model.medoids_.shape
        bool_index = ~np.in1d(model.medoids_.ravel(),
                              model.medoids_[:-1].ravel()).reshape((num_rows, num_cols))
        if model.medoids_.size > 0:
            new_medoids = model.medoids_[bool_index]
            if new_medoids.size > 0:
                model.medoids_ = np.concatenate((model.medoids_.reshape(1, -1)), axis=0)

        if i % args['clustersavefreq'] == 0:
            unique_labels = list(set(model.labels_.tolist()))
            medoids_list = model.medoids_.tolist()
            # Rows are lists and lists are unhashable, so convert to tuples first
            unique_medoids = list(set(map(tuple, medoids_list)))
            with open(resultpath + f'/learn_num_clusters{i}.txt', 'w+', encoding='utf-8') as f:
                print(f'Unique clusters from the current alive batch: {unique_labels}\n'
                      f'Length of medoids_list: {len(unique_medoids)}\n'
                      f'Size of input list: {len(input_list)}', file=f)
            # save_checkpoint({'batch_idx': i, 'input_list': input_list, 'model': model,
            #                  'medoids_list': model.medoids_, 'model_labels': model.labels_,
            #                  'mainlist': mainlist}, filename=f'learn_checkpoint{i}.pth')
            # manage_learn_checkpoints(i, train_checkpoint_queue, resultpath)

# Define the save_checkpoint and manage_learn_checkpoints functions as needed
```
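To make the medoid-reuse idea above concrete, this is roughly the mapping between batches I had in mind (a hypothetical sketch; `carry_over_labels` and the distance threshold are made up and not in my current code):

```python
import numpy as np
from sklearn.metrics import pairwise_distances_argmin_min

def carry_over_labels(new_medoids, old_medoids, old_medoid_labels, max_dist=1.0):
    """For each new medoid, find the nearest old medoid and reuse its label
    when it is close enough; otherwise mark it as a new/unmatched cluster (-1).
    Hypothetical helper -- the threshold and names are placeholders."""
    nearest_idx, dists = pairwise_distances_argmin_min(new_medoids, old_medoids)
    return np.where(dists <= max_dist, np.asarray(old_medoid_labels)[nearest_idx], -1)
```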
I am not sure if this is being done correctly, because when my Nth batch has 419 samples the labels generated run from -1 to 356, but if the next batch only has 4 samples the labels are between -1 and 4, even though some of those samples could clearly belong to the 20th label. Or maybe I am misunderstanding how model.labels_ works. Any insights/suggestions for me? Thank you in advance.
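For reference, here is a minimal standalone repro of the label behavior I am describing (synthetic data, so the exact printed values may vary):

```python
import numpy as np
from sklearn.cluster import HDBSCAN

rng = np.random.default_rng(0)

# "Batch" A: three well-separated blobs -> labels 0, 1, 2 (plus -1 for any noise)
batch_a = np.concatenate([rng.normal(loc=c, scale=0.1, size=(50, 2)) for c in (0, 5, 10)])
model = HDBSCAN(min_cluster_size=5, store_centers='medoid')
model.fit(batch_a)
print(np.unique(model.labels_))  # e.g. [-1  0  1  2]

# "Batch" B: points near only two of the previous blobs.
# Refitting relabels from 0 again; label 0 here is unrelated to label 0 above.
batch_b = np.concatenate([rng.normal(loc=c, scale=0.1, size=(10, 2)) for c in (0, 10)])
model.fit(batch_b)
print(np.unique(model.labels_))  # e.g. [0 1] -- labels restart from 0 on every fit
```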