GGiecold-zz / DBSCAN_multiplex

A fast and efficient implementation of DBSCAN clustering.
MIT License
52 stars 17 forks source link

MemoryError #9

Closed StatguyUser closed 6 years ago

StatguyUser commented 6 years ago

eps, labels_matrix = DB.DBSCAN(textVect, minPts = 1000, verbose = True,metric='euclidean')

I tried running DBSCAN on an input matrix of dimension 300000×300 which came from doc2vec output. However, I am getting a memory error as follows.

INFO: DBSCAN_multiplex @ load:
starting the determination of an appropriate value of 'eps' for this data-set and for the other parameter of the DBSCAN algorithm set to 1000.
This might take a while.

---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
<ipython-input-15-20e71b464c7a> in <module>()
----> 1 eps, labels_matrix = DB.DBSCAN(textVect, minPts = 1000, verbose = True,metric='euclidean')

c:\python27\lib\site-packages\DBSCAN_multiplex.pyc in DBSCAN(data, minPts, eps, quantile, subsamples_matrix, samples_weights, metric, p, verbose)
    600 
    601     with open(path.join(getcwd(), 'tmp.h5'), 'w') as f:
--> 602         eps = load(f.name, data, minPts, eps, quantile, subsamples_matrix, samples_weights, metric, p, verbose)
    603 
    604         for run in xrange(N_runs):

c:\python27\lib\site-packages\DBSCAN_multiplex.pyc in load(hdf5_file_name, data, minPts, eps, quantile, subsamples_matrix, samples_weights, metric, p, verbose)
    271         quantile = np.clip(quantile, 0, 100)
    272 
--> 273         k_distances = kneighbors_graph(data, minPts, mode = 'distance', metric = metric, p = p).data
    274 
    275         radii = np.zeros(N_samples, dtype = float)

c:\python27\lib\site-packages\sklearn\neighbors\graph.pyc in kneighbors_graph(X, n_neighbors, mode, metric, p, metric_params, include_self, n_jobs)
    101 
    102     query = _query_include_self(X, include_self)
--> 103     return X.kneighbors_graph(X=query, n_neighbors=n_neighbors, mode=mode)
    104 
    105 

c:\python27\lib\site-packages\sklearn\neighbors\base.pyc in kneighbors_graph(self, X, n_neighbors, mode)
    487         elif mode == 'distance':
    488             A_data, A_ind = self.kneighbors(
--> 489                 X, n_neighbors, return_distance=True)
    490             A_data = np.ravel(A_data)
    491 

c:\python27\lib\site-packages\sklearn\neighbors\base.pyc in kneighbors(self, X, n_neighbors, return_distance)
    383                 delayed(self._tree.query, check_pickle=False)(
    384                     X[s], n_neighbors, return_distance)
--> 385                 for s in gen_even_slices(X.shape[0], n_jobs)
    386             )
    387             if return_distance:

c:\python27\lib\site-packages\sklearn\externals\joblib\parallel.pyc in __call__(self, iterable)
    777             # was dispatched. In particular this covers the edge
    778             # case of Parallel used with an exhausted iterator.
--> 779             while self.dispatch_one_batch(iterator):
    780                 self._iterating = True
    781             else:

c:\python27\lib\site-packages\sklearn\externals\joblib\parallel.pyc in dispatch_one_batch(self, iterator)
    623                 return False
    624             else:
--> 625                 self._dispatch(tasks)
    626                 return True
    627 

c:\python27\lib\site-packages\sklearn\externals\joblib\parallel.pyc in _dispatch(self, batch)
    586         dispatch_timestamp = time.time()
    587         cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 588         job = self._backend.apply_async(batch, callback=cb)
    589         self._jobs.append(job)
    590 

c:\python27\lib\site-packages\sklearn\externals\joblib\_parallel_backends.pyc in apply_async(self, func, callback)
    109     def apply_async(self, func, callback=None):
    110         """Schedule a func to be run"""
--> 111         result = ImmediateResult(func)
    112         if callback:
    113             callback(result)

c:\python27\lib\site-packages\sklearn\externals\joblib\_parallel_backends.pyc in __init__(self, batch)
    330         # Don't delay the application, to avoid keeping the input
    331         # arguments in memory
--> 332         self.results = batch()
    333 
    334     def get(self):

c:\python27\lib\site-packages\sklearn\externals\joblib\parallel.pyc in __call__(self)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
    132 
    133     def __len__(self):

sklearn\neighbors\binary_tree.pxi in sklearn.neighbors.kd_tree.BinaryTree.query()

c:\python27\lib\site-packages\sklearn\utils\validation.pyc in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
    431                                       force_all_finite)
    432     else:
--> 433         array = np.array(array, dtype=dtype, order=order, copy=copy)
    434 
    435         if ensure_2d:

MemoryError: 
GGiecold-zz commented 6 years ago

First of all, your call to eps = load(f.name, ...) uses a Euclidean metric. The similarity between word embeddings is usually assessed via a cosine distance.

Second, the memory error is triggered by a call to scikit-learn's kneighbors_graph. By setting minPts to 1000, you are launching the computation of the 1000 nearest neighbors for each sample. Get more RAM, reduce minPts, or come up with a memory-efficient implementation of kneighbors_graph — one that would evaluate a sparse CSR matrix of neighbors in chunks written, say, to HDF5.

Regards,

Gregory

On Tue, Jan 30, 2018 at 10:35 AM, newbiestatguy notifications@github.com wrote:

eps, labels_matrix = DB.DBSCAN(textVect, minPts = 1000, verbose = True,metric='euclidean')

I tried running DBSCAN on an input matrix of dimension 300000×300 which came from doc2vec output. However, I am getting a memory error as follows.

INFO: DBSCAN_multiplex @ load: starting the determination of an appropriate value of 'eps' for this data-set and for the other parameter of the DBSCAN algorithm set to 1000. This might take a while.


MemoryError Traceback (most recent call last)

in () ----> 1 eps, labels_matrix = DB.DBSCAN(textVect, minPts = 1000, verbose = True,metric='euclidean') c:\python27\lib\site-packages\DBSCAN_multiplex.pyc in DBSCAN(data, minPts, eps, quantile, subsamples_matrix, samples_weights, metric, p, verbose) 600 601 with open(path.join(getcwd(), 'tmp.h5'), 'w') as f: --> 602 eps = load(f.name, data, minPts, eps, quantile, subsamples_matrix, samples_weights, metric, p, verbose) 603 604 for run in xrange(N_runs): c:\python27\lib\site-packages\DBSCAN_multiplex.pyc in load(hdf5_file_name, data, minPts, eps, quantile, subsamples_matrix, samples_weights, metric, p, verbose) 271 quantile = np.clip(quantile, 0, 100) 272 --> 273 k_distances = kneighbors_graph(data, minPts, mode = 'distance', metric = metric, p = p).data 274 275 radii = np.zeros(N_samples, dtype = float) c:\python27\lib\site-packages\sklearn\neighbors\graph.pyc in kneighbors_graph(X, n_neighbors, mode, metric, p, metric_params, include_self, n_jobs) 101 102 query = _query_include_self(X, include_self) --> 103 return X.kneighbors_graph(X=query, n_neighbors=n_neighbors, mode=mode) 104 105 c:\python27\lib\site-packages\sklearn\neighbors\base.pyc in kneighbors_graph(self, X, n_neighbors, mode) 487 elif mode == 'distance': 488 A_data, A_ind = self.kneighbors( --> 489 X, n_neighbors, return_distance=True) 490 A_data = np.ravel(A_data) 491 c:\python27\lib\site-packages\sklearn\neighbors\base.pyc in kneighbors(self, X, n_neighbors, return_distance) 383 delayed(self._tree.query, check_pickle=False)( 384 X[s], n_neighbors, return_distance) --> 385 for s in gen_even_slices(X.shape[0], n_jobs) 386 ) 387 if return_distance: c:\python27\lib\site-packages\sklearn\externals\joblib\parallel.pyc in __call__(self, iterable) 777 # was dispatched. In particular this covers the edge 778 # case of Parallel used with an exhausted iterator. 
--> 779 while self.dispatch_one_batch(iterator): 780 self._iterating = True 781 else: c:\python27\lib\site-packages\sklearn\externals\joblib\parallel.pyc in dispatch_one_batch(self, iterator) 623 return False 624 else: --> 625 self._dispatch(tasks) 626 return True 627 c:\python27\lib\site-packages\sklearn\externals\joblib\parallel.pyc in _dispatch(self, batch) 586 dispatch_timestamp = time.time() 587 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self) --> 588 job = self._backend.apply_async(batch, callback=cb) 589 self._jobs.append(job) 590 c:\python27\lib\site-packages\sklearn\externals\joblib\_parallel_backends.pyc in apply_async(self, func, callback) 109 def apply_async(self, func, callback=None): 110 """Schedule a func to be run""" --> 111 result = ImmediateResult(func) 112 if callback: 113 callback(result) c:\python27\lib\site-packages\sklearn\externals\joblib\_parallel_backends.pyc in __init__(self, batch) 330 # Don't delay the application, to avoid keeping the input 331 # arguments in memory --> 332 self.results = batch() 333 334 def get(self): c:\python27\lib\site-packages\sklearn\externals\joblib\parallel.pyc in __call__(self) 129 130 def __call__(self): --> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items] 132 133 def __len__(self): sklearn\neighbors\binary_tree.pxi in sklearn.neighbors.kd_tree.BinaryTree.query() c:\python27\lib\site-packages\sklearn\utils\validation.pyc in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator) 431 force_all_finite) 432 else: --> 433 array = np.array(array, dtype=dtype, order=order, copy=copy) 434 435 if ensure_2d: MemoryError: — You are receiving this because you are subscribed to this thread. Reply to this email directly, view it on GitHub , or mute the thread .