lmcinnes / enstop

Ensemble topic modelling with pLSA
BSD 2-Clause "Simplified" License
112 stars 12 forks source link

FloatingPointError. NMF #6

Open lyriccoder opened 4 years ago

lyriccoder commented 4 years ago

I can't run the NMF algorithm. When I run:

%%time
nmf_model = NMF(n_components=20, beta_loss='kullback-leibler', solver='mu').fit(data)

... I see the following error stack:

---------------------------------------------------------------------------
FloatingPointError                        Traceback (most recent call last)
<timed exec> in <module>

d:\pycharmprojects\biclustering\venv\lib\site-packages\sklearn\decomposition\_nmf.py in fit(self, X, y, **params)
   1310         self
   1311         """
-> 1312         self.fit_transform(X, **params)
   1313         return self
   1314 

d:\pycharmprojects\biclustering\venv\lib\site-packages\sklearn\decomposition\_nmf.py in fit_transform(self, X, y, W, H)
   1285             l1_ratio=self.l1_ratio, regularization='both',
   1286             random_state=self.random_state, verbose=self.verbose,
-> 1287             shuffle=self.shuffle)
   1288 
   1289         self.reconstruction_err_ = _beta_divergence(X, W, H, self.beta_loss,

d:\pycharmprojects\biclustering\venv\lib\site-packages\sklearn\decomposition\_nmf.py in non_negative_factorization(X, W, H, n_components, init, update_H, solver, beta_loss, tol, max_iter, alpha, l1_ratio, regularization, random_state, verbose, shuffle)
   1067                                                   tol, l1_reg_W, l1_reg_H,
   1068                                                   l2_reg_W, l2_reg_H, update_H,
-> 1069                                                   verbose)
   1070 
   1071     else:

d:\pycharmprojects\biclustering\venv\lib\site-packages\sklearn\decomposition\_nmf.py in _fit_multiplicative_update(X, W, H, beta_loss, max_iter, tol, l1_reg_W, l1_reg_H, l2_reg_W, l2_reg_H, update_H, verbose)
    810         if update_H:
    811             delta_H = _multiplicative_update_h(X, W, H, beta_loss, l1_reg_H,
--> 812                                                l2_reg_H, gamma)
    813             H *= delta_H
    814 

d:\pycharmprojects\biclustering\venv\lib\site-packages\sklearn\decomposition\_nmf.py in _multiplicative_update_h(X, W, H, beta_loss, l1_reg_H, l2_reg_H, gamma)
    634     else:
    635         # Numerator
--> 636         WH_safe_X = _special_sparse_dot(W, H, X)
    637         if sp.issparse(X):
    638             WH_safe_X_data = WH_safe_X.data

d:\pycharmprojects\biclustering\venv\lib\site-packages\sklearn\decomposition\_nmf.py in _special_sparse_dot(W, H, X)
    178             batch = slice(start, start + batch_size)
    179             dot_vals[batch] = np.multiply(W[ii[batch], :],
--> 180                                           H.T[jj[batch], :]).sum(axis=1)
    181 
    182         WH = sp.coo_matrix((dot_vals, (ii, jj)), shape=X.shape)

FloatingPointError: underflow encountered in multiply

I also get the same error for LatentDirichletAllocation when I choose 448 components for 25,000 rows:

%%time
lda_model = LatentDirichletAllocation(n_components=448).fit(data_vec)
---------------------------------------------------------------------------
FloatingPointError                        Traceback (most recent call last)
<timed exec> in <module>

d:\pycharmprojects\biclustering\venv\lib\site-packages\sklearn\decomposition\_online_lda.py in fit(self, X, y)
    566                     # batch update
    567                     self._em_step(X, total_samples=n_samples,
--> 568                                   batch_update=True, parallel=parallel)
    569 
    570                 # check perplexity

d:\pycharmprojects\biclustering\venv\lib\site-packages\sklearn\decomposition\_online_lda.py in _em_step(self, X, total_samples, batch_update, parallel)
    446         # E-step
    447         _, suff_stats = self._e_step(X, cal_sstats=True, random_init=True,
--> 448                                      parallel=parallel)
    449 
    450         # M-step

d:\pycharmprojects\biclustering\venv\lib\site-packages\sklearn\decomposition\_online_lda.py in _e_step(self, X, cal_sstats, random_init, parallel)
    399                                               self.mean_change_tol, cal_sstats,
    400                                               random_state)
--> 401             for idx_slice in gen_even_slices(X.shape[0], n_jobs))
    402 
    403         # merge result

d:\pycharmprojects\biclustering\venv\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
   1001             # remaining jobs.
   1002             self._iterating = False
-> 1003             if self.dispatch_one_batch(iterator):
   1004                 self._iterating = self._original_iterator is not None
   1005 

d:\pycharmprojects\biclustering\venv\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
    832                 return False
    833             else:
--> 834                 self._dispatch(tasks)
    835                 return True
    836 

d:\pycharmprojects\biclustering\venv\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
    751         with self._lock:
    752             job_idx = len(self._jobs)
--> 753             job = self._backend.apply_async(batch, callback=cb)
    754             # A job can complete so quickly than its callback is
    755             # called before we get here, causing self._jobs to

d:\pycharmprojects\biclustering\venv\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
    199     def apply_async(self, func, callback=None):
    200         """Schedule a func to be run"""
--> 201         result = ImmediateResult(func)
    202         if callback:
    203             callback(result)

d:\pycharmprojects\biclustering\venv\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
    580         # Don't delay the application, to avoid keeping the input
    581         # arguments in memory
--> 582         self.results = batch()
    583 
    584     def get(self):

d:\pycharmprojects\biclustering\venv\lib\site-packages\joblib\parallel.py in __call__(self)
    254         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    255             return [func(*args, **kwargs)
--> 256                     for func, args, kwargs in self.items]
    257 
    258     def __len__(self):

d:\pycharmprojects\biclustering\venv\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
    254         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    255             return [func(*args, **kwargs)
--> 256                     for func, args, kwargs in self.items]
    257 
    258     def __len__(self):

d:\pycharmprojects\biclustering\venv\lib\site-packages\sklearn\decomposition\_online_lda.py in _update_doc_distribution(X, exp_topic_word_distr, doc_topic_prior, max_iters, mean_change_tol, cal_sstats, random_state)
    115 
    116             doc_topic_d = (exp_doc_topic_d *
--> 117                            np.dot(cnts / norm_phi, exp_topic_word_d.T))
    118             # Note: adds doc_topic_prior to doc_topic_d, in-place.
    119             _dirichlet_expectation_1d(doc_topic_d, doc_topic_prior,

FloatingPointError: underflow encountered in multiply

Could you please help? I am using Python 3.7.5 x64 on Windows 10.