Open fras2560 opened 7 years ago
I got an out of memory error when trying to visualise an LDA model. Is there any way to avoid this? Or is there a max size the corpus can be?
I have the same issue on Ubuntu with a medium-sized gensim LDA model. The library starts several sub-processes, each taking 4 GB of RAM. Is this behavior considered normal? The gensim model's corpus is a serialized MM corpus. @fras2560, have you solved the issue?
@mohgh No, I was never able to resolve this and didn't find any helpful information about it.
There is an n_jobs parameter for the prepare function that controls the number of parallel processes. Setting it to a low number (such as 1) may resolve this issue, as in the sketch below.
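A minimal sketch of that workaround, assuming the model statistics have already been computed (the variable names here are placeholders):

import pyLDAvis

# n_jobs=1 keeps the relevance computation in the current process,
# so joblib never forks a copy of the (potentially multi-GB) parent interpreter.
vis = pyLDAvis.prepare(topic_term_dists, doc_topic_dists, doc_lengths,
                       vocab, term_frequency, n_jobs=1)
pyLDAvis.save_html(vis, 'lda_vis.html')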
Same problem:
$ free
             total       used       free     shared    buffers     cached
Mem:      31399940   30868104     531836      15452      58120     187620
-/+ buffers/cache:   30622364     777576
Swap:            0          0          0
import pyLDAvis
topics = pyLDAvis.prepare(topic_term_dist, doc_topic_dist, doc_len, vocab, term_frec)
pyLDAvis.save_html(topics, fileobj="results2.html")
print(topic_term_dist.shape)
print(doc_topic_dist.shape)
print(doc_len.shape)
print(vocab.shape)
print(term_frec.shape)
(10, 11463)
(5811, 10)
(5811,)
(11463,)
(11463,)
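As a side note, those shapes line up the way prepare expects, so the failure below is not an input-shape problem; a quick sanity check along these lines (using the variables from the snippet above) can rule that out before blaming memory:

n_topics, n_terms = topic_term_dist.shape   # (10, 11463)
n_docs = doc_topic_dist.shape[0]            # 5811

# prepare expects: topics x terms, docs x topics, and three 1-D arrays
assert doc_topic_dist.shape == (n_docs, n_topics)
assert doc_len.shape == (n_docs,)
assert vocab.shape == (n_terms,)
assert term_frec.shape == (n_terms,)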
---------------------------------------------------------------------------
OSError Traceback (most recent call last)
<ipython-input-30-7a37feb7b93c> in <module>()
1 import pyLDAvis
----> 2 topics = pyLDAvis.prepare(topic_term_dist, doc_topic_dist, doc_len, vocab, term_frec)
3 pyLDAvis.save_html(topics, fileobj="results2.html")
~/anaconda/envs/ai/lib/python3.6/site-packages/pyLDAvis/_prepare.py in prepare(topic_term_dists, doc_topic_dists, doc_lengths, vocab, term_frequency, R, lambda_step, mds, n_jobs, plot_opts, sort_topics)
396 term_frequency = np.sum(term_topic_freq, axis=0)
397
--> 398 topic_info = _topic_info(topic_term_dists, topic_proportion, term_frequency, term_topic_freq, vocab, lambda_step, R, n_jobs)
399 token_table = _token_table(topic_info, term_topic_freq, vocab, term_frequency)
400 topic_coordinates = _topic_coordinates(mds, topic_term_dists, topic_proportion)
~/anaconda/envs/ai/lib/python3.6/site-packages/pyLDAvis/_prepare.py in _topic_info(topic_term_dists, topic_proportion, term_frequency, term_topic_freq, vocab, lambda_step, R, n_jobs)
253
254 top_terms = pd.concat(Parallel(n_jobs=n_jobs)(delayed(_find_relevance_chunks)(log_ttd, log_lift, R, ls) \
--> 255 for ls in _job_chunks(lambda_seq, n_jobs)))
256 topic_dfs = map(topic_top_term_df, enumerate(top_terms.T.iterrows(), 1))
257 return pd.concat([default_term_info] + list(topic_dfs))
~/anaconda/envs/ai/lib/python3.6/site-packages/joblib/parallel.py in __call__(self, iterable)
904
905 if not self._managed_backend:
--> 906 n_jobs = self._initialize_backend()
907 else:
908 n_jobs = self._effective_n_jobs()
~/anaconda/envs/ai/lib/python3.6/site-packages/joblib/parallel.py in _initialize_backend(self)
703 try:
704 n_jobs = self._backend.configure(n_jobs=self.n_jobs, parallel=self,
--> 705 **self._backend_args)
706 if self.timeout is not None and not self._backend.supports_timeout:
707 warnings.warn(
~/anaconda/envs/ai/lib/python3.6/site-packages/joblib/_parallel_backends.py in configure(self, n_jobs, parallel, prefer, require, idle_worker_timeout, **memmappingexecutor_args)
468 n_jobs, timeout=idle_worker_timeout,
469 initializer=self.limit_clib_threads,
--> 470 **memmappingexecutor_args)
471 self.parallel = parallel
472 return n_jobs
~/anaconda/envs/ai/lib/python3.6/site-packages/joblib/executor.py in get_memmapping_executor(n_jobs, timeout, initializer, initargs, **backend_args)
34 reuse=reuse, timeout=timeout,
35 initializer=initializer,
---> 36 initargs=initargs)
37 # If executor doesn't have a _temp_folder, it means it is a new executor
38 # and the reducers have been used. Else, the previous reducers are used
~/anaconda/envs/ai/lib/python3.6/site-packages/joblib/externals/loky/reusable_executor.py in get_reusable_executor(max_workers, context, timeout, kill_workers, reuse, job_reducers, result_reducers, initializer, initargs)
106 _executor = executor = _ReusablePoolExecutor(
107 _executor_lock, max_workers=max_workers,
--> 108 executor_id=executor_id, **kwargs)
109 else:
110 if reuse == 'auto':
~/anaconda/envs/ai/lib/python3.6/site-packages/joblib/externals/loky/reusable_executor.py in __init__(self, submit_resize_lock, max_workers, context, timeout, executor_id, job_reducers, result_reducers, initializer, initargs)
142 max_workers=max_workers, context=context, timeout=timeout,
143 job_reducers=job_reducers, result_reducers=result_reducers,
--> 144 initializer=initializer, initargs=initargs)
145 self.executor_id = executor_id
146 self._submit_resize_lock = submit_resize_lock
~/anaconda/envs/ai/lib/python3.6/site-packages/joblib/externals/loky/process_executor.py in __init__(self, max_workers, job_reducers, result_reducers, timeout, context, initializer, initargs)
823 self._running_work_items = []
824 self._work_ids = queue.Queue()
--> 825 self._processes_management_lock = self._context.Lock()
826 self._queue_management_thread = None
827
~/anaconda/envs/ai/lib/python3.6/site-packages/joblib/externals/loky/backend/context.py in Lock(self)
185 """Returns a lock object"""
186 from .synchronize import Lock
--> 187 return Lock()
188
189 def RLock(self):
~/anaconda/envs/ai/lib/python3.6/site-packages/joblib/externals/loky/backend/synchronize.py in __init__(self)
172
173 def __init__(self):
--> 174 super(Lock, self).__init__(SEMAPHORE, 1, 1)
175
176 def __repr__(self):
~/anaconda/envs/ai/lib/python3.6/site-packages/joblib/externals/loky/backend/synchronize.py in __init__(self, kind, value, maxvalue)
88 # When the object is garbage collected or the
89 # process shuts down we unlink the semaphore name
---> 90 semaphore_tracker.register(self._semlock.name)
91 util.Finalize(self, SemLock._cleanup, (self._semlock.name,),
92 exitpriority=0)
~/anaconda/envs/ai/lib/python3.6/site-packages/joblib/externals/loky/backend/semaphore_tracker.py in register(self, name)
115 def register(self, name):
116 '''Register name of semaphore with semaphore tracker.'''
--> 117 self.ensure_running()
118 self._send('REGISTER', name)
119
~/anaconda/envs/ai/lib/python3.6/site-packages/joblib/externals/loky/backend/semaphore_tracker.py in ensure_running(self)
94 args += ['-c', cmd % r]
95 util.debug("launching Semaphore tracker: {}".format(args))
---> 96 pid = spawnv_passfds(exe, args, fds_to_pass)
97 except BaseException:
98 os.close(w)
~/anaconda/envs/ai/lib/python3.6/site-packages/joblib/externals/loky/backend/semaphore_tracker.py in spawnv_passfds(path, args, passfds)
230 _pass += [_mk_inheritable(fd)]
231 from .fork_exec import fork_exec
--> 232 return fork_exec(args, _pass)
233 finally:
234 os.close(errpipe_read)
~/anaconda/envs/ai/lib/python3.6/site-packages/joblib/externals/loky/backend/fork_exec.py in fork_exec(cmd, keep_fds)
36 def fork_exec(cmd, keep_fds):
37
---> 38 pid = os.fork()
39 if pid == 0: # pragma: no cover
40 close_fds(keep_fds)
OSError: [Errno 12] Cannot allocate memory
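The last frame shows the allocation failing inside os.fork(): joblib's loky backend is launching a helper process, and with no swap configured (see the free output above) the kernel cannot commit a copy of the parent's address space. Besides passing n_jobs=1, one possible workaround, assuming pyLDAvis keeps calling joblib.Parallel without a hard-coded backend (as this traceback suggests), is to switch joblib to its thread-based backend for the call, which avoids forking entirely:

from joblib import parallel_backend
import pyLDAvis

# Threads share the parent's memory rather than fork()-ing a copy of it,
# trading parallel speed-up for a much smaller memory footprint.
with parallel_backend('threading'):
    vis = pyLDAvis.prepare(topic_term_dist, doc_topic_dist, doc_len,
                           vocab, term_frec)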
I also faced the same issue: "A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker."
Then I read that this can be solved by changing the n_jobs parameter.
Below is a link to an updated copy of the library code with n_jobs set to 1: pyLDAvis_local.zip
Extract the pyLDAvis_local.zip file and put the pyLDAvis_local folder in the same directory as your Python code.
Import the library (using absolute imports, since the folder sits next to your script; the relative form only works inside a package):
import pyLDAvis_local
from pyLDAvis_local import gensim_local
vis = gensim_local.prepare(lda_model, corpus, id2word)
Use this code with pyLDAvis_local and gensim_local in place of the stock modules.
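Vendoring the library may not be necessary, though: pyLDAvis.gensim.prepare forwards extra keyword arguments through to the core prepare function, so on a stock install the same effect should be achievable with just:

import pyLDAvis.gensim

# n_jobs is passed through **kwargs down to pyLDAvis.prepare
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, n_jobs=1)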