File ~/anaconda3/Git/pycisTopic/src/pycisTopic/lda_models.py:467, in LDAMallet.__init__(self, num_topics, corpus, alpha, eta, id2word, n_cpu, tmp_dir, optimize_interval, iterations, topic_threshold, random_seed, reuse_corpus, mallet_path)
465 self.mallet_path = mallet_path
466 if corpus is not None:
--> 467 self.train(corpus, reuse_corpus)
File ~/anaconda3/Git/pycisTopic/src/pycisTopic/lda_models.py:552, in LDAMallet.train(self, corpus, reuse_corpus)
550 logger = logging.getLogger("LDAMalletWrapper")
551 if os.path.isfile(self.fcorpusmallet()) is False or reuse_corpus is False:
--> 552 self.convert_input(corpus)
553 else:
554 logger.info("MALLET corpus already exists, training model")
File ~/anaconda3/Git/pycisTopic/src/pycisTopic/lda_models.py:534, in LDAMallet.convert_input(self, corpus)
532 subprocess.check_output(args=cmd, shell=False, stderr=subprocess.STDOUT)
533 except subprocess.CalledProcessError as e:
--> 534 raise RuntimeError(
535 f"command '{e.cmd}' return with error (code {e.returncode}): {e.output}"
536 )
Describe the bug:
CalledProcessError: Command '['/home/taxue/mywork/pycisTopic/Mallet-202108/bin/mallet', 'import-file', '--preserve-case', '--keep-sequence', '--token-regex', '\S+', '--input', './scratch/leuven/330/vsc33053/ray_spill/mallet/tutorial/corpus.txt', '--output', './scratch/leuven/330/vsc33053/ray_spill/mallet/tutorial/corpus.mallet']' returned non-zero exit status 127.
To Reproduce:
```python
os.environ['MALLET_MEMORY'] = '200G'
from pycisTopic.lda_models import run_cgs_models_mallet
# Configure path Mallet
mallet_path="/home/taxue/mywork/pycisTopic/Mallet-202108/bin/mallet"
# Run models
models=run_cgs_models_mallet(
    cistopic_obj,
    n_topics=[2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50],
    n_cpu=12,
    n_iter=500,
    random_state=555,
    alpha=50,
    alpha_by_topic=True,
    eta=0.1,
    eta_by_topic=False,
    tmp_path="./scratch/leuven/330/vsc33053/ray_spill/mallet/tutorial",
    save_path="./scratch/leuven/330/vsc33053/ray_spill/mallet/tutorial",
    mallet_path=mallet_path,
)
```
Error output:
CalledProcessError Traceback (most recent call last) File ~/anaconda3/Git/pycisTopic/src/pycisTopic/lda_models.py:532, in LDAMallet.convert_input(self, corpus) 531 try: --> 532 subprocess.check_output(args=cmd, shell=False, stderr=subprocess.STDOUT) 533 except subprocess.CalledProcessError as e:
File ~/anaconda3/envs/pycisTopic/lib/python3.11/subprocess.py:466, in check_output(timeout, *popenargs, **kwargs) 464 kwargs['input'] = empty --> 466 return run(*popenargs, stdout=PIPE, timeout=timeout, check=True, 467 **kwargs).stdout
File ~/anaconda3/envs/pycisTopic/lib/python3.11/subprocess.py:571, in run(input, capture_output, timeout, check, *popenargs, **kwargs) 570 if check and retcode: --> 571 raise CalledProcessError(retcode, process.args, 572 output=stdout, stderr=stderr) 573 return CompletedProcess(process.args, retcode, stdout, stderr)
CalledProcessError: Command '['/home/taxue/mywork/pycisTopic/Mallet-202108/bin/mallet', 'import-file', '--preserve-case', '--keep-sequence', '--token-regex', '\S+', '--input', './scratch/leuven/330/vsc33053/ray_spill/mallet/tutorial/corpus.txt', '--output', './scratch/leuven/330/vsc33053/ray_spill/mallet/tutorial/corpus.mallet']' returned non-zero exit status 127.
During handling of the above exception, another exception occurred:
RuntimeError Traceback (most recent call last) Cell In[47], line 6 4 mallet_path="/home/taxue/mywork/pycisTopic/Mallet-202108/bin/mallet" 5 # Run models ----> 6 models=run_cgs_models_mallet( 7 cistopic_obj, 8 n_topics=[2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50], 9 n_cpu=12, 10 n_iter=500, 11 random_state=555, 12 alpha=50, 13 alpha_by_topic=True, 14 eta=0.1, 15 eta_by_topic=False, 16 tmp_path="./scratch/leuven/330/vsc33053/ray_spill/mallet/tutorial", 17 save_path="./scratch/leuven/330/vsc33053/ray_spill/mallet/tutorial", 18 mallet_path=mallet_path, 19 )
File ~/anaconda3/Git/pycisTopic/src/pycisTopic/lda_models.py:806, in run_cgs_models_mallet(cistopic_obj, n_topics, n_cpu, n_iter, random_state, alpha, alpha_by_topic, eta, eta_by_topic, top_topics_coh, tmp_path, save_path, reuse_corpus, mallet_path) 803 corpus = matutils.Sparse2Corpus(binary_matrix) 804 id2word = utils.FakeDict(len(region_names)) --> 806 model_list = [ 807 run_cgs_model_mallet( 808 binary_matrix=binary_matrix, 809 corpus=corpus, 810 id2word=id2word, 811 n_topics=n_topic, 812 cell_names=cell_names, 813 region_names=region_names, 814 n_cpu=n_cpu, 815 n_iter=n_iter, 816 random_state=random_state, 817 alpha=alpha, 818 alpha_by_topic=alpha_by_topic, 819 eta=eta, 820 eta_by_topic=eta_by_topic, 821 top_topics_coh=top_topics_coh, 822 tmp_path=tmp_path, 823 save_path=save_path, 824 reuse_corpus=reuse_corpus, 825 mallet_path=mallet_path, 826 ) 827 for n_topic in n_topics 828 ] 829 return model_list
File ~/anaconda3/Git/pycisTopic/src/pycisTopic/lda_models.py:807, in <listcomp>(.0)
803 corpus = matutils.Sparse2Corpus(binary_matrix)
804 id2word = utils.FakeDict(len(region_names))
806 model_list = [
--> 807 run_cgs_model_mallet(
808 binary_matrix=binary_matrix,
809 corpus=corpus,
810 id2word=id2word,
811 n_topics=n_topic,
812 cell_names=cell_names,
813 region_names=region_names,
814 n_cpu=n_cpu,
815 n_iter=n_iter,
816 random_state=random_state,
817 alpha=alpha,
818 alpha_by_topic=alpha_by_topic,
819 eta=eta,
820 eta_by_topic=eta_by_topic,
821 top_topics_coh=top_topics_coh,
822 tmp_path=tmp_path,
823 save_path=save_path,
824 reuse_corpus=reuse_corpus,
825 mallet_path=mallet_path,
826 )
827 for n_topic in n_topics
828 ]
829 return model_list
File ~/anaconda3/Git/pycisTopic/src/pycisTopic/lda_models.py:916, in run_cgs_model_mallet(binary_matrix, corpus, id2word, n_topics, cell_names, region_names, n_cpu, n_iter, random_state, alpha, alpha_by_topic, eta, eta_by_topic, top_topics_coh, tmp_path, save_path, reuse_corpus, mallet_path) 914 start = time.time() 915 log.info(f"Running model with {n_topics} topics") --> 916 model = LDAMallet( 917 corpus=corpus, 918 id2word=id2word, 919 num_topics=n_topics, 920 iterations=n_iter, 921 alpha=alpha, 922 eta=eta, 923 n_cpu=n_cpu, 924 tmp_dir=tmp_path, 925 random_seed=random_state, 926 reuse_corpus=reuse_corpus, 927 mallet_path=mallet_path, 928 ) 929 end_time = time.time() - start 931 # Get distributions
File ~/anaconda3/Git/pycisTopic/src/pycisTopic/lda_models.py:467, in LDAMallet.__init__(self, num_topics, corpus, alpha, eta, id2word, n_cpu, tmp_dir, optimize_interval, iterations, topic_threshold, random_seed, reuse_corpus, mallet_path) 465 self.mallet_path = mallet_path 466 if corpus is not None: --> 467 self.train(corpus, reuse_corpus)
File ~/anaconda3/Git/pycisTopic/src/pycisTopic/lda_models.py:552, in LDAMallet.train(self, corpus, reuse_corpus) 550 logger = logging.getLogger("LDAMalletWrapper") 551 if os.path.isfile(self.fcorpusmallet()) is False or reuse_corpus is False: --> 552 self.convert_input(corpus) 553 else: 554 logger.info("MALLET corpus already exists, training model")
File ~/anaconda3/Git/pycisTopic/src/pycisTopic/lda_models.py:534, in LDAMallet.convert_input(self, corpus) 532 subprocess.check_output(args=cmd, shell=False, stderr=subprocess.STDOUT) 533 except subprocess.CalledProcessError as e: --> 534 raise RuntimeError( 535 f"command '{e.cmd}' return with error (code {e.returncode}): {e.output}" 536 )
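RuntimeError: command '['/home/taxue/mywork/pycisTopic/Mallet-202108/bin/mallet', 'import-file', '--preserve-case', '--keep-sequence', '--token-regex', '\S+', '--input', './scratch/leuven/330/vsc33053/ray_spill/mallet/tutorial/corpus.txt', '--output', './scratch/leuven/330/vsc33053/ray_spill/mallet/tutorial/corpus.mallet']' return with error (code 127): b'/home/taxue/mywork/pycisTopic/Mallet-202108/bin/mallet: \xe8\xa1\x8c 60: java: \xe6\x9c\xaa\xe6\x89\xbe\xe5\x88\xb0\xe5\x91\xbd\xe4\xbb\xa4\n'
(Note: the byte string above is the UTF-8 encoding of "/home/taxue/mywork/pycisTopic/Mallet-202108/bin/mallet: 行 60: java: 未找到命令", i.e. "line 60: java: command not found". Exit status 127 is the shell's "command not found" code: the MALLET launcher script could not find a `java` executable on PATH.)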