Open ShaharGlatman opened 4 years ago
This is a fix for that code:
from gensim.corpora import WikiCorpus from gensim import utils
def create_wiki_corpus(input_path, output_file):
    """Write every document of a Wikipedia dump to a text file, one per line.

    Args:
        input_path: Path of the dump file handed to ``WikiCorpus``.
        output_file: Path of the UTF-8 text file to create or overwrite.

    Progress is reported on stdout every 1000 documents and once at the end.
    """
    i = 0
    print("Starting to create wiki corpus")
    # The context manager closes the output file even if reading the dump
    # raises part-way through; UTF-8 is set explicitly so the result does
    # not depend on the platform's default encoding.
    with open(output_file, 'w', encoding='utf-8') as output:
        # NOTE(review): dictionary={} presumably stops WikiCorpus from
        # scanning the whole dump to build a vocabulary -- confirm against
        # the installed gensim version.
        wiki = WikiCorpus(input_path, dictionary={})
        for text in get_texts(wiki):
            # NOTE(review): echoes every document to stdout; this looks like
            # leftover debug output and is kept only to preserve behavior.
            print(text)
            output.write(" ".join(text) + "\n")
            i += 1
            if i % 1000 == 0:
                print("Saved " + str(i) + " articles")
    print("Finished - Saved " + str(i) + " articles")
def get_texts(wiki):
    """Yield each document from ``wiki.getstream()`` as a list of tokens.

    Every document is converted with ``utils.to_unicode``, lower-cased and
    split on whitespace.
    """
    for raw_doc in wiki.getstream():
        lowered = utils.to_unicode(raw_doc).lower()
        yield lowered.split()
input_file = "hewiki-latest-pages-articles.xml.bz2"
output_file = "wiki.he.text"

# Run the conversion only when this file is executed as a script. On Windows,
# multiprocessing starts worker processes by re-importing the main module;
# without this guard the import re-runs the call below and fails with
# "An attempt has been made to start a new process before the current
# process has finished its bootstrapping phase."
if __name__ == "__main__":
    create_wiki_corpus(input_file, output_file)
Hi, I'm getting this error. What should I do? Thanks.
_fixup_main_from_path(data['init_main_from_path'])
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\multiprocessing\spawn.py", line 277, in _fixup_main_from_path
run_name="__mp_main__")
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\runpy.py", line 263, in run_path
pkg_name=pkg_name, script_name=fname)
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\runpy.py", line 96, in _run_module_code
mod_name, mod_spec, pkg_name, script_name)
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\runpy.py", line 85, in _run_code
exec(code, run_globals)
File "C:\Users\User\PycharmProjects\BINAP\wordembedding-hebrew-master\create_corpus.py", line 12, in <module>
for text in wiki.get_texts():
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\gensim\corpora\wikicorpus.py", line 679, in get_texts
pool = multiprocessing.Pool(self.processes, init_to_ignore_interrupt)
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\multiprocessing\context.py", line 119, in Pool
context=self.get_context())
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\multiprocessing\pool.py", line 175, in __init__
self._repopulate_pool()
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\multiprocessing\pool.py", line 236, in _repopulate_pool
self._wrap_exception)
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\multiprocessing\pool.py", line 255, in _repopulate_pool_static
w.start()
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\multiprocessing\process.py", line 105, in start
self._popen = self._Popen(self)
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\multiprocessing\context.py", line 322, in _Popen
return Popen(process_obj)
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\multiprocessing\popen_spawn_win32.py", line 33, in __init__
prep_data = spawn.get_preparation_data(process_obj._name)
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\multiprocessing\spawn.py", line 143, in get_preparation_data
_check_not_importing_main()
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\multiprocessing\spawn.py", line 136, in _check_not_importing_main
is not going to be frozen to produce an executable.''')
RuntimeError:
An attempt has been made to start a new process before the
current process has finished its bootstrapping phase.