I have gotten W2V with the package to work with the Gutenberg corpus, but I am having trouble getting other corpora to work. I have downloaded the corpora as per your instructions.
With the Wikipedia corpus, the call `w2v = Word2Vec()` simply keeps loading and never finishes.
With the LCC corpus, it is not able to find the file `word2vec.pkl.gz`. It seems to be related to this issue: https://github.com/fnielsen/dasem/issues/6. In my `dasem_data/lcc` folder I only have a `dan-dk_web_2014_10K.tar.gz` file, so I am wondering if it is simply a naming issue?
Here is my full error:
/home/bertil/anaconda3/envs/word2vec_tool/lib/python3.7/site-packages/smart_open/smart_open_lib.py:398: UserWarning: This function is deprecated, use smart_open.open instead. See the migration notes for details: https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst#migrating-to-the-new-open-function
'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
Traceback (most recent call last):
File "/home/bertil/anaconda3/envs/word2vec_tool/lib/python3.7/site-packages/dasem/models.py", line 421, in __init__
self.load()
File "/home/bertil/anaconda3/envs/word2vec_tool/lib/python3.7/site-packages/dasem/models.py", line 460, in load
self.model = gensim.models.Word2Vec.load(full_filename)
File "/home/bertil/anaconda3/envs/word2vec_tool/lib/python3.7/site-packages/gensim/models/word2vec.py", line 1330, in load
model = super(Word2Vec, cls).load(*args, **kwargs)
File "/home/bertil/anaconda3/envs/word2vec_tool/lib/python3.7/site-packages/gensim/models/base_any2vec.py", line 1244, in load
model = super(BaseWordEmbeddingsModel, cls).load(*args, **kwargs)
File "/home/bertil/anaconda3/envs/word2vec_tool/lib/python3.7/site-packages/gensim/models/base_any2vec.py", line 603, in load
return super(BaseAny2VecModel, cls).load(fname_or_handle, **kwargs)
File "/home/bertil/anaconda3/envs/word2vec_tool/lib/python3.7/site-packages/gensim/utils.py", line 426, in load
obj = unpickle(fname)
File "/home/bertil/anaconda3/envs/word2vec_tool/lib/python3.7/site-packages/gensim/utils.py", line 1381, in unpickle
with smart_open(fname, 'rb') as f:
File "/home/bertil/anaconda3/envs/word2vec_tool/lib/python3.7/site-packages/smart_open/smart_open_lib.py", line 453, in smart_open
return open(uri, mode, ignore_ext=ignore_extension, transport_params=transport_params, **scrubbed_kwargs)
File "/home/bertil/anaconda3/envs/word2vec_tool/lib/python3.7/site-packages/smart_open/smart_open_lib.py", line 348, in open
binary, filename = _open_binary_stream(uri, binary_mode, transport_params)
File "/home/bertil/anaconda3/envs/word2vec_tool/lib/python3.7/site-packages/smart_open/smart_open_lib.py", line 544, in _open_binary_stream
fobj = io.open(parsed_uri.uri_path, mode)
FileNotFoundError: [Errno 2] No such file or directory: '/home/bertil/dasem_data/lcc/word2vec.pkl.gz'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/bertil/anaconda3/envs/word2vec_tool/lib/python3.7/tarfile.py", line 1645, in gzopen
t = cls.taropen(name, mode, fileobj, **kwargs)
File "/home/bertil/anaconda3/envs/word2vec_tool/lib/python3.7/tarfile.py", line 1621, in taropen
return cls(name, mode, fileobj, **kwargs)
File "/home/bertil/anaconda3/envs/word2vec_tool/lib/python3.7/tarfile.py", line 1484, in __init__
self.firstmember = self.next()
File "/home/bertil/anaconda3/envs/word2vec_tool/lib/python3.7/tarfile.py", line 2289, in next
tarinfo = self.tarinfo.fromtarfile(self)
File "/home/bertil/anaconda3/envs/word2vec_tool/lib/python3.7/tarfile.py", line 1094, in fromtarfile
buf = tarfile.fileobj.read(BLOCKSIZE)
File "/home/bertil/anaconda3/envs/word2vec_tool/lib/python3.7/gzip.py", line 276, in read
return self._buffer.read(size)
File "/home/bertil/anaconda3/envs/word2vec_tool/lib/python3.7/_compression.py", line 68, in readinto
data = self.read(len(byte_view))
File "/home/bertil/anaconda3/envs/word2vec_tool/lib/python3.7/gzip.py", line 463, in read
if not self._read_gzip_header():
File "/home/bertil/anaconda3/envs/word2vec_tool/lib/python3.7/gzip.py", line 411, in _read_gzip_header
raise OSError('Not a gzipped file (%r)' % magic)
OSError: Not a gzipped file (b'<!')
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "test.py", line 3, in <module>
Word2Vec()
File "/home/bertil/anaconda3/envs/word2vec_tool/lib/python3.7/site-packages/dasem/models.py", line 425, in __init__
self.train()
File "/home/bertil/anaconda3/envs/word2vec_tool/lib/python3.7/site-packages/dasem/models.py", line 527, in train
workers=workers)
File "/home/bertil/anaconda3/envs/word2vec_tool/lib/python3.7/site-packages/gensim/models/word2vec.py", line 783, in __init__
fast_version=FAST_VERSION)
File "/home/bertil/anaconda3/envs/word2vec_tool/lib/python3.7/site-packages/gensim/models/base_any2vec.py", line 759, in __init__
self.build_vocab(sentences=sentences, corpus_file=corpus_file, trim_rule=trim_rule)
File "/home/bertil/anaconda3/envs/word2vec_tool/lib/python3.7/site-packages/gensim/models/base_any2vec.py", line 936, in build_vocab
sentences=sentences, corpus_file=corpus_file, progress_per=progress_per, trim_rule=trim_rule)
File "/home/bertil/anaconda3/envs/word2vec_tool/lib/python3.7/site-packages/gensim/models/word2vec.py", line 1591, in scan_vocab
total_words, corpus_count = self._scan_vocab(sentences, progress_per, trim_rule)
File "/home/bertil/anaconda3/envs/word2vec_tool/lib/python3.7/site-packages/gensim/models/word2vec.py", line 1560, in _scan_vocab
for sentence_no, sentence in enumerate(sentences):
File "/home/bertil/anaconda3/envs/word2vec_tool/lib/python3.7/site-packages/dasem/lcc.py", line 255, in iter_sentence_words
for word_list in lcc_file.iter_sentence_words():
File "/home/bertil/anaconda3/envs/word2vec_tool/lib/python3.7/site-packages/dasem/lcc.py", line 141, in iter_sentence_words
for n, sentence in enumerate(self.iter_sentences()):
File "/home/bertil/anaconda3/envs/word2vec_tool/lib/python3.7/site-packages/dasem/lcc.py", line 112, in iter_sentences
with tarfile.open(self.filename, "r:gz") as tar:
File "/home/bertil/anaconda3/envs/word2vec_tool/lib/python3.7/tarfile.py", line 1591, in open
return func(name, filemode, fileobj, **kwargs)
File "/home/bertil/anaconda3/envs/word2vec_tool/lib/python3.7/tarfile.py", line 1649, in gzopen
raise ReadError("not a gzip file")
tarfile.ReadError: not a gzip file
I have gotten W2V with the package to work with the Gutenberg corpus, but I am having trouble getting other corpora to work. I have downloaded the corpora as per your instructions.
The Wikipedia corpus simply keeps loading when I do `w2v = Word2Vec()`.
The LCC corpus is not able to find the file `word2vec.pkl.gz`. It seems to be related to this issue: https://github.com/fnielsen/dasem/issues/6. In my `dasem_data/lcc` folder I only have a `dan-dk_web_2014_10K.tar.gz` file, so I am wondering if it is simply a naming issue?
Here is my full error: