makcedward / nlpaug

Data augmentation for NLP
https://makcedward.github.io/
MIT License
4.41k stars 460 forks source link

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte #308

Open NitishOritro opened 2 years ago

NitishOritro commented 2 years ago

1 aug = naw.WordEmbsAug(model_type='word2vec', model_path=model_path, action="insert")
2 augmented_text = aug.augment(text)
3 print("Original:")
4 print(text)
5 print("Augmented Text:")

/usr/local/lib/python3.7/dist-packages/nlpaug/augmenter/word/word_embs.py in __init__(self, model_type, model_path, model, action, name, aug_min, aug_max, aug_p, top_k, n_gram_separator, stopwords, tokenizer, reverse_tokenizer, force_reload, stopwords_regex, verbose, skip_check)
     87 if model is None:
     88 self.model = self.get_model(model_path=model_path, model_type=model_type, force_reload=force_reload,
---> 89 top_k=self.top_k, skip_check=skip_check)
     90 else:
     91 self.model = model

/usr/local/lib/python3.7/dist-packages/nlpaug/augmenter/word/word_embs.py in get_model(cls, model_path, model_type, force_reload, top_k, skip_check)
     97 @classmethod
     98 def get_model(cls, model_path, model_type, force_reload=False, top_k=100, skip_check=False):
---> 99 return init_word_embs_model(model_path, model_type, force_reload, top_k=top_k, skip_check=skip_check)
    100
    101 def skip_aug(self, token_idxes, tokens):

/usr/local/lib/python3.7/dist-packages/nlpaug/augmenter/word/word_embs.py in init_word_embs_model(model_path, model_type, force_reload, top_k, skip_check)
     21 if model_type == 'word2vec':
     22 model = nmw.Word2vec(top_k=top_k, skip_check=skip_check)
---> 23 model.read(model_path)
     24 elif model_type == 'glove':
     25 model = nmw.GloVe(top_k=top_k, skip_check=skip_check)

/usr/local/lib/python3.7/dist-packages/nlpaug/model/word_embs/word2vec.py in read(self, file_path, max_num_vector)
     22
     23 def read(self, file_path, max_num_vector=None):
---> 24 self.model = KeyedVectors.load_word2vec_format(file_path, binary=True, limit=max_num_vector)
     25 super()._read()

/usr/local/lib/python3.7/dist-packages/gensim/models/keyedvectors.py in load_word2vec_format(cls, fname, fvocab, binary, encoding, unicode_errors, limit, datatype, no_header)
   1630 return _load_word2vec_format(
   1631 cls, fname, fvocab=fvocab, binary=binary, encoding=encoding, unicode_errors=unicode_errors,
-> 1632 limit=limit, datatype=datatype, no_header=no_header,
   1633 )
   1634

/usr/local/lib/python3.7/dist-packages/gensim/models/keyedvectors.py in _load_word2vec_format(cls, fname, fvocab, binary, encoding, unicode_errors, limit, datatype, no_header, binary_chunk_size)
   1900 fin = utils.open(fname, 'rb')
   1901 else:
-> 1902 header = utils.to_unicode(fin.readline(), encoding=encoding)
   1903 vocab_size, vector_size = [int(x) for x in header.split()] # throws for invalid file format
   1904 if limit:

/usr/local/lib/python3.7/dist-packages/gensim/utils.py in any2unicode(text, encoding, errors)
    363 if isinstance(text, str):
    364 return text
--> 365 return str(text, encoding, errors=errors)
    366
    367

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte