Hello, I have an issue: when I try to run `import dadmatools.pipeline.language as language` on my local machine, I get this error:
UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 220: character maps to <undefined>
How can I fix this?
This is the full trace of the error:
---------------------------------------------------------------------------
UnicodeDecodeError Traceback (most recent call last)
Cell In[34], line 1
----> 1 import dadmatools.pipeline.language as language
3 # here lemmatizer and pos tagger will be loaded
4 # as tokenizer is the default tool, it will be loaded as well even without calling
5 pips = 'lem'
File c:\Users\Lenovo\AppData\Local\Programs\Python\Python39\lib\site-packages\dadmatools\pipeline\__init__.py:1
----> 1 from .language import Pipeline
2 from .tpipeline import TPipeline
3 from .language import supported_langs, langwithner, remove_with_path
File c:\Users\Lenovo\AppData\Local\Programs\Python\Python39\lib\site-packages\dadmatools\pipeline\language.py:4
1 from typing import List
3 from .config import config as master_config
----> 4 from .informal2formal.main import Informal2Formal
5 from .models.base_models import Multilingual_Embedding
6 from .models.classifiers import TokenizerClassifier, PosDepClassifier, NERClassifier, SentenceClassifier, \
7 KasrehClassifier
File c:\Users\Lenovo\AppData\Local\Programs\Python\Python39\lib\site-packages\dadmatools\pipeline\informal2formal\main.py:6
4 import yaml
5 from .download_utils import download_dataset
----> 6 import dadmatools.pipeline.informal2formal.utils as utils
7 from .formality_transformer import FormalityTransformer
8 from dadmatools.pipeline.persian_tokenization.tokenizer import SentenceTokenizer
File c:\Users\Lenovo\AppData\Local\Programs\Python\Python39\lib\site-packages\dadmatools\pipeline\informal2formal\utils.py:10
7 from dadmatools.pipeline.persian_tokenization.tokenizer import WordTokenizer
8 from dadmatools.normalizer import Normalizer
---> 10 normalizer = Normalizer()
11 tokenizer = WordTokenizer('cache/dadmatools')
12 # tokenizer = WordTokenizer(separate_emoji=True)
File c:\Users\Lenovo\AppData\Local\Programs\Python\Python39\lib\site-packages\dadmatools\normalizer.py:32, in Normalizer.__init__(self, full_cleaning, unify_chars, refine_punc_spacing, remove_extra_space, remove_puncs, remove_html, remove_stop_word, replace_email_with, replace_number_with, replace_url_with, replace_mobile_number_with, replace_emoji_with, replace_home_number_with)
30 self.remove_puncs = remove_puncs
31 self.remove_stop_word = remove_stop_word
---> 32 self.STOPWORDS = open(prefix+save_dir+'stopwords-fa.py').read().splitlines()
33 self.PUNCS = string.punctuation.replace('<', '').replace('>', '') + '،؟'
34 if full_cleaning:
File c:\Users\Lenovo\AppData\Local\Programs\Python\Python39\lib\encodings\cp1252.py:23, in IncrementalDecoder.decode(self, input, final)
22 def decode(self, input, final=False):
---> 23 return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 220: character maps to <undefined>
Hello, I have an issue: when I try to run `import dadmatools.pipeline.language as language` on my local machine, I get this error: UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 220: character maps to <undefined>. How can I fix this?
This is the full trace of the error: