utterworks / fast-bert

Super easy library for BERT based NLP models
Apache License 2.0

BertDataBunch leads to "sep_token not found in the vocabulary" only when using local BERT #247

Closed · benedictau1993 closed this issue 4 years ago

benedictau1993 commented 4 years ago

When using a local tokenizer instead of pulling one from huggingface.com/models, BertDataBunch raises the following error: TypeError: sep_token not found in the vocabulary. When I look at the vocab file, the token [SEP] is there.

This is the command that I ran:

from fast_bert.data_cls import BertDataBunch

databunch = BertDataBunch("./", #DATA_PATH
                          "./", #LABEL_PATH
                          tokenizer='bert-large-uncased/bert_config.json',
                          train_file='train_file.csv',
                          val_file='val_file.csv',
                          label_file='labels.csv',
                          text_col='CLEAN_TEXT',
                          label_col=ICD9_CODE_LIST,
                          batch_size_per_gpu=16,
                          max_seq_length=512,
                          multi_gpu=True,
                          multi_label=True,
                          model_type='bert')

which returns:

Calling BertTokenizerFast.from_pretrained() with the path to a single file or url is deprecated
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-22-9505ad5925a5> in <module>
     13                           multi_gpu=True,
     14                           multi_label=True,
---> 15                           model_type='bert')

/project2/msca/ivy2/software2/install/Anaconda3-2019.10/envs/TF_latest_GPU/lib/python3.7/site-packages/fast_bert/data_cls.py in __init__(self, data_dir, label_dir, tokenizer, train_file, val_file, test_data, label_file, text_col, label_col, batch_size_per_gpu, max_seq_length, multi_gpu, multi_label, backend, model_type, logger, clear_cache, no_cache, custom_sampler)
    388         if isinstance(tokenizer, str):
    389             # instantiate the new tokeniser object using the tokeniser name
--> 390             tokenizer = AutoTokenizer.from_pretrained(tokenizer, use_fast=True)
    391 
    392         self.tokenizer = tokenizer

/project2/msca/ivy2/software2/install/Anaconda3-2019.10/envs/TF_latest_GPU/lib/python3.7/site-packages/transformers/tokenization_auto.py in from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs)
    213             if isinstance(config, config_class):
    214                 if tokenizer_class_fast and use_fast:
--> 215                     return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
    216                 else:
    217                     return tokenizer_class_py.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)

/project2/msca/ivy2/software2/install/Anaconda3-2019.10/envs/TF_latest_GPU/lib/python3.7/site-packages/transformers/tokenization_utils_base.py in from_pretrained(cls, *inputs, **kwargs)
   1138 
   1139         """
-> 1140         return cls._from_pretrained(*inputs, **kwargs)
   1141 
   1142     @classmethod

/project2/msca/ivy2/software2/install/Anaconda3-2019.10/envs/TF_latest_GPU/lib/python3.7/site-packages/transformers/tokenization_utils_base.py in _from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs)
   1285         # Instantiate tokenizer.
   1286         try:
-> 1287             tokenizer = cls(*init_inputs, **init_kwargs)
   1288         except OSError:
   1289             raise OSError(

/project2/msca/ivy2/software2/install/Anaconda3-2019.10/envs/TF_latest_GPU/lib/python3.7/site-packages/transformers/tokenization_bert.py in __init__(self, vocab_file, do_lower_case, unk_token, sep_token, pad_token, cls_token, mask_token, clean_text, tokenize_chinese_chars, strip_accents, wordpieces_prefix, **kwargs)
    621                 strip_accents=strip_accents,
    622                 lowercase=do_lower_case,
--> 623                 wordpieces_prefix=wordpieces_prefix,
    624             ),
    625             unk_token=unk_token,

/project2/msca/ivy2/software2/install/Anaconda3-2019.10/envs/TF_latest_GPU/lib/python3.7/site-packages/tokenizers/implementations/bert_wordpiece.py in __init__(self, vocab_file, unk_token, sep_token, cls_token, pad_token, mask_token, clean_text, handle_chinese_chars, strip_accents, lowercase, wordpieces_prefix)
     55             sep_token_id = tokenizer.token_to_id(str(sep_token))
     56             if sep_token_id is None:
---> 57                 raise TypeError("sep_token not found in the vocabulary")
     58             cls_token_id = tokenizer.token_to_id(str(cls_token))
     59             if cls_token_id is None:

TypeError: sep_token not found in the vocabulary
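
For reference, the traceback shows AutoTokenizer.from_pretrained being handed the path to bert_config.json, which the fast BERT tokenizer then tries to load as its vocabulary, so [SEP] cannot be resolved to a token id. A minimal sketch of loading the same tokenizer from a local directory that contains vocab.txt (the directory name below is an assumption, not the reporter's actual layout):

from transformers import AutoTokenizer

# Assumed local directory holding vocab.txt, config.json, etc.
local_dir = "./bert-large-uncased"

# Passing the directory (rather than a single .json file) lets the
# tokenizer locate vocab.txt, so special tokens like [SEP] resolve.
tokenizer = AutoTokenizer.from_pretrained(local_dir, use_fast=True)
print(tokenizer.sep_token, tokenizer.sep_token_id)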