unslothai / unsloth

Finetune Llama 3.1, Mistral, Phi & Gemma LLMs 2-5x faster with 80% less memory
https://unsloth.ai
Apache License 2.0

Fail to load a tokenizer (CroissantLLM) #330

Open ccdv-ai opened 3 months ago

ccdv-ai commented 3 months ago

Trying to run the Colab notebook with a small model:

from unsloth import FastLanguageModel
import torch

max_seq_length = 2048 # Gemma sadly only supports max 8192 for now
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "croissantllm/CroissantLLMBase", # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

The model is loaded but it fails to load the tokenizer:

---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

<ipython-input-6-ce8dc43987f2> in <cell line: 21>()
     19 ] # More models at https://huggingface.co/unsloth
     20 
---> 21 model, tokenizer = FastLanguageModel.from_pretrained(
     22     model_name = "croissantllm/CroissantLLMBase", # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
     23     max_seq_length = max_seq_length,


/usr/local/lib/python3.10/dist-packages/unsloth/models/loader.py in from_pretrained(model_name, max_seq_length, dtype, load_in_4bit, token, device_map, rope_scaling, fix_tokenizer, trust_remote_code, use_gradient_checkpointing, *args, **kwargs)
    130         pass
    131 
--> 132         model, tokenizer = dispatch_model.from_pretrained(
    133             model_name     = model_name,
    134             max_seq_length = max_seq_length,

/usr/local/lib/python3.10/dist-packages/unsloth/models/llama.py in from_pretrained(model_name, max_seq_length, dtype, load_in_4bit, token, device_map, rope_scaling, fix_tokenizer, model_patcher, tokenizer_name, trust_remote_code, **kwargs)
   1105         # Counteract saved tokenizers
   1106         tokenizer_name = model_name if tokenizer_name is None else tokenizer_name
-> 1107         tokenizer = load_correct_tokenizer(
   1108             tokenizer_name    = tokenizer_name,
   1109             model_max_length  = max_position_embeddings,

/usr/local/lib/python3.10/dist-packages/unsloth/tokenizer_utils.py in load_correct_tokenizer(tokenizer_name, model_max_length, padding_side, token, trust_remote_code, cache_dir)
    270     pass
    271 
--> 272     slow_tokenizer = AutoTokenizer.from_pretrained(
    273         tokenizer_name,
    274         model_max_length  = model_max_length,

/usr/local/lib/python3.10/dist-packages/transformers/models/auto/tokenization_auto.py in from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs)
    823                     f"Tokenizer class {tokenizer_class_candidate} does not exist or is not currently imported."
    824                 )
--> 825             return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
    826 
    827         # Otherwise we have to be creative.

/usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py in from_pretrained(cls, pretrained_model_name_or_path, cache_dir, force_download, local_files_only, token, revision, trust_remote_code, *init_inputs, **kwargs)
   2046                 logger.info(f"loading file {file_path} from cache at {resolved_vocab_files[file_id]}")
   2047 
-> 2048         return cls._from_pretrained(
   2049             resolved_vocab_files,
   2050             pretrained_model_name_or_path,

/usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py in _from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, token, cache_dir, local_files_only, _commit_hash, _is_local, trust_remote_code, *init_inputs, **kwargs)
   2285         # Instantiate the tokenizer.
   2286         try:
-> 2287             tokenizer = cls(*init_inputs, **init_kwargs)
   2288         except OSError:
   2289             raise OSError(

/usr/local/lib/python3.10/dist-packages/transformers/models/llama/tokenization_llama.py in __init__(self, vocab_file, unk_token, bos_token, eos_token, pad_token, sp_model_kwargs, add_bos_token, add_eos_token, clean_up_tokenization_spaces, use_default_system_prompt, spaces_between_special_tokens, legacy, add_prefix_space, **kwargs)
    180         self.add_eos_token = add_eos_token
    181         self.use_default_system_prompt = use_default_system_prompt
--> 182         self.sp_model = self.get_spm_processor(kwargs.pop("from_slow", False))
    183         self.add_prefix_space = add_prefix_space
    184 

/usr/local/lib/python3.10/dist-packages/transformers/models/llama/tokenization_llama.py in get_spm_processor(self, from_slow)
    207         tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs)
    208         if self.legacy or from_slow:  # no dependency on protobuf
--> 209             tokenizer.Load(self.vocab_file)
    210             return tokenizer
    211 

/usr/local/lib/python3.10/dist-packages/sentencepiece/__init__.py in Load(self, model_file, model_proto)
    903       if model_proto:
    904         return self.LoadFromSerializedProto(model_proto)
--> 905       return self.LoadFromFile(model_file)
    906 
    907 

/usr/local/lib/python3.10/dist-packages/sentencepiece/__init__.py in LoadFromFile(self, arg)
    308 
    309     def LoadFromFile(self, arg):
--> 310         return _sentencepiece.SentencePieceProcessor_LoadFromFile(self, arg)
    311 
    312     def _EncodeAsIds(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece):

TypeError: not a string
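For reference, the failure is specific to the slow (sentencepiece-based) tokenizer path; the fast tokenizer loads fine on its own. A minimal check, assuming only transformers is installed:

from transformers import AutoTokenizer

# The fast tokenizer only needs tokenizer.json, which the repo ships:
fast_tokenizer = AutoTokenizer.from_pretrained(
    "croissantllm/CroissantLLMBase", use_fast = True,
)

# The slow tokenizer needs the sentencepiece tokenizer.model file, and
# raises the "TypeError: not a string" above when that file is missing:
slow_tokenizer = AutoTokenizer.from_pretrained(
    "croissantllm/CroissantLLMBase", use_fast = False,
)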
ccdv-ai commented 3 months ago

@danielhanchen ok, this is a sentencepiece issue: some models are missing the tokenizer ".model" file, so the fast tokenizer can be loaded but not the slow one, and there is no easy way to recover the file.

Because of that, the load_correct_tokenizer function fails.
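
One quick way to confirm which tokenizer files a repo actually ships, using huggingface_hub (a side check, not part of unsloth):

from huggingface_hub import list_repo_files

files = list_repo_files("croissantllm/CroissantLLMBase")
# Expect tokenizer.json (fast tokenizer) to be present, but the
# sentencepiece tokenizer.model (slow tokenizer) to be absent:
print([f for f in files if "tokenizer" in f])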

Possible fix:

  1. load the fast tokenizer
  2. try to load the slow tokenizer
  3. if step 2 fails, return the fast tokenizer; otherwise continue (sketched below)
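
A sketch of that fallback, assuming transformers is installed (this illustrates the proposed behaviour, not unsloth's actual implementation; the helper name is hypothetical):

from transformers import AutoTokenizer

def load_tokenizer_with_fallback(name, **kwargs):
    # 1. The fast tokenizer only needs tokenizer.json, so load it first.
    fast_tokenizer = AutoTokenizer.from_pretrained(name, use_fast = True, **kwargs)
    try:
        # 2. The slow tokenizer needs the sentencepiece tokenizer.model file.
        slow_tokenizer = AutoTokenizer.from_pretrained(name, use_fast = False, **kwargs)
    except Exception:
        # 3. tokenizer.model is missing - fall back to the fast tokenizer.
        return fast_tokenizer
    # Otherwise continue with the slow tokenizer as before.
    return slow_tokenizer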
danielhanchen commented 3 months ago

@ccdv-ai Yes! Was working on a fix, but sadly startup life is all-consuming :( Will get back to this! :)

danielhanchen commented 3 months ago

@ccdv-ai Fixed! Local machines will need updating via:

pip install --upgrade --force-reinstall --no-cache-dir git+https://github.com/unslothai/unsloth.git

Colab / Kaggle should be fine.

ccdv-ai commented 3 months ago

@danielhanchen Almost!

from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "chatml", # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
    mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
    map_eos_token = True, # Maps <|im_end|> to </s> instead
)
---------------------------------------------------------------------------

FileNotFoundError                         Traceback (most recent call last)

<ipython-input-5-2a9309eef342> in <cell line: 3>()
      1 from unsloth.chat_templates import get_chat_template
      2 
----> 3 tokenizer = get_chat_template(
      4     tokenizer,
      5     chat_template = "chatml", # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth


/usr/local/lib/python3.10/dist-packages/unsloth/chat_templates.py in get_chat_template(tokenizer, chat_template, mapping, map_eos_token)
    377             # Must fix the sentence piece tokenizer since there's no tokenizer.model file!
    378             token_mapping = { old_eos_token : stop_word, }
--> 379             tokenizer = fix_sentencepiece_tokenizer(tokenizer, new_tokenizer, token_mapping,)
    380         pass
    381 

/usr/local/lib/python3.10/dist-packages/unsloth/tokenizer_utils.py in fix_sentencepiece_tokenizer(old_tokenizer, new_tokenizer, token_mapping, temporary_location)
    220 
    221     tokenizer_file = sentencepiece_model_pb2.ModelProto()
--> 222     tokenizer_file.ParseFromString(open(f"{temporary_location}/tokenizer.model", "rb").read())
    223 
    224     # Now save the new tokenizer

FileNotFoundError: [Errno 2] No such file or directory: '_unsloth_sentencepiece_temp/tokenizer.model'
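
A defensive check along these lines in fix_sentencepiece_tokenizer would avoid the crash when no tokenizer.model was ever written to the temporary directory (a sketch of one possible guard, not the actual fix that was shipped):

import os

def fix_sentencepiece_tokenizer(old_tokenizer, new_tokenizer, token_mapping,
                                temporary_location = "_unsloth_sentencepiece_temp"):
    # If the repo never shipped a sentencepiece tokenizer.model, nothing was
    # saved to the temp folder, so skip the protobuf surgery entirely.
    if not os.path.exists(f"{temporary_location}/tokenizer.model"):
        return new_tokenizer
    ...  # existing protobuf-based token remapping continues here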
danielhanchen commented 3 months ago

@ccdv-ai Oh no, ok ok I'll check again! Sorry!

ccdv-ai commented 3 months ago

@danielhanchen Looks like this issue also happens with llama-3. The conversational notebooks currently cannot be run if the tokenizer is BPE/sentencepiece.

--> 222     tokenizer_file.ParseFromString(open(f"{temporary_location}/tokenizer.model", "rb").read())
danielhanchen commented 3 months ago

@ccdv-ai Yes sadly - been stuck on llama-3, so yes, it is an issue :(( Extreme apologies on the horrible delay

danielhanchen commented 3 months ago

@ccdv-ai Maybe I fixed this in the latest bug fix?

ccdv-ai commented 3 months ago

@danielhanchen Yes, looks like it's fixed! Thank you