guidance-ai / guidance

A guidance language for controlling large language models.
MIT License

TypeError: not a string when loading argilla/notus-7b-v1 #490

Closed · hollstein closed this issue 11 months ago

hollstein commented 11 months ago

The bug

When loading the model argilla/notus-7b-v1 from the Hugging Face Hub, I get the following error: TypeError: not a string

To Reproduce

from guidance import __version__
from guidance import models
print(__version__)
lm = models.Transformers(
    **{
        "model":"argilla/notus-7b-v1","device_map":"auto"
    }
)
print(type(lm))
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[1], line 4
      2 from guidance import models
      3 print(__version__)
----> 4 lm = models.Transformers(
      5     **{
      6         "model":"argilla/notus-7b-v1","device_map":"auto"
      7     }
      8 )
      9 print(type(lm))

File ~/.conda/envs/python=3.11/lib/python3.11/site-packages/guidance/models/transformers/_transformers.py:28, in Transformers.__init__(self, model, tokenizer, echo, caching, temperature, device, **kwargs)
     25     except:
     26         pass
---> 28 self.model_obj, self._orig_tokenizer = self._model_and_tokenizer(model, tokenizer, **kwargs)
     30 if not isinstance(model, str):
     31     self.model = model.__class__.__name__

File ~/.conda/envs/python=3.11/lib/python3.11/site-packages/guidance/models/transformers/_transformers.py:74, in Transformers._model_and_tokenizer(self, model, tokenizer, **kwargs)
     71         raise Exception("Please install transformers with `pip install transformers` in order to use guidance.models.Transformers!")
     73     if tokenizer is None:
---> 74         tokenizer = transformers.AutoTokenizer.from_pretrained(model, use_fast=False, **kwargs)
     75     model = transformers.AutoModelForCausalLM.from_pretrained(model, **kwargs)
     77 assert tokenizer is not None, "You must give a tokenizer object when you provide a model object (as opposed to just a model name)!"

File ~/.conda/envs/python=3.11/lib/python3.11/site-packages/transformers/models/auto/tokenization_auto.py:768, in AutoTokenizer.from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs)
    764     if tokenizer_class is None:
    765         raise ValueError(
    766             f"Tokenizer class {tokenizer_class_candidate} does not exist or is not currently imported."
    767         )
--> 768     return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
    770 # Otherwise we have to be creative.
    771 # if model is an encoder decoder, the encoder tokenizer class is used by default
    772 if isinstance(config, EncoderDecoderConfig):

File ~/.conda/envs/python=3.11/lib/python3.11/site-packages/transformers/tokenization_utils_base.py:2024, in PreTrainedTokenizerBase.from_pretrained(cls, pretrained_model_name_or_path, cache_dir, force_download, local_files_only, token, revision, *init_inputs, **kwargs)
   2021     else:
   2022         logger.info(f"loading file {file_path} from cache at {resolved_vocab_files[file_id]}")
-> 2024 return cls._from_pretrained(
   2025     resolved_vocab_files,
   2026     pretrained_model_name_or_path,
   2027     init_configuration,
   2028     *init_inputs,
   2029     token=token,
   2030     cache_dir=cache_dir,
   2031     local_files_only=local_files_only,
   2032     _commit_hash=commit_hash,
   2033     _is_local=is_local,
   2034     **kwargs,
   2035 )

File ~/.conda/envs/python=3.11/lib/python3.11/site-packages/transformers/tokenization_utils_base.py:2256, in PreTrainedTokenizerBase._from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, token, cache_dir, local_files_only, _commit_hash, _is_local, *init_inputs, **kwargs)
   2254 # Instantiate the tokenizer.
   2255 try:
-> 2256     tokenizer = cls(*init_inputs, **init_kwargs)
   2257 except OSError:
   2258     raise OSError(
   2259         "Unable to load vocabulary from file. "
   2260         "Please check that the provided vocabulary is accessible and not corrupted."
   2261     )

File ~/.conda/envs/python=3.11/lib/python3.11/site-packages/transformers/models/llama/tokenization_llama.py:178, in LlamaTokenizer.__init__(self, vocab_file, unk_token, bos_token, eos_token, pad_token, sp_model_kwargs, add_bos_token, add_eos_token, clean_up_tokenization_spaces, use_default_system_prompt, spaces_between_special_tokens, legacy, **kwargs)
    176 self.add_eos_token = add_eos_token
    177 self.use_default_system_prompt = use_default_system_prompt
--> 178 self.sp_model = self.get_spm_processor(kwargs.pop("from_slow", False))
    180 super().__init__(
    181     bos_token=bos_token,
    182     eos_token=eos_token,
   (...)
    192     **kwargs,
    193 )

File ~/.conda/envs/python=3.11/lib/python3.11/site-packages/transformers/models/llama/tokenization_llama.py:203, in LlamaTokenizer.get_spm_processor(self, from_slow)
    201 tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs)
    202 if self.legacy or from_slow:  # no dependency on protobuf
--> 203     tokenizer.Load(self.vocab_file)
    204     return tokenizer
    206 with open(self.vocab_file, "rb") as f:

File ~/.conda/envs/python=3.11/lib/python3.11/site-packages/sentencepiece/__init__.py:905, in SentencePieceProcessor.Load(self, model_file, model_proto)
    903 if model_proto:
    904   return self.LoadFromSerializedProto(model_proto)
--> 905 return self.LoadFromFile(model_file)

File ~/.conda/envs/python=3.11/lib/python3.11/site-packages/sentencepiece/__init__.py:310, in SentencePieceProcessor.LoadFromFile(self, arg)
    309 def LoadFromFile(self, arg):
--> 310     return _sentencepiece.SentencePieceProcessor_LoadFromFile(self, arg)

TypeError: not a string
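
From the traceback, the slow LlamaTokenizer ends up calling SentencePieceProcessor.LoadFromFile(self.vocab_file), and the "not a string" TypeError suggests vocab_file is None, most likely because the repository provides only a fast tokenizer (tokenizer.json) and no SentencePiece tokenizer.model file. Assuming that is the cause, one user-side workaround is to load the fast tokenizer explicitly and pass it in, since Transformers.__init__ accepts a tokenizer argument (a sketch, untested against this exact version):

import transformers
from guidance import models

# Load the fast tokenizer explicitly; the slow (SentencePiece-based) one
# fails for this repository.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    "argilla/notus-7b-v1", use_fast=True
)

# Passing a tokenizer object skips guidance's internal
# AutoTokenizer.from_pretrained(..., use_fast=False) call that raises here.
lm = models.Transformers(
    "argilla/notus-7b-v1", tokenizer=tokenizer, device_map="auto"
)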


slundberg commented 11 months ago

Thanks @hollstein! It looks like this is a failure of the transformers tokenizer, specifically the slow (non-fast) version. If we remove use_fast=False, the model loads. So I added a workaround that falls back to the fast tokenizer when needed.
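
For reference, a minimal sketch of what such a fallback could look like (not the actual code merged into guidance, just an illustration of the pattern):

import transformers

def load_tokenizer(model_name, **kwargs):
    # Prefer the slow tokenizer, which guidance requests by default via
    # use_fast=False, but fall back to the fast one when loading fails
    # (e.g. when no SentencePiece vocab file is available and the slow
    # LlamaTokenizer raises "TypeError: not a string").
    try:
        return transformers.AutoTokenizer.from_pretrained(
            model_name, use_fast=False, **kwargs
        )
    except Exception:
        return transformers.AutoTokenizer.from_pretrained(
            model_name, use_fast=True, **kwargs
        )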

hollstein commented 11 months ago

Brilliant @slundberg! Thanks for fixing this so fast.