Running this code on a Mac M3 Max with 128 GB of RAM:

from transformers import AutoModel, AutoTokenizer

MAX_LENGTH = 128

# transformers models have no `.tokenizer` attribute; load the tokenizer separately.
# (Note: generation really needs AutoModelForCausalLM; AutoModel loads the base
# model without an LM head.)
tokenizer = AutoTokenizer.from_pretrained("unsloth/Meta-Llama-3.1-405B-Instruct-bnb-4bit")
model = AutoModel.from_pretrained("unsloth/Meta-Llama-3.1-405B-Instruct-bnb-4bit")

input_text = ["What is the capital of China? Introduce the history of this city."]
input_tokens = tokenizer(input_text, return_tensors="pt", return_attention_mask=False,
                         truncation=True, padding=False, max_length=MAX_LENGTH)

# `.auto()` is not a tensor method; move the input ids to the model's device instead.
generation_output = model.generate(input_tokens["input_ids"].to(model.device),
                                   max_new_tokens=10, use_cache=True,
                                   return_dict_in_generate=True)
output = tokenizer.decode(generation_output.sequences[0])
print(output)
The error is:
Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
Traceback (most recent call last):
  File "/Users/taozhiyu/Downloads/airllm.py", line 4, in <module>
    model = AutoModel.from_pretrained("unsloth/Meta-Llama-3.1-405B-Instruct-bnb-4bit")
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/transformers/models/auto/auto_factory.py", line 564, in from_pretrained
    return model_class.from_pretrained(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/transformers/modeling_utils.py", line 3354, in from_pretrained
    hf_quantizer.validate_environment(
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/transformers/quantizers/quantizer_bnb_4bit.py", line 62, in validate_environment
    raise RuntimeError("No GPU found. A GPU is needed for quantization.")
RuntimeError: No GPU found. A GPU is needed for quantization.
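As far as I can tell, the RuntimeError comes from transformers' bitsandbytes 4-bit quantizer: the validate_environment step shown in the traceback (quantizer_bnb_4bit.py, line 62) boils down to a CUDA availability check, and Apple Silicon has no CUDA device. A minimal sketch of the situation, assuming a recent PyTorch build with MPS support:

import torch

# The bnb 4-bit quantizer requires a CUDA GPU; on an M3 Max there is none,
# and bitsandbytes does not use Apple's MPS backend.
print(torch.cuda.is_available())          # False on Apple Silicon
print(torch.backends.mps.is_available())  # True, but not used by bitsandbytes

So from_pretrained fails on this machine regardless of how much unified memory is available.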