Tasks

- [ ] One of the scripts in the examples/ folder of Accelerate or an officially supported `no_trainer` script in the `examples` folder of the `transformers` repo (such as `run_no_trainer_glue.py`)
- [X] My own task or dataset (give details below)
Reproduction
Run the following code:
```python
import torch
from accelerate import infer_auto_device_map, init_empty_weights, load_checkpoint_and_dispatch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

checkpoint = "tiiuae/falcon-7b-instruct"
weights_location = "/home/ubuntu/llm-ft/falcon/no_lora_test"

config = AutoConfig.from_pretrained(checkpoint)
dtype = torch.float16

# Instantiate the model on the meta device: no memory is allocated for weights yet.
with init_empty_weights():
    model = AutoModelForCausalLM.from_config(config)
model.tie_weights()

# Note: this map is computed but never used; device_map="auto" below overrides it.
device_map = infer_auto_device_map(model)

model = load_checkpoint_and_dispatch(
    model,
    weights_location,
    device_map="auto",
    dtype=dtype,
    no_split_module_classes=["DecoderLayer"],
)

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
inputs = tokenizer("Hello, my name is", return_tensors="pt")
# inputs = inputs.to(0)
inputs = inputs.to("cuda")
output = model.generate(inputs["input_ids"])
res = tokenizer.decode(output[0].tolist())
print(f"{res=}")
```
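A side note on the snippet: the map returned by `infer_auto_device_map` is never used, and it is computed without the `no_split_module_classes` constraint passed to `load_checkpoint_and_dispatch`, so the two calls can disagree about where layers are split. Below is a minimal sketch of computing the map once with the constraint and reusing it; the `FalconDecoderLayer` class name is an assumption taken from the `modeling_falcon.py` implementation that appears in the traceback further down, so verify it against your `transformers` version:

```python
from accelerate import infer_auto_device_map, load_checkpoint_and_dispatch

# Assumption: the in-library Falcon implementation
# (transformers/models/falcon/modeling_falcon.py) names its decoder block
# FalconDecoderLayer; an entry that matches no module is silently ignored.
no_split = ["FalconDecoderLayer"]

# Compute the map with the same no-split constraint used for dispatch,
# so a single decoder block is never sharded across two GPUs.
device_map = infer_auto_device_map(model, no_split_module_classes=no_split, dtype=dtype)

model = load_checkpoint_and_dispatch(
    model,
    weights_location,
    device_map=device_map,  # reuse the computed map instead of "auto"
    dtype=dtype,
    no_split_module_classes=no_split,
)
```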
Here `no_lora_test` is the output directory from saving the state dict:
```
(ft-llm) ubuntu@ip-172-31-89-151:~/llm-ft/falcon$ ll /home/ubuntu/llm-ft/falcon/no_lora_test
total 28192268
drwxrwxr-x  2 ubuntu ubuntu       4096 Oct 19 12:03 ./
drwxr-xr-x 11 ubuntu ubuntu       4096 Oct 19 16:52 ../
-rw-rw-r--  1 ubuntu ubuntu       1368 Oct 19 12:03 config.json
-rw-rw-r--  1 ubuntu ubuntu        118 Oct 19 12:03 generation_config.json
-rw-rw-r--  1 ubuntu ubuntu 9962578563 Oct 19 12:03 pytorch_model-00001-of-00003.bin
-rw-rw-r--  1 ubuntu ubuntu 9939388767 Oct 19 12:03 pytorch_model-00002-of-00003.bin
-rw-rw-r--  1 ubuntu ubuntu 8966859383 Oct 19 12:03 pytorch_model-00003-of-00003.bin
-rw-rw-r--  1 ubuntu ubuntu      16924 Oct 19 12:03 pytorch_model.bin.index.json
```
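For reference, the sharded `pytorch_model-*.bin` files plus `pytorch_model.bin.index.json` are the layout that `save_pretrained` writes. The fine-tuning code is not included in this report, so the following is only a hypothetical sketch of how `no_lora_test` could have been produced:

```python
# Hypothetical reconstruction: save the fine-tuned model as sharded
# PyTorch checkpoints. The default max_shard_size of "10GB" matches
# the ~10 GB shards in the listing above.
model.save_pretrained(
    "/home/ubuntu/llm-ft/falcon/no_lora_test",
    max_shard_size="10GB",
)
```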
```
(ft-llm) ubuntu@ip-172-31-89-151:~/llm-ft/falcon$ python inferrence.py
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
/home/ubuntu/anaconda3/envs/ft-llm/lib/python3.9/site-packages/transformers/generation/utils.py:1273: UserWarning: Using the model-agnostic default `max_length` (=20) to control the generation length. We recommend setting `max_new_tokens` to control the maximum length of the generation.
  warnings.warn(
The current implementation of Falcon calls `torch.scaled_dot_product_attention` directly, this will be deprecated in the future in favor of the `BetterTransformer` API. Please install the latest optimum library with `pip install -U optimum` and call `model.to_bettertransformer()` to benefit from `torch.scaled_dot_product_attention` and future performance optimizations.
Traceback (most recent call last):
  File "/home/ubuntu/llm-ft/falcon/inferrence.py", line 28, in <module>
    output = model.generate(inputs["input_ids"])
  File "/home/ubuntu/anaconda3/envs/ft-llm/lib/python3.9/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/home/ubuntu/anaconda3/envs/ft-llm/lib/python3.9/site-packages/transformers/generation/utils.py", line 1658, in generate
    return self.greedy_search(
  File "/home/ubuntu/anaconda3/envs/ft-llm/lib/python3.9/site-packages/transformers/generation/utils.py", line 2506, in greedy_search
    outputs = self(
  File "/home/ubuntu/anaconda3/envs/ft-llm/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/ubuntu/anaconda3/envs/ft-llm/lib/python3.9/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
  File "/home/ubuntu/anaconda3/envs/ft-llm/lib/python3.9/site-packages/transformers/models/falcon/modeling_falcon.py", line 1287, in forward
    transformer_outputs = self.transformer(
  File "/home/ubuntu/anaconda3/envs/ft-llm/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/ubuntu/anaconda3/envs/ft-llm/lib/python3.9/site-packages/transformers/models/falcon/modeling_falcon.py", line 1162, in forward
    outputs = block(
  File "/home/ubuntu/anaconda3/envs/ft-llm/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/ubuntu/anaconda3/envs/ft-llm/lib/python3.9/site-packages/transformers/models/falcon/modeling_falcon.py", line 823, in forward
    mlp_output += attention_output
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cuda:0!
```
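Separately from the crash, the first two warnings in the log come from calling `generate` with only `input_ids`. A minimal sketch that passes the attention mask, an explicit pad token id, and a generation length (all standard `generate` arguments):

```python
inputs = tokenizer("Hello, my name is", return_tensors="pt").to("cuda")
output = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],  # avoids the attention-mask warning
    pad_token_id=tokenizer.eos_token_id,      # avoids the pad_token_id notice
    max_new_tokens=20,                        # explicit length instead of default max_length
)
```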
Note: this also happens when the script is launched with `accelerate launch inferrence.py`.

Thanks!
System Info

Expected behavior

The code crashes instead of producing a prediction.