bitsandbytes-foundation / bitsandbytes

Accessible large language models via k-bit quantization for PyTorch.
https://huggingface.co/docs/bitsandbytes/main/en/index
MIT License

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument mat2 in method wrapper_CUDA_mm) #1237

Open · kevalshah90 opened this issue 2 months ago

kevalshah90 commented 2 months ago

System Info

I am using bitsandbytes quantization to load mistral-7b on an NVIDIA T4 GPU. I loaded the model with the quantized configuration, but I keep getting a runtime error related to device placement. I have ensured that the model and the inputs are on CUDA.
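
A minimal sketch of that device check (using the model and inputs objects defined under Reproduction below):

# Sketch: confirm where the model parameters and the tokenized inputs live.
print({p.device for p in model.parameters()})   # expected: only cuda:0
print({v.device for v in inputs.values()})      # input_ids / attention_mask devices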

Reproduction


import torch
import bitsandbytes
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_storage=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    quantization_config=bnb_config,
)

# LoRA

config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.01,
    target_modules=["q_proj", "k_proj", "v_proj"],
    bias="none",
    task_type="CAUSAL_LM",
)

peft_model = get_peft_model(model, config)

inputs = tokenizer("Do you have time", return_tensors="pt").to("cuda")
print("Inputs:", inputs)

{'input_ids': tensor([[   1, 2378,  368,  506,  727]], device='cuda:0'),
 'attention_mask': tensor([[1, 1, 1, 1, 1]], device='cuda:0')} 

with torch.no_grad():
    outputs = peft_model(**inputs)

print("Outputs:\n", outputs.logits)

Error:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[44], line 2
      1 with torch.no_grad():
----> 2     outputs = peft_model(**inputs)
      4 print("Outputs:\n", outputs.logits)
      5 print("Outputs dimensions:", outputs.logits.shape) # shape: (batch_size, num_tokens, num_classes)

File ~/anaconda3/envs/python3/lib/python3.10/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
   1530     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1531 else:
-> 1532     return self._call_impl(*args, **kwargs)

File ~/anaconda3/envs/python3/lib/python3.10/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
   1536 # If we don't have any hooks, we want to skip the rest of the logic in
   1537 # this function, and just call forward.
   1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1539         or _global_backward_pre_hooks or _global_backward_hooks
   1540         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541     return forward_call(*args, **kwargs)
   1543 try:
   1544     result = None

File ~/anaconda3/envs/python3/lib/python3.10/site-packages/peft/peft_model.py:1430, in PeftModelForCausalLM.forward(self, input_ids, attention_mask, inputs_embeds, labels, output_attentions, output_hidden_states, return_dict, task_ids, **kwargs)
   1428     with self._enable_peft_forward_hooks(**kwargs):
   1429         kwargs = {k: v for k, v in kwargs.items() if k not in self.special_peft_forward_args}
-> 1430         return self.base_model(
   1431             input_ids=input_ids,
   1432             attention_mask=attention_mask,
   1433             inputs_embeds=inputs_embeds,
   1434             labels=labels,
   1435             output_attentions=output_attentions,
   1436             output_hidden_states=output_hidden_states,
   1437             return_dict=return_dict,
   1438             **kwargs,
   1439         )
   1441 batch_size = _get_batch_size(input_ids, inputs_embeds)
   1442 if attention_mask is not None:
   1443     # concat prompt attention mask

File ~/anaconda3/envs/python3/lib/python3.10/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
   1530     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1531 else:
-> 1532     return self._call_impl(*args, **kwargs)

File ~/anaconda3/envs/python3/lib/python3.10/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
   1536 # If we don't have any hooks, we want to skip the rest of the logic in
   1537 # this function, and just call forward.
   1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1539         or _global_backward_pre_hooks or _global_backward_hooks
   1540         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541     return forward_call(*args, **kwargs)
   1543 try:
   1544     result = None

File ~/anaconda3/envs/python3/lib/python3.10/site-packages/peft/tuners/tuners_utils.py:179, in BaseTuner.forward(self, *args, **kwargs)
    178 def forward(self, *args: Any, **kwargs: Any):
--> 179     return self.model.forward(*args, **kwargs)

File ~/anaconda3/envs/python3/lib/python3.10/site-packages/accelerate/hooks.py:166, in add_hook_to_module.<locals>.new_forward(module, *args, **kwargs)
    164         output = module._old_forward(*args, **kwargs)
    165 else:
--> 166     output = module._old_forward(*args, **kwargs)
    167 return module._hf_hook.post_forward(module, output)

File ~/anaconda3/envs/python3/lib/python3.10/site-packages/transformers/models/mistral/modeling_mistral.py:1152, in MistralForCausalLM.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict)
   1139 outputs = self.model(
   1140     input_ids=input_ids,
   1141     attention_mask=attention_mask,
   (...)
   1148     return_dict=return_dict,
   1149 )
   1151 hidden_states = outputs[0]
-> 1152 logits = self.lm_head(hidden_states)
   1153 logits = logits.float()
   1155 loss = None

File ~/anaconda3/envs/python3/lib/python3.10/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
   1530     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1531 else:
-> 1532     return self._call_impl(*args, **kwargs)

File ~/anaconda3/envs/python3/lib/python3.10/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
   1536 # If we don't have any hooks, we want to skip the rest of the logic in
   1537 # this function, and just call forward.
   1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1539         or _global_backward_pre_hooks or _global_backward_hooks
   1540         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541     return forward_call(*args, **kwargs)
   1543 try:
   1544     result = None

File ~/anaconda3/envs/python3/lib/python3.10/site-packages/torch/nn/modules/linear.py:116, in Linear.forward(self, input)
    115 def forward(self, input: Tensor) -> Tensor:
--> 116     return F.linear(input, self.weight, self.bias)

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument mat2 in method wrapper_CUDA_mm)

Expected behavior

Output logits.
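
One workaround worth trying (a sketch, not confirmed as the fix in this thread): pin the whole model to GPU 0 at load time so accelerate does not leave any module, such as lm_head, on CPU.

# Sketch: force every module onto cuda:0 when loading (same bnb_config as above).
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    quantization_config=bnb_config,
    device_map={"": 0},
)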

Titus-von-Koeller commented 2 months ago

Hey @kevalshah90, sorry to hear that you're experiencing issues! Did you install from source (if yes, which commit/branch) or from pip (which version?)?

kevalshah90 commented 2 months ago

@Titus-von-Koeller I am importing BitsAndBytesConfig from Transformers version 4.41.2
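
For completeness, the installed versions the maintainer asked about can be reported with something like:

# Sketch: print the relevant package versions.
import bitsandbytes, peft, transformers, torch
print("bitsandbytes:", bitsandbytes.__version__)
print("transformers:", transformers.__version__)
print("peft:", peft.__version__)
print("torch:", torch.__version__)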