When I load the Qwen2-VL 7B model using the example script from the model card, I run into the error below.
The error does not happen yet at git commit 21fac7abba2a37fae86106f87fcf9974fd1e3830. The problem is that this commit does not contain the Llama 3.2 update, and I would like to be able to run both models.
RuntimeError Traceback (most recent call last)
Cell In[2], line 64
60 inputs = inputs.to(model.device)
61 # inputs = inputs.to("cuda")
62
63 # Inference: Generation of the output
---> 64 generated_ids = model.generate(**inputs, max_new_tokens=128)
65 generated_ids_trimmed = [
66 out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
67 ]
68 output_text = processor.batch_decode(
69 generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
70 )
File ~/miniconda3/envs/mlenv_tp/lib/python3.12/site-packages/torch/utils/_contextlib.py:116, in context_decorator.<locals>.decorate_context(*args, **kwargs)
113 @functools.wraps(func)
114 def decorate_context(*args, **kwargs):
115 with ctx_factory():
--> 116 return func(*args, **kwargs)
File ~/miniconda3/envs/mlenv_tp/lib/python3.12/site-packages/transformers/generation/utils.py:2048, in GenerationMixin.generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, negative_prompt_ids, negative_prompt_attention_mask, **kwargs)
2040 input_ids, model_kwargs = self._expand_inputs_for_generation(
2041 input_ids=input_ids,
2042 expand_size=generation_config.num_return_sequences,
2043 is_encoder_decoder=self.config.is_encoder_decoder,
2044 **model_kwargs,
2045 )
2047 # 12. run sample (it degenerates to greedy search when `generation_config.do_sample=False`)
-> 2048 result = self._sample(
2049 input_ids,
2050 logits_processor=prepared_logits_processor,
2051 stopping_criteria=prepared_stopping_criteria,
2052 generation_config=generation_config,
2053 synced_gpus=synced_gpus,
2054 streamer=streamer,
2055 **model_kwargs,
2056 )
2058 elif generation_mode in (GenerationMode.BEAM_SAMPLE, GenerationMode.BEAM_SEARCH):
2059 # 11. prepare beam search scorer
2060 beam_scorer = BeamSearchScorer(
2061 batch_size=batch_size,
2062 num_beams=generation_config.num_beams,
(...)
2067 max_length=generation_config.max_length,
2068 )
File ~/miniconda3/envs/mlenv_tp/lib/python3.12/site-packages/transformers/generation/utils.py:3008, in GenerationMixin._sample(self, input_ids, logits_processor, stopping_criteria, generation_config, synced_gpus, streamer, **model_kwargs)
3005 model_inputs.update({"output_hidden_states": output_hidden_states} if output_hidden_states else {})
3007 # forward pass to get next token
-> 3008 outputs = self(**model_inputs, return_dict=True)
3010 if synced_gpus and this_peer_finished:
3011 continue # don't waste resources running the code we don't need
File ~/miniconda3/envs/mlenv_tp/lib/python3.12/site-packages/torch/nn/modules/module.py:1553, in Module._wrapped_call_impl(self, *args, **kwargs)
1551 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1552 else:
-> 1553 return self._call_impl(*args, **kwargs)
File ~/miniconda3/envs/mlenv_tp/lib/python3.12/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
1557 # If we don't have any hooks, we want to skip the rest of the logic in
1558 # this function, and just call forward.
1559 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1560 or _global_backward_pre_hooks or _global_backward_hooks
1561 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1562 return forward_call(*args, **kwargs)
1564 try:
1565 result = None
File ~/miniconda3/envs/mlenv_tp/lib/python3.12/site-packages/accelerate/hooks.py:170, in add_hook_to_module.<locals>.new_forward(module, *args, **kwargs)
168 output = module._old_forward(*args, **kwargs)
169 else:
--> 170 output = module._old_forward(*args, **kwargs)
171 return module._hf_hook.post_forward(module, output)
File ~/miniconda3/envs/mlenv_tp/lib/python3.12/site-packages/transformers/models/qwen2_vl/modeling_qwen2_vl.py:1694, in Qwen2VLForConditionalGeneration.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict, pixel_values, pixel_values_videos, image_grid_thw, video_grid_thw, rope_deltas)
1692 image_mask = (input_ids == self.config.image_token_id).unsqueeze(-1).expand_as(inputs_embeds)
1693 image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
-> 1694 inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)
1696 if pixel_values_videos is not None:
1697 pixel_values_videos = pixel_values_videos.type(self.visual.get_dtype())
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cuda:0! (when checking argument for argument mask in method wrapper_CUDA__masked_scatter_)
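For context on the traceback: the failing masked_scatter at modeling_qwen2_vl.py:1694 combines inputs_embeds (produced by the text embedding layer) with image_embeds (produced by the vision tower). With device_map="auto" on two GPUs, accelerate can place those two modules on different devices, which would explain the cuda:0 / cuda:1 mismatch. A minimal way to inspect the placement (a sketch, assuming the model was loaded with device_map="auto" as in the reproduction below):

# hf_device_map is populated by accelerate when device_map is used at load time
for module_name, device in model.hf_device_map.items():
    print(module_name, device)

# devices of the two sides of the failing masked_scatter
print(model.get_input_embeddings().weight.device)  # where inputs_embeds is created
print(next(model.visual.parameters()).device)      # where image_embeds is computed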
Who can help?
No response
Information
[X] The official example scripts
[ ] My own modified scripts
Tasks
[ ] An officially supported task in the examples folder (such as GLUE/SQuAD, ...)
[ ] My own task or dataset (give details below)
Reproduction
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info

# default: Load the model on the available device(s)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
)

# We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
# model = Qwen2VLForConditionalGeneration.from_pretrained(
#     "Qwen/Qwen2-VL-7B-Instruct",
#     torch_dtype=torch.bfloat16,
#     attn_implementation="flash_attention_2",
#     device_map="auto",
# )

# default processor
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")

# The default range for the number of visual tokens per image in the model is 4-16384. You can set min_pixels and max_pixels according to your needs, such as a token count range of 256-1280, to balance speed and memory usage.
# min_pixels = 256*28*28
# max_pixels = 1280*28*28
# processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)

messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
            },
            {"type": "text", "text": "Describe this image."},
        ],
    }
]

# Preparation for inference
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")

# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)
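Possible workarounds I have considered (sketches only, not verified fixes):

# Sketch 1: instead of a bare .to("cuda"), send the inputs to the device that holds the
# input embedding layer, so the ids/mask start out on the same device as inputs_embeds.
embed_device = model.get_input_embeddings().weight.device
inputs = inputs.to(embed_device)
generated_ids = model.generate(**inputs, max_new_tokens=128)

# Sketch 2: avoid sharding altogether by pinning the whole model to a single GPU
# (only viable if the 7B model fits on one card).
# model = Qwen2VLForConditionalGeneration.from_pretrained(
#     "Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map={"": 0}
# )

Neither sketch addresses how the vision tower itself is placed, so they may or may not avoid the mismatch inside the forward pass.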
System Info
Expected behavior
A bug fix, or a workaround that lets the official example script run with device_map="auto" without the device mismatch error.
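If no code-side fix is available in the short term, a stopgap (with the caveat already mentioned above: that commit predates the Llama 3.2 support, so it is not a real solution for me) would be to pin transformers to the last known-good commit:

# Stopgap sketch: reinstall transformers at the commit where the example still worked.
#   pip install "git+https://github.com/huggingface/transformers@21fac7abba2a37fae86106f87fcf9974fd1e3830"
import transformers
print(transformers.__version__)  # sanity-check which build the environment actually imports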