When I load the Qwen2-VL 7B model using the example script from the model card, I run into the error below.
The error does not happen yet at git commit 21fac7abba2a37fae86106f87fcf9974fd1e3830. The problem is that this commit does not contain the Llama 3.2 update, and I would like to be able to run both models.
RuntimeError Traceback (most recent call last)
Cell In[2], line 64
60 inputs = inputs.to(model.device)
61 # inputs = inputs.to("cuda")
62
63 # Inference: Generation of the output
---> 64 generated_ids = model.generate(**inputs, max_new_tokens=128)
65 generated_ids_trimmed = [
66 out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
67 ]
68 output_text = processor.batch_decode(
69 generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
70 )
File ~/miniconda3/envs/mlenv_tp/lib/python3.12/site-packages/torch/utils/_contextlib.py:116, in context_decorator.<locals>.decorate_context(*args, **kwargs)
113 @functools.wraps(func)
114 def decorate_context(*args, **kwargs):
115 with ctx_factory():
--> 116 return func(*args, **kwargs)
File ~/miniconda3/envs/mlenv_tp/lib/python3.12/site-packages/transformers/generation/utils.py:2048, in GenerationMixin.generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, negative_prompt_ids, negative_prompt_attention_mask, **kwargs)
2040 input_ids, model_kwargs = self._expand_inputs_for_generation(
2041 input_ids=input_ids,
2042 expand_size=generation_config.num_return_sequences,
2043 is_encoder_decoder=self.config.is_encoder_decoder,
2044 **model_kwargs,
2045 )
2047 # 12. run sample (it degenerates to greedy search when `generation_config.do_sample=False`)
-> 2048 result = self._sample(
2049 input_ids,
2050 logits_processor=prepared_logits_processor,
2051 stopping_criteria=prepared_stopping_criteria,
2052 generation_config=generation_config,
2053 synced_gpus=synced_gpus,
2054 streamer=streamer,
2055 **model_kwargs,
2056 )
2058 elif generation_mode in (GenerationMode.BEAM_SAMPLE, GenerationMode.BEAM_SEARCH):
2059 # 11. prepare beam search scorer
2060 beam_scorer = BeamSearchScorer(
2061 batch_size=batch_size,
2062 num_beams=generation_config.num_beams,
(...)
2067 max_length=generation_config.max_length,
2068 )
File ~/miniconda3/envs/mlenv_tp/lib/python3.12/site-packages/transformers/generation/utils.py:3008, in GenerationMixin._sample(self, input_ids, logits_processor, stopping_criteria, generation_config, synced_gpus, streamer, **model_kwargs)
3005 model_inputs.update({"output_hidden_states": output_hidden_states} if output_hidden_states else {})
3007 # forward pass to get next token
-> 3008 outputs = self(**model_inputs, return_dict=True)
3010 if synced_gpus and this_peer_finished:
3011 continue # don't waste resources running the code we don't need
File ~/miniconda3/envs/mlenv_tp/lib/python3.12/site-packages/torch/nn/modules/module.py:1553, in Module._wrapped_call_impl(self, *args, **kwargs)
1551 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1552 else:
-> 1553 return self._call_impl(*args, **kwargs)
File ~/miniconda3/envs/mlenv_tp/lib/python3.12/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
1557 # If we don't have any hooks, we want to skip the rest of the logic in
1558 # this function, and just call forward.
1559 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1560 or _global_backward_pre_hooks or _global_backward_hooks
1561 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1562 return forward_call(*args, **kwargs)
1564 try:
1565 result = None
File ~/miniconda3/envs/mlenv_tp/lib/python3.12/site-packages/accelerate/hooks.py:170, in add_hook_to_module.<locals>.new_forward(module, *args, **kwargs)
168 output = module._old_forward(*args, **kwargs)
169 else:
--> 170 output = module._old_forward(*args, **kwargs)
171 return module._hf_hook.post_forward(module, output)
File ~/miniconda3/envs/mlenv_tp/lib/python3.12/site-packages/transformers/models/qwen2_vl/modeling_qwen2_vl.py:1694, in Qwen2VLForConditionalGeneration.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict, pixel_values, pixel_values_videos, image_grid_thw, video_grid_thw, rope_deltas)
1692 image_mask = (input_ids == self.config.image_token_id).unsqueeze(-1).expand_as(inputs_embeds)
1693 image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
-> 1694 inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)
1696 if pixel_values_videos is not None:
1697 pixel_values_videos = pixel_values_videos.type(self.visual.get_dtype())
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cuda:0! (when checking argument for argument mask in method wrapper_CUDA__masked_scatter_)
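For context on the traceback: the failing masked_scatter at modeling_qwen2_vl.py:1694 combines inputs_embeds (produced by the text embedding layer) with image_embeds (produced by the vision tower). With device_map="auto" on two GPUs, accelerate can place those two modules on different devices, which would explain the cuda:0 / cuda:1 mismatch. A minimal way to inspect the placement (a sketch, assuming the model was loaded with device_map="auto" as in the reproduction below):

# hf_device_map is populated by accelerate when device_map is used at load time
for module_name, device in model.hf_device_map.items():
    print(module_name, device)

# devices of the two sides of the failing masked_scatter
print(model.get_input_embeddings().weight.device)  # where inputs_embeds is created
print(next(model.visual.parameters()).device)      # where image_embeds is computed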
Who can help?
No response
Information
[X] The official example scripts
[ ] My own modified scripts
Tasks
[ ] An officially supported task in the examples folder (such as GLUE/SQuAD, ...)
[ ] My own task or dataset (give details below)
Reproduction
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info

# default: Load the model on the available device(s)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
)

# We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
# model = Qwen2VLForConditionalGeneration.from_pretrained(
#     "Qwen/Qwen2-VL-7B-Instruct",
#     torch_dtype=torch.bfloat16,
#     attn_implementation="flash_attention_2",
#     device_map="auto",
# )

# default processor
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")

# The default range for the number of visual tokens per image in the model is 4-16384. You can set min_pixels and max_pixels according to your needs, such as a token count range of 256-1280, to balance speed and memory usage.
# min_pixels = 256*28*28
# max_pixels = 1280*28*28
# processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)

messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
            },
            {"type": "text", "text": "Describe this image."},
        ],
    }
]

# Preparation for inference
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")

# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)
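Possible workarounds I have considered (sketches only, not verified fixes):

# Sketch 1: instead of a bare .to("cuda"), send the inputs to the device that holds the
# input embedding layer, so the ids/mask start out on the same device as inputs_embeds.
embed_device = model.get_input_embeddings().weight.device
inputs = inputs.to(embed_device)
generated_ids = model.generate(**inputs, max_new_tokens=128)

# Sketch 2: avoid sharding altogether by pinning the whole model to a single GPU
# (only viable if the 7B model fits on one card).
# model = Qwen2VLForConditionalGeneration.from_pretrained(
#     "Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map={"": 0}
# )

Neither sketch addresses how the vision tower itself is placed, so they may or may not avoid the mismatch inside the forward pass.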
System Info
Expected behavior
A bug fix, or a workaround that lets the official example script run with device_map="auto" without the device mismatch error.
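If no code-side fix is available in the short term, a stopgap (with the caveat already mentioned above: that commit predates the Llama 3.2 support, so it is not a real solution for me) would be to pin transformers to the last known-good commit:

# Stopgap sketch: reinstall transformers at the commit where the example still worked.
#   pip install "git+https://github.com/huggingface/transformers@21fac7abba2a37fae86106f87fcf9974fd1e3830"
import transformers
print(transformers.__version__)  # sanity-check which build the environment actually imports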