Closed · avi-jain closed this issue 2 months ago
Hey @avi-jain! Thanks for your interest in our work!
Can you provide full traceback of the error in question?
Sure! I moved on from using falcon to another model, and the other model also gives me similar errors about multiple devices
Traceback:
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/lm_polygraph/stat_calculators/greedy_probs.py:90, in GreedyProbsCalculator.__call__(self, dependencies, texts, model, max_new_tokens)
88 batch = {k: v.to(model.device()) for k, v in batch.items()}
89 with torch.no_grad():
---> 90 out = model.generate(
91 **batch,
92 output_scores=True,
93 return_dict_in_generate=True,
94 max_new_tokens=max_new_tokens,
95 min_length=2,
96 output_attentions=False,
97 output_hidden_states=True,
98 temperature=model.parameters.temperature,
99 top_k=model.parameters.topk,
100 top_p=model.parameters.topp,
101 do_sample=model.parameters.do_sample,
102 num_beams=model.parameters.num_beams,
103 presence_penalty=model.parameters.presence_penalty,
104 repetition_penalty=model.parameters.repetition_penalty,
105 suppress_tokens=([] if model.parameters.allow_newlines else
106 [t for t in range(len(model.tokenizer)) if '\n' in model.tokenizer.decode([t])]),
107 num_return_sequences=1,
108 )
109 logits = torch.stack(out.scores, dim=1)
110 logits = logits.log_softmax(-1)
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/lm_polygraph/utils/model.py:276, in WhiteboxModel.generate(self, **args)
274 logits_processor = LogitsProcessorList([processor])
275 args['logits_processor'] = logits_processor
--> 276 generation = self.model.generate(**args)
278 orig_scores = [s.log_softmax(-1) for s in processor.scores]
280 # override generation.scores with original scores from model
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/utils/_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
112 @functools.wraps(func)
113 def decorate_context(*args, **kwargs):
114 with ctx_factory():
--> 115 return func(*args, **kwargs)
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/transformers/generation/utils.py:1673, in GenerationMixin.generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, negative_prompt_ids, negative_prompt_attention_mask, **kwargs)
1656 return self.assisted_decoding(
1657 input_ids,
1658 assistant_model=assistant_model,
(...)
1669 **model_kwargs,
1670 )
1671 if generation_mode == GenerationMode.GREEDY_SEARCH:
1672 # 11. run greedy search
-> 1673 return self.greedy_search(
1674 input_ids,
1675 logits_processor=logits_processor,
1676 stopping_criteria=stopping_criteria,
1677 pad_token_id=generation_config.pad_token_id,
1678 eos_token_id=generation_config.eos_token_id,
1679 output_scores=generation_config.output_scores,
1680 return_dict_in_generate=generation_config.return_dict_in_generate,
1681 synced_gpus=synced_gpus,
1682 streamer=streamer,
1683 **model_kwargs,
1684 )
1686 elif generation_mode == GenerationMode.CONTRASTIVE_SEARCH:
1687 if not model_kwargs["use_cache"]:
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/transformers/generation/utils.py:2521, in GenerationMixin.greedy_search(self, input_ids, logits_processor, stopping_criteria, max_length, pad_token_id, eos_token_id, output_attentions, output_hidden_states, output_scores, return_dict_in_generate, synced_gpus, streamer, **model_kwargs)
2518 model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
2520 # forward pass to get next token
-> 2521 outputs = self(
2522 **model_inputs,
2523 return_dict=True,
2524 output_attentions=output_attentions,
2525 output_hidden_states=output_hidden_states,
2526 )
2528 if synced_gpus and this_peer_finished:
2529 continue # don't waste resources running the code we don't need
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/accelerate/hooks.py:164, in add_hook_to_module.<locals>.new_forward(module, *args, **kwargs)
162 output = module._old_forward(*args, **kwargs)
163 else:
--> 164 output = module._old_forward(*args, **kwargs)
165 return module._hf_hook.post_forward(module, output)
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/transformers/models/mistral/modeling_mistral.py:1009, in MistralForCausalLM.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict)
1006 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1008 # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-> 1009 outputs = self.model(
1010 input_ids=input_ids,
1011 attention_mask=attention_mask,
1012 position_ids=position_ids,
1013 past_key_values=past_key_values,
1014 inputs_embeds=inputs_embeds,
1015 use_cache=use_cache,
1016 output_attentions=output_attentions,
1017 output_hidden_states=output_hidden_states,
1018 return_dict=return_dict,
1019 )
1021 hidden_states = outputs[0]
1022 logits = self.lm_head(hidden_states)
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/transformers/models/mistral/modeling_mistral.py:837, in MistralModel.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict)
834 position_ids = position_ids.view(-1, seq_length).long()
836 if inputs_embeds is None:
--> 837 inputs_embeds = self.embed_tokens(input_ids)
839 if (
840 attention_mask is not None
841 and hasattr(self.config, "_flash_attn_2_enabled")
842 and self.config._flash_attn_2_enabled
843 and past_key_values is not None
844 ):
845 is_padding_right = attention_mask[:, -1].sum().item() != batch_size
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/accelerate/hooks.py:164, in add_hook_to_module.<locals>.new_forward(module, *args, **kwargs)
162 output = module._old_forward(*args, **kwargs)
163 else:
--> 164 output = module._old_forward(*args, **kwargs)
165 return module._hf_hook.post_forward(module, output)
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/modules/sparse.py:162, in Embedding.forward(self, input)
161 def forward(self, input: Tensor) -> Tensor:
--> 162 return F.embedding(
163 input, self.weight, self.padding_idx, self.max_norm,
164 self.norm_type, self.scale_grad_by_freq, self.sparse)
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/functional.py:2210, in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
2204 # Note [embedding_renorm set_grad_enabled]
2205 # XXX: equivalent to
2206 # with torch.no_grad():
2207 # torch.embedding_renorm_
2208 # remove once script supports set_grad_enabled
2209 _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 2210 return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)
The problem seems to lie in the use of the arg device_map='auto'. I don't know if there is an example in the repo that uses it, or if it's not supported yet.
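For illustration, here is a minimal sketch of why that pattern breaks under device_map='auto', and one possible workaround. The checkpoint name is an assumption, and routing inputs to the embedding device is just one option, not lm_polygraph's actual fix:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

name = "mistralai/Mistral-7B-v0.1"  # illustrative; any sharded checkpoint
tokenizer = AutoTokenizer.from_pretrained(name)
# device_map="auto" lets accelerate scatter layers across cpu and gpu(s),
# so no single model-wide device exists.
model = AutoModelForCausalLM.from_pretrained(name, device_map="auto")

batch = tokenizer("hello", return_tensors="pt")

# Moving inputs to "the" model device can leave them on cpu while the
# embedding weights sit on cuda:0, which produces the RuntimeError above.
# Workaround sketch: send inputs to the device of the input embeddings.
embed_device = model.get_input_embeddings().weight.device
batch = {k: v.to(embed_device) for k, v in batch.items()}

with torch.no_grad():
    out = model.generate(**batch, max_new_tokens=8)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```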
Can you share your code that causes the error for which the traceback above is provided? At a glance, it doesn't seem to be a call to WhiteboxModel.from_pretrained.
@ArtemVazh The issue is kind of stale, but we still should not try moving the model to specific devices if device_map='auto' is set.
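A minimal sketch of such a guard, assuming transformers' convention of setting hf_device_map on models loaded with a device_map:

```python
def move_if_not_dispatched(model, device):
    # transformers sets `hf_device_map` when the model was loaded with
    # device_map=...; an accelerate-dispatched model must not be moved
    # as a whole, so only call .to() when the attribute is absent.
    if getattr(model, "hf_device_map", None) is None:
        model = model.to(device)
    return model
```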
Resolved
The code that moves the model throws the error:

You shouldn't move a model when it is dispatched on multiple devices.

while the variant that leaves it in place seems to work fine :/
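For reference, a minimal sketch of the two patterns being contrasted here, with an illustrative checkpoint standing in for the original snippets:

```python
from transformers import AutoModelForCausalLM

name = "mistralai/Mistral-7B-v0.1"  # illustrative checkpoint

# Raises the error quoted above when the shards actually span more than
# one device: accelerate has already placed them via hooks, so the model
# must not be moved afterwards.
model = AutoModelForCausalLM.from_pretrained(name, device_map="auto")
model.to("cuda:0")

# Works: load with device_map="auto" and never call .to() on the model;
# only the inputs need explicit placement.
model = AutoModelForCausalLM.from_pretrained(name, device_map="auto")
```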