IINemo / lm-polygraph


Error loading larger models - You shouldn't move a model when it is dispatched on multiple devices #147

Closed: avi-jain closed this issue 2 months ago

avi-jain commented 10 months ago

The code

from lm_polygraph.utils.model import WhiteboxModel

model = WhiteboxModel.from_pretrained(
    "tiiuae/falcon-40b-instruct",
    cache_dir="~/cache/",
    device_map='auto',
    offload_folder="offload_folder",
)

throws the error "You shouldn't move a model when it is dispatched on multiple devices."

While

from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "tiiuae/falcon-40b-instruct",
    trust_remote_code=True,
    cache_dir="~/cache/",
    device_map="auto",
    offload_folder="offload_folder",
)

seems to work fine :/
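
A possible workaround (a minimal sketch, assuming WhiteboxModel can wrap an already-loaded Hugging Face model and tokenizer; the exact constructor signature should be checked against lm_polygraph/utils/model.py) would be to let transformers/accelerate do the dispatching first and only then hand the model to lm-polygraph:

from transformers import AutoModelForCausalLM, AutoTokenizer
from lm_polygraph.utils.model import WhiteboxModel

model_path = "tiiuae/falcon-40b-instruct"

# Load and dispatch with accelerate; this is the call that works on its own.
base_model = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,
    cache_dir="~/cache/",
    device_map="auto",
    offload_folder="offload_folder",
)
tokenizer = AutoTokenizer.from_pretrained(model_path, cache_dir="~/cache/")

# Assumed wrapper call: verify that the installed lm_polygraph version accepts
# (model, tokenizer, model_path) before relying on this.
model = WhiteboxModel(base_model, tokenizer, model_path)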

rvashurin commented 10 months ago

Hey @avi-jain! Thanks for your interest in our work!

Can you provide the full traceback of the error in question?

avi-jain commented 9 months ago

Sure! I have since moved on from Falcon to another model (a Mistral-based one, as the trace below shows), and it gives me similar errors about multiple devices.

Trace

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/lm_polygraph/stat_calculators/greedy_probs.py:90, in GreedyProbsCalculator.__call__(self, dependencies, texts, model, max_new_tokens)
     88 batch = {k: v.to(model.device()) for k, v in batch.items()}
     89 with torch.no_grad():
---> 90     out = model.generate(
     91         **batch,
     92         output_scores=True,
     93         return_dict_in_generate=True,
     94         max_new_tokens=max_new_tokens,
     95         min_length=2,
     96         output_attentions=False,
     97         output_hidden_states=True,
     98         temperature=model.parameters.temperature,
     99         top_k=model.parameters.topk,
    100         top_p=model.parameters.topp,
    101         do_sample=model.parameters.do_sample,
    102         num_beams=model.parameters.num_beams,
    103         presence_penalty=model.parameters.presence_penalty,
    104         repetition_penalty=model.parameters.repetition_penalty,
    105         suppress_tokens=([] if model.parameters.allow_newlines else
    106                          [t for t in range(len(model.tokenizer)) if '\n' in model.tokenizer.decode([t])]),
    107         num_return_sequences=1,
    108     )
    109     logits = torch.stack(out.scores, dim=1)
    110     logits = logits.log_softmax(-1)

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/lm_polygraph/utils/model.py:276, in WhiteboxModel.generate(self, **args)
    274     logits_processor = LogitsProcessorList([processor])
    275 args['logits_processor'] = logits_processor
--> 276 generation = self.model.generate(**args)
    278 orig_scores = [s.log_softmax(-1) for s in processor.scores]
    280 # override generation.scores with original scores from model

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/utils/_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
    112 @functools.wraps(func)
    113 def decorate_context(*args, **kwargs):
    114     with ctx_factory():
--> 115         return func(*args, **kwargs)

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/transformers/generation/utils.py:1673, in GenerationMixin.generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, negative_prompt_ids, negative_prompt_attention_mask, **kwargs)
   1656     return self.assisted_decoding(
   1657         input_ids,
   1658         assistant_model=assistant_model,
   (...)
   1669         **model_kwargs,
   1670     )
   1671 if generation_mode == GenerationMode.GREEDY_SEARCH:
   1672     # 11. run greedy search
-> 1673     return self.greedy_search(
   1674         input_ids,
   1675         logits_processor=logits_processor,
   1676         stopping_criteria=stopping_criteria,
   1677         pad_token_id=generation_config.pad_token_id,
   1678         eos_token_id=generation_config.eos_token_id,
   1679         output_scores=generation_config.output_scores,
   1680         return_dict_in_generate=generation_config.return_dict_in_generate,
   1681         synced_gpus=synced_gpus,
   1682         streamer=streamer,
   1683         **model_kwargs,
   1684     )
   1686 elif generation_mode == GenerationMode.CONTRASTIVE_SEARCH:
   1687     if not model_kwargs["use_cache"]:

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/transformers/generation/utils.py:2521, in GenerationMixin.greedy_search(self, input_ids, logits_processor, stopping_criteria, max_length, pad_token_id, eos_token_id, output_attentions, output_hidden_states, output_scores, return_dict_in_generate, synced_gpus, streamer, **model_kwargs)
   2518 model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
   2520 # forward pass to get next token
-> 2521 outputs = self(
   2522     **model_inputs,
   2523     return_dict=True,
   2524     output_attentions=output_attentions,
   2525     output_hidden_states=output_hidden_states,
   2526 )
   2528 if synced_gpus and this_peer_finished:
   2529     continue  # don't waste resources running the code we don't need

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
   1496 # If we don't have any hooks, we want to skip the rest of the logic in
   1497 # this function, and just call forward.
   1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1499         or _global_backward_pre_hooks or _global_backward_hooks
   1500         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501     return forward_call(*args, **kwargs)
   1502 # Do not call functions when jit is used
   1503 full_backward_hooks, non_full_backward_hooks = [], []

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/accelerate/hooks.py:164, in add_hook_to_module.<locals>.new_forward(module, *args, **kwargs)
    162         output = module._old_forward(*args, **kwargs)
    163 else:
--> 164     output = module._old_forward(*args, **kwargs)
    165 return module._hf_hook.post_forward(module, output)

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/transformers/models/mistral/modeling_mistral.py:1009, in MistralForCausalLM.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict)
   1006 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
   1008 # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-> 1009 outputs = self.model(
   1010     input_ids=input_ids,
   1011     attention_mask=attention_mask,
   1012     position_ids=position_ids,
   1013     past_key_values=past_key_values,
   1014     inputs_embeds=inputs_embeds,
   1015     use_cache=use_cache,
   1016     output_attentions=output_attentions,
   1017     output_hidden_states=output_hidden_states,
   1018     return_dict=return_dict,
   1019 )
   1021 hidden_states = outputs[0]
   1022 logits = self.lm_head(hidden_states)

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
   1496 # If we don't have any hooks, we want to skip the rest of the logic in
   1497 # this function, and just call forward.
   1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1499         or _global_backward_pre_hooks or _global_backward_hooks
   1500         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501     return forward_call(*args, **kwargs)
   1502 # Do not call functions when jit is used
   1503 full_backward_hooks, non_full_backward_hooks = [], []

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/transformers/models/mistral/modeling_mistral.py:837, in MistralModel.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict)
    834     position_ids = position_ids.view(-1, seq_length).long()
    836 if inputs_embeds is None:
--> 837     inputs_embeds = self.embed_tokens(input_ids)
    839 if (
    840     attention_mask is not None
    841     and hasattr(self.config, "_flash_attn_2_enabled")
    842     and self.config._flash_attn_2_enabled
    843     and past_key_values is not None
    844 ):
    845     is_padding_right = attention_mask[:, -1].sum().item() != batch_size

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
   1496 # If we don't have any hooks, we want to skip the rest of the logic in
   1497 # this function, and just call forward.
   1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1499         or _global_backward_pre_hooks or _global_backward_hooks
   1500         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501     return forward_call(*args, **kwargs)
   1502 # Do not call functions when jit is used
   1503 full_backward_hooks, non_full_backward_hooks = [], []

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/accelerate/hooks.py:164, in add_hook_to_module.<locals>.new_forward(module, *args, **kwargs)
    162         output = module._old_forward(*args, **kwargs)
    163 else:
--> 164     output = module._old_forward(*args, **kwargs)
    165 return module._hf_hook.post_forward(module, output)

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/modules/sparse.py:162, in Embedding.forward(self, input)
    161 def forward(self, input: Tensor) -> Tensor:
--> 162     return F.embedding(
    163         input, self.weight, self.padding_idx, self.max_norm,
    164         self.norm_type, self.scale_grad_by_freq, self.sparse)

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/functional.py:2210, in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
   2204     # Note [embedding_renorm set_grad_enabled]
   2205     # XXX: equivalent to
   2206     # with torch.no_grad():
   2207     #   torch.embedding_renorm_
   2208     # remove once script supports set_grad_enabled
   2209     _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 2210 return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)

The problem seems to lie in the use of the device_map='auto' argument. I don't know if there is an example in the repo that uses it, or whether it just isn't supported yet.
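
For context, the error in the issue title can typically be reproduced outside lm-polygraph with the pattern below (a sketch only; it assumes the message is raised by accelerate when .to() is called on a model that has been dispatched across multiple devices):

from transformers import AutoModelForCausalLM

# Loading with device_map="auto" lets accelerate spread the weights across the
# available GPUs, CPU RAM, and the offload folder on disk.
model = AutoModelForCausalLM.from_pretrained(
    "tiiuae/falcon-40b-instruct",
    trust_remote_code=True,
    device_map="auto",
    offload_folder="offload_folder",
)

# Explicitly moving the dispatched model afterwards is what produces the
# "You shouldn't move a model when it is dispatched on multiple devices" error.
model.to("cuda:0")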

rvashurin commented 9 months ago

Can you share the code that causes this error, i.e. the code for which the traceback above was produced? At a glance it doesn't look like a call to WhiteboxModel.from_pretrained.

rvashurin commented 6 months ago

@ArtemVazh The issue has gone somewhat stale, but we still should not try to move the model to a specific device when device_map='auto' is set.
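
A minimal sketch of that guard (hypothetical, not the actual patch): skip the explicit .to() call whenever accelerate has already dispatched the model, which transformers signals by setting an hf_device_map attribute on models loaded with a device_map.

# Hypothetical sketch of the guard described above, not the actual fix in the repo.
# `hf_device_map` is set by transformers when the model is loaded with a device_map,
# so its presence means accelerate is already managing device placement.
if getattr(model, "hf_device_map", None) is None:
    model.to(device)
# otherwise leave the model wherever accelerate dispatched it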

IINemo commented 2 months ago

Resolved