PKU-YuanGroup / MoE-LLaVA

Mixture-of-Experts for Large Vision-Language Models
https://arxiv.org/abs/2401.15947
Apache License 2.0

inference error in llavamistral #38

Open saeedkhaki92 opened 6 months ago

saeedkhaki92 commented 6 months ago

Question

Hello,

I have trained a LlavaMistralForCausalLM model based on OpenChat (not the MoE version), but when I run predict.py I get the following error:

File ~/scripts/MoE-LLaVA/moellava/model/language_model/llava_mistral.py:94, in LlavaMistralForCausalLM.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, images, return_dict)
     76     (
     77         input_ids,
     78         position_ids,
   (...)
     89         images
     90     )
     92 # dist.barrier()
     93 # print(f'rank {dist.get_rank()}', 'after prepare_inputs_labels_for_multimodal')
---> 94 out = super().forward(
     95     input_ids=input_ids,
     96     attention_mask=attention_mask,
     97     position_ids=position_ids,
     98     past_key_values=past_key_values,
     99     inputs_embeds=inputs_embeds,
    100     labels=labels,
    101     use_cache=use_cache,
    102     output_attentions=output_attentions,
    103     output_hidden_states=output_hidden_states,
    104     return_dict=return_dict
    105 )
    106 # dist.barrier()
    107 # print(f'rank {dist.get_rank()}', 'after LLM')
    108 return out

File /opt/conda/envs/moellava/lib/python3.10/site-packages/transformers/models/mistral/modeling_mistral.py:1053, in MistralForCausalLM.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict)
   1050 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
   1052 # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-> 1053 outputs = self.model(
   1054     input_ids=input_ids,
   1055     attention_mask=attention_mask,
   1056     position_ids=position_ids,
   1057     past_key_values=past_key_values,
   1058     inputs_embeds=inputs_embeds,
   1059     use_cache=use_cache,
   1060     output_attentions=output_attentions,
   1061     output_hidden_states=output_hidden_states,
   1062     return_dict=return_dict,
   1063 )
   1065 hidden_states = outputs[0]
   1066 logits = self.lm_head(hidden_states)

File /opt/conda/envs/moellava/lib/python3.10/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
   1496 # If we don't have any hooks, we want to skip the rest of the logic in
   1497 # this function, and just call forward.
   1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1499         or _global_backward_pre_hooks or _global_backward_hooks
   1500         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501     return forward_call(*args, **kwargs)
   1502 # Do not call functions when jit is used
   1503 full_backward_hooks, non_full_backward_hooks = [], []

File /opt/conda/envs/moellava/lib/python3.10/site-packages/transformers/models/mistral/modeling_mistral.py:908, in MistralModel.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict)
    905     attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
    906 else:
    907     # 4d mask is passed through the layers
--> 908     attention_mask = _prepare_4d_causal_attention_mask(
    909         attention_mask,
    910         (batch_size, seq_length),
    911         inputs_embeds,
    912         past_key_values_length,
    913         sliding_window=self.config.sliding_window,
    914     )
    916 hidden_states = inputs_embeds
    918 # decoder layers

File /opt/conda/envs/moellava/lib/python3.10/site-packages/transformers/modeling_attn_mask_utils.py:306, in _prepare_4d_causal_attention_mask(attention_mask, input_shape, inputs_embeds, past_key_values_length, sliding_window)
    304 # 4d mask is passed through the layers
    305 if attention_mask is not None:
--> 306     attention_mask = attn_mask_converter.to_4d(
    307         attention_mask, input_shape[-1], key_value_length=key_value_length, dtype=inputs_embeds.dtype
    308     )
    309 else:
    310     attention_mask = attn_mask_converter.to_causal_4d(
    311         input_shape[0], input_shape[-1], key_value_length, dtype=inputs_embeds.dtype, device=inputs_embeds.device
    312     )

File /opt/conda/envs/moellava/lib/python3.10/site-packages/transformers/modeling_attn_mask_utils.py:136, in AttentionMaskConverter.to_4d(self, attention_mask_2d, query_length, dtype, key_value_length)
    132 expanded_attn_mask = self._expand_mask(attention_mask_2d, dtype, tgt_len=input_shape[-1]).to(
    133     attention_mask_2d.device
    134 )
    135 if causal_4d_mask is not None:
--> 136     expanded_attn_mask = causal_4d_mask.masked_fill(expanded_attn_mask.bool(), torch.finfo(dtype).min)
    138 # expanded_attn_mask + causal_4d_mask can cause some overflow
    139 expanded_4d_mask = expanded_attn_mask

RuntimeError: The size of tensor a (622) must match the size of tensor b (1243) at non-singleton dimension 3

Should I change the inference code for non-MoE models? Thanks for the help.
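
Reading the traceback, the clash is between the expanded 2D attention mask, which covers only the 622 current positions, and the causal 4D mask built for key_value_length = 622 + 621 = 1243 once cached keys/values are counted. A minimal sketch, using the private transformers helper that appears in the traceback (so its exact behaviour depends on the installed version) and with the shapes taken from the error above, reproduces the same kind of mismatch:

```python
# Sketch of the shape clash, using the transformers helper shown in the traceback.
# The numbers 622 (current length) and 621 (cached length) are taken from the error;
# exact behaviour depends on the installed transformers version.
import torch
from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask

seq_len, past_len = 622, 621
inputs_embeds = torch.zeros(1, seq_len, 4096, dtype=torch.float16)
attention_mask = torch.ones(1, seq_len)  # covers only the current tokens, not the cached ones

# key_value_length becomes seq_len + past_len = 1243, so the causal 4D mask and the
# expanded 2D mask disagree at the last dimension -> a RuntimeError like the one above.
_prepare_4d_causal_attention_mask(
    attention_mask, (1, seq_len), inputs_embeds, past_len, sliding_window=4096
)
```

In other words, once the KV cache is in play the 2D mask has to span past plus current tokens; a mask that only spans the current chunk triggers exactly this error.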

saeedkhaki92 commented 6 months ago

Never mind: if you set use_cache=False, it works. Thanks.
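
For anyone hitting the same error, here is a minimal sketch of that workaround, assuming a standard LLaVA-style generate call like the one in predict.py. `model`, `tokenizer`, `input_ids`, and `image_tensor` are placeholders for whatever the script already builds, and `images=` is the usual LLaVA-style keyword; the only actual change is `use_cache=False`:

```python
# Workaround sketch: disable the KV cache during generation.
# `model`, `tokenizer`, `input_ids`, and `image_tensor` stand in for the objects
# predict.py already constructs; only `use_cache=False` is the real change.
import torch

with torch.inference_mode():
    output_ids = model.generate(
        input_ids,
        images=image_tensor,   # LLaVA-style image kwarg, assumed from the repo's inference path
        do_sample=False,
        max_new_tokens=512,
        use_cache=False,       # avoids the 4D-mask size mismatch shown in the traceback
    )

print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```

Disabling the cache makes every step rebuild the mask for the full sequence, which sidesteps the past/current length bookkeeping, at the cost of slower generation.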

LinB203 commented 6 months ago

We've fixed that. Could you try it again?