salesforce / BLIP

PyTorch code for BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation
BSD 3-Clause "New" or "Revised" License

demo.ipynb : RuntimeError: The size of tensor a (3) must match the size of tensor b (9) at non-singleton dimension 0 #173

Closed · Taiga10969 closed this issue 1 year ago

Taiga10969 commented 1 year ago

Running demo.ipynb raises the following error:

```
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[3], line 14
     10 model = model.to(device)
     12 with torch.no_grad():
     13     # beam search
---> 14     caption = model.generate(image, sample=False, num_beams=3, max_length=20, min_length=5) 
     15     # nucleus sampling
     16     # caption = model.generate(image, sample=True, top_p=0.9, max_length=20, min_length=5) 
     17     print('caption: '+caption[0])

File /taiga/experiment/BLIP_Figure_Classification/models/blip.py:156, in BLIP_Decoder.generate(self, image, sample, num_beams, max_length, min_length, top_p, repetition_penalty)
    144     outputs = self.text_decoder.generate(input_ids=input_ids,
    145                                           max_length=max_length,
    146                                           min_length=min_length,
   (...)
    152                                           repetition_penalty=1.1,                                            
    153                                           **model_kwargs)
    154 else:
    155     #beam search
--> 156     outputs = self.text_decoder.generate(input_ids=input_ids,
    157                                           max_length=max_length,
    158                                           min_length=min_length,
    159                                           num_beams=num_beams,
    160                                           eos_token_id=self.tokenizer.sep_token_id,
    161                                           pad_token_id=self.tokenizer.pad_token_id,     
    162                                           repetition_penalty=repetition_penalty,
    163                                           **model_kwargs)            
    165 captions = []    
    166 for output in outputs:

File /usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
    112 @functools.wraps(func)
    113 def decorate_context(*args, **kwargs):
    114     with ctx_factory():
--> 115         return func(*args, **kwargs)

File /usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py:1627, in GenerationMixin.generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, **kwargs)
   1620     input_ids, model_kwargs = self._expand_inputs_for_generation(
   1621         input_ids=input_ids,
   1622         expand_size=generation_config.num_beams,
   1623         is_encoder_decoder=self.config.is_encoder_decoder,
   1624         **model_kwargs,
   1625     )
   1626     # 13. run beam search
-> 1627     return self.beam_search(
   1628         input_ids,
   1629         beam_scorer,
   1630         logits_processor=logits_processor,
   1631         stopping_criteria=stopping_criteria,
   1632         pad_token_id=generation_config.pad_token_id,
   1633         eos_token_id=generation_config.eos_token_id,
   1634         output_scores=generation_config.output_scores,
   1635         return_dict_in_generate=generation_config.return_dict_in_generate,
   1636         synced_gpus=synced_gpus,
   1637         **model_kwargs,
   1638     )
   1640 elif is_beam_sample_gen_mode:
   1641     # 11. prepare logits warper
   1642     logits_warper = self._get_logits_warper(generation_config)

File /usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py:2932, in GenerationMixin.beam_search(self, input_ids, beam_scorer, logits_processor, stopping_criteria, max_length, pad_token_id, eos_token_id, output_attentions, output_hidden_states, output_scores, return_dict_in_generate, synced_gpus, **model_kwargs)
   2928         break
   2930 model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
-> 2932 outputs = self(
   2933     **model_inputs,
   2934     return_dict=True,
   2935     output_attentions=output_attentions,
   2936     output_hidden_states=output_hidden_states,
   2937 )
   2939 if synced_gpus and this_peer_finished:
   2940     cur_len = cur_len + 1

File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
   1496 # If we don't have any hooks, we want to skip the rest of the logic in
   1497 # this function, and just call forward.
   1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1499         or _global_backward_pre_hooks or _global_backward_hooks
   1500         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501     return forward_call(*args, **kwargs)
   1502 # Do not call functions when jit is used
   1503 full_backward_hooks, non_full_backward_hooks = [], []

File /taiga/experiment/BLIP_Figure_Classification/models/med.py:886, in BertLMHeadModel.forward(self, input_ids, attention_mask, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, labels, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict, return_logits, is_decoder, reduction, mode)
    883 if labels is not None:
    884     use_cache = False
--> 886 outputs = self.bert(
    887     input_ids,
    888     attention_mask=attention_mask,
    889     position_ids=position_ids,
    890     head_mask=head_mask,
    891     inputs_embeds=inputs_embeds,
    892     encoder_hidden_states=encoder_hidden_states,
    893     encoder_attention_mask=encoder_attention_mask,
    894     past_key_values=past_key_values,
...
--> 178 attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
    180 if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
    181     seq_length = hidden_states.size()[1]

RuntimeError: The size of tensor a (3) must match the size of tensor b (9) at non-singleton dimension 0
```
hoangpnhat commented 1 year ago

I got the same issue. Did you fix it already?

Taiga10969 commented 1 year ago

@hoangpnhat Not fixed yet.

hoangpnhat commented 1 year ago

@Taiga10969 https://github.com/salesforce/BLIP/issues/176#issue-1850469769 I think we got stuck at the install step. I fixed the error in the install step and it works.

Taiga10969 commented 1 year ago

@hoangpnhat Do I need to fix the code? Please tell me what to fix.

hoangpnhat commented 1 year ago

https://github.com/salesforce/BLIP/issues/176#issuecomment-1695212521 I installed Rust on my Linux system, then reinstalled requirements.txt, and it works. You can see the step-by-step instructions in the comment I attached.
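
(Note for anyone landing here later: the 3 vs 9 mismatch, i.e. num_beams vs num_beams squared, suggests the image embeddings are being expanded for beam search twice when a newer transformers release is installed, which is why reinstalling the pinned versions from requirements.txt helps. Below is a minimal sketch, not from this thread, for checking that the reinstall actually took effect; it assumes you run it from the BLIP repo root and that requirements.txt pins transformers with an `==` specifier.)

```python
# Minimal environment check (a sketch, not part of the BLIP repo):
# compare the installed transformers version with the pin in requirements.txt
# before re-running demo.ipynb.
import re
from pathlib import Path

import transformers

# Assumption: this script is run from the BLIP repo root, where
# requirements.txt contains a line like "transformers==4.x.y".
pinned = None
for line in Path("requirements.txt").read_text().splitlines():
    match = re.match(r"transformers==(\S+)", line.strip())
    if match:
        pinned = match.group(1)
        break

print("installed transformers:", transformers.__version__)
print("pinned in requirements.txt:", pinned)

if pinned and transformers.__version__ != pinned:
    print("Version mismatch: run `pip install -r requirements.txt` in this "
          "environment (installing Rust first if the pinned tokenizers wheel "
          "has to be built from source).")
```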

Taiga10969 commented 1 year ago

@hoangpnhat Thank you!!