Traceback (most recent call last):
File "DVIS_Plus/demo_video/open_vocabulary/demo.py", line 138, in <module>
predictions, visualized_output = demo.run_on_video(vid_frames)
File "/home/ver/yyk/DVIS_Plus/DVIS_Plus/demo_video/open_vocabulary/predictor.py", line 210, in run_on_video
predictions = self.predictor(frames)
File "/home/ver/yyk/DVIS_Plus/DVIS_Plus/demo_video/open_vocabulary/predictor.py", line 336, in __call__
predictions = self.model([inputs])
File "/home/ver/anaconda3/envs/dvis/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/home/ver/yyk/DVIS_Plus/DVIS_Plus/demo_video/open_vocabulary/../../ov_dvis/meta_architecture_ov.py", line 2032, in forward
text_classifier, num_templates = self._set_class_information(batched_inputs[0]['name'], self.training)
File "/home/ver/yyk/DVIS_Plus/DVIS_Plus/demo_video/open_vocabulary/../../ov_dvis/meta_architecture_ov.py", line 309, in _set_class_information
self.test_text_classifier, self.test_num_templates = self.get_text_classifier(train=train)
File "/home/ver/yyk/DVIS_Plus/DVIS_Plus/demo_video/open_vocabulary/../../ov_dvis/meta_architecture_ov.py", line 351, in get_text_classifier
text_classifier.append(self.backbone.get_text_classifier(self.test_class_names[idx:idx+bs], self.device).detach())
File "/home/ver/yyk/DVIS_Plus/DVIS_Plus/demo_video/open_vocabulary/../../ov_dvis/backbones/clip.py", line 211, in get_text_classifier
text_features = self.encode_text(text_tokens, normalize=False)
File "/home/ver/yyk/DVIS_Plus/DVIS_Plus/demo_video/open_vocabulary/../../ov_dvis/backbones/clip.py", line 95, in encode_text
x = self.clip_model.transformer(x, attn_mask=self.clip_model.attn_mask)
File "/home/ver/anaconda3/envs/dvis/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/home/ver/anaconda3/envs/dvis/lib/python3.8/site-packages/open_clip/transformer.py", line 363, in forward
x = r(x, attn_mask=attn_mask)
File "/home/ver/anaconda3/envs/dvis/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/home/ver/anaconda3/envs/dvis/lib/python3.8/site-packages/open_clip/transformer.py", line 263, in forward
x = q_x + self.ls_1(self.attention(q_x=self.ln_1(q_x), k_x=k_x, v_x=v_x, attn_mask=attn_mask))
File "/home/ver/anaconda3/envs/dvis/lib/python3.8/site-packages/open_clip/transformer.py", line 250, in attention
return self.attn(
File "/home/ver/anaconda3/envs/dvis/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/home/ver/anaconda3/envs/dvis/lib/python3.8/site-packages/torch/nn/modules/activation.py", line 1031, in forward
attn_output, attn_output_weights = F.multi_head_attention_forward(
File "/home/ver/anaconda3/envs/dvis/lib/python3.8/site-packages/torch/nn/functional.py", line 4992, in multi_head_attention_forward
raise RuntimeError(f"The shape of the 2D attn_mask is {attn_mask.shape}, but should be {correct_2d_size}.")
RuntimeError: The shape of the 2D attn_mask is torch.Size([77, 77]), but should be (28, 28).
Traceback (most recent call last):
File "DVIS_Plus/demo_video/open_vocabulary/demo.py", line 138, in <module>
predictions, visualized_output = demo.run_on_video(vid_frames)
File "/home/ver/yyk/DVIS_Plus/DVIS_Plus/demo_video/open_vocabulary/predictor.py", line 210, in run_on_video
predictions = self.predictor(frames)
File "/home/ver/yyk/DVIS_Plus/DVIS_Plus/demo_video/open_vocabulary/predictor.py", line 336, in __call__
predictions = self.model([inputs])
File "/home/ver/anaconda3/envs/dvis/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/home/ver/yyk/DVIS_Plus/DVIS_Plus/demo_video/open_vocabulary/../../ov_dvis/meta_architecture_ov.py", line 2032, in forward
text_classifier, num_templates = self._set_class_information(batched_inputs[0]['name'], self.training)
File "/home/ver/yyk/DVIS_Plus/DVIS_Plus/demo_video/open_vocabulary/../../ov_dvis/meta_architecture_ov.py", line 309, in _set_class_information
self.test_text_classifier, self.test_num_templates = self.get_text_classifier(train=train)
File "/home/ver/yyk/DVIS_Plus/DVIS_Plus/demo_video/open_vocabulary/../../ov_dvis/meta_architecture_ov.py", line 351, in get_text_classifier
text_classifier.append(self.backbone.get_text_classifier(self.test_class_names[idx:idx+bs], self.device).detach())
File "/home/ver/yyk/DVIS_Plus/DVIS_Plus/demo_video/open_vocabulary/../../ov_dvis/backbones/clip.py", line 211, in get_text_classifier
text_features = self.encode_text(text_tokens, normalize=False)
File "/home/ver/yyk/DVIS_Plus/DVIS_Plus/demo_video/open_vocabulary/../../ov_dvis/backbones/clip.py", line 95, in encode_text
x = self.clip_model.transformer(x, attn_mask=self.clip_model.attn_mask)
File "/home/ver/anaconda3/envs/dvis/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/home/ver/anaconda3/envs/dvis/lib/python3.8/site-packages/open_clip/transformer.py", line 363, in forward
x = r(x, attn_mask=attn_mask)
File "/home/ver/anaconda3/envs/dvis/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/home/ver/anaconda3/envs/dvis/lib/python3.8/site-packages/open_clip/transformer.py", line 263, in forward
x = q_x + self.ls_1(self.attention(q_x=self.ln_1(q_x), k_x=k_x, v_x=v_x, attn_mask=attn_mask))
File "/home/ver/anaconda3/envs/dvis/lib/python3.8/site-packages/open_clip/transformer.py", line 250, in attention
return self.attn(
File "/home/ver/anaconda3/envs/dvis/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/home/ver/anaconda3/envs/dvis/lib/python3.8/site-packages/torch/nn/modules/activation.py", line 1031, in forward
attn_output, attn_output_weights = F.multi_head_attention_forward(
File "/home/ver/anaconda3/envs/dvis/lib/python3.8/site-packages/torch/nn/functional.py", line 4992, in multi_head_attention_forward
raise RuntimeError(f"The shape of the 2D attn_mask is {attn_mask.shape}, but should be {correct_2d_size}.")
RuntimeError: The shape of the 2D attn_mask is torch.Size([77, 77]), but should be (28, 28).
The error says the 2D attn_mask passed to multi-head attention is [77, 77] (CLIP's full context length), while the input sequence length at that point is only 28, so a (28, 28) mask is expected. Is my input data incorrect, or is the text-token/attn_mask handling in my setup wrong? Please help.