Closed ersanliqiao closed 10 months ago
```
File "/root/data/vjuicefs_hz_cv_enhance_v1/public_data/11105507/video_edit/animate-anything-main/models/pipeline.py", line 347, in __call__
    image_embeddings = self._encode_image(image, device, num_videos_per_prompt, do_classifier_free_guidance)
File "/root/miniconda3/envs/animation/lib/python3.10/site-packages/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py", line 139, in _encode_image
    image_embeddings = self.image_encoder(image).image_embeds
File "/root/miniconda3/envs/animation/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
File "/root/miniconda3/envs/animation/lib/python3.10/site-packages/transformers/models/clip/modeling_clip.py", line 1304, in forward
    vision_outputs = self.vision_model(
File "/root/miniconda3/envs/animation/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
File "/root/miniconda3/envs/animation/lib/python3.10/site-packages/transformers/models/clip/modeling_clip.py", line 859, in forward
    hidden_states = self.embeddings(pixel_values)
File "/root/miniconda3/envs/animation/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
File "/root/miniconda3/envs/animation/lib/python3.10/site-packages/transformers/models/clip/modeling_clip.py", line 195, in forward
    patch_embeds = self.patch_embedding(pixel_values)  # shape = [*, width, grid, grid]
File "/root/miniconda3/envs/animation/lib/python3.10/site-packages/torch/nn/modules/conv.py", line 463, in forward
    return self._conv_forward(input, self.weight, self.bias)
File "/root/miniconda3/envs/animation/lib/python3.10/site-packages/torch/nn/modules/conv.py", line 459, in _conv_forward
    return F.conv2d(input, weight, bias, self.stride,
RuntimeError: Expected 3D (unbatched) or 4D (batched) input to conv2d, but got input of size: [1, 3, 3, 224, 224]
```
升级到最新的 transformers 版本就可以了 (Upgrading to the latest `transformers` version fixes it.)
python train_svd.py --config example/train_svd_mask.yaml --eval