yoyo-nb / Thin-Plate-Spline-Motion-Model

[CVPR 2022] Thin-Plate Spline Motion Model for Image Animation.
MIT License
3.38k stars 552 forks source link

When setting “find_best_frame = True”, an error occurs in the bottom four modules in demo.ipynb. #104

Closed Chichibabin closed 4 months ago

Chichibabin commented 4 months ago

Everything works when running on CPU, but when running on CUDA the following error is raised:

{
    "name": "RuntimeError",
    "message": "nvrtc: error: invalid value for --gpu-architecture (-arch)

nvrtc compilation failed: 

#define NAN __int_as_float(0x7fffffff)
#define POS_INFINITY __int_as_float(0x7f800000)
#define NEG_INFINITY __int_as_float(0xff800000)

template<typename T>
__device__ T maximum(T a, T b) {
  return isnan(a) ? a : (a > b ? a : b);
}

template<typename T>
__device__ T minimum(T a, T b) {
  return isnan(a) ? a : (a < b ? a : b);
}

extern \"C\" __global__
void fused_cat_cat(float* tinput0_42, float* tinput0_46, float* tout3_67, float* tinput0_60, float* tinput0_52, float* tout3_71, float* aten_cat_1, float* aten_cat) {
{
if (blockIdx.x<512ll ? 1 : 0) {
    aten_cat[(long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)] = ((((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)) / 1024ll<192ll ? 1 : 0) ? ((((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)) / 1024ll<128ll ? 1 : 0) ? __ldg(tinput0_60 + (long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)) : __ldg(tinput0_52 + ((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)) - 131072ll)) : __ldg(tout3_71 + ((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)) - 196608ll));
  }  aten_cat_1[(long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)] = ((((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)) / 4096ll<192ll ? 1 : 0) ? ((((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)) / 4096ll<128ll ? 1 : 0) ? __ldg(tinput0_42 + (long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)) : __ldg(tinput0_46 + ((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)) - 524288ll)) : __ldg(tout3_67 + ((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)) - 786432ll));
}
}
",
    "stack": "---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[36], line 6
      4 if predict_mode=='relative' and find_best_frame:
      5     from demo import find_best_frame as _find
----> 6     i = _find(source_image, driving_video, device.type=='cpu')
      7     print (\"Best frame: \" + str(i))
      8     driving_forward = driving_video[i:]

File ~/autodl-tmp/shijieqi/Thin-Plate-Spline-Motion-Model/demo.py:109, in find_best_frame(source, driving, cpu)
    105     return kp
    107 fa = face_alignment.FaceAlignment(face_alignment.LandmarksType.TWO_D, flip_input=True,
    108                                   device= 'cpu' if cpu else 'cuda')
--> 109 kp_source = fa.get_landmarks(255 * source)[0]
    110 kp_source = normalize_kp(kp_source)
    111 norm  = float('inf')

File ~/autodl-tmp/shijieqi/TPSM/lib/python3.9/site-packages/face_alignment/api.py:110, in FaceAlignment.get_landmarks(self, image_or_path, detected_faces, return_bboxes, return_landmark_score)
     98 def get_landmarks(self, image_or_path, detected_faces=None, return_bboxes=False, return_landmark_score=False):
     99     \"\"\"Deprecated, please use get_landmarks_from_image
    100 
    101     Arguments:
   (...)
    108         return_landmark_score {boolean} -- If True, return the keypoint scores along with the keypoints.
    109     \"\"\"
--> 110     return self.get_landmarks_from_image(image_or_path, detected_faces, return_bboxes, return_landmark_score)

File ~/autodl-tmp/shijieqi/TPSM/lib/python3.9/site-packages/torch/autograd/grad_mode.py:28, in _DecoratorContextManager.__call__.<locals>.decorate_context(*args, **kwargs)
     25 @functools.wraps(func)
     26 def decorate_context(*args, **kwargs):
     27     with self.__class__():
---> 28         return func(*args, **kwargs)

File ~/autodl-tmp/shijieqi/TPSM/lib/python3.9/site-packages/face_alignment/api.py:167, in FaceAlignment.get_landmarks_from_image(self, image_or_path, detected_faces, return_bboxes, return_landmark_score)
    165 out = self.face_alignment_net(inp).detach()
    166 if self.flip_input:
--> 167     out += flip(self.face_alignment_net(flip(inp)).detach(), is_label=True)
    168 out = out.cpu().numpy()
    170 pts, pts_img, scores = get_preds_fromhm(out, center.numpy(), scale)

File ~/autodl-tmp/shijieqi/TPSM/lib/python3.9/site-packages/torch/nn/modules/module.py:1102, in Module._call_impl(self, *input, **kwargs)
   1098 # If we don't have any hooks, we want to skip the rest of the logic in
   1099 # this function, and just call forward.
   1100 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1101         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1102     return forward_call(*input, **kwargs)
   1103 # Do not call functions when jit is used
   1104 full_backward_hooks, non_full_backward_hooks = [], []

RuntimeError: nvrtc: error: invalid value for --gpu-architecture (-arch)

nvrtc compilation failed: 

#define NAN __int_as_float(0x7fffffff)
#define POS_INFINITY __int_as_float(0x7f800000)
#define NEG_INFINITY __int_as_float(0xff800000)

template<typename T>
__device__ T maximum(T a, T b) {
  return isnan(a) ? a : (a > b ? a : b);
}

template<typename T>
__device__ T minimum(T a, T b) {
  return isnan(a) ? a : (a < b ? a : b);
}

extern \"C\" __global__
void fused_cat_cat(float* tinput0_42, float* tinput0_46, float* tout3_67, float* tinput0_60, float* tinput0_52, float* tout3_71, float* aten_cat_1, float* aten_cat) {
{
if (blockIdx.x<512ll ? 1 : 0) {
    aten_cat[(long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)] = ((((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)) / 1024ll<192ll ? 1 : 0) ? ((((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)) / 1024ll<128ll ? 1 : 0) ? __ldg(tinput0_60 + (long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)) : __ldg(tinput0_52 + ((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)) - 131072ll)) : __ldg(tout3_71 + ((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)) - 196608ll));
  }  aten_cat_1[(long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)] = ((((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)) / 4096ll<192ll ? 1 : 0) ? ((((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)) / 4096ll<128ll ? 1 : 0) ? __ldg(tinput0_42 + (long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)) : __ldg(tinput0_46 + ((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)) - 524288ll)) : __ldg(tout3_67 + ((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)) - 786432ll));
}
}
"
}
Chichibabin commented 4 months ago

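The issue was resolved by upgrading PyTorch and its CUDA runtime. (The nvrtc "invalid value for --gpu-architecture" error typically means the installed PyTorch/CUDA build is too old to compile kernels for the GPU's compute capability.) The following command installs versions that worked: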

conda install pytorch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 pytorch-cuda=11.8 -c pytorch -c nvidia