Closed — Chichibabin closed this issue 4 months ago.
The error does not occur when running on CPU, but appears when using CUDA with the following error:
{ "name": "RuntimeError", "message": "nvrtc: error: invalid value for --gpu-architecture (-arch) nvrtc compilation failed: #define NAN __int_as_float(0x7fffffff) #define POS_INFINITY __int_as_float(0x7f800000) #define NEG_INFINITY __int_as_float(0xff800000) template<typename T> __device__ T maximum(T a, T b) { return isnan(a) ? a : (a > b ? a : b); } template<typename T> __device__ T minimum(T a, T b) { return isnan(a) ? a : (a < b ? a : b); } extern \"C\" __global__ void fused_cat_cat(float* tinput0_42, float* tinput0_46, float* tout3_67, float* tinput0_60, float* tinput0_52, float* tout3_71, float* aten_cat_1, float* aten_cat) { { if (blockIdx.x<512ll ? 1 : 0) { aten_cat[(long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)] = ((((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)) / 1024ll<192ll ? 1 : 0) ? ((((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)) / 1024ll<128ll ? 1 : 0) ? __ldg(tinput0_60 + (long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)) : __ldg(tinput0_52 + ((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)) - 131072ll)) : __ldg(tout3_71 + ((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)) - 196608ll)); } aten_cat_1[(long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)] = ((((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)) / 4096ll<192ll ? 1 : 0) ? ((((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)) / 4096ll<128ll ? 1 : 0) ? 
__ldg(tinput0_42 + (long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)) : __ldg(tinput0_46 + ((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)) - 524288ll)) : __ldg(tout3_67 + ((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)) - 786432ll)); } } ", "stack": "--------------------------------------------------------------------------- RuntimeError Traceback (most recent call last) Cell In[36], line 6 4 if predict_mode=='relative' and find_best_frame: 5 from demo import find_best_frame as _find ----> 6 i = _find(source_image, driving_video, device.type=='cpu') 7 print (\"Best frame: \" + str(i)) 8 driving_forward = driving_video[i:] File ~/autodl-tmp/shijieqi/Thin-Plate-Spline-Motion-Model/demo.py:109, in find_best_frame(source, driving, cpu) 105 return kp 107 fa = face_alignment.FaceAlignment(face_alignment.LandmarksType.TWO_D, flip_input=True, 108 device= 'cpu' if cpu else 'cuda') --> 109 kp_source = fa.get_landmarks(255 * source)[0] 110 kp_source = normalize_kp(kp_source) 111 norm = float('inf') File ~/autodl-tmp/shijieqi/TPSM/lib/python3.9/site-packages/face_alignment/api.py:110, in FaceAlignment.get_landmarks(self, image_or_path, detected_faces, return_bboxes, return_landmark_score) 98 def get_landmarks(self, image_or_path, detected_faces=None, return_bboxes=False, return_landmark_score=False): 99 \"\"\"Deprecated, please use get_landmarks_from_image 100 101 Arguments: (...) 108 return_landmark_score {boolean} -- If True, return the keypoint scores along with the keypoints. 
109 \"\"\" --> 110 return self.get_landmarks_from_image(image_or_path, detected_faces, return_bboxes, return_landmark_score) File ~/autodl-tmp/shijieqi/TPSM/lib/python3.9/site-packages/torch/autograd/grad_mode.py:28, in _DecoratorContextManager.__call__.<locals>.decorate_context(*args, **kwargs) 25 @functools.wraps(func) 26 def decorate_context(*args, **kwargs): 27 with self.__class__(): ---> 28 return func(*args, **kwargs) File ~/autodl-tmp/shijieqi/TPSM/lib/python3.9/site-packages/face_alignment/api.py:167, in FaceAlignment.get_landmarks_from_image(self, image_or_path, detected_faces, return_bboxes, return_landmark_score) 165 out = self.face_alignment_net(inp).detach() 166 if self.flip_input: --> 167 out += flip(self.face_alignment_net(flip(inp)).detach(), is_label=True) 168 out = out.cpu().numpy() 170 pts, pts_img, scores = get_preds_fromhm(out, center.numpy(), scale) File ~/autodl-tmp/shijieqi/TPSM/lib/python3.9/site-packages/torch/nn/modules/module.py:1102, in Module._call_impl(self, *input, **kwargs) 1098 # If we don't have any hooks, we want to skip the rest of the logic in 1099 # this function, and just call forward. 1100 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks 1101 or _global_forward_hooks or _global_forward_pre_hooks): -> 1102 return forward_call(*input, **kwargs) 1103 # Do not call functions when jit is used 1104 full_backward_hooks, non_full_backward_hooks = [], [] RuntimeError: nvrtc: error: invalid value for --gpu-architecture (-arch) nvrtc compilation failed: #define NAN __int_as_float(0x7fffffff) #define POS_INFINITY __int_as_float(0x7f800000) #define NEG_INFINITY __int_as_float(0xff800000) template<typename T> __device__ T maximum(T a, T b) { return isnan(a) ? a : (a > b ? a : b); } template<typename T> __device__ T minimum(T a, T b) { return isnan(a) ? a : (a < b ? 
a : b); } extern \"C\" __global__ void fused_cat_cat(float* tinput0_42, float* tinput0_46, float* tout3_67, float* tinput0_60, float* tinput0_52, float* tout3_71, float* aten_cat_1, float* aten_cat) { { if (blockIdx.x<512ll ? 1 : 0) { aten_cat[(long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)] = ((((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)) / 1024ll<192ll ? 1 : 0) ? ((((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)) / 1024ll<128ll ? 1 : 0) ? __ldg(tinput0_60 + (long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)) : __ldg(tinput0_52 + ((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)) - 131072ll)) : __ldg(tout3_71 + ((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)) - 196608ll)); } aten_cat_1[(long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)] = ((((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)) / 4096ll<192ll ? 1 : 0) ? ((((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)) / 4096ll<128ll ? 1 : 0) ? __ldg(tinput0_42 + (long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)) : __ldg(tinput0_46 + ((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)) - 524288ll)) : __ldg(tout3_67 + ((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)) - 786432ll)); } } " }
The issue was resolved by upgrading the CUDA and PyTorch versions with the following command:
conda install pytorch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 pytorch-cuda=11.8 -c pytorch -c nvidia
In summary: the error does not occur when running on CPU; it appears only when using CUDA, and upgrading PyTorch/CUDA as shown above resolves it.