On Windows, I am unable to use candle with CUDA enabled. I'm using CUDA SDK 12.4, since that is the version supported by my PyTorch install, which runs fine. 🤔 Is Windows supported? Is there a specific CUDA SDK version that should be used? I'm not sure why it doesn't compile; there are hundreds of errors. The log is below, though the full output is probably longer.
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\include\sm_32_intrinsics.hpp(117): error: asm operand type size(8) does not match type/size implied by constraint 'r'
static __declspec(__device__) __inline short4 __ldg(const short4 *ptr) { short4 ret; asm volatile ("ld.global.nc.v4.s16 {%0,%1,%2,%3}, [%4];" : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : "r" (ptr)); return ret; }
^
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\include\sm_32_intrinsics.hpp(118): error: asm operand type size(8) does not match type/size implied by constraint 'r'
static __declspec(__device__) __inline int2 __ldg(const int2 *ptr) { int2 ret; asm volatile ("ld.global.nc.v2.s32 {%0,%1}, [%2];" : "=r"(ret.x), "=r"(ret.y) : "r" (ptr)); return ret; }
^
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\include\sm_32_intrinsics.hpp(119): error: asm operand type size(8) does not match type/size implied by constraint 'r'
static __declspec(__device__) __inline int4 __ldg(const int4 *ptr) { int4 ret; asm volatile ("ld.global.nc.v4.s32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : "r" (ptr)); return ret; }
^
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\include\sm_32_intrinsics.hpp(120): error: asm operand type size(8) does not match type/size implied by constraint 'r'
static __declspec(__device__) __inline longlong2 __ldg(const longlong2 *ptr) { longlong2 ret; asm volatile ("ld.global.nc.v2.s64 {%0,%1}, [%2];" : "=l"(ret.x), "=l"(ret.y) : "r" (ptr)); return ret; }
^
Error limit reached.
100 errors detected in the compilation of "src/cast.cu".
Compilation terminated.
cast.cu
--- stderr
thread 'main' panicked at C:\Users\Chase\.cargo\registry\src\index.crates.io-6f17d22bba15001f\bindgen_cuda-0.1.5\src\lib.rs:391:13:
nvcc error while compiling "src\\affine.cu":
# CLI "nvcc" "--gpu-architecture=sm_89" "--ptx" "--default-stream" "per-thread" "--output-directory" "E:\\Dev\\WebNN\\target\\debug\\build\\candle-kernels-fc639e1b2bb65e3d\\out" "-Isrc" "-IC:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v12.4\\include" "src\\affine.cu"
# stdout
# stderr
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace
warning: build failed, waiting for other jobs to finish...
On Windows, I am unable to use candle with CUDA enabled. I'm using CUDA SDK 12.4, since that is the version supported by my PyTorch install, which runs fine. 🤔 Is Windows supported? Is there a specific CUDA SDK version that should be used? I'm not sure why it doesn't compile; there are hundreds of errors. The log is included in this report, though the full output is probably longer.