Windows Cuda Compatibility?

On windows unable to use candle with cuda enabled. I'm using Cuda SDK 12.4 since that is what is supported by my pytorch install which runs fine. 🤔 Is Windows supported? Is there a specific Cuda SDK version that should be used? Not sure why it doesn't compile. There are hundreds of errors. Log below, but could be longer probably.

  C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\include\sm_32_intrinsics.hpp(117): error: asm operand type size(8) does not match type/size implied by constraint 'r'
    static __declspec(__device__) __inline short4 __ldg(const short4 *ptr) { short4 ret; asm volatile ("ld.global.nc.v4.s16 {%0,%1,%2,%3}, [%4];"  : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : "r" (ptr)); return ret; }
                                                                                                                                                                                                          ^ 

  C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\include\sm_32_intrinsics.hpp(118): error: asm operand type size(8) does not match type/size implied by constraint 'r'
    static __declspec(__device__) __inline int2 __ldg(const int2 *ptr) { int2 ret; asm volatile ("ld.global.nc.v2.s32 {%0,%1}, [%2];"  : "=r"(ret.x), "=r"(ret.y) : "r" (ptr)); return ret; }
                                                                                                                                                                    ^

  C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\include\sm_32_intrinsics.hpp(119): error: asm operand type size(8) does not match type/size implied by constraint 'r'
    static __declspec(__device__) __inline int4 __ldg(const int4 *ptr) { int4 ret; asm volatile ("ld.global.nc.v4.s32 {%0,%1,%2,%3}, [%4];"  : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : "r" (ptr)); return ret; }
                                                                                                                                                                                                    ^       

  C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\include\sm_32_intrinsics.hpp(120): error: asm operand type size(8) does not match type/size implied by constraint 'r'
    static __declspec(__device__) __inline longlong2 __ldg(const longlong2 *ptr) { longlong2 ret; asm volatile ("ld.global.nc.v2.s64 {%0,%1}, [%2];"  : "=l"(ret.x), "=l"(ret.y) : "r" (ptr)); return ret; }
                                                                                                                                                                                   ^

  Error limit reached.
  100 errors detected in the compilation of "src/cast.cu".
  Compilation terminated.
  cast.cu

  --- stderr
  thread 'main' panicked at C:\Users\Chase\.cargo\registry\src\index.crates.io-6f17d22bba15001f\bindgen_cuda-0.1.5\src\lib.rs:391:13:
  nvcc error while compiling "src\\affine.cu":

  # CLI "nvcc" "--gpu-architecture=sm_89" "--ptx" "--default-stream" "per-thread" "--output-directory" "E:\\Dev\\WebNN\\target\\debug\\build\\candle-kernels-fc639e1b2bb65e3d\\out" "-Isrc" "-IC:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v12.4\\include" "src\\affine.cu"

  # stdout

  # stderr

  note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace
warning: build failed, waiting for other jobs to finish...

huggingface / candle

Windows Cuda Compatibility? #2633