CUDA 12.6 Support? - Githubissues

Hi, I'd like to know whether madrona supports CUDA 12.6 yet. I have been testing GPUDrive with various CUDA versions, and I have found that I am unable to compile on the following versions:

cuda12.1.1-cudnn8.9.0-devel-ubuntu22.04.2
cuda12.6.1-devel-ubuntu24.04
cuda 12.5 (I have not tried this version myself, but we received feedback from some users that they were not able to compile on 12.5 and had to downgrade to 12.4).

Running with the latest madrona commit on 12.6 (445062f), I get these errors on compilation -

(madrona) (base) aarav@emerge2-desktop:~/gpudrive/build$ ./headless CUDA 1
Compiling GPU engine code:
inlinable function call in a function with debug info must have a !dbg location
  call void @_ZN4cuda3std3__423__atomic_store_dispatchINS1_16__atomic_storageIiEEiNS1_25__thread_scope_device_tagELi0EEEvPT_T0_NS1_12memory_orderET1_(%struct._ZN4cuda3std3__416__atomic_storageIiEE* %15, i32 %16, i32 %17, %struct._ZN4cuda3std3__425__thread_scope_device_tagE %19)
inlinable function call in a function with debug info must have a !dbg location
  %15 = call i32 @_ZN4cuda3std3__422__atomic_load_dispatchINS1_16__atomic_storageIiEENS1_25__thread_scope_device_tagELi0EEENT_14__underlying_tEPKS6_NS1_12memory_orderET0_(%struct._ZN4cuda3std3__416__atomic_storageIiEE* %11, i32 %12, %struct._ZN4cuda3std3__425__thread_scope_device_tagE %14)
inlinable function call in a function with debug info must have a !dbg location
  %21 = call i32 @_ZN4cuda3std3__426__atomic_exchange_dispatchINS1_16__atomic_storageIiEEiNS1_25__thread_scope_device_tagELi0EEENT_14__underlying_tEPS6_T0_NS1_12memory_orderET1_(%struct._ZN4cuda3std3__416__atomic_storageIiEE* %16, i32 %17, i32 %18, %struct._ZN4cuda3std3__425__thread_scope_device_tagE %20)
inlinable function call in a function with debug info must have a !dbg location
  call void @_ZN4cuda3std3__423__atomic_store_dispatchINS1_16__atomic_storageIjEEjNS1_25__thread_scope_system_tagELi0EEEvPT_T0_NS1_12memory_orderET1_(%struct._ZN4cuda3std3__416__atomic_storageIjEE* %15, i32 %16, i32 %17, %struct._ZN4cuda3std3__425__thread_scope_system_tagE %19)
inlinable function call in a function with debug info must have a !dbg location
  %15 = call i32 @_ZN4cuda3std3__422__atomic_load_dispatchINS1_16__atomic_storageIjEENS1_25__thread_scope_system_tagELi0EEENT_14__underlying_tEPKS6_NS1_12memory_orderET0_(%struct._ZN4cuda3std3__416__atomic_storageIjEE* %11, i32 %12, %struct._ZN4cuda3std3__425__thread_scope_system_tagE %14)
inlinable function call in a function with debug info must have a !dbg location
  %21 = call i64 @_ZN4cuda3std3__427__atomic_fetch_add_dispatchINS1_16__atomic_storageIyEEyNS1_25__thread_scope_device_tagELi0EEENT_14__underlying_tEPS6_T0_NS1_12memory_orderET1_(%struct._ZN4cuda3std3__416__atomic_storageIyEE* %16, i64 %17, i32 %18, %struct._ZN4cuda3std3__425__thread_scope_device_tagE %20)
inlinable function call in a function with debug info must have a !dbg location
  call void @_ZN4cuda3std3__423__atomic_store_dispatchINS1_16__atomic_storageIiEEiNS1_25__thread_scope_device_tagELi0EEEvPT_T0_NS1_12memory_orderET1_(%struct._ZN4cuda3std3__416__atomic_storageIiEE* %6, i32 %4, i32 %5, %struct._ZN4cuda3std3__425__thread_scope_device_tagE zeroinitializer)
inlinable function call in a function with debug info must have a !dbg location
  %5 = call i32 @_ZN4cuda3std3__422__atomic_load_dispatchINS1_16__atomic_storageIiEENS1_25__thread_scope_device_tagELi0EEENT_14__underlying_tEPKS6_NS1_12memory_orderET0_(%struct._ZN4cuda3std3__416__atomic_storageIiEE* %4, i32 %3, %struct._ZN4cuda3std3__425__thread_scope_device_tagE zeroinitializer)
inlinable function call in a function with debug info must have a !dbg location
  %7 = call i32 @_ZN4cuda3std3__426__atomic_exchange_dispatchINS1_16__atomic_storageIiEEiNS1_25__thread_scope_device_tagELi0EEENT_14__underlying_tEPS6_T0_NS1_12memory_orderET1_(%struct._ZN4cuda3std3__416__atomic_storageIiEE* %6, i32 %4, i32 %5, %struct._ZN4cuda3std3__425__thread_scope_device_tagE zeroinitializer)
inlinable function call in a function with debug info must have a !dbg location
  call void @_ZN4cuda3std3__423__atomic_store_dispatchINS1_16__atomic_storageIjEEjNS1_25__thread_scope_system_tagELi0EEEvPT_T0_NS1_12memory_orderET1_(%struct._ZN4cuda3std3__416__atomic_storageIjEE* %6, i32 %4, i32 %5, %struct._ZN4cuda3std3__425__thread_scope_system_tagE zeroinitializer)
inlinable function call in a function with debug info must have a !dbg location
  %5 = call i32 @_ZN4cuda3std3__422__atomic_load_dispatchINS1_16__atomic_storageIjEENS1_25__thread_scope_system_tagELi0EEENT_14__underlying_tEPKS6_NS1_12memory_orderET0_(%struct._ZN4cuda3std3__416__atomic_storageIjEE* %4, i32 %3, %struct._ZN4cuda3std3__425__thread_scope_system_tagE zeroinitializer)
inlinable function call in a function with debug info must have a !dbg location
  %7 = call i64 @_ZN4cuda3std3__427__atomic_fetch_add_dispatchINS1_16__atomic_storageIyEEyNS1_25__thread_scope_device_tagELi0EEENT_14__underlying_tEPS6_T0_NS1_12memory_orderET1_(%struct._ZN4cuda3std3__416__atomic_storageIyEE* %6, i64 %4, i32 %5, %struct._ZN4cuda3std3__425__thread_scope_device_tagE zeroinitializer)
error: Broken module found, compilation aborted!

Error at /home/aarav/gpudrive/external/madrona/src/mw/cpp_compile.cpp:81 in auto madrona::cu::jitCompileCPPSrc(const char *, const char *, const char **, uint32_t, const char **, uint32_t, bool)::(anonymous class)::operator()() const
NVRTC_ERROR_COMPILATION
Aborted (core dumped)

I was able to solve this error by disabling debug mode here. But then I get this follow up error (with verbose compilation for more details) -

(madrona) (base) aarav@emerge2-desktop:~/gpudrive/build$ ./headless CUDA 1
Compiler Flags:
-I/home/aarav/gpudrive/external/madrona/src/mw/device/include
-I/home/aarav/gpudrive/external/madrona/src/common/../../include
-I/usr/local/cuda/targets/x86_64-linux/include
-std=c++20
-default-device
-rdc=true
-use_fast_math
-DMADRONA_GPU_MODE=1
-DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_CPP
-DCCCL_DISABLE_BF16_SUPPORT=1
-DCUB_DISABLE_BF16_SUPPORT=1
-arch
sm_89
-DMADRONA_MWGPU_NUM_SMS=(76_i32)
-DMADRONA_MWGPU_MAX_BLOCKS_PER_SM=(1_i32)
-dopt=on
--extra-device-vectorization
-lineinfo
-dlto
-DMADRONA_MWGPU_LTO_MODE=1
-DMADRONA_MWGPU_TASKGRAPH=1

Linker Flags:
-arch=sm_89
-ftz=1
-prec-div=0
-prec-sqrt=0
-fma=1
-optimize-unused-variables
-lineinfo
-lto
-verbose

Compiling GPU engine code:
/home/aarav/gpudrive/external/madrona/src/mw/device/memory.cpp
/home/aarav/gpudrive/external/madrona/src/mw/device/state.cpp
/home/aarav/gpudrive/external/madrona/src/common/../../include/madrona/sync.hpp(177): error: qualified name is not allowed
      cuda::atomic<T, cuda::thread_scope_device> impl_;
      ^

/home/aarav/gpudrive/external/madrona/src/common/../../include/madrona/sync.hpp(177): error: this declaration has no storage class or type specifier
      cuda::atomic<T, cuda::thread_scope_device> impl_;
      ^

/home/aarav/gpudrive/external/madrona/src/common/../../include/madrona/sync.hpp(177): error: expected a ";"
      cuda::atomic<T, cuda::thread_scope_device> impl_;
                  ^

/home/aarav/gpudrive/external/madrona/src/common/../../include/madrona/sync.hpp(328): error: qualified name is not allowed
      cuda::atomic_ref<T, cuda::thread_scope_device> ref_;
      ^

/home/aarav/gpudrive/external/madrona/src/common/../../include/madrona/sync.hpp(328): error: this declaration has no storage class or type specifier
      cuda::atomic_ref<T, cuda::thread_scope_device> ref_;
      ^

/home/aarav/gpudrive/external/madrona/src/common/../../include/madrona/sync.hpp(328): error: expected a ";"
      cuda::atomic_ref<T, cuda::thread_scope_device> ref_;
                      ^

/home/aarav/gpudrive/external/madrona/src/common/../../include/madrona/sync.hpp(332): error: identifier "ref_" is undefined
      static_assert(decltype(ref_)::is_always_lock_free);
                             ^

/home/aarav/gpudrive/external/madrona/src/common/../../include/madrona/sync.hpp(56): error: "impl_" is not a nonstatic data member or base class of class "madrona::Atomic<T>"
          : impl_(v)
            ^

/home/aarav/gpudrive/external/madrona/src/common/../../include/madrona/sync.hpp(58): error: identifier "impl_" is undefined
          static_assert(decltype(impl_)::is_always_lock_free);
                                 ^

/home/aarav/gpudrive/external/madrona/src/common/../../include/madrona/sync.hpp(96): error: identifier "impl_" is undefined
          return impl_.exchange(v, order);
                 ^

/home/aarav/gpudrive/external/madrona/src/common/../../include/madrona/sync.hpp(69): error: identifier "impl_" is undefined
          return impl_.load(sync::relaxed);
                 ^

/home/aarav/gpudrive/external/madrona/src/common/../../include/madrona/sync.hpp(90): error: identifier "impl_" is undefined
          impl_.store(v, sync::release);
          ^

/home/aarav/gpudrive/external/madrona/src/common/../../include/madrona/sync.hpp(85): error: identifier "impl_" is undefined
          impl_.store(v, sync::relaxed);
          ^

/home/aarav/gpudrive/external/madrona/src/mw/device/include/algorithm(3): catastrophic error: cannot open source file "cuda/std/__algorithm"
  #include <cuda/std/__algorithm>
                                 ^

 and 1 catastrophic error detected in the compilation of "/home/aarav/gpudrive/external/madrona/src/mw/device/state.cpp".
Compilation terminated.

Error at /home/aarav/gpudrive/external/madrona/src/mw/cpp_compile.cpp:100 in CompileOutput madrona::cu::jitCompileCPPSrc(const char *, const char *, const char **, uint32_t, const char **, uint32_t, bool)
NVRTC_ERROR_COMPILATION
Aborted (core dumped)

I also tried the same with the base docker image from nvidia with the same errors.

Thank you for your prompt reply. But I don't see any such problem.

This is my Python virtual environment:

Package                  Version    Editable project location
------------------------ ---------- --------------------------------------------
filelock                 3.16.1
fsspec                   2024.10.0
Jinja2                   3.1.4
madrona_escape_room      0.0.1      /home/zhangfan/workspace/madrona_escape_room
MarkupSafe               3.0.2
mpmath                   1.3.0
networkx                 3.4.2
numpy                    2.1.3
nvidia-cublas-cu12       12.4.5.8
nvidia-cuda-cupti-cu12   12.4.127
nvidia-cuda-nvrtc-cu12   12.4.127
nvidia-cuda-runtime-cu12 12.4.127
nvidia-cudnn-cu12        9.1.0.70
nvidia-cufft-cu12        11.2.1.3
nvidia-curand-cu12       10.3.5.147
nvidia-cusolver-cu12     11.6.1.9
nvidia-cusparse-cu12     12.3.1.170
nvidia-nccl-cu12         2.21.5
nvidia-nvjitlink-cu12    12.4.127
nvidia-nvtx-cu12         12.4.127
pip                      24.3.1
setuptools               75.6.0
sympy                    1.13.1
torch                    2.5.1
triton                   3.1.0
typing_extensions        4.12.2

This is my CUDA information from nvcc --version:

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Mar_28_02:18:24_PDT_2024
Cuda compilation tools, release 12.4, V12.4.131
Build cuda_12.4.r12.4/compiler.34097967_0

This is my NVIDIA information from nvidia-smi:

+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA GeForce RTX 4090 D      Off |   00000000:31:00.0 Off |                  Off |
| 30%   31C    P8             22W /  425W |       0MiB /  24564MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA GeForce RTX 4090 D      Off |   00000000:4B:00.0 Off |                  Off |
| 30%   31C    P8             22W /  425W |       0MiB /  24564MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|  No running processes found                                                             |
+-----------------------------------------------------------------------------------------+

This is the error I get after running python scripts/train.py --num-worlds 8192 --num-updates 5000 --profile-report --fp16 --gpu-sim --ckpt-dir build/checkpoints/

Compiling GPU engine code:
inlinable function call in a function with debug info must have a !dbg location
  call void @_ZN4cuda3std3__423__atomic_store_dispatchINS1_16__atomic_storageIiEEiNS1_25__thread_scope_device_tagELi0EEEvPT_T0_NS1_12memory_orderET1_(%struct._ZN4cuda3std3__416__atomic_storageIiEE* %15, i32 %16, i32 %17, %struct._ZN4cuda3std3__425__thread_scope_device_tagE %19)
inlinable function call in a function with debug info must have a !dbg location
  %15 = call i32 @_ZN4cuda3std3__422__atomic_load_dispatchINS1_16__atomic_storageIiEENS1_25__thread_scope_device_tagELi0EEENT_14__underlying_tEPKS6_NS1_12memory_orderET0_(%struct._ZN4cuda3std3__416__atomic_storageIiEE* %11, i32 %12, %struct._ZN4cuda3std3__425__thread_scope_device_tagE %14)
inlinable function call in a function with debug info must have a !dbg location
  %21 = call i32 @_ZN4cuda3std3__426__atomic_exchange_dispatchINS1_16__atomic_storageIiEEiNS1_25__thread_scope_device_tagELi0EEENT_14__underlying_tEPS6_T0_NS1_12memory_orderET1_(%struct._ZN4cuda3std3__416__atomic_storageIiEE* %16, i32 %17, i32 %18, %struct._ZN4cuda3std3__425__thread_scope_device_tagE %20)
inlinable function call in a function with debug info must have a !dbg location
  call void @_ZN4cuda3std3__423__atomic_store_dispatchINS1_16__atomic_storageIjEEjNS1_25__thread_scope_system_tagELi0EEEvPT_T0_NS1_12memory_orderET1_(%struct._ZN4cuda3std3__416__atomic_storageIjEE* %15, i32 %16, i32 %17, %struct._ZN4cuda3std3__425__thread_scope_system_tagE %19)
inlinable function call in a function with debug info must have a !dbg location
  %15 = call i32 @_ZN4cuda3std3__422__atomic_load_dispatchINS1_16__atomic_storageIjEENS1_25__thread_scope_system_tagELi0EEENT_14__underlying_tEPKS6_NS1_12memory_orderET0_(%struct._ZN4cuda3std3__416__atomic_storageIjEE* %11, i32 %12, %struct._ZN4cuda3std3__425__thread_scope_system_tagE %14)
inlinable function call in a function with debug info must have a !dbg location
  %21 = call i64 @_ZN4cuda3std3__427__atomic_fetch_add_dispatchINS1_16__atomic_storageIyEEyNS1_25__thread_scope_device_tagELi0EEENT_14__underlying_tEPS6_T0_NS1_12memory_orderET1_(%struct._ZN4cuda3std3__416__atomic_storageIyEE* %16, i64 %17, i32 %18, %struct._ZN4cuda3std3__425__thread_scope_device_tagE %20)
inlinable function call in a function with debug info must have a !dbg location
  call void @_ZN4cuda3std3__423__atomic_store_dispatchINS1_16__atomic_storageIiEEiNS1_25__thread_scope_device_tagELi0EEEvPT_T0_NS1_12memory_orderET1_(%struct._ZN4cuda3std3__416__atomic_storageIiEE* %6, i32 %4, i32 %5, %struct._ZN4cuda3std3__425__thread_scope_device_tagE zeroinitializer)
inlinable function call in a function with debug info must have a !dbg location
  %5 = call i32 @_ZN4cuda3std3__422__atomic_load_dispatchINS1_16__atomic_storageIiEENS1_25__thread_scope_device_tagELi0EEENT_14__underlying_tEPKS6_NS1_12memory_orderET0_(%struct._ZN4cuda3std3__416__atomic_storageIiEE* %4, i32 %3, %struct._ZN4cuda3std3__425__thread_scope_device_tagE zeroinitializer)
inlinable function call in a function with debug info must have a !dbg location
  %7 = call i32 @_ZN4cuda3std3__426__atomic_exchange_dispatchINS1_16__atomic_storageIiEEiNS1_25__thread_scope_device_tagELi0EEENT_14__underlying_tEPS6_T0_NS1_12memory_orderET1_(%struct._ZN4cuda3std3__416__atomic_storageIiEE* %6, i32 %4, i32 %5, %struct._ZN4cuda3std3__425__thread_scope_device_tagE zeroinitializer)
inlinable function call in a function with debug info must have a !dbg location
  call void @_ZN4cuda3std3__423__atomic_store_dispatchINS1_16__atomic_storageIjEEjNS1_25__thread_scope_system_tagELi0EEEvPT_T0_NS1_12memory_orderET1_(%struct._ZN4cuda3std3__416__atomic_storageIjEE* %6, i32 %4, i32 %5, %struct._ZN4cuda3std3__425__thread_scope_system_tagE zeroinitializer)
inlinable function call in a function with debug info must have a !dbg location
  %5 = call i32 @_ZN4cuda3std3__422__atomic_load_dispatchINS1_16__atomic_storageIjEENS1_25__thread_scope_system_tagELi0EEENT_14__underlying_tEPKS6_NS1_12memory_orderET0_(%struct._ZN4cuda3std3__416__atomic_storageIjEE* %4, i32 %3, %struct._ZN4cuda3std3__425__thread_scope_system_tagE zeroinitializer)
inlinable function call in a function with debug info must have a !dbg location
  %7 = call i64 @_ZN4cuda3std3__427__atomic_fetch_add_dispatchINS1_16__atomic_storageIyEEyNS1_25__thread_scope_device_tagELi0EEENT_14__underlying_tEPS6_T0_NS1_12memory_orderET1_(%struct._ZN4cuda3std3__416__atomic_storageIyEE* %6, i64 %4, i32 %5, %struct._ZN4cuda3std3__425__thread_scope_device_tagE zeroinitializer)
Error: Broken module found, compilation aborted!

Error at /home/zhangfan/workspace/madrona_escape_room/external/madrona/src/mw/cpp_compile.cpp:81 in auto madrona::cu::jitCompileCPPSrc(const char *, const char *, const char **, uint32_t, const char **, uint32_t, bool)::(anonymous class)::operator()() const
NVRTC_ERROR_COMPILATION
Aborted

When I comment out line 1323 of cuda_exec.cpp, fast_compile_flags.push_back("-G");, I get this error instead:

Compiling GPU engine code:
/home/zhangfan/workspace/madrona_escape_room/external/madrona/src/common/../../include/madrona/sync.hpp(177): error: qualified name is not allowed
      cuda::atomic<T, cuda::thread_scope_device> impl_;
      ^

/home/zhangfan/workspace/madrona_escape_room/external/madrona/src/common/../../include/madrona/sync.hpp(177): error: this declaration has no storage class or type specifier
      cuda::atomic<T, cuda::thread_scope_device> impl_;
      ^

/home/zhangfan/workspace/madrona_escape_room/external/madrona/src/common/../../include/madrona/sync.hpp(177): error: expected a ";"
      cuda::atomic<T, cuda::thread_scope_device> impl_;
                  ^

/home/zhangfan/workspace/madrona_escape_room/external/madrona/src/common/../../include/madrona/sync.hpp(328): error: qualified name is not allowed
      cuda::atomic_ref<T, cuda::thread_scope_device> ref_;
      ^

/home/zhangfan/workspace/madrona_escape_room/external/madrona/src/common/../../include/madrona/sync.hpp(328): error: this declaration has no storage class or type specifier
      cuda::atomic_ref<T, cuda::thread_scope_device> ref_;
      ^

/home/zhangfan/workspace/madrona_escape_room/external/madrona/src/common/../../include/madrona/sync.hpp(328): error: expected a ";"
      cuda::atomic_ref<T, cuda::thread_scope_device> ref_;
                      ^

/home/zhangfan/workspace/madrona_escape_room/external/madrona/src/common/../../include/madrona/sync.hpp(332): error: identifier "ref_" is undefined
      static_assert(decltype(ref_)::is_always_lock_free);
                             ^

/home/zhangfan/workspace/madrona_escape_room/external/madrona/src/common/../../include/madrona/sync.hpp(56): error: "impl_" is not a nonstatic data member or base class of class "madrona::Atomic<T>"
          : impl_(v)
            ^

/home/zhangfan/workspace/madrona_escape_room/external/madrona/src/common/../../include/madrona/sync.hpp(58): error: identifier "impl_" is undefined
          static_assert(decltype(impl_)::is_always_lock_free);
                                 ^

/home/zhangfan/workspace/madrona_escape_room/external/madrona/src/common/../../include/madrona/sync.hpp(96): error: identifier "impl_" is undefined
          return impl_.exchange(v, order);
                 ^

/home/zhangfan/workspace/madrona_escape_room/external/madrona/src/common/../../include/madrona/sync.hpp(69): error: identifier "impl_" is undefined
          return impl_.load(sync::relaxed);
                 ^

/home/zhangfan/workspace/madrona_escape_room/external/madrona/src/common/../../include/madrona/sync.hpp(90): error: identifier "impl_" is undefined
          impl_.store(v, sync::release);
          ^

/home/zhangfan/workspace/madrona_escape_room/external/madrona/src/common/../../include/madrona/sync.hpp(85): error: identifier "impl_" is undefined
          impl_.store(v, sync::relaxed);
          ^

/home/zhangfan/workspace/madrona_escape_room/external/madrona/src/mw/device/include/algorithm(3): catastrophic error: cannot open source file "cuda/std/__algorithm"
  #include <cuda/std/__algorithm>
                                 ^

 and 1 catastrophic error detected in the compilation of "/home/zhangfan/workspace/madrona_escape_room/external/madrona/src/mw/device/state.cpp".
Compilation terminated.

Error at /home/zhangfan/workspace/madrona_escape_room/external/madrona/src/mw/cpp_compile.cpp:100 in CompileOutput madrona::cu::jitCompileCPPSrc(const char *, const char *, const char **, uint32_t, const char **, uint32_t, bool)
NVRTC_ERROR_COMPILATION
Aborted

shacklettbp / madrona

CUDA 12.6 Support? #39