This has been around for a while now - I'm currently investigating it starting from 4c34e1788c80b12e083e0f81a6889600aca239df.
Adding a couple of checks earlier in the function:
diff --git a/src/cuda/CUDACore/CachingDeviceAllocator.h b/src/cuda/CUDACore/CachingDeviceAllocator.h
index 50c1ebd..44b1fae 100644
--- a/src/cuda/CUDACore/CachingDeviceAllocator.h
+++ b/src/cuda/CUDACore/CachingDeviceAllocator.h
@@ -556,6 +556,7 @@ namespace notcub {
cudaError_t DeviceFree(int device, void *d_ptr) {
int entrypoint_device = INVALID_DEVICE_ORDINAL;
cudaError_t error = cudaSuccess;
+ cudaCheck(error = cudaGetLastError()); // line 559
// CMS: use RAII instead of (un)locking explicitly
std::unique_lock<std::mutex> mutex_locker(mutex, std::defer_lock);
@@ -612,6 +613,7 @@ namespace notcub {
}
if (recached) {
+ cudaCheck(error = cudaGetLastError()); // line 616
// Insert the ready event in the associated stream (must have current device set properly)
// CMS: throw exception on error
cudaCheck(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream));
shows that - at least in some cases - the error is a pre-existing condition; the call to DeviceFree is simply the first place that checks for errors:
terminate called after throwing an instance of 'std::runtime_error'
what():
.../pixeltrack-standalone/src/cuda/CUDACore/CachingDeviceAllocator.h, line 559:
cudaCheck(error = cudaGetLastError());
cudaErrorIllegalAddress: an illegal memory access was encountered
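This matches the asynchronous error reporting of the CUDA runtime: a kernel that makes an illegal access does not fail at launch time; the sticky error is reported by whatever API call happens to check the status later, possibly in completely unrelated code. A minimal standalone illustration (hypothetical kernel, not code from this repository):

#include <cstdio>
#include <cuda_runtime.h>

// Deliberately faulty kernel: dereferences a null device pointer.
__global__ void badStore(int* p) { *p = 42; }

int main() {
  badStore<<<1, 1>>>(nullptr);
  // The launch is asynchronous and typically reports no error yet...
  std::printf("after launch: %s\n", cudaGetErrorString(cudaGetLastError()));
  // ...the illegal access only surfaces at a later call that observes the
  // device status - here a synchronize, in the framework e.g. a DeviceFree().
  std::printf("after sync:   %s\n", cudaGetErrorString(cudaDeviceSynchronize()));
  return 0;
}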
After adding the capability to dump a stack trace when a CUDA error is detected (see #189), it looks like these illegal memory access errors happen mostly when CUDA memory is freed in the destructor of TrackingRecHit2DHeterogeneous:
/home/cern/pixeltrack-standalone/src/cuda/CUDACore/CachingDeviceAllocator.h, line 559:
cudaCheck(error = cudaGetLastError());
cudaErrorIllegalAddress: an illegal memory access was encountered
Current stack trace:
0# cms::cuda::abortOnCudaError(char const*, int, char const*, char const*, char const*, char const*) at /home/cern/pixeltrack-standalone/src/cuda/CUDACore/cudaCheck.h:35
1# 0x00007F47541E637B at /home/cern/pixeltrack-standalone/src/cuda/CUDACore/cudaCheck.h:61
2# cms::cuda::free_device(int, void*, CUstream_st*) at /home/cern/pixeltrack-standalone/src/cuda/CUDACore/allocate_device.cc:40
3# TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits>::~TrackingRecHit2DHeterogeneous() at /home/cern/pixeltrack-standalone/src/cuda/CUDADataFormats/TrackingRecHit2DHeterogeneous.h:22
4# pixelgpudetails::PixelRecHitGPUKernel::makeHitsAsync(SiPixelDigisCUDA const&, SiPixelClustersCUDA const&, BeamSpotCUDA const&, pixelCPEforGPU::ParamsOnGPU const*, CUstream_st*) const [clone .cold.119] at /usr/include/c++/8/bits/basic_string.tcc:138
5# SiPixelRecHitCUDA::produce(edm::Event&, edm::EventSetup const&) at /home/cern/pixeltrack-standalone/src/cuda/plugin-SiPixelRecHits/SiPixelRecHitCUDA.cc:58
6# edm::FunctorWaitingTask<edm::WorkerT<SiPixelRecHitCUDA>::doWorkAsync(edm::Event&, edm::EventSetup const&, edm::WaitingTask*)::{lambda(std::__exception_ptr::exception_ptr const*)#1}>::execute() at /home/cern/pixeltrack-standalone/src/cuda/Framework/WaitingTask.h:78
7# tbb::internal::custom_scheduler<tbb::internal::IntelSchedulerTraits>::process_bypass_loop(tbb::internal::context_guard_helper<false>&, tbb::task*, long) at ../../src/tbb/custom_scheduler.h:469
8# tbb::internal::custom_scheduler<tbb::internal::IntelSchedulerTraits>::local_wait_for_all(tbb::task&, tbb::task*) at ../../src/tbb/custom_scheduler.h:631
9# edm::EventProcessor::runToCompletion() at /home/cern/pixeltrack-standalone/src/cuda/bin/EventProcessor.cc:36
10# main at /home/cern/pixeltrack-standalone/src/cuda/bin/main.cc:147
11# __libc_start_main in /lib64/libc.so.6
12# _start in ./cuda
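For reference, the stack-trace dump from #189 hooks into the cudaCheck failure path; the "Current stack trace:" output above is the kind of report Boost.Stacktrace produces. A rough sketch of that mechanism (not the actual cudaCheck.h, names simplified):

#include <sstream>
#include <stdexcept>
#include <boost/stacktrace.hpp>
#include <cuda_runtime.h>

// On a failed CUDA call, throw a std::runtime_error carrying the file/line,
// the failing expression, the CUDA error, and the current call stack.
[[noreturn]] inline void abortOnCudaError(const char* file, int line, const char* cmd, cudaError_t err) {
  std::ostringstream out;
  out << file << ", line " << line << ":\n"
      << "cudaCheck(" << cmd << ");\n"
      << cudaGetErrorName(err) << ": " << cudaGetErrorString(err) << "\n"
      << "Current stack trace:\n"
      << boost::stacktrace::stacktrace();
  throw std::runtime_error(out.str());
}

#define cudaCheck(CMD)                                        \
  do {                                                        \
    cudaError_t result__ = (CMD);                             \
    if (result__ != cudaSuccess)                              \
      abortOnCudaError(__FILE__, __LINE__, #CMD, result__);   \
  } while (false)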
Out of 500 failures, I got the following breakdown of frame #3:
1 3# edm::Wrapper<cms::cuda::Product<HeterogeneousSoA<ZVertexSoA> > >::~Wrapper() at /home/cern/pixeltrack-standalone/src/cuda/Framework/Event.h:20
494 3# TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits>::~TrackingRecHit2DHeterogeneous() at /home/cern/pixeltrack-standalone/src/cuda/CUDADataFormats/TrackingRecHit2DHeterogeneous.h:22
5 3# TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits>::TrackingRecHit2DHeterogeneous(unsigned int, pixelCPEforGPU::ParamsOnGPU const*, unsigned int const*, CUstream_st*) at /home/cern/pixeltrack-standalone/src/cuda/CUDADataFormats/TrackingRecHit2DHeterogeneous.h:79
I'd say TrackingRecHit2DHeterogeneous seems like the first candidate to investigate.
I'm a bit puzzled why PixelRecHitGPUKernel::makeHitsAsync() ends up calling a destructor of TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits>, especially in a way that leads to calling cms::cuda::free_device() (in the absence of an exception being thrown). The TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits> is returned here
https://github.com/cms-patatrack/pixeltrack-standalone/blob/4c34e1788c80b12e083e0f81a6889600aca239df/src/cuda/plugin-SiPixelRecHits/PixelRecHits.cu#L75
and at minimum hits_d should get moved to the caller (ideally even the move would be elided).
Is it possible that the elision does not happen because we have deleted the copy constructor and copy assignment?
I'd find it hard (but not impossible) to believe that deleting the copy constructor and assignment would lead the compiler to not elide the "copy", given that one can return an object of a non-copyable, non-movable type (although IIRC in that case the object needs to be constructed in the return statement).
But even if the "copy" is not elided, we should see a move, and the destructor called in makeHitsAsync() should then run on an object whose unique_ptrs are null, which should not lead to calling cms::cuda::free_device().
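To make the expectation explicit, here is a small standalone sketch of a move-only type holding a unique_ptr, loosely mimicking how hits_d should behave when returned (all names here are made up): whether the return is elided or moved, the deleter must run exactly once, in the caller.

#include <cstdio>
#include <memory>

// Counts how many times the "device memory" is released.
struct CountingDeleter {
  void operator()(int* p) const {
    std::puts("deleter called");
    delete p;
  }
};

// Move-only type, copies deleted as in TrackingRecHit2DHeterogeneous.
struct Hits {
  Hits() : ptr(new int(0)) {}
  Hits(Hits&&) = default;
  Hits& operator=(Hits&&) = default;
  Hits(const Hits&) = delete;
  Hits& operator=(const Hits&) = delete;
  std::unique_ptr<int, CountingDeleter> ptr;
};

Hits makeHits() {
  Hits h;    // local object, analogous to hits_d
  return h;  // NRVO, or a move; a moved-from h holds a null unique_ptr
}

int main() {
  Hits h = makeHits();
  // "deleter called" must print exactly once, when h is destroyed here.
  return 0;
}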
Some more statistics... out of 30 tests, with a total of 120 jobs:
terminate called after throwing an instance of 'std::runtime_error'
what():
/home/cern/pixeltrack-standalone/src/cuda/CUDACore/CachingDeviceAllocator.h, line 559:
cudaCheck(error = cudaGetLastError());
cudaErrorIllegalAddress: an illegal memory access was encountered
Current stack trace:
0# cms::cuda::abortOnCudaError(char const*, int, char const*, char const*, char const*, char const*) at /home/cern/pixeltrack-standalone/src/cuda/CUDACore/cudaCheck.h:35
1# 0x00007FF5CC3D637B at /home/cern/pixeltrack-standalone/src/cuda/CUDACore/cudaCheck.h:61
2# cms::cuda::free_device(int, void*, CUstream_st*) at /home/cern/pixeltrack-standalone/src/cuda/CUDACore/allocate_device.cc:40
3# CAHitNtupletGeneratorKernels
The error can happen in the main thread, as in
...
6# edm::FunctorWaitingTask<edm::WorkerT
or in a thread spawned by TBB:
6# edm::FunctorWaitingTask<edm::WorkerT
The error may be detected before
/home/cern/pixeltrack-standalone/src/cuda/CUDACore/CachingDeviceAllocator.h, line 559: cudaCheck(error = cudaGetLastError()); cudaErrorIllegalAddress: an illegal memory access was encountered
or after
/home/cern/pixeltrack-standalone/src/cuda/CUDACore/CachingDeviceAllocator.h, line 619: cudaCheck(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream)); cudaErrorIllegalAddress: an illegal memory access was encountered
freeing the device memory.
The important details do not change:
3# CAHitNtupletGeneratorKernels
---
- 1 failed with
Found 1 devices
Processing 1000 events, of which 3 concurrently, with 3 threads.
terminate called after throwing an instance of 'std::runtime_error'
what():
/home/cern/pixeltrack-standalone/src/cuda/CUDACore/CachingDeviceAllocator.h, line 559:
cudaCheck(error = cudaGetLastError());
cudaErrorIllegalAddress: an illegal memory access was encountered
Current stack trace:
0# cms::cuda::abortOnCudaError(char const*, int, char const*, char const*, char const*, char const*) at /home/cern/pixeltrack-standalone/src/cuda/CUDACore/cudaCheck.h:35
1# 0x00007F4E4563C37B at /home/cern/pixeltrack-standalone/src/cuda/CUDACore/cudaCheck.h:61
2# cms::cuda::free_device(int, void*, CUstream_st*) at /home/cern/pixeltrack-standalone/src/cuda/CUDACore/allocate_device.cc:40
3# HeterogeneousSoA<TrackSoAT<32768> >::~HeterogeneousSoA() at /home/cern/pixeltrack-standalone/src/cuda/CUDADataFormats/HeterogeneousSoA.h:18
4# CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DHeterogeneous
---
- 1 failed with
terminate called after throwing an instance of 'std::runtime_error'
what():
/home/cern/pixeltrack-standalone/src/cuda/CUDACore/CachingDeviceAllocator.h, line 559:
cudaCheck(error = cudaGetLastError());
cudaErrorIllegalAddress: an illegal memory access was encountered
Current stack trace:
0# cms::cuda::abortOnCudaError(char const*, int, char const*, char const*, char const*, char const*) at /home/cern/pixeltrack-standalone/src/cuda/CUDACore/cudaCheck.h:35
1# 0x00007FE07F9BA37B at /home/cern/pixeltrack-standalone/src/cuda/CUDACore/cudaCheck.h:61
2# cms::cuda::free_device(int, void*, CUstream_st*) at /home/cern/pixeltrack-standalone/src/cuda/CUDACore/allocate_device.cc:40
3# edm::Wrapper<cms::cuda::Product<HeterogeneousSoA
---
- 3 failed with
terminate called after throwing an instance of 'std::runtime_error'
what():
/home/cern/pixeltrack-standalone/src/cuda/CUDACore/CachingDeviceAllocator.h, line 559:
cudaCheck(error = cudaGetLastError());
cudaErrorIllegalAddress: an illegal memory access was encountered
Current stack trace:
0# cms::cuda::abortOnCudaError(char const*, int, char const*, char const*, char const*, char const*) at /home/cern/pixeltrack-standalone/src/cuda/CUDACore/cudaCheck.h:35
1# 0x00007F78F939F37B at /home/cern/pixeltrack-standalone/src/cuda/CUDACore/cudaCheck.h:61
2# cms::cuda::free_device(int, void*, CUstream_st*) at /home/cern/pixeltrack-standalone/src/cuda/CUDACore/allocate_device.cc:40
3# edm::Wrapper<cms::cuda::Product
Recompiling the cuda binary and libraries with -O0 (on the host):
terminate called after throwing an instance of 'std::runtime_error'
what():
/home/cern/pixeltrack-standalone/src/cuda/CUDACore/CachingDeviceAllocator.h, line 559:
cudaCheck(error = cudaGetLastError());
cudaErrorIllegalAddress: an illegal memory access was encountered
Current stack trace:
0# cms::cuda::abortOnCudaError(char const*, int, char const*, char const*, char const*, char const*) at /home/cern/pixeltrack-standalone/src/cuda/CUDACore/cudaCheck.h:35
1# cms::cuda::cudaCheck_(char const*, int, char const*, cudaError, char const*) at /home/cern/pixeltrack-standalone/src/cuda/CUDACore/cudaCheck.h:63
2# notcub::CachingDeviceAllocator::DeviceFree(int, void*) at /home/cern/pixeltrack-standalone/src/cuda/CUDACore/CachingDeviceAllocator.h:561
3# cms::cuda::free_device(int, void*, CUstream_st*) at /home/cern/pixeltrack-standalone/src/cuda/CUDACore/allocate_device.cc:40
4# cms::cuda::device::impl::DeviceDeleter::operator()(void*) at /home/cern/pixeltrack-standalone/src/cuda/CUDACore/device_unique_ptr.h:26
5# std::unique_ptr<unsigned long long [], cms::cuda::device::impl::DeviceDeleter>::~unique_ptr() in /home/cern/pixeltrack-standalone/lib/cuda/pluginPixelTriplets.so
6# CAHitNtupletGeneratorKernels
---
- 2 jobs failed with
terminate called after throwing an instance of 'std::runtime_error'
what():
/home/cern/pixeltrack-standalone/src/cuda/CUDACore/CachingDeviceAllocator.h, line 559:
cudaCheck(error = cudaGetLastError());
cudaErrorIllegalAddress: an illegal memory access was encountered
Current stack trace:
0# cms::cuda::abortOnCudaError(char const*, int, char const*, char const*, char const*, char const*) at /home/cern/pixeltrack-standalone/src/cuda/CUDACore/cudaCheck.h:35
1# cms::cuda::cudaCheck_(char const*, int, char const*, cudaError, char const*) at /home/cern/pixeltrack-standalone/src/cuda/CUDACore/cudaCheck.h:63
2# notcub::CachingDeviceAllocator::DeviceFree(int, void*) at /home/cern/pixeltrack-standalone/src/cuda/CUDACore/CachingDeviceAllocator.h:561
3# cms::cuda::free_device(int, void*, CUstream_st*) at /home/cern/pixeltrack-standalone/src/cuda/CUDACore/allocate_device.cc:40
4# cms::cuda::device::impl::DeviceDeleter::operator()(void*) at /home/cern/pixeltrack-standalone/src/cuda/CUDACore/device_unique_ptr.h:26
5# std::unique_ptr<BeamSpotPOD, cms::cuda::device::impl::DeviceDeleter>::~unique_ptr() at /usr/include/c++/8/bits/unique_ptr.h:270
6# BeamSpotCUDA::~BeamSpotCUDA() at /home/cern/pixeltrack-standalone/src/cuda/CUDADataFormats/BeamSpotCUDA.h:9
7# cms::cuda::Product
---
- 3 jobs failed with
terminate called after throwing an instance of 'std::runtime_error'
what():
/home/cern/pixeltrack-standalone/src/cuda/CUDACore/CachingDeviceAllocator.h, line 559:
cudaCheck(error = cudaGetLastError());
cudaErrorIllegalAddress: an illegal memory access was encountered
Current stack trace:
0# cms::cuda::abortOnCudaError(char const*, int, char const*, char const*, char const*, char const*) at /home/cern/pixeltrack-standalone/src/cuda/CUDACore/cudaCheck.h:35
1# cms::cuda::cudaCheck_(char const*, int, char const*, cudaError, char const*) at /home/cern/pixeltrack-standalone/src/cuda/CUDACore/cudaCheck.h:63
2# notcub::CachingDeviceAllocator::DeviceFree(int, void*) at /home/cern/pixeltrack-standalone/src/cuda/CUDACore/CachingDeviceAllocator.h:561
3# cms::cuda::free_device(int, void*, CUstream_st*) at /home/cern/pixeltrack-standalone/src/cuda/CUDACore/allocate_device.cc:40
4# cms::cuda::device::impl::DeviceDeleter::operator()(void*) at /home/cern/pixeltrack-standalone/src/cuda/CUDACore/device_unique_ptr.h:26
5# std::unique_ptr<cms::cuda::HistoContainer<unsigned int, 8u, 24576u, 32u, unsigned short, 1u>, cms::cuda::device::impl::DeviceDeleter>::~unique_ptr() in /home/cern/pixeltrack-standalone/lib/cuda/pluginPixelTriplets.so
6# CAHitNtupletGeneratorKernels
To summarise, the errors always happen (or are noticed) when returning memory to the caching allocator via cms::cuda::free_device, which in turn always happens in the destructor of a device unique_ptr:
CAHitNtupletCUDA
-O2
3# CAHitNtupletGeneratorKernels<cms::cudacompat::GPUTraits>::~CAHitNtupletGeneratorKernels() at /home/cern/pixeltrack-standalone/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.h:163
4# CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits> const&, float, CUstream_st*) const [clone .cold.133] at /home/cern/pixeltrack-standalone/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.cc:116
5# CAHitNtupletCUDA::produce(edm::Event&, edm::EventSetup const&) at /home/cern/pixeltrack-standalone/src/cuda/plugin-PixelTriplets/CAHitNtupletCUDA.cc:41
-O2
3# HeterogeneousSoA<TrackSoAT<32768> >::~HeterogeneousSoA() at /home/cern/pixeltrack-standalone/src/cuda/CUDADataFormats/HeterogeneousSoA.h:18
4# CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits> const&, float, CUstream_st*) const [clone .cold.133] at /home/cern/pixeltrack-standalone/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.cc:116
5# CAHitNtupletCUDA::produce(edm::Event&, edm::EventSetup const&) at /home/cern/pixeltrack-standalone/src/cuda/plugin-PixelTriplets/CAHitNtupletCUDA.cc:41
-O0
5# std::unique_ptr<unsigned long long [], cms::cuda::device::impl::DeviceDeleter>::~unique_ptr() in /home/cern/pixeltrack-standalone/lib/cuda/pluginPixelTriplets.so
6# CAHitNtupletGeneratorKernels<cms::cudacompat::GPUTraits>::~CAHitNtupletGeneratorKernels() in /home/cern/pixeltrack-standalone/lib/cuda/pluginPixelTriplets.so
7# CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits> const&, float, CUstream_st*) const at /home/cern/pixeltrack-standalone/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.cc:116
8# CAHitNtupletCUDA::produce(edm::Event&, edm::EventSetup const&) at /home/cern/pixeltrack-standalone/src/cuda/plugin-PixelTriplets/CAHitNtupletCUDA.cc:41
-O0
5# std::unique_ptr<cms::cuda::HistoContainer<unsigned int, 8u, 24576u, 32u, unsigned short, 1u>, cms::cuda::device::impl::DeviceDeleter>::~unique_ptr() in /home/cern/pixeltrack-standalone/lib/cuda/pluginPixelTriplets.so
6# CAHitNtupletGeneratorKernels<cms::cudacompat::GPUTraits>::~CAHitNtupletGeneratorKernels() in /home/cern/pixeltrack-standalone/lib/cuda/pluginPixelTriplets.so
7# CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits> const&, float, CUstream_st*) const at /home/cern/pixeltrack-standalone/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.cc:116
8# CAHitNtupletCUDA::produce(edm::Event&, edm::EventSetup const&) at /home/cern/pixeltrack-standalone/src/cuda/plugin-PixelTriplets/CAHitNtupletCUDA.cc:41
WaitingTask
-O2
3# edm::Wrapper<cms::cuda::Product<HeterogeneousSoA<ZVertexSoA> > >::~Wrapper() at /home/cern/pixeltrack-standalone/src/cuda/Framework/Event.h:20
4# edm::FunctorWaitingTask<edm::StreamSchedule::processOneEventAsync(edm::WaitingTaskHolder)::{lambda(std::__exception_ptr::exception_ptr const*)#1}>::execute() at /home/cern/pixeltrack-standalone/src/cuda/Framework/WaitingTask.h:78
-O2
3# edm::Wrapper<cms::cuda::Product<BeamSpotCUDA> >::~Wrapper() at /home/cern/pixeltrack-standalone/src/cuda/Framework/Event.h:20
4# edm::FunctorWaitingTask<edm::StreamSchedule::processOneEventAsync(edm::WaitingTaskHolder)::{lambda(std::__exception_ptr::exception_ptr const*)#1}>::execute() at /home/cern/pixeltrack-standalone/src/cuda/Framework/WaitingTask.h:78
-O0
5# std::unique_ptr<BeamSpotPOD, cms::cuda::device::impl::DeviceDeleter>::~unique_ptr() at /usr/include/c++/8/bits/unique_ptr.h:270
6# BeamSpotCUDA::~BeamSpotCUDA() at /home/cern/pixeltrack-standalone/src/cuda/CUDADataFormats/BeamSpotCUDA.h:9
7# cms::cuda::Product<BeamSpotCUDA>::~Product() at /home/cern/pixeltrack-standalone/src/cuda/CUDACore/Product.h:34
8# edm::Wrapper<cms::cuda::Product<BeamSpotCUDA> >::~Wrapper() at /home/cern/pixeltrack-standalone/src/cuda/Framework/Event.h:20
...
17# edm::Event::~Event() at /home/cern/pixeltrack-standalone/src/cuda/Framework/Event.h:31
18# std::default_delete<edm::Event>::operator()(edm::Event*) const at /usr/include/c++/8/bits/unique_ptr.h:81
19# std::unique_ptr<edm::Event, std::default_delete<edm::Event> >::reset(edm::Event*) in ./cuda
20# edm::StreamSchedule::processOneEventAsync(edm::WaitingTaskHolder)::{lambda(std::__exception_ptr::exception_ptr const*)#1}::operator()(std::__exception_ptr::exception_ptr const*) at /home/cern/pixeltrack-standalone/src/cuda/bin/StreamSchedule.cc:66
21# edm::FunctorWaitingTask<edm::StreamSchedule::processOneEventAsync(edm::WaitingTaskHolder)::{lambda(std::__exception_ptr::exception_ptr const*)#1}>::execute() at /home/cern/pixeltrack-standalone/src/cuda/Framework/WaitingTask.h:79
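All of the frames above boil down to the same pattern: a std::unique_ptr whose custom deleter hands device memory back through cms::cuda::free_device(). As a rough, self-contained sketch of that pattern (the bodies below are assumptions for illustration, not the actual device_unique_ptr.h / allocate_device.cc):

#include <memory>
#include <cuda_runtime.h>

namespace sketch {
  // Stand-in for cms::cuda::free_device(device, ptr, stream); the real code
  // dispatches to the caching allocator, cudaFreeAsync() or cudaFree()
  // depending on the configured policy.
  inline void free_device(int device, void* ptr, cudaStream_t /*stream*/) {
    cudaSetDevice(device);  // the real code uses a ScopedSetDevice RAII helper
    cudaFree(ptr);
  }

  // Deleter remembering on which device (and stream) the allocation was made.
  struct DeviceDeleter {
    int device = 0;
    cudaStream_t stream = nullptr;
    void operator()(void* ptr) const { free_device(device, ptr, stream); }
  };

  template <typename T>
  using device_unique_ptr = std::unique_ptr<T, DeviceDeleter>;
}  // namespace sketch

int main() {
  float* raw = nullptr;
  cudaMalloc(&raw, 1024 * sizeof(float));
  // The free happens in the owner's destructor - exactly the point where a
  // pre-existing sticky error gets reported in the stack traces above.
  sketch::device_unique_ptr<float> buf{raw, sketch::DeviceDeleter{}};
  return 0;
}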
If I disable the caching allocator, i.e. via make cuda USER_CXXFLAGS="-DCUDA_DISABLE_CACHING_ALLOCATOR", while still compiling with -O2, the number of crashes (and the throughput) is significantly reduced.
Out of 120 jobs, I got only 2 crashes:
terminate called after throwing an instance of 'std::runtime_error'
what():
/home/cern/pixeltrack-standalone/src/cuda/CUDACore/allocate_host.cc, line 32:
cudaCheck(cudaFreeHost(ptr));
cudaErrorIllegalAddress: an illegal memory access was encountered
Current stack trace:
0# cms::cuda::abortOnCudaError(char const*, int, char const*, char const*, char const*, char const*) at /home/cern/pixeltrack-standalone/src/cuda/CUDACore/cudaCheck.h:35
1# 0x00007FF7E19FE5CA in /home/cern/pixeltrack-standalone/lib/cuda/libCUDACore.so
2# edm::Wrapper<cms::cuda::Product<SiPixelDigiErrorsCUDA> >::~Wrapper() at /home/cern/pixeltrack-standalone/src/cuda/Framework/Event.h:20
3# edm::FunctorWaitingTask<edm::StreamSchedule::processOneEventAsync(edm::WaitingTaskHolder)::{lambda(std::__exception_ptr::exception_ptr const*)#1}>::execute() at /home/cern/pixeltrack-standalone/src/cuda/Framework/WaitingTask.h:78
4# tbb::internal::custom_scheduler<tbb::internal::IntelSchedulerTraits>::process_bypass_loop(tbb::internal::context_guard_helper<false>&, tbb::task*, long) at ../../src/tbb/custom_scheduler.h:469
5# tbb::internal::custom_scheduler<tbb::internal::IntelSchedulerTraits>::local_wait_for_all(tbb::task&, tbb::task*) at ../../src/tbb/custom_scheduler.h:631
6# tbb::internal::arena::process(tbb::internal::generic_scheduler&) at ../../src/tbb/arena.cpp:152
7# tbb::internal::market::process(rml::job&) at ../../src/tbb/market.cpp:668
8# tbb::internal::rml::private_worker::run() at ../../src/tbb/private_server.cpp:264
9# tbb::internal::rml::private_worker::thread_routine(void*) at ../../src/tbb/private_server.cpp:221
10# start_thread in /lib64/libpthread.so.0
11# clone in /lib64/libc.so.6
and
terminate called recursively
what():
/home/cern/pixeltrack-standalone/src/cuda/CUDACore/allocate_host.cc, line 32:
cudaCheck(cudaFreeHost(ptr));
cudaErrorIllegalAddress: an illegal memory access was encountered
Current stack trace:
0# cms::cuda::abortOnCudaError(char const*, int, char const*, char const*, char const*, char const*) at /home/cern/pixeltrack-standalone/src/cuda/CUDACore/cudaCheck.h:35
1# 0x00007FC39EC615CA in /home/cern/pixeltrack-standalone/lib/cuda/libCUDACore.so
2# SiPixelDigisCUDA::SiPixelDigisCUDA(unsigned long, CUstream_st*) at /home/cern/pixeltrack-standalone/src/cuda/CUDADataFormats/SiPixelDigisCUDA.cc:26
3# pixelgpudetails::SiPixelRawToClusterGPUKernel::makeClustersAsync(bool, SiPixelFedCablingMapGPU const*, unsigned char const*, SiPixelGainForHLTonGPU const*, pixelgpudetails::SiPixelRawToClusterGPUKernel::WordFedAppender const&, std::map<unsigned int, std::vector<SiPixelRawDataError, std::allocator<SiPixelRawDataError> >, std::less<unsigned int>, std::allocator<std::pair<unsigned int const, std::vector<SiPixelRawDataError, std::allocator<SiPixelRawDataError> > > > >&&, unsigned int, unsigned int, bool, bool, bool, CUstream_st*) at /home/cern/pixeltrack-standalone/src/cuda/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.cu:540
4# SiPixelRawToClusterCUDA::acquire(edm::Event const&, edm::EventSetup const&, edm::WaitingTaskWithArenaHolder) at /home/cern/pixeltrack-standalone/src/cuda/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc:69
5# edm::FunctorWaitingTask<edm::WorkerT<SiPixelRawToClusterCUDA>::doWorkAsync(edm::Event&, edm::EventSetup const&, edm::WaitingTask*)::{lambda(std::__exception_ptr::exception_ptr const*)#2}>::execute() at /home/cern/pixeltrack-standalone/src/cuda/Framework/WaitingTask.h:78
6# tbb::internal::custom_scheduler<tbb::internal::IntelSchedulerTraits>::process_bypass_loop(tbb::internal::context_guard_helper<false>&, tbb::task*, long) at ../../src/tbb/custom_scheduler.h:469
7# tbb::internal::custom_scheduler<tbb::internal::IntelSchedulerTraits>::local_wait_for_all(tbb::task&, tbb::task*) at ../../src/tbb/custom_scheduler.h:631
8# edm::EventProcessor::runToCompletion() at /home/cern/pixeltrack-standalone/src/cuda/bin/EventProcessor.cc:36
9# main at /home/cern/pixeltrack-standalone/src/cuda/bin/main.cc:147
10# __libc_start_main in /lib64/libc.so.6
11# _start in ./cuda
Both happen (or are detected) in the synchronisation caused by cudaFreeHost(ptr).
Now if I build with
- -O2
- -DCUDA_DISABLE_CACHING_ALLOCATOR
- the call to cudaFreeHost() in src/cuda/CUDACore/allocate_host.cc removed, just leaking the host memory
I get again a large number of errors, all in the call to cudaFreeAsync().
Out of 30 tests, with a total of 120 jobs:
- some failed in ~CAHitNtupletGeneratorKernels():
terminate called after throwing an instance of 'std::runtime_error'
what():
/home/cern/pixeltrack-standalone/src/cuda/CUDACore/allocate_device.cc, line 46:
cudaCheck(cudaFreeAsync(ptr, stream));
cudaErrorIllegalAddress: an illegal memory access was encountered
Current stack trace:
0# cms::cuda::abortOnCudaError(char const*, int, char const*, char const*, char const*, char const*) at /home/cern/pixeltrack-standalone/src/cuda/CUDACore/cudaCheck.h:35
1# 0x00007F7E1EB8722B at /home/cern/pixeltrack-standalone/src/cuda/CUDACore/cudaCheck.h:61
2# cms::cuda::free_device(int, void*, CUstream_st*) at /home/cern/pixeltrack-standalone/src/cuda/CUDACore/allocate_device.cc:45
3# CAHitNtupletGeneratorKernels
---
- 8 failed in `edm::Wrapper<cms::cuda::Product<BeamSpotCUDA>>::~Wrapper()`:
terminate called after throwing an instance of 'std::runtime_error'
what():
/home/cern/pixeltrack-standalone/src/cuda/CUDACore/allocate_device.cc, line 46:
cudaCheck(cudaFreeAsync(ptr, stream));
cudaErrorIllegalAddress: an illegal memory access was encountered
Current stack trace:
0# cms::cuda::abortOnCudaError(char const*, int, char const*, char const*, char const*, char const*) at /home/cern/pixeltrack-standalone/src/cuda/CUDACore/cudaCheck.h:35
1# 0x00007F71C453322B at /home/cern/pixeltrack-standalone/src/cuda/CUDACore/cudaCheck.h:61
2# cms::cuda::free_device(int, void*, CUstream_st*) at /home/cern/pixeltrack-standalone/src/cuda/CUDACore/allocate_device.cc:45
3# edm::Wrapper<cms::cuda::Product
---
- 1 failed in `edm::Wrapper<cms::cuda::Product<HeterogeneousSoA<ZVertexSoA>>>::~Wrapper()`:
Processing 1000 events, of which 3 concurrently, with 3 threads.
terminate called after throwing an instance of 'std::runtime_error'
what():
/home/cern/pixeltrack-standalone/src/cuda/CUDACore/allocate_device.cc, line 46:
cudaCheck(cudaFreeAsync(ptr, stream));
cudaErrorIllegalAddress: an illegal memory access was encountered
Current stack trace:
0# cms::cuda::abortOnCudaError(char const*, int, char const*, char const*, char const*, char const*) at /home/cern/pixeltrack-standalone/src/cuda/CUDACore/cudaCheck.h:35
1# 0x00007F05ECD5F22B at /home/cern/pixeltrack-standalone/src/cuda/CUDACore/cudaCheck.h:61
2# cms::cuda::free_device(int, void*, CUstream_st*) at /home/cern/pixeltrack-standalone/src/cuda/CUDACore/allocate_device.cc:45
3# edm::Wrapper<cms::cuda::Product<HeterogeneousSoA
Adding checks before the call to cudaFreeAsync():
diff --git a/src/cuda/CUDACore/allocate_device.cc b/src/cuda/CUDACore/allocate_device.cc
index 2e4a6fa..3fea6e9 100644
--- a/src/cuda/CUDACore/allocate_device.cc
+++ b/src/cuda/CUDACore/allocate_device.cc
@@ -37,15 +37,19 @@ namespace cms::cuda {
}
void free_device(int device, void *ptr, cudaStream_t stream) {
+ cudaCheck(cudaGetLastError()); // <-- this is line 40
if constexpr (allocator::policy == allocator::Policy::Caching) {
+ cudaCheck(cudaGetLastError());
cudaCheck(allocator::getCachingDeviceAllocator().DeviceFree(device, ptr));
#if CUDA_VERSION >= 11020
} else if constexpr (allocator::policy == allocator::Policy::Asynchronous) {
ScopedSetDevice setDeviceForThisScope(device);
+ cudaCheck(cudaGetLastError()); // <-- this is line 47
cudaCheck(cudaFreeAsync(ptr, stream));
#endif
} else {
ScopedSetDevice setDeviceForThisScope(device);
+ cudaCheck(cudaGetLastError());
cudaCheck(cudaFree(ptr));
}
}
shows mixed results. Out of 120 jobs, the failures are split between the added cudaGetLastError() checks and the call to cudaFreeAsync(ptr, stream) itself.
So, it looks like the caching allocator is indeed not the cause of the issue, since the same errors happen with or without it.
As a rather extreme check, I've commented out the call to cudaFreeAsync() in src/cuda/CUDACore/allocate_device.cc:
diff --git a/src/cuda/CUDACore/allocate_device.cc b/src/cuda/CUDACore/allocate_device.cc
index 2e4a6fa..73b1ca2 100644
--- a/src/cuda/CUDACore/allocate_device.cc
+++ b/src/cuda/CUDACore/allocate_device.cc
ScopedSetDevice setDeviceForThisScope(dev);
@@ -36,15 +36,19 @@ namespace cms::cuda {
}
void free_device(int device, void *ptr, cudaStream_t stream) {
+ cudaCheck(cudaGetLastError());
if constexpr (allocator::policy == allocator::Policy::Caching) {
cudaCheck(allocator::getCachingDeviceAllocator().DeviceFree(device, ptr));
#if CUDA_VERSION >= 11020
} else if constexpr (allocator::policy == allocator::Policy::Asynchronous) {
ScopedSetDevice setDeviceForThisScope(device);
- cudaCheck(cudaFreeAsync(ptr, stream));
+ // leak all device memory
+ //cudaCheck(cudaFreeAsync(ptr, stream));
+ cudaCheck(cudaGetLastError());
#endif
} else {
ScopedSetDevice setDeviceForThisScope(device);
+ cudaCheck(cudaGetLastError());
cudaCheck(cudaFree(ptr));
}
}
Running must be limited to 4 jobs, 3 threads, and ~80 events to avoid exhausting the A10 memory... but I still get the same cudaErrorIllegalAddress: an illegal memory access was encountered error:
terminate called after throwing an instance of 'std::runtime_error'
what():
/home/cern/pixeltrack-standalone/src/cuda/CUDACore/allocate_device.cc, line 39:
cudaCheck(cudaGetLastError());
cudaErrorIllegalAddress: an illegal memory access was encountered
Current stack trace:
0# cms::cuda::abortOnCudaError(char const*, int, char const*, char const*, char const*, char const*) at /home/cern/pixeltrack-standalone/src/cuda/CUDACore/cudaCheck.h:40
1# 0x00007F805A7DBA0B at /home/cern/pixeltrack-standalone/src/cuda/CUDACore/cudaCheck.h:66
2# cms::cuda::free_device(int, void*, CUstream_st*) at /home/cern/pixeltrack-standalone/src/cuda/CUDACore/allocate_device.cc:65
3# CAHitNtupletGeneratorKernels<cms::cudacompat::GPUTraits>::~CAHitNtupletGeneratorKernels() at /home/cern/pixeltrack-standalone/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.h:163
4# CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits> const&, float, CUstream_st*) const [clone .cold.133] at /home/cern/pixeltrack-standalone/src/cuda/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.cc:116
5# CAHitNtupletCUDA::produce(edm::Event&, edm::EventSetup const&) at /home/cern/pixeltrack-standalone/src/cuda/plugin-PixelTriplets/CAHitNtupletCUDA.cc:41
6# edm::FunctorWaitingTask<edm::WorkerT<CAHitNtupletCUDA>::doWorkAsync(edm::Event&, edm::EventSetup const&, edm::WaitingTask*)::{lambda(std::__exception_ptr::exception_ptr const*)#1}>::execute() at /home/cern/pixeltrack-standalone/src/cuda/Framework/WaitingTask.h:78
7# tbb::internal::custom_scheduler<tbb::internal::IntelSchedulerTraits>::process_bypass_loop(tbb::internal::context_guard_helper<false>&, tbb::task*, long) at ../../src/tbb/custom_scheduler.h:469
8# tbb::internal::custom_scheduler<tbb::internal::IntelSchedulerTraits>::local_wait_for_all(tbb::task&, tbb::task*) at ../../src/tbb/custom_scheduler.h:631
9# edm::EventProcessor::runToCompletion() at /home/cern/pixeltrack-standalone/src/cuda/bin/EventProcessor.cc:36
10# main at /home/cern/pixeltrack-standalone/src/cuda/bin/main.cc:147
11# __libc_start_main in /lib64/libc.so.6
12# _start in ./cuda
So... with this I would say that the memory handling is not the source of the problem.
With this out of the way, let's try something new: reading the actual description of the error.
cudaErrorIllegalAddress = 700
The device encountered a load or store instruction on an invalid memory address. This leaves the process in an inconsistent state and any further CUDA work will return the same error. To continue using CUDA, the process must be terminated and relaunched.
So, it looks like some memory access on the device is misbehaving ?
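This "sticky" behaviour also explains why the stack traces point at so many unrelated places (different destructors, the main thread as well as TBB worker threads): once the error is set, it is returned to whichever thread happens to make the next checked runtime call. A minimal illustration (hypothetical kernel, names made up):

#include <cstdio>
#include <thread>
#include <cuda_runtime.h>

__global__ void faultyKernel(int* p) { *p = 1; }

int main() {
  faultyKernel<<<1, 1>>>(nullptr);  // corrupt the context from the main thread
  cudaDeviceSynchronize();          // the illegal-address error becomes sticky here

  // Any other thread in the same process now gets the same error back from
  // its next runtime call, even for completely unrelated operations.
  std::thread worker([] {
    void* p = nullptr;
    cudaError_t err = cudaMalloc(&p, 16);
    std::printf("worker thread sees: %s\n", cudaGetErrorString(err));
  });
  worker.join();
  return 0;
}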
An update on the ongoing investigation:
- PixelVertexProducerCUDA still shows the problem
- CAHitNtupletCUDA still shows the problem
- SiPixelRecHitCUDA still shows the problem; the access to the beam spot and average geometry (from the pixelCPEforGPU) is enough to cause the error
- adding a call to cudaDeviceSynchronize() in SiPixelRecHitCUDA, between cpe.getGPUProductAsync(...) and makeHitsAsync(...), seems to solve the problem:
diff --git a/src/cudadev/plugin-SiPixelRecHits/SiPixelRecHitCUDA.cc b/src/cudadev/plugin-SiPixelRecHits/SiPixelRecHitCUDA.cc
index 413982f..6964776 100644
--- a/src/cudadev/plugin-SiPixelRecHits/SiPixelRecHitCUDA.cc
+++ b/src/cudadev/plugin-SiPixelRecHits/SiPixelRecHitCUDA.cc
@@ -46,9 +46,11 @@ void SiPixelRecHitCUDA::produce(edm::Event& iEvent, const edm::EventSetup& es) {
auto const& digis = ctx.get(iEvent, tokenDigi_);
auto const& bs = ctx.get(iEvent, tBeamSpot);
+ pixelCPEforGPU::ParamsOnGPU const* params = fcpe.getGPUProductAsync(ctx.stream());
+ cudaCheck(cudaDeviceSynchronize());
ctx.emplace(iEvent,
tokenHit_,
- gpuAlgo_.makeHitsAsync(digis, clusters, bs, fcpe.getGPUProductAsync(ctx.stream()), ctx.stream()));
+ gpuAlgo_.makeHitsAsync(digis, clusters, bs, params, ctx.stream()));
}
DEFINE_FWK_MODULE(SiPixelRecHitCUDA);
However, it's still not clear why this is necessary.
A less intrusive, minimal workaround seems to be:
diff --git a/src/cudadev/CondFormats/PixelCPEFast.cc b/src/cudadev/CondFormats/PixelCPEFast.cc
index 08941425d636..4f594f8dab20 100644
--- a/src/cudadev/CondFormats/PixelCPEFast.cc
+++ b/src/cudadev/CondFormats/PixelCPEFast.cc
@@ -71,6 +71,7 @@ const pixelCPEforGPU::ParamsOnGPU *PixelCPEFast::getGPUProductAsync(cudaStream_t
this->detParamsGPU_.size() * sizeof(pixelCPEforGPU::DetParams),
cudaMemcpyDefault,
stream));
+ cudaCheck(cudaStreamSynchronize(stream));
});
return data.paramsOnGPU_d;
}
Ah... I think I may have found the real issue: when multiple CUDA streams are trying to initialise the same EventSetup object, the first one to do so starts the asynchronous operations, and the others are supposed to wait for it to finish. However, code for recording the CUDA event was missing, so the other streams would find the default-constructed event, which is always "valid".
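The CUDA event semantics make this easy to miss: an event that has been created but never recorded is treated as already complete, so querying it or making another stream wait on it succeeds immediately. A small standalone sketch of the failure mode (not the ESProduct code itself):

#include <cassert>
#include <cuda_runtime.h>

int main() {
  cudaEvent_t ev;
  cudaEventCreate(&ev);

  // Missing here: cudaEventRecord(ev, producerStream) after enqueuing the
  // asynchronous transfer - the call that was absent in ESProduct.h.

  // A never-recorded event reports completion...
  assert(cudaEventQuery(ev) == cudaSuccess);

  // ...and waiting on it from another stream is a functional no-op, so a
  // consumer stream can run ahead of the producer's async copies/kernels and
  // touch EventSetup data that is not yet valid on the device.
  cudaStream_t consumer;
  cudaStreamCreate(&consumer);
  cudaStreamWaitEvent(consumer, ev, 0);

  cudaStreamDestroy(consumer);
  cudaEventDestroy(ev);
  return 0;
}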
Adding the missing call to record the event fixes the problem:
diff --git a/src/cudadev/CUDACore/ESProduct.h b/src/cudadev/CUDACore/ESProduct.h
index 0e8965a718ad..6146a2b9f4fd 100644
--- a/src/cudadev/CUDACore/ESProduct.h
+++ b/src/cudadev/CUDACore/ESProduct.h
@@ -74,6 +74,9 @@ namespace cms {
transferAsync(data.m_data, cudaStream);
assert(data.m_fillingStream == nullptr);
data.m_fillingStream = cudaStream;
+ // Record in the cudaStream an event to mark the readiness of the
+ // EventSetup data on the GPU, so other streams can check for it
+ cudaCheck(cudaEventRecord(data.m_event.get(), cudaStream));
// Now the filling has been enqueued to the cudaStream, so we
// can return the GPU data immediately, since all subsequent
// work must be either enqueued to the cudaStream, or the cudaStream
Tested with various jobs/threads configuration on V100, T4 and A10.
@makortel FYI
Good catch! From a quick look I agree with the fix. Thanks for making the PRs both here and in CMSSW.
When running multiple concurrent processes without MPS, one often runs into this error inside notcub::DeviceFree(int device, void *d_ptr).
Anecdotal evidence suggests that this happens a lot more on Ampere than on Volta/Turing, and on larger (e.g. V100/A100) than on smaller (e.g. T4/A10) GPUs.