Closed makortel closed 1 year ago
A new Issue was created by @makortel Matti Kortelainen.
@Dr15Jones, @perrotta, @dpiparo, @rappoccio, @makortel, @smuzaffar can you please review it and eventually sign/assign? Thanks.
cms-bot commands are listed here
Here is another one that shows more clearly that the crash occurs in sorting
#3 0x0000148464e82b1b in sig_dostack_then_abort () from /cvmfs/cms-ib.cern.ch/sw/x86_64/nweek-02769/el8_amd64_gcc11/cms/cmssw/CMSSW_13_0_GPU_X_2023-01-22-2300/lib/el8_amd64_gcc11/pluginFWCoreServicesPlugins.so
#4 <signal handler called>
#5 0x00001483fbbaed27 in void std::__introsort_loop<__gnu_cxx::__normal_iterator<RecHitsSortedInPhi::HitWithPhi*, std::vector<RecHitsSortedInPhi::HitWithPhi, std::allocator<RecHitsSortedInPhi::HitWithPhi> > >, long, __gnu_cxx::__ops::_Iter_comp_iter<RecHitsSortedInPhi::HitLessPhi> >(__gnu_cxx::__normal_iterator<RecHitsSortedInPhi::HitWithPhi*, std::vector<RecHitsSortedInPhi::HitWithPhi, std::allocator<RecHitsSortedInPhi::HitWithPhi> > >, __gnu_cxx::__normal_iterator<RecHitsSortedInPhi::HitWithPhi*, std::vector<RecHitsSortedInPhi::HitWithPhi, std::allocator<RecHitsSortedInPhi::HitWithPhi> > >, long, __gnu_cxx::__ops::_Iter_comp_iter<RecHitsSortedInPhi::HitLessPhi>) () from /cvmfs/cms-ib.cern.ch/sw/x86_64/nweek-02769/el8_amd64_gcc11/cms/cmssw/CMSSW_13_0_GPU_X_2023-01-22-2300/lib/el8_amd64_gcc11/libRecoTrackerTkHitPairs.so
#6 0x00001483fbbaed7b in void std::__introsort_loop<__gnu_cxx::__normal_iterator<RecHitsSortedInPhi::HitWithPhi*, std::vector<RecHitsSortedInPhi::HitWithPhi, std::allocator<RecHitsSortedInPhi::HitWithPhi> > >, long, __gnu_cxx::__ops::_Iter_comp_iter<RecHitsSortedInPhi::HitLessPhi> >(__gnu_cxx::__normal_iterator<RecHitsSortedInPhi::HitWithPhi*, std::vector<RecHitsSortedInPhi::HitWithPhi, std::allocator<RecHitsSortedInPhi::HitWithPhi> > >, __gnu_cxx::__normal_iterator<RecHitsSortedInPhi::HitWithPhi*, std::vector<RecHitsSortedInPhi::HitWithPhi, std::allocator<RecHitsSortedInPhi::HitWithPhi> > >, long, __gnu_cxx::__ops::_Iter_comp_iter<RecHitsSortedInPhi::HitLessPhi>) () from /cvmfs/cms-ib.cern.ch/sw/x86_64/nweek-02769/el8_amd64_gcc11/cms/cmssw/CMSSW_13_0_GPU_X_2023-01-22-2300/lib/el8_amd64_gcc11/libRecoTrackerTkHitPairs.so
#7 0x00001483fbbaed7b in void std::__introsort_loop<__gnu_cxx::__normal_iterator<RecHitsSortedInPhi::HitWithPhi*, std::vector<RecHitsSortedInPhi::HitWithPhi, std::allocator<RecHitsSortedInPhi::HitWithPhi> > >, long, __gnu_cxx::__ops::_Iter_comp_iter<RecHitsSortedInPhi::HitLessPhi> >(__gnu_cxx::__normal_iterator<RecHitsSortedInPhi::HitWithPhi*, std::vector<RecHitsSortedInPhi::HitWithPhi, std::allocator<RecHitsSortedInPhi::HitWithPhi> > >, __gnu_cxx::__normal_iterator<RecHitsSortedInPhi::HitWithPhi*, std::vector<RecHitsSortedInPhi::HitWithPhi, std::allocator<RecHitsSortedInPhi::HitWithPhi> > >, long, __gnu_cxx::__ops::_Iter_comp_iter<RecHitsSortedInPhi::HitLessPhi>) () from /cvmfs/cms-ib.cern.ch/sw/x86_64/nweek-02769/el8_amd64_gcc11/cms/cmssw/CMSSW_13_0_GPU_X_2023-01-22-2300/lib/el8_amd64_gcc11/libRecoTrackerTkHitPairs.so
#8 0x00001483fbbaed7b in void std::__introsort_loop<__gnu_cxx::__normal_iterator<RecHitsSortedInPhi::HitWithPhi*, std::vector<RecHitsSortedInPhi::HitWithPhi, std::allocator<RecHitsSortedInPhi::HitWithPhi> > >, long, __gnu_cxx::__ops::_Iter_comp_iter<RecHitsSortedInPhi::HitLessPhi> >(__gnu_cxx::__normal_iterator<RecHitsSortedInPhi::HitWithPhi*, std::vector<RecHitsSortedInPhi::HitWithPhi, std::allocator<RecHitsSortedInPhi::HitWithPhi> > >, __gnu_cxx::__normal_iterator<RecHitsSortedInPhi::HitWithPhi*, std::vector<RecHitsSortedInPhi::HitWithPhi, std::allocator<RecHitsSortedInPhi::HitWithPhi> > >, long, __gnu_cxx::__ops::_Iter_comp_iter<RecHitsSortedInPhi::HitLessPhi>) () from /cvmfs/cms-ib.cern.ch/sw/x86_64/nweek-02769/el8_amd64_gcc11/cms/cmssw/CMSSW_13_0_GPU_X_2023-01-22-2300/lib/el8_amd64_gcc11/libRecoTrackerTkHitPairs.so
#9 0x00001483fbbaed7b in void std::__introsort_loop<__gnu_cxx::__normal_iterator<RecHitsSortedInPhi::HitWithPhi*, std::vector<RecHitsSortedInPhi::HitWithPhi, std::allocator<RecHitsSortedInPhi::HitWithPhi> > >, long, __gnu_cxx::__ops::_Iter_comp_iter<RecHitsSortedInPhi::HitLessPhi> >(__gnu_cxx::__normal_iterator<RecHitsSortedInPhi::HitWithPhi*, std::vector<RecHitsSortedInPhi::HitWithPhi, std::allocator<RecHitsSortedInPhi::HitWithPhi> > >, __gnu_cxx::__normal_iterator<RecHitsSortedInPhi::HitWithPhi*, std::vector<RecHitsSortedInPhi::HitWithPhi, std::allocator<RecHitsSortedInPhi::HitWithPhi> > >, long, __gnu_cxx::__ops::_Iter_comp_iter<RecHitsSortedInPhi::HitLessPhi>) () from /cvmfs/cms-ib.cern.ch/sw/x86_64/nweek-02769/el8_amd64_gcc11/cms/cmssw/CMSSW_13_0_GPU_X_2023-01-22-2300/lib/el8_amd64_gcc11/libRecoTrackerTkHitPairs.so
#10 0x00001483fbbaed7b in void std::__introsort_loop<__gnu_cxx::__normal_iterator<RecHitsSortedInPhi::HitWithPhi*, std::vector<RecHitsSortedInPhi::HitWithPhi, std::allocator<RecHitsSortedInPhi::HitWithPhi> > >, long, __gnu_cxx::__ops::_Iter_comp_iter<RecHitsSortedInPhi::HitLessPhi> >(__gnu_cxx::__normal_iterator<RecHitsSortedInPhi::HitWithPhi*, std::vector<RecHitsSortedInPhi::HitWithPhi, std::allocator<RecHitsSortedInPhi::HitWithPhi> > >, __gnu_cxx::__normal_iterator<RecHitsSortedInPhi::HitWithPhi*, std::vector<RecHitsSortedInPhi::HitWithPhi, std::allocator<RecHitsSortedInPhi::HitWithPhi> > >, long, __gnu_cxx::__ops::_Iter_comp_iter<RecHitsSortedInPhi::HitLessPhi>) () from /cvmfs/cms-ib.cern.ch/sw/x86_64/nweek-02769/el8_amd64_gcc11/cms/cmssw/CMSSW_13_0_GPU_X_2023-01-22-2300/lib/el8_amd64_gcc11/libRecoTrackerTkHitPairs.so
#11 0x00001483fbbaed7b in void std::__introsort_loop<__gnu_cxx::__normal_iterator<RecHitsSortedInPhi::HitWithPhi*, std::vector<RecHitsSortedInPhi::HitWithPhi, std::allocator<RecHitsSortedInPhi::HitWithPhi> > >, long, __gnu_cxx::__ops::_Iter_comp_iter<RecHitsSortedInPhi::HitLessPhi> >(__gnu_cxx::__normal_iterator<RecHitsSortedInPhi::HitWithPhi*, std::vector<RecHitsSortedInPhi::HitWithPhi, std::allocator<RecHitsSortedInPhi::HitWithPhi> > >, __gnu_cxx::__normal_iterator<RecHitsSortedInPhi::HitWithPhi*, std::vector<RecHitsSortedInPhi::HitWithPhi, std::allocator<RecHitsSortedInPhi::HitWithPhi> > >, long, __gnu_cxx::__ops::_Iter_comp_iter<RecHitsSortedInPhi::HitLessPhi>) () from /cvmfs/cms-ib.cern.ch/sw/x86_64/nweek-02769/el8_amd64_gcc11/cms/cmssw/CMSSW_13_0_GPU_X_2023-01-22-2300/lib/el8_amd64_gcc11/libRecoTrackerTkHitPairs.so
#12 0x00001483fbbaed7b in void std::__introsort_loop<__gnu_cxx::__normal_iterator<RecHitsSortedInPhi::HitWithPhi*, std::vector<RecHitsSortedInPhi::HitWithPhi, std::allocator<RecHitsSortedInPhi::HitWithPhi> > >, long, __gnu_cxx::__ops::_Iter_comp_iter<RecHitsSortedInPhi::HitLessPhi> >(__gnu_cxx::__normal_iterator<RecHitsSortedInPhi::HitWithPhi*, std::vector<RecHitsSortedInPhi::HitWithPhi, std::allocator<RecHitsSortedInPhi::HitWithPhi> > >, __gnu_cxx::__normal_iterator<RecHitsSortedInPhi::HitWithPhi*, std::vector<RecHitsSortedInPhi::HitWithPhi, std::allocator<RecHitsSortedInPhi::HitWithPhi> > >, long, __gnu_cxx::__ops::_Iter_comp_iter<RecHitsSortedInPhi::HitLessPhi>) () from /cvmfs/cms-ib.cern.ch/sw/x86_64/nweek-02769/el8_amd64_gcc11/cms/cmssw/CMSSW_13_0_GPU_X_2023-01-22-2300/lib/el8_amd64_gcc11/libRecoTrackerTkHitPairs.so
#13 0x00001483fbbae2ec in RecHitsSortedInPhi::RecHitsSortedInPhi(std::vector<BaseTrackerRecHit const*, std::allocator<BaseTrackerRecHit const*> > const&, Point3DBase<float, GlobalTag> const&, DetLayer const*) () from /cvmfs/cms-ib.cern.ch/sw/x86_64/nweek-02769/el8_amd64_gcc11/cms/cmssw/CMSSW_13_0_GPU_X_2023-01-22-2300/lib/el8_amd64_gcc11/libRecoTrackerTkHitPairs.so
#14 0x00001483fbbaa62c in LayerHitMapCache::operator()(SeedingLayerSetsHits::SeedingLayer const&, TrackingRegion const&) () from /cvmfs/cms-ib.cern.ch/sw/x86_64/nweek-02769/el8_amd64_gcc11/cms/cmssw/CMSSW_13_0_GPU_X_2023-01-22-2300/lib/el8_amd64_gcc11/libRecoTrackerTkHitPairs.so
#15 0x00001483fbba85ca in HitPairGeneratorFromLayerPair::doublets(TrackingRegion const&, edm::Event const&, edm::EventSetup const&, SeedingLayerSetsHits::SeedingLayer const&, SeedingLayerSetsHits::SeedingLayer const&, LayerHitMapCache&) () from /cvmfs/cms-ib.cern.ch/sw/x86_64/nweek-02769/el8_amd64_gcc11/cms/cmssw/CMSSW_13_0_GPU_X_2023-01-22-2300/lib/el8_amd64_gcc11/libRecoTrackerTkHitPairs.so
#16 0x000014839ad3b559 in (anonymous namespace)::Impl<(anonymous namespace)::DoNothing, (anonymous namespace)::ImplIntermediateHitDoublets, (anonymous namespace)::RegionsLayersSeparate>::produce(bool, edm::Event&, edm::EventSetup const&) () from /cvmfs/cms-ib.cern.ch/sw/x86_64/nweek-02769/el8_amd64_gcc11/cms/cmssw/CMSSW_13_0_GPU_X_2023-01-22-2300/lib/el8_amd64_gcc11/pluginRecoTrackerTkHitPairsPlugins.so
#17 0x000014846dd6259d in edm::stream::EDProducerAdaptorBase::doEvent(edm::EventTransitionInfo const&, edm::ActivityRegistry*, edm::ModuleCallingContext const*) () from /cvmfs/cms-ib.cern.ch/sw/x86_64/nweek-02769/el8_amd64_gcc11/cms/cmssw/CMSSW_13_0_GPU_X_2023-01-22-2300/lib/el8_amd64_gcc11/libFWCoreFramework.so
Current Modules:
Module: HitPairEDProducer:initialStepHitDoubletsPreSplitting (crashed)
Module: SiStripRecHitsValid:stripRecHitsValid
Module: SiStripRecHitConverter:siStripMatchedRecHits
Module: none
Assign reconstruction,heterogeneous
New categories assigned: heterogeneous,reconstruction
@mandrenguyen,@fwyzard,@clacaputo,@makortel you have been requested to review this Pull request/Issue and eventually sign? Thanks
https://github.com/cms-sw/cmssw/pull/40465 looks like a plausible culprit. Let me tag also @AdrianoDee.
Let me have a look.
@AdrianoDee Have you had a chance to take a look? In principle it would be good to have the crashes fixed for 13_0_0.
@makortel you are right. I had a look but I didn't converge. On it in the next days.
So, I still don't understand what's happening, but something strange is that I can't reproduce this with a single thread, and the crash occurs when any of the threads moves on to its next event (so at the 5th event for 4 threads, the 9th for 8, and so on). If this rings a bell for anybody, please let me know. Debugging is getting nasty without being able to run single-threaded (also, any suggestion on how to better debug this is very welcome).
suggestion on how to better debug this it's very welcome
Have you tried valgrind? It will also work with multiple threads.
Another thing to try would be to see if using 2 streams and 1 thread also leads to a crash.
After taking a look at the code (which ultimately is just sorting on floats which are stored as member data), it seems the most likely culprit is a NaN in at least one of the phi values. A NaN breaks sorting since
//to a sort 1 must be equal to nan since
1 < nan == false;
nan < 1 == false;
// to a sort 2 must be equal to nan since
2 < nan == false;
nan < 2 == false;
so, by transitivity of equivalence, the sort would assume 1 == 2
as well, so it expects the following
```
1 < 2 == false;
```
which is not the case (1 < 2 is true), so a NaN breaks the sorting algorithm.
Thanks @Dr15Jones, I was noticing the same `nan`s in the hits' phi too. Trying to track down why they appear.
The problem is that `localCoordToHostAsync` is not taking into account the padding of the SoA layout to 128 alignment, so this `cudaMemcpyAsync` ends up copying the wrong portion of memory. I still don't understand how this went unnoticed. My quick fix would be:
--- a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h
+++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h
@@ -48,7 +48,11 @@ public:
cms::cuda::host::unique_ptr<float[]> localCoordToHostAsync(cudaStream_t stream) const {
auto ret = cms::cuda::make_host_unique<float[]>(4 * nHits(), stream);
size_t rowSize = sizeof(float) * nHits();
- cudaCheck(cudaMemcpyAsync(ret.get(), view().xLocal(), rowSize * 4, cudaMemcpyDefault, stream));
+
+ cudaCheck(cudaMemcpyAsync(ret.get(), view().xLocal(), rowSize, cudaMemcpyDefault, stream));
+ cudaCheck(cudaMemcpyAsync(ret.get() + nHits(), view().yLocal(), rowSize, cudaMemcpyDefault, stream));
+ cudaCheck(cudaMemcpyAsync(ret.get() + nHits() * 2, view().xerrLocal(), rowSize, cudaMemcpyDefault, stream));
+ cudaCheck(cudaMemcpyAsync(ret.get() + nHits() * 3, view().yerrLocal(), rowSize, cudaMemcpyDefault, stream));
return ret;
} //move to utilities
Proposed the fix in https://github.com/cms-sw/cmssw/pull/40869
+heterogeneous
Step 3 in a subset of the 10824.59x and 11634.59x workflows has been segfaulting in GPU IBs since CMSSW_13_0_X_2023-01-18-2300. Example stack trace:
https://cmssdt.cern.ch/SDT/cgi-bin/logreader/el8_amd64_gcc11/CMSSW_13_0_GPU_X_2023-01-23-2300/pyRelValMatrixLogs/run/10824.592_TTbar_13+2018_Patatrack_FullRecoGPU/step3_TTbar_13+2018_Patatrack_FullRecoGPU.log#/