CHIP-SPV / chipStar

chipStar is a tool for compiling and running HIP/CUDA on SPIR-V via OpenCL or Level Zero APIs.
Other
185 stars 30 forks source link

CHIPQueueOpenCL::MemMap bug Unit_hipMultiThreadStreams1_AsyncSame `invalid free()` #657

Closed pvelesko closed 10 months ago

pvelesko commented 11 months ago
Thread 5 (Thread 0x7f964810e640 (LWP 175973)):
#0  0x00007f96498ebb3f in ioctl () from /lib/x86_64-linux-gnu/libc.so.6
#1  0x00007f96488c6be0 in NEO::Drm::ioctl(NEO::DrmIoctl, void*) () from /home/pvelesko/install/intel/neo/2023.10.02/lib/intel-opencl/libigdrcl.so
#2  0x00007f96488c0d5b in NEO::Drm::waitHandle(unsigned int, long) () from /home/pvelesko/install/intel/neo/2023.10.02/lib/intel-opencl/libigdrcl.so
#3  0x00007f96488ab6a3 in NEO::BufferObject::wait(long) () from /home/pvelesko/install/intel/neo/2023.10.02/lib/intel-opencl/libigdrcl.so
#4  0x00007f96488aee48 in NEO::DrmGemCloseWorker::worker(void*) () from /home/pvelesko/install/intel/neo/2023.10.02/lib/intel-opencl/libigdrcl.so
#5  0x00007f9649865ac3 in ?? () from /lib/x86_64-linux-gnu/libc.so.6
#6  0x00007f96498f7a40 in ?? () from /lib/x86_64-linux-gnu/libc.so.6

Thread 4 (Thread 0x7f9634eb5640 (LWP 176086)):
#0  0x00007f9649862117 in ?? () from /lib/x86_64-linux-gnu/libc.so.6
#1  0x00007f9649864a41 in pthread_cond_wait () from /lib/x86_64-linux-gnu/libc.so.6
#2  0x00007f9648572250 in NEO::AsyncEventsHandler::asyncProcess(void*) () from /home/pvelesko/install/intel/neo/2023.10.02/lib/intel-opencl/libigdrcl.so
#3  0x00007f9649865ac3 in ?? () from /lib/x86_64-linux-gnu/libc.so.6
#4  0x00007f96498f7a40 in ?? () from /lib/x86_64-linux-gnu/libc.so.6

Thread 3 (Thread 0x7f9649775600 (LWP 175972)):
#0  0x00007f9649862117 in ?? () from /lib/x86_64-linux-gnu/libc.so.6
#1  0x00007f9649867624 in ?? () from /lib/x86_64-linux-gnu/libc.so.6
#2  0x00007f9649bde2c7 in std::thread::join() () from /lib/x86_64-linux-gnu/libstdc++.so.6
#3  0x0000563f6d17a23d in test_multiThread_1<float, HipTest::MemcpyAsync> (stream0=0x563f6fbf7ee0, stream1=0x563f6fbf7ee0, serialize=false) at /home/pvelesko/space/chipStar/fix-657/HIP/tests/catch/unit/multiThread/hipMultiThreadStreams1.cc:106
#4  0x0000563f6d172fbf in ____C_A_T_C_H____T_E_S_T____39 () at /home/pvelesko/space/chipStar/fix-657/HIP/tests/catch/unit/multiThread/hipMultiThreadStreams1.cc:143
#5  0x0000563f6d193033 in Catch::TestInvokerAsFunction::invoke (this=0x563f6ee61900) at /home/pvelesko/space/chipStar/fix-657/HIP/tests/catch/external/Catch2/catch.hpp:14261
#6  0x0000563f6d18d654 in Catch::TestCase::invoke (this=0x563f6fbe58d0) at /home/pvelesko/space/chipStar/fix-657/HIP/tests/catch/external/Catch2/catch.hpp:14100
#7  0x0000563f6d18d58a in Catch::RunContext::invokeActiveTestCase (this=0x7ffc3e530ba0) at /home/pvelesko/space/chipStar/fix-657/HIP/tests/catch/external/Catch2/catch.hpp:12959
#8  0x0000563f6d18c22e in Catch::RunContext::runCurrentTest (this=0x7ffc3e530ba0, redirectedCout=..., redirectedCerr=...) at /home/pvelesko/space/chipStar/fix-657/HIP/tests/catch/external/Catch2/catch.hpp:12932
#9  0x0000563f6d18b6b5 in Catch::RunContext::runTest (this=0x7ffc3e530ba0, testCase=...) at /home/pvelesko/space/chipStar/fix-657/HIP/tests/catch/external/Catch2/catch.hpp:12693
#10 0x0000563f6d18fd53 in Catch::(anonymous namespace)::TestGroup::execute (this=0x7ffc3e530b90) at /home/pvelesko/space/chipStar/fix-657/HIP/tests/catch/external/Catch2/catch.hpp:13287
#11 0x0000563f6d18f37b in Catch::Session::runInternal (this=0x7ffc3e530eb0) at /home/pvelesko/space/chipStar/fix-657/HIP/tests/catch/external/Catch2/catch.hpp:13493
#12 0x0000563f6d18f167 in Catch::Session::run (this=0x7ffc3e530eb0) at /home/pvelesko/space/chipStar/fix-657/HIP/tests/catch/external/Catch2/catch.hpp:13449
#13 0x0000563f6d1bfdca in Catch::Session::run<char> (this=0x7ffc3e530eb0, argc=2, argv=0x7ffc3e531168) at /home/pvelesko/space/chipStar/fix-657/HIP/tests/catch/external/Catch2/catch.hpp:13171
#14 0x0000563f6d1a4eaf in main (argc=2, argv=0x7ffc3e531168) at /home/pvelesko/space/chipStar/fix-657/HIP/tests/catch/external/Catch2/catch.hpp:17448

Thread 2 (Thread 0x7f9647737640 (LWP 175974)):
#0  0x00007f96498622c0 in ?? () from /lib/x86_64-linux-gnu/libc.so.6
#1  0x00007f964986905d in pthread_mutex_lock () from /lib/x86_64-linux-gnu/libc.so.6
#2  0x00007f9648735a2b in NEO::CommandStreamReceiver::obtainUniqueOwnership() () from /home/pvelesko/install/intel/neo/2023.10.02/lib/intel-opencl/libigdrcl.so
#3  0x00007f96484d1cad in int NEO::CommandQueueHw<NEO::XeHpgCoreFamily>::enqueueHandler<4618u>(NEO::Surface**, unsigned long, bool, NEO::MultiDispatchInfo const&, unsigned int, _cl_event* const*, _cl_event**) () from /home/pvelesko/install/intel/neo/2023.10.02/lib/intel-opencl/libigdrcl.so
#4  0x00007f96484ed2d6 in NEO::CommandQueueHw<NEO::XeHpgCoreFamily>::enqueueSVMMemcpy(unsigned int, void*, void const*, unsigned long, unsigned int, _cl_event* const*, _cl_event**) () from /home/pvelesko/install/intel/neo/2023.10.02/lib/intel-opencl/libigdrcl.so
#5  0x00007f96482690a4 in clEnqueueSVMMemcpy () from /home/pvelesko/install/intel/neo/2023.10.02/lib/intel-opencl/libigdrcl.so
#6  0x00007f964a1031b3 in CHIPQueueOpenCL::memCopyAsyncImpl (this=0x563f6fbf7ee0, Dst=0x7f9612c00000, Src=0x7f9621a00000, Size=32000000) at /home/pvelesko/space/chipStar/fix-657/src/backend/OpenCL/CHIPBackendOpenCL.cc:1243
#7  0x00007f964a03364b in chipstar::Queue::memCopyAsync (this=0x563f6fbf7ee0, Dst=0x7f9612c00000, Src=0x7f9621a00000, Size=32000000) at /home/pvelesko/space/chipStar/fix-657/src/CHIPBackend.cc:1630
#8  0x00007f964a09b70a in hipMemcpyAsyncInternal (Dst=0x7f9612c00000, Src=0x7f9621a00000, SizeBytes=32000000, Kind=hipMemcpyDeviceToHost, Stream=0x563f6fbf7ee0) at /home/pvelesko/space/chipStar/fix-657/src/CHIPBindings.cc:1354
#9  0x00007f964a09b465 in hipMemcpyAsync (Dst=0x7f9612c00000, Src=0x7f9621a00000, SizeBytes=32000000, Kind=hipMemcpyDeviceToHost, Stream=0x563f6fbf7ee0) at /home/pvelesko/space/chipStar/fix-657/src/CHIPBindings.cc:1363
#10 0x0000563f6d175833 in HipTest::MemTraits<HipTest::MemcpyAsync>::Copy (dest=0x7f9612c00000, src=0x7f9621a00000, sizeBytes=32000000, kind=hipMemcpyDeviceToHost, stream=0x563f6fbf7ee0) at /home/pvelesko/space/chipStar/fix-657/HIP/tests/catch/./include/hip_test_common.hh:326
#11 0x0000563f6d174288 in simpleVectorAdd<float, HipTest::Pinned, HipTest::MemcpyAsync> (numElements=8000000, iters=10, stream=0x563f6fbf7ee0) at /home/pvelesko/space/chipStar/fix-657/HIP/tests/catch/unit/multiThread/hipMultiThreadStreams1.cc:79
#12 0x0000563f6d17a965 in std::__invoke_impl<void, void (*)(unsigned long, int, ihipStream_t*), unsigned long, int, ihipStream_t*> (__f=@0x563f6fbead10: 0x563f6d173bd0 <simpleVectorAdd<float, HipTest::Pinned, HipTest::MemcpyAsync>(unsigned long, int, ihipStream_t*)>, __args=@0x563f6fbeacf8: 0x563f6fbf7ee0, __args=@0x563f6fbeacf8: 0x563f6fbf7ee0, __args=@0x563f6fbeacf8: 0x563f6fbf7ee0) at /usr/lib/gcc/x86_64-linux-gnu/12/../../../../include/c++/12/bits/invoke.h:61
#13 0x0000563f6d17a89d in std::__invoke<void (*)(unsigned long, int, ihipStream_t*), unsigned long, int, ihipStream_t*> (__fn=@0x563f6fbead10: 0x563f6d173bd0 <simpleVectorAdd<float, HipTest::Pinned, HipTest::MemcpyAsync>(unsigned long, int, ihipStream_t*)>, __args=@0x563f6fbeacf8: 0x563f6fbf7ee0, __args=@0x563f6fbeacf8: 0x563f6fbf7ee0, __args=@0x563f6fbeacf8: 0x563f6fbf7ee0) at /usr/lib/gcc/x86_64-linux-gnu/12/../../../../include/c++/12/bits/invoke.h:96
#14 0x0000563f6d17a864 in std::thread::_Invoker<std::tuple<void (*)(unsigned long, int, ihipStream_t*), unsigned long, int, ihipStream_t*> >::_M_invoke<0ul, 1ul, 2ul, 3ul> (this=0x563f6fbeacf8) at /usr/lib/gcc/x86_64-linux-gnu/12/../../../../include/c++/12/bits/std_thread.h:279
#15 0x0000563f6d17a805 in std::thread::_Invoker<std::tuple<void (*)(unsigned long, int, ihipStream_t*), unsigned long, int, ihipStream_t*> >::operator() (this=0x563f6fbeacf8) at /usr/lib/gcc/x86_64-linux-gnu/12/../../../../include/c++/12/bits/std_thread.h:286
#16 0x0000563f6d17a5f9 in std::thread::_State_impl<std::thread::_Invoker<std::tuple<void (*)(unsigned long, int, ihipStream_t*), unsigned long, int, ihipStream_t*> > >::_M_run (this=0x563f6fbeacf0) at /usr/lib/gcc/x86_64-linux-gnu/12/../../../../include/c++/12/bits/std_thread.h:231
#17 0x00007f9649bde253 in ?? () from /lib/x86_64-linux-gnu/libstdc++.so.6
#18 0x00007f9649865ac3 in ?? () from /lib/x86_64-linux-gnu/libc.so.6
#19 0x00007f96498f7a40 in ?? () from /lib/x86_64-linux-gnu/libc.so.6

Thread 1 (Thread 0x7f9646f36640 (LWP 175975)):
#0  0x00007f96498679fc in pthread_kill () from /lib/x86_64-linux-gnu/libc.so.6
#1  0x00007f9649813476 in raise () from /lib/x86_64-linux-gnu/libc.so.6
#2  0x00007f96497f9885 in abort () from /lib/x86_64-linux-gnu/libc.so.6
#3  0x00007f964985a676 in ?? () from /lib/x86_64-linux-gnu/libc.so.6
#4  0x00007f9649871cfc in ?? () from /lib/x86_64-linux-gnu/libc.so.6
#5  0x00007f9649873a44 in ?? () from /lib/x86_64-linux-gnu/libc.so.6
#6  0x00007f9649876453 in free () from /lib/x86_64-linux-gnu/libc.so.6
#7  0x00007f964881ac33 in NEO::SVMAllocsManager::MapOperationsTracker::remove(void const*) () from /home/pvelesko/install/intel/neo/2023.10.02/lib/intel-opencl/libigdrcl.so
#8  0x00007f964881bc3e in NEO::SVMAllocsManager::removeSvmMapOperation(void const*) () from /home/pvelesko/install/intel/neo/2023.10.02/lib/intel-opencl/libigdrcl.so
#9  0x00007f96484ebfd8 in NEO::CommandQueueHw<NEO::XeHpgCoreFamily>::enqueueSVMUnmap(void*, unsigned int, _cl_event* const*, _cl_event**, bool) () from /home/pvelesko/install/intel/neo/2023.10.02/lib/intel-opencl/libigdrcl.so
#10 0x00007f964826a6a8 in clEnqueueSVMUnmap () from /home/pvelesko/install/intel/neo/2023.10.02/lib/intel-opencl/libigdrcl.so
#11 0x00007f964a0ff724 in CHIPQueueOpenCL::MemUnmap (this=0x563f6fbe0c80, AllocInfo=0x7f963000d810) at /home/pvelesko/space/chipStar/fix-657/src/backend/OpenCL/CHIPBackendOpenCL.cc:1012
#12 0x00007f964a0417a9 in chipstar::Queue::RegisteredVarCopy(chipstar::ExecItem*, chipstar::Queue::MANAGED_MEM_STATE)::$_0::operator()(chipstar::AllocationInfo const&) const (this=0x7f9646f34820, AllocInfo=...) at /home/pvelesko/space/chipStar/fix-657/src/CHIPBackend.cc:1840
#13 0x00007f964a03ed59 in chipstar::AllocationTracker::visitAllocations<chipstar::Queue::RegisteredVarCopy(chipstar::ExecItem*, chipstar::Queue::MANAGED_MEM_STATE)::$_0>(chipstar::Queue::RegisteredVarCopy(chipstar::ExecItem*, chipstar::Queue::MANAGED_MEM_STATE)::$_0) const (this=0x563f6fb160a0, Visitor=...) at /home/pvelesko/space/chipStar/fix-657/src/CHIPBackend.hh:600
#14 0x00007f964a03eb68 in chipstar::Queue::RegisteredVarCopy (this=0x563f6fbe0c80, ExecItem=0x7f9628012220, ExecState=chipstar::Queue::MANAGED_MEM_STATE::PRE_KERNEL) at /home/pvelesko/space/chipStar/fix-657/src/CHIPBackend.cc:1852
#15 0x00007f964a03fd35 in chipstar::Queue::launch (this=0x563f6fbe0c80, ExItem=0x7f9628012220) at /home/pvelesko/space/chipStar/fix-657/src/CHIPBackend.cc:1922
#16 0x00007f964a04057c in chipstar::Queue::launchKernel (this=0x563f6fbe0c80, ChipKernel=0x7f9630b78eb0, NumBlocks=..., DimBlocks=..., Args=0x7f9646f358d0, SharedMemBytes=0) at /home/pvelesko/space/chipStar/fix-657/src/CHIPBackend.cc:1967
#17 0x00007f964a0bc5c9 in hipLaunchKernelInternal (HostFunction=0x563f6d2beb80 <void HipTest::vectorADDReverse<float>(float const*, float const*, float*, unsigned long)>, GridDim=..., BlockDim=..., Args=0x7f9646f358d0, SharedMem=0, Stream=0x563f6fbe0c80) at /home/pvelesko/space/chipStar/fix-657/src/CHIPBindings.cc:4020
#18 0x00007f964a0bbe94 in hipLaunchKernel (HostFunction=0x563f6d2beb80 <void HipTest::vectorADDReverse<float>(float const*, float const*, float*, unsigned long)>, GridDim=..., BlockDim=..., Args=0x7f9646f358d0, SharedMem=0, Stream=0x563f6fbe0c80) at /home/pvelesko/space/chipStar/fix-657/src/CHIPBindings.cc:4032
#19 0x0000563f6d175965 in void HipTest::__device_stub__vectorADDReverse<float>(float const*, float const*, float*, unsigned long) ()
#20 0x0000563f6d173e8e in simpleVectorAdd<float, HipTest::Pinned, HipTest::MemcpyAsync> (numElements=8000000, iters=10, stream=0x563f6fbf7ee0) at /home/pvelesko/space/chipStar/fix-657/HIP/tests/catch/unit/multiThread/hipMultiThreadStreams1.cc:75
#21 0x0000563f6d17a965 in std::__invoke_impl<void, void (*)(unsigned long, int, ihipStream_t*), unsigned long, int, ihipStream_t*> (__f=@0x563f6fbff270: 0x563f6d173bd0 <simpleVectorAdd<float, HipTest::Pinned, HipTest::MemcpyAsync>(unsigned long, int, ihipStream_t*)>, __args=@0x563f6fbff258: 0x563f6fbf7ee0, __args=@0x563f6fbff258: 0x563f6fbf7ee0, __args=@0x563f6fbff258: 0x563f6fbf7ee0) at /usr/lib/gcc/x86_64-linux-gnu/12/../../../../include/c++/12/bits/invoke.h:61
#22 0x0000563f6d17a89d in std::__invoke<void (*)(unsigned long, int, ihipStream_t*), unsigned long, int, ihipStream_t*> (__fn=@0x563f6fbff270: 0x563f6d173bd0 <simpleVectorAdd<float, HipTest::Pinned, HipTest::MemcpyAsync>(unsigned long, int, ihipStream_t*)>, __args=@0x563f6fbff258: 0x563f6fbf7ee0, __args=@0x563f6fbff258: 0x563f6fbf7ee0, __args=@0x563f6fbff258: 0x563f6fbf7ee0) at /usr/lib/gcc/x86_64-linux-gnu/12/../../../../include/c++/12/bits/invoke.h:96
#23 0x0000563f6d17a864 in std::thread::_Invoker<std::tuple<void (*)(unsigned long, int, ihipStream_t*), unsigned long, int, ihipStream_t*> >::_M_invoke<0ul, 1ul, 2ul, 3ul> (this=0x563f6fbff258) at /usr/lib/gcc/x86_64-linux-gnu/12/../../../../include/c++/12/bits/std_thread.h:279
#24 0x0000563f6d17a805 in std::thread::_Invoker<std::tuple<void (*)(unsigned long, int, ihipStream_t*), unsigned long, int, ihipStream_t*> >::operator() (this=0x563f6fbff258) at /usr/lib/gcc/x86_64-linux-gnu/12/../../../../include/c++/12/bits/std_thread.h:286
#25 0x0000563f6d17a5f9 in std::thread::_State_impl<std::thread::_Invoker<std::tuple<void (*)(unsigned long, int, ihipStream_t*), unsigned long, int, ihipStream_t*> > >::_M_run (this=0x563f6fbff250) at /usr/lib/gcc/x86_64-linux-gnu/12/../../../../include/c++/12/bits/std_thread.h:231
#26 0x00007f9649bde253 in ?? () from /lib/x86_64-linux-gnu/libstdc++.so.6
#27 0x00007f9649865ac3 in ?? () from /lib/x86_64-linux-gnu/libc.so.6
#28 0x00007f96498f7a40 in ?? () from /lib/x86_64-linux-gnu/libc.so.6
4/994 Test #979: cuda-bandwidthTest ........................................................Subprocess aborted***Exception:   4.58 sec
cuda-bandwidthTest: /opt/actions-runner/_work/chipStar/chipStar/src/backend/OpenCL/CHIPBackendOpenCL.cc:997: virtual void CHIPQueueOpenCL::MemMap(const chipstar::AllocationInfo *, chipstar::Queue::MEM_MAP_TYPE): Assertion `Status == CL_SUCCESS' failed.