CHIP-SPV / chipStar

chipStar is a tool for compiling and running HIP/CUDA on SPIR-V via OpenCL or Level Zero APIs.
Other
185 stars 30 forks source link

Hangs in chipstar::Queue::getSyncQueuesLastEvents #651

Closed pjaaskel closed 11 months ago

pjaaskel commented 11 months ago

Hangs with all my local test envs (including PoCL-CPU). Tested with an LLVM 17 build.

Backtrace of a PoCL-CPU test case run:

Thread 11 (Thread 0x7f94e1ffb640 (LWP 171980) "__cosf"):
#0  __futex_abstimed_wait_common64 (private=0, cancel=true, abstime=0x0, op=393, expected=0, futex_word=0x7f94f61157e8 <scheduler+168>) at ./nptl/futex-internal.c:57
#1  __futex_abstimed_wait_common (cancel=true, private=0, abstime=0x0, clockid=0, expected=0, futex_word=0x7f94f61157e8 <scheduler+168>) at ./nptl/futex-internal.c:87
#2  __GI___futex_abstimed_wait_cancelable64 (futex_word=futex_word@entry=0x7f94f61157e8 <scheduler+168>, expected=expected@entry=0, clockid=clockid@entry=0, abstime=abstime@entry=0x0, private=private@entry=0) at ./nptl/futex-internal.c:139
#3  0x00007f94f6293a41 in __pthread_cond_wait_common (abstime=0x0, clockid=0, mutex=0x7f94f6115800 <scheduler+192>, cond=0x7f94f61157c0 <scheduler+128>) at ./nptl/pthread_cond_wait.c:503
#4  ___pthread_cond_wait (cond=0x7f94f61157c0 <scheduler+128>, mutex=0x7f94f6115800 <scheduler+192>) at ./nptl/pthread_cond_wait.c:627
#5  0x00007f94f5edd3e8 in pthread_scheduler_get_work (td=0x5645ec0f8780) at /home/pjaaskel/src/pocl/lib/CL/devices/pthread/pthread_scheduler.c:540
#6  0x00007f94f5edd6c1 in pocl_pthread_driver_thread (p=0x5645ec0f8780) at /home/pjaaskel/src/pocl/lib/CL/devices/pthread/pthread_scheduler.c:589
#7  0x00007f94f6294ac3 in start_thread (arg=<optimized out>) at ./nptl/pthread_create.c:442
#8  0x00007f94f6326a40 in clone3 () at ../sysdeps/unix/sysv/linux/x86_64/clone3.S:81

Thread 10 (Thread 0x7f94e27fc640 (LWP 171979) "__cosf"):
#0  __futex_abstimed_wait_common64 (private=0, cancel=true, abstime=0x0, op=393, expected=0, futex_word=0x7f94f61157e8 <scheduler+168>) at ./nptl/futex-internal.c:57
#1  __futex_abstimed_wait_common (cancel=true, private=0, abstime=0x0, clockid=0, expected=0, futex_word=0x7f94f61157e8 <scheduler+168>) at ./nptl/futex-internal.c:87
#2  __GI___futex_abstimed_wait_cancelable64 (futex_word=futex_word@entry=0x7f94f61157e8 <scheduler+168>, expected=expected@entry=0, clockid=clockid@entry=0, abstime=abstime@entry=0x0, private=private@entry=0) at ./nptl/futex-internal.c:139
#3  0x00007f94f6293a41 in __pthread_cond_wait_common (abstime=0x0, clockid=0, mutex=0x7f94f6115800 <scheduler+192>, cond=0x7f94f61157c0 <scheduler+128>) at ./nptl/pthread_cond_wait.c:503
#4  ___pthread_cond_wait (cond=0x7f94f61157c0 <scheduler+128>, mutex=0x7f94f6115800 <scheduler+192>) at ./nptl/pthread_cond_wait.c:627
#5  0x00007f94f5edd3e8 in pthread_scheduler_get_work (td=0x5645ec0f8740) at /home/pjaaskel/src/pocl/lib/CL/devices/pthread/pthread_scheduler.c:540
#6  0x00007f94f5edd6c1 in pocl_pthread_driver_thread (p=0x5645ec0f8740) at /home/pjaaskel/src/pocl/lib/CL/devices/pthread/pthread_scheduler.c:589
#7  0x00007f94f6294ac3 in start_thread (arg=<optimized out>) at ./nptl/pthread_create.c:442
#8  0x00007f94f6326a40 in clone3 () at ../sysdeps/unix/sysv/linux/x86_64/clone3.S:81

Thread 9 (Thread 0x7f94e2ffd640 (LWP 171978) "__cosf"):
#0  __futex_abstimed_wait_common64 (private=0, cancel=true, abstime=0x0, op=393, expected=0, futex_word=0x7f94f61157e8 <scheduler+168>) at ./nptl/futex-internal.c:57
#1  __futex_abstimed_wait_common (cancel=true, private=0, abstime=0x0, clockid=0, expected=0, futex_word=0x7f94f61157e8 <scheduler+168>) at ./nptl/futex-internal.c:87
#2  __GI___futex_abstimed_wait_cancelable64 (futex_word=futex_word@entry=0x7f94f61157e8 <scheduler+168>, expected=expected@entry=0, clockid=clockid@entry=0, abstime=abstime@entry=0x0, private=private@entry=0) at ./nptl/futex-internal.c:139
#3  0x00007f94f6293a41 in __pthread_cond_wait_common (abstime=0x0, clockid=0, mutex=0x7f94f6115800 <scheduler+192>, cond=0x7f94f61157c0 <scheduler+128>) at ./nptl/pthread_cond_wait.c:503
#4  ___pthread_cond_wait (cond=0x7f94f61157c0 <scheduler+128>, mutex=0x7f94f6115800 <scheduler+192>) at ./nptl/pthread_cond_wait.c:627
#5  0x00007f94f5edd3e8 in pthread_scheduler_get_work (td=0x5645ec0f8700) at /home/pjaaskel/src/pocl/lib/CL/devices/pthread/pthread_scheduler.c:540
#6  0x00007f94f5edd6c1 in pocl_pthread_driver_thread (p=0x5645ec0f8700) at /home/pjaaskel/src/pocl/lib/CL/devices/pthread/pthread_scheduler.c:589
#7  0x00007f94f6294ac3 in start_thread (arg=<optimized out>) at ./nptl/pthread_create.c:442
#8  0x00007f94f6326a40 in clone3 () at ../sysdeps/unix/sysv/linux/x86_64/clone3.S:81

Thread 8 (Thread 0x7f94e37fe640 (LWP 171977) "__cosf"):
#0  __futex_abstimed_wait_common64 (private=0, cancel=true, abstime=0x0, op=393, expected=0, futex_word=0x7f94f61157e8 <scheduler+168>) at ./nptl/futex-internal.c:57
#1  __futex_abstimed_wait_common (cancel=true, private=0, abstime=0x0, clockid=0, expected=0, futex_word=0x7f94f61157e8 <scheduler+168>) at ./nptl/futex-internal.c:87
#2  __GI___futex_abstimed_wait_cancelable64 (futex_word=futex_word@entry=0x7f94f61157e8 <scheduler+168>, expected=expected@entry=0, clockid=clockid@entry=0, abstime=abstime@entry=0x0, private=private@entry=0) at ./nptl/futex-internal.c:139
#3  0x00007f94f6293a41 in __pthread_cond_wait_common (abstime=0x0, clockid=0, mutex=0x7f94f6115800 <scheduler+192>, cond=0x7f94f61157c0 <scheduler+128>) at ./nptl/pthread_cond_wait.c:503
#4  ___pthread_cond_wait (cond=0x7f94f61157c0 <scheduler+128>, mutex=0x7f94f6115800 <scheduler+192>) at ./nptl/pthread_cond_wait.c:627
#5  0x00007f94f5edd3e8 in pthread_scheduler_get_work (td=0x5645ec0f86c0) at /home/pjaaskel/src/pocl/lib/CL/devices/pthread/pthread_scheduler.c:540
#6  0x00007f94f5edd6c1 in pocl_pthread_driver_thread (p=0x5645ec0f86c0) at /home/pjaaskel/src/pocl/lib/CL/devices/pthread/pthread_scheduler.c:589
#7  0x00007f94f6294ac3 in start_thread (arg=<optimized out>) at ./nptl/pthread_create.c:442
#8  0x00007f94f6326a40 in clone3 () at ../sysdeps/unix/sysv/linux/x86_64/clone3.S:81

Thread 7 (Thread 0x7f94e3fff640 (LWP 171976) "__cosf"):
#0  __futex_abstimed_wait_common64 (private=0, cancel=true, abstime=0x0, op=393, expected=0, futex_word=0x7f94f61157e8 <scheduler+168>) at ./nptl/futex-internal.c:57
#1  __futex_abstimed_wait_common (cancel=true, private=0, abstime=0x0, clockid=0, expected=0, futex_word=0x7f94f61157e8 <scheduler+168>) at ./nptl/futex-internal.c:87
#2  __GI___futex_abstimed_wait_cancelable64 (futex_word=futex_word@entry=0x7f94f61157e8 <scheduler+168>, expected=expected@entry=0, clockid=clockid@entry=0, abstime=abstime@entry=0x0, private=private@entry=0) at ./nptl/futex-internal.c:139
#3  0x00007f94f6293a41 in __pthread_cond_wait_common (abstime=0x0, clockid=0, mutex=0x7f94f6115800 <scheduler+192>, cond=0x7f94f61157c0 <scheduler+128>) at ./nptl/pthread_cond_wait.c:503
#4  ___pthread_cond_wait (cond=0x7f94f61157c0 <scheduler+128>, mutex=0x7f94f6115800 <scheduler+192>) at ./nptl/pthread_cond_wait.c:627
#5  0x00007f94f5edd3e8 in pthread_scheduler_get_work (td=0x5645ec0f8680) at /home/pjaaskel/src/pocl/lib/CL/devices/pthread/pthread_scheduler.c:540
#6  0x00007f94f5edd6c1 in pocl_pthread_driver_thread (p=0x5645ec0f8680) at /home/pjaaskel/src/pocl/lib/CL/devices/pthread/pthread_scheduler.c:589
#7  0x00007f94f6294ac3 in start_thread (arg=<optimized out>) at ./nptl/pthread_create.c:442
#8  0x00007f94f6326a40 in clone3 () at ../sysdeps/unix/sysv/linux/x86_64/clone3.S:81

Thread 6 (Thread 0x7f94e8dfb640 (LWP 171975) "__cosf"):
#0  __futex_abstimed_wait_common64 (private=0, cancel=true, abstime=0x0, op=393, expected=0, futex_word=0x7f94f61157e8 <scheduler+168>) at ./nptl/futex-internal.c:57
#1  __futex_abstimed_wait_common (cancel=true, private=0, abstime=0x0, clockid=0, expected=0, futex_word=0x7f94f61157e8 <scheduler+168>) at ./nptl/futex-internal.c:87
#2  __GI___futex_abstimed_wait_cancelable64 (futex_word=futex_word@entry=0x7f94f61157e8 <scheduler+168>, expected=expected@entry=0, clockid=clockid@entry=0, abstime=abstime@entry=0x0, private=private@entry=0) at ./nptl/futex-internal.c:139
#3  0x00007f94f6293a41 in __pthread_cond_wait_common (abstime=0x0, clockid=0, mutex=0x7f94f6115800 <scheduler+192>, cond=0x7f94f61157c0 <scheduler+128>) at ./nptl/pthread_cond_wait.c:503
#4  ___pthread_cond_wait (cond=0x7f94f61157c0 <scheduler+128>, mutex=0x7f94f6115800 <scheduler+192>) at ./nptl/pthread_cond_wait.c:627
#5  0x00007f94f5edd3e8 in pthread_scheduler_get_work (td=0x5645ec0f8640) at /home/pjaaskel/src/pocl/lib/CL/devices/pthread/pthread_scheduler.c:540
#6  0x00007f94f5edd6c1 in pocl_pthread_driver_thread (p=0x5645ec0f8640) at /home/pjaaskel/src/pocl/lib/CL/devices/pthread/pthread_scheduler.c:589
#7  0x00007f94f6294ac3 in start_thread (arg=<optimized out>) at ./nptl/pthread_create.c:442
#8  0x00007f94f6326a40 in clone3 () at ../sysdeps/unix/sysv/linux/x86_64/clone3.S:81

Thread 5 (Thread 0x7f94e95fc640 (LWP 171974) "__cosf"):
#0  __futex_abstimed_wait_common64 (private=0, cancel=true, abstime=0x0, op=393, expected=0, futex_word=0x7f94f61157e8 <scheduler+168>) at ./nptl/futex-internal.c:57
#1  __futex_abstimed_wait_common (cancel=true, private=0, abstime=0x0, clockid=0, expected=0, futex_word=0x7f94f61157e8 <scheduler+168>) at ./nptl/futex-internal.c:87
#2  __GI___futex_abstimed_wait_cancelable64 (futex_word=futex_word@entry=0x7f94f61157e8 <scheduler+168>, expected=expected@entry=0, clockid=clockid@entry=0, abstime=abstime@entry=0x0, private=private@entry=0) at ./nptl/futex-internal.c:139
#3  0x00007f94f6293a41 in __pthread_cond_wait_common (abstime=0x0, clockid=0, mutex=0x7f94f6115800 <scheduler+192>, cond=0x7f94f61157c0 <scheduler+128>) at ./nptl/pthread_cond_wait.c:503
#4  ___pthread_cond_wait (cond=0x7f94f61157c0 <scheduler+128>, mutex=0x7f94f6115800 <scheduler+192>) at ./nptl/pthread_cond_wait.c:627
#5  0x00007f94f5edd3e8 in pthread_scheduler_get_work (td=0x5645ec0f8600) at /home/pjaaskel/src/pocl/lib/CL/devices/pthread/pthread_scheduler.c:540
#6  0x00007f94f5edd6c1 in pocl_pthread_driver_thread (p=0x5645ec0f8600) at /home/pjaaskel/src/pocl/lib/CL/devices/pthread/pthread_scheduler.c:589
#7  0x00007f94f6294ac3 in start_thread (arg=<optimized out>) at ./nptl/pthread_create.c:442
#8  0x00007f94f6326a40 in clone3 () at ../sysdeps/unix/sysv/linux/x86_64/clone3.S:81

Thread 4 (Thread 0x7f94e9dfd640 (LWP 171973) "__cosf"):
#0  __futex_abstimed_wait_common64 (private=0, cancel=true, abstime=0x0, op=393, expected=0, futex_word=0x7f94f61157e8 <scheduler+168>) at ./nptl/futex-internal.c:57
#1  __futex_abstimed_wait_common (cancel=true, private=0, abstime=0x0, clockid=0, expected=0, futex_word=0x7f94f61157e8 <scheduler+168>) at ./nptl/futex-internal.c:87
#2  __GI___futex_abstimed_wait_cancelable64 (futex_word=futex_word@entry=0x7f94f61157e8 <scheduler+168>, expected=expected@entry=0, clockid=clockid@entry=0, abstime=abstime@entry=0x0, private=private@entry=0) at ./nptl/futex-internal.c:139
#3  0x00007f94f6293a41 in __pthread_cond_wait_common (abstime=0x0, clockid=0, mutex=0x7f94f6115800 <scheduler+192>, cond=0x7f94f61157c0 <scheduler+128>) at ./nptl/pthread_cond_wait.c:503
#4  ___pthread_cond_wait (cond=0x7f94f61157c0 <scheduler+128>, mutex=0x7f94f6115800 <scheduler+192>) at ./nptl/pthread_cond_wait.c:627
#5  0x00007f94f5edd3e8 in pthread_scheduler_get_work (td=0x5645ec0f85c0) at /home/pjaaskel/src/pocl/lib/CL/devices/pthread/pthread_scheduler.c:540
#6  0x00007f94f5edd6c1 in pocl_pthread_driver_thread (p=0x5645ec0f85c0) at /home/pjaaskel/src/pocl/lib/CL/devices/pthread/pthread_scheduler.c:589
#7  0x00007f94f6294ac3 in start_thread (arg=<optimized out>) at ./nptl/pthread_create.c:442
#8  0x00007f94f6326a40 in clone3 () at ../sysdeps/unix/sysv/linux/x86_64/clone3.S:81

Thread 3 (Thread 0x7f94ea5fe640 (LWP 171969) "__cosf-ust"):
#0  syscall () at ../sysdeps/unix/sysv/linux/x86_64/syscall.S:38
#1  0x00007f94f653b136 in ?? () from /lib/x86_64-linux-gnu/liblttng-ust.so.1
#2  0x00007f94f6294ac3 in start_thread (arg=<optimized out>) at ./nptl/pthread_create.c:442
#3  0x00007f94f6326a40 in clone3 () at ../sysdeps/unix/sysv/linux/x86_64/clone3.S:81

Thread 2 (Thread 0x7f94eadff640 (LWP 171968) "__cosf-ust"):
#0  __recvmsg_syscall (flags=0, msg=0x7f94eadfe530, fd=6) at ../sysdeps/unix/sysv/linux/recvmsg.c:27
#1  __libc_recvmsg (fd=6, msg=0x7f94eadfe530, flags=0) at ../sysdeps/unix/sysv/linux/recvmsg.c:41
#2  0x00007f94f6565eec in ?? () from /lib/x86_64-linux-gnu/liblttng-ust.so.1
#3  0x00007f94f653af42 in ?? () from /lib/x86_64-linux-gnu/liblttng-ust.so.1
#4  0x00007f94f6294ac3 in start_thread (arg=<optimized out>) at ./nptl/pthread_create.c:442
#5  0x00007f94f6326a40 in clone3 () at ../sysdeps/unix/sysv/linux/x86_64/clone3.S:81

Thread 1 (Thread 0x7f94f6856800 (LWP 171966) "__cosf"):
#0  futex_wait (private=0, expected=2, futex_word=0x5645ec184f78) at ../sysdeps/nptl/futex-internal.h:146
#1  __GI___lll_lock_wait (futex=futex@entry=0x5645ec184f78, private=0) at ./nptl/lowlevellock.c:49
#2  0x00007f94f6298002 in lll_mutex_lock_optimized (mutex=0x5645ec184f78) at ./nptl/pthread_mutex_lock.c:48
#3  ___pthread_mutex_lock (mutex=0x5645ec184f78) at ./nptl/pthread_mutex_lock.c:93
#4  0x00007f94f6a732d3 in __gthread_mutex_lock (__mutex=0x5645ec184f78) at /usr/lib/gcc/x86_64-linux-gnu/12/../../../../include/x86_64-linux-gnu/c++/12/bits/gthr-default.h:749
#5  std::mutex::lock (this=0x5645ec184f78) at /usr/lib/gcc/x86_64-linux-gnu/12/../../../../include/c++/12/bits/std_mutex.h:100
#6  std::lock_guard<std::mutex>::lock_guard (__m=..., this=<optimized out>) at /usr/lib/gcc/x86_64-linux-gnu/12/../../../../include/c++/12/bits/std_mutex.h:229
#7  chipstar::Queue::getSyncQueuesLastEvents (this=0x5645ec1852f0) at /home/pjaaskel/src/chipStar/src/CHIPBackend.cc:1538
#8  0x00007f94f6ae7c8b in CHIPQueueOpenCL::getSyncQueuesEventHandles (this=0x80, this@entry=0x5645ec1852f0) at /home/pjaaskel/src/chipStar/src/backend/OpenCL/CHIPBackendOpenCL.cc:952
#9  0x00007f94f6aeddd5 in CHIPQueueOpenCL::enqueueBarrierImpl (this=0x5645ec1852f0, EventsToWaitFor=std::vector of length 0, capacity 0) at /home/pjaaskel/src/chipStar/src/backend/OpenCL/CHIPBackendOpenCL.cc:1364
#10 0x00007f94f6a6f69c in chipstar::Context::syncQueues (this=<optimized out>, TargetQueue=0x5645ec1852f0) at /home/pjaaskel/src/chipStar/src/CHIPBackend.cc:1089
#11 0x00007f94f6a73987 in chipstar::Queue::memCopy (this=0x5645ec1852f0, Dst=0x5645ec19f280, Src=0x2, Size=140277761774272) at /home/pjaaskel/src/chipStar/src/CHIPBackend.cc:1586
#12 0x00007f94f6aaf276 in hipMallocInternal (Ptr=0x7ffea8c8b218, Size=<optimized out>) at /home/pjaaskel/src/chipStar/src/CHIPBindings.cc:2418
#13 hipMalloc (Ptr=0x7ffea8c8b218, Size=<optimized out>) at /home/pjaaskel/src/chipStar/src/CHIPBindings.cc:2429
#14 0x00005645ea8173b3 in ____C_A_T_C_H____T_E_S_T____17 () at /home/pjaaskel/src/chipStar/HIP/tests/catch/unit/deviceLib/SinglePrecisionIntrinsics/__cosf.cc:2
#15 0x00005645ea82d1e9 in Catch::TestCase::invoke (this=<optimized out>) at /home/pjaaskel/src/chipStar/HIP/tests/catch/external/Catch2/catch.hpp:14100
#16 Catch::RunContext::invokeActiveTestCase (this=0x7ffea8c8b760) at /home/pjaaskel/src/chipStar/HIP/tests/catch/external/Catch2/catch.hpp:12959
#17 Catch::RunContext::runCurrentTest (this=this@entry=0x7ffea8c8b760, redirectedCout="", redirectedCerr="") at /home/pjaaskel/src/chipStar/HIP/tests/catch/external/Catch2/catch.hpp:12932
#18 0x00005645ea82c8f0 in Catch::RunContext::runTest (this=this@entry=0x7ffea8c8b760, testCase=...) at /home/pjaaskel/src/chipStar/HIP/tests/catch/external/Catch2/catch.hpp:12693
#19 0x00005645ea83187a in Catch::(anonymous namespace)::TestGroup::execute (this=0x7ffea8c8b750) at /home/pjaaskel/src/chipStar/HIP/tests/catch/external/Catch2/catch.hpp:13287
#20 Catch::Session::runInternal (this=this@entry=0x7ffea8c8b9f0) at /home/pjaaskel/src/chipStar/HIP/tests/catch/external/Catch2/catch.hpp:13493
#21 0x00005645ea830b04 in Catch::Session::run (this=this@entry=0x7ffea8c8b9f0) at /home/pjaaskel/src/chipStar/HIP/tests/catch/external/Catch2/catch.hpp:13449
#22 0x00005645ea845f52 in Catch::Session::run<char> (this=0x7ffea8c8b9f0, argc=<optimized out>, argv=<optimized out>) at /home/pjaaskel/src/chipStar/HIP/tests/catch/external/Catch2/catch.hpp:13171
#23 main (argc=2, argv=0x7ffea8c8bca8) at /home/pjaaskel/src/chipStar/HIP/tests/catch/external/Catch2/catch.hpp:17448
pjaaskel commented 11 months ago

This disappeared after wiping the build directory and re-running CMake with the new defaults.