JuliaGPU / AMDGPU.jl

AMD GPU (ROCm) programming in Julia

Multithreading code hangs #606

Open pxl-th opened 4 months ago

pxl-th commented 4 months ago

MWE

using AMDGPU

function main()
    data = rand(Float64, 1024, 1024)
    Threads.@threads for i in 1:1000
        sum(ROCArray(data))
    end
end
main()
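
Note (not part of the original report): Threads.@threads only runs iterations concurrently when Julia is started with more than one thread (e.g. julia -t 4), so the hang presumably does not reproduce in a single-threaded session. A guard such as the following makes that assumption explicit:

@assert Threads.nthreads() > 1 "start Julia with `julia -t N`, N > 1; otherwise the loop runs serially"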

gdb backtrace of the hung process:

(gdb) bt
#0  __futex_abstimed_wait_common64 (private=<optimized out>, cancel=true, abstime=0x0, op=393, expected=0, futex_word=0xde5448)
    at ./nptl/futex-internal.c:57
#1  __futex_abstimed_wait_common (cancel=true, private=<optimized out>, abstime=0x0, clockid=0, expected=0, futex_word=0xde5448)
    at ./nptl/futex-internal.c:87
#2  __GI___futex_abstimed_wait_cancelable64 (futex_word=futex_word@entry=0xde5448, expected=expected@entry=0, 
    clockid=clockid@entry=0, abstime=abstime@entry=0x0, private=<optimized out>) at ./nptl/futex-internal.c:139
#3  0x00007ff13e29cbdf in do_futex_wait (sem=sem@entry=0xde5448, abstime=0x0, clockid=0) at ./nptl/sem_waitcommon.c:111
#4  0x00007ff13e29cc78 in __new_sem_wait_slow64 (sem=0xde5448, abstime=0x0, clockid=0) at ./nptl/sem_waitcommon.c:183
#5  0x00007ff06957fffe in amd::Semaphore::wait (this=0xde5440) at /home/pxl-th/code/clr/rocclr/thread/semaphore.cpp:96
#6  0x00007ff06957f43d in amd::Monitor::finishLock (this=0x7ff06ab140c0 <streamSetLock>)
    at /home/pxl-th/code/clr/rocclr/thread/monitor.cpp:121
#7  0x00007ff069243506 in amd::Monitor::lock (this=0x7ff06ab140c0 <streamSetLock>)
    at /home/pxl-th/code/clr/rocclr/thread/monitor.hpp:207
#8  0x00007ff069243318 in amd::ScopedLock::ScopedLock (this=0x7fef5d5fd810, lock=...)
    at /home/pxl-th/code/clr/rocclr/thread/monitor.hpp:163
#9  0x00007ff06945d5cc in iHipWaitActiveStreams (blocking_stream=0x1885a00, wait_null_stream=true)
    at /home/pxl-th/code/clr/hipamd/src/hip_stream.cpp:204
#10 0x00007ff069251f31 in hip::getStream (stream=0x1885a00, wait=true) at /home/pxl-th/code/clr/hipamd/src/hip_context.cpp:99
#11 0x00007ff069293448 in hip::Event::addMarker (this=0x19ae190, stream=0x1885a00, command=0x0, record=true)
    at /home/pxl-th/code/clr/hipamd/src/hip_event.cpp:251
#12 0x00007ff0693fc532 in hip::MemoryPool::FreeMemory (this=0x15acb70, memory=0x1c6da90, stream=0x1885a00)
    at /home/pxl-th/code/clr/hipamd/src/hip_mempool_impl.cpp:249
#13 0x00007ff06927f94f in hip::Device::FreeMemory (this=0xef7400, memory=0x1c6da90, stream=0x1885a00)
    at /home/pxl-th/code/clr/hipamd/src/hip_device.cpp:93
#14 0x00007ff0693f8792 in FreeAsyncCommand::submit (this=0x1c6e860, device=...)
    at /home/pxl-th/code/clr/hipamd/src/hip_mempool.cpp:112
#15 0x00007ff069546692 in amd::Command::enqueue (this=0x1c6e860) at /home/pxl-th/code/clr/rocclr/platform/command.cpp:391
#16 0x00007ff0693e9dd0 in hipFreeAsync (dev_ptr=0x7fef3c220000, stream=0x1885a00)
    at /home/pxl-th/code/clr/hipamd/src/hip_mempool.cpp:137
#17 0x00007ff13cfe8cd3 in ?? ()
#18 0x000000000000000c in ?? ()

kill -USR1 PID (requests a stacktrace and profile from the running Julia process)

======================================================================================
Information request received. A stacktrace will print followed by a 1.0 second profile
======================================================================================

cmd: /home/pxl-th/bin/julia-1.10.1/bin/julia 55042 running 2 of 2

unknown function (ip: 0x7f8322c91115)
unknown function (ip: 0x7f8322c9cc77)
wait at /home/pxl-th/code/clr/rocclr/thread/semaphore.cpp:96
finishLock at /home/pxl-th/code/clr/rocclr/thread/monitor.cpp:121
lock at /home/pxl-th/code/clr/rocclr/thread/monitor.hpp:207
ScopedLock at /home/pxl-th/code/clr/rocclr/thread/monitor.hpp:163
isValid at /home/pxl-th/code/clr/hipamd/src/hip_stream.cpp:98
hipModuleLaunchKernel at /home/pxl-th/code/clr/hipamd/src/hip_module.cpp:440
macro expansion at /home/pxl-th/.julia/dev/AMDGPU/src/hip/call.jl:38 [inlined]
hipModuleLaunchKernel at /home/pxl-th/.julia/dev/AMDGPU/src/hip/libhip.jl:282
#24 at /home/pxl-th/.julia/dev/AMDGPU/src/runtime/hip-execution.jl:123 [inlined]
macro expansion at /home/pxl-th/.julia/dev/AMDGPU/src/runtime/hip-execution.jl:110 [inlined]
macro expansion at ./none:0 [inlined]
pack_arguments at ./none:0
#launch#23 at /home/pxl-th/.julia/dev/AMDGPU/src/runtime/hip-execution.jl:122 [inlined]
launch at /home/pxl-th/.julia/dev/AMDGPU/src/runtime/hip-execution.jl:116 [inlined]
#18 at /home/pxl-th/.julia/dev/AMDGPU/src/runtime/hip-execution.jl:85 [inlined]
macro expansion at /home/pxl-th/.julia/dev/AMDGPU/src/runtime/hip-execution.jl:78 [inlined]
macro expansion at ./none:0 [inlined]
convert_arguments at ./none:0 [inlined]
#roccall#17 at /home/pxl-th/.julia/dev/AMDGPU/src/runtime/hip-execution.jl:86 [inlined]
roccall at /home/pxl-th/.julia/dev/AMDGPU/src/runtime/hip-execution.jl:84 [inlined]
macro expansion at /home/pxl-th/.julia/dev/AMDGPU/src/runtime/hip-execution.jl:50 [inlined]
macro expansion at ./none:0 [inlined]
#call#1 at ./none:0
unknown function (ip: 0x7f83219799ed)
_jl_invoke at /cache/build/default-maughin-0/julialang/julia-release-1-dot-10/src/gf.c:2894 [inlined]
ijl_apply_generic at /cache/build/default-maughin-0/julialang/julia-release-1-dot-10/src/gf.c:3076
call at ./none:0 [inlined]
#_#15 at /home/pxl-th/.julia/dev/AMDGPU/src/runtime/hip-execution.jl:59
HIPKernel at /home/pxl-th/.julia/dev/AMDGPU/src/runtime/hip-execution.jl:54
unknown function (ip: 0x7f8321979415)
_jl_invoke at /cache/build/default-maughin-0/julialang/julia-release-1-dot-10/src/gf.c:2894 [inlined]
ijl_apply_generic at /cache/build/default-maughin-0/julialang/julia-release-1-dot-10/src/gf.c:3076
macro expansion at /home/pxl-th/.julia/dev/AMDGPU/src/highlevel.jl:175 [inlined]
#mapreducedim!#59 at /home/pxl-th/.julia/dev/AMDGPU/src/kernels/mapreduce.jl:155
mapreducedim! at /home/pxl-th/.julia/dev/AMDGPU/src/kernels/mapreduce.jl:86
unknown function (ip: 0x7f8321977230)
#mapreducedim!#59 at /home/pxl-th/.julia/dev/AMDGPU/src/kernels/mapreduce.jl:167
mapreducedim! at /home/pxl-th/.julia/dev/AMDGPU/src/kernels/mapreduce.jl:86 [inlined]
#mapreducedim!#59 at /home/pxl-th/.julia/dev/AMDGPU/src/kernels/mapreduce.jl:167
mapreducedim! at /home/pxl-th/.julia/dev/AMDGPU/src/kernels/mapreduce.jl:86 [inlined]
#_mapreduce#43 at /home/pxl-th/.julia/packages/GPUArrays/Hd5Sk/src/host/mapreduce.jl:67
_mapreduce at /home/pxl-th/.julia/packages/GPUArrays/Hd5Sk/src/host/mapreduce.jl:33 [inlined]
#mapreduce#41 at /home/pxl-th/.julia/packages/GPUArrays/Hd5Sk/src/host/mapreduce.jl:28 [inlined]
mapreduce at /home/pxl-th/.julia/packages/GPUArrays/Hd5Sk/src/host/mapreduce.jl:28 [inlined]
#_sum#831 at ./reducedim.jl:1015 [inlined]
_sum at ./reducedim.jl:1015 [inlined]
#_sum#830 at ./reducedim.jl:1014 [inlined]
_sum at ./reducedim.jl:1014 [inlined]
#sum#828 at ./reducedim.jl:1010 [inlined]
sum at ./reducedim.jl:1010 [inlined]
macro expansion at /home/pxl-th/.julia/dev/t.jl:26 [inlined]
#39#threadsfor_fun#7 at ./threadingconstructs.jl:215
#39#threadsfor_fun at ./threadingconstructs.jl:182 [inlined]
#1 at ./threadingconstructs.jl:154
unknown function (ip: 0x7f83218af892)
_jl_invoke at /cache/build/default-maughin-0/julialang/julia-release-1-dot-10/src/gf.c:2894 [inlined]
ijl_apply_generic at /cache/build/default-maughin-0/julialang/julia-release-1-dot-10/src/gf.c:3076
jl_apply at /cache/build/default-maughin-0/julialang/julia-release-1-dot-10/src/julia.h:1982 [inlined]
start_task at /cache/build/default-maughin-0/julialang/julia-release-1-dot-10/src/task.c:1238
unknown function (ip: (nil))

unknown function (ip: 0x7f8322c91115)
unknown function (ip: 0x7f8322c9cc77)
wait at /home/pxl-th/code/clr/rocclr/thread/semaphore.cpp:96
finishLock at /home/pxl-th/code/clr/rocclr/thread/monitor.cpp:121
lock at /home/pxl-th/code/clr/rocclr/thread/monitor.hpp:207
ScopedLock at /home/pxl-th/code/clr/rocclr/thread/monitor.hpp:163
FreeMemory at /home/pxl-th/code/clr/hipamd/src/hip_device.cpp:90
submit at /home/pxl-th/code/clr/hipamd/src/hip_mempool.cpp:112
enqueue at /home/pxl-th/code/clr/rocclr/platform/command.cpp:391
hipFreeAsync at /home/pxl-th/code/clr/hipamd/src/hip_mempool.cpp:137
macro expansion at /home/pxl-th/.julia/dev/AMDGPU/src/hip/call.jl:38 [inlined]
hipFreeAsync at /home/pxl-th/.julia/dev/AMDGPU/src/hip/libhip.jl:174 [inlined]
#free#9 at /home/pxl-th/.julia/dev/AMDGPU/src/runtime/memory/hip.jl:134
free at /home/pxl-th/.julia/dev/AMDGPU/src/runtime/memory/hip.jl:129 [inlined]
#43 at /home/pxl-th/.julia/dev/AMDGPU/src/array.jl:30 [inlined]
context! at /home/pxl-th/.julia/dev/AMDGPU/src/tls.jl:131
_free_buf at /home/pxl-th/.julia/dev/AMDGPU/src/array.jl:28
unknown function (ip: 0x7f83219788cc)
_jl_invoke at /cache/build/default-maughin-0/julialang/julia-release-1-dot-10/src/gf.c:2894 [inlined]
ijl_apply_generic at /cache/build/default-maughin-0/julialang/julia-release-1-dot-10/src/gf.c:3076
release at /home/pxl-th/.julia/packages/GPUArrays/Hd5Sk/src/host/abstractarray.jl:42
unsafe_free! at /home/pxl-th/.julia/packages/GPUArrays/Hd5Sk/src/host/abstractarray.jl:91 [inlined]
unsafe_free! at /home/pxl-th/.julia/dev/AMDGPU/src/array.jl:34 [inlined]
#mapreducedim!#59 at /home/pxl-th/.julia/dev/AMDGPU/src/kernels/mapreduce.jl:168
mapreducedim! at /home/pxl-th/.julia/dev/AMDGPU/src/kernels/mapreduce.jl:86 [inlined]
#mapreducedim!#59 at /home/pxl-th/.julia/dev/AMDGPU/src/kernels/mapreduce.jl:167
mapreducedim! at /home/pxl-th/.julia/dev/AMDGPU/src/kernels/mapreduce.jl:86 [inlined]
#_mapreduce#43 at /home/pxl-th/.julia/packages/GPUArrays/Hd5Sk/src/host/mapreduce.jl:67
_mapreduce at /home/pxl-th/.julia/packages/GPUArrays/Hd5Sk/src/host/mapreduce.jl:33 [inlined]
#mapreduce#41 at /home/pxl-th/.julia/packages/GPUArrays/Hd5Sk/src/host/mapreduce.jl:28 [inlined]
mapreduce at /home/pxl-th/.julia/packages/GPUArrays/Hd5Sk/src/host/mapreduce.jl:28 [inlined]
#_sum#831 at ./reducedim.jl:1015 [inlined]
_sum at ./reducedim.jl:1015 [inlined]
#_sum#830 at ./reducedim.jl:1014 [inlined]
_sum at ./reducedim.jl:1014 [inlined]
#sum#828 at ./reducedim.jl:1010 [inlined]
sum at ./reducedim.jl:1010 [inlined]
macro expansion at /home/pxl-th/.julia/dev/t.jl:26 [inlined]
#39#threadsfor_fun#7 at ./threadingconstructs.jl:215
#39#threadsfor_fun at ./threadingconstructs.jl:182 [inlined]
#1 at ./threadingconstructs.jl:154
unknown function (ip: 0x7f83218af892)
_jl_invoke at /cache/build/default-maughin-0/julialang/julia-release-1-dot-10/src/gf.c:2894 [inlined]
ijl_apply_generic at /cache/build/default-maughin-0/julialang/julia-release-1-dot-10/src/gf.c:3076
jl_apply at /cache/build/default-maughin-0/julialang/julia-release-1-dot-10/src/julia.h:1982 [inlined]
start_task at /cache/build/default-maughin-0/julialang/julia-release-1-dot-10/src/task.c:1238
unknown function (ip: (nil))

==============================================================
Profile collected. A report will print at the next yield point
==============================================================
pxl-th commented 4 months ago

Mixing the default and non-default streams in hip*Async functions seems to cause hangs. Here's a C++ reproducer:

#include <hip/hip_runtime.h>
#include <iostream>
#include <thread>

__global__
void vectorAdd(int *a, int *b, int numElements) {
    int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
    if (i < numElements)
        b[i] += a[i];
}

void fn() {
    hipStream_t stream;
    hipStreamCreate(&stream);

    int n_elements = 1024 * 1024;
    int size = n_elements * sizeof(int);

    int *a = new int[n_elements];
    int *b = new int[n_elements];
    for (int i = 0; i < n_elements; ++i) {
        a[i] = 1;
        b[i] = 1;
    }

    int *da, *db;
    hipMallocAsync(&da, size, stream);
    hipMallocAsync(&db, size, stream);

    // Copy the host buffers to device memory on the non-default stream.
    hipMemcpyHtoDAsync(da, a, size, stream);
    hipMemcpyHtoDAsync(db, b, size, stream);

    hipLaunchKernelGGL(
        vectorAdd, dim3((n_elements + 255) / 256), dim3(256),
        0, stream, da, db, n_elements);

    /* hipFreeAsync(da, stream); */
    hipFreeAsync(da, nullptr); // <--- Mixing the default stream (nullptr) with a non-default one causes hangs!
    hipFreeAsync(db, stream);

    hipStreamSynchronize(stream);
    hipStreamDestroy(stream);

    delete[] a;
    delete[] b;
}

void thread_fn() {
    for (int i = 0; i < 1000; i++) {
        fn();
    }
}

int main() {
    std::thread t1(thread_fn);
    std::thread t2(thread_fn);
    std::thread t3(thread_fn);
    std::thread t4(thread_fn);

    t1.join();
    t2.join();
    t3.join();
    t4.join();
    return 0;
}
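
Until the underlying HIP issue is fixed, one possible user-side workaround (a sketch, not taken from this thread, and assuming the hang comes from concurrent hip*Async calls touching both the default and non-default streams) is to serialize GPU work across Julia threads with a lock:

using AMDGPU

const GPU_LOCK = ReentrantLock()

function main()
    data = rand(Float64, 1024, 1024)
    Threads.@threads for i in 1:1000
        # Only one thread talks to the HIP runtime at a time, avoiding the
        # concurrent stream interaction that appears to trigger the hang.
        lock(GPU_LOCK) do
            sum(ROCArray(data))
        end
    end
end
main()

This of course serializes GPU submissions, so it is only a stopgap until the runtime issue is resolved.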
pxl-th commented 4 months ago

Corresponding issue in HIP: https://github.com/ROCm/HIP/issues/3370#issuecomment-1970744166

luraess commented 3 months ago

MWE

using AMDGPU

function main()
    data = rand(Float64, 1024, 1024)
    Threads.@threads for i in 1:1000
        sum(ROCArray(data))
    end
end
main()

[ ..]

This ☝️ does not fail on MI250x with ROCm 5.3 @pxl-th