Open pxl-th opened 4 months ago
Mixing default and non-default streams in hip*Async
functions seems to cause hangs.
Here's a C++ reproducer:
#include <hip/hip_runtime.h>
#include <iostream>
#include <thread>
// Element-wise accumulate kernel: each thread handles at most one index and
// adds a[idx] into b[idx]. Threads whose index is not below numElements
// leave memory untouched.
__global__
void vectorAdd(int *a, int *b, int numElements) {
    const int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
    if (idx >= numElements) {
        return;
    }
    b[idx] = b[idx] + a[idx];
}
// One iteration of the reproducer. Creates a private stream, allocates two
// device buffers on it, uploads host data, launches vectorAdd, then frees the
// buffers -- `da` on the default (nullptr) stream and `db` on the private
// stream -- before synchronizing and destroying the private stream.
// None of the HIP return codes are checked.
void fn() {
// Stream private to this call; destroyed again before returning.
hipStream_t stream;
hipStreamCreate(&stream);
// 1024 * 1024 ints per buffer; `size` is the matching byte count passed to
// the allocation and copy calls below.
int n_elements = 1024 * 1024;
int size = n_elements * sizeof(int);
// Host-side buffers, both filled with 1s.
int *a = new int[n_elements];
int *b = new int[n_elements];
for (int i = 0; i < n_elements; ++i) {
a[i] = 1;
b[i] = 1;
}
// Device buffers, allocated on the private stream.
int *da, *db;
hipMallocAsync(&da, size, stream);
hipMallocAsync(&db, size, stream);
// Upload on the private stream. Note that the second copy also reads from
// `a`, not `b`; since both host arrays hold only 1s, the uploaded contents
// are the same either way.
hipMemcpyHtoDAsync(da, a, size, stream);
hipMemcpyHtoDAsync(db, a, size, stream);
// Launch with blocks of 256 threads; the block count is rounded up so that
// every element index is covered. The `0` is the dynamic shared-memory size.
hipLaunchKernelGGL(
vectorAdd, dim3((n_elements + 255) / 256), dim3(256),
0, stream, da, db, n_elements);
// The commented-out call frees `da` on the same stream that allocated it;
// the active call below uses the default stream instead, which is what
// this reproducer is about.
/* hipFreeAsync(da, stream); */
hipFreeAsync(da, nullptr); // <--- Mixing default stream with non-default causes hangs!
hipFreeAsync(db, stream);
// Wait on the private stream before tearing it down. Nothing here waits on
// the default stream explicitly.
hipStreamSynchronize(stream);
hipStreamDestroy(stream);
// Release the host buffers.
delete[] a;
delete[] b;
}
// Per-thread workload: calls fn() 1000 times back to back.
void thread_fn() {
    int remaining = 1000;
    while (remaining-- > 0) {
        fn();
    }
}
// Entry point: runs thread_fn on four threads concurrently, waits for each of
// them in start order, and exits with status 0.
int main() {
    std::thread workers[4];
    for (std::thread &worker : workers) {
        worker = std::thread(thread_fn);
    }
    for (std::thread &worker : workers) {
        worker.join();
    }
    return 0;
}
Respective issue in HIP: https://github.com/ROCm/HIP/issues/3370#issuecomment-1970744166
MWE
using AMDGPU

# Minimal working example: from a threaded loop of 1000 iterations, wrap the
# same host matrix in a ROCArray and reduce it with `sum`.
function main()
    # 1024 x 1024 matrix of random Float64 values, created once on the host.
    data = rand(Float64, 1024, 1024)
    # Iterations are distributed over the available Julia threads.
    Threads.@threads for i in 1:1000
        sum(ROCArray(data))
    end
end

main()
[ ..]
This ☝️ does not fail on MI250x and ROCm 5.3 @pxl-th
MWE
gdb
kill -USR1 PID