The OpenMP interop construct should ensure that foreign functions enqueued on the foreign synchronisation object (i.e., the CUDA stream) returned via the omp_ipr_targetsync property finish before any task that is enqueued after the interop construct and depends on that interop construct is allowed to start.
For example, OpenMP tasks depending on the interop construct with a depend() clause should not start until the foreign functions in the synchronisation object have finished.
The example below captures the incorrect behaviour, which can be worked around by adding a call to cudaStreamSynchronize. This is a blunt workaround because it blocks the host. I'm suggesting that a real fix could be to have the interop construct create a CUDA event and call cuStreamWaitEvent or cudaStreamWaitEvent for all other streams, to make sure that those streams do not continue until the work on the "interop" stream is done.
# Where FIXME_ADDBARRIER is defined as empty
Incorrect 42894
# Where FIXME_ADDBARRIER is defined as cudaStreamSynchronize(s);
Success
example.c
#include <stdlib.h>
#include <stdio.h>
#include <omp.h>
#include <cuda_runtime.h>
// Workaround toggle. With this empty definition the macro expands to nothing
// at its use sites in main(), which is the configuration that reproduces the
// incorrect result.
#define FIXME_ADDBARRIER
// Alternative definition: blocks the host on the stream after each CUDA
// launch. It expands to a full statement (note the trailing semicolon) and
// relies on a variable named `s` being in scope where the macro is used.
//#define FIXME_ADDBARRIER cudaStreamSynchronize(s);
// Launcher implemented in kernel.cu (declared there inside extern "C").
extern void call_cuda_kernel(int * A, int N, cudaStream_t s);
/*
 * Reproducer for the ordering between OpenMP tasks and foreign (CUDA) work
 * submitted through an interop object.
 *
 * Every step below is chained through depend clauses on A:
 *   1. OpenMP target task:  A[i] = i
 *   2. CUDA kernel on the interop stream:  A[i] += 1
 *   3. OpenMP target task:  A[i] += 1
 *   4. CUDA kernel on the interop stream:  A[i] += 1
 * so the host expects A[i] == i + 3 once everything has completed.
 *
 * Prints "Success" and returns EXIT_SUCCESS when the check passes; prints a
 * diagnostic and exits with EXIT_FAILURE otherwise.
 */
int main(void) {
  int N = 100000;
  int *A = (int *)malloc(sizeof(int) * N);
  /* Bail out early instead of mapping and later dereferencing a null pointer. */
  if (A == NULL) {
    printf("Host allocation failed\n");
    exit(EXIT_FAILURE);
  }

  /* Create the device copy of A without transferring any data. */
  #pragma omp target enter data map(alloc: A[:N])

  /* Step 1: initialise A on the device as an asynchronous target task. */
  #pragma omp target nowait depend(out: A)
  for (int i = 0; i < N; ++i)
    A[i] = i;

  /* Initialise the interop object, ordered after step 1 via the dependence on A. */
  omp_interop_t iobj = omp_interop_none;
  #pragma omp interop init(targetsync: iobj) nowait depend(inout: A)

  // Check we have a CUDA runtime
  int err; /* receives the query's return code; not inspected further */
  if (omp_get_interop_int(iobj, omp_ipr_fr_id, &err) != omp_ifr_cuda) {
    printf("Wrong interop runtime\n");
    exit(EXIT_FAILURE);
  }

  // Get CUDA stream
  cudaStream_t s = (cudaStream_t) omp_get_interop_ptr(iobj, omp_ipr_targetsync, NULL);

  // Asynchronously enqueue CUDA kernel on the stream
  /* Step 2: inside the target data region A is the device address. */
  #pragma omp target data use_device_ptr(A)
  call_cuda_kernel(A, N, s);
  FIXME_ADDBARRIER

  /* Expected to order step 3 after the foreign work enqueued on the stream. */
  #pragma omp interop use(iobj) nowait depend(inout: A)

  /* Step 3: OpenMP target task that must observe the result of step 2. */
  #pragma omp target nowait depend(inout: A)
  for (int i = 0; i < N; ++i)
    A[i] += 1;

  #pragma omp interop use(iobj) nowait depend(inout: A)

  /* Step 4: second CUDA kernel on the same interop stream. */
  #pragma omp target data use_device_ptr(A)
  call_cuda_kernel(A, N, s);
  FIXME_ADDBARRIER

  #pragma omp interop destroy(iobj) nowait depend(inout: A)

  /* Wait for all outstanding tasks before copying the result back to the host. */
  #pragma omp taskwait
  #pragma omp target exit data map(from: A[:N])

  // Check solution
  for (int i = 0; i < N; ++i) {
    if (A[i] != i + 3) {
      printf("Incorrect %d\n", A[i]);
      exit(EXIT_FAILURE);
    }
  }

  printf("Success\n");
  free(A);
  return EXIT_SUCCESS;
}
kernel.cu
#include <cuda_runtime.h>
#include <cstdio>
// Increments one element of A per thread.
//
// Launch layout: 1D grid of 1D blocks (only the .x components are read).
// Threads whose flat index falls at or beyond N do nothing, so the grid may
// contain more threads than there are elements. Uses no shared memory.
// A is dereferenced on the device, so it is presumably expected to be a
// device-accessible pointer to at least N ints (confirm against the caller).
__global__ void cuda_kernel(int *A, int N) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  // Guard the tail: surplus threads leave without touching memory.
  if (idx >= N) {
    return;
  }
  A[idx] += 1;
}
extern "C" {
// Host-side launcher with C linkage: asynchronously enqueues cuda_kernel on
// stream `s` so that each of the N elements of A is incremented by one.
//
//   A  pointer passed straight through to the kernel (dereferenced on device)
//   N  number of elements to process; N <= 0 is a no-op
//   s  CUDA stream the kernel is enqueued on
//
// The call does not synchronise with the stream. A failed launch is reported
// on stderr because the void return type cannot carry a status.
void call_cuda_kernel(int *A, int N, cudaStream_t s) {
  // A zero or negative element count would give an invalid grid dimension.
  if (N <= 0) {
    return;
  }

  // Use full warps rather than one thread per block; the kernel's bounds
  // guard handles the surplus threads in the last block.
  const unsigned int threadsPerBlock = 256u;
  // Ceil-division done in unsigned arithmetic so N close to INT_MAX cannot overflow.
  const unsigned int numBlocks =
      (static_cast<unsigned int>(N) + threadsPerBlock - 1u) / threadsPerBlock;

  cuda_kernel<<<numBlocks, threadsPerBlock, 0, s>>>(A, N);

  // Kernel launches return no status; configuration errors surface here.
  const cudaError_t launchStatus = cudaGetLastError();
  if (launchStatus != cudaSuccess) {
    std::fprintf(stderr, "call_cuda_kernel: kernel launch failed: %s\n",
                 cudaGetErrorString(launchStatus));
  }
}
}
The OpenMP `interop` construct should ensure that foreign functions enqueued on the foreign synchronisation object (i.e., the CUDA stream) that is returned via the `omp_ipr_targetsync` property finish before tasks that are enqueued after the `interop` construct and depend on that `interop` construct.
For example, OpenMP tasks depending on the `interop` construct with a `depend()` clause should not start until the foreign functions in the synchronisation object have finished.
The example below captures the incorrect behaviour, which can be fixed by adding a call to `cudaStreamSynchronize`. This is a blunt workaround because it stops the host. I'm suggesting a real fix could be to have the `interop` construct create a CUDA event and call `cuStreamWaitEvent` or `cudaStreamWaitEvent` for all other streams to make sure that those streams do not continue until the work on the "interop" stream is done.
Build
Output
example.c
kernel.cu