libomptarget interop use and destroy clauses with nowait and depend are missing synchronisations

tomdeakin commented 1 year ago

The OpenMP interop construct should ensure that foreign functions enqueued on the foreign synchronisation object (i.e., the CUDA queue) that is returned via the omp_ipr_targetsync property should finish before tasks enqueued after the interop construct which depend on that interop construct.

For example, OpenMP tasks depending on the interop construct with a depend() clause should not start until the foreign functions in the synchronisation object have finished.

The below example captures the incorrect behaviour, which can be fixed by adding a call to cudaStreamSynchronize. This is a blunt workaround because it stops the host. I'm suggesting a real fix could be to have the interop construct create a CUDA event and call cuStreamWaitEvent or cudaStreamWaitEvent for all other streams to make sure that those streams do not continue until the work on the "interop" stream is done.

Build

clang++ -O3 kernel.cu --cuda-gpu-arch=sm_75 -c
clang -fopenmp --offload-arch=native example.c kernel.o -lcudart -L/path/to/cuda/lib64

Output

# Where FIXME_MISSINGBARRIER is defined as empty
Incorrect 42894
# Where FIXME_MISSINGBARRIER is defined as cudaStreamSynchronize(s)
Success

example.c

#include <stdlib.h>
#include <stdio.h>
#include <omp.h>
#include <cuda_runtime.h>

#define FIXME_ADDBARRIER
//#define FIXME_ADDBARRIER cudaStreamSynchronize(s);

extern void call_cuda_kernel(int * A, int N, cudaStream_t s);

int main(void) {

  int N = 100000;
  int *A = (int *)malloc(sizeof(int) * N);
  #pragma omp target enter data map(alloc: A[:N])

  #pragma omp target nowait depend(out: A)
  for (int i = 0; i < N; ++i)
    A[i] = i;

  omp_interop_t iobj = omp_interop_none;
  #pragma omp interop init(targetsync: iobj) nowait depend(inout: A)

  // Check we have a CUDA runtime
  int err;
  if (omp_get_interop_int(iobj, omp_ipr_fr_id, &err) != omp_ifr_cuda) {
    printf("Wrong interop runtime\n");
    exit(EXIT_FAILURE);
  }

  // Get CUDA stream
  cudaStream_t s = (cudaStream_t) omp_get_interop_ptr(iobj, omp_ipr_targetsync, NULL);

  // Asynchronously enqueue CUDA kernel on the stream
  #pragma omp target data use_device_ptr(A)
  call_cuda_kernel(A, N, s);

  FIXME_ADDBARRIER

  #pragma omp interop use(iobj) nowait depend(inout: A)

  #pragma omp target nowait depend(inout: A)
  for (int i = 0; i < N; ++i)
    A[i] += 1;

  #pragma omp interop use(iobj) nowait depend(inout: A)

  #pragma omp target data use_device_ptr(A)
  call_cuda_kernel(A, N, s);

  FIXME_ADDBARRIER
  #pragma omp interop destroy(iobj) nowait depend(inout: A)

  #pragma omp taskwait

  #pragma omp target exit data map(from: A[:N])

  // Check solution
  for (int i = 0; i < N; ++i)
    if (A[i] != i + 3) {
      printf("Incorrect %d\n", A[i]);
      exit(EXIT_FAILURE);
    }

  printf("Success\n");

  free(A);
}

kernel.cu

#include <cuda_runtime.h>
#include <cstdio>

__global__ void cuda_kernel(int *A, int N) {
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  if (tid < N)
    A[tid] += 1;
}

extern "C" {
void call_cuda_kernel(int *A, int N, cudaStream_t s) {
  cuda_kernel<<<N, 1, 0, s>>>(A, N);
}
}

llvmbot commented 1 year ago

@llvm/issue-subscribers-openmp

shiltian commented 1 year ago

I don't think interop nowait is implemented yet.

llvm / llvm-project