Support for cooperative groups

I wish to trace a kernel written with cooperative groups. To be specific, my program is something like the following:

#include <cooperative_groups.h>
#include <cuda.h>
#include <cuda_runtime_api.h>

__global__ void random_access_shared(...) {
  ... do something ...
  // __syncthreads();
  grid.sync();
  ... do something ...
}

int main(int argc, char **argv) {
  ... do something ...

  size_t shared_size = 0xffff;

  void *kernelArgs[] = {...};

  dim3 dimBlock(nthread, 1, 1);
  dim3 dimGrid(nblock, 1, 1);

  cudaLaunchCooperativeKernel((void *)random_access_shared, dimGrid, dimBlock, kernelArgs, shared_size, 0);

  ... do something ...
}

However, when I trace this kernel with commands like the following, I get no trace file output.

export CUDA_VERSION="11.0"; 
export CUDA_VISIBLE_DEVICES="0" ;
export TRACES_FOLDER=/trace/output/folder;
CUDA_INJECTION64_PATH=/path/to/tracer_tool.so; 
LD_PRELOAD=/path/to/tracer_tool.so /path/to/executable.out

Does NVBit support tracing kernels written with cooperative groups? If so, how should I use NVBit differently to trace them?

NVlabs / NVBit

Support for cooperative groups #121