CHIP-SPV / chipStar

chipStar is a tool for compiling and running HIP/CUDA on SPIR-V via OpenCL or Level Zero APIs.

Level0 backend is uninitialized before destructor for custom class is called #887

Closed jjennychen closed 4 months ago

jjennychen commented 4 months ago

When running the following CUDA code (as a reproducer) with chipStar, the program segfaults as it exits:

Reproducer:

#include <cstdio>
#include <cuda_runtime.h>

class Test {
  public:
    float *output;
    Test();
    ~Test();
};

// Constructor
Test::Test()
{
  output = NULL;
  cudaMalloc(&output, sizeof(float) * 100);
}

// Destructor
Test::~Test()
{
  printf("destructor called\n");
  cudaFree(output);
}

// Global object: its destructor (and the cudaFree inside it) runs during
// static destruction, after main() returns.
Test test = Test();

int main() {
  printf("testing\n");
  return 0;
}

gdb Backtrace:

(gdb backtrace attached as screenshot: Lock_gdb)

After investigating, it seems that the destructor of the global variable test, which contains the cudaFree call, is invoked after the backend has already been uninitialized (the backend pointer was 0x0 right before the segfault). As the thapi trace below shows, __hipUnregisterFatBinary was executed before hipFree:

02:54:26.733910220 - chiatta00 - vpid: 2765, vtid: 2765 - lttng_ust_ze:zeMemAllocDevice_entry: { hContext: 0x000055e43882cba0, device_desc: 0x00007ffcd309f3f0, size: 400, alignment: 0, hDevice: 0x000055e43883fb10, pptr: 0x00007ffcd309f250, device_desc_val: { stype: ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC, pNext: 0x0000000000000000, flags: [ ZE_DEVICE_MEM_ALLOC_FLAG_BIAS_CACHED ], ordinal: 0 } }
02:54:26.733932554 - chiatta00 - vpid: 2765, vtid: 2765 - lttng_ust_ze:zeMemAllocDevice_exit: { zeResult: ZE_RESULT_SUCCESS, pptr_val: 0xff00fffffffe0000 }
02:54:26.733943373 - chiatta00 - vpid: 2765, vtid: 2765 - lttng_ust_hip:hipMalloc_exit: { hipResult: hipSuccess, ptr_val: 0xff00fffffffe0000 }
02:54:26.733954246 - chiatta00 - vpid: 2765, vtid: 2765 - lttng_ust_hip:__hipRegisterFatBinary_entry: { data: 0x000055e436d3b060 }
02:54:26.733970344 - chiatta00 - vpid: 2765, vtid: 2765 - lttng_ust_hip:__hipRegisterFatBinary_exit: { hipResult: 0x000055e4387fcba0 }
02:54:26.733972413 - chiatta00 - vpid: 2765, vtid: 2765 - lttng_ust_hip:__hipRegisterVar_entry: { modules: 0x000055e4387fcba0, var: 0x000055e436d3b098, hostVar: 0x000055e436d37d2c, deviceVar: 0x000055e436d37d2c, ext: 0, size: 4, constant: 0, global: 0 }
02:54:26.733980781 - chiatta00 - vpid: 2765, vtid: 2765 - lttng_ust_hip:__hipRegisterVar_exit: {  }
02:54:26.733992207 - chiatta00 - vpid: 2765, vtid: 2765 - lttng_ust_hip:__hipUnregisterFatBinary_entry: { modules: 0x000055e4387fcba0 }
02:54:27.234431398 - chiatta00 - vpid: 2765, vtid: 2765 - lttng_ust_ze:zeCommandQueueSynchronize_entry: { hCommandQueue: 0x000055e43884a320, timeout: 18446744073709551615 }
02:54:27.234436882 - chiatta00 - vpid: 2765, vtid: 2765 - lttng_ust_ze:zeCommandQueueSynchronize_exit: { zeResult: ZE_RESULT_SUCCESS }
02:54:27.234438746 - chiatta00 - vpid: 2765, vtid: 2765 - lttng_ust_ze:zeCommandQueueDestroy_entry: { hCommandQueue: 0x000055e43884a320 }
02:54:27.234452878 - chiatta00 - vpid: 2765, vtid: 2765 - lttng_ust_ze:zeCommandQueueDestroy_exit: { zeResult: ZE_RESULT_SUCCESS }
02:54:27.234457091 - chiatta00 - vpid: 2765, vtid: 2765 - lttng_ust_ze:zeContextDestroy_entry: { hContext: 0x000055e43882cba0 }
02:54:27.234459294 - chiatta00 - vpid: 2765, vtid: 2765 - lttng_ust_ze:zeContextDestroy_exit: { zeResult: ZE_RESULT_SUCCESS }
02:54:27.234465599 - chiatta00 - vpid: 2765, vtid: 2765 - lttng_ust_hip:__hipUnregisterFatBinary_exit: {  }
02:54:27.234475633 - chiatta00 - vpid: 2765, vtid: 2765 - lttng_ust_hip:hipFree_entry: { ptr: 0xff00fffffffe0000 }
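For context, this ordering is consistent with C++ teardown rules: static destructors and atexit handlers run in reverse order of registration, so an unregistration hook registered after test's constructor has already run will fire before test's destructor. Below is a minimal, self-contained sketch of that ordering with hypothetical names and no HIP calls (it only illustrates the interleaving suggested by the trace, not chipStar internals):

#include <cstdio>
#include <cstdlib>

struct Global {
  Global()  { printf("Global constructed\n"); }
  ~Global() { printf("Global destroyed (analogous to test's cudaFree)\n"); }
};

Global g;  // constructed first, like the global `test` in the reproducer

// Runs after g's construction and registers a handler at that point,
// analogous to the unregister hook set up when __hipRegisterFatBinary runs.
struct RegisterLater {
  RegisterLater() {
    atexit([] { printf("unregister (analogous to __hipUnregisterFatBinary)\n"); });
  }
} r;

int main() { return 0; }

// Expected output at exit:
//   unregister (...)      <- registered last, runs first
//   Global destroyed (...) <- runs afterwards, once "the backend" is gone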

For comparison, the same code was run and traced on CUDA to see whether it behaves the same way (it does not segfault there). In that run, cuMemFree succeeded even though it was issued from the same static destructor (whereas chipStar's hipFree crashed); only after cuMemFree completed successfully did cuDevicePrimaryCtxRelease return CUDA_ERROR_DEINITIALIZED. Please see the trace below:

BACKEND_CUDA | 1 Hostnames | 1 Processes | 1 Threads |

                     Name |     Time | Time(%) | Calls |  Average |     Min |      Max | Error |
 cuDevicePrimaryCtxRetain |  58.80ms |  60.58% |     1 |  58.80ms | 58.80ms |  58.80ms |     0 |
                   cuInit |  36.61ms |  37.73% |     1 |  36.61ms | 36.61ms |  36.61ms |     0 |
      cuGetProcAddress_v2 |   1.33ms |   1.37% |   405 |   3.28us |   280ns | 451.80us |     0 |
     cuDeviceGetAttribute | 121.75us |   0.13% |   111 |   1.10us |   160ns |  48.61us |     0 |
             cuMemFree_v2 |  86.22us |   0.09% |     1 |  86.22us | 86.22us |  86.22us |     0 |
            cuMemAlloc_v2 |  83.25us |   0.09% |     1 |  83.25us | 83.25us |  83.25us |     0 |
   cuModuleGetLoadingMode |   4.12us |   0.00% |     1 |   4.12us |  4.12us |   4.12us |     0 |
       cuDriverGetVersion |   3.54us |   0.00% |     1 |   3.54us |  3.54us |   3.54us |     0 |
          cuCtxSetCurrent |   3.49us |   0.00% |     1 |   3.49us |  3.49us |   3.49us |     0 |
          cuDeviceGetName |   2.54us |   0.00% |     1 |   2.54us |  2.54us |   2.54us |     0 |
         cuGetExportTable |   2.32us |   0.00% |     6 | 386.67ns |   220ns |    790ns |     0 |
          cuCtxGetCurrent |   1.48us |   0.00% |     2 | 740.00ns |   650ns |    830ns |     0 |
         cuDeviceGetCount |    940ns |   0.00% |     1 | 940.00ns |   940ns |    940ns |     0 |
              cuDeviceGet |    780ns |   0.00% |     1 | 780.00ns |   780ns |    780ns |     0 |
           cuCtxGetDevice |    590ns |   0.00% |     1 | 590.00ns |   590ns |    590ns |     0 |
      cuDeviceTotalMem_v2 |    520ns |   0.00% |     1 | 520.00ns |   520ns |    520ns |     0 |
cuDevicePrimaryCtxRelease |          |         |     1 |          |         |          |     1 |
                    Total |  97.05ms | 100.00% |   537 |                                     1 |
02:51:12.762665801 - gpu07 - vpid: 64562, vtid: 64562 - lttng_ust_cuda:cuCtxGetCurrent_entry: { pctx: 0x00007fffc1f0a968 }
02:51:12.762666631 - gpu07 - vpid: 64562, vtid: 64562 - lttng_ust_cuda:cuCtxGetCurrent_exit: { cuResult: CUDA_SUCCESS, pctx_val: 0x0000000000000000 }
02:51:12.762668511 - gpu07 - vpid: 64562, vtid: 64562 - lttng_ust_cuda:cuCtxSetCurrent_entry: { ctx: 0x00000000020ab380 }
02:51:12.762672001 - gpu07 - vpid: 64562, vtid: 64562 - lttng_ust_cuda:cuCtxSetCurrent_exit: { cuResult: CUDA_SUCCESS }
02:51:12.762672901 - gpu07 - vpid: 64562, vtid: 64562 - lttng_ust_cuda:cuDevicePrimaryCtxRetain_entry: { pctx: 0x00007fffc1f0a928, dev: 0 }
02:51:12.821472940 - gpu07 - vpid: 64562, vtid: 64562 - lttng_ust_cuda:cuDevicePrimaryCtxRetain_exit: { cuResult: CUDA_SUCCESS, pctx_val: 0x00000000020ab380 }
02:51:12.821476790 - gpu07 - vpid: 64562, vtid: 64562 - lttng_ust_cuda:cuCtxGetCurrent_entry: { pctx: 0x00007fffc1f0a968 }
02:51:12.821477440 - gpu07 - vpid: 64562, vtid: 64562 - lttng_ust_cuda:cuCtxGetCurrent_exit: { cuResult: CUDA_SUCCESS, pctx_val: 0x00000000020ab380 }
02:51:12.821477970 - gpu07 - vpid: 64562, vtid: 64562 - lttng_ust_cuda:cuCtxGetDevice_entry: { device: 0x00007fffc1f0a958 }
02:51:12.821478560 - gpu07 - vpid: 64562, vtid: 64562 - lttng_ust_cuda:cuCtxGetDevice_exit: { cuResult: CUDA_SUCCESS, device_val: 0 }
02:51:12.821492860 - gpu07 - vpid: 64562, vtid: 64562 - lttng_ust_cuda:cuMemAlloc_v2_entry: { dptr: 0x00000000004a6558, bytesize: 400 }
02:51:12.821576111 - gpu07 - vpid: 64562, vtid: 64562 - lttng_ust_cuda:cuMemAlloc_v2_exit: { cuResult: CUDA_SUCCESS, dptr_val: 0x00007f5f63200000 }
02:51:12.821594731 - gpu07 - vpid: 64562, vtid: 64562 - lttng_ust_cuda:cuMemFree_v2_entry: { dptr: 0x00007f5f63200000 }
02:51:12.821680952 - gpu07 - vpid: 64562, vtid: 64562 - lttng_ust_cuda:cuMemFree_v2_exit: { cuResult: CUDA_SUCCESS }
02:51:12.821722193 - gpu07 - vpid: 64562, vtid: 64562 - lttng_ust_cuda:cuDevicePrimaryCtxRelease_entry: { dev: 0 }
02:51:12.821728053 - gpu07 - vpid: 64562, vtid: 64562 - lttng_ust_cuda:cuDevicePrimaryCtxRelease_exit: { cuResult: CUDA_ERROR_DEINITIALIZED }
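Until this is addressed in chipStar, one user-side way to sidestep the crash (a sketch of a workaround, not chipStar's fix) is to give the object a lifetime bounded by main() and to check the result of the free, since a runtime that has already shut down may return an error such as cudaErrorCudartUnloading rather than succeeding:

#include <cstdio>
#include <cuda_runtime.h>

class Test {
public:
  float *output = nullptr;
  Test()  { cudaMalloc(&output, sizeof(float) * 100); }
  ~Test() {
    printf("destructor called\n");
    // Check the result instead of assuming the runtime is still alive;
    // after teardown a runtime may report an "unloading"/"deinitialized" error.
    cudaError_t err = cudaFree(output);
    if (err != cudaSuccess)
      printf("cudaFree failed: %s\n", cudaGetErrorString(err));
  }
};

int main() {
  Test test;            // destroyed at the end of main, before runtime teardown
  printf("testing\n");
  return 0;
}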