Celtoys / Remotery

Single C file, Realtime CPU/GPU Profiler with Remote Web Viewer
Apache License 2.0
3.1k stars 262 forks source link

C++: ObjectAllocator_Destructor: Assertion `allocator->nb_inuse == 0' failed #263

Open fortminors opened 4 weeks ago

fortminors commented 4 weeks ago

Hello! I am trying to profile my cuda program, however it results in assertion errors. I have created a minimal reproducing example below:

#include <iostream>

#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>

#include "utils/Remotery.h"

int main()
{
    CUcontext* context = nullptr;
    // cuCtxCreate(context, 0, 0);
    cuCtxGetCurrent(context);

    Remotery* rmt;
    rmt_CreateGlobalInstance(&rmt);
    rmtCUDABind bind;

    bind.context = (void*)context;
    bind.CtxSetCurrent = (void*)&cuCtxSetCurrent;
    bind.CtxGetCurrent = (void*)&cuCtxGetCurrent;
    bind.EventCreate = (void*)&cuEventCreate;
    bind.EventDestroy = (void*)&cuEventDestroy;
    bind.EventRecord = (void*)&cuEventRecord;
    bind.EventQuery = (void*)&cuEventQuery;
    bind.EventElapsedTime = (void*)&cuEventElapsedTime;
    rmt_BindCUDA(&bind);

    CUstream stream;

    std::cout << "before cpu scoped sample" << std::endl;
    {
        rmt_ScopedCPUSample(ScopedCPUSample, 0);
    }
    std::cout << "after cpu scoped sample" << std::endl;

    std::cout << "before cpu standard sample" << std::endl;
    rmt_BeginCPUSample(StandardCPUSample, 0);
    rmt_EndCPUSample();
    std::cout << "after cpu standard sample" << std::endl;

    std::cout << "before cuda scoped sample" << std::endl;
    {
        rmt_ScopedCUDASample(ScopedCUDASample, stream);
    }
    std::cout << "after cuda scoped sample" << std::endl;

    std::cout << "before cuda standard sample" << std::endl;
    rmt_BeginCUDASample(StandardCUDASample, stream);
    rmt_EndCUDASample(stream);
    std::cout << "after cuda standard sample" << std::endl;

    std::cout << "success" << std::endl;

    rmt_DestroyGlobalInstance(rmt);
}

Building, linking and running the above script results in the following output:

before cpu scoped sample
after cpu scoped sample
before cpu standard sample
after cpu standard sample
before cuda scoped sample
test_program: utils/Remotery.c:2462: ObjectAllocator_Destructor: Assertion `allocator->nb_inuse == 0' failed.

The CPU sampling works perfectly. I would like to make CUDA sampling work as well, any help is appreciated.

I was able to successfully build Remotery after the changes suggested in https://github.com/Celtoys/Remotery/pull/262

dwilliamson commented 3 weeks ago

Oddly, there is no rmt_UnbindCUDA in the API, but there should be.

Take a look at the implementation of `rmt_UnbindOpenGL for an example: https://github.com/Celtoys/Remotery/blob/e862ba46de1a7287743b38f8e64e3a8d599e7a4d/lib/Remotery.c#L9554

GPU profilers have a bunch of query data that will be in transit between the various queues and the assert message is telling you the app is shutting down without freeing them.

Adding an equivalent rmt_UnbindCUDA should fix that, and its implementation will be very similar.

fortminors commented 3 weeks ago

Interesting. Why is the app shutting down though? This is happening after I call rmt_ScopedCUDASample

dwilliamson commented 3 weeks ago

Have you tried calling cudaStreamCreate? I'm not sure what to expect when you profile a non-existant stream.

fortminors commented 3 weeks ago

I haven't done it in this sample, however in my application I have multiple cuda streams that are created with cudaStreamCreate, but the same error occurs

dwilliamson commented 3 weeks ago

Right, but this app isn't a valid repro until the streams are created. As I said: I have no idea what CUDA will do internally if you try to use its API (like Remotery does) without creating the stream first.

Already I can see code inside _rmt_EndCUDASample that causes a sample tree imbalance against _rmt_BeginCUDASample if CUDAEventRecord fails.

fortminors commented 3 weeks ago

I have just tried calling cudaStreamCreate in the beginning, but it did not help - I get the same error

Here is the code repro that I am using:

#include <iostream>

#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>

#include "utils/Remotery.h"

int main()
{
    CUcontext* context = nullptr;
    // cuCtxCreate(context, 0, 0);
    cuCtxGetCurrent(context);

    CUstream stream;
    cudaError_t ret = cudaStreamCreate(&stream);

    if (ret == cudaSuccess)
    {
        std::cout << "cuda stream created" << std::endl;
    }
    else
    {
        throw std::runtime_error("Could not create the cuda stream");
    }

    Remotery* rmt;
    rmt_CreateGlobalInstance(&rmt);
    rmtCUDABind bind;

    bind.context = (void*)context;
    bind.CtxSetCurrent = (void*)&cuCtxSetCurrent;
    bind.CtxGetCurrent = (void*)&cuCtxGetCurrent;
    bind.EventCreate = (void*)&cuEventCreate;
    bind.EventDestroy = (void*)&cuEventDestroy;
    bind.EventRecord = (void*)&cuEventRecord;
    bind.EventQuery = (void*)&cuEventQuery;
    bind.EventElapsedTime = (void*)&cuEventElapsedTime;
    rmt_BindCUDA(&bind);

    std::cout << "before cpu scoped sample" << std::endl;
    {
        rmt_ScopedCPUSample(ScopedCPUSample, 0);
    }
    std::cout << "after cpu scoped sample" << std::endl;

    std::cout << "before cpu standard sample" << std::endl;
    rmt_BeginCPUSample(StandardCPUSample, 0);
    rmt_EndCPUSample();
    std::cout << "after cpu standard sample" << std::endl;

    std::cout << "before cuda scoped sample" << std::endl;
    {
        rmt_ScopedCUDASample(ScopedCUDASample, stream);
    }
    std::cout << "after cuda scoped sample" << std::endl;

    std::cout << "before cuda standard sample" << std::endl;
    rmt_BeginCUDASample(StandardCUDASample, stream);
    rmt_EndCUDASample(stream);
    std::cout << "after cuda standard sample" << std::endl;

    std::cout << "success" << std::endl;

    rmt_DestroyGlobalInstance(rmt);
}

That's the output I get:

cuda stream created
before cpu scoped sample
after cpu scoped sample
before cpu standard sample
after cpu standard sample
before cuda scoped sample
REMOTERY_TEST: /srv/vas/src/utils/Remotery.c:2462: ObjectAllocator_Destructor: Assertion `allocator->nb_inuse == 0' failed.

And here is the call stack: image

Is there anything else that I should be aware of to use the CUDA API?

dwilliamson commented 3 weeks ago

OK! That makes a lot more sense.

The code here is failing:

https://github.com/Celtoys/Remotery/blob/e862ba46de1a7287743b38f8e64e3a8d599e7a4d/lib/Remotery.c#L7914

Points:

So your first port of call is to find out why this code is failing:

https://github.com/Celtoys/Remotery/blob/e862ba46de1a7287743b38f8e64e3a8d599e7a4d/lib/Remotery.c#L7795C1-L7813C2

fortminors commented 3 weeks ago

The constructor function calls CUDAEventCreate, which in turn calls CUDAEnsureContext that does not match (here) the current context with the one I set during rmt_BindCUDA - apparently it was null and cuCtxGetCurrent in my main function was actually giving me CUDA_ERROR_NOT_INITIALIZED. That's why CUDAEventCreate ended up giving me RMT_ERROR_CUDA_INVALID_CONTEXT.

So I adapted the code to initialize cuda with cudaSetDevice(0) as below, however I now get a different error image

with the following output:

cuda context obtained
cuda stream created
before cpu scoped sample
after cpu scoped sample
before cpu standard sample
after cpu standard sample
before cuda scoped sample
REMOTERY_TEST: /srv/vas/src/utils/Remotery.c:5049: SampleTree_Pop: Assertion `sample != tree->root' failed.

And here is the updated repro code that I use

#include <iostream>

#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>

#include "utils/Remotery.h"

void get_cuda_context(void** context)
{
    CUresult ctx_res = cuCtxGetCurrent((CUcontext*)context);
    if (ctx_res == CUDA_SUCCESS)
    {
        std::cout << "cuda context obtained" << std::endl;
    }
    else
    {
        throw std::runtime_error("Could not get the cuda context");
    }
}

int main()
{
    cudaSetDevice(0);

    void* context;
    get_cuda_context(&context);

    CUstream stream;
    cudaError_t ret = cudaStreamCreate(&stream);

    if (ret == cudaSuccess)
    {
        std::cout << "cuda stream created" << std::endl;
    }
    else
    {
        throw std::runtime_error("Could not create the cuda stream");
    }

    Remotery* rmt;
    rmt_CreateGlobalInstance(&rmt);
    rmtCUDABind bind;

    bind.context = (void*)context;
    bind.CtxSetCurrent = (void*)&cuCtxSetCurrent;
    bind.CtxGetCurrent = (void*)&cuCtxGetCurrent;
    bind.EventCreate = (void*)&cuEventCreate;
    bind.EventDestroy = (void*)&cuEventDestroy;
    bind.EventRecord = (void*)&cuEventRecord;
    bind.EventQuery = (void*)&cuEventQuery;
    bind.EventElapsedTime = (void*)&cuEventElapsedTime;
    rmt_BindCUDA(&bind);

    std::cout << "before cpu scoped sample" << std::endl;
    {
        rmt_ScopedCPUSample(ScopedCPUSample, 0);
    }
    std::cout << "after cpu scoped sample" << std::endl;

    std::cout << "before cpu standard sample" << std::endl;
    rmt_BeginCPUSample(StandardCPUSample, 0);
    rmt_EndCPUSample();
    std::cout << "after cpu standard sample" << std::endl;

    std::cout << "before cuda scoped sample" << std::endl;
    {
        rmt_ScopedCUDASample(ScopedCUDASample, stream);
    }
    std::cout << "after cuda scoped sample" << std::endl;

    std::cout << "before cuda standard sample" << std::endl;
    rmt_BeginCUDASample(StandardCUDASample, stream);
    rmt_EndCUDASample(stream);
    std::cout << "after cuda standard sample" << std::endl;

    std::cout << "success" << std::endl;

    rmt_DestroyGlobalInstance(rmt);
}