NVIDIA / cudnn-frontend

cudnn_frontend provides a C++ wrapper for the cuDNN backend API, along with samples showing how to use it
MIT License
423 stars 85 forks source link

cudnn-frontend crashes in case of MAX_OPGRAPH_OPS violation #108

Open gritukan opened 2 weeks ago

gritukan commented 2 weeks ago

Describe the bug: Consider a graph with more than MAX_OPGRAPH_OPS nodes, for example the one built in this code

#include <cstdlib>
#include <iostream>
#include <memory>
#include <unordered_map>

#include "cudnn-frontend/include/cudnn_frontend.h"

namespace fe = cudnn_frontend;

int main()
{
    cudnnHandle_t handle;
    assert(cudnnCreate(&handle) == CUDNN_STATUS_SUCCESS);

    auto graph = std::make_shared<fe::graph::Graph>();
    auto x = graph->tensor(
        fe::graph::Tensor_attributes()
            .set_name("x")
            .set_dim({1, 1, 1})
            .set_stride({1, 1, 1})
            .set_data_type(fe::DataType_t::FLOAT));
    auto y = graph->tensor(
        fe::graph::Tensor_attributes()
            .set_name("y")
            .set_dim({1, 1, 1})
            .set_stride({1, 1, 1})
            .set_data_type(fe::DataType_t::FLOAT));

    auto inX = x;
    auto inY = y;

    for (int i = 0; i < 60; ++i) {
        auto sum = graph->pointwise(
            x,
            y,
            fe::graph::Pointwise_attributes()
                .set_mode(fe::PointwiseMode_t::ADD)
                .set_compute_data_type(fe::DataType_t::FLOAT));
        sum->set_data_type(fe::DataType_t::FLOAT);

        x = y;
        y = sum;
    }

    y->set_output(true);

    assert(graph->validate().is_good());
    auto r = graph->build_operation_graph(handle);
    if (!r.is_good()) {
        std::cerr << r.get_message() << std::endl;
        return 0;
    }

    assert(graph->create_execution_plans({fe::HeurMode_t::A}).is_good());
    assert(graph->build_plans(handle, fe::BuildPlanPolicy_t::ALL).is_good());

    float One = 1;

    void* inXPtr;
    assert(cudaMalloc(&inXPtr, sizeof(float)) == cudaSuccess);
    assert(cudaMemcpy(inXPtr, &One, sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess);

    void* inYPtr;
    assert(cudaMalloc(&inYPtr, sizeof(float)) == cudaSuccess);
    assert(cudaMemcpy(inYPtr, &One, sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess);

    void* outPtr;
    assert(cudaMalloc(&outPtr, sizeof(float)) == cudaSuccess);

    void* workspacePtr;
    assert(cudaMalloc(&workspacePtr, graph->get_workspace_size()) == cudaSuccess);

    std::unordered_map<std::shared_ptr<fe::graph::Tensor_attributes>, void*> tensorMap;
    tensorMap[inX] = inXPtr;
    tensorMap[inY] = inYPtr;
    tensorMap[y] = outPtr;

    r = graph->execute(handle, tensorMap, workspacePtr);
    if (!r.is_good()) {
        std::cerr << r.get_message() << std::endl;
        return 0;
    }

    assert(cudaDeviceSynchronize() == cudaSuccess);

    float outData;
    assert(cudaMemcpy(&outData, outPtr, sizeof(float), cudaMemcpyDeviceToHost) == cudaSuccess);
    std::cout << outData << std::endl;

    assert(cudaFree(inXPtr) == cudaSuccess);
    assert(cudaFree(inYPtr) == cudaSuccess);
    assert(cudaFree(outPtr) == cudaSuccess);
    assert(cudaFree(workspacePtr) == cudaSuccess);

    assert(cudnnDestroy(handle) == CUDNN_STATUS_SUCCESS);
}

Instead of returning an error status from build_operation_graph, the library crashes at runtime with the following stack trace because of an out-of-bounds access to m_operationGraph.ops.

(gdb) bt
#0  0x00005555555a56d5 in __gnu_cxx::__exchange_and_add (__val=-1, __mem=0xf8) at /usr/include/c++/11/ext/atomicity.h:66
#1  __gnu_cxx::__exchange_and_add_dispatch (__val=-1, __mem=0xf8) at /usr/include/c++/11/ext/atomicity.h:101
#2  std::_Sp_counted_base<(__gnu_cxx::_Lock_policy)2>::_M_release (this=0xf0) at /usr/include/c++/11/bits/shared_ptr_base.h:165
#3  0x0000555555596de3 in std::__shared_count<(__gnu_cxx::_Lock_policy)2>::~__shared_count (this=0x7fffffffcfe8, 
    __in_chrg=<optimized out>) at /usr/include/c++/11/bits/shared_ptr_base.h:705
#4  0x000055555556decc in std::__shared_ptr<cudnn_frontend::OpaqueBackendPointer, (__gnu_cxx::_Lock_policy)2>::~__shared_ptr (
    this=0x7fffffffcfe0, __in_chrg=<optimized out>) at /usr/include/c++/11/bits/shared_ptr_base.h:1154
#5  0x00005555555a75e6 in std::__shared_ptr<cudnn_frontend::OpaqueBackendPointer, (__gnu_cxx::_Lock_policy)2>::operator= (
    this=0x7fffffffd4e8, __r=...) at /usr/include/c++/11/bits/shared_ptr_base.h:1250
#6  0x000055555559883e in std::shared_ptr<cudnn_frontend::OpaqueBackendPointer>::operator= (this=0x7fffffffd4e8, __r=...)
    at /usr/include/c++/11/bits/shared_ptr.h:385
#7  0x0000555555578ef1 in cudnn_frontend::OperationGraphBuilder_v8::setOperationGraph (this=0x7fffffffd170, numOps_=60, 
    ops_=0x555555ab2f80) at cudnn-frontend/include/cudnn_frontend_OperationGraph.h:157
#8  0x0000555555586552 in cudnn_frontend::ICudnn::create_cudnn_operation_graph (this=0x5555562310f8, handle=0x5555557379c0)
    at cudnn-frontend/include/cudnn_frontend/node/../cudnn_interface.h:123
#9  0x0000555555592b85 in cudnn_frontend::graph::Graph::build_operation_graph (this=0x5555562310f0, handle=0x5555557379c0)
    at cudnn-frontend/include/cudnn_frontend/graph_interface.h:265
#10 0x00005555555681d4 in main () at example.cpp:43

Expected behavior

I expect to get an error status in this case, so my code can fall back to another computational engine.

Also, I expect this restriction to be mentioned in the documentation.

Anerudhan commented 3 days ago

Will be addressed in next release.