microsoft / onnxruntime

[Performance] Cannot release GPU memory. #14957

Open t107598066 opened 1 year ago

t107598066 commented 1 year ago

Describe the issue

How can I release all of the GPU memory that ONNX Runtime allocates when a session is created? I tried to release the memory with the two code snippets below, and both behave the same. I set three breakpoints: at const wchar_t* model_path, at g_ort->ReleaseSession(session);, and at return 0;.

The GPU memory does not appear to be fully released.

At const wchar_t* model_path: [screenshot of GPU memory usage]

At g_ort->ReleaseSession(session);: [screenshot of GPU memory usage]

At return 0;: [screenshot of GPU memory usage]
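For a reading that is easier to compare than task-manager screenshots, a minimal sketch of a helper that prints free/total device memory at each breakpoint; print_gpu_mem is a hypothetical name, and this assumes the project links against the CUDA runtime:

#include <cuda_runtime_api.h>
#include <cstdio>

// Hypothetical helper for this repro: print current free/total device memory.
static void print_gpu_mem(const char* tag) {
    size_t free_bytes = 0, total_bytes = 0;
    if (cudaMemGetInfo(&free_bytes, &total_bytes) == cudaSuccess) {
        std::printf("[%s] free: %zu MiB / total: %zu MiB\n",
                    tag, free_bytes >> 20, total_bytes >> 20);
    }
}

// Usage: print_gpu_mem("before CreateSession"); print_gpu_mem("after ReleaseSession"); ...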

To reproduce

Inference with the code below:

#include <onnxruntime_c_api.h>
#include <stdio.h>

const OrtApi* g_ort = NULL;

int main() {

    const wchar_t* model_path = L"C:/Users/user/Desktop/en/Project1/model.onnx";

    const OrtApiBase* ptr_api_base = OrtGetApiBase();
    g_ort = ptr_api_base->GetApi(ORT_API_VERSION);

    printf("Using Onnxruntime C++ API\n");

    OrtEnv* env;
    g_ort->CreateEnv(ORT_LOGGING_LEVEL_ERROR, "test", &env);
    OrtSessionOptions* session_options;
    g_ort->CreateSessionOptions(&session_options);

    g_ort->DisableCpuMemArena(session_options);
    g_ort->DisableMemPattern(session_options);

    OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0);

    g_ort->DisableCpuMemArena(session_options);
    g_ort->DisableMemPattern(session_options);

    OrtSession* session;
    g_ort->CreateSession(env, model_path, session_options, &session);    

    g_ort->ReleaseSession(session);    
    g_ort->ReleaseSessionOptions(session_options);
    g_ort->ReleaseEnv(env);

    return 0;
}
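A side note on the snippet above: every g_ort-> call returns an OrtStatus* that is silently dropped, so a failure (for example while appending the CUDA provider) would go unnoticed. A minimal sketch of a checking helper; CheckStatus is a hypothetical name, and only documented C API calls are used:

#include <stdio.h>
#include <stdlib.h>

// Hypothetical helper: print the ONNX Runtime error message and abort
// when an API call fails, instead of ignoring the returned status.
static void CheckStatus(OrtStatus* status) {
    if (status != NULL) {
        fprintf(stderr, "ORT error: %s\n", g_ort->GetErrorMessage(status));
        g_ort->ReleaseStatus(status);
        exit(1);
    }
}

// Usage:
// CheckStatus(g_ort->CreateEnv(ORT_LOGGING_LEVEL_ERROR, "test", &env));
// CheckStatus(g_ort->CreateSession(env, model_path, session_options, &session));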

Or, using g_ort->SessionOptionsAppendExecutionProvider_CUDA_V2(session_options, cuda_options):

#include <onnxruntime_c_api.h>
#include <stdio.h>
#include <vector>

const OrtApi* g_ort = NULL;
int main() {

    const wchar_t* model_path = L"C:/Users/user/Desktop/en/Project1/model.onnx";

    const OrtApiBase* ptr_api_base = OrtGetApiBase();
    g_ort = ptr_api_base->GetApi(ORT_API_VERSION);

    printf("Using Onnxruntime C++ API\n");

    OrtEnv* env;
    g_ort->CreateEnv(ORT_LOGGING_LEVEL_ERROR, "test", &env);
    OrtSessionOptions* session_options;
    g_ort->CreateSessionOptions(&session_options);

    g_ort->DisableCpuMemArena(session_options);
    g_ort->DisableMemPattern(session_options);

    OrtCUDAProviderOptionsV2* cuda_options = nullptr;
    g_ort->CreateCUDAProviderOptions(&cuda_options);

    std::vector<const char*> keys{ "device_id", "gpu_mem_limit", "arena_extend_strategy", "cudnn_conv_algo_search", "do_copy_in_default_stream", "cudnn_conv_use_max_workspace", "cudnn_conv1d_pad_to_nc1d" };
    std::vector<const char*> values{ "0", "2147483648", "kSameAsRequested", "DEFAULT", "1", "1", "1" };

    g_ort->UpdateCUDAProviderOptions(cuda_options, keys.data(), values.data(), keys.size());
    g_ort->SessionOptionsAppendExecutionProvider_CUDA_V2(session_options, cuda_options);

    g_ort->DisableCpuMemArena(session_options);
    g_ort->DisableMemPattern(session_options);

    OrtSession* session;
    g_ort->CreateSession(env, model_path, session_options, &session);    

    g_ort->ReleaseSession(session);    
    g_ort->ReleaseCUDAProviderOptions(cuda_options);
    g_ort->ReleaseSessionOptions(session_options);
    g_ort->ReleaseEnv(env);

    return 0;
}
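As an aside, the C++ wrapper API ties release to scope via RAII, which removes the chance of a missed or misordered Release* call. A minimal sketch, assuming an ONNX Runtime version where SessionOptions::AppendExecutionProvider_CUDA_V2 is available in the C++ header (1.13 or later); the model path is a placeholder:

#include <onnxruntime_cxx_api.h>

int main() {
    Ort::Env env(ORT_LOGGING_LEVEL_ERROR, "test");
    Ort::SessionOptions session_options;
    session_options.DisableCpuMemArena();
    session_options.DisableMemPattern();

    OrtCUDAProviderOptionsV2* cuda_options = nullptr;
    Ort::ThrowOnError(Ort::GetApi().CreateCUDAProviderOptions(&cuda_options));
    session_options.AppendExecutionProvider_CUDA_V2(*cuda_options);

    {
        Ort::Session session(env, L"model.onnx", session_options);
        // ... Run(...) here ...
    }  // ~Session() calls ReleaseSession at this brace

    Ort::GetApi().ReleaseCUDAProviderOptions(cuda_options);
    return 0;
}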

Urgency

No response

Platform

Windows

OS Version

Windows 10

ONNX Runtime Installation

Built from Source

ONNX Runtime Version or Commit ID

1.12.0 and 1.14.1

ONNX Runtime API

C++

Architecture

X64

Execution Provider

CUDA

Execution Provider Library Version

No response

Model File

No response

Is this a quantized model?

No

t107598066 commented 1 year ago

I run image prediction with the ONNX Runtime sample code below and watch how the CPU and GPU memory change. Neither the CPU memory nor the GPU memory is entirely released, is it?

Could anyone give me an answer or a suggestion? Thanks.

Debugging at the beginning, before the release calls, and at return: [screenshots of memory usage]

#include <iostream>
#include <string>
#include <onnxruntime_cxx_api.h>
#include <opencv2/opencv.hpp>
#include <ctime>
#include <vector>

const OrtApi* g_ort = NULL;

int test_onnx(
    cv::Mat blob,
    OrtSession* session,
    OrtAllocator* allocator,
    OrtMemoryInfo* memory_info,
    OrtCUDAProviderOptionsV2* cuda_options,
    std::vector<const char*> input_node_names,
    std::vector<const char*> output_node_names,
    int model_height,
    int model_width) {

    std::vector<int64_t> input_node_dims = { 1, 3, model_height, model_width };

    OrtValue* input_tensor = NULL;
    OrtValue* output_tensor = NULL;

    g_ort->CreateTensorWithDataAsOrtValue(memory_info, blob.ptr<float>(), blob.total() * sizeof(float), input_node_dims.data(), input_node_dims.size(), ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, &input_tensor);

    int startTime = clock();

    // g_ort->Run returns an OrtStatus*, not the output tensors; the output is
    // written through &output_tensor. NULL selects the default run options.
    OrtStatus* run_status = g_ort->Run(
        session,
        NULL,
        input_node_names.data(),
        (const OrtValue* const*)&input_tensor,
        input_node_names.size(),
        output_node_names.data(),
        output_node_names.size(),
        &output_tensor);
    if (run_status != NULL) g_ort->ReleaseStatus(run_status);  // should be checked, not ignored

    int detect_time = clock() - startTime;
    std::cout << detect_time << std::endl;

    float* floatarr;
    g_ort->GetTensorMutableData(output_tensor, (void**)&floatarr);    

    /*
    cv::Mat mask = cv::Mat::zeros(input_node_dims[2], input_node_dims[3], CV_32F);    

    for (int i = 0; i < input_node_dims[2]; i++) {
        for (int j = 0; j < input_node_dims[3]; j++) {
            mask.at<float>(i, j) = floatarr[i * input_node_dims[3] + j];  // row stride is the image width (dims[3])
        }
    }
    */

    g_ort->ReleaseValue(output_tensor);
    g_ort->ReleaseValue(input_tensor);
    output_tensor = nullptr;
    input_tensor = nullptr;

    return detect_time;
}

int main() {

    const wchar_t* model_path = L"C:/Users/user/Desktop/en/Project1/model.onnx";

    const OrtApiBase* ptr_api_base = OrtGetApiBase();
    g_ort = ptr_api_base->GetApi(ORT_API_VERSION);    

    printf("Using Onnxruntime C++ API\n");

    OrtEnv* env;
    g_ort->CreateEnv(ORT_LOGGING_LEVEL_ERROR, "test", &env);
    OrtSessionOptions* session_options;
    g_ort->CreateSessionOptions(&session_options);

    OrtCUDAProviderOptionsV2* cuda_options = nullptr;
    g_ort->CreateCUDAProviderOptions(&cuda_options);

    std::vector<const char*> keys{ "device_id", "gpu_mem_limit", "arena_extend_strategy", "cudnn_conv_algo_search", "do_copy_in_default_stream", "cudnn_conv_use_max_workspace", "cudnn_conv1d_pad_to_nc1d" };
    std::vector<const char*> values{ "0", "2147483648", "kSameAsRequested", "DEFAULT", "1", "1", "1" };

    g_ort->UpdateCUDAProviderOptions(cuda_options, keys.data(), values.data(), keys.size());
    g_ort->SessionOptionsAppendExecutionProvider_CUDA_V2(session_options, cuda_options);

    OrtSession* session;
    g_ort->CreateSession(env, model_path, session_options, &session);

    // Note: session options are baked in at CreateSession time, so these two
    // calls have no effect on the session created above.
    g_ort->DisableCpuMemArena(session_options);
    g_ort->DisableMemPattern(session_options);

    OrtAllocator* allocator;
    g_ort->GetAllocatorWithDefaultOptions(&allocator);

    OrtMemoryInfo* memory_info;
    g_ort->CreateCpuMemoryInfo(OrtArenaAllocator, OrtMemTypeDefault, &memory_info);

    OrtTypeInfo* inputTypeInfo = nullptr;
    OrtStatus* status = g_ort->SessionGetInputTypeInfo(session, 0, &inputTypeInfo);
    const OrtTensorTypeAndShapeInfo* inputTensorInfo;
    g_ort->CastTypeInfoToTensorInfo(inputTypeInfo, &inputTensorInfo);

    size_t numDims = 0;
    g_ort->GetDimensionsCount(inputTensorInfo, &numDims);
    std::vector<int64_t> inputNodeDims = std::vector<int64_t>(numDims);
    g_ort->GetDimensions(inputTensorInfo, inputNodeDims.data(), numDims);

    int model_height = static_cast<int>(inputNodeDims[2]);
    int model_width = static_cast<int>(inputNodeDims[3]);

    g_ort->ReleaseTypeInfo(inputTypeInfo);

    long long t = 0;

    int start = clock();

    char* input_name = nullptr;
    char* output_name = nullptr;

    g_ort->SessionGetInputName(session, 0, allocator, &input_name);
    g_ort->SessionGetOutputName(session, 0, allocator, &output_name);

    std::vector<const char*> input_node_names = { input_name };
    std::vector<const char*> output_node_names = { output_name };

    for (int i = 1801; i < 1810; i++) {       
        std::string path = "C:/Users/user/Desktop/en/Project1/rcm.bmp";

        cv::Mat image = cv::imread(path);
        cv::resize(image, image, cv::Size(1024, 1024));
        cv::Mat normalized_image = normalize(image);     // user-defined helper, not shown in this report
        cv::Mat blob = blobFromImage(normalized_image);  // user-defined helper, not shown (cv::dnn::blobFromImage is the usual equivalent)

        int per_detect_time = test_onnx(
            blob, 
            session, 
            allocator, 
            memory_info, 
            cuda_options, 
            input_node_names,
            output_node_names,
            model_height,
            model_width);
    }

    g_ort->ReleaseMemoryInfo(memory_info);
    allocator->Free(allocator, input_name);
    allocator->Free(allocator, output_name);

    g_ort->ReleaseCUDAProviderOptions(cuda_options);
    g_ort->ReleaseSession(session);
    g_ort->ReleaseSessionOptions(session_options);
    g_ort->ReleaseEnv(env);

    printf("Done");

    return 0;
}
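One note on the Run loop above: with the CUDA EP, activation memory is held in ONNX Runtime's BFC arena rather than handed back to the driver after each Run. If the goal is to return VRAM between runs, the documented run-option key memory.enable_memory_arena_shrinkage shrinks the arena at the end of a Run; it requires arena_extend_strategy = kSameAsRequested, which this repro already sets. A minimal sketch:

    // Hedged sketch: ask ORT to shrink the device-0 arena at the end of each Run.
    OrtRunOptions* run_options = NULL;
    g_ort->CreateRunOptions(&run_options);
    // "gpu:0" targets the device-0 arena; "cpu:0;gpu:0" would shrink both.
    g_ort->AddRunConfigEntry(run_options,
                             "memory.enable_memory_arena_shrinkage", "gpu:0");
    // ... pass run_options as the second argument of g_ort->Run(...) ...
    g_ort->ReleaseRunOptions(run_options);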
rytisaugustauskas commented 1 year ago

I can see similar behavior [only related to CUDA]. After releasing the model, some memory stays allocated in RAM. However, reloading the model multiple times does not allocate more memory (it appears to reuse the same allocation). Note that the same model's memory is managed nicely when it is loaded for CPU inference.

Still, it would be nice to know if it is possible to do the full cleanup in CUDA inference.

I am using the C++ API.

rytisaugustauskas commented 1 year ago

Seems to be fixed here: #15040

UNeedCryDear commented 1 year ago

> Seems to be fixed here: #15040

There is still some GPU memory (VRAM) that is not released after g_ort->ReleaseSession(session); it is not freed until the program exits. Do you know how to fully release the GPU memory inside main()?

ReverseSystem001 commented 1 year ago

> > Seems to be fixed here: #15040
>
> There is still some GPU memory (VRAM) that is not released after g_ort->ReleaseSession(session); it is not freed until the program exits. Do you know how to fully release the GPU memory inside main()?

How did you solve this problem? Hoping for your reply.
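For what it is worth, part of the VRAM that stays resident after ReleaseSession belongs to the CUDA primary context and the cuDNN/cuBLAS handles, which ONNX Runtime does not tear down; they normally live until the process exits. A heavily hedged sketch, assuming nothing else in the process touches CUDA after the last session is released:

#include <cuda_runtime_api.h>

// Assumption: every OrtSession that used the CUDA EP has been released,
// and no code in this process will use the device again afterwards.
g_ort->ReleaseSession(session);
g_ort->ReleaseSessionOptions(session_options);
g_ort->ReleaseEnv(env);
cudaDeviceReset();  // destroys the primary context and frees its VRAM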