Open t107598066 opened 1 year ago
I run image inference with the ONNX Runtime sample code below and watch how the CPU and GPU memory change. Neither the CPU nor the GPU memory is entirely released. Could anyone give me an answer or a suggestion? Thanks.
I checked in the debugger at the beginning, before the release calls, and at the return.
#include <iostream>
#include <string>
#include <onnxruntime_cxx_api.h>
#include <opencv2/opencv.hpp>
#include <ctime>
#include <vector>
const OrtApi* g_ort = NULL;
int test_onnx(
    cv::Mat blob,
    OrtSession* session,
    OrtAllocator* allocator,
    OrtMemoryInfo* memory_info,
    OrtCUDAProviderOptionsV2* cuda_options,
    std::vector<const char*> input_node_names,
    std::vector<const char*> output_node_names,
    int model_height,
    int model_width) {
    std::vector<int64_t> input_node_dims = { 1, 3, model_height, model_width };
    OrtValue* input_tensor = NULL;
    OrtValue* output_tensor = NULL;
    g_ort->CreateTensorWithDataAsOrtValue(memory_info, blob.ptr<float>(), blob.total() * sizeof(float), input_node_dims.data(), input_node_dims.size(), ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, &input_tensor);
    int startTime = clock();
    // Run() returns an OrtStatus*, not the output tensors; the output is
    // written through the last argument.
    OrtStatus* run_status = g_ort->Run(
        session,
        nullptr,  // default run options
        input_node_names.data(),
        (const OrtValue* const*)&input_tensor,
        input_node_names.size(),
        output_node_names.data(),
        output_node_names.size(),
        &output_tensor);
    int detect_time = clock() - startTime;
    std::cout << detect_time << std::endl;
    float* floatarr;
    g_ort->GetTensorMutableData(output_tensor, (void**)&floatarr);
    /*
    cv::Mat mask = cv::Mat::zeros(input_node_dims[2], input_node_dims[3], CV_32F);
    for (int i = 0; i < input_node_dims[2]; i++) {
        for (int j = 0; j < input_node_dims[3]; j++) {
            mask.at<float>(i, j) = floatarr[i * input_node_dims[3] + j];  // row stride is the width
        }
    }
    */
    g_ort->ReleaseValue(output_tensor);
    g_ort->ReleaseValue(input_tensor);
    output_tensor = nullptr;
    input_tensor = nullptr;
    return detect_time;
}
int main() {
    const wchar_t* model_path = L"C:/Users/user/Desktop/en/Project1/model.onnx";
    const OrtApiBase* ptr_api_base = OrtGetApiBase();
    g_ort = ptr_api_base->GetApi(ORT_API_VERSION);
    printf("Using Onnxruntime C++ API\n");
    OrtEnv* env;
    g_ort->CreateEnv(ORT_LOGGING_LEVEL_ERROR, "test", &env);
    OrtSessionOptions* session_options;
    g_ort->CreateSessionOptions(&session_options);
    OrtCUDAProviderOptionsV2* cuda_options = nullptr;
    g_ort->CreateCUDAProviderOptions(&cuda_options);
    std::vector<const char*> keys{ "device_id", "gpu_mem_limit", "arena_extend_strategy", "cudnn_conv_algo_search", "do_copy_in_default_stream", "cudnn_conv_use_max_workspace", "cudnn_conv1d_pad_to_nc1d" };
    std::vector<const char*> values{ "0", "2147483648", "kSameAsRequested", "DEFAULT", "1", "1", "1" };
    g_ort->UpdateCUDAProviderOptions(cuda_options, keys.data(), values.data(), keys.size());
    g_ort->SessionOptionsAppendExecutionProvider_CUDA_V2(session_options, cuda_options);
    // Session options must be configured before CreateSession; in the original
    // snippet these two calls came after CreateSession and had no effect.
    g_ort->DisableCpuMemArena(session_options);
    g_ort->DisableMemPattern(session_options);
    OrtSession* session;
    g_ort->CreateSession(env, model_path, session_options, &session);
    OrtAllocator* allocator;
    g_ort->GetAllocatorWithDefaultOptions(&allocator);
    OrtMemoryInfo* memory_info;
    g_ort->CreateCpuMemoryInfo(OrtArenaAllocator, OrtMemTypeDefault, &memory_info);
    OrtTypeInfo* inputTypeInfo = nullptr;
    OrtStatus* status = g_ort->SessionGetInputTypeInfo(session, 0, &inputTypeInfo);
    const OrtTensorTypeAndShapeInfo* inputTensorInfo;
    g_ort->CastTypeInfoToTensorInfo(inputTypeInfo, &inputTensorInfo);
    size_t numDims = 0;
    g_ort->GetDimensionsCount(inputTensorInfo, &numDims);
    std::vector<int64_t> inputNodeDims(numDims);
    g_ort->GetDimensions(inputTensorInfo, inputNodeDims.data(), numDims);
    int model_height = static_cast<int>(inputNodeDims[2]);
    int model_width = static_cast<int>(inputNodeDims[3]);
    g_ort->ReleaseTypeInfo(inputTypeInfo);
    long long t = 0;
    int start = clock();
    char* input_name = nullptr;
    char* output_name = nullptr;
    g_ort->SessionGetInputName(session, 0, allocator, &input_name);
    g_ort->SessionGetOutputName(session, 0, allocator, &output_name);
    std::vector<const char*> input_node_names = { input_name };
    std::vector<const char*> output_node_names = { output_name };
    for (int i = 1801; i < 1810; i++) {
        std::string path = "C:/Users/user/Desktop/en/Project1/rcm.bmp";
        cv::Mat image = cv::imread(path);
        cv::resize(image, image, cv::Size(1024, 1024));
        // Stand-in for the original (unshown) normalize/blobFromImage helpers:
        // cv::dnn::blobFromImage scales to [0,1] and converts HWC -> NCHW.
        cv::Mat blob = cv::dnn::blobFromImage(image, 1.0 / 255.0);
        int per_detect_time = test_onnx(
            blob,
            session,
            allocator,
            memory_info,
            cuda_options,
            input_node_names,
            output_node_names,
            model_height,
            model_width);
        t += per_detect_time;
    }
    g_ort->ReleaseMemoryInfo(memory_info);
    allocator->Free(allocator, input_name);
    allocator->Free(allocator, output_name);
    g_ort->ReleaseCUDAProviderOptions(cuda_options);
    g_ort->ReleaseSession(session);
    g_ort->ReleaseSessionOptions(session_options);
    g_ort->ReleaseEnv(env);
    printf("Done");
    return 0;
}
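One side note on the snippet: every OrtStatus* the API returns is discarded, so a failing call (for example, a failing CreateSession) would go unnoticed while debugging the memory behavior. A minimal helper could look like the following, where CheckStatus is a hypothetical name, not part of the original code:

    // Hypothetical helper: print the ORT error message and abort on failure.
    // Needs <cstdlib> for std::exit.
    void CheckStatus(OrtStatus* status) {
        if (status != nullptr) {
            std::cerr << "ORT error: " << g_ort->GetErrorMessage(status) << std::endl;
            g_ort->ReleaseStatus(status);
            std::exit(1);
        }
    }
    // usage: CheckStatus(g_ort->CreateSession(env, model_path, session_options, &session));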
I can see similar behavior [only related to CUDA]. After releasing the model, some memory remains allocated in RAM, although reloading the model multiple times does not allocate memory again (it seems to reuse the same allocation). Note that the same model's memory is managed nicely when loaded for CPU inference.
Still, it would be nice to know whether a full cleanup is possible for CUDA inference.
I am using the C++ API.
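One thing that may help (a minimal sketch, assuming ONNX Runtime 1.9 or newer): the CUDA memory lives in a BFC arena that caches freed blocks for reuse, and the documented run option "memory.enable_memory_arena_shrinkage" asks the arena to return unused chunks to the device after a Run call. Shrinkage only deallocates whole chunks, so it pairs best with arena_extend_strategy = kSameAsRequested, which the snippet above already sets. With the C API:

    OrtRunOptions* run_options = NULL;
    g_ort->CreateRunOptions(&run_options);
    // "gpu:0" targets the CUDA arena on device 0 ("cpu:0" would target the CPU arena).
    g_ort->AddRunConfigEntry(run_options, "memory.enable_memory_arena_shrinkage", "gpu:0");
    g_ort->Run(session, run_options, input_node_names.data(),
               (const OrtValue* const*)&input_tensor, input_node_names.size(),
               output_node_names.data(), output_node_names.size(), &output_tensor);
    g_ort->ReleaseRunOptions(run_options);

The C++ wrapper exposes the same option through Ort::RunOptions::AddConfigEntry. Note that this shrinks the arena between runs; it does not remove the CUDA context itself.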
Seems to be fixed here: #15040
There is still some GPU memory (VRAM) that is not released after "g_ort->ReleaseSession(session)". It is not released until the program exits. Do you know how to fully release the GPU memory inside main()?
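As far as I know, the VRAM that remains after g_ort->ReleaseSession(session) is mostly the CUDA primary context plus the cuDNN/cuBLAS handles, which the driver keeps until the process exits. A last-resort sketch, only valid after every ORT object has been released and assuming nothing else in the process still uses CUDA:

    #include <cuda_runtime_api.h>  // link against cudart
    // ... release session, session options, env, etc. first ...
    g_ort->ReleaseEnv(env);
    cudaDeviceReset();  // destroys the primary context, returning its VRAM to the driver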
How did you solve this problem? Hoping for your reply.
Describe the issue
How can the GPU memory allocated when an ONNX Runtime session is created be released entirely? I tried to release the memory with the two code variants below, and both behave the same. I set three breakpoints: at const wchar_t* model_path, at g_ort->ReleaseSession(session);, and at return 0;.
The memory does not appear to be fully cleared.
[screenshot: memory usage at the const wchar_t* model_path breakpoint]
[screenshot: memory usage at the g_ort->ReleaseSession(session); breakpoint]
[screenshot: memory usage at the return 0; breakpoint]
To reproduce
Run inference with the sample code above, or append the CUDA execution provider with g_ort->SessionOptionsAppendExecutionProvider_CUDA_V2(session_options, cuda_options);
Urgency
No response
Platform
Windows
OS Version
Windows 10
ONNX Runtime Installation
Built from Source
ONNX Runtime Version or Commit ID
1.12.0 and 1.14.1
ONNX Runtime API
C++
Architecture
X64
Execution Provider
CUDA
Execution Provider Library Version
No response
Model File
No response
Is this a quantized model?
No