PaddlePaddle / Paddle

PArallel Distributed Deep LEarning: Machine Learning Framework from Industrial Practice (the PaddlePaddle core framework: high-performance single-machine and distributed training and cross-platform deployment for deep learning and machine learning)
http://www.paddlepaddle.org/
Apache License 2.0

Using the Paddle Inference demo, the Python prediction API fills the entire GPU memory, while the C++ API only uses about 500 MB. #27659

Closed · QingYuan-L closed this issue 3 years ago

QingYuan-L commented 4 years ago

1) PaddlePaddle version: 1.8.4, built from source inside the docker-dev image
2) GPU: GTX 1060
3) System environment: Ubuntu 18.04, Python 3.7
4) Inference library source: built normally inside the latest-dev-cuda10.1-cudnn7-gcc82 docker image

import numpy as np
import argparse
import cv2
import time
from PIL import Image

from paddle.fluid.core import AnalysisConfig
from paddle.fluid.core import create_paddle_predictor

from utils import preprocess, draw_bbox

def create_predictor(args):
   config = AnalysisConfig(args.model_file, args.params_file)
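   # Disable feed/fetch ops so the zero-copy tensor API (copy_from_cpu / zero_copy_run below) can be used.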
   config.switch_use_feed_fetch_ops(False)
   config.enable_memory_optim()
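   # Run eligible subgraphs with a TensorRT engine in FP16 (Precision.Half).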
   config.enable_tensorrt_engine(
       max_batch_size=1, min_subgraph_size=5,
       precision_mode=AnalysisConfig.Precision.Half,
       use_static=True, use_calib_mode=False)
   if args.use_gpu:
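      # enable_use_gpu(initial memory pool in MB, device id): 200 MB on GPU 0.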
     config.enable_use_gpu(200, 0)
   else:
      # If MKL-DNN is not enabled, you can set the number of BLAS threads.
      # The thread count should not exceed the number of CPU cores.
     config.set_cpu_math_library_num_threads(4)
     #config.enable_mkldnn()

   predictor = create_paddle_predictor(config)
   return predictor

def run(predictor, img):
  # copy img data to input tensor
  input_names = predictor.get_input_names()
  for i,  name in enumerate(input_names):
    input_tensor = predictor.get_input_tensor(name)
    input_tensor.reshape(img[i].shape)   
    input_tensor.copy_from_cpu(img[i].copy())

  # do the inference
  predictor.zero_copy_run()

  results = []
  # get out data from output tensor
  output_names = predictor.get_output_names()
  for i, name in enumerate(output_names):
    output_tensor = predictor.get_output_tensor(name)
    output_data = output_tensor.copy_to_cpu()
    results.append(output_data)
  return results

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_file", type=str, default="model/__model__", help="Model filename, Specify this when your model is a combined model.")
    parser.add_argument("--params_file", type=str, default="model/__params__", help="Parameter filename, Specify this when your model is a combined model.")
    parser.add_argument("--use_gpu", type=int, default=1, help="Whether use gpu.")
    return parser.parse_args()

if __name__ == '__main__':
  args = parse_args()
  img_name = '1.jpg'
  save_img_name = 'res.jpg'

  im_size = 608
  pred = create_predictor(args)
  img = cv2.imread(img_name)
  data = preprocess(img, im_size)
  im_shape = np.array([im_size, im_size]).reshape((1,2)).astype(np.int32)
  i = 1
  while(1):
      a=time.time()
      result = run(pred, [data, im_shape])
      b = time.time()
      print(b-a)
      print(i)
      i = i+1

(Screenshot taken 2020-09-28 13:43:22)

Using C++: (screenshot)

C++ code:

#include "paddle/include/paddle_inference_api.h"

#include <numeric>
#include <iostream>
#include <memory>
#include <chrono>
#include <ctime>
#include <gflags/gflags.h>
#include <glog/logging.h>

using paddle::AnalysisConfig;
using namespace std;
clock_t startc,endc;

DEFINE_string(model_file, "model/__model__", "Path of the inference model file.");
DEFINE_string(params_file, "model/__params__", "Path of the inference params file.");
DEFINE_int32(batch_size, 1, "Batch size.");
DEFINE_bool(use_gpu, true, "enable gpu");
DEFINE_bool(use_mkldnn, true, "enable mkldnn");
DEFINE_bool(mem_optim, true, "enable memory optimize");

using Time = decltype(std::chrono::high_resolution_clock::now());
Time time() { return std::chrono::high_resolution_clock::now(); };
double time_diff(Time t1, Time t2) {
  typedef std::chrono::microseconds ms;
  auto diff = t2 - t1;
  ms counter = std::chrono::duration_cast<ms>(diff);
  return counter.count() / 1000.0;
}

std::unique_ptr<paddle::PaddlePredictor> CreatePredictor() {
  AnalysisConfig config;
  config.SetModel(FLAGS_model_file,
                    FLAGS_params_file);
  if (FLAGS_use_gpu) {
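    // EnableUseGpu(initial memory pool in MB, device id): 100 MB on GPU 0.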
    config.EnableUseGpu(100, 0);
  }
  if (FLAGS_use_mkldnn) {
    config.EnableMKLDNN();
  }
  // Open the memory optim.
  if (FLAGS_mem_optim) {
    config.EnableMemoryOptim();
  }
  // We use ZeroCopy, so we set config->SwitchUseFeedFetchOps(false)
  config.SwitchUseFeedFetchOps(false);
  return CreatePaddlePredictor(config);
}

void run(paddle::PaddlePredictor *predictor,
         const std::vector<float>& input,
         const std::vector<int>& input_shape, 
         const std::vector<int>& input_im,
         const std::vector<int>& input_im_shape,
         std::vector<float> *out_data) {
  auto input_names = predictor->GetInputNames();
  auto input_img = predictor->GetInputTensor(input_names[0]);
  input_img->Reshape(input_shape);
  input_img->copy_from_cpu(input.data());

  auto input_size = predictor->GetInputTensor(input_names[1]);
  input_size->Reshape(input_im_shape);
  input_size->copy_from_cpu(input_im.data());

  CHECK(predictor->ZeroCopyRun());

  auto output_names = predictor->GetOutputNames();
  // there is only one output of yolov3
  auto output_t = predictor->GetOutputTensor(output_names[0]);
  std::vector<int> output_shape = output_t->shape();
  int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1, std::multiplies<int>());

  out_data->resize(out_num);
  output_t->copy_to_cpu(out_data->data());
}

int main(int argc, char* argv[]) {
  google::ParseCommandLineFlags(&argc, &argv, true);
  auto predictor = CreatePredictor();

  const int height = 608;
  const int width = 608;
  const int channels = 3;
  std::vector<int> input_shape = {FLAGS_batch_size, channels, height, width};
  std::vector<float> input_data(FLAGS_batch_size * channels * height * width, 0);
  for (size_t i = 0; i < input_data.size(); ++i) {
    input_data[i] = i % 255 * 0.13f;
  }
  std::vector<int> input_im_shape = {FLAGS_batch_size, 2};
  std::vector<int> input_im_data(FLAGS_batch_size * 2, 608);

  std::vector<float> out_data;
  while (1) {
    startc = clock();
    run(predictor.get(), input_data, input_shape, input_im_data, input_im_shape, &out_data);
    LOG(INFO) << "output num is " << out_data.size();
    endc = clock();
    double endtime = (double)(endc - startc) / CLOCKS_PER_SEC;
    cout << "Total time:" << endtime * 1000 << "ms" << std::endl;
  }
  return 0;
}
shangzhizhou commented 4 years ago

TensorRT is not used in the C++ code but is used in the Python code. Please test again under the same conditions.
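For reference, a minimal sketch (not from the original report) of how the CreatePredictor() in the C++ code above could be changed to also enable TensorRT, mirroring the Python settings; the function name CreatePredictorWithTrt and the 1 MB workspace size are assumptions:

std::unique_ptr<paddle::PaddlePredictor> CreatePredictorWithTrt() {
  AnalysisConfig config;
  config.SetModel(FLAGS_model_file, FLAGS_params_file);
  config.EnableUseGpu(100, 0);
  // Mirror the Python demo: max batch size 1, min_subgraph_size 5, FP16 precision,
  // use_static = true, use_calib_mode = false; 1 << 20 bytes of TensorRT workspace.
  config.EnableTensorRtEngine(1 << 20, 1, 5,
                              AnalysisConfig::Precision::kHalf,
                              true, false);
  config.EnableMemoryOptim();
  config.SwitchUseFeedFetchOps(false);
  return CreatePaddlePredictor(config);
}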

QingYuan-L commented 4 years ago

(screenshot)

def create_predictor(args):
   config = AnalysisConfig(args.model_file, args.params_file)
   config.switch_use_feed_fetch_ops(False)
   config.enable_memory_optim()
   # config.enable_tensorrt_engine(
   #     max_batch_size=1, min_subgraph_size=5,
   #     precision_mode=AnalysisConfig.Precision.Half,
   #     use_static=True, use_calib_mode=False)
   if args.use_gpu:
     config.enable_use_gpu(200, 0)
   else:
     # If not specific mkldnn, you can set the blas thread.
     # The thread num should not be greater than the number of cores in the CPU.
     config.set_cpu_math_library_num_threads(4)
     #config.enable_mkldnn()

   predictor = create_paddle_predictor(config)
   return predictor

@shangzhizhou TRT has already been commented out (see above); GPU memory usage did not change.

shangzhizhou commented 4 years ago

Run export FLAGS_fraction_of_gpu_memory_to_use=0.1 before launching the Python script. The GPU memory size passed to enable_use_gpu in Python currently does not take effect; a fix has already been scheduled.

paddle-bot-old[bot] commented 3 years ago

Since you haven't replied for more than a year, we have closed this issue/PR. If the problem is not solved or there is a follow-up question, please reopen it at any time and we will continue to follow up.