zhiqwang / yolort

yolort is a runtime stack for yolov5 on specialized accelerators such as tensorrt, libtorch, onnxruntime, tvm and ncnn.
https://zhiqwang.com/yolort
GNU General Public License v3.0
708 stars 153 forks source link

Can't batch infer? #510

Closed yueyue0574 closed 6 months ago

yueyue0574 commented 6 months ago

🐛 Describe the bug

int main() { TRTLogger logger; auto engine_data = load_file("v5s6.nms.1280.4.trt"); auto runtime = make_nvshared(nvinfer1::createInferRuntime(logger)); initLibNvInferPlugins(&logger, ""); auto engine = make_nvshared(runtime->deserializeCudaEngine(engine_data.data(), engine_data.size()));

if (engine == nullptr) { printf("Deserialize cuda engine failed.\n"); runtime->destroy(); return -1; }

printf("Deserialize cuda engine successful.\n");

printf("engine->getNbBindings() %d\n", engine->getNbBindings());

cudaStream_t stream = nullptr; checkRuntime(cudaStreamCreate(&stream)); auto execution_context = make_nvshared(engine->createExecutionContext());

int input_batch = BATCH_SIZE; int input_channel = 3; int input_height = INPUT_H; int input_width = INPUT_W; int input_numel = input_batch * input_channel * input_height * input_width; int input_delta = input_channel * input_height * input_width; float *input_data_host = nullptr; float *input_data_device = nullptr; checkRuntime(cudaMallocHost(&input_data_host, input_numel * sizeof(float))); checkRuntime(cudaMalloc(&input_data_device, input_numel * sizeof(float)));

vector<cv::Mat> images = {cv::imread("0.png"), cv::imread("1.png"), cv::imread("2.png"), cv::imread("3.png")};

for (size_t i = 0; i < images.size(); i++) { // 通过双线性插值对图像进行resize float scale_x = input_width / (float)images[i].cols; float scale_y = input_height / (float)images[i].rows; float scale = std::min(scale_x, scale_y); float i2d[6], d2i[6]; // resize图像,源图像和目标图像几何中心的对齐 i2d[0] = scale; i2d[1] = 0; i2d[2] = (-scale * images[i].cols + input_width + scale - 1) * 0.5; i2d[3] = 0; i2d[4] = scale; i2d[5] = (-scale * images[i].rows + input_height + scale - 1) * 0.5;

cv::Mat m2x3_i2d(2, 3, CV_32F, i2d);           // image to dst(network), 2x3 matrix
cv::Mat m2x3_d2i(2, 3, CV_32F, d2i);           // dst to image, 2x3 matrix
cv::invertAffineTransform(m2x3_i2d, m2x3_d2i); // 计算一个反仿射变换

cv::Mat input_image(input_height, input_width, CV_8UC3);
cv::warpAffine(images[i], input_image, m2x3_i2d, input_image.size(), cv::INTER_LINEAR, cv::BORDER_CONSTANT, cv::Scalar::all(114)); // 对图像做平移缩放旋转变换,可逆
int image_area = input_image.cols * input_image.rows;
unsigned char *pimage = input_image.data;
float *phost_b = input_data_host + input_delta * i + image_area * 0;
float *phost_g = input_data_host + input_delta * i + image_area * 1;
float *phost_r = input_data_host + input_delta * i + image_area * 2;
for (int k = 0; k < image_area; ++k, pimage += 3)
{
  // 注意这里的顺序rgb调换了
  *phost_r++ = pimage[0] / 255.0f;
  *phost_g++ = pimage[1] / 255.0f;
  *phost_b++ = pimage[2] / 255.0f;
}

} checkRuntime(cudaMemcpyAsync(input_data_device, input_data_host, input_numel * sizeof(float), cudaMemcpyHostToDevice, stream));

auto output_num_detections_dims = engine->getBindingDimensions(1); int output_num_detections = output_num_detections_dims.d[1]; auto output_detection_boxes_dims = engine->getBindingDimensions(2); int output_detection_boxes = output_detection_boxes_dims.d[1]; auto output_detection_scores_dims = engine->getBindingDimensions(3); int output_detection_scores = output_detection_scores_dims.d[1]; auto output_detection_classes_dims = engine->getBindingDimensions(4); int output_detection_classes = output_detection_classes_dims.d[1];

int output_numel = input_batch * output_num_detections * output_detection_boxes * output_detection_scores * output_detection_classes;

float *output_data_host = nullptr; float *output_data_device = nullptr; checkRuntime(cudaMallocHost(&output_data_host, sizeof(float) * output_numel)); checkRuntime(cudaMalloc(&output_data_device, sizeof(float) * output_numel));

// 明确当前推理时,使用的数据输入大小 auto input_dims = engine->getBindingDimensions(0); input_dims.d[0] = input_batch;

execution_context->setBindingDimensions(0, input_dims); float *bindings[] = {input_data_device, output_data_device}; bool status = execution_context->enqueueV2((void **)bindings, stream, nullptr); if (!status) std::cout << "Something is wrong in inference!\n"; std::cout << status << std::endl; checkRuntime(cudaMemcpyAsync(output_data_host, output_data_device, sizeof(float) * output_numel, cudaMemcpyDeviceToHost, stream)); checkRuntime(cudaStreamSynchronize(stream));

return 0; } when i using bool status = execution_context->enqueueV2((void **)bindings, stream, nullptr);

error: 2: [pluginV2DynamicExtRunner.cpp::nvinfer1::rt::cuda::PluginV2DynamicExtRunner::execute::115] Error Code 2: Internal Error (Assertion status == kSTATUS_SUCCESS failed. )

Versions

master