ccccwb commented 1 year ago

你好，这是我在NX上测试的结果，我发现在解码yolo预测结果时，消耗了非常多时间，和推理时间完全不在一个量级，请问是什么原因造成的呢？

ccccwb commented 1 year ago

我把解码部分用CUDA重写，并行解码检测框，现在在JetsonNX上解码时间可以缩短到5ms

ccccwb commented 1 year ago

文件名：decodeTensorCUDA.cu 代码：

#include "decodeTensorCUDA.h"

// Decodes one YOLO output tensor into a flat array of candidate boxes.
//
// Launch layout: 2D grid in which thread (x, y) handles grid cell (column x,
// row y). The launch must cover at least grid_w x grid_h threads; surplus
// threads exit immediately. No shared memory is used.
//
// Input layout (as indexed below): channel-major planes of grid_h * grid_w
// floats. The value for anchor box b, channel c, cell `cell` lives at
//     detections[cell + numGridCells * (b * (5 + numClasses) + c)]
// with channels 0..3 = x, y, w, h, channel 4 = objectness and channels 5.. =
// per-class scores. Values are consumed as-is; no activation is applied here.
//
// Output layout: 6 floats per (cell, box) record, records of one cell stored
// back to back:
//     boxes[6 * (cell * numBBoxes + b) + k]
//     k = 0: x + tx, 1: y + ty, 2: pw * tw, 3: ph * th,
//         4: objectness * best class score, 5: best class index as float
//            (-1 when no class score is greater than 0).
// `boxes` must therefore hold 6 * grid_h * grid_w * numBBoxes floats.
//
// masks[b] selects the anchor pair (anchors[2*m], anchors[2*m + 1]) for box b.
__global__ void decodeTensorKernel(
    const float* __restrict__ detections,
    const uint32_t* __restrict__ masks,
    const float* __restrict__ anchors,
    float* __restrict__ boxes,
    uint32_t grid_h,
    uint32_t grid_w,
    uint32_t numClasses,
    uint32_t numBBoxes)
{
    // Grid-cell coordinates handled by this thread.
    const uint32_t y = blockIdx.y * blockDim.y + threadIdx.y;
    const uint32_t x = blockIdx.x * blockDim.x + threadIdx.x;
    if (y >= grid_h || x >= grid_w) return;

    const uint32_t kFieldsPerBox = 6;  // x, y, w, h, score, class index
    const uint32_t numGridCells = grid_h * grid_w;
    const uint32_t channelsPerBox = 5 + numClasses;
    const uint32_t bbindex = y * grid_w + x;

    // The per-cell stride is derived from numBBoxes instead of a fixed 18, so
    // the writes stay inside the 6 * cells * numBBoxes buffer for any box count.
    float* cellOut = boxes + static_cast<size_t>(kFieldsPerBox) * numBBoxes * bbindex;

    for (uint32_t b = 0; b < numBBoxes; ++b)
    {
        const float pw = anchors[masks[b] * 2];
        const float ph = anchors[masks[b] * 2 + 1];

        // First channel of box b for this cell; channel c is at det[c * numGridCells].
        const float* det = detections + bbindex
                         + static_cast<size_t>(numGridCells) * (b * channelsPerBox);
        float* out = cellOut + kFieldsPerBox * b;

        out[0] = x + det[0 * numGridCells];
        out[1] = y + det[1 * numGridCells];
        out[2] = pw * det[2 * numGridCells];
        out[3] = ph * det[3 * numGridCells];

        const float objectness = det[4 * numGridCells];

        // Arg-max over the class scores; only strictly positive scores can win.
        float maxProb = 0.0f;
        int maxIndex = -1;
        for (uint32_t i = 0; i < numClasses; ++i)
        {
            const float prob = det[static_cast<size_t>(5 + i) * numGridCells];
            if (prob > maxProb)
            {
                maxProb = prob;
                maxIndex = static_cast<int>(i);
            }
        }

        out[4] = objectness * maxProb;
        out[5] = static_cast<float>(maxIndex);
    }
}

// Decodes the raw network output of one image on the GPU.
//
// Copies the detections for image `imageIdx` (taken from tensor.hostBuffer at
// offset imageIdx * tensor.volume), the anchor masks and the anchors to the
// device, runs decodeTensorKernel over the tensor.grid_w x tensor.grid_h grid
// and copies the decoded boxes back to the host.
//
// Returns a malloc'd host buffer of 6 * grid_h * grid_w * numBBoxes floats
// (x, y, w, h, score, class index per box). The caller owns the buffer and
// must release it with free(). Returns nullptr if a host allocation or any
// CUDA call fails; nothing is leaked in that case.
//
// Uses the default stream and blocks until the result is on the host.
float* decodeTensorCUDA(const int imageIdx, const TensorInfo& tensor)
{
    const uint32_t grid_h = tensor.grid_h;
    const uint32_t grid_w = tensor.grid_w;
    const uint32_t numClasses = tensor.numClasses;
    const uint32_t numBBoxes = tensor.numBBoxes;

    // Byte counts are computed in size_t so large tensors cannot overflow int.
    const size_t numCells = static_cast<size_t>(grid_h) * grid_w;
    const size_t boxes_bytes = 6 * sizeof(float) * numCells * numBBoxes;  // x y w h maxProb maxIndex
    const size_t detections_bytes = sizeof(float) * numCells * (5 + numClasses) * numBBoxes;
    const size_t masks_bytes = sizeof(uint32_t) * numBBoxes;
    const size_t anchors_bytes = sizeof(float) * tensor.anchors.size();

    // Host result buffer. Always request at least one float so an empty
    // tensor still yields a non-null pointer the caller can free().
    float* boxes = static_cast<float*>(malloc(boxes_bytes > 0 ? boxes_bytes : sizeof(float)));
    if (boxes == nullptr) return nullptr;
    if (boxes_bytes == 0) return boxes;  // nothing to decode, skip the launch

    const float* detections = &tensor.hostBuffer[imageIdx * tensor.volume];

    float* d_detections = nullptr;
    uint32_t* d_masks = nullptr;
    float* d_anchors = nullptr;
    float* d_boxes = nullptr;

    // Device allocations and uploads. The && chain stops at the first failure;
    // pointers that were never allocated stay null and are safe to cudaFree.
    bool ok =
           cudaMalloc((void**) &d_detections, detections_bytes) == cudaSuccess
        && cudaMalloc((void**) &d_masks, masks_bytes) == cudaSuccess
        && cudaMalloc((void**) &d_anchors, anchors_bytes) == cudaSuccess
        && cudaMalloc((void**) &d_boxes, boxes_bytes) == cudaSuccess
        && cudaMemcpy(d_detections, detections, detections_bytes, cudaMemcpyHostToDevice) == cudaSuccess
        && cudaMemcpy(d_masks, &tensor.masks[0], masks_bytes, cudaMemcpyHostToDevice) == cudaSuccess
        && cudaMemcpy(d_anchors, &tensor.anchors[0], anchors_bytes, cudaMemcpyHostToDevice) == cudaSuccess;

    if (ok)
    {
        // 16 x 16 = 256 threads per block (a multiple of the warp size);
        // ceil-div so the grid covers every cell without a spare block.
        const dim3 threads_per_block(16, 16);
        const dim3 number_of_blocks((grid_w + threads_per_block.x - 1) / threads_per_block.x,
                                    (grid_h + threads_per_block.y - 1) / threads_per_block.y);

        decodeTensorKernel<<<number_of_blocks, threads_per_block>>>(
            d_detections, d_masks, d_anchors, d_boxes, grid_h, grid_w, numClasses, numBBoxes);

        // cudaGetLastError reports launch-configuration errors. The blocking
        // copy then waits for the kernel and reports any execution fault.
        ok = cudaGetLastError() == cudaSuccess
          && cudaMemcpy(boxes, d_boxes, boxes_bytes, cudaMemcpyDeviceToHost) == cudaSuccess;
    }

    // Release device memory on both the success and the failure path.
    cudaFree(d_detections);
    cudaFree(d_masks);
    cudaFree(d_anchors);
    cudaFree(d_boxes);

    if (!ok)
    {
        free(boxes);
        return nullptr;
    }
    return boxes;
}

ccccwb commented 1 year ago

文件名 decodeTensorCUDA.h 代码：

ifndef DECODETENSORCUDAH

define DECODETENSORCUDAH

include "yolo.h"

float* decodeTensorCUDA(const int imageIdx, const TensorInfo& tensor);

endif

ccccwb commented 1 year ago

文件名：yolov5.cpp 代码：

#include "yolov5.h"

#include <cstdlib>

#include "decodeTensorCUDA.h"

YoloV5::YoloV5( const NetworkInfo &networkinfo, const InferParams &inferparams) : Yolo( networkinfo, inferparams) {} std::vector YoloV5::decodeTensor(const int imageIdx, const int imageH, const int imageW, const TensorInfo& tensor) { float scale_h = 1.f; float scale_w = 1.f; int xOffset = 0; int yOffset = 0; calcuate_letterbox_message(m_InputH, m_InputW, imageH, imageW, scale_h, scale_w, xOffset, yOffset); std::vector binfo;

// 把所有框的 坐标， maxProb 和 maxIndex 用cuda计算后放到 boxes 这个指针中
float* boxes = decodeTensorCUDA(imageIdx, tensor);

// 遍历 boxes， 没想到怎么把这几个循环给去掉，但也挺快了，就不管了
for (uint32_t y = 0; y < tensor.grid_h; ++y)
{
    for (uint32_t x = 0; x < tensor.grid_w; ++x)
    {
        for (uint32_t b = 0; b < tensor.numBBoxes; ++b)
        {   
            const int bbindex = y * tensor.grid_w+ x;
            const float bx
                = boxes[18 * bbindex + 6*b + 0];

            const float by
                = boxes[18 * bbindex + 6*b + 1];

            const float bw
                = boxes[18 * bbindex + 6*b + 2];

            const float bh
                = boxes[18 * bbindex + 6*b + 3];

            const float maxProb
                = boxes[18 * bbindex + 6*b + 4];

            const int maxIndex 
                = (int) boxes[18 * bbindex + 6*b + 5];
            // std::cout<< bx << "  " << by << "  " << maxProb << "  " << m_ProbThresh << "  " << maxIndex << std::endl;
            if (maxProb > m_ProbThresh)
            {
                add_bbox_proposal(bx, by, bw, bh, tensor.stride_h, tensor.stride_w, scale_h, scale_w, xOffset, yOffset, maxIndex, maxProb, imageW, imageH, binfo);
            }

        }
    }
}
return binfo;

}

Nuzhny007 commented 1 year ago

@ccccwb can you create a pull request with your improvements?

ccccwb commented 1 year ago

@Nuzhny007 Sure, I will open a pull request soon.

ccccwb commented 1 year ago

Hey, I have already created the pull request. @Nuzhny007

enazoe / yolo-tensorrt

检测结果解码时间长 #189

include "decodeTensorCUDA.h"

ifndef DECODETENSORCUDAH

define DECODETENSORCUDAH

include "yolo.h"

endif

include "yolov5.h"

include "decodeTensorCUDA.h"