FeiYull / TensorRT-Alpha

🔥🔥🔥TensorRT for YOLOv8、YOLOv8-Pose、YOLOv8-Seg、YOLOv8-Cls、YOLOv7、YOLOv6、YOLOv5、YOLONAS......🚀🚀🚀CUDA IS ALL YOU NEED.🍎🍎🍎
GNU General Public License v2.0
1.3k stars 201 forks source link

u2net 如何改成3通道的输出 #78

Closed a819411321 closed 8 months ago

a819411321 commented 8 months ago

第一处修改

/**
 * Builds a U2NET instance from the given parameters.
 *
 * Stores a copy of `param` in `m_param`, resets every buffer pointer to nullptr and
 * then allocates the preprocessing buffers on the device plus the (3-channel)
 * post-processing buffers on the device and on the host.
 *
 * @param param  Runtime configuration; this constructor reads `batch_size`,
 *               `src_h`, `src_w`, `dst_h` and `dst_w` from it.
 *
 * Every cudaMalloc is wrapped in checkRuntime. `m_output_src_device` is only reset
 * here, never allocated.
 * NOTE(review): presumably it is allocated elsewhere once the engine output shape
 * is known; confirm against the rest of the class.
 */
u2net::U2NET::U2NET(const utils::InitParameter& param) : m_param(param)
{
    // All element counts are computed in size_t so that the
    // batch * channels * height * width products cannot overflow int.
    const size_t channels = 3;
    const size_t batch    = static_cast<size_t>(param.batch_size);
    const size_t src_area = static_cast<size_t>(param.src_h) * static_cast<size_t>(param.src_w);
    // Fixed: the original used dst_h * dst_h, which under-allocates whenever dst_w > dst_h.
    const size_t dst_area = static_cast<size_t>(param.dst_h) * static_cast<size_t>(param.dst_w);

    const size_t src_bytes = batch * channels * src_area * sizeof(float);
    const size_t dst_bytes = batch * channels * dst_area * sizeof(float);

    // input
    m_input_src_device = nullptr;
    m_input_rgb_device = nullptr;
    m_input_resize_device = nullptr;
    m_input_norm_device = nullptr;
    m_input_hwc_device = nullptr;
    m_max_val_device = nullptr;
    m_min_val_device = nullptr;
    checkRuntime(cudaMalloc(&m_input_src_device,    src_bytes));
    checkRuntime(cudaMalloc(&m_input_rgb_device,    src_bytes));
    checkRuntime(cudaMalloc(&m_input_resize_device, dst_bytes));
    checkRuntime(cudaMalloc(&m_input_norm_device,   dst_bytes));
    checkRuntime(cudaMalloc(&m_input_hwc_device,    dst_bytes));
    // One min value and one max value per image in the batch.
    checkRuntime(cudaMalloc(&m_max_val_device,      batch * sizeof(float)));
    checkRuntime(cudaMalloc(&m_min_val_device,      batch * sizeof(float)));

    // output
    m_output_src_device = nullptr;
    m_output_resize_device = nullptr;
    m_output_resize_host = nullptr;
    m_output_mask_host = nullptr;
    // The output buffers are three times the single-channel size because the
    // network output has 3 channels.
    checkRuntime(cudaMalloc(&m_output_resize_device, src_bytes));
    m_output_resize_host = new float[batch * channels * src_area];
    // Holds a single 3-channel image (no batch factor), as in the original code.
    m_output_mask_host = new float[channels * src_area];
}

第二处修改:因为我的网络输出是 1x3x320x320,输出的图片是 320x320x3,所以不需要 resize。我把 resize 函数注释掉了,并将 m_output_src_device 的首地址赋值给 m_output_resize_device。

/**
 * Post-processes the 3-channel network output in place on the device.
 *
 * For every image in `imgsBatch` the minimum and maximum of its output values are
 * located with thrust::minmax_element and stored in `m_min_val_device` /
 * `m_max_val_device`. `u2netNormPredDevice` is then called with those per-image
 * extrema and a scale of 255, and the result is copied into
 * `m_output_resize_device`.
 *
 * @param imgsBatch  The source images of the current batch; only its size is used
 *                   here (number of images to scan for min/max).
 *
 * NOTE(review): the output is treated as one contiguous run of
 * 3 * dst_h * dst_w floats per image. If the engine emits planar CHW data
 * (1x3xHxW) and the consumer expects interleaved HWC (HxWx3), a layout
 * conversion is still required downstream; confirm against the caller.
 */
void u2net::U2NET::postprocess(const std::vector<cv::Mat>& imgsBatch)
{
    // The network output has 3 channels, so one image spans 3 * H * W floats.
    // The original scanned and advanced by H * W only, which looked at just the
    // first plane and pointed into the middle of an image for batch > 1.
    const size_t output_channels = 3;
    const size_t elems_per_image = output_channels
        * static_cast<size_t>(m_param.dst_h) * static_cast<size_t>(m_param.dst_w);

    float* p_tmp = m_output_src_device;
    float* p_max = m_max_val_device;
    float* p_min = m_min_val_device;
    for (size_t i = 0; i < imgsBatch.size(); i++)
    {
        thrust::pair<float*, float*> min_max_dev = thrust::minmax_element(thrust::device, p_tmp, p_tmp + elems_per_image);
        // The extrema live in device memory, so they are copied device-to-device.
        checkRuntime(cudaMemcpy(p_min++, min_max_dev.first, sizeof(float), cudaMemcpyDeviceToDevice));
        checkRuntime(cudaMemcpy(p_max++, min_max_dev.second, sizeof(float), cudaMemcpyDeviceToDevice));
        p_tmp += elems_per_image;
    }
    u2netNormPredDevice(m_param.batch_size, m_output_src_device, m_param.dst_w, m_param.dst_h, 255.f, m_min_val_device, m_max_val_device);

    // No resize is performed for the 3-channel output. Copy the data instead of
    // re-pointing m_output_resize_device at m_output_src_device: overwriting the
    // pointer would leak the buffer allocated for it in the constructor and make
    // two members own the same allocation.
    // Precondition: src_h * src_w >= dst_h * dst_w, because the destination buffer
    // is sized from the source dimensions (true when no resize is needed).
    const size_t copy_bytes = static_cast<size_t>(m_param.batch_size) * elems_per_image * sizeof(float);
    checkRuntime(cudaMemcpy(m_output_resize_device, m_output_src_device, copy_bytes, cudaMemcpyDeviceToDevice));
    //resizeDevice(m_param.batch_size, m_output_src_device, m_param.dst_w, m_param.dst_h,
    //    m_output_resize_device, m_param.src_w, m_param.src_h, utils::ColorMode::GRAY, m_src2dst);
}

第三处修改:网络输出结果需要乘以 255。原先输出是单通道,我参考前处理中 u2netDivMaxDevice 的修改方法,将 u2netNormPredDevice 函数的 grid_size 和 img_area 都乘以 3,但是结果不对。

/**
 * Host-side launcher for u2net_norm_pred_device_kernel on a 3-channel prediction.
 *
 * Launch layout: 2D blocks of BLOCK_SIZE x BLOCK_SIZE threads; grid.x is sized to
 * cover the 3 * srcHeight * srcWidth values of one image and grid.y is sized to
 * cover the batch. The kernel is enqueued on the default stream (nullptr) with no
 * dynamic shared memory.
 *
 * @param batchSize  Number of images in `src`.
 * @param src        Buffer passed to the kernel as the data to process.
 *                   NOTE(review): presumably a device pointer holding
 *                   batchSize * 3 * srcHeight * srcWidth floats; confirm.
 * @param srcWidth   Width of one image plane.
 * @param srcHeight  Height of one image plane.
 * @param scale      Scale factor forwarded to the kernel.
 * @param minVals    Per-image minimum values forwarded to the kernel.
 * @param maxVals    Per-image maximum values forwarded to the kernel.
 *
 * The call returns without launching anything when the batch or the image is empty.
 */
void u2netNormPredDevice(const int& batchSize, float* src, int srcWidth, int srcHeight, float scale, float* minVals, float* maxVals)
{
    // A zero-sized grid is an invalid launch configuration, so empty work is
    // rejected up front instead of being handed to the runtime.
    if (batchSize <= 0 || srcWidth <= 0 || srcHeight <= 0)
    {
        return;
    }

    // Number of channels in the prediction; kept in one place so the grid size
    // and the per-image element count cannot drift apart.
    const int channels = 3;
    const int img_area = srcHeight * srcWidth * channels;

    dim3 block_size(BLOCK_SIZE, BLOCK_SIZE);
    // Ceil-divide so the tail of the image / batch is still covered.
    dim3 grid_size((img_area + BLOCK_SIZE - 1) / BLOCK_SIZE,
        (batchSize + BLOCK_SIZE - 1) / BLOCK_SIZE);

    u2net_norm_pred_device_kernel << < grid_size, block_size, 0, nullptr >> > (batchSize, src, srcHeight, srcWidth, img_area, scale, minVals, maxVals);
}

但是结果仍然不对