Closed a819411321 closed 8 months ago
第一处修改
/**
 * Constructs the U2NET wrapper and allocates its pre-/post-processing buffers.
 *
 * Device buffers are allocated with cudaMalloc (each call wrapped in
 * checkRuntime); the two host staging buffers are allocated with new[].
 * m_output_src_device is only reset to nullptr here; this constructor does
 * not allocate it.
 *
 * @param param Initialisation parameters. batch_size, src_h/src_w and
 *              dst_h/dst_w determine the size of every buffer below.
 */
u2net::U2NET::U2NET(const utils::InitParameter& param) : m_param(param)
{
    // Element counts are computed in size_t so that large batch/resolution
    // combinations cannot overflow int arithmetic before reaching cudaMalloc.
    const size_t batch = static_cast<size_t>(param.batch_size);
    const size_t src_elems = 3 * static_cast<size_t>(param.src_h) * static_cast<size_t>(param.src_w);
    // Network-resolution buffers hold dst_h x dst_w pixels. Sizing them as
    // dst_h x dst_h under-allocates whenever dst_w > dst_h.
    const size_t dst_elems = 3 * static_cast<size_t>(param.dst_h) * static_cast<size_t>(param.dst_w);

    // input
    m_input_src_device = nullptr;
    m_input_rgb_device = nullptr;
    m_input_resize_device = nullptr;
    m_input_norm_device = nullptr;
    m_input_hwc_device = nullptr;
    m_max_val_device = nullptr;
    m_min_val_device = nullptr;
    checkRuntime(cudaMalloc(&m_input_src_device, batch * src_elems * sizeof(float)));
    checkRuntime(cudaMalloc(&m_input_rgb_device, batch * src_elems * sizeof(float)));
    checkRuntime(cudaMalloc(&m_input_resize_device, batch * dst_elems * sizeof(float)));
    checkRuntime(cudaMalloc(&m_input_norm_device, batch * dst_elems * sizeof(float)));
    checkRuntime(cudaMalloc(&m_input_hwc_device, batch * dst_elems * sizeof(float)));
    // One min and one max value per image in the batch.
    checkRuntime(cudaMalloc(&m_max_val_device, batch * sizeof(float)));
    checkRuntime(cudaMalloc(&m_min_val_device, batch * sizeof(float)));

    // output
    m_output_src_device = nullptr;
    m_output_resize_device = nullptr;
    m_output_resize_host = nullptr;
    m_output_mask_host = nullptr;
    // The output buffers are sized for 3 channels (output space multiplied by 3).
    checkRuntime(cudaMalloc(&m_output_resize_device, batch * src_elems * sizeof(float)));
    m_output_resize_host = new float[batch * src_elems];
    m_output_mask_host = new float[src_elems];
}
第二处修改:因为我的网络输出是 1x3x320x320,输出的图片是 320x320x3,所以不需要 resize。我就把 resize 函数注释掉了,并将 m_output_src_device 的首地址赋值给 m_output_resize_device。
void u2net::U2NET::postprocess(const std::vector<cv::Mat>& imgsBatch) { float* p_tmp = m_output_src_device; float* p_max = m_max_val_device; float* p_min = m_min_val_device; for (size_t i = 0; i < imgsBatch.size(); i++) { thrust::pair<float*, float*> min_max_dev = thrust::minmax_element(thrust::device, p_tmp, p_tmp + m_param.dst_h * m_param.dst_w); p_tmp += m_param.dst_h * m_param.dst_w; checkRuntime(cudaMemcpy(p_min++, min_max_dev.first, sizeof(float), cudaMemcpyDeviceToDevice)); checkRuntime(cudaMemcpy(p_max++, min_max_dev.second, sizeof(float), cudaMemcpyDeviceToDevice)); } u2netNormPredDevice(m_param.batch_size, m_output_src_device, m_param.dst_w, m_param.dst_h, 255.f, m_min_val_device, m_max_val_device); m_output_resize_device=m_output_src_device //resizeDevice(m_param.batch_size, m_output_src_device, m_param.dst_w, m_param.dst_h, // m_output_resize_device, m_param.src_w, m_param.src_h, utils::ColorMode::GRAY, m_src2dst); }
第三处修改:网络输出结果需要乘以 255。原先是单通道,我参考前处理中 u2netDivMaxDevice 的修改方法,将 u2netNormPredDevice 函数的 grid_size 和 img_area 都乘以 3,但是结果不对。
void u2netNormPredDevice(const int& batchSize, float* src, int srcWidth, int srcHeight, float scale, float* minVals, float* maxVals) { dim3 block_size(BLOCK_SIZE, BLOCK_SIZE); dim3 grid_size((srcWidth * srcHeight * 3 + BLOCK_SIZE - 1) / BLOCK_SIZE, (batchSize + BLOCK_SIZE - 1) / BLOCK_SIZE); int img_area = srcHeight * srcWidth * 3; int img_height = srcHeight; int img_width = srcWidth; u2net_norm_pred_device_kernel << < grid_size, block_size, 0, nullptr >> > (batchSize, src, img_height, img_width, img_area, scale, minVals, maxVals); }
但是结果仍然不对
第一处修改
第二处修改:因为我的网络输出是 1x3x320x320,输出的图片是 320x320x3,所以不需要 resize。我就把 resize 函数注释掉了,并将 m_output_src_device 的首地址赋值给 m_output_resize_device。
第三处修改:网络输出结果需要乘以 255。原先是单通道,我参考前处理中 u2netDivMaxDevice 的修改方法,将 u2netNormPredDevice 函数的 grid_size 和 img_area 都乘以 3,但是结果不对。
但是结果仍然不对