Open hacktmz opened 3 years ago
然后官方的解码demo 是这样写的,里面的api为什么跟pynvjpeg完全不一样
int decode_images(const FileData &img_data, const std::vector
CHECK_CUDA(cudaEventCreate(&startEvent, cudaEventBlockingSync)); CHECK_CUDA(cudaEventCreate(&stopEvent, cudaEventBlockingSync));
std::vector<const unsigned char*> batched_bitstreams;
std::vector
// bit-streams that batched decode cannot handle
std::vector<const unsigned char*> otherdecode_bitstreams;
std::vector
// if(params.hw_decode_available){ // for(int i = 0; i < params.batch_size; i++){ // // extract bitstream meta data to figure out whether a bit-stream can be decoded // nvjpegJpegStreamParseHeader(params.nvjpeg_handle, (const unsigned char *)img_data[i].data(), img_len[i], params.jpeg_streams[0]); // int isSupported = -1; // nvjpegDecodeBatchedSupported(params.nvjpeg_handle, params.jpeg_streams[0], &isSupported);
// if(isSupported == 0){ // batched_bitstreams.push_back((const unsigned char )img_data[i].data()); // batched_bitstreams_size.push_back(img_len[i]); // batched_output.push_back(out[i]); // } else { // otherdecode_bitstreams.push_back((const unsigned char )img_data[i].data()); // otherdecode_bitstreams_size.push_back(img_len[i]); // otherdecode_output.push_back(out[i]); // } // } // } else { for(int i = 0; i < params.batch_size; i++) { otherdecode_bitstreams.push_back((const unsigned char *)img_data[i].data()); otherdecode_bitstreams_size.push_back(img_len[i]); otherdecode_output.push_back(out[i]); } // }
CHECK_CUDA(cudaEventRecord(startEvent, params.stream));
if(batched_bitstreams.size() > 0)
{
CHECK_NVJPEG(
nvjpegDecodeBatchedInitialize(params.nvjpeg_handle, params.nvjpeg_state,
batched_bitstreams.size(), 1, params.fmt));
CHECK_NVJPEG(nvjpegDecodeBatched(
params.nvjpeg_handle, params.nvjpeg_state, batched_bitstreams.data(),
batched_bitstreams_size.data(), batched_output.data(), params.stream));
}
if(otherdecode_bitstreams.size() > 0)
{
CHECK_NVJPEG(nvjpegStateAttachDeviceBuffer(params.nvjpeg_decoupled_state, params.device_buffer));
int buffer_index = 0;
CHECK_NVJPEG(nvjpegDecodeParamsSetOutputFormat(params.nvjpeg_decode_params, params.fmt));
for (int i = 0; i < params.batch_size; i++) {
CHECK_NVJPEG(
nvjpegJpegStreamParse(params.nvjpeg_handle, otherdecode_bitstreams[i], otherdecode_bitstreams_size[i],
0, 0, params.jpeg_streams[buffer_index]));
CHECK_NVJPEG(nvjpegStateAttachPinnedBuffer(params.nvjpeg_decoupled_state,
params.pinned_buffers[buffer_index]));
CHECK_NVJPEG(nvjpegDecodeJpegHost(params.nvjpeg_handle, params.nvjpeg_decoder, params.nvjpeg_decoupled_state,
params.nvjpeg_decode_params, params.jpeg_streams[buffer_index]));
CHECK_CUDA(cudaStreamSynchronize(params.stream));
CHECK_NVJPEG(nvjpegDecodeJpegTransferToDevice(params.nvjpeg_handle, params.nvjpeg_decoder, params.nvjpeg_decoupled_state,
params.jpeg_streams[buffer_index], params.stream));
buffer_index = 1 - buffer_index; // switch pinned buffer in pipeline mode to avoid an extra sync
CHECK_NVJPEG(nvjpegDecodeJpegDevice(params.nvjpeg_handle, params.nvjpeg_decoder, params.nvjpeg_decoupled_state,
&otherdecode_output[i], params.stream));
}
}
CHECK_CUDA(cudaEventRecord(stopEvent, params.stream));
CHECK_CUDA(cudaEventSynchronize(stopEvent));
CHECK_CUDA(cudaEventElapsedTime(&loopTime, startEvent, stopEvent));
time = static_cast
return EXIT_SUCCESS; }
为什么测试解码提升很小?
GPU硬解码在编解码速度上是比CPU快的,但是处理过程中多了 host to device 和 device to host 内存复制时间。所以在处理小图片的速度不理想。由于GPU核心数远远多于CPU核心数,所以使用多线程也有利于获得更好的结果。
与官方demo 使用的api不一样
pynvjpeg旨在使用nvjpeg的编解码功能,实现兼容opencv的接口。
Why is the decoding time similar?
GPU hardware decoding is faster than the CPU at the encoding/decoding step itself, but the pipeline also includes host-to-device and device-to-host memory copies, which take time. For this reason the overall decoding/encoding time ends up similar, especially for small pictures. Because a GPU has far more cores than a CPU, using multiple threads should also help the GPU achieve a better result than the CPU.
The API is different from the official demo
PyNvjpeg aims to expose nvJPEG's encoding and decoding functionality through an OpenCV-compatible interface.
环境 20核cpu cuda 10.2 T4 单卡
def get_image(image_url):
    """Download the raw bytes found at ``image_url``.

    Args:
        image_url: URL of the image. It is passed through ``parse.unquote``
            before the request is made (``parse`` is presumably
            ``urllib.parse`` -- confirm against the script's imports).

    Returns:
        ``None`` when ``image_url`` is empty or ``None``; the response body
        (``response.content``) when the server answers with HTTP 200; the
        empty string ``""`` for any other status code.

    Raises:
        Any exception raised while unquoting or requesting the URL is printed
        and then re-raised unchanged.
    """
    if not image_url:
        return None
    try:
        image_url = parse.unquote(image_url)
        response = requests.get(image_url)
        if response.status_code != 200:
            print("get image failed!!!!!!")
            # The script's entry point checks the result with `image != ""`,
            # so the failure sentinel must be the empty string; the previous
            # single-space value " " slipped past that check.
            return ""
        return response.content
    except Exception as e:
        print(e)
        raise
def test_load_img(image, count):
    """Time ``count`` repeated OpenCV decodes of the encoded ``image`` bytes.

    Each iteration copies ``image`` into a ``bytearray``, views it as a
    ``uint8`` NumPy buffer and decodes it with ``cv2.imdecode`` using
    ``cv2.IMREAD_COLOR``. The tick difference around the whole loop, divided
    by the tick frequency, is printed as the baseline timing.

    Returns:
        The image decoded in the final iteration.
    """
    tick_begin = cv2.getTickCount()
    for _ in range(count):
        encoded = np.frombuffer(bytearray(image), np.uint8)
        cv_image = cv2.imdecode(encoded, cv2.IMREAD_COLOR)
    tick_end = cv2.getTickCount()
    elapsed = (tick_end - tick_begin) / cv2.getTickFrequency()
    print("load img1 base line = %s" % elapsed)
    return cv_image
def test_load_img_nvjpeg(image, count): from nvjpeg import NvJpeg nj = NvJpeg() start = cv2.getTickCount() for num in range(0, count): np_image = np.asarray(bytearray(image), dtype="uint8")
cv_image = cv2.imdecode(np_image, cv2.IMREAD_COLOR)
# Script entry point. The guard must use the dunder names: the bare
# `name == "main"` form raises NameError instead of running the benchmark.
if __name__ == "__main__":
    # Fetch the sample image once so both benchmarks decode the same bytes.
    image = get_image("http://cdn.weipaitang.com/img/20200313rli2rh7p-jdgj-7vyi-91qd-584099256046-W3024H4032")
    if image != "":
        count = 100
        test_load_img_nvjpeg(image, count)
        cv_image = test_load_img(image, count)
结果为: opencv 14秒 pynvjpeg 12.6秒 (确定观察到GPU的使用率,没有任何报错)