ken1714 / dehazing-sample

画像ファイルを入力し、ヘイズ(霧)を除去した画像ファイルを出力するc++プログラム。
0 stars 0 forks source link

ヘイズ除去処理(RemoveHaze)のCUDA実装 #22

Closed ken1714 closed 2 years ago

ken1714 commented 2 years ago

RemoveHazeで行っているヘイズ除去処理をCUDAで実装する。入出力画像はGpuMatで管理する。

ken1714 commented 2 years ago

まずナイーブに実装: 6dd3e5e8248d2084278a2bb11c700baf7eecdf9a

テストコードのtest_remove_haze.cppに実行時間計測処理を追加し、ctest --verboseを実行した際の出力を確認。単純にヘイズ除去処理時間のみを見ると、1024x768サイズの画像を処理するのにCPU版では28.175[msec]かかっているのに対し、GPU版では0.412[msec]と、処理時間が約68分の1になっていることがわかる。

4: Test command: /root/projects/dehazing-sample/dehazing-sample/build/tests/test_remove_haze
4: Test timeout computed to be: 10000000
4: Running main() from gmock_main.cc
4: [==========] Running 2 tests from 2 test suites.
4: [----------] Global test environment set-up.
4: [----------] 1 test from RemoveHazeTest
4: [ RUN      ] RemoveHazeTest.RemoveHaze
4: CPU: 28.175[msec]
4: [       OK ] RemoveHazeTest.RemoveHaze (62 ms)
4: [----------] 1 test from RemoveHazeTest (62 ms total)
4: 
4: [----------] 1 test from RemoveHazeTestCUDA
4: [ RUN      ] RemoveHazeTestCUDA.RemoveHazeCUDA
4: GPU: 0.412[msec]
4: [       OK ] RemoveHazeTestCUDA.RemoveHazeCUDA (144 ms)
4: [----------] 1 test from RemoveHazeTestCUDA (144 ms total)
4: 
4: [----------] Global test environment tear-down
4: [==========] 2 tests from 2 test suites ran. (206 ms total)
4: [  PASSED  ] 2 tests.
4/5 Test #4: test_remove_haze .................   Passed    0.23 sec
ken1714 commented 2 years ago

実行時間計測を入れたtest_remove_haze.cpp

#include "gtest/gtest.h"
#include "remove_haze.hpp"
#include "remove_haze.cuh"
#include "utils.hpp"
#include <chrono>

// Path
const std::string inputImagePath = "data/original.png";
const std::string transmissionPath = "data/transmission_map.png";
const std::string referenceImagePath = "data/dehazed.png";

TEST(RemoveHazeTest, RemoveHaze) {
    // Read images
    const cv::Mat inputImage = cv::imread(inputImagePath);
    const cv::Mat transmissionMap = cv::imread(transmissionPath, cv::IMREAD_GRAYSCALE);
    const cv::Mat referenceImage = cv::imread(referenceImagePath);
    cv::Mat outputImage;

    // Execute
    const double minTransmission = 0.1;
    const cv::Vec3b atmosphericLight = cv::Vec3b(255, 252, 249);
    std::unique_ptr<RemoveHaze> removeHaze(new RemoveHaze(minTransmission));
    auto start = std::chrono::system_clock::now();
    removeHaze->execute(inputImage, transmissionMap, atmosphericLight, outputImage);
    auto end = std::chrono::system_clock::now();
    double elapsed = std::chrono::duration_cast<std::chrono::microseconds>(end-start).count();
    std::cout << "CPU: " << elapsed / 1e3 << "[msec]" << std::endl;

    EXPECT_EQ(equal2Images(outputImage, referenceImage), true);
}

TEST(RemoveHazeTestCUDA, RemoveHazeCUDA) {
    // Read images
    const cv::Mat inputImage = cv::imread(inputImagePath);
    const cv::Mat transmissionMap = cv::imread(transmissionPath, cv::IMREAD_GRAYSCALE);
    const cv::Mat referenceImage = cv::imread(referenceImagePath);
    cv::Mat outputImage;

    cv::cuda::GpuMat dInputImage(inputImage);
    cv::cuda::GpuMat dTransmissionMap(transmissionMap);

    const size_t stepSize = dInputImage.step * sizeof(unsigned char);
    const size_t memSize  = stepSize * dInputImage.rows;

    float *outputImagePtr;
    cudaMalloc((void**)&outputImagePtr, memSize);
    cv::cuda::GpuMat dOutputImage(inputImage.rows, inputImage.cols, CV_8UC3, (void*)outputImagePtr, stepSize);

    dInputImage.upload(inputImage);
    dTransmissionMap.upload(transmissionMap);

    // Execute
    const double minTransmission = 0.1;
    const cv::Vec3b atmosphericLight = cv::Vec3b(255, 252, 249);
    std::unique_ptr<RemoveHaze> removeHaze(new RemoveHaze(minTransmission));
    auto start = std::chrono::system_clock::now();
    removeHaze->execute(dInputImage, dTransmissionMap, atmosphericLight, dOutputImage);
    auto end = std::chrono::system_clock::now();
    double elapsed = std::chrono::duration_cast<std::chrono::microseconds>(end-start).count();
    std::cout << "GPU: " << elapsed / 1e3 << "[msec]" << std::endl;

    dOutputImage.download(outputImage);
    EXPECT_EQ(equal2Images(outputImage, referenceImage), true);
}
ken1714 commented 2 years ago

2441eeae78557c3951f22ff8bd3eeb501ff39858 についてブロックサイズを(64, 12)として実行時間を計測した結果。https://github.com/ken1714/dehazing-sample/issues/22#issuecomment-1008301715 に記載したようにブロックサイズが(64, 4)のときは0.412msecかかっていたのに対し、(64, 12)の場合は0.403msecとわずかに速くなる。

https://github.com/ken1714/dehazing-sample/blob/2441eeae78557c3951f22ff8bd3eeb501ff39858/dehazing-sample/src/remove_haze.cu#L49

4: Test command: /root/projects/dehazing-sample/dehazing-sample/build/tests/test_remove_haze
4: Test timeout computed to be: 10000000
4: Running main() from gmock_main.cc
4: [==========] Running 2 tests from 2 test suites.
4: [----------] Global test environment set-up.
4: [----------] 1 test from RemoveHazeTest
4: [ RUN      ] RemoveHazeTest.RemoveHaze
4: CPU: 28.171[msec]
4: [       OK ] RemoveHazeTest.RemoveHaze (62 ms)
4: [----------] 1 test from RemoveHazeTest (62 ms total)
4: 
4: [----------] 1 test from RemoveHazeTestCUDA
4: [ RUN      ] RemoveHazeTestCUDA.RemoveHazeCUDA
4: GPU: 0.403[msec]
4: [       OK ] RemoveHazeTestCUDA.RemoveHazeCUDA (129 ms)
4: [----------] 1 test from RemoveHazeTestCUDA (129 ms total)
4: 
4: [----------] Global test environment tear-down
4: [==========] 2 tests from 2 test suites ran. (192 ms total)
4: [  PASSED  ] 2 tests.
4/5 Test #4: test_remove_haze .................   Passed    0.21 sec
test 5
    Start 5: test_transmission_map
ken1714 commented 2 years ago

ブロックサイズを(64, 4, 3)に変更し、かつチャンネルごとに別スレッドに分けてカーネルを実行するように修正: 74b36d042f148c146224b415a0dc7b809137e152

実行時間は0.404msecと、2441eeae78557c3951f22ff8bd3eeb501ff39858 とあまり変わらない気もするので、コードコピーを減らすという意味でもチャンネルごとに別スレッドに分けてカーネルを実行する仕様で進めていく。

4: Test command: /root/projects/dehazing-sample/dehazing-sample/build/tests/test_remove_haze
4: Test timeout computed to be: 10000000
4: Running main() from gmock_main.cc
4: [==========] Running 2 tests from 2 test suites.
4: [----------] Global test environment set-up.
4: [----------] 1 test from RemoveHazeTest
4: [ RUN      ] RemoveHazeTest.RemoveHaze
4: CPU: 28.19[msec]
4: [       OK ] RemoveHazeTest.RemoveHaze (62 ms)
4: [----------] 1 test from RemoveHazeTest (62 ms total)
4: 
4: [----------] 1 test from RemoveHazeTestCUDA
4: [ RUN      ] RemoveHazeTestCUDA.RemoveHazeCUDA
4: GPU: 0.404[msec]
4: [       OK ] RemoveHazeTestCUDA.RemoveHazeCUDA (144 ms)
4: [----------] 1 test from RemoveHazeTestCUDA (144 ms total)
4: 
4: [----------] Global test environment tear-down
4: [==========] 2 tests from 2 test suites ran. (206 ms total)
4: [  PASSED  ] 2 tests.
4/5 Test #4: test_remove_haze .................   Passed    0.23 sec
ken1714 commented 2 years ago

環境光を配列に格納し、CUDAカーネル内で環境光を参照する場合は、当然GPUデバイスにデータを転送する必要がある。

https://github.com/ken1714/dehazing-sample/blob/74b36d042f148c146224b415a0dc7b809137e152/dehazing-sample/src/remove_haze.cu#L38-L42

これを行わない場合、下記のようにメモリアクセスに関するエラーが発生してしまう(downloadのタイミングでエラーとなっていたため、GPUデバイスへのデータ転送が原因であることに気づくのが遅れた)。

4: Test command: /root/projects/dehazing-sample/dehazing-sample/build/tests/test_remove_haze
4: Test timeout computed to be: 10000000
4: Running main() from gmock_main.cc
4: [==========] Running 2 tests from 2 test suites.
4: [----------] Global test environment set-up.
4: [----------] 1 test from RemoveHazeTest
4: [ RUN      ] RemoveHazeTest.RemoveHaze
4: CPU: 28.189[msec]
4: [       OK ] RemoveHazeTest.RemoveHaze (62 ms)
4: [----------] 1 test from RemoveHazeTest (62 ms total)
4: 
4: [----------] 1 test from RemoveHazeTestCUDA
4: [ RUN      ] RemoveHazeTestCUDA.RemoveHazeCUDA
4: GPU: 0.412[msec]
4: unknown file: Failure
4: C++ exception with description "OpenCV(4.5.4) /installs/opencv-4.5.4/modules/core/src/cuda/gpu_mat.cu:249: error: (-217:Gpu API call) an illegal memory access was encountered in function 'download'
4: " thrown in the test body.
4: [  FAILED  ] RemoveHazeTestCUDA.RemoveHazeCUDA (144 ms)
4: [----------] 1 test from RemoveHazeTestCUDA (144 ms total)
4: 
4: [----------] Global test environment tear-down
4: [==========] 2 tests from 2 test suites ran. (207 ms total)
4: [  PASSED  ] 1 test.
4: [  FAILED  ] 1 test, listed below:
4: [  FAILED  ] RemoveHazeTestCUDA.RemoveHazeCUDA
4: 
4:  1 FAILED TEST
ken1714 commented 2 years ago

一般的に処理を高速化する際、速度重視であれば倍精度(double)よりも単精度(float)が望ましいので、floatで処理するように修正したが、0.4msec程度と、doubleの場合とほぼ変わらず。

floatの場合とdoubleの場合で処理結果が一致しなくなり、テストも通らなくなるという... 画像データは[0, 255]の8bitで表されるので、倍精度は求められず単精度で十分と考え、ここはfloatで計算するように修正する。 0b40acf4a94101857cdb6a1fcfe9f46fab08136e

4: Test command: /root/projects/dehazing-sample/dehazing-sample/build/tests/test_remove_haze
4: Test timeout computed to be: 10000000
4: Running main() from gmock_main.cc
4: [==========] Running 2 tests from 2 test suites.
4: [----------] Global test environment set-up.
4: [----------] 1 test from RemoveHazeTest
4: [ RUN      ] RemoveHazeTest.RemoveHaze
4: CPU: 28.183[msec]
4: [       OK ] RemoveHazeTest.RemoveHaze (62 ms)
4: [----------] 1 test from RemoveHazeTest (62 ms total)
4: 
4: [----------] 1 test from RemoveHazeTestCUDA
4: [ RUN      ] RemoveHazeTestCUDA.RemoveHazeCUDA
4: GPU: 0.409[msec]
4: /root/projects/dehazing-sample/dehazing-sample/tests/test_remove_haze.cpp:65: Failure
4: Expected equality of these values:
4:   equal2Images(outputImage, referenceImage)
4:     Which is: false
4:   true
4: [  FAILED  ] RemoveHazeTestCUDA.RemoveHazeCUDA (172 ms)
4: [----------] 1 test from RemoveHazeTestCUDA (172 ms total)
4: 
4: [----------] Global test environment tear-down
4: [==========] 2 tests from 2 test suites ran. (235 ms total)
4: [  PASSED  ] 1 test.
4: [  FAILED  ] 1 test, listed below:
4: [  FAILED  ] RemoveHazeTestCUDA.RemoveHazeCUDA
4: 
4:  1 FAILED TEST
倍精度での出力 単精度での出力 差分
dehazed_gpu output_float diff_double_float
ken1714 commented 2 years ago

メモ

GpuMatのメンバのdataはuchar*型である点に注意。float型など、GpuMatのデータの先頭ポインタを任意の型のポインタで取り出したい場合、下記のようにするのが良い?

outputImage.ptr<float>()

https://docs.opencv.org/3.4/d0/d60/classcv_1_1cuda_1_1GpuMat.html#a5139f9492f9079c7b9e414d50da332a3

cv::cuda::minMaxはシングルチャンネルの入力配列のみ処理可能: https://docs.opencv.org/3.4/d5/de6/group__cudaarithm__reduce.html#ga8d7de68c10717cf25e787e3c20d2dfee

ken1714 commented 2 years ago

ヘイズ除去後に画素値を[0, 255]の範囲にクリッピングする処理をcv::cuda::minおよびcv::cuda::maxを使用するように変更した。dehazeGPUにクリッピング処理のif文を削除し、分岐効率の改善を試みたが、実行時間42.498msecとかえって大幅に増えてしまう結果に...

4: Test command: /root/projects/dehazing-sample/dehazing-sample/build/tests/test_remove_haze
4: Test timeout computed to be: 10000000
4: Running main() from gmock_main.cc
4: [==========] Running 2 tests from 2 test suites.
4: [----------] Global test environment set-up.
4: [----------] 1 test from RemoveHazeTest
4: [ RUN      ] RemoveHazeTest.RemoveHaze
4: CPU: 28.083[msec]
4: [       OK ] RemoveHazeTest.RemoveHaze (62 ms)
4: [----------] 1 test from RemoveHazeTest (62 ms total)
4: 
4: [----------] 1 test from RemoveHazeTestCUDA
4: [ RUN      ] RemoveHazeTestCUDA.RemoveHazeCUDA
4: GPU: 42.498[msec]
4: /root/projects/dehazing-sample/dehazing-sample/tests/test_remove_haze.cpp:63: Failure
4: Expected equality of these values:
4:   equal2Images(outputImage, referenceImage)
4:     Which is: false
4:   true
4: [  FAILED  ] RemoveHazeTestCUDA.RemoveHazeCUDA (199 ms)
4: [----------] 1 test from RemoveHazeTestCUDA (199 ms total)
4: 
4: [----------] Global test environment tear-down
4: [==========] 2 tests from 2 test suites ran. (261 ms total)
4: [  PASSED  ] 1 test.
4: [  FAILED  ] 1 test, listed below:
4: [  FAILED  ] RemoveHazeTestCUDA.RemoveHazeCUDA
4: 
4:  1 FAILED TEST
4/5 Test #4: test_remove_haze .................***Failed    0.29 sec
remove_haze.cu
```cu #include "remove_haze.cuh" __global__ void dehazeGPU(unsigned char* const inputValue, unsigned char* const transmission, float* dehazedValue, const float minTransmission, float* const atmosphericLight, const int imageWidth, const int imageHeight) { int channelNum = 3; int xIdx = blockIdx.x * blockDim.x + threadIdx.x; int yIdx = blockIdx.y * blockDim.y + threadIdx.y; int zIdx = threadIdx.z; if (xIdx >= imageWidth || yIdx >= imageHeight || zIdx >= channelNum) return; float transmissionValue = (float)(unsigned char)transmission[yIdx * imageWidth + xIdx] / 255.0; transmissionValue = (transmissionValue > minTransmission) ? transmissionValue: minTransmission; float dehazed = ((float)(unsigned char)inputValue[(yIdx * imageWidth + xIdx) * channelNum + zIdx] - atmosphericLight[zIdx]) / transmissionValue + atmosphericLight[zIdx]; dehazedValue[(yIdx * imageWidth + xIdx) * channelNum + zIdx] = dehazed; } void executeDehazeGPU(unsigned char* const inputValue, unsigned char* const transmission, float* dehazedValue, const float minTransmission, float* const atmosphericLight, const int imageWidth, const int imageHeight) { // Block and grid settings int channelNum = 3; dim3 block(64, 4, channelNum); dim3 grid((imageWidth + block.x - 1) / block.x, (imageHeight + block.y - 1) / block.y); // Copy atmospheric light from host to device float *dAtmosphericLight; int atmosphericLightMemSize = sizeof(float) * channelNum; cudaMalloc((void **)&dAtmosphericLight, atmosphericLightMemSize); cudaMemcpy(dAtmosphericLight, atmosphericLight, atmosphericLightMemSize, cudaMemcpyHostToDevice); dehazeGPU<<>>(inputValue, transmission, dehazedValue, minTransmission, dAtmosphericLight, imageWidth, imageHeight); } ```
remove_haze.cpp(一部)
```cpp void RemoveHaze::execute(const cv::cuda::GpuMat& inputImage, const cv::cuda::GpuMat& transmissionMap, const cv::Vec3b atmosphericLight, cv::cuda::GpuMat& outputImage) { const int imageHeight = inputImage.rows; const int imageWidth = inputImage.cols; float atmosphericLightArray[3] = {(float)(unsigned char)atmosphericLight[0], (float)(unsigned char)atmosphericLight[1], (float)(unsigned char)atmosphericLight[2]}; executeDehazeGPU(inputImage.data, transmissionMap.data, outputImage.ptr(), _minTransmission, atmosphericLightArray, imageWidth, imageHeight); cv::cuda::min(outputImage, 255.0, outputImage); cv::cuda::max(outputImage, 0.0, outputImage); outputImage.convertTo(outputImage, CV_8UC3); } ```

0b40acf4a94101857cdb6a1fcfe9f46fab08136e からの差分まとめ(下記のパッチを適用することで再現可能)

$ git diff > cuda_minmax.patch
$ cd dehazing-sample
$ patch -p1 < cuda_minmax.patch
cuda_minmax.patch
```patch diff --git a/dehazing-sample/include/remove_haze.cuh b/dehazing-sample/include/remove_haze.cuh index 388f47c..ade18f1 100644 --- a/dehazing-sample/include/remove_haze.cuh +++ b/dehazing-sample/include/remove_haze.cuh @@ -3,9 +3,9 @@ #include "cuda_runtime.h" -__global__ void dehazeGPU(unsigned char* const inputValue, unsigned char* const transmission, unsigned char* dehazedValue, +__global__ void dehazeGPU(unsigned char* const inputValue, unsigned char* const transmission, float* dehazedValue, const float minTransmission, float* const atmosphericLight, const int imageWidth, const int imageHeight); -void executeDehazeGPU(unsigned char* inputValue, unsigned char* const transmission, unsigned char* dehazedValue, +void executeDehazeGPU(unsigned char* inputValue, unsigned char* const transmission, float* dehazedValue, const float minTransmission, float* const atmosphericLight, const int imageWidth, const int imageHeight); diff --git a/dehazing-sample/src/remove_haze.cpp b/dehazing-sample/src/remove_haze.cpp index 4d78268..2329ec1 100644 --- a/dehazing-sample/src/remove_haze.cpp +++ b/dehazing-sample/src/remove_haze.cpp @@ -1,5 +1,6 @@ #include "remove_haze.hpp" #include "remove_haze.cuh" +#include RemoveHaze::RemoveHaze(const double minTransmission): _minTransmission(minTransmission) { @@ -45,5 +46,9 @@ void RemoveHaze::execute(const cv::cuda::GpuMat& inputImage, const cv::cuda::Gpu (float)(unsigned char)atmosphericLight[1], (float)(unsigned char)atmosphericLight[2]}; - executeDehazeGPU(inputImage.data, transmissionMap.data, outputImage.data, _minTransmission, atmosphericLightArray, imageWidth, imageHeight); + executeDehazeGPU(inputImage.data, transmissionMap.data, outputImage.ptr(), _minTransmission, atmosphericLightArray, imageWidth, imageHeight); + + cv::cuda::min(outputImage, 255.0, outputImage); + cv::cuda::max(outputImage, 0.0, outputImage); + outputImage.convertTo(outputImage, CV_8UC3); } diff --git a/dehazing-sample/src/remove_haze.cu b/dehazing-sample/src/remove_haze.cu index 03905ae..a6ae4f9 100644 --- a/dehazing-sample/src/remove_haze.cu +++ b/dehazing-sample/src/remove_haze.cu @@ -1,7 +1,7 @@ #include "remove_haze.cuh" -__global__ void dehazeGPU(unsigned char* const inputValue, unsigned char* const transmission, unsigned char* dehazedValue, +__global__ void dehazeGPU(unsigned char* const inputValue, unsigned char* const transmission, float* dehazedValue, const float minTransmission, float* const atmosphericLight, const int imageWidth, const int imageHeight) { @@ -17,17 +17,11 @@ __global__ void dehazeGPU(unsigned char* const inputValue, unsigned char* const float dehazed = ((float)(unsigned char)inputValue[(yIdx * imageWidth + xIdx) * channelNum + zIdx] - atmosphericLight[zIdx]) / transmissionValue + atmosphericLight[zIdx]; - if (dehazed > 255) { - dehazed = 255; - } else if (dehazed < 0) { - dehazed = 0; - } - dehazedValue[(yIdx * imageWidth + xIdx) * channelNum + zIdx] = dehazed; } -void executeDehazeGPU(unsigned char* const inputValue, unsigned char* const transmission, unsigned char* dehazedValue, +void executeDehazeGPU(unsigned char* const inputValue, unsigned char* const transmission, float* dehazedValue, const float minTransmission, float* const atmosphericLight, const int imageWidth, const int imageHeight) { // Block and grid settings diff --git a/dehazing-sample/tests/test_remove_haze.cpp b/dehazing-sample/tests/test_remove_haze.cpp index c776d3c..656b611 100644 --- a/dehazing-sample/tests/test_remove_haze.cpp +++ b/dehazing-sample/tests/test_remove_haze.cpp @@ -2,6 +2,7 @@ #include "remove_haze.hpp" #include "remove_haze.cuh" #include "utils.hpp" +#include // Path const std::string inputImagePath = "data/original.png"; @@ -19,7 +20,11 @@ TEST(RemoveHazeTest, RemoveHaze) { const double minTransmission = 0.1; const cv::Vec3b atmosphericLight = cv::Vec3b(255, 252, 249); std::unique_ptr removeHaze(new RemoveHaze(minTransmission)); + auto start = std::chrono::system_clock::now(); removeHaze->execute(inputImage, transmissionMap, atmosphericLight, outputImage); + auto end = std::chrono::system_clock::now(); + double elapsed = std::chrono::duration_cast(end-start).count(); + std::cout << "CPU: " << elapsed / 1e3 << "[msec]" << std::endl; EXPECT_EQ(equal2Images(outputImage, referenceImage), true); } @@ -34,12 +39,12 @@ TEST(RemoveHazeTestCUDA, RemoveHazeCUDA) { cv::cuda::GpuMat dInputImage(inputImage); cv::cuda::GpuMat dTransmissionMap(transmissionMap); - const size_t stepSize = dInputImage.step * sizeof(unsigned char); + const size_t stepSize = dInputImage.step * sizeof(float); const size_t memSize = stepSize * dInputImage.rows; float *outputImagePtr; cudaMalloc((void**)&outputImagePtr, memSize); - cv::cuda::GpuMat dOutputImage(inputImage.rows, inputImage.cols, CV_8UC3, (void*)outputImagePtr, stepSize); + cv::cuda::GpuMat dOutputImage(inputImage.rows, inputImage.cols, CV_32FC3, (void*)outputImagePtr, stepSize); dInputImage.upload(inputImage); dTransmissionMap.upload(transmissionMap); @@ -48,8 +53,13 @@ TEST(RemoveHazeTestCUDA, RemoveHazeCUDA) { const double minTransmission = 0.1; const cv::Vec3b atmosphericLight = cv::Vec3b(255, 252, 249); std::unique_ptr removeHaze(new RemoveHaze(minTransmission)); + auto start = std::chrono::system_clock::now(); removeHaze->execute(dInputImage, dTransmissionMap, atmosphericLight, dOutputImage); + auto end = std::chrono::system_clock::now(); + double elapsed = std::chrono::duration_cast(end-start).count(); + std::cout << "GPU: " << elapsed / 1e3 << "[msec]" << std::endl; dOutputImage.download(outputImage); EXPECT_EQ(equal2Images(outputImage, referenceImage), true); + cv::imwrite("data/output_gpumat_clipping.png", outputImage); } ```