ARM-software / ComputeLibrary

The Compute Library is a set of computer vision and machine learning functions optimised for both Arm CPUs and GPUs using SIMD technologies.
MIT License

The result of CLWinogradConvolutionLayer is incorrect. What should I do? #1103

Open superHappy-yo opened 2 months ago

superHappy-yo commented 2 months ago

Output of 'strings libarm_compute.so | grep arm_compute_version':

Platform: rock-5b, OpenCL 2.0

Operating System: Ubuntu

Problem description: I want to compare the speed of Winograd and im2col+GEMM convolution in OpenCL, but I found that the Winograd result is incorrect (the right-hand part of the attached screenshot).

It happens when BATCH_SIZE = 512.

The Winograd code:

```cpp
CLScheduler::get().default_init();

CLTensor imgTensor;
CLTensor kernelTensor, OTensor;

imgTensor.allocator()->init(TensorInfo(TensorShape(IMG_H, IMG_W, IMG_CHANNEL), 1, DataType::F32));
kernelTensor.allocator()->init(TensorInfo(TensorShape(KERNEL_H, KERNEL_W, KERNEL_CHANNEL, BATCH_SIZE), 1, DataType::F32));
OTensor.allocator()->init(TensorInfo(TensorShape(out_h, out_w, BATCH_SIZE), 1, DataType::F32));

struct timeval tstart1, tend1;
gettimeofday(&tstart1, NULL);

// std::cout << "winogradInfo :" << std::endl;
CLWinogradConvolutionLayer winogradInfo;
winogradInfo.configure(&imgTensor, &kernelTensor, nullptr, &OTensor,
                       PadStrideInfo(STRIDE, STRIDE, PAD, PAD, PAD, PAD, DimensionRoundingType::FLOOR));

gettimeofday(&tend1, NULL);

imgTensor.allocator()->allocate();
kernelTensor.allocator()->allocate();
OTensor.allocator()->allocate();

// std::cout << "img tensor:" << std::endl;
imgTensor.map();
arm_compute::utils::fill_tensor_vector(imgTensor, img);
// imgTensor.print(std::cout);
imgTensor.unmap();

// std::cout << "kernel tensor:" << std::endl;
kernelTensor.map();
arm_compute::utils::fill_tensor_vector(kernelTensor, kernel_list);
// kernelTensor.print(std::cout);
kernelTensor.unmap();

// start timing
struct timeval tstart, tend;
gettimeofday(&tstart, NULL);

winogradInfo.run();

// stop timing
gettimeofday(&tend, NULL);

// std::cout << "out tensor:" << std::endl;
OTensor.map();
OTensor.print(std::cout);
OTensor.unmap();
```
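A side note on the measurement itself: OpenCL dispatch is asynchronous, so `run()` may return after merely enqueuing the kernels, and a `gettimeofday` pair around it can measure enqueue time rather than execution time; in the Compute Library the queue can be drained with `CLScheduler::get().sync()` before stopping the clock. Below is a minimal, library-independent sketch of that pattern using `std::chrono` (`time_ms` is a hypothetical helper name; the two callbacks stand in for the layer's `run()` and the scheduler's `sync()`):

```cpp
#include <chrono>
#include <functional>

// Time a GPU dispatch in milliseconds. enqueue() submits the work
// (e.g. winogradInfo.run()); sync() drains the command queue
// (e.g. CLScheduler::get().sync()) so the clock stops only after
// the kernels have actually finished executing.
double time_ms(const std::function<void()>& enqueue,
               const std::function<void()>& sync)
{
    const auto start = std::chrono::steady_clock::now();
    enqueue();
    sync();
    const auto stop = std::chrono::steady_clock::now();
    return std::chrono::duration<double, std::milli>(stop - start).count();
}
```

Without the `sync()` step, the Winograd and GEMM timings are not comparable, since each path may leave a different amount of work still queued.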

The im2col code:

```cpp
CLScheduler::get().default_init();

CLTensor imgTensor;
CLTensor kernelTensor, OTensor;

imgTensor.allocator()->init(TensorInfo(TensorShape(IMG_H, IMG_W, IMG_CHANNEL), 1, DataType::F32));
kernelTensor.allocator()->init(TensorInfo(TensorShape(KERNEL_H, KERNEL_W, KERNEL_CHANNEL, BATCH_SIZE), 1, DataType::F32));
OTensor.allocator()->init(TensorInfo(TensorShape(out_h, out_w, BATCH_SIZE), 1, DataType::F32));

struct timeval tstart1, tend1;
gettimeofday(&tstart1, NULL);

std::cout << "GEMMInfo :" << std::endl;
CLGEMMConvolutionLayer GEMMInfo;
GEMMInfo.configure(&imgTensor, &kernelTensor, nullptr, &OTensor,
                   PadStrideInfo(STRIDE, STRIDE, PAD, PAD, PAD, PAD, DimensionRoundingType::FLOOR));

gettimeofday(&tend1, NULL);

imgTensor.allocator()->allocate();
kernelTensor.allocator()->allocate();
OTensor.allocator()->allocate();

std::cout << "img tensor:" << std::endl;
imgTensor.map();
arm_compute::utils::fill_tensor_vector(imgTensor, img);
// imgTensor.print(std::cout);
imgTensor.unmap();

std::cout << "kernel tensor:" << std::endl;
kernelTensor.map();
arm_compute::utils::fill_tensor_vector(kernelTensor, kernel_list);
// kernelTensor.print(std::cout);
kernelTensor.unmap();

//  CLScheduler::get().sync();

// start timing
struct timeval tstart, tend;
gettimeofday(&tstart, NULL);

GEMMInfo.run();

// stop timing
gettimeofday(&tend, NULL);

std::cout << "out tensor:" << std::endl;
OTensor.map();
OTensor.print(std::cout);
OTensor.unmap();
```