ARM-software / ComputeLibrary

The Compute Library is a set of computer vision and machine learning functions optimised for both Arm CPUs and GPUs using SIMD technologies.

Why do the Im2Col and Col2Im steps in Int8 CpuGemmConv take so much time? #1011

Closed. GGGGxxxxxxxxr closed this issue 1 year ago.

GGGGxxxxxxxxr commented 1 year ago

Hi! I am trying to implement a quantized WDSR model with ACL on an Android CPU-based device. Here is my code for the WDSR implementation:

#include "arm_compute/runtime/NEON/NEFunctions.h"

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/Allocator.h"
#include "arm_compute/runtime/BlobLifetimeManager.h"
#include "arm_compute/runtime/MemoryManagerOnDemand.h"
#include "arm_compute/runtime/PoolManager.h"
#include "utils/Utils.h"
#include "support/ToolchainSupport.h"

#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
#include <arm_neon.h>

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <sstream>
#include <sys/time.h>

using namespace arm_compute;
using namespace utils;

int main()
{
    std::cout<<"Tensor Definition...\n";
    //Define Tensor
    Tensor input;

    //Layer1: Subtraction
    Tensor sub_1;
    Tensor layer1_out;

    //Layer2: InputQuantization
    Tensor layer2_out;

    //Layer3_1: Convolution1 (conv_only)
    Tensor conv1_weight;
    Tensor conv1_bias_s32;
    Tensor layer3_out_s32;
    //Layer3_2: BitShift S32toInt8
    Tensor layer3_out;

    //Layer4_1: Convolution2 (conv_only)
    Tensor conv2_weight;
    Tensor conv2_bias_s32;
    Tensor layer4_out_s32;
    //Layer4_2: BitShift S32toInt8
    Tensor layer4_out;

    //Layer5_1: Convolution3 (conv_only)
    Tensor conv3_weight;
    Tensor conv3_bias_s32;
    Tensor layer5_out_s32;
    //Layer5_2 BitShift S32toInt8
    Tensor layer5_out;

   //SkipLink1: Connect Conv1 S32 with Conv4
    Tensor sk1_out_s32;

    //Layer6_1: Convolution4 (conv_only)
    Tensor conv4_weight;
    Tensor conv4_bias_s32;
    Tensor layer6_out_s32;
    //Layer6_2: TensorAdd: SkipLink1Out Only;
    Tensor layer6_with_sk1_s32;
    //Layer6_3: BitShift S32toInt8
    Tensor layer6_out;

    std::cout<<"Layer Function Definition...\n";
    //Define Layer
    NEArithmeticSubtraction sublayer_1;
    NEQuantizationLayer quant_2;
    NEConvolutionLayer conv3_1;
    NEGEMMLowpOutputStage outstage3_2;
    NEConvolutionLayer conv4_1;
    NEGEMMLowpOutputStage outstage4_2;
    NEConvolutionLayer conv5_1;
    NEGEMMLowpOutputStage outstage5_2;
    NEConvolutionLayer conv6_1;
    NEArithmeticAddition addlayer6_2;
    NEGEMMLowpOutputStage outstage6_2;

    std::cout<<"Tensor Configuration...\n";
    //Layer1
    const TensorShape inputshape(960,540,3);
    input.allocator()->init(TensorInfo(inputshape, 1, DataType::F32)); //DataLayout: Default NCHW;
    sub_1.allocator()->init(TensorInfo(inputshape, 1, DataType::F32)); // Tensor filled with the constant value 0.5, same shape as the input
    layer1_out.allocator()->init(TensorInfo(inputshape, 1, DataType::F32));

    //Layer2
    const QuantizationInfo qinfo = QuantizationInfo(0.015625,0);
    layer2_out.allocator()->init(TensorInfo(inputshape, 1, DataType::QASYMM8_SIGNED, qinfo));

    //Layer3
    const TensorShape conv1_weight_shape(3,3,3,4);
    const TensorShape conv1_bias_shape(4);
    const TensorShape conv1_output_shape(960,540,4);
    conv1_weight.allocator()->init(TensorInfo(conv1_weight_shape, 1, DataType::QASYMM8_SIGNED));
    conv1_bias_s32.allocator()->init(TensorInfo(conv1_bias_shape, 1, DataType::S32));
    layer3_out_s32.allocator()->init(TensorInfo(conv1_output_shape, 1, DataType::S32));
    layer3_out.allocator()->init(TensorInfo(conv1_output_shape, 1, DataType::QASYMM8_SIGNED));

    //Layer4
    const TensorShape conv2_weight_shape(1,1,4,24);
    const TensorShape conv2_bias_shape(24);
    const TensorShape conv2_output_shape(960,540,24);
    conv2_weight.allocator()->init(TensorInfo(conv2_weight_shape, 1, DataType::QASYMM8_SIGNED));
    conv2_bias_s32.allocator()->init(TensorInfo(conv2_bias_shape, 1, DataType::S32));
    layer4_out_s32.allocator()->init(TensorInfo(conv2_output_shape, 1, DataType::S32));
    layer4_out.allocator()->init(TensorInfo(conv2_output_shape, 1, DataType::QASYMM8_SIGNED));

    //Layer5
    const TensorShape conv3_weight_shape(1,1,24,3);
    const TensorShape conv3_bias_shape(3);
    const TensorShape conv3_output_shape(960,540,3);
    conv3_weight.allocator()->init(TensorInfo(conv3_weight_shape, 1, DataType::QASYMM8_SIGNED));
    conv3_bias_s32.allocator()->init(TensorInfo(conv3_bias_shape, 1, DataType::S32));
    layer5_out_s32.allocator()->init(TensorInfo(conv3_output_shape, 1, DataType::S32));
    layer5_out.allocator()->init(TensorInfo(conv3_output_shape, 1, DataType::QASYMM8_SIGNED));

    //Layer6
    const TensorShape conv4_weight_shape(3,3,3,4);
    const TensorShape conv4_bias_shape(4);
    const TensorShape conv4_output_shape(960,540,4);
    conv4_weight.allocator()->init(TensorInfo(conv4_weight_shape, 1, DataType::QASYMM8_SIGNED));
    conv4_bias_s32.allocator()->init(TensorInfo(conv4_bias_shape, 1, DataType::S32));
    layer6_out_s32.allocator()->init(TensorInfo(conv4_output_shape, 1, DataType::S32));
    layer6_out.allocator()->init(TensorInfo(conv4_output_shape, 1, DataType::QASYMM8_SIGNED));

    //SkipLink1
    const TensorShape sk1_out_shape(960,540,4);
    sk1_out_s32.allocator()->init(TensorInfo(sk1_out_shape, 1, DataType::S32));
    layer6_with_sk1_s32.allocator()->init(TensorInfo(sk1_out_shape, 1, DataType::S32));

    std::cout<<"Layer Configuration...\n";

    //Configure Layers
    //Layer1 SubtractionLayer with constant 0.5, (960,540,3), (960,540,3) -> (960,540,3)
    sublayer_1.configure(&input, &sub_1, &layer1_out, ConvertPolicy::SATURATE);

    //Layer2 QuantizationLayer with scale: 0.015625, (960,540,3), (960,540,3) -> (960,540,3)
    quant_2.configure(&layer1_out, &layer2_out);

    //Layer3_1 Convolution, (960,540,3) (3,3,3,4), PadStride(1,1,1,1) -> (960,540,4)
    conv3_1.configure(&layer2_out, &conv1_weight, nullptr, &layer3_out_s32, PadStrideInfo(1, 1, 1, 1));
    //Layer3_2 BitShift S32toInt8, (960,540,4) (no activation function!!, RightShift:9) -> (960,540,4)
    GEMMLowpOutputStageInfo info1;
    info1.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
    info1.output_data_type    = DataType::QASYMM8_SIGNED;
    info1.gemmlowp_min_bound  = -128;  //no ReLU here;
    info1.gemmlowp_max_bound  = 127;
    info1.gemmlowp_offset     = -9;

    outstage3_2.configure(&layer3_out_s32, &conv1_bias_s32, &layer3_out, info1);

    //Layer4_1 Convolution, (960,540,4) (1,1,4,24), PadStride(1,1,0,0) -> (960,540,24)
    conv4_1.configure(&layer3_out, &conv2_weight, nullptr, &layer4_out_s32, PadStrideInfo(1, 1, 0, 0));
    //Layer4_2 BitShift S32toInt8, (960,540,24) (ReLU Here!!!, RightShift:) -> (960,540,24)
    GEMMLowpOutputStageInfo info2;
    info2.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
    info2.output_data_type    = DataType::QASYMM8_SIGNED;
    info2.gemmlowp_min_bound  = 0;  //ReLU here;
    info2.gemmlowp_max_bound  = 127;
    info2.gemmlowp_offset     = -9;

    outstage4_2.configure(&layer4_out_s32, &conv2_bias_s32, &layer4_out, info2);

    //Layer5_1 Convolution, (960,540,24) (1,1,24,3), PadStride(1,1,0,0) -> (960,540,3)
    conv5_1.configure(&layer4_out, &conv3_weight, nullptr, &layer5_out_s32, PadStrideInfo(1, 1, 0, 0));
    //Layer5_2 BitShift S32toInt8, (960,540,3) (no activation function!!!, RightShift:) -> (960,540,3)
    GEMMLowpOutputStageInfo info3;
    info3.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
    info3.output_data_type    = DataType::QASYMM8_SIGNED;
    info3.gemmlowp_min_bound  = -128;  //no ReLU here;
    info3.gemmlowp_max_bound  = 127;
    info3.gemmlowp_offset     = -9;

    outstage5_2.configure(&layer5_out_s32, &conv3_bias_s32, &layer5_out, info3);

    //Layer6_1 Convolution, (960,540,3) (3,3,3,4), PadStride(1,1,1,1) -> (960,540,4)
    conv6_1.configure(&layer5_out, &conv4_weight, nullptr, &layer6_out_s32, PadStrideInfo(1, 1, 1, 1));
    //Layer6_2 TensorAdd of SkipLink1, (960,540,4), (960,540,4) -> (960,540,4)
    addlayer6_2.configure(&layer6_out_s32, &sk1_out_s32, &layer6_with_sk1_s32, ConvertPolicy::SATURATE);
    //Layer6_3 BitShift S32toInt8, (960,540,4) (no activation function!!!, RightShift:) -> (960,540,4)
    GEMMLowpOutputStageInfo info4;
    info4.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
    info4.output_data_type    = DataType::QASYMM8_SIGNED;
    info4.gemmlowp_min_bound  = -128;  //no ReLU here;
    info4.gemmlowp_max_bound  = 127;
    info4.gemmlowp_offset     = -9;

    outstage6_2.configure(&layer6_with_sk1_s32, &conv4_bias_s32, &layer6_out, info4);

    std::cout<<"All Configuration Done...\n";

    //Tensor allocation
    input.allocator()->allocate();
    sub_1.allocator()->allocate();
    layer1_out.allocator()->allocate();
    layer2_out.allocator()->allocate();
    conv1_weight.allocator()->allocate();
    conv1_bias_s32.allocator()->allocate();
    layer3_out_s32.allocator()->allocate();
    layer3_out.allocator()->allocate();
    conv2_weight.allocator()->allocate();
    conv2_bias_s32.allocator()->allocate();
    layer4_out_s32.allocator()->allocate();
    layer4_out.allocator()->allocate();
    conv3_weight.allocator()->allocate();
    conv3_bias_s32.allocator()->allocate();
    layer5_out_s32.allocator()->allocate();
    layer5_out.allocator()->allocate();
    sk1_out_s32.allocator()->allocate();
    conv4_weight.allocator()->allocate();
    conv4_bias_s32.allocator()->allocate();
    layer6_out_s32.allocator()->allocate();
    layer6_with_sk1_s32.allocator()->allocate();
    layer6_out.allocator()->allocate();

    //Test Several Tensor Info
    int i_x = layer2_out.info()->strides_in_bytes().x();
    int i_y = layer2_out.info()->strides_in_bytes().y();
    int i_z = layer2_out.info()->strides_in_bytes().z();

    std::cout<<i_x<<"\n";
    std::cout<<i_y<<"\n";
    std::cout<<i_z<<"\n";

    std::cout<<"\n";

    auto *sk1_ptr = reinterpret_cast<int32_t *>(layer3_out_s32.buffer());
    auto *bias1_ptr = reinterpret_cast<int32_t *>(conv1_bias_s32.buffer());
    auto *sk1_out_ptr = reinterpret_cast<int32_t *>(sk1_out_s32.buffer());
    int             loopsize = 960*540;   // 960*540 is exactly divisible by 16, so no leftover elements need handling
    int             step = 16;  
    int             result_fixedpoint_multiplier=0;
    int32_t         result_shift=0;

    struct timeval start, end;
    float totatcnt = 0;

    /*
    std::cout<<"Memory Allocation Test between For-Loop and Memcpy...\n";
    std::cout<<"\n";

    //Test for Allocation Speed: for loop Versus memcpy
    //for loop:
    auto *input_ptr = reinterpret_cast<float *>(input.buffer());
    gettimeofday(&start, NULL);
    for(int i=0; i<960*540; ++i)
    {
        *(input_ptr+0) = 1;
        *(input_ptr+1) = 1;
        *(input_ptr+2) = 1;

        input_ptr +=3;
    }
    gettimeofday(&end, NULL);
    int timeuse = 1000000 * ( end.tv_sec - start.tv_sec ) + end.tv_usec -start.tv_usec;
    printf("timecost: %d us\n", timeuse);

    //memcpy:
    float arr[3] = {1};

    gettimeofday(&start, NULL);
    for(int i=0; i<960*540; ++i)
    {
        memcpy(input_ptr, arr, 3 * sizeof(float));
        input_ptr +=3;
    }
    gettimeofday(&end, NULL);
    timeuse = 1000000 * ( end.tv_sec - start.tv_sec ) + end.tv_usec -start.tv_usec;
    printf("timecost: %d us\n", timeuse);

    std::cout<<"\n";

    //Im2Col Test
    std::cout<<"Im2Col via Memcpy Test...\n";
    std::cout<<"\n";
    */

    //
    //
    //Model Execution

    for(int i=0; i<4; i++)
    {
        gettimeofday(&start, NULL);
        sublayer_1.run();
        quant_2.run();
        conv3_1.run();
        outstage3_2.run();
        conv4_1.run();
        outstage4_2.run();
        conv5_1.run();
        outstage5_2.run();
        conv6_1.run();

        //SkipLink Design and Execution
        //SL1:
        //Conv1_out_s32 + Bias * scale (keep S32 format) size(960,540,4)
        //Processing based on channel-splitwise:
        //Step1: Add Bias(S32)
        //Step2: Int-Based Division

        for(int j = 0; j<4; j++)    // 4 channels lead to 4 outer loops
        {
            //Load Bias Value for the channel
            int32_t bias_value = bias1_ptr[j];
            const int32x4_t bias_s32 = vdupq_n_s32(bias_value);

            for(int i=0; i<=loopsize-step; i+=step)
            {
                int32x4x4_t in_s32 =
                {
                    {
                        vld1q_s32(sk1_ptr + i + 0 + j*loopsize),
                        vld1q_s32(sk1_ptr + i + 4 + j*loopsize),
                        vld1q_s32(sk1_ptr + i + 8 + j*loopsize),
                        vld1q_s32(sk1_ptr + i + 12 + j*loopsize)
                    }
                };

                // Add the bias to GEMM's result
                in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32);
                in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32);
                in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32);
                in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32);

                // Int-Based Scale Process
                if(result_shift < 0)
                {
                    in_s32.val[0] = vmulq_n_s32(in_s32.val[0], (1 << (-result_shift)));
                    in_s32.val[1] = vmulq_n_s32(in_s32.val[1], (1 << (-result_shift)));
                    in_s32.val[2] = vmulq_n_s32(in_s32.val[2], (1 << (-result_shift)));
                    in_s32.val[3] = vmulq_n_s32(in_s32.val[3], (1 << (-result_shift)));

                    in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier);
                    in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier);
                    in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier);
                    in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier);
                }
                else
                {
                    // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
                    in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier);
                    in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier);
                    in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier);
                    in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier);

                    // Round to the nearest division by a power-of-two using result_shift_s32
                    in_s32.val[0] = rounding_divide_by_pow2(in_s32.val[0], result_shift);
                    in_s32.val[1] = rounding_divide_by_pow2(in_s32.val[1], result_shift);
                    in_s32.val[2] = rounding_divide_by_pow2(in_s32.val[2], result_shift);
                    in_s32.val[3] = rounding_divide_by_pow2(in_s32.val[3], result_shift);
                }

                // Store the processed S32 results into the skip-link tensor
                vst1q_s32(sk1_out_ptr + i + 0 + j*loopsize, in_s32.val[0]);
                vst1q_s32(sk1_out_ptr + i + 4 + j*loopsize, in_s32.val[1]);
                vst1q_s32(sk1_out_ptr + i + 8 + j*loopsize, in_s32.val[2]);
                vst1q_s32(sk1_out_ptr + i + 12 + j*loopsize, in_s32.val[3]);
            }
        }

        addlayer6_2.run();
        outstage6_2.run();

        gettimeofday(&end, NULL);

        int timeuse = 1000000 * ( end.tv_sec - start.tv_sec ) + end.tv_usec - start.tv_usec;
        printf("timecost: %d us\n", timeuse);
    }
    //SkipLink2 Design

    return 0;

}

The data layout of the Int8 tensors here is DataLayout::NCHW.
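
Switching to NHWC mainly means reordering the TensorShape and calling set_data_layout() on each TensorInfo. The NHWC variant I mention further down was declared roughly along these lines (a sketch with illustrative names, not the exact code I benchmarked):

    // Sketch only: declaring the input tensor in NHWC. In NHWC the shape is ordered (C, W, H),
    // and set_data_layout() tells the configured layers which layout to expect.
    Tensor input_nhwc;
    const TensorShape input_shape_nhwc(3, 960, 540); // C, W, H
    TensorInfo input_info_nhwc(input_shape_nhwc, 1, DataType::QASYMM8_SIGNED, qinfo);
    input_info_nhwc.set_data_layout(DataLayout::NHWC);
    input_nhwc.allocator()->init(input_info_nhwc);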

My command for compiling the library is: scons Werror=0 -j8 debug=0 asserts=1 neon=1 opencl=1 benchmark_examples=1 os=android arch=arm64-v8a

My test device is a Samsung S10.

I have added timing code in CpuGemmConv2d.cpp to measure the time cost of each stage.
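
The instrumentation is nothing more than a gettimeofday() pair around each stage inside CpuGemmConv2d::run(), roughly as below (a sketch with placeholder comments instead of the actual kernel dispatch calls, which I have not pasted here):

    // Sketch of the per-stage timing added in CpuGemmConv2d.cpp (placeholders, not the real dispatch calls)
    struct timeval t0, t1;

    gettimeofday(&t0, NULL);
    // ... run the input reshaping (Im2Col) kernel here ...
    gettimeofday(&t1, NULL);
    printf("time for input reshaping: %ld us\n",
           1000000L * (t1.tv_sec - t0.tv_sec) + (t1.tv_usec - t0.tv_usec));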

Here is my performance benchmark:

Conv1: input reshaping (Im2Col): 7322 us | Int8 GEMM: 1736 us | output reshaping (Col2Im): 3959 us

Conv2: input reshaping (Im2Col): 4327 us | Int8 GEMM: 4310 us | output reshaping (Col2Im): 22391 us

Conv3: input reshaping (Im2Col): 7178 us | Int8 GEMM: 2166 us | output reshaping (Col2Im): 3163 us

Conv4: input reshaping (Im2Col): 7519 us | Int8 GEMM: 1201 us | output reshaping (Col2Im): 4045 us

Total timecost: 79522 us

I have run over 200 iterations, so ~80 ms is the average inference time for the model.

My question is: why is most of the ConvolutionLayer's time spent in the Im2Col and Col2Im kernels? For Conv2, the Col2Im kernel alone takes about 20 ms, nearly 1/4 of the inference latency of the whole model.

I have also tried the NHWC layout, which seems even worse for the Int8 data type.

Thanks,

LEI

morgolock commented 1 year ago

Hi @GGGGxxxxxxxxr

Please try using v22.11 and building for arm64-v8.2-a, which should improve int8 performance by using the dot product instructions.
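
For example, keeping the rest of the command you shared unchanged, that would be something like: scons Werror=0 -j8 debug=0 asserts=1 neon=1 opencl=1 benchmark_examples=1 os=android arch=arm64-v8.2-a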

I can't see the code that measures the reshaping stages in the code you shared.

Hope this helps.