ARM-software / ComputeLibrary

The Compute Library is a set of computer vision and machine learning functions optimised for both Arm CPUs and GPUs using SIMD technologies.
MIT License
2.75k stars 767 forks source link

Latency for Conv2d and Depthwise #1074

Closed wenhyan closed 4 months ago

wenhyan commented 8 months ago

arm_compute_version=v23.05 Build options: {'Werror': '1', 'debug': '0', 'asserts': '1', 'neon': '1', 'opencl': '0', 'os': 'linux', 'arch': 'armv8a'} Git hash=b'6c713f090601ea839d944a30888ea56eb2f43988'

Platform: Raspberry-Pi 3B A53

Operating System: Linux

Problem description: the latency of a depthwise convolution is affected by the convolution that precedes it.

The network has a 3x3 conv2d followed by a 3x3 depthwise convolution; the first conv2d's output is the depthwise layer's input, and I reuse the same memory buffer for that feature map. With the shared buffer, the conv2d takes 13 ms and the depthwise takes 4 ms. If I do not share the buffer, the depthwise takes only 1 ms.

Use same buffer conv_output_buffer

// "Same buffer" variant: the depthwise input tensor imports the very buffer the
// first convolution writes its output into (conv_output_buffer), so the feature
// map is handed from conv to depthwise without a copy.
{
    Tensor conv_input;
    Tensor conv_weight;
    Tensor conv_bias;
    Tensor conv_output;

    // NHWC F32: 3x3 conv, stride 2, maps 3x320x320 -> 16x160x160.
    conv_input.allocator()->init(TensorInfo(TensorShape(3, 320, 320), 1, DataType::F32, DataLayout::NHWC));
    conv_weight.allocator()->init(TensorInfo(TensorShape(3, 3, 3, 16), 1, DataType::F32, DataLayout::NHWC));
    conv_bias.allocator()->init(TensorInfo(TensorShape(16), 1, DataType::F32, DataLayout::NHWC));
    conv_output.allocator()->init(TensorInfo(TensorShape(16, 160, 160), 1, DataType::F32, DataLayout::NHWC));

    // sizeof(float) instead of a magic 4; static_cast instead of C-style casts.
    auto *conv_input_buffer  = static_cast<uint8_t *>(malloc(sizeof(float) * 320 * 320 * 3));
    auto *conv_weight_buffer = static_cast<uint8_t *>(malloc(sizeof(float) * 16 * 3 * 3 * 3));
    auto *conv_bias_buffer   = static_cast<uint8_t *>(malloc(sizeof(float) * 16));
    auto *conv_output_buffer = static_cast<uint8_t *>(malloc(sizeof(float) * 16 * 160 * 160));

    // Imported memory is owned by this code, not by the tensor allocators.
    conv_input.allocator()->import_memory(conv_input_buffer);
    conv_weight.allocator()->import_memory(conv_weight_buffer);
    conv_bias.allocator()->import_memory(conv_bias_buffer);
    conv_output.allocator()->import_memory(conv_output_buffer);

    NEGEMMConvolutionLayer conv;
    conv.configure(&conv_input,
                   &conv_weight,
                   &conv_bias,
                   &conv_output,
                   PadStrideInfo(2, 2, 1, 1),
                   WeightsInfo(),
                   Size2D(1, 1),
                   ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
                                       6.0f));

    Tensor input;
    Tensor weight;
    Tensor bias;
    Tensor output;

    // 3x3 depthwise, stride 2, maps 16x160x160 -> 16x80x80.
    input.allocator()->init(TensorInfo(TensorShape(16, 160, 160), 1, DataType::F32, DataLayout::NHWC));
    weight.allocator()->init(TensorInfo(TensorShape(16, 3, 3), 1, DataType::F32, DataLayout::NHWC));
    bias.allocator()->init(TensorInfo(TensorShape(16), 1, DataType::F32, DataLayout::NHWC));
    output.allocator()->init(TensorInfo(TensorShape(16, 80, 80), 1, DataType::F32, DataLayout::NHWC));

    // NOTE(review): input_buffer is allocated only to mirror the no-sharing
    // variant; it is never imported (the depthwise input uses conv_output_buffer).
    auto *input_buffer  = static_cast<uint8_t *>(malloc(sizeof(float) * 160 * 160 * 16));
    auto *weight_buffer = static_cast<uint8_t *>(malloc(sizeof(float) * 3 * 3 * 16));
    auto *bias_buffer   = static_cast<uint8_t *>(malloc(sizeof(float) * 16));
    auto *output_buffer = static_cast<uint8_t *>(malloc(sizeof(float) * 16 * 80 * 80));

    // THE point of this variant: depthwise reads straight from the conv output.
    input.allocator()->import_memory(conv_output_buffer);
    weight.allocator()->import_memory(weight_buffer);
    bias.allocator()->import_memory(bias_buffer);
    output.allocator()->import_memory(output_buffer);

    NEDepthwiseConvolutionLayer depth_conv;
    depth_conv.configure(&input,
                         &weight,
                         &bias,
                         &output,
                         PadStrideInfo(2, 2, 1, 1),
                         1,
                         ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0f));

    Tensor conv1_input;
    Tensor conv1_weight;
    Tensor conv1_bias;
    Tensor conv1_output;

    // 1x1 conv, stride 1, maps 16x80x80 -> 8x80x80.
    conv1_input.allocator()->init(TensorInfo(TensorShape(16, 80, 80), 1, DataType::F32, DataLayout::NHWC));
    conv1_weight.allocator()->init(TensorInfo(TensorShape(16, 1, 1, 8), 1, DataType::F32, DataLayout::NHWC));
    conv1_bias.allocator()->init(TensorInfo(TensorShape(8), 1, DataType::F32, DataLayout::NHWC));
    conv1_output.allocator()->init(TensorInfo(TensorShape(8, 80, 80), 1, DataType::F32, DataLayout::NHWC));

    // NOTE(review): conv1_input_buffer is likewise never imported; the 1x1 conv
    // reads the depthwise output buffer directly (shared in this variant too).
    auto *conv1_input_buffer  = static_cast<uint8_t *>(malloc(sizeof(float) * 80 * 80 * 16));
    auto *conv1_weight_buffer = static_cast<uint8_t *>(malloc(sizeof(float) * 16 * 1 * 1 * 8));
    auto *conv1_bias_buffer   = static_cast<uint8_t *>(malloc(sizeof(float) * 8));
    auto *conv1_output_buffer = static_cast<uint8_t *>(malloc(sizeof(float) * 8 * 80 * 80));

    conv1_input.allocator()->import_memory(output_buffer);
    conv1_weight.allocator()->import_memory(conv1_weight_buffer);
    conv1_bias.allocator()->import_memory(conv1_bias_buffer);
    conv1_output.allocator()->import_memory(conv1_output_buffer);

    NEGEMMConvolutionLayer conv1;
    conv1.configure(&conv1_input,
                    &conv1_weight,
                    &conv1_bias,
                    &conv1_output,
                    PadStrideInfo(1, 1, 0, 0),
                    WeightsInfo(),
                    Size2D(1, 1),
                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
                                        6.0f));

    // Run the whole pipeline a few times; per-layer timing is printed elsewhere.
    for(int i = 0; i < 5; i++)
    {
        conv.run();

        depth_conv.run();

        conv1.run();

        std::cout << "=============================" << std::endl;
    }

    // Release everything we malloc'd; imported memory is not freed by the tensors.
    free(input_buffer);
    free(bias_buffer);
    free(weight_buffer);
    free(output_buffer);
    free(conv_input_buffer);
    free(conv_bias_buffer);
    free(conv_weight_buffer);
    free(conv_output_buffer);
    free(conv1_input_buffer);
    free(conv1_bias_buffer);
    free(conv1_weight_buffer);
    free(conv1_output_buffer);
}

Don't use same buffer

// "Different buffer" variant: identical pipeline, but the depthwise input
// tensor imports its own distinct buffer (input_buffer) instead of the first
// convolution's output buffer.
{
    Tensor conv_input;
    Tensor conv_weight;
    Tensor conv_bias;
    Tensor conv_output;

    // NHWC F32: 3x3 conv, stride 2, maps 3x320x320 -> 16x160x160.
    conv_input.allocator()->init(TensorInfo(TensorShape(3, 320, 320), 1, DataType::F32, DataLayout::NHWC));
    conv_weight.allocator()->init(TensorInfo(TensorShape(3, 3, 3, 16), 1, DataType::F32, DataLayout::NHWC));
    conv_bias.allocator()->init(TensorInfo(TensorShape(16), 1, DataType::F32, DataLayout::NHWC));
    conv_output.allocator()->init(TensorInfo(TensorShape(16, 160, 160), 1, DataType::F32, DataLayout::NHWC));

    // sizeof(float) instead of a magic 4; static_cast instead of C-style casts.
    auto *conv_input_buffer  = static_cast<uint8_t *>(malloc(sizeof(float) * 320 * 320 * 3));
    auto *conv_weight_buffer = static_cast<uint8_t *>(malloc(sizeof(float) * 16 * 3 * 3 * 3));
    auto *conv_bias_buffer   = static_cast<uint8_t *>(malloc(sizeof(float) * 16));
    auto *conv_output_buffer = static_cast<uint8_t *>(malloc(sizeof(float) * 16 * 160 * 160));

    // Imported memory is owned by this code, not by the tensor allocators.
    conv_input.allocator()->import_memory(conv_input_buffer);
    conv_weight.allocator()->import_memory(conv_weight_buffer);
    conv_bias.allocator()->import_memory(conv_bias_buffer);
    conv_output.allocator()->import_memory(conv_output_buffer);

    NEGEMMConvolutionLayer conv;
    conv.configure(&conv_input,
                   &conv_weight,
                   &conv_bias,
                   &conv_output,
                   PadStrideInfo(2, 2, 1, 1),
                   WeightsInfo(),
                   Size2D(1, 1),
                   ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
                                       6.0f));

    Tensor input;
    Tensor weight;
    Tensor bias;
    Tensor output;

    // 3x3 depthwise, stride 2, maps 16x160x160 -> 16x80x80.
    input.allocator()->init(TensorInfo(TensorShape(16, 160, 160), 1, DataType::F32, DataLayout::NHWC));
    weight.allocator()->init(TensorInfo(TensorShape(16, 3, 3), 1, DataType::F32, DataLayout::NHWC));
    bias.allocator()->init(TensorInfo(TensorShape(16), 1, DataType::F32, DataLayout::NHWC));
    output.allocator()->init(TensorInfo(TensorShape(16, 80, 80), 1, DataType::F32, DataLayout::NHWC));

    auto *input_buffer  = static_cast<uint8_t *>(malloc(sizeof(float) * 160 * 160 * 16));
    auto *weight_buffer = static_cast<uint8_t *>(malloc(sizeof(float) * 3 * 3 * 16));
    auto *bias_buffer   = static_cast<uint8_t *>(malloc(sizeof(float) * 16));
    auto *output_buffer = static_cast<uint8_t *>(malloc(sizeof(float) * 16 * 80 * 80));

    // THE difference from the other variant: depthwise gets its own buffer.
    input.allocator()->import_memory(input_buffer);
    weight.allocator()->import_memory(weight_buffer);
    bias.allocator()->import_memory(bias_buffer);
    output.allocator()->import_memory(output_buffer);

    NEDepthwiseConvolutionLayer depth_conv;
    depth_conv.configure(&input,
                         &weight,
                         &bias,
                         &output,
                         PadStrideInfo(2, 2, 1, 1),
                         1,
                         ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0f));

    Tensor conv1_input;
    Tensor conv1_weight;
    Tensor conv1_bias;
    Tensor conv1_output;

    // 1x1 conv, stride 1, maps 16x80x80 -> 8x80x80.
    conv1_input.allocator()->init(TensorInfo(TensorShape(16, 80, 80), 1, DataType::F32, DataLayout::NHWC));
    conv1_weight.allocator()->init(TensorInfo(TensorShape(16, 1, 1, 8), 1, DataType::F32, DataLayout::NHWC));
    conv1_bias.allocator()->init(TensorInfo(TensorShape(8), 1, DataType::F32, DataLayout::NHWC));
    conv1_output.allocator()->init(TensorInfo(TensorShape(8, 80, 80), 1, DataType::F32, DataLayout::NHWC));

    // NOTE(review): conv1_input_buffer is never imported; even in this variant
    // the 1x1 conv reads the depthwise output buffer directly.
    auto *conv1_input_buffer  = static_cast<uint8_t *>(malloc(sizeof(float) * 80 * 80 * 16));
    auto *conv1_weight_buffer = static_cast<uint8_t *>(malloc(sizeof(float) * 16 * 1 * 1 * 8));
    auto *conv1_bias_buffer   = static_cast<uint8_t *>(malloc(sizeof(float) * 8));
    auto *conv1_output_buffer = static_cast<uint8_t *>(malloc(sizeof(float) * 8 * 80 * 80));

    conv1_input.allocator()->import_memory(output_buffer);
    conv1_weight.allocator()->import_memory(conv1_weight_buffer);
    conv1_bias.allocator()->import_memory(conv1_bias_buffer);
    conv1_output.allocator()->import_memory(conv1_output_buffer);

    NEGEMMConvolutionLayer conv1;
    conv1.configure(&conv1_input,
                    &conv1_weight,
                    &conv1_bias,
                    &conv1_output,
                    PadStrideInfo(1, 1, 0, 0),
                    WeightsInfo(),
                    Size2D(1, 1),
                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
                                        6.0f));

    // Run the whole pipeline a few times; per-layer timing is printed elsewhere.
    for(int i = 0; i < 5; i++)
    {
        conv.run();

        depth_conv.run();

        conv1.run();

        std::cout << "=============================" << std::endl;
    }

    // Release everything we malloc'd; imported memory is not freed by the tensors.
    free(input_buffer);
    free(bias_buffer);
    free(weight_buffer);
    free(output_buffer);
    free(conv_input_buffer);
    free(conv_bias_buffer);
    free(conv_weight_buffer);
    free(conv_output_buffer);
    free(conv1_input_buffer);
    free(conv1_bias_buffer);
    free(conv1_weight_buffer);
    free(conv1_output_buffer);
}

Same buffer:      Time used is 13.863   Time used is 4.076   Time used is 0.593
=============================
Different buffer: Time used is 13.736   Time used is 1.204   Time used is 0.66

morgolock commented 5 months ago

Hi @wenhyan

I made some changes to your code to assess the performance and I don't see any differences. I tried on A73 and built the test with -O3. The library was built with scons os=linux opencl=0 asserts=0 examples=0 neon=1 arch=armv8a benchmark_examples=0 examples=0 arch=armv8a debug=0 validation_tests=0 opencl=0

See the output of the two binaries:

# LD_LIBRARY_PATH=.:$LD_LIBRARY_PATH ./same_latency 
SAVE BUFFER
 same buffers 92ms to run.
# LD_LIBRARY_PATH=.:$LD_LIBRARY_PATH ./diff_latency 
Different buffer
DIFF BUFFER
 diff buffers 91ms to run.

And the code below

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "utils/Utils.h"
#include "tests/SimpleTensor.h"
#include "arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h"
#include <chrono>

using namespace std;
using namespace arm_compute;
using namespace arm_compute::test;

int main()
{

#if 0
    Tensor conv_input;
    Tensor conv_weight;
    Tensor conv_bias;
    Tensor conv_output;

    conv_input.allocator()->init(TensorInfo(TensorShape(3, 320, 320), 1, DataType::F32, DataLayout::NHWC));
    conv_weight.allocator()->init(TensorInfo(TensorShape(3, 3, 3, 16), 1, DataType::F32, DataLayout::NHWC));
    conv_bias.allocator()->init(TensorInfo(TensorShape(16), 1, DataType::F32, DataLayout::NHWC));
    conv_output.allocator()->init(TensorInfo(TensorShape(16, 160, 160), 1, DataType::F32, DataLayout::NHWC));

    uint8_t *conv_input_buffer  = (uint8_t *)malloc(4 * 320 * 320 * 3);
    uint8_t *conv_weight_buffer = (uint8_t *)malloc(4 * 16 * 3 * 3 * 3);
    uint8_t *conv_bias_buffer   = (uint8_t *)malloc(4 * 16);
    uint8_t *conv_output_buffer = (uint8_t *)malloc(4 * 16 * 160 * 160);

    conv_input.allocator()->import_memory(conv_input_buffer);
    conv_weight.allocator()->import_memory(conv_weight_buffer);
    conv_bias.allocator()->import_memory(conv_bias_buffer);
    conv_output.allocator()->import_memory(conv_output_buffer);

    NEGEMMConvolutionLayer conv;
    conv.configure(&conv_input,
                   &conv_weight,
                   &conv_bias,
                   &conv_output,
                   PadStrideInfo(2, 2, 1, 1),
                   WeightsInfo(),
                   Size2D(1, 1),
                   ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
                                       6.0f));

    Tensor input;
    Tensor weight;
    Tensor bias;
    Tensor output;

    input.allocator()->init(TensorInfo(TensorShape(16, 160, 160), 1, DataType::F32, DataLayout::NHWC));
    weight.allocator()->init(TensorInfo(TensorShape(16, 3, 3), 1, DataType::F32, DataLayout::NHWC));
    bias.allocator()->init(TensorInfo(TensorShape(16), 1, DataType::F32, DataLayout::NHWC));
    output.allocator()->init(TensorInfo(TensorShape(16, 80, 80), 1, DataType::F32, DataLayout::NHWC));

    uint8_t *input_buffer  = (uint8_t *)malloc(4 * 160 * 160 * 16);
    uint8_t *weight_buffer = (uint8_t *)malloc(4 * 3 * 3 * 16);
    uint8_t *bias_buffer   = (uint8_t *)malloc(4 * 16);
    uint8_t *output_buffer = (uint8_t *)malloc(4 * 16 * 80 * 80);

    input.allocator()->import_memory(conv_output_buffer);
    weight.allocator()->import_memory(weight_buffer);
    bias.allocator()->import_memory(bias_buffer);
    output.allocator()->import_memory(output_buffer);

    NEDepthwiseConvolutionLayer depth_conv;
    depth_conv.configure(&input,
                         &weight,
                         &bias,
                         &output,
                         PadStrideInfo(2, 2, 1, 1),
                         1,
                         ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0f));

    Tensor conv1_input;
    Tensor conv1_weight;
    Tensor conv1_bias;
    Tensor conv1_output;

    conv1_input.allocator()->init(TensorInfo(TensorShape(16, 80, 80), 1, DataType::F32, DataLayout::NHWC));
    conv1_weight.allocator()->init(TensorInfo(TensorShape(16, 1, 1, 8), 1, DataType::F32, DataLayout::NHWC));
    conv1_bias.allocator()->init(TensorInfo(TensorShape(8), 1, DataType::F32, DataLayout::NHWC));
    conv1_output.allocator()->init(TensorInfo(TensorShape(8, 80, 80), 1, DataType::F32, DataLayout::NHWC));

    uint8_t *conv1_input_buffer  = (uint8_t *)malloc(4 * 80 * 80 * 16);
    uint8_t *conv1_weight_buffer = (uint8_t *)malloc(4 * 16 * 1 * 1 * 8);
    uint8_t *conv1_bias_buffer   = (uint8_t *)malloc(4 * 8);
    uint8_t *conv1_output_buffer = (uint8_t *)malloc(4 * 8 * 80 * 80);

    conv1_input.allocator()->import_memory(output_buffer);
    conv1_weight.allocator()->import_memory(conv1_weight_buffer);
    conv1_bias.allocator()->import_memory(conv1_bias_buffer);
    conv1_output.allocator()->import_memory(conv1_output_buffer);

    NEGEMMConvolutionLayer conv1;
    conv1.configure(&conv1_input,
                    &conv1_weight,
                    &conv1_bias,
                    &conv1_output,
                    PadStrideInfo(1, 1, 0, 0),
                    WeightsInfo(),
                    Size2D(1, 1),
                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
                                        6.0f));

    std::cout << "SAVE BUFFER\n";

    auto start_time = std::chrono::high_resolution_clock::now();
        conv.run();
        depth_conv.run();
        conv1.run();

    for(int i = 0; i < 25; i++)
    {
        conv.run();
        depth_conv.run();
        conv1.run();
    }
    auto end_time = std::chrono::high_resolution_clock::now();
    auto time = end_time - start_time;
    std::cout << " same buffers "<<   time/std::chrono::milliseconds(1) << "ms to run.\n";

    free(input_buffer);
    free(bias_buffer);
    free(weight_buffer);
    free(output_buffer);
    free(conv_input_buffer);
    free(conv_bias_buffer);
    free(conv_weight_buffer);
    free(conv_output_buffer);
    free(conv1_input_buffer);
    free(conv1_bias_buffer);
    free(conv1_weight_buffer);
    free(conv1_output_buffer); 
#else
    std::cout << "Different buffer\n";

    Tensor conv_input;
    Tensor conv_weight;
    Tensor conv_bias;
    Tensor conv_output;

    conv_input.allocator()->init(TensorInfo(TensorShape(3, 320, 320), 1, DataType::F32, DataLayout::NHWC));
    conv_weight.allocator()->init(TensorInfo(TensorShape(3, 3, 3, 16), 1, DataType::F32, DataLayout::NHWC));
    conv_bias.allocator()->init(TensorInfo(TensorShape(16), 1, DataType::F32, DataLayout::NHWC));
    conv_output.allocator()->init(TensorInfo(TensorShape(16, 160, 160), 1, DataType::F32, DataLayout::NHWC));

    uint8_t *conv_input_buffer  = (uint8_t *)malloc(4 * 320 * 320 * 3);
    uint8_t *conv_weight_buffer = (uint8_t *)malloc(4 * 16 * 3 * 3 * 3);
    uint8_t *conv_bias_buffer   = (uint8_t *)malloc(4 * 16);
    uint8_t *conv_output_buffer = (uint8_t *)malloc(4 * 16 * 160 * 160);

    conv_input.allocator()->import_memory(conv_input_buffer);
    conv_weight.allocator()->import_memory(conv_weight_buffer);
    conv_bias.allocator()->import_memory(conv_bias_buffer);
    conv_output.allocator()->import_memory(conv_output_buffer);

    NEGEMMConvolutionLayer conv;
    conv.configure(&conv_input,
                   &conv_weight,
                   &conv_bias,
                   &conv_output,
                   PadStrideInfo(2, 2, 1, 1),
                   WeightsInfo(),
                   Size2D(1, 1),
                   ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
                                       6.0f));

    Tensor input;
    Tensor weight;
    Tensor bias;
    Tensor output;

    input.allocator()->init(TensorInfo(TensorShape(16, 160, 160), 1, DataType::F32, DataLayout::NHWC));
    weight.allocator()->init(TensorInfo(TensorShape(16, 3, 3), 1, DataType::F32, DataLayout::NHWC));
    bias.allocator()->init(TensorInfo(TensorShape(16), 1, DataType::F32, DataLayout::NHWC));
    output.allocator()->init(TensorInfo(TensorShape(16, 80, 80), 1, DataType::F32, DataLayout::NHWC));

    uint8_t *input_buffer  = (uint8_t *)malloc(4 * 160 * 160 * 16);
    uint8_t *weight_buffer = (uint8_t *)malloc(4 * 3 * 3 * 16);
    uint8_t *bias_buffer   = (uint8_t *)malloc(4 * 16);
    uint8_t *output_buffer = (uint8_t *)malloc(4 * 16 * 80 * 80);

    input.allocator()->import_memory(input_buffer);
    weight.allocator()->import_memory(weight_buffer);
    bias.allocator()->import_memory(bias_buffer);
    output.allocator()->import_memory(output_buffer);

    NEDepthwiseConvolutionLayer depth_conv;
    depth_conv.configure(&input,
                         &weight,
                         &bias,
                         &output,
                         PadStrideInfo(2, 2, 1, 1),
                         1,
                         ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0f));

    Tensor conv1_input;
    Tensor conv1_weight;
    Tensor conv1_bias;
    Tensor conv1_output;

    conv1_input.allocator()->init(TensorInfo(TensorShape(16, 80, 80), 1, DataType::F32, DataLayout::NHWC));
    conv1_weight.allocator()->init(TensorInfo(TensorShape(16, 1, 1, 8), 1, DataType::F32, DataLayout::NHWC));
    conv1_bias.allocator()->init(TensorInfo(TensorShape(8), 1, DataType::F32, DataLayout::NHWC));
    conv1_output.allocator()->init(TensorInfo(TensorShape(8, 80, 80), 1, DataType::F32, DataLayout::NHWC));

    uint8_t *conv1_input_buffer  = (uint8_t *)malloc(4 * 80 * 80 * 16);
    uint8_t *conv1_weight_buffer = (uint8_t *)malloc(4 * 16 * 1 * 1 * 8);
    uint8_t *conv1_bias_buffer   = (uint8_t *)malloc(4 * 8);
    uint8_t *conv1_output_buffer = (uint8_t *)malloc(4 * 8 * 80 * 80);

    conv1_input.allocator()->import_memory(output_buffer);
    conv1_weight.allocator()->import_memory(conv1_weight_buffer);
    conv1_bias.allocator()->import_memory(conv1_bias_buffer);
    conv1_output.allocator()->import_memory(conv1_output_buffer);

    NEGEMMConvolutionLayer conv1;
    conv1.configure(&conv1_input,
                    &conv1_weight,
                    &conv1_bias,
                    &conv1_output,
                    PadStrideInfo(1, 1, 0, 0),
                    WeightsInfo(),
                    Size2D(1, 1),
                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
                                        6.0f));
    std::cout << "DIFF BUFFER\n";

    auto start_time = std::chrono::high_resolution_clock::now();
        conv.run();
        depth_conv.run();
        conv1.run();

    for(int i = 0; i < 25; i++)
    {
        conv.run();
        depth_conv.run();
        conv1.run();
    }
    auto end_time = std::chrono::high_resolution_clock::now();
    auto time = end_time - start_time;
    std::cout << " diff buffers "<<   time/std::chrono::milliseconds(1) << "ms to run.\n";

    free(input_buffer);
    free(bias_buffer);
    free(weight_buffer);
    free(output_buffer);
    free(conv_input_buffer);
    free(conv_bias_buffer);
    free(conv_weight_buffer);
    free(conv_output_buffer);
    free(conv1_input_buffer);
    free(conv1_bias_buffer);
    free(conv1_weight_buffer);
    free(conv1_output_buffer);

#endif

    return 0;
}
wenhyan commented 4 months ago

Hi @morgolock Thx. I will try it again.