Closed wenhyan closed 9 months ago
Hi @wenhyan
I made some changes to your code to assess the performance and I don't see any differences. I tried on A73 and built the test with -O3
. The library was built with scons os=linux opencl=0 asserts=0 examples=0 neon=1 arch=armv8a benchmark_examples=0 examples=0 arch=armv8a debug=0 validation_tests=0 opencl=0
See the output of the two binaries:
# LD_LIBRARY_PATH=.:$LD_LIBRARY_PATH ./same_latency
SAVE BUFFER
same buffers 92ms to run.
# LD_LIBRARY_PATH=.:$LD_LIBRARY_PATH ./diff_latency
Different buffer
DIFF BUFFER
diff buffers 91ms to run.
And the code below
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "utils/Utils.h"
#include "tests/SimpleTensor.h"
#include "arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h"
#include <chrono>
using namespace std;
using namespace arm_compute;
using namespace arm_compute::test;
// ---------------------------------------------------------------------------
// Benchmark: does importing the previous layer's output buffer as the next
// layer's input ("buffer reuse") change the measured NEON latency?
//
// Pipeline (all F32, NHWC):
//   NEGEMMConvolutionLayer      3x3 s2 : 3x320x320  -> 16x160x160
//   NEDepthwiseConvolutionLayer 3x3 s2 : 16x160x160 -> 16x80x80
//   NEGEMMConvolutionLayer      1x1 s1 : 16x80x80   -> 8x80x80
//
// @param chain_buffers  true  -> each layer's input imports the previous
//                                layer's output buffer (shared-buffer case)
//                       false -> every tensor owns its own private buffer
// @return wall-clock duration of 26 consecutive runs of the three layers
//         (the original code timed 1 warm-up run + a loop of 25, all inside
//         the timer; that count is preserved here).
static std::chrono::high_resolution_clock::duration run_network(bool chain_buffers)
{
    // ---- Layer 1: 3x3 convolution -------------------------------------
    Tensor conv_input, conv_weight, conv_bias, conv_output;
    conv_input.allocator()->init(TensorInfo(TensorShape(3, 320, 320), 1, DataType::F32, DataLayout::NHWC));
    conv_weight.allocator()->init(TensorInfo(TensorShape(3, 3, 3, 16), 1, DataType::F32, DataLayout::NHWC));
    conv_bias.allocator()->init(TensorInfo(TensorShape(16), 1, DataType::F32, DataLayout::NHWC));
    conv_output.allocator()->init(TensorInfo(TensorShape(16, 160, 160), 1, DataType::F32, DataLayout::NHWC));

    // F32 => 4 bytes per element. The buffers are deliberately left
    // uninitialized (as in the original repro) — only latency is measured.
    uint8_t *conv_input_buffer  = static_cast<uint8_t *>(malloc(4 * 320 * 320 * 3));
    uint8_t *conv_weight_buffer = static_cast<uint8_t *>(malloc(4 * 16 * 3 * 3 * 3));
    uint8_t *conv_bias_buffer   = static_cast<uint8_t *>(malloc(4 * 16));
    uint8_t *conv_output_buffer = static_cast<uint8_t *>(malloc(4 * 16 * 160 * 160));

    conv_input.allocator()->import_memory(conv_input_buffer);
    conv_weight.allocator()->import_memory(conv_weight_buffer);
    conv_bias.allocator()->import_memory(conv_bias_buffer);
    conv_output.allocator()->import_memory(conv_output_buffer);

    NEGEMMConvolutionLayer conv;
    conv.configure(&conv_input, &conv_weight, &conv_bias, &conv_output,
                   PadStrideInfo(2, 2, 1, 1), WeightsInfo(), Size2D(1, 1),
                   ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0f));

    // ---- Layer 2: 3x3 depthwise convolution ---------------------------
    Tensor input, weight, bias, output;
    input.allocator()->init(TensorInfo(TensorShape(16, 160, 160), 1, DataType::F32, DataLayout::NHWC));
    weight.allocator()->init(TensorInfo(TensorShape(16, 3, 3), 1, DataType::F32, DataLayout::NHWC));
    bias.allocator()->init(TensorInfo(TensorShape(16), 1, DataType::F32, DataLayout::NHWC));
    output.allocator()->init(TensorInfo(TensorShape(16, 80, 80), 1, DataType::F32, DataLayout::NHWC));

    // input_buffer is only used in the non-chained case but is always
    // allocated/freed so both modes do the same number of heap operations.
    uint8_t *input_buffer  = static_cast<uint8_t *>(malloc(4 * 160 * 160 * 16));
    uint8_t *weight_buffer = static_cast<uint8_t *>(malloc(4 * 3 * 3 * 16));
    uint8_t *bias_buffer   = static_cast<uint8_t *>(malloc(4 * 16));
    uint8_t *output_buffer = static_cast<uint8_t *>(malloc(4 * 16 * 80 * 80));

    // Shared case: feed the depthwise layer directly from conv's output buffer.
    input.allocator()->import_memory(chain_buffers ? conv_output_buffer : input_buffer);
    weight.allocator()->import_memory(weight_buffer);
    bias.allocator()->import_memory(bias_buffer);
    output.allocator()->import_memory(output_buffer);

    NEDepthwiseConvolutionLayer depth_conv;
    depth_conv.configure(&input, &weight, &bias, &output,
                         PadStrideInfo(2, 2, 1, 1),
                         1, // depth multiplier
                         ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0f));

    // ---- Layer 3: 1x1 pointwise convolution ---------------------------
    Tensor conv1_input, conv1_weight, conv1_bias, conv1_output;
    conv1_input.allocator()->init(TensorInfo(TensorShape(16, 80, 80), 1, DataType::F32, DataLayout::NHWC));
    conv1_weight.allocator()->init(TensorInfo(TensorShape(16, 1, 1, 8), 1, DataType::F32, DataLayout::NHWC));
    conv1_bias.allocator()->init(TensorInfo(TensorShape(8), 1, DataType::F32, DataLayout::NHWC));
    conv1_output.allocator()->init(TensorInfo(TensorShape(8, 80, 80), 1, DataType::F32, DataLayout::NHWC));

    uint8_t *conv1_input_buffer  = static_cast<uint8_t *>(malloc(4 * 80 * 80 * 16));
    uint8_t *conv1_weight_buffer = static_cast<uint8_t *>(malloc(4 * 16 * 1 * 1 * 8));
    uint8_t *conv1_bias_buffer   = static_cast<uint8_t *>(malloc(4 * 8));
    uint8_t *conv1_output_buffer = static_cast<uint8_t *>(malloc(4 * 8 * 80 * 80));

    // BUG FIX: the original imported output_buffer here in BOTH branches, so
    // the "different buffers" configuration still shared the depthwise output
    // with conv1's input, and conv1_input_buffer was allocated but never used.
    // The chaining is now controlled by the flag at this junction too.
    conv1_input.allocator()->import_memory(chain_buffers ? output_buffer : conv1_input_buffer);
    conv1_weight.allocator()->import_memory(conv1_weight_buffer);
    conv1_bias.allocator()->import_memory(conv1_bias_buffer);
    conv1_output.allocator()->import_memory(conv1_output_buffer);

    NEGEMMConvolutionLayer conv1;
    conv1.configure(&conv1_input, &conv1_weight, &conv1_bias, &conv1_output,
                    PadStrideInfo(1, 1, 0, 0), WeightsInfo(), Size2D(1, 1),
                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0f));

    // ---- Timed section: 26 runs of the whole pipeline -----------------
    const auto start_time = std::chrono::high_resolution_clock::now();
    for(int i = 0; i < 26; ++i)
    {
        conv.run();
        depth_conv.run();
        conv1.run();
    }
    const auto end_time = std::chrono::high_resolution_clock::now();

    // import_memory() does not transfer ownership, so the raw buffers must be
    // released here; the Tensor destructors will not free imported memory.
    free(input_buffer);
    free(bias_buffer);
    free(weight_buffer);
    free(output_buffer);
    free(conv_input_buffer);
    free(conv_bias_buffer);
    free(conv_weight_buffer);
    free(conv_output_buffer);
    free(conv1_input_buffer);
    free(conv1_bias_buffer);
    free(conv1_weight_buffer);
    free(conv1_output_buffer);

    return end_time - start_time;
}

int main()
{
    // Flip the preprocessor switch (kept from the original repro) to compare
    // the shared-buffer and private-buffer configurations.
#if 0
    std::cout << "SAVE BUFFER\n";
    const auto elapsed = run_network(/*chain_buffers=*/true);
    std::cout << " same buffers " << elapsed / std::chrono::milliseconds(1) << "ms to run.\n";
#else
    std::cout << "Different buffer\n";
    std::cout << "DIFF BUFFER\n";
    const auto elapsed = run_network(/*chain_buffers=*/false);
    std::cout << " diff buffers " << elapsed / std::chrono::milliseconds(1) << "ms to run.\n";
#endif
    return 0;
}
Hi @morgolock Thx. I will try it again.
arm_compute_version=v23.05 Build options: {'Werror': '1', 'debug': '0', 'asserts': '1', 'neon': '1', 'opencl': '0', 'os': 'linux', 'arch': 'armv8a'} Git hash=b'6c713f090601ea839d944a30888ea56eb2f43988'
Platform: Raspberry-Pi 3B A53
Operating System: Linux
Problem description: The latency of a depthwise is affected by the previous convolution
There are two layers: a 3x3 conv2d followed by a 3x3 depthwise conv2d, where the first conv2d's output is the depthwise layer's input. When I reuse the same memory buffer to hold the intermediate feature map, the conv2d latency is 13 ms and the depthwise latency is 4 ms. If I do not reuse the buffer, the depthwise latency drops to 1 ms.
Use same buffer conv_output_buffer
Don't use same buffer
Time used is 13.863 Time used is 4.076 Time used is 0.593 =============================
Time used is 13.736 Time used is 1.204 Time used is 0.66