Closed: poltomo closed this issue 3 months ago.
@poltomo Thanks for reporting, we are looking into it and will come back soon.
Hi @poltomo, the exact example above was compiled against the Arm Compute Library built for armv8 on Linux and ran without a segmentation fault. Here are the steps taken:
scons -s -j 8 Werror=0 debug=0 arch=armv8a os=linux neon=1 validation_tests=0 build_dir=neconv_example opencl=0
$ aarch64-none-linux-gnu-g++ --version
aarch64-none-linux-gnu-g++ (fsf-10.128) 10.2.1 20201112
$ aarch64-none-linux-gnu-g++ examples/neconv.cpp -I. -I include/ utils/Utils.cpp -std=c++14 -L build/neconv_example/ -larm_compute -o ne_conv_layer
Running it on a Linux board with the armv8 architecture gave the following output, with exit code 0:
$ ./ne_conv_layer
N 1
Hi 256
Wi 256
Ci 240
Hf 3
Wf 3
Ho 254
Wo 254
Co 64
0.569512
$ echo $?
0
Could you please provide more information to reproduce the Segmentation Fault? Thanks
@ramelg01 I think the operation just was not supported; that's probably why it segfaulted. I confirmed this is the case for some op configurations using the validate function (see below).
See below for an updated, cleaner benchmark that checks whether each op is supported.
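Side note: validate() returns an arm_compute::Status rather than a plain bool, so (if I am reading the API right) the rejection reason can be printed as well. A minimal sketch, assuming the same headers as the benchmark below; report_gemm_conv_support is just an illustrative helper name:

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include <iostream>

// Print whether the GEMM conv path accepts the given tensor infos,
// and the library's error description when it does not.
void report_gemm_conv_support(const arm_compute::ITensorInfo* in,
                              const arm_compute::ITensorInfo* weights,
                              const arm_compute::ITensorInfo* out)
{
    const arm_compute::Status status =
        arm_compute::NEGEMMConvolutionLayer::validate(in, weights, nullptr, out,
                                                      arm_compute::PadStrideInfo(1, 1, 0, 0));
    if (bool(status))
        std::cout << "NEGEMMConvolutionLayer supported" << '\n';
    else
        std::cout << "rejected: " << status.error_description() << '\n';
}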
I built libarm_compute.so with:
CC=aarch64-linux-android26-clang CXX=aarch64-linux-android26-clang++ scons build_dir=build_neon_flags/ toolchain_prefix="" Werror=1 -j4 debug=0 asserts=0 neon=1 cppthreads=0 openmp=0 opencl=0 embed_kernels=1 os=android arch=arm64-v8a extra_cxx_flags="-Ofast -ffast-math -funsafe-math-optimizations"
#include "arm_compute/core/Types.h"
#include "utils/Utils.h"
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include <iostream>
#include <chrono>  // std::chrono::duration
#include <cstdlib> // rand
#include <cstring> // memset
#include "my_benchmark.hpp"
#define HIGH 12312.232
#define LOW -12312.232
// #ifndef DATA_TYPE
#define DATA_TYPE F32 // F32, F16, QSYMM8, QASYMM8
#define BATCH_N 1
#define HI 1024
#define WI 1024
#define CI 3
#define HF 1
#define WF 1
#define HO HI-HF+1
#define WO WI-WF+1
#define CO 3
// #endif // ifndef N
#define TRIALS 2.0
using namespace std;
using namespace arm_compute;
// Fill a tensor with random values; lo/hi bound the float range, quantized types get raw bytes.
void fill_tensor(Tensor& conv_weight, DataType dt, float lo = LOW, float hi = HIGH) {
    switch ((int)dt) {
        case (int)DataType::F32:
            for (size_t i = 0; i < conv_weight.info()->tensor_shape().total_size(); ++i) {
                ((float*)conv_weight.buffer())[i] = lo + static_cast<float>(rand()) / (static_cast<float>(RAND_MAX / (hi - lo)));
            }
            break;
        case (int)DataType::F16:
            for (size_t i = 0; i < conv_weight.info()->tensor_shape().total_size(); ++i) {
                ((__fp16*)conv_weight.buffer())[i] = lo + static_cast<float>(rand()) / (static_cast<float>(RAND_MAX / (hi - lo)));
            }
            break;
        case (int)DataType::QSYMM8:
        case (int)DataType::QASYMM8:
            for (size_t i = 0; i < conv_weight.info()->tensor_shape().total_size(); ++i) {
                conv_weight.buffer()[i] = rand() % 256;
            }
            break;
    }
}
void memset_tensor(Tensor& conv_weight, DataType dt) {
    switch ((int)dt) {
        case (int)DataType::F32:
            memset(conv_weight.buffer(), 0, conv_weight.info()->tensor_shape().total_size() * sizeof(float));
            break;
        case (int)DataType::F16:
            memset(conv_weight.buffer(), 0, conv_weight.info()->tensor_shape().total_size() * sizeof(__fp16));
            break;
        case (int)DataType::QSYMM8:
        case (int)DataType::QASYMM8:
            memset(conv_weight.buffer(), 0, conv_weight.info()->tensor_shape().total_size());
            break;
    }
}
int main() {
    // print benchmark info
    switch ((int)DataType::DATA_TYPE) {
        case (int)DataType::F32:
            cout << "F32" << '\n';
            break;
        case (int)DataType::F16:
            cout << "F16" << '\n';
            break;
        case (int)DataType::QSYMM8:
            cout << "QSYMM8" << '\n';
            break;
        case (int)DataType::QASYMM8:
            cout << "QASYMM8" << '\n';
            break;
    }
    cout << "N " << BATCH_N << '\n';
    cout << "Hi " << HI << '\n';
    cout << "Wi " << WI << '\n';
    cout << "Ci " << CI << '\n';
    cout << "Hf " << HF << '\n';
    cout << "Wf " << WF << '\n';
    cout << "Ho " << HO << '\n';
    cout << "Wo " << WO << '\n';
    cout << "Co " << CO << endl;

    // test initialization
    auto data_type = DataType::DATA_TYPE;
    Tensor conv_input;
    Tensor conv_weight;
    Tensor conv_output;
    auto input_info  = TensorInfo(TensorShape(CI, WI, HI, BATCH_N), 1, data_type, DataLayout::NHWC);
    auto weight_info = TensorInfo(TensorShape(CO, HF, WF, CI), 1, data_type, DataLayout::NHWC);
    auto output_info = TensorInfo(TensorShape(CO, WO, HO, BATCH_N), 1, data_type, DataLayout::NHWC);
    conv_input.allocator()->init(input_info);
    conv_weight.allocator()->init(weight_info);
    conv_output.allocator()->init(output_info);
    conv_input.allocator()->allocate();
    conv_weight.allocator()->allocate();
    conv_output.allocator()->allocate();

    NEDirectConvolutionLayer   conv1{};
    NEGEMMConvolutionLayer     conv2{};
    NEWinogradConvolutionLayer conv3{};
    if (NEDirectConvolutionLayer::validate(conv_input.info(), conv_weight.info(), nullptr, conv_output.info(), PadStrideInfo(1, 1, 0, 0))) {
        cout << "NEDirectConvolutionLayer" << '\n';
        conv1.configure(&conv_input, &conv_weight, nullptr, &conv_output, PadStrideInfo(1, 1, 0, 0));
        conv1.run(); // warm-up run, not timed
        std::chrono::duration<double> total_time(0);
        double trials = TRIALS;
        for (int j = 0; j < trials; ++j) {
            fill_tensor(conv_input, data_type);
            memset_tensor(conv_output, data_type);
            {
                Timer timer(&total_time);
                conv1.run();
            }
        }
        std::cout << (total_time.count() / trials) << endl;
    }
    else {
        std::cout << "NEDirectConvolutionLayer not supported" << "\n";
    }
    if (NEGEMMConvolutionLayer::validate(conv_input.info(), conv_weight.info(), nullptr, conv_output.info(), PadStrideInfo(1, 1, 0, 0), WeightsInfo(), Size2D(1U, 1U), ActivationLayerInfo(), true, 1U)) {
        cout << "NEGEMMConvolutionLayer" << '\n';
        conv2.configure(&conv_input, &conv_weight, nullptr, &conv_output, PadStrideInfo(1, 1, 0, 0));
        conv2.run();
        std::chrono::duration<double> total_time(0);
        double trials = TRIALS;
        for (int j = 0; j < trials; ++j) {
            fill_tensor(conv_input, data_type);
            memset_tensor(conv_output, data_type);
            {
                Timer timer(&total_time);
                conv2.run();
            }
        }
        std::cout << (total_time.count() / trials) << endl;
    }
    else {
        std::cout << "NEGEMMConvolutionLayer not supported" << "\n";
    }
    if (NEWinogradConvolutionLayer::validate(conv_input.info(), conv_weight.info(), nullptr, conv_output.info(), PadStrideInfo(1, 1, 0, 0), ActivationLayerInfo(), true)) {
        cout << "NEWinogradConvolutionLayer" << '\n';
        conv3.configure(&conv_input, &conv_weight, nullptr, &conv_output, PadStrideInfo(1, 1, 0, 0));
        conv3.run();
        std::chrono::duration<double> total_time(0);
        double trials = TRIALS;
        for (int j = 0; j < trials; ++j) {
            fill_tensor(conv_input, data_type);
            memset_tensor(conv_output, data_type);
            {
                Timer timer(&total_time);
                conv3.run();
            }
        }
        std::cout << (total_time.count() / trials) << endl;
    }
    else {
        std::cout << "NEWinogradConvolutionLayer not supported" << "\n";
    }
    conv_input.allocator()->free();
    conv_output.allocator()->free();
    conv_weight.allocator()->free();
}
Output:
F32
N 1
Hi 1024
Wi 1024
Ci 3
Hf 1
Wf 1
Ho 1024
Wo 1024
Co 3
NEDirectConvolutionLayer
0.0791217
NEGEMMConvolutionLayer
0.00665224
NEWinogradConvolutionLayer not supported
In this case, it's reasonable, since GEMM is the most performant across the board for 1x1 convs (see the sketch after the defines below).
But why does the following dimension configuration not get any support? I've seen this exact conv in MobileNet.
#define DATA_TYPE F32 // F32, F16, QSYMM8, QASYMM8
#define BATCH_N 1
#define HI 1024
#define WI 1024
#define CI 32
#define HF 1
#define WF 1
#define HO HI-HF+1
#define WO WI-WF+1
#define CO 120
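For reference, this is my own reasoning rather than anything from the library docs: a stride-1 1x1 convolution is exactly a matrix multiply over the channel dimension, which is presumably why the GEMM path is the natural fit:

$$Y_{(n,h,w),\,c_o} = \sum_{c_i=1}^{C_i} X_{(n,h,w),\,c_i}\, W_{c_i,\,c_o} \iff Y = XW, \qquad X \in \mathbb{R}^{(N H W)\times C_i},\; W \in \mathbb{R}^{C_i\times C_o}$$

i.e. flatten the N*H*W positions into rows and the input channels into columns; the 1x1 filter bank is already a C_i x C_o matrix.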
@ramelg01 basically no 1x1 convs are working for big channel_in and channel_out sizes
@ramelg01 Also, how do I benchmark the conv implementation itself, minus any runtime/scheduler activity?
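(A guess on my side, assuming the scheduler knob I know of is the relevant one: pinning the runtime to a single thread before configure()/run() should at least keep thread fork/join out of the timed region.)

#include "arm_compute/runtime/Scheduler.h"

// Hedged sketch: force single-threaded execution so the timer sees
// (mostly) the kernel work itself rather than thread dispatch.
void pin_to_single_thread()
{
    arm_compute::Scheduler::get().set_num_threads(1);
}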
Hi @poltomo
The 1x1 case is not supported in Winograd convolution.
An error is thrown when running the Winograd convolution configure:
conv3.configure(&conv_input, &conv_weight, nullptr, &conv_output, PadStrideInfo(1, 1, 0, 0));
terminate called after throwing an instance of 'std::runtime_error'
what(): in validate src/cpu/operators/CpuWinogradConv2d.cpp:347: Unsupported kernel size: 1 x 1.
Winograd breaks the dot product down into smaller pieces; for a 1x1 kernel those pieces cannot be reused by adjacent convolution outputs, so the transforms only incur extra processing. A 1x1 Winograd convolution would therefore increase overhead and degrade performance.
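For intuition, this is standard Winograd arithmetic rather than anything ACL-specific: F(2x2, 3x3) covers a 2x2 output tile with (2+3-1)^2 multiplications instead of the 2*2*3*3 a direct 3x3 convolution needs, while a 1x1 kernel already uses one multiplication per output per input channel, so there is nothing left to save:

$$\text{direct } 3\times 3:\; 2\cdot 2\cdot 3\cdot 3 = 36, \qquad \text{Winograd } F(2\times 2,\,3\times 3):\; (2+3-1)^2 = 16, \qquad 1\times 1:\; 2\cdot 2\cdot 1\cdot 1 = 4 \text{ (already minimal)}$$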
I am getting a segmentation fault for a simple 3x3 convolution.