ARM-software / ComputeLibrary

The Compute Library is a set of computer vision and machine learning functions optimised for both Arm CPUs and GPUs using SIMD technologies.
MIT License
2.79k stars 773 forks source link

NEConvolutionLayer Segmentation Fault #1126

Closed poltomo closed 1 month ago

poltomo commented 1 month ago

I am getting a segmentation fault for a simple 3x3 convolution.

#include"arm_compute/core/Types.h"
#include"arm_compute/runtime/NEON/NEFunctions.h"

#include"utils/Utils.h"

#include<chrono>
#include<iostream>

#define HI 64343.324234
#define LO -64343.324234

using namespace std;
using namespace arm_compute;

// RAII stopwatch: starts timing on construction and, when it goes out of
// scope, adds the elapsed wall-clock interval to an external accumulator.
struct Timer {
    std::chrono::time_point<std::chrono::high_resolution_clock> start;
    std::chrono::duration<double>* time;
    // Begin timing immediately; `time` must outlive this Timer.
    Timer(std::chrono::duration<double>* time) : start{std::chrono::high_resolution_clock::now()}, time{time} {}
    // Fold the elapsed interval into the accumulator on destruction.
    ~Timer() {
        *time += std::chrono::high_resolution_clock::now() - start;
    }
};

// Fill a tensor with pseudo-random values appropriate for its data type.
// Floating-point types are filled uniformly in [lo, hi]; the 8-bit
// quantized types get raw random bytes. Unknown data types are left
// untouched (matches the original switch's silent default).
void fill_tensor(Tensor& conv_weight, DataType dt, float lo = LO, float hi = HI) {
    // Hoist the element count; total_size() is loop-invariant.
    const size_t count = conv_weight.info()->tensor_shape().total_size();
    switch ((int)dt) {
    case (int)DataType::F32:
        for (size_t i = 0; i < count; ++i) {
            // BUG FIX: honour the lo/hi parameters — the original ignored
            // them and always used the LO/HI macros.
            ((float*)conv_weight.buffer())[i] = lo + static_cast<float>(rand()) / (static_cast<float>(RAND_MAX) / (hi - lo));
        }
        break;
    case (int)DataType::F16:
        for (size_t i = 0; i < count; ++i) {
            ((__fp16*)conv_weight.buffer())[i] = lo + static_cast<float>(rand()) / (static_cast<float>(RAND_MAX) / (hi - lo));
        }
        break;
    case (int)DataType::QSYMM8: // same byte fill for both 8-bit quantized types
    case (int)DataType::QASYMM8:
        for (size_t i = 0; i < count; ++i) {
            conv_weight.buffer()[i] = rand() % 256;
        }
        break;
    }
}
// Zero a tensor's backing buffer. The byte count is the element count
// times the element size implied by the data type; unknown data types
// are a no-op, as in the original switch.
void memset_tensor(Tensor& conv_weight, DataType dt) {
    const size_t elems = conv_weight.info()->tensor_shape().total_size();
    size_t elem_size = 0;
    switch ((int)dt) {
    case (int)DataType::F32:
        elem_size = sizeof(float);
        break;
    case (int)DataType::F16:
        elem_size = sizeof(__fp16);
        break;
    case (int)DataType::QSYMM8: // both quantized types are one byte per element
    case (int)DataType::QASYMM8:
        elem_size = 1;
        break;
    }
    if (elem_size != 0) {
        memset(conv_weight.buffer(), 0, elems * elem_size);
    }
}

// Benchmark a single 3x3, stride-1, no-padding NHWC F32 convolution with
// NEConvolutionLayer: one warm-up run, then 5 timed runs, printing the
// average wall-clock time per run.
int main()
{
    Tensor conv_input;
    Tensor conv_weight;
    Tensor conv_output;
    // NOTE(review): the original also declared an unused `conv_bias` that
    // was never allocated; the layer is configured with a nullptr bias.

    // Input geometry: batch N, height Hi, width Wi, channels Ci.
    const unsigned int N = 1;
    const unsigned int Hi = 256;
    const unsigned int Wi = 256;
    const unsigned int Ci = 240;

    // Filter geometry.
    const unsigned int Hf = 3;
    const unsigned int Wf = 3;

    // Output geometry for a VALID (no padding), stride-1 convolution.
    // Ho was `int` in the original; made unsigned for consistency with
    // every other dimension (value is identical and positive).
    const unsigned int Ho = Hi - Hf + 1;
    const unsigned int Wo = Wi - Wf + 1;
    const unsigned int Co = 64;

    cout << "N " << N << endl;
    cout << "Hi " << Hi << endl;
    cout << "Wi " << Wi << endl;
    cout << "Ci " << Ci << endl;
    cout << "Hf " << Hf << endl;
    cout << "Wf " << Wf << endl;
    cout << "Ho " << Ho << endl;
    cout << "Wo " << Wo << endl;
    cout << "Co " << Co << endl;

    auto data_type = DataType::F32;

    // NHWC layout: the TensorShape is given innermost-first (C, W, H).
    auto input_info = TensorInfo(TensorShape(Ci, Wi, Hi), 1, data_type, DataLayout::NHWC);
    auto weight_info = TensorInfo(TensorShape(Co, Hf, Wf, Ci), 1, data_type, DataLayout::NHWC);
    auto output_info = TensorInfo(TensorShape(Co, Wo, Ho), 1, data_type, DataLayout::NHWC);

    conv_input.allocator()->init(input_info);
    conv_weight.allocator()->init(weight_info);
    conv_output.allocator()->init(output_info);
    conv_input.allocator()->allocate();
    conv_weight.allocator()->allocate();
    conv_output.allocator()->allocate();

    NEConvolutionLayer conv5{};
    // nullptr bias; stride 1x1, padding 0x0.
    conv5.configure(&conv_input, &conv_weight, nullptr, &conv_output, PadStrideInfo(1, 1, 0, 0));
    conv5.run(); // warm-up run (excluded from timing)

    std::chrono::duration<double> total_time5(0);

    // Trial count is an integer now (was a `double` used as a loop bound).
    const int trials = 5;
    for (int j = 0; j < trials; ++j) {
        fill_tensor(conv_input, data_type);
        memset_tensor(conv_output, data_type);
        {
            Timer timer(&total_time5);

            conv5.run();
        }
    }
    std::cout << (total_time5.count() / trials) << "\n";
}
ramelg01 commented 1 month ago

@poltomo Thanks for reporting; we are looking into it and will come back to you soon.

ramelg01 commented 1 month ago

Hi @poltomo, The exact example above was compiled against Arm Compute Library built for armv8 on Linux and ran without segmentation fault. Here are the steps done:

poltomo commented 1 month ago

@ramelg01 I think the operation simply was not supported; that is probably why it segfaulted. I confirmed this is the case for some op configurations using the validate function (see below).

see below for updated and cleaner benchmark that checks if op is supported

built libarm_compute.so

CC=aarch64-linux-android26-clang CXX=aarch64-linux-android26-clang++ scons build_dir=build_neon_flags/ toolchain_prefix="" Werror=1 -j4 debug=0 asserts=0 neon=1 cppthreads=0 openmp=0 opencl=0 embed_kernels=1 os=android arch=arm64-v8a  extra_cxx_flags="-Ofast -ffast-math -funsafe-math-optimizations"
#include "arm_compute/core/Types.h"
#include "utils/Utils.h"
#include "arm_compute/runtime/NEON/NEFunctions.h"

#include<iostream>
#include "my_benchmark.hpp"

#define HIGH 12312.232
#define LOW -12312.232

// #ifndef DATA_TYPE
#define DATA_TYPE F32  // F32, F16, QSYMM8, QASYMM8
#define BATCH_N 1
#define HI 1024
#define WI 1024
#define CI 3
#define HF 1
#define WF 1
#define HO HI-HF+1
#define WO WI-WF+1
#define CO 3
// #endif // ifndef N

#define TRIALS 2.0

using namespace std;
using namespace arm_compute;

// Fill a tensor with pseudo-random values appropriate for its data type.
// Floating-point types are filled uniformly in [lo, hi]; the 8-bit
// quantized types get raw random bytes. Unknown data types are left
// untouched (matches the original switch's silent default).
void fill_tensor(Tensor& conv_weight, DataType dt, float lo = LOW, float hi = HIGH) {
    // Hoist the element count; total_size() is loop-invariant.
    const size_t count = conv_weight.info()->tensor_shape().total_size();
    switch ((int)dt) {
    case (int)DataType::F32:
        for (size_t i = 0; i < count; ++i) {
            // BUG FIX: honour the lo/hi parameters — the original ignored
            // them and always used the LOW/HIGH macros.
            ((float*)conv_weight.buffer())[i] = lo + static_cast<float>(rand()) / (static_cast<float>(RAND_MAX) / (hi - lo));
        }
        break;
    case (int)DataType::F16:
        for (size_t i = 0; i < count; ++i) {
            ((__fp16*)conv_weight.buffer())[i] = lo + static_cast<float>(rand()) / (static_cast<float>(RAND_MAX) / (hi - lo));
        }
        break;
    case (int)DataType::QSYMM8: // same byte fill for both 8-bit quantized types
    case (int)DataType::QASYMM8:
        for (size_t i = 0; i < count; ++i) {
            conv_weight.buffer()[i] = rand() % 256;
        }
        break;
    }
}
// Zero a tensor's backing buffer. The byte count is the element count
// times the element size implied by the data type; unknown data types
// are a no-op, as in the original switch.
void memset_tensor(Tensor& conv_weight, DataType dt) {
    const size_t elems = conv_weight.info()->tensor_shape().total_size();
    size_t elem_size = 0;
    switch ((int)dt) {
    case (int)DataType::F32:
        elem_size = sizeof(float);
        break;
    case (int)DataType::F16:
        elem_size = sizeof(__fp16);
        break;
    case (int)DataType::QSYMM8: // both quantized types are one byte per element
    case (int)DataType::QASYMM8:
        elem_size = 1;
        break;
    }
    if (elem_size != 0) {
        memset(conv_weight.buffer(), 0, elems * elem_size);
    }
}

int main() {
    // print benchmark info
    switch ((int)DataType::DATA_TYPE) {
        case (int)DataType::F32:
            cout << "F32" << '\n';
            break;
        case (int)DataType::F16:
            cout << "F16" << '\n';
            break;
        case (int)DataType::QSYMM8:
            cout << "QSYMM8" << '\n';
            break;
        case (int)DataType::QASYMM8:
            cout << "QASYMM8" << '\n';
            break;
    }
    cout << "N " << BATCH_N << '\n';
    cout << "Hi " << HI << '\n';
    cout << "Wi " << WI << '\n';
    cout << "Ci " << CI << '\n';
    cout << "Hf " << HF << '\n';
    cout << "Wf " << WF << '\n';
    cout << "Ho " << HO << '\n';
    cout << "Wo " << WO << '\n';
    cout << "Co " << CO << endl;

    // test initialization
    auto data_type = DataType::DATA_TYPE;

    Tensor conv_input;
    Tensor conv_weight;
    Tensor conv_output;

    auto input_info = TensorInfo(TensorShape(CI, WI, HI, BATCH_N), 1, data_type, DataLayout::NHWC);
    auto weight_info = TensorInfo(TensorShape(CO, HF, WF, CI), 1, data_type, DataLayout::NHWC);
    auto output_info = TensorInfo(TensorShape(CO, WO, HO, BATCH_N), 1, data_type, DataLayout::NHWC);

    conv_input.allocator()->init(input_info);
    conv_weight.allocator()->init(weight_info);
    conv_output.allocator()->init(output_info);

    conv_input.allocator()->allocate();
    conv_weight.allocator()->allocate();
    conv_output.allocator()->allocate();

    NEDirectConvolutionLayer conv1{};
    NEGEMMConvolutionLayer conv2{};
    NEWinogradConvolutionLayer conv3{};

    if (NEDirectConvolutionLayer::validate(conv_input.info(), conv_weight.info(), nullptr, conv_output.info(), PadStrideInfo(1, 1, 0, 0))) {
        cout << "NEDirectConvolutionLayer" << '\n';
        conv1.configure(&conv_input, &conv_weight, nullptr, &conv_output, PadStrideInfo(1, 1, 0, 0));
        conv1.run();

        std::chrono::duration<double> total_time(0);

        double trials = TRIALS;
        for (int j= 0;j < trials;++j) {
            fill_tensor(conv_input, data_type);
            memset_tensor(conv_output, data_type);
            {
                Timer timer(&total_time);
                conv1.run();
            }
        }
        std::cout << (total_time.count() / trials) << endl;
    }
    else {
        std::cout << "NEDirectConvolutionLayer not supported" << "\n";
    }
    if (NEGEMMConvolutionLayer::validate(conv_input.info(), conv_weight.info(), nullptr, conv_output.info(), PadStrideInfo(1,1,0,0), WeightsInfo(), Size2D(1U,1U), ActivationLayerInfo(), true, 1U)) {
        cout << "NEGEMMConvolutionLayer" << '\n';
        conv2.configure(&conv_input, &conv_weight, nullptr, &conv_output, PadStrideInfo(1, 1, 0, 0));
        conv2.run();

        std::chrono::duration<double> total_time(0);

        double trials = TRIALS;
        for (int j= 0;j < trials;++j) {
            fill_tensor(conv_input, data_type);
            memset_tensor(conv_output, data_type);
            {
                Timer timer(&total_time);
                conv2.run();
            }
        }
        std::cout << (total_time.count() / trials) << endl;
    }
    else {
        std::cout << "NEGEMMConvolutionLayer not supported" << "\n";
    }
    if (NEWinogradConvolutionLayer::validate(conv_input.info(), conv_weight.info(), nullptr, conv_output.info(), PadStrideInfo(1,1,0,0), ActivationLayerInfo(), true)) {
        cout << "NEWinogradConvolutionLayer" << '\n';
        conv3.configure(&conv_input, &conv_weight, nullptr, &conv_output, PadStrideInfo(1, 1, 0, 0));
        conv3.run();

        std::chrono::duration<double> total_time(0);

        double trials = TRIALS;
        for (int j= 0;j < trials;++j) {
            fill_tensor(conv_input, data_type);
            memset_tensor(conv_output, data_type);
            {
                Timer timer(&total_time);
                conv3.run();
            }
        }
        std::cout << (total_time.count() / trials) << endl;
    }
    else {
        std::cout << "NEWinogradConvolutionLayer not supported" << "\n";
    }
    conv_input.allocator()->free();
    conv_output.allocator()->free();
    conv_weight.allocator()->free();
}

output

F32
N 1
Hi 1024
Wi 1024
Ci 3
Hf 1
Wf 1
Ho 1024
Wo 1024
Co 3
NEDirectConvolutionLayer
0.0791217
NEGEMMConvolutionLayer
0.00665224
NEWinogradConvolutionLayer not supported

In this case, it's reasonable, since GEMM is the most performant across the board for 1x1 convolutions

but why does this dimension define not get any support? I've seen this exact conv in mobilenet.

#define DATA_TYPE F32  // F32, F16, QSYMM8, QASYMM8
#define BATCH_N 1
#define HI 1024
#define WI 1024
#define CI 32
#define HF 1
#define WF 1
#define HO HI-HF+1
#define WO WI-WF+1
#define CO 120
poltomo commented 1 month ago

@ramelg01 basically no 1x1 convs are working for big channel_in and channel_out sizes

poltomo commented 1 month ago

@ramelg01 Also, how do I benchmark conv implementation minus any runtime/scheduler activity?

ramelg01 commented 1 month ago

Hi @poltomo, the 1x1 case is not supported in Winograd convolution. An error is thrown when running the Winograd convolution configure call conv3.configure(&conv_input, &conv_weight, nullptr, &conv_output, PadStrideInfo(1, 1, 0, 0));

terminate called after throwing an instance of 'std::runtime_error'
  what():  in validate src/cpu/operators/CpuWinogradConv2d.cpp:347: Unsupported kernel size: 1 x 1.

Winograd breaks the dot-product down into smaller pieces that cannot be reused by adjacent convolution operations, so it only incurs extra processing. A 1x1 Winograd would therefore add overhead and deteriorate performance.