ARM-software / ComputeLibrary

The Compute Library is a set of computer vision and machine learning functions optimised for both Arm CPUs and GPUs using SIMD technologies.
MIT License
2.79k stars 773 forks source link

NEConvolutionLayer Segmentation Fault #1126

Closed poltomo closed 1 month ago

poltomo commented 1 month ago

I am getting a segmentation fault for a simple 3x3 convolution.

#include"arm_compute/core/Types.h"
#include"arm_compute/runtime/NEON/NEFunctions.h"

#include"utils/Utils.h"

#include<chrono>
#include<iostream>

#define HI 64343.324234
#define LO -64343.324234

using namespace std;
using namespace arm_compute;

// RAII stopwatch: starts timing on construction and, when it goes out of
// scope, adds the elapsed wall-clock interval to an external accumulator.
struct Timer {
    std::chrono::time_point<std::chrono::high_resolution_clock> start;
    std::chrono::duration<double>* time;
    // Begin timing immediately; `time` must outlive this Timer.
    Timer(std::chrono::duration<double>* time) : start{std::chrono::high_resolution_clock::now()}, time{time} {}
    // Fold the elapsed interval into the accumulator on destruction.
    ~Timer() {
        *time += std::chrono::high_resolution_clock::now() - start;
    }
};

// Fill a tensor with pseudo-random values appropriate for its data type.
// Floating-point types are filled uniformly in [lo, hi]; the 8-bit
// quantized types get raw random bytes. Unknown data types are left
// untouched (matches the original switch's silent default).
void fill_tensor(Tensor& conv_weight, DataType dt, float lo = LO, float hi = HI) {
    // Hoist the element count; total_size() is loop-invariant.
    const size_t count = conv_weight.info()->tensor_shape().total_size();
    switch ((int)dt) {
    case (int)DataType::F32:
        for (size_t i = 0; i < count; ++i) {
            // BUG FIX: honour the lo/hi parameters — the original ignored
            // them and always used the LO/HI macros.
            ((float*)conv_weight.buffer())[i] = lo + static_cast<float>(rand()) / (static_cast<float>(RAND_MAX) / (hi - lo));
        }
        break;
    case (int)DataType::F16:
        for (size_t i = 0; i < count; ++i) {
            ((__fp16*)conv_weight.buffer())[i] = lo + static_cast<float>(rand()) / (static_cast<float>(RAND_MAX) / (hi - lo));
        }
        break;
    case (int)DataType::QSYMM8: // same byte fill for both 8-bit quantized types
    case (int)DataType::QASYMM8:
        for (size_t i = 0; i < count; ++i) {
            conv_weight.buffer()[i] = rand() % 256;
        }
        break;
    }
}
// Zero a tensor's backing buffer. The byte count is the element count
// times the element size implied by the data type; unknown data types
// are a no-op, as in the original switch.
void memset_tensor(Tensor& conv_weight, DataType dt) {
    const size_t elems = conv_weight.info()->tensor_shape().total_size();
    size_t elem_size = 0;
    switch ((int)dt) {
    case (int)DataType::F32:
        elem_size = sizeof(float);
        break;
    case (int)DataType::F16:
        elem_size = sizeof(__fp16);
        break;
    case (int)DataType::QSYMM8: // both quantized types are one byte per element
    case (int)DataType::QASYMM8:
        elem_size = 1;
        break;
    }
    if (elem_size != 0) {
        memset(conv_weight.buffer(), 0, elems * elem_size);
    }
}

// Benchmark a single 3x3, stride-1, no-padding NHWC F32 convolution with
// NEConvolutionLayer: one warm-up run, then 5 timed runs, printing the
// average wall-clock time per run.
int main()
{
    Tensor conv_input;
    Tensor conv_weight;
    Tensor conv_output;
    // NOTE(review): the original also declared an unused `conv_bias` that
    // was never allocated; the layer is configured with a nullptr bias.

    // Input geometry: batch N, height Hi, width Wi, channels Ci.
    const unsigned int N = 1;
    const unsigned int Hi = 256;
    const unsigned int Wi = 256;
    const unsigned int Ci = 240;

    // Filter geometry.
    const unsigned int Hf = 3;
    const unsigned int Wf = 3;

    // Output geometry for a VALID (no padding), stride-1 convolution.
    // Ho was `int` in the original; made unsigned for consistency with
    // every other dimension (value is identical and positive).
    const unsigned int Ho = Hi - Hf + 1;
    const unsigned int Wo = Wi - Wf + 1;
    const unsigned int Co = 64;

    cout << "N " << N << endl;
    cout << "Hi " << Hi << endl;
    cout << "Wi " << Wi << endl;
    cout << "Ci " << Ci << endl;
    cout << "Hf " << Hf << endl;
    cout << "Wf " << Wf << endl;
    cout << "Ho " << Ho << endl;
    cout << "Wo " << Wo << endl;
    cout << "Co " << Co << endl;

    auto data_type = DataType::F32;

    // NHWC layout: the TensorShape is given innermost-first (C, W, H).
    auto input_info = TensorInfo(TensorShape(Ci, Wi, Hi), 1, data_type, DataLayout::NHWC);
    auto weight_info = TensorInfo(TensorShape(Co, Hf, Wf, Ci), 1, data_type, DataLayout::NHWC);
    auto output_info = TensorInfo(TensorShape(Co, Wo, Ho), 1, data_type, DataLayout::NHWC);

    conv_input.allocator()->init(input_info);
    conv_weight.allocator()->init(weight_info);
    conv_output.allocator()->init(output_info);
    conv_input.allocator()->allocate();
    conv_weight.allocator()->allocate();
    conv_output.allocator()->allocate();

    NEConvolutionLayer conv5{};
    // nullptr bias; stride 1x1, padding 0x0.
    conv5.configure(&conv_input, &conv_weight, nullptr, &conv_output, PadStrideInfo(1, 1, 0, 0));
    conv5.run(); // warm-up run (excluded from timing)

    std::chrono::duration<double> total_time5(0);

    // Trial count is an integer now (was a `double` used as a loop bound).
    const int trials = 5;
    for (int j = 0; j < trials; ++j) {
        fill_tensor(conv_input, data_type);
        memset_tensor(conv_output, data_type);
        {
            Timer timer(&total_time5);

            conv5.run();
        }
    }
    std::cout << (total_time5.count() / trials) << "\n";
}
ramelg01 commented 1 month ago

@poltomo Thanks for reporting; we are looking into it and will come back to you soon.

ramelg01 commented 1 month ago

Hi @poltomo, The exact example above was compiled against Arm Compute Library built for armv8 on Linux and ran without segmentation fault. Here are the steps done:

poltomo commented 1 month ago

@ramelg01 I think the operation simply was not supported; that is probably why it segfaulted. I confirmed this is the case for some op configurations using the validate function (see below).

see below for updated and cleaner benchmark that checks if op is supported

built libarm_compute.so

CC=aarch64-linux-android26-clang CXX=aarch64-linux-android26-clang++ scons build_dir=build_neon_flags/ toolchain_prefix="" Werror=1 -j4 debug=0 asserts=0 neon=1 cppthreads=0 openmp=0 opencl=0 embed_kernels=1 os=android arch=arm64-v8a  extra_cxx_flags="-Ofast -ffast-math -funsafe-math-optimizations"
#include "arm_compute/core/Types.h"
#include "utils/Utils.h"
#include "arm_compute/runtime/NEON/NEFunctions.h"

#include<iostream>
#include "my_benchmark.hpp"

#define HIGH 12312.232
#define LOW -12312.232

// #ifndef DATA_TYPE
#define DATA_TYPE F32  // F32, F16, QSYMM8, QASYMM8
#define BATCH_N 1
#define HI 1024
#define WI 1024
#define CI 3
#define HF 1
#define WF 1
#define HO HI-HF+1
#define WO WI-WF+1
#define CO 3
// #endif // ifndef N

#define TRIALS 2.0

using namespace std;
using namespace arm_compute;

// Fill a tensor with pseudo-random values appropriate for its data type.
// Floating-point types are filled uniformly in [lo, hi]; the 8-bit
// quantized types get raw random bytes. Unknown data types are left
// untouched (matches the original switch's silent default).
void fill_tensor(Tensor& conv_weight, DataType dt, float lo = LOW, float hi = HIGH) {
    // Hoist the element count; total_size() is loop-invariant.
    const size_t count = conv_weight.info()->tensor_shape().total_size();
    switch ((int)dt) {
    case (int)DataType::F32:
        for (size_t i = 0; i < count; ++i) {
            // BUG FIX: honour the lo/hi parameters — the original ignored
            // them and always used the LOW/HIGH macros.
            ((float*)conv_weight.buffer())[i] = lo + static_cast<float>(rand()) / (static_cast<float>(RAND_MAX) / (hi - lo));
        }
        break;
    case (int)DataType::F16:
        for (size_t i = 0; i < count; ++i) {
            ((__fp16*)conv_weight.buffer())[i] = lo + static_cast<float>(rand()) / (static_cast<float>(RAND_MAX) / (hi - lo));
        }
        break;
    case (int)DataType::QSYMM8: // same byte fill for both 8-bit quantized types
    case (int)DataType::QASYMM8:
        for (size_t i = 0; i < count; ++i) {
            conv_weight.buffer()[i] = rand() % 256;
        }
        break;
    }
}
// Zero a tensor's backing buffer. The byte count is the element count
// times the element size implied by the data type; unknown data types
// are a no-op, as in the original switch.
void memset_tensor(Tensor& conv_weight, DataType dt) {
    const size_t elems = conv_weight.info()->tensor_shape().total_size();
    size_t elem_size = 0;
    switch ((int)dt) {
    case (int)DataType::F32:
        elem_size = sizeof(float);
        break;
    case (int)DataType::F16:
        elem_size = sizeof(__fp16);
        break;
    case (int)DataType::QSYMM8: // both quantized types are one byte per element
    case (int)DataType::QASYMM8:
        elem_size = 1;
        break;
    }
    if (elem_size != 0) {
        memset(conv_weight.buffer(), 0, elems * elem_size);
    }
}

int main() {
    // print benchmark info
    switch ((int)DataType::DATA_TYPE) {
        case (int)DataType::F32:
            cout << "F32" << '\n';
            break;
        case (int)DataType::F16:
            cout << "F16" << '\n';
            break;
        case (int)DataType::QSYMM8:
            cout << "QSYMM8" << '\n';
            break;
        case (int)DataType::QASYMM8:
            cout << "QASYMM8" << '\n';
            break;
    }
    cout << "N " << BATCH_N << '\n';
    cout << "Hi " << HI << '\n';
    cout << "Wi " << WI << '\n';
    cout << "Ci " << CI << '\n';
    cout << "Hf " << HF << '\n';
    cout << "Wf " << WF << '\n';
    cout << "Ho " << HO << '\n';
    cout << "Wo " << WO << '\n';
    cout << "Co " << CO << endl;

    // test initialization
    auto data_type = DataType::DATA_TYPE;

    Tensor conv_input;
    Tensor conv_weight;
    Tensor conv_output;

    auto input_info = TensorInfo(TensorShape(CI, WI, HI, BATCH_N), 1, data_type, DataLayout::NHWC);
    auto weight_info = TensorInfo(TensorShape(CO, HF, WF, CI), 1, data_type, DataLayout::NHWC);
    auto output_info = TensorInfo(TensorShape(CO, WO, HO, BATCH_N), 1, data_type, DataLayout::NHWC);

    conv_input.allocator()->init(input_info);
    conv_weight.allocator()->init(weight_info);
    conv_output.allocator()->init(output_info);

    conv_input.allocator()->allocate();
    conv_weight.allocator()->allocate();
    conv_output.allocator()->allocate();

    NEDirectConvolutionLayer conv1{};
    NEGEMMConvolutionLayer conv2{};
    NEWinogradConvolutionLayer conv3{};

    if (NEDirectConvolutionLayer::validate(conv_input.info(), conv_weight.info(), nullptr, conv_output.info(), PadStrideInfo(1, 1, 0, 0))) {
        cout << "NEDirectConvolutionLayer" << '\n';
        conv1.configure(&conv_input, &conv_weight, nullptr, &conv_output, PadStrideInfo(1, 1, 0, 0));
        conv1.run();

        std::chrono::duration<double> total_time(0);

        double trials = TRIALS;
        for (int j= 0;j < trials;++j) {
            fill_tensor(conv_input, data_type);
            memset_tensor(conv_output, data_type);
            {
                Timer timer(&total_time);
                conv1.run();
            }
        }
        std::cout << (total_time.count() / trials) << endl;
    }
    else {
        std::cout << "NEDirectConvolutionLayer not supported" << "\n";
    }
    if (NEGEMMConvolutionLayer::validate(conv_input.info(), conv_weight.info(), nullptr, conv_output.info(), PadStrideInfo(1,1,0,0), WeightsInfo(), Size2D(1U,1U), ActivationLayerInfo(), true, 1U)) {
        cout << "NEGEMMConvolutionLayer" << '\n';
        conv2.configure(&conv_input, &conv_weight, nullptr, &conv_output, PadStrideInfo(1, 1, 0, 0));
        conv2.run();

        std::chrono::duration<double> total_time(0);

        double trials = TRIALS;
        for (int j= 0;j < trials;++j) {
            fill_tensor(conv_input, data_type);
            memset_tensor(conv_output, data_type);
            {
                Timer timer(&total_time);
                conv2.run();
            }
        }
        std::cout << (total_time.count() / trials) << endl;
    }
    else {
        std::cout << "NEGEMMConvolutionLayer not supported" << "\n";
    }
    if (NEWinogradConvolutionLayer::validate(conv_input.info(), conv_weight.info(), nullptr, conv_output.info(), PadStrideInfo(1,1,0,0), ActivationLayerInfo(), true)) {
        cout << "NEWinogradConvolutionLayer" << '\n';
        conv3.configure(&conv_input, &conv_weight, nullptr, &conv_output, PadStrideInfo(1, 1, 0, 0));
        conv3.run();

        std::chrono::duration<double> total_time(0);

        double trials = TRIALS;
        for (int j= 0;j < trials;++j) {
            fill_tensor(conv_input, data_type);
            memset_tensor(conv_output, data_type);
            {
                Timer timer(&total_time);
                conv3.run();
            }
        }
        std::cout << (total_time.count() / trials) << endl;
    }
    else {
        std::cout << "NEWinogradConvolutionLayer not supported" << "\n";
    }
    conv_input.allocator()->free();
    conv_output.allocator()->free();
    conv_weight.allocator()->free();
}

output

F32
N 1
Hi 1024
Wi 1024
Ci 3
Hf 1
Wf 1
Ho 1024
Wo 1024
Co 3
NEDirectConvolutionLayer
0.0791217
NEGEMMConvolutionLayer
0.00665224
NEWinogradConvolutionLayer not supported

In this case, it's reasonable, since GEMM is the most performant across the board for 1x1 convolutions

but why does this dimension define not get any support? I've seen this exact conv in mobilenet.

#define DATA_TYPE F32  // F32, F16, QSYMM8, QASYMM8
#define BATCH_N 1
#define HI 1024
#define WI 1024
#define CI 32
#define HF 1
#define WF 1
#define HO HI-HF+1
#define WO WI-WF+1
#define CO 120
poltomo commented 1 month ago

@ramelg01 basically no 1x1 convs are working for big channel_in and channel_out sizes

poltomo commented 1 month ago

@ramelg01 Also, how do I benchmark conv implementation minus any runtime/scheduler activity?

ramelg01 commented 1 month ago

Hi @poltomo, the 1x1 case is not supported in Winograd convolution. An error is thrown when running the Winograd convolution configure call conv3.configure(&conv_input, &conv_weight, nullptr, &conv_output, PadStrideInfo(1, 1, 0, 0));

terminate called after throwing an instance of 'std::runtime_error'
  what():  in validate src/cpu/operators/CpuWinogradConv2d.cpp:347: Unsupported kernel size: 1 x 1.

Winograd breaks the dot-product down into smaller pieces that cannot be reused by adjacent convolution operations, so it only incurs extra processing. A 1x1 Winograd would therefore add overhead and deteriorate performance.