ARM-software / ComputeLibrary

The Compute Library is a set of computer vision and machine learning functions optimised for both Arm CPUs and GPUs using SIMD technologies.
2.87k stars 782 forks source link

NEGEMMConvolutionLayer is just returning zeros #1123

Closed poltomo closed 4 months ago

poltomo commented 4 months ago

This benchmark just returns zeros. The documentation and the header files give conflicting information about the tensor initializations. There is also no documentation on the NEGEMMConvolutionLayer.

strings ComputeLibrary/build/build_neon/libarm_compute.so| grep arm_compute_version
arm_compute_version=v24.06 Build options: {'build_dir': 'build_neon/', 'toolchain_prefix': '', 'Werror': '1', 'debug': '0', 'asserts': '0', 'neon': '1', 'cppthreads': '0', 'openmp': '0', 'opencl': '0', 'embed_kernels': '1', 'os': 'android', 'arch': 'arm64-v8a'} Git hash=b'93e6401a3bf2da5ed0b19b50625eb3f9edb2b50e'

I am convolving a 1 channel 3x3 input with a 1 output channel 3x3 kernel. There is no bias. The output should be nonzero.

#include "arm_compute/core/Types.h"
// #include "arm_compute/runtime/Allocator.h"
// #include "arm_compute/runtime/BlobLifetimeManager.h"
// #include "arm_compute/runtime/MemoryManagerOnDemand.h"
#include "arm_compute/runtime/NEON/NEFunctions.h"
// #include "arm_compute/runtime/PoolManager.h"

#include "utils/Utils.h"

// #include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"

// #include "src/cpu/kernels/directconv2d/nhwc/neon/impl.h"
#include "src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp"

// #include "src/core/helpers/WindowHelpers.h"

#include <chrono>
#include <cstring>

#include<iostream>

using namespace std;
using namespace arm_compute;

/**
 * Scoped stopwatch that adds its own lifetime to an external accumulator.
 *
 * The clock is sampled on construction; on destruction the elapsed interval
 * is added to the duration pointed to by `time`. A null `time` turns the
 * timer into a no-op instead of dereferencing a null pointer.
 *
 * Copying is disabled: every copy would add the same interval to the
 * accumulator a second time when it is destroyed.
 */
struct Timer {
    std::chrono::time_point<std::chrono::high_resolution_clock> start;
    std::chrono::duration<double>* time;

    /** @param time Accumulator that receives the elapsed time; may be null. */
    explicit Timer(std::chrono::duration<double>* time)
        : start{std::chrono::high_resolution_clock::now()}, time{time} {}

    Timer(const Timer&) = delete;
    Timer& operator=(const Timer&) = delete;

    ~Timer() {
        const auto end = std::chrono::high_resolution_clock::now();
        if (time != nullptr) {
            *time += (end - start);
        }
    }
};

int main()
{
    Tensor conv_input;
    Tensor conv_weight;
    Tensor conv_bias;
    Tensor conv_output;

    const unsigned int N = 1;
    const unsigned int Hi = 3;
    const unsigned int Wi = 3;
    const unsigned int Ci = 1;

    const unsigned int Hf = 3;
    const unsigned int Wf = 3;

    const unsigned int Ho = Hi - Hf + 1;
    const unsigned int Wo = Wi - Wf + 1;
    const unsigned int Co = 1;

    cout << "N " << N << endl;
    cout << "Hi " << Hi << endl;
    cout << "Wi " << Wi << endl;
    cout << "Ci " << Ci << endl;
    cout << "Hf " << Hf << endl;
    cout << "Wf " << Wf << endl;
    cout << "Ho " << Ho << endl;
    cout << "Wo " << Wo << endl;
    cout << "Co " << Co << endl;

    // auto input_info = TensorInfo(TensorShape(Hi, Wi, Ci), 1, DataType::F32, DataLayout::NHWC);
    // auto weight_info = TensorInfo(TensorShape(Hf, Wf, Ci, Co), 1, DataType::F32, DataLayout::NHWC);
    // auto output_info = TensorInfo(TensorShape(Ho, Wo, Co), 1, DataType::F32, DataLayout::NHWC);

    // auto input_info = TensorInfo(TensorShape(Ci, Wi, Hi), 1, DataType::F32, DataLayout::NHWC);
    // auto weight_info = TensorInfo(TensorShape(Co, Hf, Wf, Ci), 1, DataType::F32);
    // auto output_info = TensorInfo(TensorShape(Co, Wo, Ho), 1, DataType::F32);

    auto input_info = TensorInfo(TensorShape(Hi, Wi, Ci), 1, DataType::F32, DataLayout::NHWC);
    auto weight_info = TensorInfo(TensorShape(Hf, Wf, Ci, Co), 1, DataType::F32, DataLayout::NHWC);
    auto output_info = TensorInfo(TensorShape(Ho, Wo, Co), 1, DataType::F32, DataLayout::NHWC);

    conv_input.allocator()->init(input_info);
    conv_weight.allocator()->init(weight_info);
    conv_output.allocator()->init(output_info);
    conv_input.allocator()->allocate();
    conv_weight.allocator()->allocate();
    conv_output.allocator()->allocate();

    // arm_compute::NEDirectConvolutionLayer conv;
    // conv.configure(&conv_input, &conv_weight, nullptr, &conv_output, PadStrideInfo(1, 1, 0, 0));

    arm_compute::NEGEMMConvolutionLayer conv;
    conv.configure(&conv_input, &conv_weight, nullptr, &conv_output, PadStrideInfo(1, 1, 0, 0));
    // conv.configure(&conv_input, &conv_weight, nullptr, &conv_output, PadStrideInfo(1, 1, 0, 0), WeightsInfo(false, Wf,Hf,Co,false), Size2D(1U,1U), ActivationLayerInfo(), true, 1U);

    conv.run();

    // Window win = calculate_max_window(*conv_output.info(), Steps());
    // arm_compute::cpu::kernels::neon_fp32_nhwc_directconv2d(win, &conv_input, &conv_weight, &conv_output, PadStrideInfo(1,1,0,0));

    memset(conv_output.buffer(), 0, conv_output.info()->tensor_shape().total_size() * sizeof(float));

    double n = 3;
    std::chrono::duration<double> total_time(0);
    for (int i = 0;i < n;++i) {
        for (int i = 0;i < conv_input.info()->tensor_shape().total_size();++i) {
            // ((int*)conv_input.buffer())[i] = rand();
            ((float*)conv_input.buffer())[i] = i + 1;
        }
        for (int i = 0;i < conv_weight.info()->tensor_shape().total_size();++i) {
            // ((int*)conv_weight.buffer())[i] = rand();
            ((float*)conv_weight.buffer())[i] = i + 1;
        }
        memset(conv_output.buffer(), 0, conv_output.info()->tensor_shape().total_size() * sizeof(float));
        {
            Timer timer(&total_time);
            // arm_compute::cpu::kernels::neon_fp32_nhwc_directconv2d(win, &conv_input, &conv_weight, &conv_output, PadStrideInfo(1,1,0,0));
            conv.run();
        }
    }
    std::cout << (total_time.count() / n) << "\n";

    for (int i = 0;i < conv_output.info()->tensor_shape().total_size() && i < 10;++i) {
        cout << ((float*)conv_output.buffer())[i] << ' ';
    } cout << endl;
}
poltomo commented 4 months ago

@morgolock I think you helped someone whose convolution layer was returning all zeros. Is this related to that issue?

poltomo commented 4 months ago

Fixed for NHWC. I should have trusted the documentation, not the headers:

// Corrected tensor descriptors: the dimensions are now listed channel-first,
// i.e. (C, W, H), instead of the (H, W, C) order used in the original post.
auto input_info = TensorInfo(TensorShape(Ci, Wi, Hi), 1, DataType::F32, DataLayout::NHWC);
// NOTE(review): ACL appears to document NHWC weights as (IFM, kernel_x, kernel_y, OFM),
// i.e. (Ci, Wf, Hf, Co). The Co-first order below yields the same shape only because
// Ci == Co == 1 and Hf == Wf in this benchmark - confirm before reusing with other sizes.
auto weight_info = TensorInfo(TensorShape(Co, Hf, Wf, Ci), 1, DataType::F32, DataLayout::NHWC);
auto output_info = TensorInfo(TensorShape(Co, Wo, Ho), 1, DataType::F32, DataLayout::NHWC);