Dobiasd / frugally-deep

A lightweight header-only library for using Keras (TensorFlow) models in C++.
MIT License

Improve runtime on VGG-like models #55

Closed Dobiasd closed 4 years ago

Dobiasd commented 6 years ago


Compiled using GCC with -O3 -mavx and running on a single CPU core, frugally-deep is faster than Keras+TensorFlow on most typical model architectures. However, on VGG16 and VGG19 we are about 20% behind. I think it would be awesome if we were able to change this! :D

Profiling forward passes on VGG16 in an endless loop shows that the matrix multiplication in our im2col convolution takes up about 80% of the CPU time.

[profiler screenshot]

We currently use GEMM (general matrix multiplication) of the Eigen library for this.
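
For context, a rough sketch of how im2col turns one convolution layer into the single GEMM seen in the profile (shapes only; the code below is purely illustrative and not the actual frugally-deep implementation): the filters are flattened into an (out_channels) x (kernel_h * kernel_w * in_channels) matrix, the im2col patches into a (kernel_h * kernel_w * in_channels) x (out_h * out_w) matrix, and their product is the (out_channels) x (out_h * out_w) output.

// Shape bookkeeping of im2col + GEMM for one conv layer (illustrative sketch only).
#include <eigen3/Eigen/Dense>
#include <iostream>

using RowMajorMat = Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;

int main()
{
    const int in_channels = 512, out_channels = 512;
    const int kernel_h = 3, kernel_w = 3;
    const int out_h = 28, out_w = 28;

    // Filters, flattened: one row per output channel.
    const RowMajorMat filters = RowMajorMat::Random(out_channels, kernel_h * kernel_w * in_channels);

    // im2col result: one column per output position, holding its receptive field.
    const RowMajorMat patches = RowMajorMat::Random(kernel_h * kernel_w * in_channels, out_h * out_w);

    // The whole convolution collapses into this one GEMM (plus bias and reshaping).
    const RowMajorMat output = filters * patches;
    std::cout << output.rows() << " x " << output.cols() << std::endl; // 512 x 784
}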

So, to isolate the problem, I wrote the following benchmark. To decide on matrix sizes for this test, I measured the runtime of the single layers of VGG16 and used the dimensions typically occurring in the most costly ones.

It compares the performance of different ways to do GEMM with Eigen, i.e. aligned/unaligned memory and row-major/column-major memory order. However, none of these make a noticeable difference.

#include <chrono>
#include <cstdlib> // std::malloc, std::free, aligned_alloc
#include <iostream>
#include <string>

#include <eigen3/Eigen/Dense>

using RowMajorMat = Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;

#define mat_size_1 512
#define mat_size_2 4609
#define mat_size_3 784

float multiply_unaligned()
{
    RowMajorMat a(mat_size_1, mat_size_2);
    RowMajorMat b(mat_size_2, mat_size_3);
    const std::size_t num_bytes = a.rows() * b.cols() * sizeof(float);
    float* ptr = (float*)std::malloc(num_bytes);
    Eigen::Map<RowMajorMat, Eigen::Unaligned> c(
        ptr,
        static_cast<Eigen::Index>(a.rows()),
        static_cast<Eigen::Index>(b.cols()));
    c.noalias() = a * b;
    float check_value = ptr[0];
    std::free(ptr);
    return check_value;
}

float multiply_col_major()
{
    using ColMajorMat = Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor>;
    ColMajorMat a(mat_size_1, mat_size_2);
    ColMajorMat b(mat_size_2, mat_size_3);
    const std::size_t num_bytes = a.rows() * b.cols() * sizeof(float);
    float* ptr = (float*)std::malloc(num_bytes);
    Eigen::Map<ColMajorMat, Eigen::Unaligned> c(
        ptr,
        static_cast<Eigen::Index>(a.rows()),
        static_cast<Eigen::Index>(b.cols()));
    c.noalias() = a * b;
    float check_value = ptr[0];
    std::free(ptr);
    return check_value;
}

float multiply_aligned()
{
    RowMajorMat a(mat_size_1, mat_size_2);
    RowMajorMat b(mat_size_2, mat_size_3);
    const std::size_t num_bytes = a.rows() * b.cols() * sizeof(float);
    float* ptr = (float*)aligned_alloc(128, num_bytes);
    Eigen::Map<RowMajorMat, Eigen::Aligned> c(
        ptr,
        static_cast<Eigen::Index>(a.rows()),
        static_cast<Eigen::Index>(b.cols()));
    c.noalias() = a * b;
    float check_value = ptr[0];
    std::free(ptr);
    return check_value;
}

/*
// Needs
// #define EIGEN_STACK_ALLOCATION_LIMIT 0
// before
// #include <eigen3/Eigen/Dense>
// However then produces
// /usr/local/include/eigen3/Eigen/src/Core/AssignEvaluator.h:207:88: fatal error: template instantiation depth exceeds maximum of 900 (use -ftemplate-depth= to increase the maximum)
// Our matrices seem to simply be too large for this.
float multiply_fixed()
{
    Eigen::Matrix<float, mat_size_1, mat_size_2, Eigen::RowMajor> a;
    Eigen::Matrix<float, mat_size_2, mat_size_3, Eigen::RowMajor> b;
    Eigen::Matrix<float, mat_size_1, mat_size_3, Eigen::RowMajor> res;
    res = a * b;
    float check_value = res(0,0);
    return check_value;
}
*/

template <typename Func>
void measure(const std::string& name, const Func f)
{
    using namespace std::chrono;
    float checksum = 0.0f; // to prevent compiler from optimizing everything away
    const auto start_time_ns = high_resolution_clock::now().time_since_epoch().count();
    const std::size_t runs = 10;
    for (size_t i = 0; i < runs; ++i)
    {
        checksum += f();
    }
    const auto end_time_ns = high_resolution_clock::now().time_since_epoch().count();
    const auto elapsed_ms = (end_time_ns - start_time_ns) / (runs * 1000000);
    std::cout << name << " (checksum: " << checksum << ") elapsed_ms: " << elapsed_ms << std::endl;
}

int main()
{
    for (std::size_t i = 0; i < 5; ++i)
    {
        measure("unaligned", multiply_unaligned);
        measure("aligned  ", multiply_aligned);
        measure("col major", multiply_col_major);
        //measure("fixed    ", multiply_fixed);
        std::cout << "--------" << std::endl;
    }
}
g++ --std=c++14 -O3 -mavx gemm_benchmark.cpp -o gemm_benchmark && taskset --cpu-list 1 ./gemm_benchmark

Output on my machine:

unaligned (checksum: 0) elapsed_ms: 66
aligned   (checksum: 0) elapsed_ms: 65
col major (checksum: 0) elapsed_ms: 66
--------
unaligned (checksum: 0) elapsed_ms: 64
aligned   (checksum: 0) elapsed_ms: 65
col major (checksum: 0) elapsed_ms: 66
--------
unaligned (checksum: 0) elapsed_ms: 67
aligned   (checksum: 0) elapsed_ms: 66
col major (checksum: 0) elapsed_ms: 66
--------
unaligned (checksum: 0) elapsed_ms: 64
aligned   (checksum: 0) elapsed_ms: 65
col major (checksum: 0) elapsed_ms: 65
--------
unaligned (checksum: 0) elapsed_ms: 65
aligned   (checksum: 0) elapsed_ms: 64
col major (checksum: 0) elapsed_ms: 65

Any ideas on how to improve this are very welcome. :)

Dobiasd commented 6 years ago

Just tried out OpenBLAS as an alternative to Eigen, but it is not faster.

#include <chrono>
#include <cstdlib> // std::malloc, std::free
#include <iostream>
#include <string>

#include <eigen3/Eigen/Dense>

extern "C" {
    void sgemm_(char*, char*, int*, int*, int*, float*, float*, int*, float*, int*, float*, float*, int*);
}

using RowMajorMat = Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;

#define mat_size_1 512
#define mat_size_2 4609
#define mat_size_3 784

float multiply_unaligned()
{
    RowMajorMat a(mat_size_1, mat_size_2);
    RowMajorMat b(mat_size_2, mat_size_3);
    const std::size_t num_bytes = a.rows() * b.cols() * sizeof(float);
    float* ptr = (float*)std::malloc(num_bytes);
    Eigen::Map<RowMajorMat, Eigen::Unaligned> c(
        ptr,
        static_cast<Eigen::Index>(a.rows()),
        static_cast<Eigen::Index>(b.cols()));
    c.noalias() = a * b;
    float check_value = ptr[0];
    std::free(ptr);
    return check_value;
}

float multiply_openblas()
{
    int m = mat_size_1;
    int n = mat_size_3;
    int k = mat_size_2;
    int sizeofa = m * k;
    int sizeofb = k * n;
    int sizeofc = m * n;
    char ta = 'N';
    char tb = 'N';
    float alpha = 1;
    float beta = 1;

    float* A = (float*)malloc(sizeof(float) * sizeofa);
    float* B = (float*)malloc(sizeof(float) * sizeofb);
    float* C = (float*)malloc(sizeof(float) * sizeofc);

    sgemm_(&ta, &tb, &m, &n, &k, &alpha, A, &m, B, &k, &beta, C, &m);

    float check = C[0];

    free(A);
    free(B);
    free(C);

    return check;
}

template <typename Func>
void measure(const std::string& name, const Func f)
{
    using namespace std::chrono;
    float checksum = 0.0f; // to prevent compiler from optimizing everything away
    const auto start_time_ns = high_resolution_clock::now().time_since_epoch().count();
    const std::size_t runs = 10;
    for (size_t i = 0; i < runs; ++i)
    {
        checksum += f();
    }
    const auto end_time_ns = high_resolution_clock::now().time_since_epoch().count();
    const auto elapsed_ms = (end_time_ns - start_time_ns) / (runs * 1000000);
    std::cout << name << " (checksum: " << checksum << ") elapsed_ms: " << elapsed_ms << std::endl;
}

int main()
{
    for (std::size_t i = 0; i < 5; ++i)
    {
        measure("unaligned", multiply_unaligned);
        measure("openblas ", multiply_openblas);
        std::cout << "--------" << std::endl;
    }
}
g++ --std=c++14 -O3 -mfma gemm_benchmark.cpp -lopenblas -lgfortran -o gemm_benchmark && taskset --cpu-list 1 ./gemm_benchmark
unaligned (checksum: 0) elapsed_ms: 41
openblas  (checksum: 0) elapsed_ms: 43
--------
unaligned (checksum: 0) elapsed_ms: 39
openblas  (checksum: 0) elapsed_ms: 43
--------
unaligned (checksum: 0) elapsed_ms: 39
openblas  (checksum: 0) elapsed_ms: 41
--------
unaligned (checksum: 0) elapsed_ms: 39
openblas  (checksum: 0) elapsed_ms: 42
--------
unaligned (checksum: 0) elapsed_ms: 38
openblas  (checksum: 0) elapsed_ms: 42

I'm using -mfma instead of -mavx here to make things fair, because the prebuilt OpenBLAS library uses FMA anyway.
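
For completeness, the same multiplication could also go through OpenBLAS's C interface (cblas_sgemm from cblas.h) instead of the Fortran symbol; a minimal sketch (same shapes, and, as in the benchmark above, the matrices are left uninitialized since only the runtime matters):

// Same GEMM via the CBLAS interface; link with -lopenblas.
#include <cblas.h>
#include <cstdlib>

int main()
{
    const int m = 512;  // rows of A and C
    const int n = 784;  // cols of B and C
    const int k = 4609; // cols of A, rows of B
    float* A = (float*)std::malloc(sizeof(float) * m * k);
    float* B = (float*)std::malloc(sizeof(float) * k * n);
    float* C = (float*)std::malloc(sizeof(float) * m * n);

    // Row-major C = 1 * A * B + 0 * C
    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                m, n, k, 1.0f, A, k, B, n, 0.0f, C, n);

    std::free(A);
    std::free(B);
    std::free(C);
}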

Dobiasd commented 6 years ago

@patrikhuber

In our "Slow-ish run time on MSVC" issue, you mentioned the following:

In the link about im2col that you posted, he afterwards introduces a faster convolution (mainly for 3x3 kernels) from this paper https://www.cv-foundation.org/openaccess/content_cvpr_2016/papers/Lavin_Fast_Algorithms_for_CVPR_2016_paper.pdf, based on Winograd. They get like a 2x speed-up on VGG for a batch size of 1, on the GPU. It's overall quite cool, and I've seen this used in some deep learning repositories across GitHub.

Is my understanding correct that convolutions implemented that way produce similar but not fully equivalent results to "normal" (precise) convolutions, i.e. nested loops or im2col+GEMM?

If that really is the case, I think these are not well-suited for frugally-deep, since we would need to drop the major guarantee we provide, i.e. producing the same output as Keras. Alternatively, the reference forward pass in Python for these tests would also have to be made with the same Winograd-derived convolutions, or even the whole training. I'm not sure if accuracy suffers if training is done with a normal convolution and prediction with a Winograd-based one. What do you think?
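
(For reference, the 1-D minimal-filtering case F(2,3) illustrates the point: two outputs of a 3-tap filter are computed with four multiplications instead of six, but the floating-point operations are reordered and rescaled compared to a direct evaluation, so the last bits of the results can differ. A small sketch, not taken from the paper or from any of the libraries discussed here:)

// Winograd minimal filtering F(2,3): two outputs of a 3-tap correlation
// using four multiplications (illustrative sketch only).
#include <array>
#include <cstdio>

std::array<float, 2> f23_winograd(const std::array<float, 4>& d, const std::array<float, 3>& g)
{
    const float m1 = (d[0] - d[2]) * g[0];
    const float m2 = (d[1] + d[2]) * (g[0] + g[1] + g[2]) * 0.5f;
    const float m3 = (d[2] - d[1]) * (g[0] - g[1] + g[2]) * 0.5f;
    const float m4 = (d[1] - d[3]) * g[2];
    return {m1 + m2 + m3, m2 - m3 - m4};
}

std::array<float, 2> f23_direct(const std::array<float, 4>& d, const std::array<float, 3>& g)
{
    return {d[0] * g[0] + d[1] * g[1] + d[2] * g[2],
            d[1] * g[0] + d[2] * g[1] + d[3] * g[2]};
}

int main()
{
    const std::array<float, 4> d = {0.1f, 0.2f, 0.3f, 0.4f};
    const std::array<float, 3> g = {0.5f, 0.6f, 0.7f};
    const auto w = f23_winograd(d, g);
    const auto r = f23_direct(d, g);
    // Mathematically identical, but the float results may differ in the last bits.
    std::printf("winograd: %.9g %.9g\ndirect: %.9g %.9g\n", w[0], w[1], r[0], r[1]);
}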

patrikhuber commented 6 years ago

Hi @Dobiasd,

Is my understanding correct that convolutions implemented that way produce similar but not fully equivalent results to "normal" (precise) convolutions

I'm not sure to be honest; from quickly glancing over the paper again, I didn't find anything. But my guess would be that the result can be slightly different, since the computation is done in quite a different way (even just reordering floating-point computations can give different results, I think, and whether that is allowed depends on compiler flags).

Wouldn't even the computation on the GPU potentially give different results than computing on the CPU? Or if you use stuff like cuDNN? I'm not too familiar with the floating point model and IEEE floating point standard though and whether this is something that's relied on in deep learning or not.

we would need to drop the major guarantee we provide, i.e. producing the same output as Keras.

I agree this is a very strong plus. Maybe that screams for an option/choice? :-) I think there are many scenarios where speed is much more important than having the same result up to the 15th digit, and I think these kernels are actually quite widely used in industry (I can't give you a specific example, but I've seen Winograd and Nervana mentioned in so many places, on GitHub etc.). As long as the model still has a similar accuracy, you don't really care if, for example, a segmentation result is 1-2 pixels different or a face box is 1 or 2 pixels larger. So I would also not think that it makes a difference if a network uses a different convolution algorithm at train and test time. But I must also say that this is more guessing, based on limited practical experience and what I heard/read "around the net".

Do we know an expert? :-D

Dobiasd commented 6 years ago

I'm also not sure whether floating-point operations are so strictly defined that CPU vs. GPU does not matter for the results. But at least the following small test code

import tensorflow as tf
a = tf.constant(1.2345678)
b = tf.constant(9.87654321)
with tf.Session() as sess:
    print("add: {}".format(sess.run(a+b)))
    print("multiply: {}".format(sess.run(a*b)))

outputs exactly the same on my PC:

add: 11.11111068725586
multiply: 12.193262100219727

with GPU and CPU (i.e. CUDA_VISIBLE_DEVICES='').


You are right, providing Winograd as an option might be a good solution. Now we just need a good CPU implementation. :D


Do we know an expert? :-D

You are the expert I know. :stuck_out_tongue:

patrikhuber commented 6 years ago

You are the expert I know. 😛

I wish! I read quite a lot of various stuff, but as mentioned, don't have too much "real" practical deep learning experience yet :-O ... :-)

patrikhuber commented 6 years ago

Found this posted on /r/cpp today: https://github.com/GValiente/pocket-tensor. One interesting thing about it (I thought at first, at least) is that it uses libsimdpp, a "header-only zero-overhead C++ low level SIMD library". I am thinking, however, that Eigen does all these vectorisations too (see "Vectorization" on this page), so it probably wouldn't be too helpful to incorporate libsimdpp. Eigen can even generate NEON instructions. What's maybe more interesting is that there are some benchmarks comparing it to fdeep - well, only one benchmark actually, where the run time is very similar to fdeep. But chances are they only measured one forward pass, so the numbers are not too meaningful (and I can't find the benchmark code in the repo). So a more thorough benchmark comparison would maybe be interesting.

It also links to kerasify, another C++ library for running Keras models (the first library is a fork of this).

But it looks to me like fdeep wins on all fronts - kerasify is dead and only time will tell if pocket-tensor will gain some traction.

Dobiasd commented 6 years ago

Good find! The genesis and existence of other libraries with the same purpose only confirm the need for something like it. And some competition is good for the users. It's nice that pocket-tensor supports LSTM, which is still a to-do for me. However, at first glance, it looks like pocket-tensor only supports sequential models and no functional ones. :wink:

Yes, it seems like Eigen is already doing a good job of generating the vector instructions. And 29298 μs vs. 27329 μs is not that big of a difference. The real puzzle for me is still where TensorFlow saves those 20% on VGG models. :rage:

GValiente commented 6 years ago

Hi guys, pocket-tensor guy here :)

First of all, I think frugally-deep is the better library by far, with many more supported Keras features and great support from the author. It's just a shame that LSTM layers are still not supported.

I developed pocket-tensor because I needed to run predictions on embedded devices. For work reasons, I can't cross-compile code, so I have to build it on the embedded device. I can't use frugally-deep because the compiler runs out of memory building it, and Kerasify's performance can be improved.

About the performance thing: the measurements in the documentation are the lowest value of 9 forward passes. Next week I'll upload a benchmark app and post PC measurements too.

headupinclouds commented 6 years ago

You are right, providing Winograd as an option might be a good solution. Now we just need a good CPU implementation. :D

Hi! I've been following the conversation off and on. FWIW, NCNN includes a 3x3 Winograd implementation (assembly) here. Most of the optimization in that lib seems to target mobile/ARM, and I suspect the Eigen backend in frugally-deep will compare favorably on x86 platforms.

Dobiasd commented 6 years ago

@GValiente

Thanks for the kind words. :) And good work on improving the performance of Kerasify that much! The story behind frugally-deep is quite similar. The development was also deployment-driven, i.e. I wanted to deploy a Keras model in a 32-bit C++ application, and making TensorFlow 32-bit compatible seemed almost impossible to me.

Do you know where the compiler runs out of memory? I assume it's because of too much template magic, so probably related to FunctionalPlus or Eigen?

I guess I really have to start looking into this LSTM thing. :wink:

Dobiasd commented 6 years ago

@headupinclouds

Cool, thanks for the link. I'll see if I can run some performance comparisons. To me it is almost a miracle how people can write such stuff in asm. I only used it to write minimal procedural graphics about 20 years ago, and I still remember how hard it was to keep all that state in one's head. :dizzy:

Dobiasd commented 6 years ago

Test results of Tencent/ncnn:

The matrix sizes used in the original tests

#define mat_size_1 512
#define mat_size_2 4609
#define mat_size_3 784

correspond to the VGG16 layers that convolve a (512, 28, 28) tensor with 512 kernels of shape (3, 3, 512) to a new tensor of size (512, 28, 28).
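
(Checking the numbers: 3 * 3 * 512 = 4608 input values per output position, which together with one extra entry, presumably for the bias, gives the 4609 above; and there are 28 * 28 = 784 output positions.)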

So I added the corresponding calls to ncnn::conv3x3s1_sse and ncnn::conv3x3s1_winograd64_neon5 to the benchmark. I had some problems building the ncnn library, which is why I resorted to the (software-engineering-wise very ugly) hack of defining the constructor for ncnn::Option myself and manually linking mat.cpp into the executable. :wink:

The tests include the SSE x86 version and the ARM assembly winograd64-neon5 version.

#include <chrono>
#include <cstdlib>
#include <iostream>
#include <string>

#include <eigen3/Eigen/Dense>

#include "net.h"
namespace ncnn {
    #include "layer/x86/convolution_3x3.h"
    #include "layer/arm/convolution_3x3.h"
    Option::Option()
    {
        lightmode = true;
        num_threads = 1;
        blob_allocator = 0;
        workspace_allocator = 0;
    }
}

using RowMajorMat = Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;

#define mat_size_1 512
#define mat_size_2 4609
#define mat_size_3 784

float multiply_unaligned()
{
    RowMajorMat a(mat_size_1, mat_size_2);
    RowMajorMat b(mat_size_2, mat_size_3);
    const std::size_t num_bytes = a.rows() * b.cols() * sizeof(float);
    float* ptr = (float*)std::malloc(num_bytes);
    Eigen::Map<RowMajorMat, Eigen::Unaligned> c(
        ptr,
        static_cast<Eigen::Index>(a.rows()),
        static_cast<Eigen::Index>(b.cols()));
    c.noalias() = a * b;
    float check_value = ptr[0];
    std::free(ptr);
    return check_value;
}

float conv_ncnn_x86()
{
    ncnn::Mat bottom_blob(28, 28, 512);
    ncnn::Mat top_blob(28, 28, 512);
    ncnn::Mat kernel(3, 3, 512);
    ncnn::Mat bias(512);
    ncnn::Option opt;
    ncnn::conv3x3s1_sse(
        bottom_blob, top_blob, kernel, bias, opt);
    return top_blob.row(0)[0];
}

float conv_ncnn_arm_wino_5()
{
    ncnn::Mat bottom_blob(28, 28, 512);
    ncnn::Mat top_blob(28, 28, 512);
    ncnn::Mat kernel(3, 3, 512);
    ncnn::Mat bias(512);
    ncnn::Option opt;
    ncnn::conv3x3s1_winograd64_neon5(
        bottom_blob, top_blob, kernel, bias, opt);
    return top_blob.row(0)[0];
}

template <typename Func>
void measure(const std::string& name, const Func f)
{
    using namespace std::chrono;
    float checksum = 0.0f; // to prevent compiler from optimizing everything away
    const auto start_time_ns = high_resolution_clock::now().time_since_epoch().count();
    const std::size_t runs = 10;
    for (size_t i = 0; i < runs; ++i)
    {
        checksum += f();
    }
    const auto end_time_ns = high_resolution_clock::now().time_since_epoch().count();
    const auto elapsed_ms = (end_time_ns - start_time_ns) / (runs * 1000000);
    std::cout << name << " (checksum: " << checksum << ") elapsed_ms: " << elapsed_ms << std::endl;
}

int main()
{
    for (std::size_t i = 0; i < 5; ++i)
    {
        measure("unaligned                        ", multiply_unaligned);
        measure("ncnn x86                         ", conv_ncnn_x86);
        measure("ncnn::conv3x3s1_winograd64_neon_5", conv_ncnn_arm_wino_5);
        std::cout << "--------" << std::endl;
    }
}
$ g++ --std=c++14 -O3 -mfma mat.cpp ncnn_test.cpp -o gemm_benchmark && taskset --cpu-list 1 ./gemm_benchmark
unaligned                         (checksum: 0) elapsed_ms: 39
ncnn x86                          (checksum: 0) elapsed_ms: 405
ncnn::conv3x3s1_winograd64_neon_5 (checksum: 0) elapsed_ms: 111
--------
unaligned                         (checksum: 0) elapsed_ms: 35
ncnn x86                          (checksum: 0) elapsed_ms: 402
ncnn::conv3x3s1_winograd64_neon_5 (checksum: 0) elapsed_ms: 109
--------
unaligned                         (checksum: 0) elapsed_ms: 35
ncnn x86                          (checksum: 0) elapsed_ms: 398
ncnn::conv3x3s1_winograd64_neon_5 (checksum: 0) elapsed_ms: 110
--------
unaligned                         (checksum: 0) elapsed_ms: 35
ncnn x86                          (checksum: 0) elapsed_ms: 399
ncnn::conv3x3s1_winograd64_neon_5 (checksum: 0) elapsed_ms: 111
--------
unaligned                         (checksum: 0) elapsed_ms: 35
ncnn x86                          (checksum: 0) elapsed_ms: 396
ncnn::conv3x3s1_winograd64_neon_5 (checksum: 0) elapsed_ms: 110
$ g++ --std=c++14 -O3 -mavx mat.cpp ncnn_test.cpp -o gemm_benchmark && taskset --cpu-list 1 ./gemm_benchmark
unaligned                         (checksum: 0) elapsed_ms: 67
ncnn x86                          (checksum: 0) elapsed_ms: 585
ncnn::conv3x3s1_winograd64_neon_5 (checksum: 0) elapsed_ms: 128
--------
unaligned                         (checksum: 0) elapsed_ms: 66
ncnn x86                          (checksum: 0) elapsed_ms: 585
ncnn::conv3x3s1_winograd64_neon_5 (checksum: 0) elapsed_ms: 127
--------
unaligned                         (checksum: 0) elapsed_ms: 66
ncnn x86                          (checksum: 0) elapsed_ms: 584
ncnn::conv3x3s1_winograd64_neon_5 (checksum: 0) elapsed_ms: 128
--------
unaligned                         (checksum: 0) elapsed_ms: 66
ncnn x86                          (checksum: 0) elapsed_ms: 581
ncnn::conv3x3s1_winograd64_neon_5 (checksum: 0) elapsed_ms: 130
--------
unaligned                         (checksum: 0) elapsed_ms: 65
ncnn x86                          (checksum: 0) elapsed_ms: 574
ncnn::conv3x3s1_winograd64_neon_5 (checksum: 0) elapsed_ms: 126

This looks suspiciously slow. Perhaps I did something wrong in my use of ncnn (but I can't figure out what it could be right now). However, if the results are valid, I guess this library is out. :snail:

GValiente commented 6 years ago

@Dobiasd I don't know why GCC crashes, but I suppose that it runs out of memory. I have to build it on a PocketBeagle, which only has 512MB of RAM :p

I have uploaded a benchmark application and some PC performance measurements.

Dobiasd commented 6 years ago

@GValiente OK, I guess there would be several potential paths to search for a solution that uses frugally-deep:

But since you already have a nice and working solution with pocket-tensor, I guess exploring these things is not really necessary for you. :slightly_smiling_face:

headupinclouds commented 6 years ago

So I added the corresponding calls to ncnn::conv3x3s1_sse and ncnn::conv3x3s1_winograd64_neon5 to the benchmark

Wow. Thanks for the benchmark. The ncnn::conv3x3s1_winograd64_neon5 is implemented for ARM/NEON and should probably be tested on an Android or iOS device with a matching toolchain that enables the relevant preprocessor definitions:

#if __ARM_NEON
#if __aarch64__

etc.

It looks like the code will run on an x86 platform, but everything will end up defaulting to the generic C code intended for "remainder" processing, so the optimization won't get tested with that build.

https://github.com/Tencent/ncnn/blob/af49e2cada56cdae5879686e35033e14dc42ea25/src/layer/arm/convolution_3x3.h#L318-L361
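
(To illustrate the point: on an x86 build, __ARM_NEON is not defined, so the guarded NEON/Winograd code never gets compiled in and only the plain C fallback runs. A trivial check, just for illustration:)

// Quick check of which path a given toolchain would compile (illustration only).
#include <cstdio>

int main()
{
#if __ARM_NEON
    std::puts("ARM NEON path available");
#else
    std::puts("generic C fallback only"); // what an x86 build of the benchmark gets
#endif
}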

The lib seems to be focused on mobile platforms with basic support for x86. I'll try to adapt the sample you put together for mobile device testing later this week.

GValiente commented 6 years ago

@Dobiasd The proper fix is to cross-compile it on a proper PC with a big chunk of RAM :D

Anyway, thanks for the suggestions.

Dobiasd commented 5 years ago

Since, with the latest measurements (new TensorFlow version), TensorFlow has caught up performance-wise for all model types, VGG-like models are no longer a special case. Thus I'll close this issue. :wink:

Dobiasd commented 4 years ago

With version 2.0, TensorFlow seems to have improved its performance even more. :open_mouth:

| Model | Keras + TF 1.13.1 | Keras + TF 2.0 | frugally-deep |
| --- | --- | --- | --- |
| DenseNet121 | 0.33 s | 0.19 s | 0.23 s |
| DenseNet169 | 0.39 s | 0.23 s | 0.33 s |
| DenseNet201 | 0.48 s | 0.28 s | 0.41 s |
| InceptionV3 | 0.35 s | 0.18 s | 0.38 s |
| MobileNet | 0.11 s | 0.07 s | 0.14 s |
| MobileNetV2 | 0.13 s | 0.08 s | 0.15 s |
| NASNetLarge | 2.03 s | 1.34 s | 4.50 s |
| NASNetMobile | 0.18 s | 0.16 s | 0.35 s |
| ResNet50 | 0.32 s | 0.18 s | 0.26 s |
| VGG16 | 0.64 s | 0.39 s | 0.70 s |
| VGG19 | 0.78 s | 0.48 s | 0.93 s |
| Xception | 0.65 s | 0.27 s | 1.21 s |

Tests are still run with CUDA_VISIBLE_DEVICES='' taskset --cpu-list 1 ... to make them fair.

And TensorFlow 2.0 still prints "Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA". So it does not seem to be using newer CPU instruction sets than before.

Maybe they are using something new for the convolutions? I'll see if I can find out anything.

Dobiasd commented 4 years ago

It's not just a TensorFlow 2.x thing. The performance improvements happened earlier:

Test code:

# conv2d_performance_test.py
import datetime

import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv2D
from tensorflow.keras.models import Model

inputs = Input(shape=(1024, 1024, 3))
x = Conv2D(128, (3, 3))(inputs)
model = Model(inputs=inputs, outputs=x)
model.compile(loss='categorical_crossentropy', optimizer='nadam')

data_in = np.random.normal(size=(1, 1024, 1024, 3))
model.predict(data_in)
print(f'tensorflow=={tf.__version__}')
for _ in range(5):
    start_time = datetime.datetime.now()
    model.predict(data_in)
    duration = datetime.datetime.now() - start_time
    print('Forward pass took {} s.'.format(duration.total_seconds()))

Running with different TensorFlow versions:

sudo pip3 uninstall tensorflow -y
sudo pip3 install tensorflow==1.13.1
CUDA_VISIBLE_DEVICES='' taskset --cpu-list 1 python3 conv2d_performance_test.py
sudo pip3 uninstall tensorflow -y
sudo pip3 install tensorflow==1.13.2
CUDA_VISIBLE_DEVICES='' taskset --cpu-list 1 python3 conv2d_performance_test.py
sudo pip3 uninstall tensorflow -y
sudo pip3 install tensorflow==1.14.0
CUDA_VISIBLE_DEVICES='' taskset --cpu-list 1 python3 conv2d_performance_test.py
sudo pip3 uninstall tensorflow -y
sudo pip3 install tensorflow==1.15.0
CUDA_VISIBLE_DEVICES='' taskset --cpu-list 1 python3 conv2d_performance_test.py
sudo pip3 uninstall tensorflow -y
sudo pip3 install tensorflow==2.0.0
CUDA_VISIBLE_DEVICES='' taskset --cpu-list 1 python3 conv2d_performance_test.py

Results:

tensorflow==1.13.1
Forward pass took 0.949945 s.
Forward pass took 0.952494 s.
Forward pass took 0.948515 s.
Forward pass took 0.950895 s.
Forward pass took 0.96188 s.

tensorflow==1.13.2
Forward pass took 0.943243 s.
Forward pass took 0.947731 s.
Forward pass took 0.942158 s.
Forward pass took 0.94615 s.
Forward pass took 0.946767 s.

tensorflow==1.14.0
Forward pass took 0.758585 s.
Forward pass took 0.75338 s.
Forward pass took 0.749206 s.
Forward pass took 0.74818 s.
Forward pass took 0.74659 s.

tensorflow==1.15.0
Forward pass took 0.518368 s.
Forward pass took 0.520312 s.
Forward pass took 0.517211 s.
Forward pass took 0.520846 s.
Forward pass took 0.51732 s.

tensorflow==2.0.0
Forward pass took 0.59772 s.
Forward pass took 0.597734 s.
Forward pass took 0.595437 s.
Forward pass took 0.595418 s.
Forward pass took 0.587147 s.

So they seem to have done something between 1.13.2 and 1.14.0 and also between 1.14.0 and 1.15.0.

That will be fun to dissect. :grin:

Dobiasd commented 4 years ago

OK, it's not that Eigen's matrix multiplication has become faster over the last years. Comparing version 3.3.0 (2016-11-10) with the most recent one, i.e. 3.3.7 (2018-12-11), I don't measure a difference.

git clone https://github.com/eigenteam/eigen-git-mirror
cd eigen-git-mirror
mkdir -p build && cd build

git checkout tags/3.3.0
cmake ..
make && sudo make install
g++ -std=c++14 -O3 -mavx ../../main.cpp -o ../../main
echo "3.3.0"
../../main

git checkout tags/3.3.7
cmake ..
make && sudo make install
g++ -std=c++14 -O3 -mavx ../../main.cpp -o ../../main
echo "3.3.7"
../../main
//main.cpp
#include <chrono>
#include <cstdlib>
#include <iostream>
#include <string>

#include <eigen3/Eigen/Dense>

using RowMajorMat = Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;

#define mat_size_1 1024
#define mat_size_2 4096
#define mat_size_3 784

float multiply()
{
    RowMajorMat a(mat_size_1, mat_size_2);
    RowMajorMat b(mat_size_2, mat_size_3);
    const std::size_t num_bytes = a.rows() * b.cols() * sizeof(float);
    float* ptr = (float*)std::malloc(num_bytes);
    Eigen::Map<RowMajorMat, Eigen::Unaligned> c(
        ptr,
        static_cast<Eigen::Index>(a.rows()),
        static_cast<Eigen::Index>(b.cols()));
    c.noalias() = a * b;
    float check_value = ptr[0];
    std::free(ptr);
    return check_value;
}

template <typename Func>
void measure(const std::string& name, const Func f)
{
    using namespace std::chrono;
    float checksum = 0.0f; // to prevent compiler from optimizing everything away
    const auto start_time_ns = high_resolution_clock::now().time_since_epoch().count();
    const std::size_t runs = 10;
    for (size_t i = 0; i < runs; ++i)
    {
        checksum += f();
    }
    const auto end_time_ns = high_resolution_clock::now().time_since_epoch().count();
    const auto elapsed_ms = (end_time_ns - start_time_ns) / (runs * 1000000);
    std::cout << name << " (checksum: " << checksum << ") elapsed_ms: " << elapsed_ms << std::endl;
}

int main()
{
    measure("multiply                         ", multiply);
}

Output:

...
3.3.0
multiply                          (checksum: 0) elapsed_ms: 129
...
3.3.7
multiply                          (checksum: 0) elapsed_ms: 128
...

So there must be something else going on. Seems like it might be worth having a second look at Eigen Tensors and revisiting issue 167. :grin:
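
For reference, the same multiplication expressed through Eigen's unsupported Tensor module would look roughly like the sketch below (same shapes as in main.cpp above; whether the contraction ends up in faster kernels than the plain Matrix product is exactly what would need to be measured):

// GEMM via Eigen::Tensor contraction (sketch only, same shapes as main.cpp above).
#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main()
{
    Eigen::Tensor<float, 2, Eigen::RowMajor> a(1024, 4096);
    Eigen::Tensor<float, 2, Eigen::RowMajor> b(4096, 784);
    a.setRandom();
    b.setRandom();

    // Contract the second dimension of a with the first dimension of b,
    // i.e. an ordinary matrix product.
    const Eigen::array<Eigen::IndexPair<int>, 1> dims = { Eigen::IndexPair<int>(1, 0) };
    const Eigen::Tensor<float, 2, Eigen::RowMajor> c = a.contract(b, dims);

    std::cout << c.dimension(0) << " x " << c.dimension(1) << std::endl; // 1024 x 784
}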