xtensor-stack / xtensor-blas

BLAS extension to xtensor
BSD 3-Clause "New" or "Revised" License

Small array multiplication is slow #201

Open ansar-sa opened 3 years ago

ansar-sa commented 3 years ago

Benchmark results:

STATS FOR OPENBLAS
---------------------------------------------------------------

NUMBER OF THREADS:          1
NUMBER OF PROCESSORS:       6
CONFIG:                     OpenBLAS 0.3.13 NO_LAPACKE DYNAMIC_ARCH NO_AFFINITY Haswell MAX_THREADS=64
CORE NAME:                  Haswell
PARALLEL:                   1

2021-07-10T17:29:27+00:00
Running ./eigen_performance_test
Run on (6 X 2900 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x6)
  L1 Instruction 32 KiB (x6)
  L2 Unified 256 KiB (x6)
  L3 Unified 12288 KiB (x6)
Load Average: 1.39, 0.97, 0.65
-------------------------------------------------------------------------------
Benchmark                                     Time             CPU   Iterations
-------------------------------------------------------------------------------
eigen_matrix_vector_mult_test/2            52.0 ns         51.9 ns     10896682
eigen_matrix_vector_mult_test/4            45.3 ns         45.1 ns     15167031
eigen_matrix_vector_mult_test/8            58.9 ns         58.7 ns     11416469
eigen_matrix_vector_mult_test/16           97.1 ns         97.0 ns      6808424
eigen_matrix_vector_mult_test/32            225 ns          224 ns      3104947
eigen_matrix_vector_mult_test/64            708 ns          707 ns       982545
eigen_matrix_vector_mult_test/128          3247 ns         3241 ns       215232
eigen_matrix_vector_mult_test/256         17878 ns        17854 ns        40561
eigen_matrix_vector_mult_test/512         79920 ns        79654 ns         8723
eigen_matrix_vector_mult_test/1024       400534 ns       399784 ns         1756
eigen_matrix_matrix_mult_test/2            10.0 ns         9.98 ns     68861889
eigen_matrix_matrix_mult_test/4            7.77 ns         7.76 ns     88185892
eigen_matrix_matrix_mult_test/8            33.3 ns         33.3 ns     20416419
eigen_matrix_matrix_mult_test/16            221 ns          221 ns      3159655
eigen_matrix_matrix_mult_test/32            484 ns          484 ns      1371857
eigen_matrix_matrix_mult_test/64           2112 ns         2112 ns       318067
eigen_matrix_matrix_mult_test/128         14355 ns        14352 ns        48288
eigen_matrix_matrix_mult_test/256         61533 ns        61529 ns        11485
eigen_matrix_matrix_mult_test/512        276762 ns       276711 ns         2525
eigen_matrix_matrix_mult_test/1024      1389020 ns      1388564 ns          523
xtensor_matrix_matrix_mult_test/2           403 ns          403 ns      1692067
xtensor_matrix_matrix_mult_test/4           402 ns          402 ns      1746090
xtensor_matrix_matrix_mult_test/8           418 ns          417 ns      1674838
xtensor_matrix_matrix_mult_test/16          436 ns          436 ns      1562858
xtensor_matrix_matrix_mult_test/32          562 ns          561 ns      1202666
xtensor_matrix_matrix_mult_test/64         1056 ns         1056 ns       667352
xtensor_matrix_matrix_mult_test/128        3066 ns         3066 ns       222012
xtensor_matrix_matrix_mult_test/256       14641 ns        14595 ns        47534
xtensor_matrix_matrix_mult_test/512       56908 ns        56907 ns        11483
xtensor_matrix_matrix_mult_test/1024     352145 ns       351940 ns         2301

Benchmark code:

#include <benchmark/benchmark.h>

#include <Eigen/Dense>
#include <cblas.h> // OpenBLAS header: declares openblas_set_num_threads() etc.
#include <iostream>
#include <xtensor-blas/xlinalg.hpp>
#include <xtensor/xarray.hpp>
#include <xtensor/xnoalias.hpp>
#include <xtensor/xtensor.hpp>

void blas_stats() {
  std::cout << "\nSTATS FOR "
               "OPENBLAS\n-----------------------------------------------------"
               "----------\n\n";

  openblas_set_num_threads(1);
  std::cout << "NUMBER OF THREADS:          " << openblas_get_num_threads()
            << std::endl;

  // Get the number of physical processors (cores)
  std::cout << "NUMBER OF PROCESSORS:       " << openblas_get_num_procs()
            << std::endl;
  // Get the build configuration at runtime.
  std::cout << "CONFIG:                     " << openblas_get_config()
            << std::endl;

  // Get the CPU core name at runtime.
  std::cout << "CORE NAME:                  " << openblas_get_corename()
            << std::endl;

  // Get the parallelization type which is used by OpenBLAS
  std::cout << "PARALLEL:                   " << openblas_get_parallel()
            << std::endl;
  std::cout << "\n\n";
}

static void eigen_matrix_vector_mult_test(benchmark::State &state) {
  int size = state.range(0);
  Eigen::MatrixXd m = Eigen::MatrixXd::Random(size, size);
  Eigen::VectorXd v = Eigen::VectorXd::Random(size);
  Eigen::MatrixXd r = Eigen::MatrixXd::Random(size, 1);
  for (auto _ : state) {
    r.noalias() = m * v;
  }
}
BENCHMARK(eigen_matrix_vector_mult_test)->RangeMultiplier(2)->Range(2, 8 << 7);

static void eigen_matrix_matrix_mult_test(benchmark::State &state) {
  int size = state.range(0);
  Eigen::MatrixXd m = Eigen::MatrixXd::Random(size, size);
  Eigen::MatrixXd v = Eigen::MatrixXd::Random(size, 1);
  Eigen::MatrixXd r = Eigen::MatrixXd::Random(size, 1);
  for (auto _ : state) {
    r.noalias() = m * v;
  }
}
BENCHMARK(eigen_matrix_matrix_mult_test)->RangeMultiplier(2)->Range(2, 8 << 7);

static void xtensor_matrix_matrix_mult_test(benchmark::State &state) {
  uint64_t size = state.range(0);
  xt::xarray<double>::shape_type shape_m = {size, size};
  xt::xarray<double>::shape_type shape_v = {size};
  xt::xarray<double> m(shape_m); // note: contents are left uninitialized,
  xt::xarray<double> v(shape_v); // unlike the Eigen benchmarks' Random()
  xt::xarray<double> r(shape_v);
  for (auto _ : state) {
    xt::noalias(r) = xt::linalg::dot(m, v);
  }
}
BENCHMARK(xtensor_matrix_matrix_mult_test)
    ->RangeMultiplier(2)
    ->Range(2, 8 << 7);

int main(int argc, char **argv) {
  blas_stats();
  benchmark::Initialize(&argc, argv);
  if (benchmark::ReportUnrecognizedArguments(argc, argv))
    return 1;
  benchmark::RunSpecifiedBenchmarks();
}

Any thoughts on why small arrays incur so much overhead in xtensor? What should I do if most of my arrays are small (but not of fixed size)?

wolfv commented 3 years ago

I think there are two differences with Eigen:

- the dot function is more general than Eigen's matrix / matrix-vector multiplication: it inspects the dimensions at runtime and performs broadcasting checks;
- it calls into BLAS for the actual computation, and a BLAS call has some intrinsic overhead compared to Eigen's inlined kernels.

It could be interesting to make a 2D dot function that works faster on small matrices / matrix-vector combination. We could even use Eigen as a "backend" for that :)

ansar-sa commented 3 years ago

It could be interesting to make a 2D dot function that works faster on small matrices / matrix-vector combination. We could even use Eigen as a "backend" for that :)

This is an interesting idea. First, two thoughts:

In case both options are a no-go: yes, I would love to see xtensor handle small matrix / matrix and matrix / vector dot products fast, either via hand-rolled SIMD code or via an Eigen backend.
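
For what it's worth, a rough sketch of the Eigen-backend idea: map xtensor's contiguous row-major buffers into Eigen views and let Eigen's kernels do the work (small_dot is a hypothetical name, not an xtensor-blas API):

#include <Eigen/Dense>
#include <xtensor/xtensor.hpp>

// Hypothetical sketch: wrap the xtensor buffers in Eigen::Map views.
xt::xtensor<double, 1> small_dot(const xt::xtensor<double, 2> &m,
                                 const xt::xtensor<double, 1> &v) {
  using RowMajorMat =
      Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;

  // xtensor containers are row-major and contiguous by default,
  // so the Map views cost nothing.
  Eigen::Map<const RowMajorMat> em(m.data(),
                                   static_cast<Eigen::Index>(m.shape(0)),
                                   static_cast<Eigen::Index>(m.shape(1)));
  Eigen::Map<const Eigen::VectorXd> ev(v.data(),
                                       static_cast<Eigen::Index>(v.shape(0)));

  xt::xtensor<double, 1>::shape_type rshape = {m.shape(0)};
  xt::xtensor<double, 1> r(rshape);
  Eigen::Map<Eigen::VectorXd> er(r.data(),
                                 static_cast<Eigen::Index>(r.shape(0)));
  er.noalias() = em * ev;
  return r;
}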

wolfv commented 3 years ago

As I mentioned before, the dot product is more general than Eigen's matrix / matrix-vector multiplication as it performs broadcasting. Without broadcasting checks you'd probably already be 2x faster for small matrices.
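
A rough sketch of what a non-broadcasting 2-D kernel could look like (matvec_2d is a hypothetical name; it assumes matching shapes and row-major xt::xtensor):

#include <stdexcept>
#include <xtensor/xtensor.hpp>

// Hypothetical non-broadcasting kernel: fixed 2-D / 1-D types, a single
// shape check, plain loops -- no dynamic dispatch, no BLAS call.
xt::xtensor<double, 1> matvec_2d(const xt::xtensor<double, 2> &m,
                                 const xt::xtensor<double, 1> &v) {
  if (m.shape(1) != v.shape(0)) {
    throw std::runtime_error("matvec_2d: shape mismatch");
  }
  xt::xtensor<double, 1>::shape_type rshape = {m.shape(0)};
  xt::xtensor<double, 1> r(rshape);
  for (std::size_t i = 0; i < m.shape(0); ++i) {
    double acc = 0.0;
    for (std::size_t j = 0; j < m.shape(1); ++j) {
      acc += m(i, j) * v(j);
    }
    r(i) = acc;
  }
  return r;
}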

You could try using the xtensor class: it has a compile-time dimension (just as MatrixXd is always 2-dimensional), so you can use xt::xtensor<double, 2>.

wolfv commented 3 years ago

MKL won't help much I am afraid :)

wolfv commented 3 years ago

Calling into BLAS functions has some intrinsic overhead (even a plain function call has some cost attached to it compared to inlined code).

ansar-sa commented 3 years ago

Using the xtensor class does not help either. As you say, the call into BLAS and the broadcast checks are probably dominating the time for small matrices. Do you have some example code showing how I can use xsimd to multiply a small matrix by a vector, please?
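
For illustration, a minimal sketch of such a kernel, assuming xsimd 8's xsimd::batch API (matvec_simd is a hypothetical name, not an existing xtensor or xsimd function). It takes raw pointers, so it can be fed m.data(), v.data(), r.data() from the containers above:

#include <xsimd/xsimd.hpp>

#include <cstddef>

// Hypothetical SIMD kernel for a small row-major mat-vec product.
void matvec_simd(const double *m, const double *v, double *r,
                 std::size_t rows, std::size_t cols) {
  using batch = xsimd::batch<double>;
  constexpr std::size_t N = batch::size;
  for (std::size_t i = 0; i < rows; ++i) {
    batch acc(0.0);
    std::size_t j = 0;
    // fused multiply-add over full SIMD lanes of row i
    for (; j + N <= cols; j += N) {
      acc = xsimd::fma(batch::load_unaligned(m + i * cols + j),
                       batch::load_unaligned(v + j), acc);
    }
    double sum = xsimd::reduce_add(acc); // horizontal sum of the lanes
    for (; j < cols; ++j) {              // scalar tail
      sum += m[i * cols + j] * v[j];
    }
    r[i] = sum;
  }
}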

ansar-sa commented 3 years ago

Actually the problem shows up in other kinds of operations as well. Even simple element-wise array sums are very slow in xtensor-based code compared to Eigen when it comes to small arrays. Can we please look at which bounds / broadcast checks are the culprit? Benchmark results:

Running ./array_ops_performance_test
Run on (6 X 2900 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x6)
  L1 Instruction 32 KiB (x6)
  L2 Unified 256 KiB (x6)
  L3 Unified 12288 KiB (x6)
Load Average: 0.02, 0.64, 0.73
----------------------------------------------------------------
Benchmark                      Time             CPU   Iterations
----------------------------------------------------------------
eigen_array_sum/2          0.857 ns        0.854 ns    840056658
eigen_array_sum/4           3.61 ns         3.61 ns    197734174
eigen_array_sum/8           7.89 ns         7.88 ns     82837277
eigen_array_sum/16          36.7 ns         36.7 ns     19514002
eigen_array_sum/32           153 ns          153 ns      4527008
eigen_array_sum/64          1117 ns         1115 ns       627337
eigen_array_sum/128         6368 ns         6357 ns       102095
eigen_array_sum/256        32093 ns        32039 ns        24341
eigen_array_sum/512       155964 ns       155738 ns         4763
xtensor_array_sum/2          133 ns          132 ns      5450911
xtensor_array_sum/4          142 ns          141 ns      4697384
xtensor_array_sum/8          150 ns          150 ns      4680743
xtensor_array_sum/16         249 ns          249 ns      2787851
xtensor_array_sum/32         544 ns          542 ns      1326147
xtensor_array_sum/64        1954 ns         1950 ns       343751
xtensor_array_sum/128       8772 ns         8759 ns        74394
xtensor_array_sum/256      35809 ns        35724 ns        19592
xtensor_array_sum/512     113890 ns       113503 ns         6039

#include <benchmark/benchmark.h>

#include <Eigen/Dense>
#include <xtensor/xarray.hpp>

using namespace Eigen;

static void eigen_array_sum(benchmark::State &state) {
  int size = state.range(0);
  MatrixXd a = MatrixXd::Random(size, size);
  MatrixXd b = MatrixXd::Random(size, size);
  MatrixXd r = MatrixXd::Random(size, size);

  for (auto _ : state) {
    r = a + b;
  }
}
BENCHMARK(eigen_array_sum)->RangeMultiplier(2)->Range(2, 8 << 6);

static void xtensor_array_sum(benchmark::State &state) {
  uint64_t size = state.range(0);
  xt::xarray<double>::shape_type shape = {size, size};
  xt::xarray<double> a(shape);
  xt::xarray<double> b(shape);
  xt::xarray<double> r(shape);

  for (auto _ : state) {
    r = a + b;
  }
}
BENCHMARK(xtensor_array_sum)->RangeMultiplier(2)->Range(2, 8 << 6);

BENCHMARK_MAIN();

wolfv commented 3 years ago

Comparing MatrixXd and xt::xarray<double> is never a fair comparison: they are quite different containers (xarray is dynamically n-dimensional, while MatrixXd is statically 2-D).

ansar-sa commented 3 years ago

I'll update the benchmark results using xt::xtensor<double, 2>.

ansar-sa commented 3 years ago

-------------------------------------------------------------------------
Benchmark                               Time             CPU   Iterations
-------------------------------------------------------------------------
eigen_array_sum/2                    1.29 ns         1.28 ns    457478259
eigen_array_sum/4                    3.31 ns         3.29 ns    208422955
eigen_array_sum/8                    6.83 ns         6.80 ns     96818924
eigen_array_sum/16                   32.8 ns         32.6 ns     20155042
eigen_array_sum/32                    138 ns          138 ns      4957244
eigen_array_sum/64                   1009 ns         1008 ns       652397
eigen_array_sum/128                  6065 ns         6060 ns       112739
eigen_array_sum/256                 27283 ns        27275 ns        25480
eigen_array_sum/512                120261 ns       120255 ns         5317
xtensor_array_sum/2                  34.4 ns         34.4 ns     20148667
xtensor_array_sum/4                  37.2 ns         37.2 ns     18514938
xtensor_array_sum/8                  50.0 ns         50.0 ns     12796309
xtensor_array_sum/16                  139 ns          139 ns      4996273
xtensor_array_sum/32                  392 ns          392 ns      1787989
xtensor_array_sum/64                 1740 ns         1738 ns       406974
xtensor_array_sum/128                7859 ns         7858 ns        81830
xtensor_array_sum/256               32434 ns        32432 ns        21366
xtensor_array_sum/512              104275 ns       104264 ns         6603

// Same file as before, with the xtensor containers switched from
// xt::xarray<double> to xt::xtensor<double, 2>.
static void eigen_array_sum(benchmark::State &state) {
  int size = state.range(0);
  MatrixXd a = MatrixXd::Random(size, size);
  MatrixXd b = MatrixXd::Random(size, size);
  MatrixXd r = MatrixXd::Random(size, size);

  for (auto _ : state) {
    r = a + b;
  }
}
BENCHMARK(eigen_array_sum)->RangeMultiplier(2)->Range(2, 8 << 6);

static void xtensor_array_sum(benchmark::State &state) {
  uint64_t size = state.range(0);
  xt::xtensor<double, 2>::shape_type shape = {size, size};
  xt::xtensor<double, 2> a(shape);
  xt::xtensor<double, 2> b(shape);
  xt::xtensor<double, 2> r(shape);

  for (auto _ : state) {
    r = a + b;
  }
}
BENCHMARK(xtensor_array_sum)->RangeMultiplier(2)->Range(2, 8 << 6);

wolfv commented 3 years ago

You can shave off some more nanoseconds by doing xt::noalias(r) = a + b;

There might be a similar trick for Eigen.
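
The Eigen benchmarks above already use that trick for the products: noalias() tells Eigen not to create the temporary it would otherwise use to guard against aliasing. For a plain coefficient-wise sum like a + b, Eigen evaluates lazily without a temporary anyway, so it matters mostly for products:

// Eigen counterpart of xt::noalias: skip the aliasing-guard temporary.
r.noalias() = m * v; // instead of r = m * v;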