nholthaus / units

a compile-time, header-only, dimensional analysis and unit conversion library built on c++14 with no dependencies.
http://nholthaus.github.io/units/
MIT License
955 stars 135 forks source link

Use exponentiation by squares for pow() #262

Closed ts826848 closed 3 years ago

ts826848 commented 3 years ago

Multiple pow() implementations were considered:

  1. A naive linear recursive implementation (the existing implementation)
  2. A tail-recursive linear implementation
  3. A non-tail-recursive exponentiation by squares implementation
  4. A tail-recursive exponentiation by squares implementation
  5. An iterative implementation ("right-to-left binary method" 0)

Based on quick and dirty benchmarks on a 2015 Retina Pro 2.8 GHz (source code in [1, 2], results in [3]), implementation 4 was chosen since it resulted in good benchmark numbers for a wide range of exponents for both GCC and Clang.

Some implementation notes:

[1]: CMakeLists.txt

    cmake_minimum_required(VERSION 3.18)
    project(pow-benchmarks)
    set(CMAKE_CXX_EXTENSIONS OFF)

    add_subdirectory(benchmark/)

    add_executable(main main.cpp)
    target_link_libraries(main benchmark::benchmark)
    target_compile_features(main
      PRIVATE cxx_std_17
    )
    target_compile_options(main
      PRIVATE -march=native
    )

[2]: main.cpp

    #include <benchmark/benchmark.h>

    constexpr auto base = 1.1;

    auto pow_rec(double x, unsigned int y) {
        if (y == 0) { return 1.0; }
        return x * pow_rec(x, y - 1);
    }

    static void BM_pow_rec(benchmark::State& state) {
        for (auto _ : state) {
            benchmark::DoNotOptimize(pow_rec(base, state.range(0)));
        }
    }

    auto pow_rec_acc(double acc, double x, unsigned int y) {
        if (y == 0) { return acc; }
        return pow_rec_acc(acc * x, x, y - 1);
    }

    static void BM_pow_rec_acc(benchmark::State& state) {
        for (auto _ : state) {
            benchmark::DoNotOptimize(pow_rec_acc(1.0, base, state.range(0)));
        }
    }

    auto pow_ebs(double x, unsigned int y) {
        if (y == 0) { return 1.0; }
        if (y % 2 == 0) { return pow_ebs(x * x, y / 2); }
        return x * pow_ebs(x * x, (y - 1) / 2);
    }

    static void BM_pow_ebs(benchmark::State& state) {
        for (auto _ : state) {
            benchmark::DoNotOptimize(pow_ebs(base, state.range(0)));
        }
    }

    auto pow_ebs_acc(double acc, double x, unsigned int y) {
        if (y == 0) { return acc; }
        if (y % 2 == 0) { return pow_ebs_acc(acc, x * x, y / 2); }
        return pow_ebs_acc(acc * x, x * x, (y - 1) / 2);
    }

    static void BM_pow_ebs_acc(benchmark::State& state) {
        for (auto _ : state) {
            benchmark::DoNotOptimize(pow_ebs_acc(1.0, base, state.range(0)));
        }
    }

    auto pow_rtl(double x, unsigned int y) {
        double result = 1;
        while (y > 0) {
            if (y % 2 == 1) { result *= x; }
            x *= x;
            y /= 2;
        }
        return result;
    }

    static void BM_pow_rtl(benchmark::State& state) {
        for (auto _ : state) {
            benchmark::DoNotOptimize(pow_rtl(base, state.range(0)));
        }
    }
    BENCHMARK(BM_pow_rec)->RangeMultiplier(2)->Range(1, 1024);
    BENCHMARK(BM_pow_rec_acc)->RangeMultiplier(2)->Range(1, 1024);
    BENCHMARK(BM_pow_ebs)->RangeMultiplier(2)->Range(1, 1024);
    BENCHMARK(BM_pow_ebs_acc)->RangeMultiplier(2)->Range(1, 1024);
    BENCHMARK(BM_pow_rtl)->RangeMultiplier(2)->Range(1, 1024);

    BENCHMARK_MAIN();

[3]: Benchmark results:

name,exponent,iterations,real_time,cpu_time,time_unit
clang_BM_pow_rec,1,242623383,2.85982,2.85864,ns
clang_BM_pow_rec,2,157872421,4.40969,4.40885,ns
clang_BM_pow_rec,4,94543490,7.46502,7.46196,ns
clang_BM_pow_rec,8,54229515,12.92,12.9153,ns
clang_BM_pow_rec,16,19914028,35.263,35.1746,ns
clang_BM_pow_rec,32,10156114,69.5893,69.2,ns
clang_BM_pow_rec,64,5334146,139.066,138.576,ns
clang_BM_pow_rec,128,2572357,273.276,272.662,ns
clang_BM_pow_rec,256,1339944,549.8,547.006,ns
clang_BM_pow_rec,512,583552,1110.01,1100.6,ns
clang_BM_pow_rec,1024,319804,2207.38,2204.24,ns
clang_BM_pow_rec_acc,1,576017906,1.21254,1.20992,ns
clang_BM_pow_rec_acc,2,470107856,1.45131,1.4495,ns
clang_BM_pow_rec_acc,4,379595135,1.86401,1.86114,ns
clang_BM_pow_rec_acc,8,394490656,1.82825,1.80791,ns
clang_BM_pow_rec_acc,16,166837476,4.22444,4.21326,ns
clang_BM_pow_rec_acc,32,59731549,11.882,11.8725,ns
clang_BM_pow_rec_acc,64,19138023,36.6328,36.6003,ns
clang_BM_pow_rec_acc,128,6571721,107.976,107.889,ns
clang_BM_pow_rec_acc,256,2565465,275.823,275.585,ns
clang_BM_pow_rec_acc,512,1148011,610.191,609.555,ns
clang_BM_pow_rec_acc,1024,537197,1283.06,1281.3,ns
clang_BM_pow_ebs,1,230396545,2.9994,2.99676,ns
clang_BM_pow_ebs,2,169067424,4.08076,4.07118,ns
clang_BM_pow_ebs,4,144069680,4.98213,4.91438,ns
clang_BM_pow_ebs,8,122845811,5.44799,5.39654,ns
clang_BM_pow_ebs,16,123172212,5.5894,5.57018,ns
clang_BM_pow_ebs,32,121115648,5.95688,5.93962,ns
clang_BM_pow_ebs,64,118419272,6.02155,5.99638,ns
clang_BM_pow_ebs,128,115924748,6.06574,6.0628,ns
clang_BM_pow_ebs,256,110894603,6.77586,6.75139,ns
clang_BM_pow_ebs,512,98698589,7.84512,7.76027,ns
clang_BM_pow_ebs,1024,89059657,8.00756,7.9626,ns
clang_BM_pow_ebs_acc,1,657567189,1.05315,1.04851,ns
clang_BM_pow_ebs_acc,2,469131169,1.53819,1.53625,ns
clang_BM_pow_ebs_acc,4,312749921,2.20653,2.20532,ns
clang_BM_pow_ebs_acc,8,259702678,2.61954,2.61573,ns
clang_BM_pow_ebs_acc,16,180832294,3.89507,3.89197,ns
clang_BM_pow_ebs_acc,32,188172549,3.64933,3.64264,ns
clang_BM_pow_ebs_acc,64,166588132,4.26915,4.26723,ns
clang_BM_pow_ebs_acc,128,174694285,4.00344,4.00193,ns
clang_BM_pow_ebs_acc,256,134532595,5.23862,5.22875,ns
clang_BM_pow_ebs_acc,512,119228083,5.84675,5.81626,ns
clang_BM_pow_ebs_acc,1024,111431255,6.18642,6.18045,ns
clang_BM_pow_rtl,1,673361807,1.08527,1.08058,ns
clang_BM_pow_rtl,2,525853197,1.32037,1.3183,ns
clang_BM_pow_rtl,4,409069659,1.72002,1.71799,ns
clang_BM_pow_rtl,8,314953544,2.20521,2.19442,ns
clang_BM_pow_rtl,16,268517133,2.52929,2.52372,ns
clang_BM_pow_rtl,32,234065960,2.84197,2.84108,ns
clang_BM_pow_rtl,64,202013789,3.46654,3.46535,ns
clang_BM_pow_rtl,128,169815120,4.0331,4.03292,ns
clang_BM_pow_rtl,256,147780339,4.73918,4.73832,ns
clang_BM_pow_rtl,512,127560318,5.54143,5.52924,ns
clang_BM_pow_rtl,1024,113921167,6.35124,6.31617,ns
gcc_BM_pow_rec,1,1000000000,0.642449,0.642183,ns
gcc_BM_pow_rec,2,736997263,0.936602,0.935837,ns
gcc_BM_pow_rec,4,577076858,1.228,1.22261,ns
gcc_BM_pow_rec,8,395080681,1.7532,1.74838,ns
gcc_BM_pow_rec,16,75348216,9.35105,9.34704,ns
gcc_BM_pow_rec,32,24714722,27.885,27.8778,ns
gcc_BM_pow_rec,64,6954517,99.4378,99.3757,ns
gcc_BM_pow_rec,128,3508403,200.837,200.776,ns
gcc_BM_pow_rec,256,1692338,415.756,415.565,ns
gcc_BM_pow_rec,512,852785,840.297,839.985,ns
gcc_BM_pow_rec,1024,420264,1653.17,1652.97,ns
gcc_BM_pow_rec_acc,1,1000000000,0.683353,0.683236,ns
gcc_BM_pow_rec_acc,2,886771897,0.794739,0.793799,ns
gcc_BM_pow_rec_acc,4,401567260,1.73483,1.72641,ns
gcc_BM_pow_rec_acc,8,250294991,2.95757,2.94537,ns
gcc_BM_pow_rec_acc,16,95675469,7.07709,7.06796,ns
gcc_BM_pow_rec_acc,32,41662451,16.8187,16.8082,ns
gcc_BM_pow_rec_acc,64,14443800,47.6729,47.6236,ns
gcc_BM_pow_rec_acc,128,5953647,118.254,118.182,ns
gcc_BM_pow_rec_acc,256,2466873,286.095,285.817,ns
gcc_BM_pow_rec_acc,512,1130655,615.528,615.259,ns
gcc_BM_pow_rec_acc,1024,547200,1283.82,1283.04,ns
gcc_BM_pow_ebs,1,672081725,1.04422,1.04349,ns
gcc_BM_pow_ebs,2,449328575,1.56498,1.56402,ns
gcc_BM_pow_ebs,4,383118696,1.82464,1.82372,ns
gcc_BM_pow_ebs,8,336292709,2.16009,2.15451,ns
gcc_BM_pow_ebs,16,165761294,4.23759,4.23051,ns
gcc_BM_pow_ebs,32,149499816,4.74884,4.74265,ns
gcc_BM_pow_ebs,64,127367674,5.34939,5.33855,ns
gcc_BM_pow_ebs,128,126221645,5.84178,5.82675,ns
gcc_BM_pow_ebs,256,107998025,7.38296,6.83305,ns
gcc_BM_pow_ebs,512,100404487,7.17541,7.12359,ns
gcc_BM_pow_ebs,1024,92229044,7.71009,7.67266,ns
gcc_BM_pow_ebs_acc,1,658730532,1.04911,1.04819,ns
gcc_BM_pow_ebs_acc,2,428708790,1.60377,1.60127,ns
gcc_BM_pow_ebs_acc,4,327703082,2.12746,2.12434,ns
gcc_BM_pow_ebs_acc,8,253354229,2.8163,2.76942,ns
gcc_BM_pow_ebs_acc,16,218485087,3.10567,3.10326,ns
gcc_BM_pow_ebs_acc,32,140335683,4.88869,4.88666,ns
gcc_BM_pow_ebs_acc,64,157410551,4.43992,4.43803,ns
gcc_BM_pow_ebs_acc,128,141516777,5.02243,5.01728,ns
gcc_BM_pow_ebs_acc,256,96357680,6.3678,6.35574,ns
gcc_BM_pow_ebs_acc,512,99450183,7.3439,7.19385,ns
gcc_BM_pow_ebs_acc,1024,97465887,7.33387,7.32162,ns
gcc_BM_pow_rtl,1,771638961,0.939868,0.936382,ns
gcc_BM_pow_rtl,2,566054519,1.31514,1.29922,ns
gcc_BM_pow_rtl,4,298165431,2.51659,2.50175,ns
gcc_BM_pow_rtl,8,238699293,3.18444,3.13001,ns
gcc_BM_pow_rtl,16,149156732,4.59082,4.57887,ns
gcc_BM_pow_rtl,32,117928502,6.28446,6.19749,ns
gcc_BM_pow_rtl,64,90744102,7.75125,7.64506,ns
gcc_BM_pow_rtl,128,80328659,8.35933,8.34487,ns
gcc_BM_pow_rtl,256,79322810,8.50594,8.4981,ns
gcc_BM_pow_rtl,512,78111923,9.37514,9.35696,ns
gcc_BM_pow_rtl,1024,61735472,11.1617,11.1471,ns
ts826848 commented 3 years ago

In case you're interested, here's an Excel file with the benchmark numbers: pow-benches.xlsx

coveralls commented 3 years ago

Coverage Status

Coverage remained the same at 0.0% when pulling dfa0f33835c40a6e24c7ab7e992476935ada118b on ts826848:pow-exp-by-squares into e91f45783bc24ad45961340f11d8c5a22ae1db1e on nholthaus:v3.x.

coveralls commented 3 years ago

Coverage Status

Coverage remained the same at 0.0% when pulling dfa0f33835c40a6e24c7ab7e992476935ada118b on ts826848:pow-exp-by-squares into e91f45783bc24ad45961340f11d8c5a22ae1db1e on nholthaus:v3.x.

nholthaus commented 3 years ago

The quality of this PR astonished me. Definitely interested in the benchmark graph!

Thank you for this valuable contribution.

Out of curiosity, what library were you using for your benchmarks?

ts826848 commented 3 years ago

Thanks! To be honest, it was actually a bit more of a rabbit hole than I initially expected, but it was an enjoyable process nevertheless.

I used Google Benchmark. I cloned the repo into the folder where I wrote the benchmark file so it could use the same compiler/flags as the benchmark file. I'd get linker errors if I installed it using the package manager.