google / benchmark

A microbenchmark support library
Apache License 2.0
9.06k stars 1.63k forks source link

[BUG] DoNotOptimize unpredictable on ternary conditionals #1188

Open Krasjet opened 3 years ago

Krasjet commented 3 years ago

Describe the bug: DoNotOptimize seems to behave unpredictably on ternary conditional expressions.

I'm trying to benchmark the performance of manual flushing to zero on denormals and I define a macro like this:

#include <cmath>
#include <cfloat>

// Flush-to-zero helper for a float lvalue: assigns 0 to x when fabsf(x) < FLT_MIN,
// otherwise assigns x back to itself. Note that x is expanded three times.
#define FLUSHF(x) ((x) = fabsf(x)<FLT_MIN ? 0 : (x))

which basically flushes any denormal float (i.e. one whose magnitude is below FLT_MIN) to 0, to prevent the performance degradation caused by denormal numbers on x86.

This macro is used in the following benchmark:

// Benchmark body: repeatedly scales a float by 0.999f and then applies FLUSHF to it.
// Both expressions are passed through benchmark::DoNotOptimize.
static void
flush_32(benchmark::State& state)
{
  // Start at FLT_MIN, the threshold FLUSHF compares against.
  float mem = FLT_MIN;
  for (auto _ : state) {
    // Shrink the value slightly on every iteration.
    benchmark::DoNotOptimize(mem = 0.999f * mem);
    // Zero mem if its magnitude has dropped below FLT_MIN.
    benchmark::DoNotOptimize(FLUSHF(mem));
  }
}
BENCHMARK(flush_32);

When compiling using g++ with -O3 optimization, FLUSHF doesn't seem to be working correctly and the flushing does not happen, which results in slower execution.

To see this in action, try compiling the following with both g++ and clang++ at -O2 and -O3, and compare the performance (you might need an x86 machine):

#include <benchmark/benchmark.h>
#include <cmath>
#include <cfloat>

// Flush-to-zero helper for a float lvalue: assigns 0 to x when fabsf(x) < FLT_MIN,
// otherwise assigns x back to itself. Note that x is expanded three times.
#define FLUSHF(x) ((x) = fabsf(x)<FLT_MIN ? 0 : (x))

// Benchmark body: repeatedly scales a float by 0.999f and then applies FLUSHF to it.
// Both expressions are passed through benchmark::DoNotOptimize.
static void
flush_32(benchmark::State& state)
{
  // Start at FLT_MIN, the threshold FLUSHF compares against.
  float mem = FLT_MIN;
  for (auto _ : state) {
    // Shrink the value slightly on every iteration.
    benchmark::DoNotOptimize(mem = 0.999f * mem);
    // Zero mem if its magnitude has dropped below FLT_MIN.
    benchmark::DoNotOptimize(FLUSHF(mem));
  }
}
BENCHMARK(flush_32);
BENCHMARK_MAIN();
$ g++ -O3 bug.cc -lbenchmark
$ ./a.out
-----------------------------------------------------
Benchmark           Time             CPU   Iterations
-----------------------------------------------------
flush_32         36.9 ns         36.9 ns     18957274
$ g++ -O2 bug.cc -lbenchmark
$ ./a.out
-----------------------------------------------------
Benchmark           Time             CPU   Iterations
-----------------------------------------------------
flush_32         6.06 ns         6.05 ns    101055135
$ clang++ -O3 bug.cc -lbenchmark
$ ./a.out
-----------------------------------------------------
Benchmark           Time             CPU   Iterations
-----------------------------------------------------
flush_32         5.30 ns         5.30 ns    102412118
$ clang++ -O2 bug.cc -lbenchmark
$ ./a.out
-----------------------------------------------------
Benchmark           Time             CPU   Iterations
-----------------------------------------------------
flush_32         5.23 ns         5.23 ns    101888597

Because FLUSHF is partially optimized away for gcc with -O3, it runs much slower.

Strangely, the flushing works fine with double. I also tried the original escape function from the video mentioned in the comment.

// Hand-written alternative to benchmark::DoNotOptimize: an empty `asm volatile`
// statement that takes `value` as an input operand with the "g" constraint and
// lists "memory" as a clobber. The asm template itself emits no instructions.
template <class Tp>
inline void escape(Tp& value)
{
  asm volatile("" : : "g"(value) : "memory");
}

and it works correctly. Try compiling the following to see the problem:

#include <benchmark/benchmark.h>
#include <cmath>
#include <cfloat>

// Flush-to-zero helpers: assign 0 to x when its absolute value is below the
// type's *_MIN constant, otherwise assign x back to itself.
// FLUSH uses fabs/DBL_MIN; FLUSHF uses fabsf/FLT_MIN. x is expanded three times.
#define FLUSH(x) ((x) = fabs(x)<DBL_MIN ? 0 : (x))
#define FLUSHF(x) ((x) = fabsf(x)<FLT_MIN ? 0 : (x))

// Hand-written alternative to benchmark::DoNotOptimize: an empty `asm volatile`
// statement that takes `value` as an input operand with the "g" constraint and
// lists "memory" as a clobber. The asm template itself emits no instructions.
template <class Tp>
inline void escape(Tp& value)
{
  asm volatile("" : : "g"(value) : "memory");
}

// Double-precision variant: scales a double by 0.999 and applies FLUSH each
// iteration, with both expressions passed through benchmark::DoNotOptimize.
static void
flush_64(benchmark::State& state)
{
  // NOTE(review): seeded with FLT_MIN although FLUSH compares against DBL_MIN,
  // so mem does not start at the flush threshold here - confirm this is intended.
  double mem = FLT_MIN;
  for (auto _ : state) {
    // Shrink the value slightly on every iteration.
    benchmark::DoNotOptimize(mem = 0.999 * mem);
    // Zero mem if its magnitude has dropped below DBL_MIN.
    benchmark::DoNotOptimize(FLUSH(mem));
  }
}
BENCHMARK(flush_64);

// Single-precision variant: scales a float by 0.999f and applies FLUSHF each
// iteration, with both expressions passed through benchmark::DoNotOptimize.
static void
flush_32(benchmark::State& state)
{
  // Start at FLT_MIN, the threshold FLUSHF compares against.
  float mem = FLT_MIN;
  for (auto _ : state) {
    // Shrink the value slightly on every iteration.
    benchmark::DoNotOptimize(mem = 0.999f * mem);
    // Zero mem if its magnitude has dropped below FLT_MIN.
    benchmark::DoNotOptimize(FLUSHF(mem));
  }
}
BENCHMARK(flush_32);

// Same loop as flush_32, except that the FLUSHF step goes through escape()
// instead of benchmark::DoNotOptimize. The scaling step still uses DoNotOptimize.
static void
escape_32(benchmark::State& state)
{
  // Start at FLT_MIN, the threshold FLUSHF compares against.
  float mem = FLT_MIN;
  for (auto _ : state) {
    // Shrink the value slightly on every iteration.
    benchmark::DoNotOptimize(mem = 0.999f * mem);
    // Only this call differs from flush_32.
    escape(FLUSHF(mem));
  }
}
BENCHMARK(escape_32);

BENCHMARK_MAIN();
$ g++ -O3 bug.cc -lbenchmark
$ ./a.out
-----------------------------------------------------
Benchmark           Time             CPU   Iterations
-----------------------------------------------------
flush_64        0.606 ns        0.605 ns    995985269
flush_32         37.5 ns         37.5 ns     18310441
escape_32       0.773 ns        0.772 ns    901474201
$ g++ -O2 bug.cc -lbenchmark
$ ./a.out
-----------------------------------------------------
Benchmark           Time             CPU   Iterations
-----------------------------------------------------
flush_64         5.94 ns         5.94 ns    101705109
flush_32         6.12 ns         6.11 ns    115467957
escape_32        4.65 ns         4.64 ns    152872058
$ clang++ -O3 bug.cc -lbenchmark
$ ./a.out
-----------------------------------------------------
Benchmark           Time             CPU   Iterations
-----------------------------------------------------
flush_64         5.24 ns         5.24 ns     99614101
flush_32         5.34 ns         5.34 ns    130798955
escape_32        5.35 ns         5.35 ns    129643413
$ clang++ -O2 bug.cc -lbenchmark
$ ./a.out
-----------------------------------------------------
Benchmark           Time             CPU   Iterations
-----------------------------------------------------
flush_64         5.33 ns         5.24 ns     98349666
flush_32         5.38 ns         5.27 ns    132894039
escape_32        5.44 ns         5.44 ns    129066762

The anomaly of flush_32 with gcc -O3 is apparently a problem.

Is there a way to fix this problem without manually introducing another non-portable function to escape the optimization?

System Which OS, compiler, and compiler version are you using:

Expected behavior: DoNotOptimize works predictably on ternary conditionals.

Krasjet commented 3 years ago

This is strange. After replacing every instance of DoNotOptimize with escape, flush_32 consistently runs about 0.3 ns slower than escape_32 no matter how many times I rerun it, even though the two benchmarks are now exactly the same:

#include <benchmark/benchmark.h>
#include <cmath>
#include <cfloat>

// Flush-to-zero helpers: assign 0 to x when its absolute value is below the
// type's *_MIN constant, otherwise assign x back to itself.
// FLUSH uses fabs/DBL_MIN; FLUSHF uses fabsf/FLT_MIN. x is expanded three times.
#define FLUSH(x) ((x) = fabs(x)<DBL_MIN ? 0 : (x))
#define FLUSHF(x) ((x) = fabsf(x)<FLT_MIN ? 0 : (x))

// Hand-written alternative to benchmark::DoNotOptimize: an empty `asm volatile`
// statement that takes `value` as an input operand with the "g" constraint and
// lists "memory" as a clobber. The asm template itself emits no instructions.
template <class Tp>
inline void escape(Tp& value)
{
  asm volatile("" :: "g"(value) : "memory");
}

// Double-precision variant using escape() for both steps: scales a double by
// 0.999 and applies FLUSH each iteration.
static void
flush_64(benchmark::State& state)
{
  // NOTE(review): seeded with FLT_MIN although FLUSH compares against DBL_MIN,
  // so mem does not start at the flush threshold here - confirm this is intended.
  double mem = FLT_MIN;
  for (auto _ : state) {
    // Shrink the value slightly on every iteration.
    escape(mem = 0.999 * mem);
    // Zero mem if its magnitude has dropped below DBL_MIN.
    escape(FLUSH(mem));
  }
}
BENCHMARK(flush_64);

// Single-precision variant using escape() for both steps: scales a float by
// 0.999f and applies FLUSHF each iteration.
static void
flush_32(benchmark::State& state)
{
  // Start at FLT_MIN, the threshold FLUSHF compares against.
  float mem = FLT_MIN;
  for (auto _ : state) {
    // Shrink the value slightly on every iteration.
    escape(mem = 0.999f * mem);
    // Zero mem if its magnitude has dropped below FLT_MIN.
    escape(FLUSHF(mem));
  }
}
BENCHMARK(flush_32);

// In this listing the body is identical to flush_32: both steps go through
// escape(). Only the function name (and registration order) differs.
static void
escape_32(benchmark::State& state)
{
  // Start at FLT_MIN, the threshold FLUSHF compares against.
  float mem = FLT_MIN;
  for (auto _ : state) {
    // Shrink the value slightly on every iteration.
    escape(mem = 0.999f * mem);
    // Zero mem if its magnitude has dropped below FLT_MIN.
    escape(FLUSHF(mem));
  }
}
BENCHMARK(escape_32);

BENCHMARK_MAIN();
$ g++ -O3 bug.cc -lbenchmark
$ ./a.out
-----------------------------------------------------
Benchmark           Time             CPU   Iterations
-----------------------------------------------------
flush_64        0.693 ns        0.693 ns    858596482
flush_32         1.01 ns         1.01 ns    691820352
escape_32       0.689 ns        0.688 ns   1000000000

However, if we set the constraint to "r" instead (and drop the "memory" clobber), which is what folly uses,

// Variant of escape() that passes `value` as an input operand with the "r"
// constraint. Unlike the "g" version, this one has no clobber list at all
// (no "memory" clobber). The asm template itself emits no instructions.
template <class Tp>
inline void escape(Tp& value)
{
  asm volatile("" :: "r"(value));
}

the execution times of all three benchmarks are the same:

$ g++ -O3 bug.cc -lbenchmark
$ ./a.out
-----------------------------------------------------
Benchmark           Time             CPU   Iterations
-----------------------------------------------------
flush_64        0.678 ns        0.677 ns    884857534
flush_32        0.678 ns        0.677 ns   1000000000
escape_32       0.678 ns        0.677 ns    994171725