Bugzilla Link | PR33013 |
Status | NEW |
Importance | P enhancement |
Reported by | Sanjay Patel (spatel+llvm@rotateright.com) |
Reported on | 2017-05-11 16:27:32 -0700 |
Last modified on | 2017-09-05 06:00:49 -0700 |
Version | trunk |
Hardware | PC All |
CC | amjad.aboud@intel.com, david.l.kreitzer@intel.com, llvm-bugs@lists.llvm.org, llvm-dev@redking.me.uk, michaeljclark@mac.com, peter@cordes.ca, t.p.northover@gmail.com, zia.ansari@intel.com, zvirack@gmail.com |
Fixed by commit(s) | |
Attachments | |
Blocks | |
Blocked by | |
See also | PR31602 |
An example from https://groups.google.com/forum/#!topic/llvm-dev/Tl9SD-edUJI
where the select is not explicit in IR:
define i64 @cvtf(float %f) {
%cvt = fptoui float %f to i64
ret i64 %cvt
}
movss LCPI0_0(%rip), %xmm1 ## xmm1 = mem[0],zero,zero,zero
movaps %xmm0, %xmm2
subss %xmm1, %xmm2
cvttss2si %xmm2, %rax
movabsq $-9223372036854775808, %rcx ## imm = 0x8000000000000000
xorq %rax, %rcx
cvttss2si %xmm0, %rax
ucomiss %xmm1, %xmm0
cmovaeq %rcx, %rax
retq
Another issue related to the use of predicate logic versus a branch is the
unconditional execution of ISD::FSUB, which raises FE_INEXACT for float-to-u64
and double-to-u64 conversions. See the test code and workaround below.
See SelectionDAGLegalize::ExpandNode()
  case ISD::FP_TO_UINT: {
    SDValue True, False;
    EVT VT = Node->getOperand(0).getValueType();
    EVT NVT = Node->getValueType(0);
    APFloat apf(DAG.EVTToAPFloatSemantics(VT),
                APInt::getNullValue(VT.getSizeInBits()));
    APInt x = APInt::getSignMask(NVT.getSizeInBits());
    (void)apf.convertFromAPInt(x, false, APFloat::rmNearestTiesToEven);
    Tmp1 = DAG.getConstantFP(apf, dl, VT);
    Tmp2 = DAG.getSetCC(dl, getSetCCResultType(VT),
                        Node->getOperand(0),
                        Tmp1, ISD::SETLT);
    True = DAG.getNode(ISD::FP_TO_SINT, dl, NVT, Node->getOperand(0));
    // TODO: Should any fast-math-flags be set for the FSUB?
    False = DAG.getNode(ISD::FP_TO_SINT, dl, NVT,
                        DAG.getNode(ISD::FSUB, dl, VT,
                                    Node->getOperand(0), Tmp1));
    False = DAG.getNode(ISD::XOR, dl, NVT, False,
                        DAG.getConstant(x, dl, NVT));
    Tmp1 = DAG.getSelect(dl, NVT, Tmp2, True, False);
    Results.push_back(Tmp1);
    break;
  }
Expected output:
$ clang fcvt-bug.cc
$ ./a.out
1 exact
1 inexact
1 exact
1 inexact
1 exact
1 inexact
1 exact
1 inexact
Erroneous output:
$ clang -DDISABLE_WORKAROUND fcvt-bug.cc
$ ./a.out
1 exact
1 inexact
1 inexact
1 inexact
1 exact
1 inexact
1 inexact
1 inexact
$ cat fcvt-bug.cc
#include <cstdio>
#include <cmath>
#include <limits>
#include <fenv.h>
typedef float f32;
typedef double f64;
typedef signed int s32;
typedef unsigned int u32;
typedef signed long long s64;
typedef unsigned long long u64;
#if defined __GNUC__ && defined __x86_64__ && !defined DISABLE_WORKAROUND
__attribute__ ((noinline)) s64 fcvt_s_lu(f32 f)
{
    s64 res;
    static const union { f32 f; u32 u; } val = { .u = 0x5f000000 };
    if (f > val.f) {
        __asm__(
            "subss %[sub], %[input]\n "
            "movabsq $0x8000000000000000, %%rdx\n "
            "cvttss2siq %[input], %[result]\n "
            "xorq %%rdx, %[result]"
            : [result] "=r"(res)
            : [sub] "m"(val.f), [input] "x"(f)
            : "rdx"
        );
    } else {
        __asm__(
            "cvttss2siq %[input], %[result]"
            : [result] "=r"(res)
            : [input] "x"(f)
        );
    }
    return res;
}

__attribute__ ((noinline)) s64 fcvt_d_lu(f64 f)
{
    s64 res;
    static const union f64_bits { f64 f; u64 u; } val = { .u = 0x43E0000000000000ULL };
    if (f > val.f) {
        __asm__(
            "subsd %[sub], %[input]\n "
            "movabsq $0x8000000000000000, %%rdx\n "
            "cvttsd2siq %[input], %[result]\n "
            "xorq %%rdx, %[result]"
            : [result] "=r"(res)
            : [sub] "m"(val.f), [input] "x"(f)
            : "rdx"
        );
    } else {
        __asm__(
            "cvttsd2siq %[input], %[result]"
            : [result] "=r"(res)
            : [input] "x"(f)
        );
    }
    return res;
}
#else
__attribute__ ((noinline)) s64 fcvt_s_lu(f32 f) { return s64(u64(f)); }
__attribute__ ((noinline)) s64 fcvt_d_lu(f64 f) { return s64(u64(f)); }
#endif
__attribute__ ((noinline)) s32 fcvt_s_wu(f32 f) { return s32(u32(f)); }
__attribute__ ((noinline)) s32 fcvt_d_wu(f64 f) { return s32(u32(f)); }
void test_fcvt_s_wu(f32 a)
{
    feclearexcept(FE_ALL_EXCEPT);
    printf("%d ", fcvt_s_wu(a));
    printf("%s\n", fetestexcept(FE_INEXACT) ? "inexact" : "exact");
}

void test_fcvt_s_lu(f32 a)
{
    feclearexcept(FE_ALL_EXCEPT);
    printf("%lld ", fcvt_s_lu(a));
    printf("%s\n", fetestexcept(FE_INEXACT) ? "inexact" : "exact");
}

void test_fcvt_d_wu(f64 a)
{
    feclearexcept(FE_ALL_EXCEPT);
    printf("%d ", fcvt_d_wu(a));
    printf("%s\n", fetestexcept(FE_INEXACT) ? "inexact" : "exact");
}

void test_fcvt_d_lu(f64 a)
{
    feclearexcept(FE_ALL_EXCEPT);
    printf("%lld ", fcvt_d_lu(a));
    printf("%s\n", fetestexcept(FE_INEXACT) ? "inexact" : "exact");
}

int main()
{
    fesetround(FE_TONEAREST);
    test_fcvt_s_wu(1.0f);
    test_fcvt_s_wu(1.1f);
    test_fcvt_s_lu(1.0f);
    test_fcvt_s_lu(1.1f);
    test_fcvt_d_wu(1.0f);
    test_fcvt_d_wu(1.1f);
    test_fcvt_d_lu(1.0f);
    test_fcvt_d_lu(1.1f);
}
Given that the primary issue is the speed of branch vs predicate logic, here is
a microbenchmark of the two approaches:
$ time fcvt-branch
real 0m0.208s
user 0m0.201s
sys 0m0.002s
$ time fcvt-cmov
real 0m0.241s
user 0m0.235s
sys 0m0.002s
int main() {
    static const union { float f; unsigned u; } val = { .u = 0x5f000000 };
    for (unsigned long i = 0; i < 100000000; i++) {
        unsigned long res;
        float f = 1.0f;
#ifndef USE_CMOV
        __asm__ volatile (
            "ucomiss %[sub], %[input]\n "
            "ja 1f\n "
            "cvttss2siq %[input], %[result]\n "
            "jmp 2f\n "
            "1:\n "
            "subss %[sub], %[input]\n "
            "movabsq $0x8000000000000000, %%rdx\n "
            "cvttss2siq %[input], %[result]\n "
            "xorq %%rdx, %[result]\n "
            "2: "
            : [result] "=r"(res)
            : [sub] "m"(val.f), [input] "x"(f)
            : "rdx"
        );
#else
        __asm__ volatile (
            "movss %[sub], %%xmm1\n "
            "movaps %[input], %%xmm2\n "
            "subss %%xmm1, %%xmm2\n "
            "cvttss2si %%xmm2, %[result]\n "
            "movabsq $0x8000000000000000, %%rdx\n "
            "xorq %[result], %%rdx\n "
            "cvttss2si %[input], %[result]\n "
            "ucomiss %%xmm1, %[input]\n "
            "cmovaeq %%rdx, %[result]\n "
            : [result] "=r"(res)
            : [sub] "m"(val.f), [input] "x"(f)
            : "rdx", "xmm1", "xmm2"
        );
#endif
    }
}
(In reply to Michael Clark from comment #3)
> Given that the primary issue is the speed of branch vs predicate logic, here
> is a microbenchmark of the two approaches:
As written, this is the absolute best case for the branching version, right? We
should vary the input values to exercise the predictor, so we know when/if the
cmov version is ever better.
If you can provide some info about your test machine, that would be good too.
It's quite likely that the data and our conclusions are going to be different
for say Intel Skylake vs. AMD Jaguar.
For now, I'm assuming that x86 needs to do something like this:
https://reviews.llvm.org/rL292154
This is for 2 reasons:
1. Converting selects to branches early could have a negative impact on
DAGCombiner/isel because that's limited to a basic block. We actually do
convert some selects early in CodeGenPrepare::optimizeSelectInst(), but that's
limited to the case where we have a known expensive op (division for example)
and/or metadata (profile info) to guide that decision.
2. As shown in the fptoui example, the select may not exist in IR. In that
case, there's not much choice. We have to do this transform later unless we're
willing to expand fptoui into the equivalent IR that contains a select and/or a
compare+branch in CGP.
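For illustration, here is a rough C sketch (hypothetical name, not the actual lowering) of what the ExpandNode code quoted above computes for float->u64. In the DAG form both arms of the select are computed unconditionally, which is where the stray FSUB / FE_INEXACT noted earlier comes from:

#include <stdint.h>

// Sketch of the FP_TO_UINT expansion: compare against 2^63, then either a plain
// signed conversion, or FSUB + signed conversion + XOR of the sign bit.
uint64_t fptoui_f32_u64_sketch(float f) {
    const float two63 = 9223372036854775808.0f;            // (float)(1ULL << 63)
    return (f < two63)                                      // ISD::SETLT
        ? (uint64_t)(int64_t)f                              // ISD::FP_TO_SINT
        : ((uint64_t)(int64_t)(f - two63)) ^ (1ULL << 63);  // FSUB + FP_TO_SINT + XOR
}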
There are certainly cases where cmov's data dependency is a bottleneck, instead
of a control dependency that can be speculated and that avoids doing some work.
e.g. http://stackoverflow.com/questions/28875325/gcc-optimization-flag-o3-makes-code-slower-then-o2
shows a case where a cmov data dependency (from gcc -O3) is slower than a
conditional branch (from gcc -O2), for sorted input that leads to perfect branch
prediction.
But caution is required here. More branchy code could be slower, even with
predictable branches.
I'm not sure what heuristics would be most useful in guiding the choice between
branchy vs. branchless. Obviously saving a lot of work in one side of the
branch is a big one, but also creating loop-carried dep chains that would be
the biggest bottleneck is another. (In that SO question, unrolling with
multiple accumulators would have avoided a latency bottleneck.)
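As an illustration only (hypothetical names, not the code from that question), the loop shape in question and the multiple-accumulator idea might look like this:

#include <stddef.h>

// Sum the elements >= 128. With a single accumulator, a cmov version forms one
// long loop-carried add/cmov chain; two independent accumulators halve that
// latency chain. (Assumes n is even, purely for brevity.)
long sum_ge_128(const int *data, size_t n) {
    long sum0 = 0, sum1 = 0;
    for (size_t i = 0; i < n; i += 2) {
        if (data[i]     >= 128) sum0 += data[i];
        if (data[i + 1] >= 128) sum1 += data[i + 1];
    }
    return sum0 + sum1;
}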
----
More total branches would take up more branch-predictor entries, which would
likely mean that on average branches are predicted less well. This is another
effect that microbenchmarks won't reveal.
On some CPUs, never-taken branches don't use any branch-prediction resources
(since fall-through is the default prediction), but on at least some Intel
CPUs, they probably do affect and are affected by other branches.
http://agner.org/optimize/ says that recent Intel CPUs don't use static branch
prediction, but instead
Also, taken branches always have the potential to reduce front-end throughput,
even when predicted perfectly. CPUs have buffers between fetch and decode to
hide fetch bubbles created by branches, but too many taken branches could expose
them. Uop caches do help with this, but they aren't trace caches (so a taken
branch usually means that the next uop comes from a different line of the uop
cache).
----
Those inline-asm sequences have a flaw which hurts the branching version: the
fast path has a taken JMP, because it has to jump over the slow path instead of
putting it out-of-line at the end of the function. (This is only an inline-asm
limitation; gcc and clang of course already put unlikely branch targets
out-of-line and have them jump back, so this is only a problem with the test
methodology.)
Taken-branch throughput can be an issue inside a tight loop like the one you're
testing. (e.g. Haswell/Skylake can execute/retire 2 not-taken branches per
clock, but only one taken branch per 2 clocks. Loop-branches are a special
case, so tiny loops can run at 1 iteration per clock.)
Oops, forgot to finish writing the paragraph about static branch prediction,
and whether modern Intel CPUs use it or not. Matt Godbolt did some
investigation of Nehalem vs. Haswell: https://xania.org/201602/bpu-part-two.
Also forgot to mention: on Intel Pentium-M/Core2 through Haswell, cmov is 2
uops with 2c latency. On Broadwell/SKL, it's 1 uop (like ADC/SBB and some
other 3-input instructions, taking advantage of the 3-input uop support
introduced for FMA in Haswell).
CMOV is only 1 uop / m-op on AMD, and on Silvermont (but it is 2c latency on
Silvermont).
Anyway, on much of the current installed-base of x86 CPUs, CMOV is 2 uops / 2c
latency, but that will change as more older Intel CPUs are replaced.
Of course, CMOV isn't the only option for branchless code. Especially on CPUs
where it's 2 uops, an alternative like SAR/AND (or BMI2 ANDN) can be good vs.
CMP/CMOV. See this interesting case of compiler hand-holding for example:
http://stackoverflow.com/questions/34407437/what-is-the-efficient-way-to-count-set-bits-at-a-position-or-lower/34410357#34410357,
where we couldn't avoid a CMP or TEST because POPCNT clobbers flags, so SAR/AND
is no worse than TEST/CMOV even on CPUs with efficient CMOV.
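A generic sketch of the SAR/AND idea (made-up names, not the code from that answer): build an all-ones / all-zeros mask from a sign bit and AND with it, instead of using CMP/CMOV.

#include <stdint.h>

// Keep 'value' when 'flag' is negative, otherwise produce 0, without a CMOV.
// Assumes arithmetic right shift of signed integers (true for gcc/clang on x86).
static inline int64_t keep_if_negative(int64_t flag, int64_t value) {
    int64_t mask = flag >> 63;   // SAR: all-ones if flag < 0, else all-zeros
    return value & mask;         // AND (or ANDN with BMI2 for the inverted condition)
}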
Thanks, Peter.
One more variable to consider here: what is the size difference between the cmov and branch versions? I haven't counted the bytes, but that may tip the scale one way or the other when optimizing for size. And even nastier, are we supposed to be aligning the taken target block with nops? I think we do that for loop start blocks, but I haven't seen it with if/then.
> what is the size difference between the cmov and branch versions?
for gcc's versions:
branchless: (sub-optimal, should avoid the MOV with an LEA)
4008d0: 48 63 10 movsxd rdx,DWORD PTR [rax]
4008d3: 48 89 d1 mov rcx,rdx
4008d6: 48 01 da add rdx,rbx
4008d9: 83 c1 80 add ecx,0xffffff80
4008dc: 48 0f 4d da cmovge rbx,rdx
4008e0: 48 83 c0 04 add rax,0x4
4008e4: 4c 39 e0 cmp rax,r12
4008e7: 75 e7 jne 4008d0 <main+0xc0>
0x18 bytes of code, could be 0x16 I think
branchy:
400a20: 4d 63 1a movsxd r11,DWORD PTR [r10]
400a23: 41 83 fb 7f cmp r11d,0x7f
400a27: 7e 7d jle 400aa6 <main+0x116>
400a29: 49 83 c2 04 add r10,0x4
400a2d: 4c 01 db add rbx,r11
400a30: 4d 39 ca cmp r10,r9
400a33: 75 eb jne 400a20 <main+0x90>
0x14 bytes of code. (Could be 0x13 if gcc picked a reg that didn't need a REX
prefix for the 32-bit cmp).
Sorry for using gcc output in this clang bug, but I already had the compiled
binaries there to disassemble. I think it's going to be pretty common for the
branchless version to be a little bit larger, especially when it requires
constants in registers (because CMOV doesn't take immediate operands).
> are we supposed to be aligning the taken target block with nops?
Definitely not, since the NOP would run inside the loop. Branch-target
alignment is not very important in CPUs with uop-caches, and completely
irrelevant when running from the loop buffer. (The loop buffer *does* act like
a trace cache for tiny loops, I think, so the uop following a taken branch can
issue as part of the same group as the branch.)
Putting even a single long-NOP inside the loop to align a non-loop branch
target would hurt the front-end more than it helped, because that NOP would
take up issue/retire bandwidth.
The alignment of branches (the jcc, not the target) can affect performance
because it determines which other branches it aliases, so it can be hard to
measure just the effect of branch alignment. Changing the alignment of
anything tends to change the alignment of everything else in the program.
It could possibly help some to stretch out the real instructions in the loop by
using redundant prefixes or longer addressing modes / immediates than
necessary, if that avoids having the branch target be very near the end of a
16B-aligned block, but I'm not sure about that. (And unfortunately there's no
mechanism in gas or NASM syntax, or any other asm syntax I'm aware of for
requesting that.)
Decreasing code-density is a risky bet, especially when you don't have
profiling data to show which loops are truly hot and actually bottlenecked on
front-end throughput, not an execution unit, latency, or memory.
With respect to the microbenchmark using static values (i.e. always-predicted
branches) not being representative of a real workload: one would then ask what
would be representative of an actual workload.
1. Values vary but are less than LLONG_MAX, 100% predicted branches
2. Values vary but a few outliers are out of range, >99% predicted branches
3. Pathological case. i.e. floating point noise values
Use case number 3 seems unusual, as someone developing software would typically
convert values that they expect to be in range, rather than perform what would
essentially be a binary conversion to 0 or ULLONG_MAX for random values with
random exponents that mostly fall outside of the range of u64. Actual code
would be more likely to calculate a max value to scale the floating point
values before a conversion (having written such code many times).
In essence, I am saying it would be fair to expect that in the majority of code
the branch prediction rate is likely to be on the order of >99%, perhaps closer
to 100% in most cases.
Adding to that, there is the question of whether the choice to use size_t / unsigned is because the value is intended to be positive, versus to gain the additional range between LLONG_MAX and ULLONG_MAX.
Values from 2^53 to 2^64, of course, will all be inexact conversions, and the signed case of +/-2^53 to +/-2^63 is already handled in hardware, with the MXCSR INEXACT bit being set by cvttss2si/cvttsd2si.
(In reply to Michael Clark from comment #9)
> always predicted branches not being representative of a real workload
Agreed that a branchy float->u64 conversion will predict extremely well in most
code. But it's not necessarily representative of other cases that are only say
98% predictable, which is the big-picture for this bug.
Getting slightly off-topic here; I know you brought this up as a silly example
of what *not* to optimize for.
> 3. Pathological case. i.e. floating point noise values
> essentially be a binary conversion of 0 or ULLONG_MAX for random values with
> random exponents that mostly fall outside of the range of u64.
Fun fact about IEEE754: Half of all representable values have magnitude less
than 1.0. Most have a magnitude below LLONG_MAX, so cvtss2si %xmm, %r64 will
produce something other than the "integer indefinite" value (1ULL<<63) that
Intel documents for out-of-range conversions:
http://felixcloutier.com/x86/CVTSS2SI.html.
The range of representable values is huge, but they get sparser the farther
away from 0 you get.
For negative floats, your conversion code appears to implement the semantics of
(uint64_t)(int64_t)f.
That's what C programmers should write if they want an unsigned result but
don't need the extra input range, and want efficiency on x86 without AVX512
(and probably some other architectures which only provide float<->signed
conversions). Unfortunately C makes it hard to tell the compiler what you
do/don't care about without making the code really ugly.
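i.e. something along these lines (a sketch with a made-up name), which should lower to a single cvttss2si on x86-64 because the compiler has been told the extra unsigned range isn't needed:

#include <stdint.h>

// Unsigned result, but only the signed input range is supported:
// no compare/select and no FSUB, just one truncating conversion.
uint64_t narrow_range_cvt(float f) {
    return (uint64_t)(int64_t)f;
}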
As far as the predictability of the branch: only bit patterns from 0x5f000000
to 0x7f7fffff and +Inf (0x7f800000) compare >= (float)(1ULL<<63).
(Exponent=255 and non-zero mantissa is a NaN, which is unordered.) Check it
out on https://www.h-schmidt.net/FloatConverter/IEEE754.html.
----
BTW, you can use LEA instead of XORQ so you can hoist the movabsq of the
constant out of the loop without needing an extra MOV. XORing to flip the MSB
is the same as adding to flip the MSB: with nowhere for the carry to go, add
and add-without-carry are the same. (And so is subtract).
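A tiny demonstration of that identity (made-up function names):

#include <stdint.h>

// Flipping the MSB by XOR, ADD, or SUB of 0x8000000000000000 gives the same
// result, because any carry/borrow out of bit 63 is discarded. The ADD form is
// what lets LEA (reg + hoisted-constant reg) replace the MOV+XOR pair.
uint64_t flip_msb_xor(uint64_t x) { return x ^ 0x8000000000000000ull; }
uint64_t flip_msb_add(uint64_t x) { return x + 0x8000000000000000ull; }
uint64_t flip_msb_sub(uint64_t x) { return x - 0x8000000000000000ull; }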
Unfortunately I don't see a way to use CMPLTPS/ANDPS to do the branchless stuff
with only one cvttss2si (and avoid integer CMOV), unless we do the conditional
sign-bit flip with vectors, too (e.g. integer left-shift of a compare result
and XORPS, or maybe PSIGND). (And that would still change the result in the
positive out-of-range because we'd get the "integer indefinite" value and *not*
flip the MSB to make it 0 instead of 1ULL<<63.)
(In reply to Michael Clark from comment #10)
> Values from 2^53 to 2^64, of course, will all be inexact conversions, and
> the signed case of +/-2^53 to +/-2^63 is already handled in hardware, with
> the MXCSR INEXACT bit being set by cvttss2si/cvttsd2si
Are you sure about that?
Not all integers are exactly representable as float/double (beyond 2^24 or
2^53), but all floats/doubles in that range represent integers. e.g. all the
even integers from 2^24 to 2^25 are exactly representable. (2^24+1 is the
first number that can't be exactly represented by IEEE float32
http://stackoverflow.com/questions/3793838/which-is-the-first-integer-that-an-ieee-754-float-is-incapable-of-representing-e).
Put another way, when the bias-corrected exponent is greater than the number of
mantissa bits, multiplying the mantissa by 2^exponent shifts the decimal point
(err, base-2 radix point?) to the right all the way out of the mantissa,
leaving some zeros in the LSB of the base2 integer representation.
I haven't tested, but why would hardware raise the inexact exception in that
case?
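A quick check of the representability claim above, for anyone who wants to try it (2^24 + 1 is the first integer a float can't represent; 2^24 + 2 is representable again):

#include <stdio.h>

int main(void) {
    float a = 16777216.0f;  // 2^24, exactly representable
    float b = 16777217.0f;  // 2^24 + 1: no float representation, rounds to 2^24
    float c = 16777218.0f;  // 2^24 + 2: representable (even integer)
    printf("%.1f %.1f %.1f\n", a, b, c);  // expect: 16777216.0 16777216.0 16777218.0
}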
Sure, there are some representable values, but there is a loss of precision.
I would have to read the IEEE-754 spec to be sure of the exact rules regarding exact vs. inexact conversions, but there is a loss of precision above 2^53 such that a precise countable integer cannot be distinguished, because the rightmost bits of the significand cannot be represented. By definition precision has to be lost: while a magnitude of 2^64 can be represented, there are only 52 bits and 23 bits of precision for double and single respectively.
At magnitudes around 2^63 there are 10 bits of double precision lost and 39 bits of single precision lost, so the conversion will alias, with values increasing in magnitude by 2^10 for double to u64 and by 2^39 for single precision to u64. You can do an exact conversion to float or double for specific values, but not the other way.
This is, I believe, why the subtraction of 2^63 causes the inexact flag to be set (the subtraction is done to allow the use of the signed hardware conversion to emulate an unsigned conversion).
There can only be an exact conversion for positive values from 0 to < 2^53 for double precision and from 0 to < 2^24 for single precision.
In my particular use case I am emulating a RISC-V processor that has unsigned conversions, and I am running the RISC-V compliance tests on the results of the conversion intrinsics generated by the compiler.
I understand AVX-512 implements native unsigned conversions, in which case it's handled by the hardware.
Sorry for continuing the off-topic discussion in this thread, will keep this
short.
(In reply to Michael Clark from comment #13)
> I would have to read the IEEE-754 spec to be sure on the exact rules
> regarding exact vs inexact conversions
If the rounding step changes the value, it's inexact. If not, it's exact.
For a fp->int conversion, it's a round to integer of the fp value. For a
subtraction, it's a round to float or double of the result.
In the float->u64 case, it should be possible for both those operations to be
exact, e.g. 1.5*2^63 - 2^63 = 2^62 exactly. (1.5 is a possible mantissa value).
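A small sketch to check that specific case (hex float literals assumed; fetestexcept may need libm on some systems):

#include <fenv.h>
#include <stdio.h>

int main(void) {
    volatile float big  = 0x1.8p63f;   // 1.5 * 2^63, exactly representable
    volatile float bias = 0x1p63f;     // 2^63
    feclearexcept(FE_ALL_EXCEPT);
    volatile float diff = big - bias;  // 2^62 exactly, so no FE_INEXACT expected
    printf("%a %s\n", (double)diff,
           fetestexcept(FE_INEXACT) ? "inexact" : "exact");
}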
I tested what x86 hardware conversion (on Skylake) does for an int->float->int
round trip, using some printing around this:
float f = a;
long long roundtrip = f;
clang uses the expected cvtsi2ss and cvttss2si.
With input = 64 + (1LL << 30), rounding to nearest float loses the +64
inexact 0x40000040(1073741888) -> 1073741824.000000 -> 0x40000000(1073741824)
With input of 128 + (1LL << 30), that doesn't happen.
exact 0x40000080(1073741952) -> 1073741952.000000 -> 0x40000080(1073741952)
Peter, when you say exact, do you mean that the inexact flag is not set in MXCSR after the conversion, or that the imprecise floating point value aliases to an exact integer representation (i.e. you can perform a round trip and get the same value)? Subtle difference.
I'll run your test and check the MXCSR flags...
The imprecise float is "aliasing" with an integer representation. float can't
represent more than 24 bits of precision (counting the leading 1 bit). It's
arguable as to whether the implementation is doing the right thing for
magnitudes above the precision of the floating point type. The hardware is
reporting an exact conversion to an integer when it can't know if the
conversion is exact (using signed integers in this test as the subtract in the
unsigned conversion can cause inexact). I guess "unknown" is not INEXACT.
$ cat roundtrip.c
#include <stdio.h>
#include <math.h>
#include <fenv.h>
typedef float f32;
typedef long long s64;
__attribute__ ((noinline)) s64 fcvt_lu_s(f32 f) { return (s64)f; }
__attribute__ ((noinline)) f32 fcvt_s_lu(s64 u) { return (f32)u; }
void test_fcvt(s64 u)
{
#pragma STDC FENV_ACCESS ON
    f32 f;
    s64 v;
    int nx;

    feclearexcept(FE_ALL_EXCEPT);
    nx = fetestexcept(FE_INEXACT);
    printf("%lld %s\n", u, nx ? "inexact" : "exact");

    feclearexcept(FE_ALL_EXCEPT);
    f = fcvt_s_lu(u);                   /* s64 -> f32 */
    nx = fetestexcept(FE_INEXACT);
    printf("%f %s\n", f, nx ? "inexact" : "exact");

    feclearexcept(FE_ALL_EXCEPT);
    v = fcvt_lu_s(f);                   /* f32 -> s64 for the round trip */
    nx = fetestexcept(FE_INEXACT);
    printf("%lld %s\n", v, nx ? "inexact" : "exact");
}

int main()
{
    fesetround(FE_TONEAREST);
    test_fcvt((1 << 20) + 64);
    test_fcvt((1 << 20) + 128);
    test_fcvt((1 << 30) + 64);
    test_fcvt((1 << 30) + 128);
}
$ gcc roundtrip.c -o roundtrip
$ ./roundtrip
1048640 exact
1048640.000000 exact
1048640 exact
1048704 exact
1048704.000000 exact
1048704 exact
1073741888 exact
1073741824.000000 inexact
1073741824 exact
1073741952 exact
1073741952.000000 exact
1073741952 exact
A cmov-to-branch pass for x86 has been proposed here:
https://reviews.llvm.org/D34769
(In reply to Sanjay Patel from comment #17)
> A cmov-to-branch pass for x86 has been proposed here:
> https://reviews.llvm.org/D34769
This was committed at rL308142