Quuxplusone opened 5 years ago
Attached test.c (4992 bytes, text/x-csrc): Benchmark sample code
Gah. Misclick.
As you can see, the automatically vectorized code is significantly slower than
the nonvectorized code, which is still slightly slower than the optimized
version.
What I found is that Clang extracts the individual values from the lanes, performs the same routine I did with normal registers, and then puts them back. GCC does much the same thing.
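For reference, this is roughly the per-lane scalar routine being reproduced (my own sketch; the variable names are mine, not from the attached benchmark):

#include <stdint.h>

uint64_t mul64(uint64_t a, uint64_t b)
{
    uint32_t aLo = (uint32_t)a, aHi = (uint32_t)(a >> 32);
    uint32_t bLo = (uint32_t)b, bHi = (uint32_t)(b >> 32);
    uint64_t lo  = (uint64_t)aLo * bLo;   /* umull */
    uint32_t hi  = aLo * bHi + aHi * bLo; /* mla, mla; the aHi*bHi term lands entirely above bit 63, so it is dropped */
    return lo + ((uint64_t)hi << 32);
}

badmult below is just the plain vector multiply that ends up lowered to this routine once per lane.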
U64x2 badmult(U64x2 v1, U64x2 v2)
{
    return v1 * v2;
}
Clang 7.0.0 output:
badmult:
.fnstart
@ %bb.0:
.save {r4, r5, r6, r7, r11, lr}
push {r4, r5, r6, r7, r11, lr}
.setfp r11, sp, #16
add r11, sp, #16
vmov d17, r2, r3
add r12, r11, #8
vmov d16, r0, r1
vld1.64 {d18, d19}, [r12]
vmov.32 r0, d16[0]
vmov.32 r1, d18[0]
vmov.32 r12, d16[1]
vmov.32 r3, d17[0]
vmov.32 r2, d19[0]
vmov.32 lr, d17[1]
vmov.32 r6, d19[1]
umull r7, r5, r1, r0
mla r1, r1, r12, r5
umull r5, r4, r2, r3
mla r2, r2, lr, r4
vmov.32 r4, d18[1]
mla r2, r6, r3, r2
vmov.32 d17[0], r5
vmov.32 d16[0], r7
vmov.32 d17[1], r2
mla r0, r4, r0, r1
vmov.32 d16[1], r0
vmov r2, r3, d17
vmov r0, r1, d16
pop {r4, r5, r6, r7, r11, pc}
GCC 8.2.0 output:
badmult:
@ args = 16, pretend = 0, frame = 16
@ frame_needed = 0, uses_anonymous_args = 0
push {r4, r5, r6, r7, lr}
sub sp, sp, #20
vmov d18, r0, r1 @ v2di
vmov d19, r2, r3
vldr d16, [sp, #40]
vldr d17, [sp, #48]
vmov r0, r1, d18 @ v2di
vmov r6, r7, d16 @ v2di
vmov r2, r3, d19 @ v2di
vmov r4, r5, d17 @ v2di
mul lr, r0, r7
mla lr, r6, r1, lr
mul ip, r2, r5
umull r0, r1, r0, r6
mla ip, r4, r3, ip
add r1, lr, r1
umull r2, r3, r2, r4
strd r0, [sp]
add r3, ip, r3
strd r2, [sp, #8]
ldmia sp, {r0-r3}
add sp, sp, #20
pop {r4, r5, r6, r7, pc}
Neither produces optimal code.
However, Clang does produce optimal x86 code:
clang --target=i386-none-eabi -O3 -msse4.1
badmult: # @badmult
# %bb.0:
pushl %ebp
movdqa %xmm1, %xmm2
movdqa %xmm0, %xmm3
movl %esp, %ebp
psrlq $32, %xmm2
psrlq $32, %xmm3
pmuludq %xmm0, %xmm2
pmuludq %xmm1, %xmm3
pmuludq %xmm1, %xmm0
paddq %xmm2, %xmm3
psllq $32, %xmm3
paddq %xmm3, %xmm0
popl %ebp
retl
Which, again, is basically doing the same thing, only I think in reverse order. I am not very good with x86 asm.
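For what it's worth, my reading of that SSE sequence (an annotation of the asm above, per 64-bit lane, with a in xmm0 and b in xmm1):

    xmm2   = (uint64_t)lo32(a) * hi32(b)    ; psrlq + pmuludq
    xmm3   = (uint64_t)lo32(b) * hi32(a)    ; psrlq + pmuludq
    xmm0   = (uint64_t)lo32(a) * lo32(b)    ; pmuludq
    result = xmm0 + ((xmm2 + xmm3) << 32)   ; paddq, psllq, paddq

So it is the same lo*lo plus shifted cross terms, not reversed; the cross products are just done as full widening multiplies whose upper halves are discarded by the shift.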
It seems like you should be able to save a multiply for the ARM sequence by using vmul.i32 instead of two vmull.u32. Or would that end up more expensive due to extra shuffles?
(The other issue here is that the cost modeling is messed up; currently the vectorizer is underestimating the cost of multiply, so it vectorizes when it isn't actually profitable.)
So you mean this:
U64x2 goodmult(U64x2 top, U64x2 bot) {
    U32x2 d0 = vmovn_u64(top);       // U32x2 d0 = top & 0xFFFFFFFF;
    U32x2 d1 = vshrn_n_u64(top, 32); // U32x2 d1 = top >> 32;
    U32x2 d2 = vmovn_u64(bot);       // U32x2 d2 = bot & 0xFFFFFFFF;
    U32x2 d3 = vshrn_n_u64(bot, 32); // U32x2 d3 = bot >> 32;
    U64x2 q3 = vmull_u32(d2, d0);    // U64x2 q3 = (U64x2)d2 * d0;
    U32x2 d5 = vshrn_n_u64(q3, 32);  // U32x2 d5 = q3 >> 32;
    d5 = vmla_u32(d5, d2, d1);       // d5 += d2 * d1;
    d5 = vmla_u32(d5, d3, d0);       // d5 += d3 * d0;
    // return (q3 & 0xFFFFFFFF) | ((U64x2)d5 << 32);
    return vsliq_n_u64(q3, vmovl_u32(d5), 32);
}
Hmm. Seems a lot slower...
goodmult2:
.fnstart
@ %bb.0:
.save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
.setfp r11, sp, #28
add r11, sp, #28
.pad #4
sub sp, sp, #4
add r12, r11, #8
vmov d19, r2, r3
vld1.64 {d16, d17}, [r12]
vmov d18, r0, r1
vshr.u64 q10, q8, #32
vmov.32 r12, d18[0]
vmov.32 r1, d20[0]
vmov.32 lr, d18[1]
vshr.u64 q11, q9, #32
vmov.32 r3, d16[0]
vmov.32 r0, d22[0]
vmov.32 r2, d16[1]
vmov.32 r4, d22[1]
umull r9, r6, r1, r12
mla lr, r1, lr, r6
umull r10, r1, r0, r3
mla r0, r0, r2, r1
vmov.32 r1, d19[0]
vmov.32 r2, d21[0]
mla r8, r4, r3, r0
vmov.32 r3, d19[1]
umull r4, r7, r2, r1
mla r2, r2, r3, r7
vmov.32 r3, d17[0]
vmov.32 r7, d23[0]
vmov.32 r0, d17[1]
umull r5, r6, r7, r3
mla r0, r7, r0, r6
vmov.32 d25[0], r4
vmov.32 d26[0], r10
vmov.32 d27[0], r5
vmov.32 r4, d23[1]
mla r0, r4, r3, r0
vmov.32 r3, d21[1]
vmov.32 r4, d20[1]
mla r1, r3, r1, r2
vmov.32 d24[0], r9
vmovn.i64 d18, q9
mla r2, r4, r12, lr
vmov.32 d27[1], r0
vmovn.i64 d16, q8
vmull.u32 q8, d16, d18
vmov.32 d26[1], r8
vmov.32 d25[1], r1
vmov.32 d24[1], r2
vadd.i64 q9, q12, q13
vsra.u64 q9, q8, #32
vmov.i64 q10, #0xffffffff
vand q9, q9, q10
vsli.64 q8, q9, #32
vmov r0, r1, d16
vmov r2, r3, d17
sub sp, r11, #28
pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
Good job, Clang. With optimizations like these, who needs intrinsics?
http://imgur.com/a/Hdpj3dm
However, just transcribing the SSE4 sequence seems much faster, even though it does three full widening (32 x 32 -> 64) multiplies.
U64x2 goodmult(U64x2 top, U64x2 bot)
{
    U32x2 topHi = vshrn_n_u64(top, 32);     // U32x2 topHi = top >> 32;
    U32x2 topLo = vmovn_u64(top);           // U32x2 topLo = top & 0xFFFFFFFF;
    U32x2 botHi = vshrn_n_u64(bot, 32);     // U32x2 botHi = bot >> 32;
    U32x2 botLo = vmovn_u64(bot);           // U32x2 botLo = bot & 0xFFFFFFFF;
    U64x2 prod1 = vmull_u32(topHi, botLo);  // U64x2 prod1 = (U64x2)topHi * (U64x2)botLo;
    U64x2 prod2 = vmull_u32(topLo, botLo);  // U64x2 prod2 = (U64x2)topLo * (U64x2)botLo;
    U64x2 prod3 = vmull_u32(botHi, topLo);  // U64x2 prod3 = (U64x2)botHi * (U64x2)topLo;
    prod1 = vaddq_u64(prod1, prod3);        // prod1 += prod3;
    prod1 = vshlq_n_u64(prod1, 32);         // prod1 <<= 32;
    return vaddq_u64(prod1, prod2);         // return prod1 + prod2;
}
Right now my battery is low, so results fluctuate, but I've been getting some good results. (Note: these runs are 100,000 iterations instead of 1,000,000, with the autovectorized variant removed.)
nonvec: 1.217838, result: { 0x8d7a364399c05870, 0x0596b80877b2cf28 }
badmult: 1.727765, result: { 0x8d7a364399c05870, 0x0596b80877b2cf28 }
goodmult: 0.585781, result: { 0x8d7a364399c05870, 0x0596b80877b2cf28 }
goodmult_old: 0.732337, result: { 0x8d7a364399c05870, 0x0596b80877b2cf28 }
Though yeah, the cost model is clearly not working properly, as you can see from Clang literally ignoring my intrinsics.
The first version was a literal translation into NEON of the 64-bit multiply code that gets generated for a normal U64.
I also want to add that even when I used inline asm for the mangled version, it was still a tiny bit slower than the original.
#define goodmult(_top, _bot) ({ \
U64x2 _output;\
asm(" vmovn.i64 d23, %[top]\n\
vshrn.i64 d17, %[top], #32\n\
vmovn.i64 d22, %[bot]\n\
vshrn.i64 d18, %[bot], #32\n\
vmull.u32 %[output], d23, d22\n\
vshrn.i64 d16, %[top], #32\n\
vmla.i32 d16, d23, d18\n\
vmla.i32 d16, d17, d22\n\
vmovl.u32 q12, d16\n\
vsli.64 %[output], q12, #32\n" \
: [output] "=w" (_output) \
: [top] "w" (_top), [bot] "w" (_bot) \
: "d16", "d17", "d18", "q10", "q12", "d20", "d21", "d22", "d23"); \
_output; \
})
Was that (or the SSE4 transcription) what you were suggesting, or am I missing something?
nonvec: 13.608231, result: { 0x044d11583f4213d6, 0x426931c6659c0d76 }
badmult: 21.227451, result: { 0x044d11583f4213d6, 0x426931c6659c0d76 }
goodmult_alt: 16.175385, result: { 0x044d11583f4213d6, 0x426931c6659c0d76 }
goodmult_sse: 9.651844, result: { 0x044d11583f4213d6, 0x426931c6659c0d76 }
goodmult_old: 12.094364, result: { 0x044d11583f4213d6, 0x426931c6659c0d76 }
Yeah, the one based on SSE is much faster.
The inline asm had a typo, it should be:
vshrn.i64 d16, %[output], #32\n\
Instead of
vshrn.i64 d16, %[top], #32\n\
But either way, it is too slow.
Please file a separate bug for the multiply intrinsic issue; we shouldn't be
doing that. (I'm guessing it's not related to the other issues here.)
I was thinking of something more like the following, which is a few
instructions shorter:
#include <arm_neon.h>
typedef int64x2_t U64x2;
typedef int32x2_t U32x2;
typedef int32x4_t U32x4;
U64x2 twomul(U64x2 top, U64x2 bot) {
    U32x2 d0 = vmovn_u64(top);
    U32x2 d2 = vmovn_u64(bot);
    U32x4 top_re = vreinterpretq_s32_u64(top);
    U32x4 bot_re = vrev64q_s32(vreinterpretq_s32_u64(bot));
    U32x4 prod = vmulq_u32(top_re, bot_re);
    U64x2 paired = vpaddlq_s32(prod);
    U64x2 shifted = vshlq_n_s64(paired, 32);
    return vmlal_s32(shifted, d0, d2);
}
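As I read that (my own annotation, not part of the original comment), per 64-bit lane with a from top and b from bot:

    vrev64q_s32  swaps the 32-bit halves within each 64-bit lane of bot
    vmulq_u32    -> { lo32(a)*hi32(b), hi32(a)*lo32(b) } per lane, each truncated to 32 bits
    vpaddlq_s32  -> widens those two 32-bit products and adds them pairwise
    vshlq_n_s64  -> shifts the pair sum into the high half of the lane
    vmlal_s32    -> adds the full widening product lo32(a)*lo32(b)

which is the same lo*lo + (cross terms << 32) decomposition as before, with both cross products handled by a single vmul.i32. (The truncation and the pairwise-add carry both fall out after the << 32, so the result is still correct mod 2^64.)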
OK, so Clang exposed the error in my SSE version, and actually gave a proper optimization using the accumulate instructions.
The main part is only four instructions.
U64x2 goodmult_sse(U64x2 top, U64x2 bot)
{
    U32x2 topHi = vshrn_n_u64(top, 32);      // U32x2 topHi = top >> 32;
    U32x2 topLo = vmovn_u64(top);            // U32x2 topLo = top & 0xFFFFFFFF;
    U32x2 botHi = vshrn_n_u64(bot, 32);      // U32x2 botHi = bot >> 32;
    U32x2 botLo = vmovn_u64(bot);            // U32x2 botLo = bot & 0xFFFFFFFF;
    U64x2 ret64 = vmull_u32(topHi, botLo);   // U64x2 ret64 = (U64x2)topHi * (U64x2)botLo;
    ret64 = vmlal_u32(ret64, botHi, topLo);  // ret64 += (U64x2)botHi * (U64x2)topLo;
    ret64 = vshlq_n_u64(ret64, 32);          // ret64 <<= 32;
    ret64 = vmlal_u32(ret64, botLo, topLo);  // ret64 += (U64x2)botLo * (U64x2)topLo;
    return ret64;
}
One thing that would be a little more difficult to optimize, but is a tiny bit faster: if Clang can replace a vld1q_u64 with vld2_u32, we can just do this:
U64x2 goodmult_sse_interleaved2(const U32x2x2 top, const U32x2x2 bot)
{
    U64x2 ret64 = vmull_u32(top.val[1], bot.val[0]);   // U64x2 ret64 = (U64x2)topHi * (U64x2)botLo;
    ret64 = vmlal_u32(ret64, bot.val[1], top.val[0]);  // ret64 += (U64x2)botHi * (U64x2)topLo;
    ret64 = vshlq_n_u64(ret64, 32);                    // ret64 <<= 32;
    ret64 = vmlal_u32(ret64, bot.val[0], top.val[0]);  // ret64 += (U64x2)botLo * (U64x2)topLo;
    return ret64;
}
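A hypothetical caller, just to show where the U32x2x2 halves would come from (my sketch; it assumes little-endian data in memory and that U32x2x2 is uint32x2x2_t):

U64x2 mult_from_memory(const uint32_t *top_p, const uint32_t *bot_p)
{
    /* vld2 deinterleaves: val[0] = { p[0], p[2] }, i.e. the low words of the
       two 64-bit lanes, and val[1] = { p[1], p[3] }, the high words. */
    U32x2x2 top = vld2_u32(top_p);
    U32x2x2 bot = vld2_u32(bot_p);
    return goodmult_sse_interleaved2(top, bot);
}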
Also, your version is a little slower than the SSE version after the changes.
It also gives a different result.
nonvec: 8.209048, result: { 0x84fb25d13f764255, 0xca365acbd8c3a25b }
badmult: 13.071927, result: { 0x84fb25d13f764255, 0xca365acbd8c3a25b }
twomul: 6.362965, result: { 0x2bb4402bb413a73a, 0x0632ed04715ebf8c }
goodmult_sse: 5.863495, result: { 0x84fb25d13f764255, 0xca365acbd8c3a25b }
goodmult_sse_interleaved: 5.234455, result: { 0x84fb25d13f764255, 0xca365acbd8c3a25b }
goodmult_old: 8.853989, result: { 0x84fb25d13f764255, 0xca365acbd8c3a25b }
By preinterleaving the constants and changing the vld1q_u64 in the loop to vld2_u32, we only need one vmovn/vshrn, which gives us a little over 0.5 s of speedup.
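For reference, the half-interleaved multiply I mean looks roughly like this (my reconstruction, not the actual benchmark code): one operand stays a U64x2 and still needs the vmovn/vshrn pair, the other comes in pre-split.

U64x2 goodmult_sse_interleaved1(const U64x2 top, const U32x2x2 bot)
{
    U32x2 topHi = vshrn_n_u64(top, 32);           // the one remaining vshrn...
    U32x2 topLo = vmovn_u64(top);                 // ...and vmovn
    U64x2 ret64 = vmull_u32(topHi, bot.val[0]);   // ret64  = topHi * botLo
    ret64 = vmlal_u32(ret64, bot.val[1], topLo);  // ret64 += botHi * topLo
    ret64 = vshlq_n_u64(ret64, 32);               // ret64 <<= 32
    ret64 = vmlal_u32(ret64, bot.val[0], topLo);  // ret64 += botLo * topLo
    return ret64;
}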
Looking over my version, I think I accidentally used vmlal_s32 instead of vmlal_u32; that probably explains the correctness issue.
Interesting that twomul is slightly slower, but not a big deal either way, I guess. Probably sensitive to the latency/throughput of specific instructions. (I would guess vmulq_u32 is slow everywhere, but not sure about the other instructions).
For goodmult_sse, I think you could use vmul/vmla instead of vmull/vmlal for the first two multiplies, but it probably doesn't make much difference either way.
0.4 s slower with this base code, but that might be fluctuation.
It might be the carrying or the register swapping.
U64x2 goodmult_sse_interleaved4(const U32x2x2 top, const U32x2x2 bot)
{
    U32x2 ret32 = vmul_u32(top.val[1], bot.val[0]);    // U32x2 ret32 = topHi * botLo;
    ret32 = vmla_u32(ret32, bot.val[1], top.val[0]);   // ret32 += botHi * topLo;
    U64x2 ret64 = vshll_n_u32(ret32, 32);              // U64x2 ret64 = (U64x2)ret32 << 32;
    ret64 = vmlal_u32(ret64, bot.val[0], top.val[0]);  // ret64 += (U64x2)botLo * (U64x2)topLo;
    return ret64;
}
Actually, I looked it up, and vmla, vmlal, vmull, and vmul all have the same timing: two cycles.
I did some calculations (breakdown below):
twomul: 9 cycles
goodmult_sse (0 interleaved): 11 cycles
goodmult_sse (1 interleaved): 9 cycles
goodmult_sse (2 interleaved): 7 cycles
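The breakdown behind those numbers, taking the two cycles above for each multiply and assuming (my assumption) one cycle each for vmovn/vshrn/vrev64/vpaddl/vshl:

    twomul:                  2x vmovn + vrev64 + vmul + vpaddl + vshl + vmlal = 2+1+2+1+1+2 = 9
    goodmult_sse (0 intl.):  2x vmovn + 2x vshrn + vmull + 2x vmlal + vshl    = 2+2+2+4+1   = 11
    goodmult_sse (1 intl.):  vmovn + vshrn + vmull + 2x vmlal + vshl          = 1+1+2+4+1   = 9
    goodmult_sse (2 intl.):  vmull + 2x vmlal + vshl                          = 2+4+1       = 7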
Also, I am not familiar with the codebase, but I have a theory that Clang ignores the 1-cycle cost of vmov and assumes it is free. If we ignore the vmov instructions in badmult, we only use 6 cycles:
umull r7, r5, r1, r0
mla r1, r1, r12, r5
umull r5, r4, r2, r3
mla r2, r2, lr, r4
mla r2, r6, r3, r2
mla r0, r4, r0, r1
Clang's weird decision to ignore my intrinsics makes sense if it ignores vmov.
Filed bug 40032 to track the issue with vmla_u32 getting scalarized, which is
sort of orthogonal to the other issues here due to the way the optimization
passes interact.
There should be basically two tasks here: one, LowerMUL in
lib/Target/ARM/ARMISelLowering.cpp should be changed to emit the suggested
sequence instead of just falling back to scalarization using "return
SDValue();". Two, ARMTTIImpl::getArithmeticInstrCost should be changed to
reflect the actual cost of various multiplies.
This is probably a straightforward task to pick up for someone new to LLVM or to the ARM backend.
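For the second task, I imagine something along these lines (a rough sketch only: the surrounding locals in ARMTTIImpl::getArithmeticInstrCost, ISDOpcode, LT, and ST, and the cost value 7 are my guesses from memory, not a tested patch):

// Inside ARMTTIImpl::getArithmeticInstrCost, once the type has been legalized:
static const CostTblEntry MulCostTbl[] = {
  // A v2i64 multiply currently expands to a long NEON sequence (or gets
  // scalarized), so it must not be modeled as a single cheap operation.
  { ISD::MUL, MVT::v2i64, 7 },
};
if (ST->hasNEON())
  if (const auto *Entry = CostTableLookup(MulCostTbl, ISDOpcode, LT.second))
    return LT.first * Entry->Cost;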
I've opened a bug on GCC; it is here. I linked the GCC bug back to this thread as well.
My new 2011 MacBook Pro came in with a quad-core i7 and an SSD, and now I can compile an X86- and ARM-only LLVM in a reasonable time. My old MacBook literally took all night.
I was able to adapt the X86 code to ARM. Right now it is pretty basic, and it doesn't understand masking, but I presume giving it the fix for that pmuludq bug would make it work.
lib/Target/ARM/ARMISelLowering.cpp:7424:
if (VT == MVT::v2i64) {
  // One optimal way of doing an i64x2 multiply is the exact same way as for SSE.
  // TODO: Implement the faster but less modular way, which is this...
  //   vmovn.i64  topLo, top         @ v2i64 topLo = top & 0xFFFFFFFF;
  //   vmovn.i64  botLo, bot         @ v2i64 botLo = bot & 0xFFFFFFFF;
  //                                 @ v4i32 bot32 = (v4i32) bot;
  //   vrev64.32  botRe, bot         @ v4i32 botRe = (v4i32) { bot32[1], bot32[0], bot32[3], bot32[2] };
  //   vmul.i32   botRe, botRe, top  @ botRe *= (v4i32) top;
  //   vpaddl.u32 botRe, botRe       @ top = (v2i64) { (u64) botRe[0] + botRe[1], (u64) botRe[2] + botRe[3] };
  //   vshl.i64   top, botRe, #32    @ top <<= 32;
  //   vmlal.u32  top, topLo, botLo  @ top += (v2i64) topLo * (v2i64) botLo;
  // and make it so optimizations interleave loads for the first one to avoid vshrn/vmovn.
  SDLoc DL(Op);
  SDValue top = Op.getOperand(0);
  SDValue bot = Op.getOperand(1);
  KnownBits topKnown = DAG.computeKnownBits(top);
  KnownBits botKnown = DAG.computeKnownBits(bot);

  APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
  bool topLoIsZero = LowerBitsMask.isSubsetOf(topKnown.Zero);
  bool botLoIsZero = LowerBitsMask.isSubsetOf(botKnown.Zero);

  APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
  bool topHiIsZero = UpperBitsMask.isSubsetOf(topKnown.Zero);
  bool botHiIsZero = UpperBitsMask.isSubsetOf(botKnown.Zero);

  SDValue topLo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v2i32, top);
  SDValue botLo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v2i32, bot);
  SDValue c32 = DAG.getConstant(32, DL, MVT::i32);
  SDValue Zero = DAG.getConstant(0, DL, VT);

  SDValue topLoBotLo = Zero;
  if (!topLoIsZero && !botLoIsZero)
    topLoBotLo = DAG.getNode(ARMISD::VMULLu, DL, VT, topLo, botLo);

  // Don't go any further if we are only multiplying low bits.
  if (topHiIsZero && botHiIsZero)
    return topLoBotLo;

  SDValue topLoBotHi = Zero;
  if (!topLoIsZero && !botHiIsZero) {
    SDValue botHi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v2i32,
                                DAG.getNode(ARMISD::VSHRu, DL, VT, bot, c32));
    topLoBotHi = DAG.getNode(ARMISD::VMULLu, DL, VT, topLo, botHi);
  }

  SDValue topHiBotLo = Zero;
  if (!topHiIsZero && !botLoIsZero) {
    SDValue topHi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v2i32,
                                DAG.getNode(ARMISD::VSHRu, DL, VT, top, c32));
    topHiBotLo = DAG.getNode(ARMISD::VMULLu, DL, VT, topHi, botLo);
  }

  // (optimized to vmlal.u32)
  SDValue Hi = DAG.getNode(ISD::ADD, DL, VT, topLoBotHi, topHiBotLo);
  Hi = DAG.getNode(ARMISD::VSHL, DL, VT, Hi, c32);

  // (optimized to vmlal.u32)
  return DAG.getNode(ISD::ADD, DL, VT, topLoBotLo, Hi);
}
That gives us the base version:
typedef unsigned long long U64x2 __attribute__((vector_size(16)));

U64x2 mult(U64x2 top, U64x2 bot)
{
    return top * bot;
}
vshrn.i64 d20, q8, #32
vmovn.i64 d21, q9
vmovn.i64 d16, q8
vmull.u32 q11, d21, d20
vshrn.i64 d17, q9, #32
vmlal.u32 q11, d17, d16
vshl.i64 q9, q11, #32
vmlal.u32 q9, d21, d16
We also reduce the number of multiplies when the operands are masked, but it does not remove the masking itself.
The goal is to at least have this
U64x2 mult_lo_lo(U64x2 top, U64x2 bot)
{
    return (top & 0xFFFFFFFF) * (bot & 0xFFFFFFFF);
}
always emit this:
vmovn.i64 topLo, top
vmovn.i64 botLo, bot
vmull.u32 top, topLo, botLo
just like how it reliably generates pmuludq on x86.
Currently, mult_lo_lo generates this:
vmov.i64 q8, #0xffffffff
vand q9, q9, q8
vand q8, q10, q8
vmovn.i64 d18, q9
vmovn.i64 d16, q8
vmull.u32 q8, d16, d18
I am presuming that this would generate the twomul code, but I don't know how to emit vpaddl. (The block goes immediately after c32 is declared; a possible way to get the vpaddl follows the block.)
if (!topLoIsZero && !botLoIsZero && !topHiIsZero && !botHiIsZero) {
  bot = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, bot);
  bot = DAG.getNode(ARMISD::VREV64, DL, MVT::v4i32, bot);
  top = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, top);
  bot = DAG.getNode(ISD::MUL, DL, MVT::v4i32, bot, top);    // vmul.i32
  top = DAG.getNode(ARMISD::VPADDLu, DL, MVT::v2i64, bot);  // pseudocode
  top = DAG.getNode(ARMISD::VSHL, DL, VT, top, c32);
  topLo = DAG.getNode(ARMISD::VMULLu, DL, VT, topLo, botLo);
  return DAG.getNode(ISD::ADD, DL, VT, top, topLo);
}
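One possible way to get the vpaddl (an assumption on my part, modeled on how other NEON-only operations are emitted elsewhere in ARMISelLowering.cpp, since there is no ARMISD::VPADDLu node): go through the target intrinsic, which the existing .td patterns already select to vpaddl.u32:

// Replaces the "pseudocode" line above; a sketch, not a tested patch.
top = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::v2i64,
                  DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL, MVT::i32),
                  bot);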
My patch so far: https://reviews.llvm.org/D56118