nschaeff opened this issue 8 months ago (status: Open)
I think the code generated by clang 18.1.0 is good enough:
```asm
rescale_x4(double*, double const*, int): # @rescale_x4(double*, double const*, int)
test edx, edx
jle .LBB0_8
mov eax, edx
cmp edx, 4
jae .LBB0_3
xor ecx, ecx
jmp .LBB0_6
.LBB0_3:
mov ecx, eax
and ecx, 2147483644
mov edx, eax
shr edx, 2
and edx, 536870911
shl rdx, 5
xor r8d, r8d
.LBB0_4: # =>This Inner Loop Header: Depth=1
vmovupd ymm0, ymmword ptr [rsi + r8]
vmovupd ymm1, ymmword ptr [rdi + 4*r8]
vmovupd ymm2, ymmword ptr [rdi + 4*r8 + 32]
vmovupd ymm3, ymmword ptr [rdi + 4*r8 + 64]
vmovupd ymm4, ymmword ptr [rdi + 4*r8 + 96]
vperm2f128 ymm5, ymm1, ymm3, 32 # ymm5 = ymm1[0,1],ymm3[0,1]
vperm2f128 ymm6, ymm2, ymm4, 32 # ymm6 = ymm2[0,1],ymm4[0,1]
vperm2f128 ymm1, ymm1, ymm3, 49 # ymm1 = ymm1[2,3],ymm3[2,3]
vperm2f128 ymm2, ymm2, ymm4, 49 # ymm2 = ymm2[2,3],ymm4[2,3]
vunpcklpd ymm3, ymm5, ymm6 # ymm3 = ymm5[0],ymm6[0],ymm5[2],ymm6[2]
vunpcklpd ymm4, ymm1, ymm2 # ymm4 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
vunpckhpd ymm5, ymm5, ymm6 # ymm5 = ymm5[1],ymm6[1],ymm5[3],ymm6[3]
vunpckhpd ymm1, ymm1, ymm2 # ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
vmulpd ymm2, ymm0, ymm3
vmulpd ymm3, ymm0, ymm5
vmulpd ymm4, ymm0, ymm4
vmulpd ymm0, ymm0, ymm1
vinsertf128 ymm1, ymm2, xmm4, 1
vinsertf128 ymm5, ymm3, xmm0, 1
vperm2f128 ymm2, ymm2, ymm4, 49 # ymm2 = ymm2[2,3],ymm4[2,3]
vperm2f128 ymm0, ymm3, ymm0, 49 # ymm0 = ymm3[2,3],ymm0[2,3]
vunpcklpd ymm3, ymm1, ymm5 # ymm3 = ymm1[0],ymm5[0],ymm1[2],ymm5[2]
vunpcklpd ymm4, ymm2, ymm0 # ymm4 = ymm2[0],ymm0[0],ymm2[2],ymm0[2]
vunpckhpd ymm1, ymm1, ymm5 # ymm1 = ymm1[1],ymm5[1],ymm1[3],ymm5[3]
vunpckhpd ymm0, ymm2, ymm0 # ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
vmovupd ymmword ptr [rdi + 4*r8 + 64], ymm4
vmovupd ymmword ptr [rdi + 4*r8 + 96], ymm0
vmovupd ymmword ptr [rdi + 4*r8], ymm3
vmovupd ymmword ptr [rdi + 4*r8 + 32], ymm1
add r8, 32
cmp rdx, r8
jne .LBB0_4
cmp rcx, rax
je .LBB0_8
.LBB0_6:
mov rdx, rcx
shl rdx, 5
add rdi, rdx
.LBB0_7: # =>This Inner Loop Header: Depth=1
vbroadcastsd ymm0, qword ptr [rsi + 8*rcx]
vmulpd ymm0, ymm0, ymmword ptr [rdi]
vmovupd ymmword ptr [rdi], ymm0
inc rcx
add rdi, 32
cmp rax, rcx
jne .LBB0_7
.LBB0_8:
vzeroupper
ret
```
It seems this problem is fixed in clang 15+, @nschaeff.
This is exactly what I call bad: lots of useless shuffles, 16 shuffle instructions for 4 multiplications. Clang 16+ is a little less bad than 14 and 15, but still very bad compared to clang 13.0.1, which I paste below: zero shuffles (only broadcasts directly from memory, which are free). I think it is not unreasonable to expect newer versions to improve the generated code.
```asm
rescale_x4(double*, double const*, int): # @rescale_x4(double*, double const*, int)
test edx, edx
jle .LBB0_8
mov ecx, edx
lea rax, [rcx - 1]
mov r8d, ecx
and r8d, 3
cmp rax, 3
jae .LBB0_3
xor edx, edx
jmp .LBB0_5
.LBB0_3:
and ecx, -4
lea rax, [rdi + 96]
xor edx, edx
.LBB0_4: # =>This Inner Loop Header: Depth=1
vbroadcastsd ymm0, qword ptr [rsi + 8*rdx]
vmulpd ymm0, ymm0, ymmword ptr [rax - 96]
vmovupd ymmword ptr [rax - 96], ymm0
vbroadcastsd ymm0, qword ptr [rsi + 8*rdx + 8]
vmulpd ymm0, ymm0, ymmword ptr [rax - 64]
vmovupd ymmword ptr [rax - 64], ymm0
vbroadcastsd ymm0, qword ptr [rsi + 8*rdx + 16]
vmulpd ymm0, ymm0, ymmword ptr [rax - 32]
vmovupd ymmword ptr [rax - 32], ymm0
vbroadcastsd ymm0, qword ptr [rsi + 8*rdx + 24]
vmulpd ymm0, ymm0, ymmword ptr [rax]
vmovupd ymmword ptr [rax], ymm0
add rdx, 4
sub rax, -128
cmp rcx, rdx
jne .LBB0_4
.LBB0_5:
test r8, r8
je .LBB0_8
lea rax, [rsi + 8*rdx]
shl rdx, 5
add rdi, rdx
shl r8, 3
xor ecx, ecx
.LBB0_7: # =>This Inner Loop Header: Depth=1
vbroadcastsd ymm0, qword ptr [rax + rcx]
vmulpd ymm0, ymm0, ymmword ptr [rdi + 4*rcx]
vmovupd ymmword ptr [rdi + 4*rcx], ymm0
add rcx, 8
cmp r8, rcx
jne .LBB0_7
.LBB0_8:
vzeroupper
ret
```
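To make the comparison concrete, the per-element work in the clang 13.0.1 loop above corresponds roughly to the following AVX intrinsic sequence (an illustrative sketch, not code from the issue): one broadcast straight from memory, one multiply, one store, and no shuffles at all.

```c++
#include <immintrin.h>

// Rough intrinsics equivalent of one iteration of the clang 13.0.1 loop body
// above. Illustrative sketch only; the function name and parameters are
// hypothetical, not taken from the issue.
void rescale_one(double* data, const double* factor, long i) {
    __m256d f = _mm256_broadcast_sd(&factor[i]);        // vbroadcastsd from memory
    __m256d d = _mm256_loadu_pd(&data[4 * i]);          // load 4 consecutive doubles
    _mm256_storeu_pd(&data[4 * i], _mm256_mul_pd(d, f)); // vmulpd + store, no shuffles
}
```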
The reason for the change is that LoopVectorize is now able to vectorize this loop. In this particular case, SLP vectorization alone would already be sufficient to make full use of the available vector registers, so the interleaving shuffles introduced by LoopVectorize only add overhead, which is why the codegen is worse now.
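One quick way to check this locally (an assumption about the effect, not something verified in this thread) is to disable the loop vectorizer so that only the SLP vectorizer can act on the 4-wide inner body, either with `-fno-vectorize` globally or with a per-loop pragma:

```c++
// Hypothetical experiment, not from the thread: keep SLP vectorization but
// disable LoopVectorize for this one loop, then compare the generated code.
void rescale_x4(double* data, const double* factor, int n) {
    #pragma clang loop vectorize(disable)  // applies to the following loop only
    for (int i = 0; i < n; ++i)
        for (int k = 0; k < 4; ++k)
            data[4 * i + k] *= factor[i];  // assumed loop shape, matching the issue description
}
```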
Sketched a potential improvement in https://github.com/llvm/llvm-project/pull/106441
A simple loop multiplying two arrays with different multiplicity fails to vectorize efficiently on clang 14+, while it worked with clang 13.0.1. The loop is the following, where 4 consecutive values in `data` are multiplied by the same factor:
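The loop itself is not reproduced in this extract; a minimal reconstruction consistent with that description and with the `rescale_x4(double*, double const*, int)` signature seen in the assembly above would be (the exact source is in the godbolt link below):

```c++
// Hypothetical reconstruction of the reported loop (see the godbolt link for
// the original source): each factor[i] scales 4 consecutive values of data.
void rescale_x4(double* data, const double* factor, int n) {
    for (int i = 0; i < n; ++i)
        for (int k = 0; k < 4; ++k)
            data[4 * i + k] *= factor[i];
}
```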
See on godbolt the crazy code generated by clang 14+, while clang 13.0.1 correctly uses `vbroadcastsd`: https://godbolt.org/z/desh4E49o