Open Quuxplusone opened 4 years ago
Bugzilla Link | PR47436 |
Status | NEW |
Importance | P enhancement |
Reported by | David Bolvansky (david.bolvansky@gmail.com) |
Reported on | 2020-09-06 06:14:39 -0700 |
Last modified on | 2021-05-11 06:01:45 -0700 |
Version | trunk |
Hardware | PC Linux |
CC | craig.topper@gmail.com, dtemirbulatov@gmail.com, florian_hahn@apple.com, llvm-bugs@lists.llvm.org, llvm-dev@redking.me.uk, spatel+llvm@rotateright.com |
Fixed by commit(s) | |
Attachments | |
Blocks | |
Blocked by | |
See also | PR31572, PR47554, PR47553 |
For plain -O3:
Clang:
test(unsigned int): # @test(unsigned int)
mov eax, edi
mov byte ptr [rip + data], ah
lea ecx, [rax + 1]
mov byte ptr [rip + data+1], ch
lea ecx, [rax + 2]
mov byte ptr [rip + data+2], ch
lea ecx, [rax + 3]
mov byte ptr [rip + data+3], ch
lea ecx, [rax + 4]
mov byte ptr [rip + data+4], ch
lea ecx, [rax + 5]
mov byte ptr [rip + data+5], ch
lea ecx, [rax + 6]
mov byte ptr [rip + data+6], ch
lea ecx, [rax + 7]
mov byte ptr [rip + data+7], ch
lea ecx, [rax + 8]
mov byte ptr [rip + data+8], ch
lea ecx, [rax + 9]
mov byte ptr [rip + data+9], ch
lea ecx, [rax + 10]
mov byte ptr [rip + data+10], ch
lea ecx, [rax + 11]
mov byte ptr [rip + data+11], ch
lea ecx, [rax + 12]
mov byte ptr [rip + data+12], ch
lea ecx, [rax + 13]
mov byte ptr [rip + data+13], ch
lea ecx, [rax + 14]
mov byte ptr [rip + data+14], ch
add eax, 15
mov byte ptr [rip + data+15], ah
ret
GCC:
test(unsigned int):
movd xmm0, edi
movdqa xmm1, XMMWORD PTR .LC0[rip]
movdqa xmm2, XMMWORD PTR .LC2[rip]
punpcklwd xmm0, xmm0
pshufd xmm0, xmm0, 0
paddw xmm1, xmm0
paddw xmm0, XMMWORD PTR .LC1[rip]
psrlw xmm1, 8
psrlw xmm0, 8
pand xmm1, xmm2
pand xmm2, xmm0
movdqa xmm0, xmm1
packuswb xmm0, xmm2
movaps XMMWORD PTR data[rip], xmm0
ret
GCC's code is about 2x faster than Clang's (measured on an Intel i7-4720HQ):
GCC -O3: 0m0,044s
Clang -O3: 0m0,096s
GCC -O3 -march=haswell: 0m0,040s
Clang -O3 -march=haswell: 0m0,191s :(
This is yet another case similar to [Bug #31572], where we fail to vectorize
because the operation has been folded away for one or more of the elements
(elt + 0, elt << 0, elt * 1, etc.), so the lanes no longer look uniform.