Open davidbolvansky opened 4 years ago
mentioned in issue llvm/llvm-bugzilla-archive#50256
This is yet another case similar to [Bug #31572], where we fail to vectorize because the operation has been folded away for one or more of the elements (elt + 0, elt << 0, elt * 1, etc.).
For plain -O3:
Clang:
test(unsigned int): # @test(unsigned int)
mov eax, edi
mov byte ptr [rip + data], ah
lea ecx, [rax + 1]
mov byte ptr [rip + data+1], ch
lea ecx, [rax + 2]
mov byte ptr [rip + data+2], ch
lea ecx, [rax + 3]
mov byte ptr [rip + data+3], ch
lea ecx, [rax + 4]
mov byte ptr [rip + data+4], ch
lea ecx, [rax + 5]
mov byte ptr [rip + data+5], ch
lea ecx, [rax + 6]
mov byte ptr [rip + data+6], ch
lea ecx, [rax + 7]
mov byte ptr [rip + data+7], ch
lea ecx, [rax + 8]
mov byte ptr [rip + data+8], ch
lea ecx, [rax + 9]
mov byte ptr [rip + data+9], ch
lea ecx, [rax + 10]
mov byte ptr [rip + data+10], ch
lea ecx, [rax + 11]
mov byte ptr [rip + data+11], ch
lea ecx, [rax + 12]
mov byte ptr [rip + data+12], ch
lea ecx, [rax + 13]
mov byte ptr [rip + data+13], ch
lea ecx, [rax + 14]
mov byte ptr [rip + data+14], ch
add eax, 15
mov byte ptr [rip + data+15], ah
ret
GCC:
test(unsigned int):
movd xmm0, edi
movdqa xmm1, XMMWORD PTR .LC0[rip]
movdqa xmm2, XMMWORD PTR .LC2[rip]
punpcklwd xmm0, xmm0
pshufd xmm0, xmm0, 0
paddw xmm1, xmm0
paddw xmm0, XMMWORD PTR .LC1[rip]
psrlw xmm1, 8
psrlw xmm0, 8
pand xmm1, xmm2
pand xmm2, xmm0
movdqa xmm0, xmm1
packuswb xmm0, xmm2
movaps XMMWORD PTR data[rip], xmm0
ret
GCC's code is about 2x faster than Clang's at plain -O3 (Intel i7-4720HQ).
GCC -O3: 0m0,044s Clang -O3: 0m0,096s
GCC -O3 -march=haswell: 0m0,040s Clang -O3 -march=haswell: 0m0,191s :(
Extended Description
Clang -O3 -march=haswell:
GCC -O3 -march=haswell:
https://gcc.godbolt.org/z/zP4nWM