Quuxplusone / LLVMBugzillaTest

0 stars 0 forks source link

Suboptimal vectorization #46405

Open Quuxplusone opened 4 years ago

Quuxplusone commented 4 years ago
Bugzilla Link PR47436
Status NEW
Importance P enhancement
Reported by David Bolvansky (david.bolvansky@gmail.com)
Reported on 2020-09-06 06:14:39 -0700
Last modified on 2021-05-11 06:01:45 -0700
Version trunk
Hardware PC Linux
CC craig.topper@gmail.com, dtemirbulatov@gmail.com, florian_hahn@apple.com, llvm-bugs@lists.llvm.org, llvm-dev@redking.me.uk, spatel+llvm@rotateright.com
Fixed by commit(s)
Attachments
Blocks
Blocked by
See also PR31572, PR47554, PR47553
#include<stdint.h>

uint8_t data[16];
 void test(unsigned i)
{
    unsigned j;
    for (j = 0; j < 16; j++)
        data[j] = (i + j) >> 8;
}

Clang -O3 -march=haswell:
test(unsigned int):                               # @test(unsigned int)
        lea     eax, [rdi + 12]
        vmovd   xmm0, eax
        lea     eax, [rdi + 13]
        vpinsrd xmm0, xmm0, eax, 1
        lea     eax, [rdi + 14]
        vpinsrd xmm0, xmm0, eax, 2
        lea     eax, [rdi + 15]
        vpinsrd xmm0, xmm0, eax, 3
        lea     eax, [rdi + 8]
        vmovd   xmm1, eax
        lea     eax, [rdi + 9]
        vpinsrd xmm1, xmm1, eax, 1
        lea     eax, [rdi + 10]
        vpinsrd xmm1, xmm1, eax, 2
        lea     eax, [rdi + 11]
        vpinsrd xmm1, xmm1, eax, 3
        lea     eax, [rdi + 4]
        vmovd   xmm2, eax
        lea     eax, [rdi + 5]
        vpinsrd xmm2, xmm2, eax, 1
        lea     eax, [rdi + 6]
        vpinsrd xmm2, xmm2, eax, 2
        lea     eax, [rdi + 7]
        vpinsrd xmm2, xmm2, eax, 3
        lea     eax, [rdi + 1]
        vmovd   xmm3, edi
        vpinsrd xmm3, xmm3, eax, 1
        lea     eax, [rdi + 2]
        vpinsrd xmm3, xmm3, eax, 2
        lea     eax, [rdi + 3]
        vpinsrd xmm3, xmm3, eax, 3
        vinserti128     ymm0, ymm1, xmm0, 1
        vinserti128     ymm1, ymm3, xmm2, 1
        vmovdqa ymm2, ymmword ptr [rip + .LCPI0_0] # ymm2 = [1,2,5,6,9,10,13,14,9,10,13,14,13,14,15,128,17,18,21,22,25,26,29,30,25,26,29,30,29,30,31,128]
        vpshufb ymm1, ymm1, ymm2
        vpermq  ymm1, ymm1, 232                 # ymm1 = ymm1[0,2,2,3]
        vmovdqa xmm3, xmmword ptr [rip + .LCPI0_1] # xmm3 = [255,255,255,255,255,255,255,255]
        vpand   xmm1, xmm1, xmm3
        vpshufb ymm0, ymm0, ymm2
        vpermq  ymm0, ymm0, 232                 # ymm0 = ymm0[0,2,2,3]
        vpand   xmm0, xmm0, xmm3
        vpackuswb       xmm0, xmm1, xmm0
        vmovdqa xmmword ptr [rip + data], xmm0
        vzeroupper
        ret

GCC -O3 -march=haswell:
test(unsigned int):
        vmovd   xmm0, edi
        vmovdqa xmm2, XMMWORD PTR .LC2[rip]
        vpbroadcastw    xmm0, xmm0
        vpaddw  xmm1, xmm0, XMMWORD PTR .LC0[rip]
        vpaddw  xmm0, xmm0, XMMWORD PTR .LC1[rip]
        vpsrlw  xmm1, xmm1, 8
        vpsrlw  xmm0, xmm0, 8
        vpand   xmm1, xmm2, xmm1
        vpand   xmm2, xmm2, xmm0
        vpackuswb       xmm0, xmm1, xmm2
        vmovdqa XMMWORD PTR data[rip], xmm0
        ret

https://gcc.godbolt.org/z/zP4nWM
Quuxplusone commented 4 years ago
For plain -O3:

Clang:
test(unsigned int): # @test(unsigned int)
  mov eax, edi
  mov byte ptr [rip + data], ah
  lea ecx, [rax + 1]
  mov byte ptr [rip + data+1], ch
  lea ecx, [rax + 2]
  mov byte ptr [rip + data+2], ch
  lea ecx, [rax + 3]
  mov byte ptr [rip + data+3], ch
  lea ecx, [rax + 4]
  mov byte ptr [rip + data+4], ch
  lea ecx, [rax + 5]
  mov byte ptr [rip + data+5], ch
  lea ecx, [rax + 6]
  mov byte ptr [rip + data+6], ch
  lea ecx, [rax + 7]
  mov byte ptr [rip + data+7], ch
  lea ecx, [rax + 8]
  mov byte ptr [rip + data+8], ch
  lea ecx, [rax + 9]
  mov byte ptr [rip + data+9], ch
  lea ecx, [rax + 10]
  mov byte ptr [rip + data+10], ch
  lea ecx, [rax + 11]
  mov byte ptr [rip + data+11], ch
  lea ecx, [rax + 12]
  mov byte ptr [rip + data+12], ch
  lea ecx, [rax + 13]
  mov byte ptr [rip + data+13], ch
  lea ecx, [rax + 14]
  mov byte ptr [rip + data+14], ch
  add eax, 15
  mov byte ptr [rip + data+15], ah
  ret

GCC:
test(unsigned int):
        movd    xmm0, edi
        movdqa  xmm1, XMMWORD PTR .LC0[rip]
        movdqa  xmm2, XMMWORD PTR .LC2[rip]
        punpcklwd       xmm0, xmm0
        pshufd  xmm0, xmm0, 0
        paddw   xmm1, xmm0
        paddw   xmm0, XMMWORD PTR .LC1[rip]
        psrlw   xmm1, 8
        psrlw   xmm0, 8
        pand    xmm1, xmm2
        pand    xmm2, xmm0
        movdqa  xmm0, xmm1
        packuswb        xmm0, xmm2
        movaps  XMMWORD PTR data[rip], xmm0
        ret

GCC's code is 2x faster than Clang's (measured on an Intel i7-4720HQ).

GCC -O3:    0m0,044s
Clang -O3:  0m0,096s

GCC -O3 -march=haswell:     0m0,040s
Clang  -O3 -march=haswell:  0m0,191s :(
Quuxplusone commented 4 years ago
This is yet another case similar to [Bug #31572], where we fail to vectorize
because the operation has been folded away for one or more of the elements
(elt + 0, elt << 0, elt * 1, etc.).