llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.
http://llvm.org
Other
29.24k stars 12.07k forks source link

Suboptimal vectorization #46780

Open davidbolvansky opened 4 years ago

davidbolvansky commented 4 years ago
Bugzilla Link 47436
Version trunk
OS Linux
CC @topperc,@dtemirbulatov,@fhahn,@RKSimon,@rotateright

Extended Description

#include<stdint.h>

uint8_t data[16];
 /* Fill data[0..15]: data[j] receives (i + j) >> 8 converted to uint8_t,
  * i.e. bits 8..15 of the unsigned sum i + j. The trip count is a
  * compile-time constant (16), so the loop can be fully unrolled/vectorized. */
 void test(unsigned i)
{
    unsigned j;
    for (j = 0; j < 16; j++)
        data[j] = (i + j) >> 8; /* store to uint8_t keeps only the low 8 bits of the shifted value */
}

Clang -O3 -march=haswell:

# Clang's AVX2 lowering of test(): every sum i + j is first formed as a scalar
# with LEA and inserted into a vector one dword at a time; only afterwards are
# the 16 dwords narrowed to bytes. i is read from edi/rdi.
test(unsigned int):                               # @test(unsigned int)
        # xmm0 = { i+12, i+13, i+14, i+15 } as 32-bit lanes
        lea     eax, [rdi + 12]
        vmovd   xmm0, eax
        lea     eax, [rdi + 13]
        vpinsrd xmm0, xmm0, eax, 1
        lea     eax, [rdi + 14]
        vpinsrd xmm0, xmm0, eax, 2
        lea     eax, [rdi + 15]
        vpinsrd xmm0, xmm0, eax, 3
        # xmm1 = { i+8, i+9, i+10, i+11 }
        lea     eax, [rdi + 8]
        vmovd   xmm1, eax
        lea     eax, [rdi + 9]
        vpinsrd xmm1, xmm1, eax, 1
        lea     eax, [rdi + 10]
        vpinsrd xmm1, xmm1, eax, 2
        lea     eax, [rdi + 11]
        vpinsrd xmm1, xmm1, eax, 3
        # xmm2 = { i+4, i+5, i+6, i+7 }
        lea     eax, [rdi + 4]
        vmovd   xmm2, eax
        lea     eax, [rdi + 5]
        vpinsrd xmm2, xmm2, eax, 1
        lea     eax, [rdi + 6]
        vpinsrd xmm2, xmm2, eax, 2
        lea     eax, [rdi + 7]
        vpinsrd xmm2, xmm2, eax, 3
        # xmm3 = { i, i+1, i+2, i+3 }; lane 0 is taken straight from edi (the "+ 0" case)
        lea     eax, [rdi + 1]
        vmovd   xmm3, edi
        vpinsrd xmm3, xmm3, eax, 1
        lea     eax, [rdi + 2]
        vpinsrd xmm3, xmm3, eax, 2
        lea     eax, [rdi + 3]
        vpinsrd xmm3, xmm3, eax, 3
        # Concatenate halves: ymm0 = i+8 .. i+15, ymm1 = i .. i+7 (eight dwords each)
        vinserti128     ymm0, ymm1, xmm0, 1
        vinserti128     ymm1, ymm3, xmm2, 1
        # Shuffle mask: the first eight result bytes of each 128-bit lane are
        # source bytes 1,2 of every dword, i.e. bits 8..23 of each sum as a word.
        vmovdqa ymm2, ymmword ptr [rip + .LCPI0_0] # ymm2 = [1,2,5,6,9,10,13,14,9,10,13,14,13,14,15,128,17,18,21,22,25,26,29,30,25,26,29,30,29,30,31,128]
        vpshufb ymm1, ymm1, ymm2
        # Bring qword 2 (the upper lane's four words) next to qword 0 so the
        # low 128 bits hold all eight words.
        vpermq  ymm1, ymm1, 232                 # ymm1 = ymm1[0,2,2,3]
        # Keep only the low byte of each word (bits 8..15 of the original sum)
        vmovdqa xmm3, xmmword ptr [rip + .LCPI0_1] # xmm3 = [255,255,255,255,255,255,255,255]
        vpand   xmm1, xmm1, xmm3
        # Same narrowing sequence for the upper eight elements
        vpshufb ymm0, ymm0, ymm2
        vpermq  ymm0, ymm0, 232                 # ymm0 = ymm0[0,2,2,3]
        vpand   xmm0, xmm0, xmm3
        # Pack words to bytes: xmm1 supplies data[0..7], xmm0 supplies data[8..15]
        vpackuswb       xmm0, xmm1, xmm0
        vmovdqa xmmword ptr [rip + data], xmm0
        # ymm registers were written above; clear their upper halves before returning
        vzeroupper
        ret

GCC -O3 -march=haswell:

# GCC's AVX2 code for test(): the whole loop is done in 16-bit vector lanes.
# Only bits 8..15 of i + j are stored, and those depend solely on the low
# 16 bits of the sum, so word arithmetic is sufficient.
test(unsigned int):
        vmovd   xmm0, edi
        vmovdqa xmm2, XMMWORD PTR .LC2[rip]
        # Replicate the low 16 bits of i into all eight word lanes
        vpbroadcastw    xmm0, xmm0
        # Add two constant vectors (.LC0/.LC1 are not shown in the report;
        # presumably the j offsets 0..7 and 8..15 -- verify against the full output)
        vpaddw  xmm1, xmm0, XMMWORD PTR .LC0[rip]
        vpaddw  xmm0, xmm0, XMMWORD PTR .LC1[rip]
        # Logical right shift of every word by 8
        vpsrlw  xmm1, xmm1, 8
        vpsrlw  xmm0, xmm0, 8
        # AND with .LC2 (not shown; presumably a per-word 0x00ff mask)
        vpand   xmm1, xmm2, xmm1
        vpand   xmm2, xmm2, xmm0
        # Pack the two word vectors into 16 bytes and store them to data
        vpackuswb       xmm0, xmm1, xmm2
        vmovdqa XMMWORD PTR data[rip], xmm0
        ret

https://gcc.godbolt.org/z/zP4nWM

davidbolvansky commented 2 years ago

mentioned in issue llvm/llvm-bugzilla-archive#50256

RKSimon commented 4 years ago

This is yet another case similar to [Bug #31572], where we fail to vectorize because the operation has been folded away for one or more of the elements (elt + 0, elt << 0, elt * 1, etc.).

davidbolvansky commented 4 years ago

For plain -O3:

Clang:

# Clang's plain -O3 output: the loop is fully unrolled into 16 scalar byte
# stores; no vector instructions are used.
test(unsigned int): # @test(unsigned int)
  mov eax, edi
  # j = 0: ah is bits 8..15 of i
  mov byte ptr [rip + data], ah
  # j = 1..14: ecx = i + j, then ch (bits 8..15 of ecx) is stored to data[j]
  lea ecx, [rax + 1]
  mov byte ptr [rip + data+1], ch
  lea ecx, [rax + 2]
  mov byte ptr [rip + data+2], ch
  lea ecx, [rax + 3]
  mov byte ptr [rip + data+3], ch
  lea ecx, [rax + 4]
  mov byte ptr [rip + data+4], ch
  lea ecx, [rax + 5]
  mov byte ptr [rip + data+5], ch
  lea ecx, [rax + 6]
  mov byte ptr [rip + data+6], ch
  lea ecx, [rax + 7]
  mov byte ptr [rip + data+7], ch
  lea ecx, [rax + 8]
  mov byte ptr [rip + data+8], ch
  lea ecx, [rax + 9]
  mov byte ptr [rip + data+9], ch
  lea ecx, [rax + 10]
  mov byte ptr [rip + data+10], ch
  lea ecx, [rax + 11]
  mov byte ptr [rip + data+11], ch
  lea ecx, [rax + 12]
  mov byte ptr [rip + data+12], ch
  lea ecx, [rax + 13]
  mov byte ptr [rip + data+13], ch
  lea ecx, [rax + 14]
  mov byte ptr [rip + data+14], ch
  # j = 15: eax is no longer needed afterwards, so the sum is formed in place
  add eax, 15
  mov byte ptr [rip + data+15], ah
  ret

GCC:

# GCC's plain -O3 (SSE2) output: same word-lane strategy as its AVX2 code,
# expressed with two-operand legacy SSE instructions.
test(unsigned int):
        movd    xmm0, edi
        movdqa  xmm1, XMMWORD PTR .LC0[rip]
        movdqa  xmm2, XMMWORD PTR .LC2[rip]
        # Broadcast the low word of i: punpcklwd duplicates it within dword 0,
        # then pshufd with selector 0 copies dword 0 to all four dwords.
        punpcklwd       xmm0, xmm0
        pshufd  xmm0, xmm0, 0
        # Add the constant vectors (.LC0/.LC1 are not shown in the report;
        # presumably the j offsets 0..7 and 8..15 -- verify against the full output)
        paddw   xmm1, xmm0
        paddw   xmm0, XMMWORD PTR .LC1[rip]
        # Logical right shift of every word by 8
        psrlw   xmm1, 8
        psrlw   xmm0, 8
        # AND with .LC2 (not shown; presumably a per-word 0x00ff mask)
        pand    xmm1, xmm2
        pand    xmm2, xmm0
        # Pack both word vectors into 16 bytes and store them to data
        movdqa  xmm0, xmm1
        packuswb        xmm0, xmm2
        movaps  XMMWORD PTR data[rip], xmm0
        ret

GCC's code is 2x faster than Clang's (Intel i7 4720HQ).

GCC -O3: 0m0,044s
Clang -O3: 0m0,096s

GCC -O3 -march=haswell: 0m0,040s
Clang -O3 -march=haswell: 0m0,191s :(