Quuxplusone / LLVMBugzillaTest

0 stars 0 forks source link

Missing autovectorization for 8-byte vectors #41380

Open Quuxplusone opened 5 years ago

Quuxplusone commented 5 years ago
Bugzilla Link PR42410
Status NEW
Importance P enhancement
Reported by David Bolvansky (david.bolvansky@gmail.com)
Reported on 2019-06-26 11:24:30 -0700
Last modified on 2019-06-26 12:16:24 -0700
Version trunk
Hardware PC Linux
CC craig.topper@gmail.com, hideki.saito@intel.com, llvm-bugs@lists.llvm.org, llvm-dev@redking.me.uk, spatel+llvm@rotateright.com
Fixed by commit(s)
Attachments
Blocks
Blocked by
See also
/* Reproducer: adds the first 8 bytes of arr2, element by element, into the
   first 8 bytes of arr1. Both pointers are __restrict-qualified, i.e. the
   two arrays are declared not to alias. */
void foo (char *__restrict arr1, char *__restrict arr2)
{
  /* Constant trip count of 8 one-byte elements: 8 bytes in total. */
  for (int i = 0; i < 8; i++)
    arr1[i] += arr2[i];
}

Clang -O3 -march=skylake

# Clang listing: the 8-iteration loop is fully unrolled into eight scalar
# pairs of "load one byte into al" / "add al to one byte in memory".
# No vector instructions are used.
# Loads read through rsi and the read-modify-write adds go through rdi,
# which matches arr2 / arr1 in the C source (SysV AMD64 argument order).
foo(char*, char*):                             # @foo(char*, char*)
        # byte 0: al = [rsi]; [rdi] += al
        mov     al, byte ptr [rsi]
        add     byte ptr [rdi], al
        # bytes 1..7: the same pair repeated with displacements +1 .. +7
        mov     al, byte ptr [rsi + 1]
        add     byte ptr [rdi + 1], al
        mov     al, byte ptr [rsi + 2]
        add     byte ptr [rdi + 2], al
        mov     al, byte ptr [rsi + 3]
        add     byte ptr [rdi + 3], al
        mov     al, byte ptr [rsi + 4]
        add     byte ptr [rdi + 4], al
        mov     al, byte ptr [rsi + 5]
        add     byte ptr [rdi + 5], al
        mov     al, byte ptr [rsi + 6]
        add     byte ptr [rdi + 6], al
        mov     al, byte ptr [rsi + 7]
        add     byte ptr [rdi + 7], al
        ret

ICC -O3 -march=skylake

# ICC listing: the whole loop is replaced by straight-line vector code:
# one 64-bit load from each array, one packed byte add, one 64-bit store.
foo(char*, char*):
        # xmm0 = 8 bytes at [rdi], xmm1 = 8 bytes at [rsi]
        vmovq     xmm0, QWORD PTR [rdi]                         #6.5
        vmovq     xmm1, QWORD PTR [rsi]                         #6.16
        # xmm2 = byte-wise xmm0 + xmm1
        vpaddb    xmm2, xmm0, xmm1                              #6.5
        # write the low 8 bytes of the sum back to [rdi]
        vmovq     QWORD PTR [rdi], xmm2                         #6.5
        ret
Quuxplusone commented 5 years ago
# Second Clang listing (presumably the -fno-unroll-loops build mentioned in
# the text that follows it): the body is vectorized to 8-byte vmovq / vpaddb /
# vmovq, but a loop skeleton around it is still emitted.
foo(char*, char*):                             # @foo(char*, char*)
        # rax = 0: byte offset used to index both arrays
        xor     eax, eax
.LBB0_1:                                # =>This Inner Loop Header: Depth=1
        vmovq   xmm0, qword ptr [rsi + rax] # xmm0 = mem[0],zero
        vmovq   xmm1, qword ptr [rdi + rax] # xmm1 = mem[0],zero
        # byte-wise sum of the two 8-byte chunks, stored back through rdi
        vpaddb  xmm0, xmm1, xmm0
        vmovq   qword ptr [rdi + rax], xmm0
        # rax goes 0 -> 8, so the compare is equal after the first pass and
        # the jne falls through: the loop body executes exactly once.
        add     rax, 8
        cmp     rax, 8
        jne     .LBB0_1
        ret

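-O3 -march=skylake -fno-unroll-loops helps a bit, but shouldn't we vectorize
the loop rather than unroll it? And this vectorized code is not ideal either:
the loop runs exactly once (rax goes from 0 to 8), so the counter, compare and
branch are redundant.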