[SLP] Suboptimal codegen for v2sf operations

Quuxplusone commented 4 years ago


Bugzilla Link	PR46966
Status	NEW
Importance	P enhancement
Reported by	David Bolvansky (david.bolvansky@gmail.com)
Reported on	2020-08-03 07:16:39 -0700
Last modified on	2020-08-16 17:22:35 -0700
Version	trunk
Hardware	PC Linux
CC	craig.topper@gmail.com, llvm-bugs@lists.llvm.org, llvm-dev@redking.me.uk, spatel+llvm@rotateright.com
Fixed by commit(s)
Attachments
Blocks
Blocked by
See also

float r[2], a[2], b[2], c[2];

/* Reproducer: element-wise float add over the two-element globals,
 * r[i] = a[i] + b[i] for i = 0 and 1. */
void test_plus(void) {
  for (int i = 0; i < 2; i++) r[i] = a[i] + b[i];
}

Clang:
test_plus:                              # @test_plus
        movss   xmm0, dword ptr [rip + a]       # xmm0 = mem[0],zero,zero,zero
        addss   xmm0, dword ptr [rip + b]
        movss   dword ptr [rip + r], xmm0
        movss   xmm0, dword ptr [rip + a+4]     # xmm0 = mem[0],zero,zero,zero
        addss   xmm0, dword ptr [rip + b+4]
        movss   dword ptr [rip + r+4], xmm0
        ret

GCC:
test_plus:
        movq    xmm0, QWORD PTR a[rip]
        movq    xmm1, QWORD PTR b[rip]
        addps   xmm0, xmm1
        movlps  QWORD PTR r[rip], xmm0
        ret

Godbolt: https://godbolt.org/z/fnxGj1

Quuxplusone commented 4 years ago

We don't expect the backend to do ad-hoc vectorization. This looks like an
enhancement request for SLP to allow partial vector ops and/or adjust the cost
model.

I.e., we can get the vector add by changing the '2' limit to '4':

$ cat 46966.c
float r[4], a[4], b[4];

/* Same element-wise float add as the original report, but over the
 * four-element globals: r[i] = a[i] + b[i] for i = 0..3. */
void test_plus4(void) {
  for (int i = 0; i < 4; i++) r[i] = a[i] + b[i];
}

bin $ clang -O2 -S 46966.c -o -
    movaps  _a(%rip), %xmm0
    addps   _b(%rip), %xmm0
    movaps  %xmm0, _r(%rip)
    retq

The original example as IR:

@a = local_unnamed_addr global [2 x float] zeroinitializer, align 4
@b = local_unnamed_addr global [2 x float] zeroinitializer, align 4
@r = local_unnamed_addr global [2 x float] zeroinitializer, align 4

define void @test_plus() local_unnamed_addr #0 {
entry:
  %0 = load float, float* getelementptr inbounds ([2 x float], [2 x float]* @a, i64 0, i64 0), align 4, !tbaa !3
  %1 = load float, float* getelementptr inbounds ([2 x float], [2 x float]* @b, i64 0, i64 0), align 4, !tbaa !3
  %add = fadd float %0, %1
  store float %add, float* getelementptr inbounds ([2 x float], [2 x float]* @r, i64 0, i64 0), align 4, !tbaa !3
  %2 = load float, float* getelementptr inbounds ([2 x float], [2 x float]* @a, i64 0, i64 1), align 4, !tbaa !3
  %3 = load float, float* getelementptr inbounds ([2 x float], [2 x float]* @b, i64 0, i64 1), align 4, !tbaa !3
  %add.1 = fadd float %2, %3
  store float %add.1, float* getelementptr inbounds ([2 x float], [2 x float]* @r, i64 0, i64 1), align 4, !tbaa !3
  ret void
}

attributes #0 = { nofree norecurse nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }

!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 7, !"PIC Level", i32 2}
!2 = !{!"clang version 12.0.0 (https://github.com/llvm/llvm-project.git 1c2777f585fc0e5e8f853dab455c62ae50298f9a)"}
!3 = !{!4, !4, i64 0}
!4 = !{!"float", !5, i64 0}
!5 = !{!"omnipotent char", !6, i64 0}
!6 = !{!"Simple C/C++ TBAA"}

Quuxplusone commented 4 years ago

#include <stdlib.h>

#define SIZE 2

/* Small fixed-size float vector wrapping an array of SIZE elements. */
typedef struct vec {
    float data[SIZE];
} vec;

/* Reproducer: returns the element-wise sum of a and b.
 * Both operands are taken by value and the result is returned by value. */
vec add(vec a, vec b) {
    vec result;
    for (size_t i = 0; i < SIZE; ++i) {
        result.data[i] = a.data[i] + b.data[i];
    }
    return result;
}

Clang -Ofast  -march=skylake:
add(vec, vec):                           # @add(vec, vec)
        vaddps  xmm0, xmm1, xmm0
        vmovss  dword ptr [rsp - 8], xmm0
        vextractps      dword ptr [rsp - 4], xmm0, 1
        vmovsd  xmm0, qword ptr [rsp - 8]       # xmm0 = mem[0],zero
        ret

GCC trunk -Ofast  -march=skylake:
add(vec, vec):
        vaddps  xmm0, xmm0, xmm1
        ret

Quuxplusone / LLVMBugzillaTest

[SLP] Suboptimal codegen for v2sf operations #45935