llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.
http://llvm.org
Other
28k stars 11.56k forks source link

[SLP] Suboptimal codegen for v2sf operations #46310

Closed davidbolvansky closed 2 months ago

davidbolvansky commented 4 years ago
Bugzilla Link 46966
Version trunk
OS Linux
CC @topperc,@RKSimon,@rotateright

Extended Description

float r[2], a[2], b[2], c[2];

void test_plus(void) { for (int i = 0; i < 2; i++) r[i] = a[i] + b[i]; }

Clang: test_plus: # @​test_plus movss xmm0, dword ptr [rip + a] # xmm0 = mem[0],zero,zero,zero addss xmm0, dword ptr [rip + b] movss dword ptr [rip + r], xmm0 movss xmm0, dword ptr [rip + a+4] # xmm0 = mem[0],zero,zero,zero addss xmm0, dword ptr [rip + b+4] movss dword ptr [rip + r+4], xmm0 ret

GCC: test_plus: movq xmm0, QWORD PTR a[rip] movq xmm1, QWORD PTR b[rip] addps xmm0, xmm1 movlps QWORD PTR r[rip], xmm0 ret

Godbolt: https://godbolt.org/z/fnxGj1

davidbolvansky commented 4 years ago

#include <stddef.h>

#define SIZE 2

typedef struct vec { float data[SIZE]; } vec;

vec add(vec a, vec b) { vec result; for (size_t i = 0; i < SIZE; ++i) { result.data[i] = a.data[i] + b.data[i]; } return result; }

Clang -Ofast -march=skylake: add(vec, vec): # @​add(vec, vec) vaddps xmm0, xmm1, xmm0 vmovss dword ptr [rsp - 8], xmm0 vextractps dword ptr [rsp - 4], xmm0, 1 vmovsd xmm0, qword ptr [rsp - 8] # xmm0 = mem[0],zero ret

GCC trunk -Ofast -march=skylake: add(vec, vec): vaddps xmm0, xmm0, xmm1 ret

rotateright commented 4 years ago

We don't expect the backend to do ad-hoc vectorization. This looks like an enhancement request for SLP to allow partial vector ops and/or adjust the cost model.

I.e., we can get the vector add by changing the '2' limit to '4':

$ cat 46966.c float r[4], a[4], b[4];

void test_plus4(void) { for (int i = 0; i < 4; i++) r[i] = a[i] + b[i]; }

bin $ clang -O2 -S 46966.c -o - movaps _a(%rip), %xmm0 addps _b(%rip), %xmm0 movaps %xmm0, _r(%rip) retq

The original example as IR:

@​a = local_unnamed_addr global [2 x float] zeroinitializer, align 4 @​b = local_unnamed_addr global [2 x float] zeroinitializer, align 4 @​r = local_unnamed_addr global [2 x float] zeroinitializer, align 4

define void @test_plus() local_unnamed_addr #0 {
entry:
  %0 = load float, float* getelementptr inbounds ([2 x float], [2 x float]* @a, i64 0, i64 0), align 4, !tbaa !3
  %1 = load float, float* getelementptr inbounds ([2 x float], [2 x float]* @b, i64 0, i64 0), align 4, !tbaa !3
  %add = fadd float %0, %1
  store float %add, float* getelementptr inbounds ([2 x float], [2 x float]* @r, i64 0, i64 0), align 4, !tbaa !3
  %2 = load float, float* getelementptr inbounds ([2 x float], [2 x float]* @a, i64 0, i64 1), align 4, !tbaa !3
  %3 = load float, float* getelementptr inbounds ([2 x float], [2 x float]* @b, i64 0, i64 1), align 4, !tbaa !3
  %add.1 = fadd float %2, %3
  store float %add.1, float* getelementptr inbounds ([2 x float], [2 x float]* @r, i64 0, i64 1), align 4, !tbaa !3
  ret void
}

attributes #​0 = { nofree norecurse nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }

!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 7, !"PIC Level", i32 2}
!2 = !{!"clang version 12.0.0 (https://github.com/llvm/llvm-project.git 1c2777f585fc0e5e8f853dab455c62ae50298f9a)"}
!3 = !{!4, !4, i64 0}
!4 = !{!"float", !5, i64 0}
!5 = !{!"omnipotent char", !6, i64 0}
!6 = !{!"Simple C/C++ TBAA"}

alexey-bataev commented 2 months ago

#include <stddef.h>

#define SIZE 2

typedef struct vec { float data[SIZE]; } vec;

vec add(vec a, vec b) { vec result; for (size_t i = 0; i < SIZE; ++i) { result.data[i] = a.data[i] + b.data[i]; } return result; }

Clang -Ofast -march=skylake: add(vec, vec): # @​add(vec, vec) vaddps xmm0, xmm1, xmm0 vmovss dword ptr [rsp - 8], xmm0 vextractps dword ptr [rsp - 4], xmm0, 1 vmovsd xmm0, qword ptr [rsp - 8] # xmm0 = mem[0],zero ret

GCC trunk -Ofast -march=skylake: add(vec, vec): vaddps xmm0, xmm0, xmm1 ret

Fixed: https://godbolt.org/z/3P9bfhz11