Closed davidbolvansky closed 2 months ago
/* Reproducer type: a struct wrapping an array of SIZE floats.
 * NOTE(review): SIZE is not defined in this excerpt; it is presumably a
 * macro supplied before this snippet - confirm. */
typedef struct vec { float data[SIZE]; } vec;
/* Returns the element-wise sum of a and b: result.data[i] = a.data[i] + b.data[i]
 * for every i in [0, SIZE). Both operands and the result are passed by value. */
vec add(vec a, vec b) { vec result; for (size_t i = 0; i < SIZE; ++i) { result.data[i] = a.data[i] + b.data[i]; } return result; }
Clang -Ofast -march=skylake: add(vec, vec): # @add(vec, vec) vaddps xmm0, xmm1, xmm0 vmovss dword ptr [rsp - 8], xmm0 vextractps dword ptr [rsp - 4], xmm0, 1 vmovsd xmm0, qword ptr [rsp - 8] # xmm0 = mem[0],zero ret
GCC trunk -Ofast -march=skylake: add(vec, vec): vaddps xmm0, xmm0, xmm1 ret
We don't expect the backend to do ad-hoc vectorization. This looks like an enhancement request for SLP to allow partial vector ops and/or adjust the cost model.
I.e., we can get the vector add by changing the '2' limit to '4':
$ cat 46966.c float r[4], a[4], b[4];
/* Four-element variant of the reproducer: stores a[i] + b[i] into r[i]
 * for i = 0..3, using the file-scope arrays r, a and b. */
void test_plus4(void) { for (int i = 0; i < 4; i++) r[i] = a[i] + b[i]; }
bin $ clang -O2 -S 46966.c -o - movaps _a(%rip), %xmm0 addps _b(%rip), %xmm0 movaps %xmm0, _r(%rip) retq
The original example as IR:
@a = local_unnamed_addr global [2 x float] zeroinitializer, align 4 @b = local_unnamed_addr global [2 x float] zeroinitializer, align 4 @r = local_unnamed_addr global [2 x float] zeroinitializer, align 4
define void @test_plus() local_unnamed_addr #0 { entry: %0 = load float, float* getelementptr inbounds ([2 x float], [2 x float]* @a, i64 0, i64 0), align 4, !tbaa !3 %1 = load float, float* getelementptr inbounds ([2 x float], [2 x float]* @b, i64 0, i64 0), align 4, !tbaa !3 %add = fadd float %0, %1 store float %add, float* getelementptr inbounds ([2 x float], [2 x float]* @r, i64 0, i64 0), align 4, !tbaa !3 %2 = load float, float* getelementptr inbounds ([2 x float], [2 x float]* @a, i64 0, i64 1), align 4, !tbaa !3 %3 = load float, float* getelementptr inbounds ([2 x float], [2 x float]* @b, i64 0, i64 1), align 4, !tbaa !3 %add.1 = fadd float %2, %3 store float %add.1, float* getelementptr inbounds ([2 x float], [2 x float]* @r, i64 0, i64 1), align 4, !tbaa !3 ret void }
attributes #0 = { nofree norecurse nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
!llvm.module.flags = !{!0, !1} !llvm.ident = !{!2}
!0 = !{i32 1, !"wchar_size", i32 4} !1 = !{i32 7, !"PIC Level", i32 2} !2 = !{!"clang version 12.0.0 (https://github.com/llvm/llvm-project.git 1c2777f585fc0e5e8f853dab455c62ae50298f9a)"} !3 = !{!4, !4, i64 0} !4 = !{!"float", !5, i64 0} !5 = !{!"omnipotent char", !6, i64 0} !6 = !{!"Simple C/C++ TBAA"}
#include <stddef.h>
#define SIZE 2
/* Reproducer type: a struct wrapping an array of SIZE floats. */
typedef struct vec { float data[SIZE]; } vec;
/* Returns the element-wise sum of a and b: result.data[i] = a.data[i] + b.data[i]
 * for every i in [0, SIZE). Both operands and the result are passed by value. */
vec add(vec a, vec b) { vec result; for (size_t i = 0; i < SIZE; ++i) { result.data[i] = a.data[i] + b.data[i]; } return result; }
Clang -Ofast -march=skylake: add(vec, vec): # @add(vec, vec) vaddps xmm0, xmm1, xmm0 vmovss dword ptr [rsp - 8], xmm0 vextractps dword ptr [rsp - 4], xmm0, 1 vmovsd xmm0, qword ptr [rsp - 8] # xmm0 = mem[0],zero ret
GCC trunk -Ofast -march=skylake: add(vec, vec): vaddps xmm0, xmm0, xmm1 ret
Fixed, see https://godbolt.org/z/3P9bfhz11
Extended Description
/* File-scope two-element float arrays for the reproducer: r receives the
 * result, a and b are the operands. c is declared but not referenced by
 * test_plus. */
float r[2], a[2], b[2], c[2];
/* Two-element reproducer: stores a[i] + b[i] into r[i] for i = 0 and i = 1. */
void test_plus(void) { for (int i = 0; i < 2; i++) r[i] = a[i] + b[i]; }
Clang: test_plus: # @test_plus movss xmm0, dword ptr [rip + a] # xmm0 = mem[0],zero,zero,zero addss xmm0, dword ptr [rip + b] movss dword ptr [rip + r], xmm0 movss xmm0, dword ptr [rip + a+4] # xmm0 = mem[0],zero,zero,zero addss xmm0, dword ptr [rip + b+4] movss dword ptr [rip + r+4], xmm0 ret
GCC: test_plus: movq xmm0, QWORD PTR a[rip] movq xmm1, QWORD PTR b[rip] addps xmm0, xmm1 movlps QWORD PTR r[rip], xmm0 ret
Godbolt: https://godbolt.org/z/fnxGj1