Open Quuxplusone opened 4 years ago
Bugzilla Link | PR46966 |
Status | NEW |
Importance | P enhancement |
Reported by | David Bolvansky (david.bolvansky@gmail.com) |
Reported on | 2020-08-03 07:16:39 -0700 |
Last modified on | 2020-08-16 17:22:35 -0700 |
Version | trunk |
Hardware | PC Linux |
CC | craig.topper@gmail.com, llvm-bugs@lists.llvm.org, llvm-dev@redking.me.uk, spatel+llvm@rotateright.com |
Fixed by commit(s) | |
Attachments | |
Blocks | |
Blocked by | |
See also |
We don't expect the backend to do ad-hoc vectorization. This looks like an
enhancement request for SLP to allow partial vector ops and/or adjust the cost
model.
I.e., we can get the vector add by changing the array size and loop bound from '2' to '4':
$ cat 46966.c
/* Four-element variant of the reproducer: result and operand arrays are globals. */
float r[4], a[4], b[4];
/* Element-wise add over all four elements: r[i] = a[i] + b[i]. */
void test_plus4(void) {
for (int i = 0; i < 4; i++) r[i] = a[i] + b[i];
}
bin $ clang -O2 -S 46966.c -o -
# AT&T syntax (source operand first). The whole 4-float loop becomes one packed add.
# xmm0 = a[0..3] (aligned 128-bit load)
movaps _a(%rip), %xmm0
# xmm0 += b[0..3], four single-precision lanes at once
addps _b(%rip), %xmm0
# r[0..3] = xmm0 (aligned 128-bit store)
movaps %xmm0, _r(%rip)
retq
The original example as IR:
; Two-element float arrays: operands @a and @b, result @r. All are zero-initialized, 4-byte aligned.
@a = local_unnamed_addr global [2 x float] zeroinitializer, align 4
@b = local_unnamed_addr global [2 x float] zeroinitializer, align 4
@r = local_unnamed_addr global [2 x float] zeroinitializer, align 4
; Computes r[i] = a[i] + b[i] for i = 0 and i = 1 as two independent scalar
; load/load/fadd/store groups; no vector types appear in this IR.
define void @test_plus() local_unnamed_addr #0 {
entry:
; Element 0: r[0] = a[0] + b[0]
%0 = load float, float* getelementptr inbounds ([2 x float], [2 x float]* @a, i64 0, i64 0), align 4, !tbaa !3
%1 = load float, float* getelementptr inbounds ([2 x float], [2 x float]* @b, i64 0, i64 0), align 4, !tbaa !3
%add = fadd float %0, %1
store float %add, float* getelementptr inbounds ([2 x float], [2 x float]* @r, i64 0, i64 0), align 4, !tbaa !3
; Element 1: r[1] = a[1] + b[1]
%2 = load float, float* getelementptr inbounds ([2 x float], [2 x float]* @a, i64 0, i64 1), align 4, !tbaa !3
%3 = load float, float* getelementptr inbounds ([2 x float], [2 x float]* @b, i64 0, i64 1), align 4, !tbaa !3
%add.1 = fadd float %2, %3
store float %add.1, float* getelementptr inbounds ([2 x float], [2 x float]* @r, i64 0, i64 1), align 4, !tbaa !3
ret void
}
; Attribute group referenced as #0 by @test_plus. Kept on a single line: the
; quoted attribute names must not be split across lines, or the IR will not parse.
; Target is "penryn" with SSE up to SSE4.1 enabled (no AVX).
attributes #0 = { nofree norecurse nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
; Module-level metadata: module flags (!0, !1) and the producer ident (!2).
!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 7, !"PIC Level", i32 2}
; The ident string is kept on one line; a line break inside the quotes is not valid IR.
!2 = !{!"clang version 12.0.0 (https://github.com/llvm/llvm-project.git 1c2777f585fc0e5e8f853dab455c62ae50298f9a)"}
; TBAA nodes: !3 is the access tag attached to the float loads/stores above,
; described by the chain float (!4) -> omnipotent char (!5) -> root (!6).
!3 = !{!4, !4, i64 0}
!4 = !{!"float", !5, i64 0}
!5 = !{!"omnipotent char", !6, i64 0}
!6 = !{!"Simple C/C++ TBAA"}
#include <stdlib.h>
/* Number of float elements in a vec; also the trip count of the loop in add(). */
#define SIZE 2
/* Fixed-size float vector wrapped in a struct so it can be passed and returned by value. */
typedef struct vec {
float data[SIZE];
} vec;
/* Returns the element-wise sum of a and b; operands and result are passed by value. */
vec add(vec a, vec b) {
vec result;
/* Every data[] slot of result is written by this loop before the return. */
for (size_t i = 0; i < SIZE; ++i) {
result.data[i] = a.data[i] + b.data[i];
}
return result;
}
Clang -Ofast -march=skylake:
# Intel syntax (destination operand first).
add(vec, vec): # @add(vec, vec)
# xmm0 = xmm1 + xmm0, packed single-precision add
vaddps xmm0, xmm1, xmm0
# Spill lane 0 of the sum to the 4 bytes at [rsp - 8]
# (below rsp without adjusting it; presumably the SysV red zone)
vmovss dword ptr [rsp - 8], xmm0
# Spill lane 1 of the sum to the adjacent 4 bytes at [rsp - 4]
vextractps dword ptr [rsp - 4], xmm0, 1
# Reload both floats as one 8-byte value; the upper half of xmm0 is zeroed.
# This store/reload round trip is extra work on top of the vaddps itself.
vmovsd xmm0, qword ptr [rsp - 8] # xmm0 = mem[0],zero
ret
GCC trunk -Ofast -march=skylake:
# Intel syntax (destination operand first).
add(vec, vec):
# xmm0 = xmm0 + xmm1, packed single-precision add; the sum is returned
# directly in xmm0 with no trip through memory.
vaddps xmm0, xmm0, xmm1
ret