Open Quuxplusone opened 5 years ago
Bugzilla Link | PR40946 |
Status | NEW |
Importance | P enhancement |
Reported by | Bala Rishi (balarishi.bhogadi@amd.com) |
Reported on | 2019-03-04 01:52:29 -0800 |
Last modified on | 2019-03-08 05:01:13 -0800 |
Version | trunk |
Hardware | PC Linux |
CC | florian_hahn@apple.com, hfinkel@anl.gov, hideki.saito@intel.com, llvm-bugs@lists.llvm.org, spatel+llvm@rotateright.com |
Fixed by commit(s) | |
Attachments | |
Blocks | |
Blocked by | |
See also |
The problem is that before vectorization, the loop body contains loads for the
global variables (which store the pointers) and AA cannot prove they do not
alias with the store (a[I] = ...):
; ModuleID = 'testvec.c'
source_filename = "testvec.c"
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.14.0"
; @b, @c and @a are global variables of type i8*, each initialized to null.
; Each global holds a pointer, so code that indexes through one of them must
; first load the pointer value from the global.
@b = common local_unnamed_addr global i8* null, align 8
@c = common local_unnamed_addr global i8* null, align 8
@a = common local_unnamed_addr global i8* null, align 8
; Function Attrs: norecurse nounwind ssp uwtable
; Single-block counted loop: %indvars.iv runs from 0 through 1799, and each
; iteration stores the i8 product of b[iv] and c[iv] to a[iv]. The base
; pointers are reloaded from the globals @b, @c and @a inside the loop body on
; every iteration; per the discussion in this report, alias analysis cannot
; prove those pointer loads do not alias the i8 store.
define void @loop_not_vectorized() local_unnamed_addr #0 {
entry:
br label %for.body
for.cond.cleanup: ; preds = %for.body
ret void
for.body: ; preds = %for.body, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
; Load the pointer stored in @b (pointer-typed access, TBAA tag !3), then the
; byte at offset %indvars.iv from it (TBAA tag !7).
%0 = load i8*, i8** @b, align 8, !tbaa !3
%arrayidx = getelementptr inbounds i8, i8* %0, i64 %indvars.iv
%1 = load i8, i8* %arrayidx, align 1, !tbaa !7
; Same pattern for @c.
%2 = load i8*, i8** @c, align 8, !tbaa !3
%arrayidx2 = getelementptr inbounds i8, i8* %2, i64 %indvars.iv
%3 = load i8, i8* %arrayidx2, align 1, !tbaa !7
%mul = mul i8 %3, %1
; Load the destination pointer from @a and store the product through it. The
; store is an i8 access carrying TBAA tag !7.
%4 = load i8*, i8** @a, align 8, !tbaa !3
%arrayidx6 = getelementptr inbounds i8, i8* %4, i64 %indvars.iv
store i8 %mul, i8* %arrayidx6, align 1, !tbaa !7
; Increment the induction variable and leave the loop once it reaches 1800.
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 1800
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
; Attribute group #0, referenced by the definition of @loop_not_vectorized.
attributes #0 = { norecurse nounwind ssp uwtable "correctly-rounded-divide-sqrt-
fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-
legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-
non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-
math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false"
"stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-
features"="+cx16,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-
fp-math"="false" "use-soft-float"="false" }
; Module flags and producer identification.
!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 7, !"PIC Level", i32 2}
!2 = !{!"clang version 9.0.0 (https://github.com/llvm/llvm-project.git
f3feb6adb919770f7ad4888ecb3a5c15076e5bf6)"}
; TBAA type nodes: !6 "Simple C/C++ TBAA" is the root, !5 "omnipotent char" is
; its child, and !4 "any pointer" is a child of !5.
; !3 is the access tag (type "any pointer") on the loads of @a, @b and @c.
; !7 is the access tag (type "omnipotent char") on the i8 loads and the i8 store.
!3 = !{!4, !4, i64 0}
!4 = !{!"any pointer", !5, i64 0}
!5 = !{!"omnipotent char", !6, i64 0}
!6 = !{!"Simple C/C++ TBAA"}
!7 = !{!5, !5, i64 0}
IIUC, the problem is that TBAA cannot deduce NoAlias because the accessed element type is char, and a char access may alias an object of any type in C and C++ (http://llvm.org/docs/LangRef.html#semantics).
If you use e.g. int instead of char, it will vectorize.
Bala, even if this code vectorizes, it comes with runtime pointer-check overhead. So, if you are after application code performance, it is best to modify the app code so that the lack of aliasing is properly communicated to the compiler (e.g., by using restrict pointers). If you are interested in getting performance through vectorization, consider moving towards explicit vectorization (such as #pragma omp simd).
The vectorizer can "improve" its ability to do more runtime DD checks as required by the given input IR, but that always comes with additional runtime overhead. So, whether that's a real "improvement" or not is "debatable".
If you are just asking for the difference between the two cases, Florian answered it well.
Thanks for the details.