Open Quuxplusone opened 8 years ago
Attached log_autovec_test.tgz
(2813 bytes, application/octet-stream): C test case, sh command to invoke clang, and example faulty IR
I tried your example with a recent build of llvm (clang version 4.0.0 (https://github.com/llvm-mirror/clang.git 8a6ea813424dabc71bf4514942e487bd0268a317) (https://github.com/llvm-mirror/llvm.git c170429d499f008b204e15cb7a6ef9d45e309d1f)) and it seems like logf is vectorized.
With -O3 -march=x86-64 -mavx2 -ffast-math -S -emit-llvm
the following IR is generated for test_vectorize_log:
; Function Attrs: nounwind ssp uwtable
define void @test_vectorize_log(float* nocapture readonly %x, float* nocapture %y, i32 %n) local_unnamed_addr #0 {
entry:
  %cmp7 = icmp sgt i32 %n, 0
  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %wide.trip.count = zext i32 %n to i64
  %min.iters.check = icmp ult i32 %n, 8
  br i1 %min.iters.check, label %for.body.preheader14, label %min.iters.checked

for.body.preheader14:                             ; preds = %middle.block, %vector.memcheck, %min.iters.checked, %for.body.preheader
  %indvars.iv.ph = phi i64 [ 0, %vector.memcheck ], [ 0, %min.iters.checked ], [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  br label %for.body

min.iters.checked:                                ; preds = %for.body.preheader
  %0 = and i32 %n, 7
  %n.mod.vf = zext i32 %0 to i64
  %n.vec = sub nsw i64 %wide.trip.count, %n.mod.vf
  %cmp.zero = icmp eq i64 %n.vec, 0
  br i1 %cmp.zero, label %for.body.preheader14, label %vector.memcheck

vector.memcheck:                                  ; preds = %min.iters.checked
  %scevgep = getelementptr float, float* %y, i64 %wide.trip.count
  %scevgep12 = getelementptr float, float* %x, i64 %wide.trip.count
  %bound0 = icmp ugt float* %scevgep12, %y
  %bound1 = icmp ugt float* %scevgep, %x
  %memcheck.conflict = and i1 %bound0, %bound1
  br i1 %memcheck.conflict, label %for.body.preheader14, label %vector.body.preheader

vector.body.preheader:                            ; preds = %vector.memcheck
  br label %vector.body

vector.body:                                      ; preds = %vector.body.preheader, %vector.body
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.body.preheader ]
  %1 = getelementptr inbounds float, float* %x, i64 %index
  %2 = bitcast float* %1 to <8 x float>*
  %wide.load = load <8 x float>, <8 x float>* %2, align 4, !tbaa !2, !alias.scope !6
  %3 = call fast <8 x float> @llvm.log.v8f32(<8 x float> %wide.load)
  %4 = getelementptr inbounds float, float* %y, i64 %index
  %5 = bitcast float* %4 to <8 x float>*
  store <8 x float> %3, <8 x float>* %5, align 4, !tbaa !2, !alias.scope !9, !noalias !6
  %index.next = add i64 %index, 8
  %6 = icmp eq i64 %index.next, %n.vec
  br i1 %6, label %middle.block, label %vector.body, !llvm.loop !11

middle.block:                                     ; preds = %vector.body
  %cmp.n = icmp eq i32 %0, 0
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader14

for.cond.cleanup.loopexit:                        ; preds = %for.body
  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %middle.block, %entry
  ret void

for.body:                                         ; preds = %for.body.preheader14, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader14 ]
  %arrayidx = getelementptr inbounds float, float* %x, i64 %indvars.iv
  %7 = load float, float* %arrayidx, align 4, !tbaa !2
  %call = tail call fast float @logf(float %7) #2
  %arrayidx2 = getelementptr inbounds float, float* %y, i64 %indvars.iv
  store float %call, float* %arrayidx2, align 4, !tbaa !2
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body, !llvm.loop !14
}
I built LLVM + Clang at the same commits as you did. I built both on x86-64
with commands cmake /path/to/llvm, then make, then make clang.
Compiling the example (with clang --version output prepended) still shows
no vectorization for logf. Any idea what could differ between our two clang
binaries? How are you building clang?
My output:
clang version 4.0.0 (http://llvm.org/git/clang.git
8a6ea813424dabc71bf4514942e487bd0268a317) (http://llvm.org/git/llvm.git
c170429d499f008b204e15cb7a6ef9d45e309d1f)
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /home/emartin
; ModuleID = 'test.c'
source_filename = "test.c"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; Function Attrs: nounwind uwtable
define void @test_vectorize_log(float* nocapture readonly %x, float* nocapture %y, i32 %n) local_unnamed_addr #0 {
entry:
%cmp7 = icmp sgt i32 %n, 0
br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
%wide.trip.count = zext i32 %n to i64
br label %for.body
for.cond.cleanup.loopexit: ; preds = %for.body
br label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
ret void
for.body: ; preds = %for.body.preheader, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
%arrayidx = getelementptr inbounds float, float* %x, i64 %indvars.iv
%0 = load float, float* %arrayidx, align 4, !tbaa !1
%call = tail call fast float @__logf_finite(float %0) #2
%arrayidx2 = getelementptr inbounds float, float* %y, i64 %indvars.iv
store float %call, float* %arrayidx2, align 4, !tbaa !1
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
}
log_autovec_test.tgz
(2813 bytes, application/octet-stream)