Open llvmbot opened 8 years ago
I built LLVM + Clang at the same commits as you did. I built both on x86-64 with the commands cmake /path/to/llvm, then make, then make clang.
Compiling the example (with the clang --version output prepended) still shows no vectorization for logf. Any idea what could differ between our two clang binaries? How are you building clang?
My output:
clang version 4.0.0 (http://llvm.org/git/clang.git 8a6ea813424dabc71bf4514942e487bd0268a317) (http://llvm.org/git/llvm.git c170429d499f008b204e15cb7a6ef9d45e309d1f)
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /home/emartin

; ModuleID = 'test.c'
source_filename = "test.c"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

; Function Attrs: nounwind uwtable
define void @test_vectorize_log(float* nocapture readonly %x, float* nocapture %y, i32 %n) local_unnamed_addr #0 {
entry:
  %cmp7 = icmp sgt i32 %n, 0
  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %wide.trip.count = zext i32 %n to i64
  br label %for.body

for.cond.cleanup.loopexit:                        ; preds = %for.body
  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
  ret void

for.body:                                         ; preds = %for.body.preheader, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
  %arrayidx = getelementptr inbounds float, float* %x, i64 %indvars.iv
  %0 = load float, float* %arrayidx, align 4, !tbaa !1
  %call = tail call fast float @__logf_finite(float %0) #2
  %arrayidx2 = getelementptr inbounds float, float* %y, i64 %indvars.iv
  store float %call, float* %arrayidx2, align 4, !tbaa !1
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
}
I tried your example with a recent build of llvm (clang version 4.0.0 (https://github.com/llvm-mirror/clang.git 8a6ea813424dabc71bf4514942e487bd0268a317) (https://github.com/llvm-mirror/llvm.git c170429d499f008b204e15cb7a6ef9d45e309d1f)) and it seems like logf is vectorized.
With -O3 -march=x86-64 -mavx2 -ffast-math -S -emit-llvm, the following IR is generated for test_vectorize_log:
; Function Attrs: nounwind ssp uwtable
define void @test_vectorize_log(float* nocapture readonly %x, float* nocapture %y, i32 %n) local_unnamed_addr #0 {
entry:
  %cmp7 = icmp sgt i32 %n, 0
  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %wide.trip.count = zext i32 %n to i64
  %min.iters.check = icmp ult i32 %n, 8
  br i1 %min.iters.check, label %for.body.preheader14, label %min.iters.checked

for.body.preheader14:                             ; preds = %middle.block, %vector.memcheck, %min.iters.checked, %for.body.preheader
  %indvars.iv.ph = phi i64 [ 0, %vector.memcheck ], [ 0, %min.iters.checked ], [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  br label %for.body

min.iters.checked:                                ; preds = %for.body.preheader
  %0 = and i32 %n, 7
  %n.mod.vf = zext i32 %0 to i64
  %n.vec = sub nsw i64 %wide.trip.count, %n.mod.vf
  %cmp.zero = icmp eq i64 %n.vec, 0
  br i1 %cmp.zero, label %for.body.preheader14, label %vector.memcheck

vector.memcheck:                                  ; preds = %min.iters.checked
  %scevgep = getelementptr float, float* %y, i64 %wide.trip.count
  %scevgep12 = getelementptr float, float* %x, i64 %wide.trip.count
  %bound0 = icmp ugt float* %scevgep12, %y
  %bound1 = icmp ugt float* %scevgep, %x
  %memcheck.conflict = and i1 %bound0, %bound1
  br i1 %memcheck.conflict, label %for.body.preheader14, label %vector.body.preheader

vector.body.preheader:                            ; preds = %vector.memcheck
  br label %vector.body

vector.body:                                      ; preds = %vector.body.preheader, %vector.body
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.body.preheader ]
  %1 = getelementptr inbounds float, float* %x, i64 %index
  %2 = bitcast float* %1 to <8 x float>*
  %wide.load = load <8 x float>, <8 x float>* %2, align 4, !tbaa !2, !alias.scope !6
  %3 = call fast <8 x float> @llvm.log.v8f32(<8 x float> %wide.load)
  %4 = getelementptr inbounds float, float* %y, i64 %index
  %5 = bitcast float* %4 to <8 x float>*
  store <8 x float> %3, <8 x float>* %5, align 4, !tbaa !2, !alias.scope !9, !noalias !6
  %index.next = add i64 %index, 8
  %6 = icmp eq i64 %index.next, %n.vec
  br i1 %6, label %middle.block, label %vector.body, !llvm.loop !11

middle.block:                                     ; preds = %vector.body
  %cmp.n = icmp eq i32 %0, 0
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader14

for.cond.cleanup.loopexit:                        ; preds = %for.body
  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %middle.block, %entry
  ret void

for.body:                                         ; preds = %for.body.preheader14, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader14 ]
  %arrayidx = getelementptr inbounds float, float* %x, i64 %indvars.iv
  %7 = load float, float* %arrayidx, align 4, !tbaa !2
  %call = tail call fast float @logf(float %7) #2
  %arrayidx2 = getelementptr inbounds float, float* %y, i64 %indvars.iv
  store float %call, float* %arrayidx2, align 4, !tbaa !2
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body, !llvm.loop !14
}
Extended Description
I've written two test functions in C, each taking a float array x of size n and writing f(x) to an output array, where f is either fabsf or logf. The LLVM 3.9 auto-vectorization docs claim that both functions will be vectorized: http://llvm.org/releases/3.9.0/docs/Vectorizers.html#vectorization-of-function-calls
When running with "clang -O3 -march=x86-64 -mavx2 -ffast-math test.c -S -emit-llvm", the function calling fabsf is vectorized while the function calling logf is not. This is with clang 3.9, but I've confirmed the bug exists back to at least clang 3.7. I've also observed that logf calls break vectorization of more complex loops; the comparison with fabsf is provided as a reduced test case.
I've attached the C program, a shell script that invokes clang with the correct arguments, and an example LLVM IR file produced by running it on my system.
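The attachments aren't reproduced here. For reference, a minimal sketch of what the test file likely contains, matching the signature and calls visible in the IR above (the exact function and file names, and the fabsf variant, are assumptions):

/* test.c (sketch) -- compile with:
   clang -O3 -march=x86-64 -mavx2 -ffast-math test.c -S -emit-llvm */
#include <math.h>

/* logf loop: the case reported as not vectorized. */
void test_vectorize_log(float *x, float *y, int n) {
  for (int i = 0; i < n; i++)
    y[i] = logf(x[i]);
}

/* fabsf loop: the reduced comparison case, which does vectorize. */
void test_vectorize_fabs(float *x, float *y, int n) {
  for (int i = 0; i < n; i++)
    y[i] = fabsf(x[i]);
}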