JuliaLang / julia

The Julia Programming Language
https://julialang.org/
MIT License

Constant Folding openlibm functions #9942

Open jwmerrill opened 9 years ago

jwmerrill commented 9 years ago

https://groups.google.com/forum/#!topic/julia-users/Jndl9sYwj5Q reports a performance regression in some simple code from a blog post that was meant to illustrate the importance of type stability: http://www.johnmyleswhite.com/notebook/2013/12/06/writing-type-stable-code-in-julia/

The code is

function sumofsins2(n::Integer)  
    r = 0.0  
    for i in 1:n  
        r += sin(3.4)  
    end  
    return r  
end
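For readers following along without a Julia install, here is a minimal Python analogue of the function above (the port and its name are mine, not from the thread). The point is that `sin(3.4)` is loop-invariant, so a compiler that constant-folds it reduces the loop body to adding one precomputed constant:

```python
import math

def sumofsins2(n):
    """Python analogue of the Julia sumofsins2: adds sin(3.4) n times."""
    r = 0.0
    for _ in range(n):
        r += math.sin(3.4)
    return r

# Because the addend never changes, the result equals n * sin(3.4)
# up to floating-point rounding in the accumulation.
print(sumofsins2(10))
```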

and the blog post gives the output (as of Julia 0.2.x) of code_llvm(sumofsins2, (Int,))

define double @julia_sumofsins21068(i64) {  
top:  
  %1 = icmp slt i64 %0, 1, !dbg !5151  
  br i1 %1, label %L2, label %pass, !dbg !5151  

pass:                                             ; preds = %top, %pass  
  %"#s6.04" = phi i64 [ %3, %pass ], [ 1, %top ]  
  %r.03 = phi double [ %2, %pass ], [ 0.000000e+00, %top ]  
  %2 = fadd double %r.03, 0xBFD05AC910FF4C6C, !dbg !5156  
  %3 = add i64 %"#s6.04", 1, !dbg !5156  
  %4 = icmp sgt i64 %3, %0, !dbg !5151  
  br i1 %4, label %L2, label %pass, !dbg !5151  

L2:                                               ; preds = %pass, %top  
  %r.0.lcssa = phi double [ 0.000000e+00, %top ], [ %2, %pass ]  
  ret double %r.0.lcssa, !dbg !5157  
} 

In this IR code, the sin(3.4) has been constant folded to 0xBFD05AC910FF4C6C.
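That hex literal is the raw IEEE-754 bit pattern of the folded result. A quick Python check (a sketch of mine, not part of the issue) decodes it and compares against `sin(3.4)`; the values should agree up to the last ulp of whichever libm did the folding:

```python
import math
import struct

# 0xBFD05AC910FF4C6C is the 64-bit pattern LLVM emitted for sin(3.4).
bits = 0xBFD05AC910FF4C6C

# Reinterpret the integer bit pattern as a double.
folded = struct.unpack('<d', struct.pack('<Q', bits))[0]

print(folded)          # ~ -0.2555411...
print(math.sin(3.4))   # should match to within 1 ulp
```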

As of Julia 0.3.4, the new LLVM code is

define double @"julia_sumofsins2;20064"(i64) {
top:
  %1 = icmp sgt i64 %0, 0, !dbg !841
  br i1 %1, label %L, label %L3, !dbg !841

L:                                                ; preds = %top, %pass
  %r.0 = phi double [ %6, %pass ], [ 0.000000e+00, %top ]
  %"#s119.0" = phi i64 [ %5, %pass ], [ 1, %top ]
  %2 = call double inttoptr (i64 4551878240 to double (double)*)(double 3.400000e+00), !dbg !842
  %3 = fcmp ord double %2, 0.000000e+00, !dbg !842
  br i1 %3, label %pass, label %fail, !dbg !842

fail:                                             ; preds = %L
  %4 = load %jl_value_t** @jl_domain_exception, align 8, !dbg !842, !tbaa %jtbaa_const
  call void @jl_throw_with_superfluous_argument(%jl_value_t* %4, i32 4), !dbg !842
  unreachable, !dbg !842

pass:                                             ; preds = %L
  %5 = add i64 %"#s119.0", 1, !dbg !841
  %6 = fadd double %r.0, %2, !dbg !842
  %7 = icmp eq i64 %"#s119.0", %0, !dbg !842
  br i1 %7, label %L3, label %L, !dbg !842

L3:                                               ; preds = %pass, %top
  %r.1 = phi double [ 0.000000e+00, %top ], [ %6, %pass ]
  ret double %r.1, !dbg !845
}

so it looks like the call to sin is no longer being constant folded.

I haven't checked the generated code on master, but it sounds like the performance regression is still present there.

simonster commented 9 years ago

On 0.2.1:

julia> code_llvm(sin, (Float64,))

define double @julia_sin(double) {
top:
  %1 = call double @sin(double %0), !dbg !3342
  %2 = fcmp ord double %1, 0.000000e+00, !dbg !3342
  %3 = fcmp uno double %0, 0.000000e+00, !dbg !3342
  %4 = or i1 %2, %3, !dbg !3342
  br i1 %4, label %pass, label %fail, !dbg !3342

fail:                                             ; preds = %top
  %5 = load %jl_value_t** @jl_domain_exception, align 8, !dbg !3342
  call void @jl_throw_with_superfluous_argument(%jl_value_t* %5, i32 282), !dbg !3342
  unreachable, !dbg !3342

pass:                                             ; preds = %top
  ret double %1, !dbg !3342
}

On master:

define double @julia_sin_43255(double) {
top:
  %1 = call double inttoptr (i64 4668253248 to double (double)*)(double %0), !dbg !265
  %2 = fcmp ord double %1, 0.000000e+00, !dbg !265
  %3 = fcmp uno double %0, 0.000000e+00, !dbg !265
  %4 = or i1 %2, %3, !dbg !265
  br i1 %4, label %pass, label %fail, !dbg !265

fail:                                             ; preds = %top
  %5 = load %jl_value_t** @jl_domain_exception, align 8, !dbg !265, !tbaa %jtbaa_const
  call void @jl_throw_with_superfluous_argument(%jl_value_t* %5, i32 123), !dbg !265
  unreachable, !dbg !265

pass:                                             ; preds = %top
  ret double %1, !dbg !265
}

I'd guess that for LLVM to constant fold sin, it has to know that it's named sin, and somewhere along the line (precompilation?) we stopped giving it that information.

vtjnash commented 9 years ago

Correct. We now force LLVM to use the sin in libopenlibm, rather than giving it the freedom to pick any function named sin.

dhoegh commented 9 years ago

I do not get the performance benefit from constant folding either; here is the output of code_llvm(sumofsins2, (Int,)).

julia> code_llvm(sumofsins2, (Int,))

define double @julia_sumofsins2_1172(i64) {
top:
  %1 = icmp sgt i64 %0, 0, !dbg !3561
  br i1 %1, label %L, label %L3, !dbg !3561

L:                                                ; preds = %top, %pass
  %r.0 = phi double [ %6, %pass ], [ 0.000000e+00, %top ]
  %"#s3.0" = phi i64 [ %5, %pass ], [ 1, %top ]
  %2 = call double inttoptr (i64 1752503104 to double (double)*)(double 3.400000e+00), !dbg !3562
  %3 = fcmp ord double %2, 0.000000e+00, !dbg !3562
  br i1 %3, label %pass, label %fail, !dbg !3562

fail:                                             ; preds = %L
  %4 = load %jl_value_t** @jl_domain_exception, align 8, !dbg !3562, !tbaa %jtbaa_const
  call void @jl_throw_with_superfluous_argument(%jl_value_t* %4, i32 4), !dbg !3562
  unreachable, !dbg !3562

pass:                                             ; preds = %L
  %5 = add i64 %"#s3.0", 1, !dbg !3561
  %6 = fadd double %r.0, %2, !dbg !3562
  %7 = icmp eq i64 %"#s3.0", %0, !dbg !3562
  br i1 %7, label %L3, label %L, !dbg !3562

L3:                                               ; preds = %pass, %top
  %r.1 = phi double [ 0.000000e+00, %top ], [ %6, %pass ]
  ret double %r.1, !dbg !3565
}

My version info is:

julia> versioninfo()
Julia Version 0.4.0-dev+2847
Commit fc61385 (2015-01-21 18:34 UTC)
Platform Info:
  System: Windows (x86_64-w64-mingw32)
  CPU: Intel(R) Core(TM) i7-2630QM CPU @ 2.00GHz
  WORD_SIZE: 64
  BLAS: libopenblas (USE64BITINT DYNAMIC_ARCH NO_AFFINITY Sandybridge)
  LAPACK: libopenblas
  LIBM: libopenlibm
  LLVM: libLLVM-3.3

eschnett commented 9 years ago

LLVM defines intrinsics for many math operations (sqrt, sin, etc.). It seems we need to generate these intrinsics instead of libcalls to benefit from LLVM's optimizations. Does this sound reasonable? It looks straightforward but tedious.

simonster commented 9 years ago

We would also need to make the LLVM intrinsics call openlibm instead of the system libm.

eschnett commented 9 years ago

Can you remind me why we need to call openlibm? This prevents not only constant folding, but also using hardware instructions if they are available, e.g. vsqrtpd on Intel.

LLVM tries to intercept libcalls to certain well-known functions, such as sqrt in the global namespace. I assume (couldn't find the logic yet) that it thus also replaces intrinsics by such libcalls if necessary. To ensure it calls openlibm instead of libm, we would need to declare these openlibm functions to LLVM's code generators.

eschnett commented 9 years ago

Regarding making LLVM call openlibm instead of libm: if Julia is configured with UNTRUSTED_SYSTEM_LIBM, then it uses openlibm instead of libm -- and this includes code generated by LLVM.

What am I missing?

Given this, adding new intrinsics is easy. I added the sin intrinsic for fun -- it's three lines in src/intrinsics.cpp, plus the corresponding change in base/math.jl.

eschnett commented 9 years ago

See https://github.com/eschnett/julia/tree/math-intrinsics for the prototype implementation.

eschnett commented 9 years ago

I looked at http://www.johnmyleswhite.com/notebook/2013/12/06/writing-type-stable-code-in-julia/ again with this branch, and there is unfortunately no speedup. LLVM correctly recognizes the sin intrinsic, but it still doesn't constant-fold. For reference, here is the inner loop as generated on my system:

L78:    vmovsd  %xmm0, -24(%rbp)
Source line: 12
    vmovsd  -32(%rbp), %xmm0
    callq   *%r14
    vucomisd    %xmm0, %xmm0
    jp  L132
    vmovsd  -24(%rbp), %xmm1
    vaddsd  %xmm0, %xmm1, %xmm1
    vmovaps %xmm1, %xmm0
    decq    %rbx
    jne L78

The call etc. is still there. Thus, to enable constant-folding, it is likely necessary to address #5234.

eschnett commented 9 years ago

Update: No speedup when building against LLVM 3.3 (the default), but with LLVM 3.6, the speedup is restored. Apparently LLVM 3.3 still lacks certain optimizations for sin.

stevengj commented 8 years ago

Note that #14324 is also relevant here — this issue is not really specific to libm functions and LLVM intrinsics.

KristofferC commented 8 years ago

Any idea how to make progress on this? This feels a bit sad (latest master):

f(x) = cos(x) + cos(x)

function f2(x)
    c = cos(x)
    c + c
end

@time for i = 1:10^6 f(2.0) end
#  0.021374 seconds

@time for i = 1:10^6 f2(2.0) end
#  0.010027 seconds
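The same transformation can be written out in any language; here is a Python sketch (names and port mine) of the manual common-subexpression elimination that f2 performs by hand, and which the compiler is not doing automatically for f:

```python
import math

def f(x):
    # Evaluates cos(x) twice; a compiler doing CSE on pure calls
    # would compute it only once.
    return math.cos(x) + math.cos(x)

def f2(x):
    # Manual common-subexpression elimination: hoist the shared
    # subexpression into a local and reuse it.
    c = math.cos(x)
    return c + c

# Both versions compute the identical value; only the number of
# cos evaluations differs.
print(f(2.0), f2(2.0))
```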

stevengj commented 8 years ago

@KristofferC, that particular case would involve CSE (common subexpression elimination) for @pure functions, which is a bit different than the issue here.

oscardssmith commented 3 years ago

At this point, we only use libm for expm1, log2, log10, and some rounding functions. log2 and log10 both could probably be moved to pure Julia pretty easily. I'll look into it.