JuliaMath / DoubleFloats.jl

math with more good bits
MIT License

How to get good evalpoly performance #134

Open heltonmc opened 2 years ago

heltonmc commented 2 years ago

I was hoping to write some specific functions for DoubleFloats using evalpoly routines, but it appears that using BigFloat is significantly faster...


using DoubleFloats, BenchmarkTools

function d64_test(x::Double64)
    c = (
        df64"3.133239376997663645548490085151484674892E16", df64"-5.479944965767990821079467311839107722107E14",
        df64"6.290828903904724265980249871997551894090E12", df64"-3.633750176832769659849028554429106299915E10", 
        df64"1.207743757532429576399485415069244807022E8", df64"-2.107485999925074577174305650549367415465E5"
    )
    return evalpoly(x, c)
end
function big_test(x::BigFloat)
    c = (
        big"3.133239376997663645548490085151484674892E16", big"-5.479944965767990821079467311839107722107E14",
        big"6.290828903904724265980249871997551894090E12", big"-3.633750176832769659849028554429106299915E10", 
        big"1.207743757532429576399485415069244807022E8", big"-2.107485999925074577174305650549367415465E5"
    )
    return evalpoly(x, c)
end

julia> @benchmark d64_test(df64"1.0"*rand())
BenchmarkTools.Trial: 10000 samples with 5 evaluations.
 Range (min … max):  6.167 μs … 430.017 μs  ┊ GC (min … max): 0.00% … 97.86%
 Time  (median):     6.283 μs               ┊ GC (median):    0.00%
 Time  (mean ± σ):   6.459 μs ±   5.374 μs  ┊ GC (mean ± σ):  1.16% ±  1.38%

julia> @benchmark big_test(big"1.0"*rand())
BenchmarkTools.Trial: 10000 samples with 10 evaluations.
 Range (min … max):  1.262 μs … 240.604 μs  ┊ GC (min … max): 0.00% … 99.32%
 Time  (median):     1.304 μs               ┊ GC (median):    0.00%
 Time  (mean ± σ):   1.355 μs ±   3.362 μs  ┊ GC (mean ± σ):  3.50% ±  1.40%

Is there a better way to set up this problem using Double64?

Thank you!

JeffreySarnoff commented 2 years ago

You are getting contaminated results: the benchmark is also timing the construction of the argument and of the coefficient tuple inside the function, not just the polynomial evaluation. There is a better way to benchmark it.

using DoubleFloats, BenchmarkTools

dfc = (
               df64"3.133239376997663645548490085151484674892E16", 
               df64"-5.479944965767990821079467311839107722107E14",
               df64"6.290828903904724265980249871997551894090E12", 
               df64"-3.633750176832769659849028554429106299915E10",
               df64"1.207743757532429576399485415069244807022E8", 
               df64"-2.107485999925074577174305650549367415465E5"
           );
bfc  = (
               big"3.133239376997663645548490085151484674892E16", 
               big"-5.479944965767990821079467311839107722107E14",
               big"6.290828903904724265980249871997551894090E12", 
               big"-3.633750176832769659849028554429106299915E10",
               big"1.207743757532429576399485415069244807022E8", 
               big"-2.107485999925074577174305650549367415465E5"
           );
dfx  = cos(df64"0.5");
bfx  = BigFloat(dfx);

@btime evalpoly(x, c) setup=(x=bfx; c=bfc)

@btime evalpoly(x, c) setup=(x=dfx; c=dfc)

With this I get a ~3.5x speedup using Double64 (Julia Version 1.8.0-DEV.1115):


julia> @btime evalpoly(x, c) setup=(x=bfx; c=bfc)
  572.678 ns (20 allocations: 1.02 KiB)
3.085630375695602261716673360887377923773090914433281851181935837748260287702165e+16

julia> @btime evalpoly(x, c) setup=(x=dfx; c=dfc)
  164.086 ns (1 allocation: 32 bytes)
3.0856303756956024e16

julia> BigFloat(evalpoly(dfx, dfc))
3.0856303756956022617166733608873752103818333125673234462738037109375e+16

julia> Double64(evalpoly(bfx,bfc))
3.0856303756956024e16

julia> BigFloat(ans)
3.0856303756956022617166733608873752103818333125673234462738037109375e+16
heltonmc commented 2 years ago

Great, thank you so much! Benchmarking always seems to be the issue.

I will close this issue, but for some reason I am still seeing a discrepancy in my actual function. This could be an implementation issue that I will check, but I will go ahead and put the code here...


@inline function besselj0_d64_kernel(x::Double64)
    J0_2N = (
        df64"3.133239376997663645548490085151484674892E16", df64"-5.479944965767990821079467311839107722107E14",
        df64"6.290828903904724265980249871997551894090E12", df64"-3.633750176832769659849028554429106299915E10", 
        df64"1.207743757532429576399485415069244807022E8", df64"-2.107485999925074577174305650549367415465E5",
        df64"1.562826808020631846245296572935547005859E2"
    )
    J0_2D = (
        df64"2.005273201278504733151033654496928968261E18", df64"2.063038558793221244373123294054149790864E16", 
        df64"1.053350447931127971406896594022010524994E14",df64"3.496556557558702583143527876385508882310E11", 
        df64"8.249114511878616075860654484367133976306E8",df64"1.402965782449571800199759247964242790589E6", 
        df64"1.619910762853439600957801751815074787351E3", df64"1.0"
    )  
    return evalpoly(x, J0_2N) / (evalpoly(x, J0_2D))
end
@inline function besselj0_d64_kernel(x::BigFloat)
    J0_2N = (
        big"3.133239376997663645548490085151484674892E16", big"-5.479944965767990821079467311839107722107E14",
        big"6.290828903904724265980249871997551894090E12", big"-3.633750176832769659849028554429106299915E10", 
        big"1.207743757532429576399485415069244807022E8", big"-2.107485999925074577174305650549367415465E5",
        big"1.562826808020631846245296572935547005859E2"
    )
    J0_2D = (
        big"2.005273201278504733151033654496928968261E18", big"2.063038558793221244373123294054149790864E16", 
        big"1.053350447931127971406896594022010524994E14",big"3.496556557558702583143527876385508882310E11", 
        big"8.249114511878616075860654484367133976306E8",big"1.402965782449571800199759247964242790589E6", 
        big"1.619910762853439600957801751815074787351E3", big"1.0"
    )  
    return evalpoly(x, J0_2N) / (evalpoly(x, J0_2D))
end

function besselj0(x::T) where {T<:Union{Double64, BigFloat}}
    if x == T(0.0)
        return T(1.0)
    end

    xx = abs(x)
    if xx <= T(2.0)
        z = xx * xx

        p = z * z * besselj0_d64_kernel(z)
        p -= T(0.25) * z
        p += T(1.0)
        return p
    end
end
julia> dfx  = cos(df64"0.5");

julia> bfx  = BigFloat(dfx);

julia> @btime besselj0(x) setup=(x=bfx)
  3.988 μs (74 allocations: 4.05 KiB)
0.8165340145876553610872273288398583420645441786030554404827067685098603756933289

julia> @btime besselj0(x) setup=(x=dfx)
  14.458 μs (76 allocations: 3.93 KiB)
0.8165340145876554

Again, thanks so much @JeffreySarnoff. Always very helpful

JeffreySarnoff commented 2 years ago

You are repeatedly evaluating the coeffs. The string-to-Double64 conversion takes longer than the string-to-BigFloat conversion because the first uses the second. Establish the coeffs outside of the function that evaluates the poly.
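Roughly, that extra cost can be seen by approximating the two conversion paths by hand (the exact string-macro internals may differ; this is only a sketch):

using DoubleFloats, BenchmarkTools

s = "1.207743757532429576399485415069244807022E8"
@btime parse(BigFloat, $s)             # string -> BigFloat
@btime Double64(parse(BigFloat, $s))   # string -> BigFloat -> Double64, strictly more work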

JeffreySarnoff commented 2 years ago

@heltonmc something like this

module BesselJ0

export besselj0

using DoubleFloats

const J0_2N = (
    df64"3.133239376997663645548490085151484674892E16", df64"-5.479944965767990821079467311839107722107E14",
    df64"6.290828903904724265980249871997551894090E12", df64"-3.633750176832769659849028554429106299915E10", 
    df64"1.207743757532429576399485415069244807022E8", df64"-2.107485999925074577174305650549367415465E5",
    df64"1.562826808020631846245296572935547005859E2"
)
const J0_2D = (
    df64"2.005273201278504733151033654496928968261E18", df64"2.063038558793221244373123294054149790864E16", 
    df64"1.053350447931127971406896594022010524994E14",df64"3.496556557558702583143527876385508882310E11", 
    df64"8.249114511878616075860654484367133976306E8",df64"1.402965782449571800199759247964242790589E6", 
    df64"1.619910762853439600957801751815074787351E3", df64"1.0"
)  

@inline function besselj0_kernel(x::Double64)
    return evalpoly(x, J0_2N) / evalpoly(x, J0_2D)
end

function besselj0(x::Double64)
    xx = abs(x)
    if iszero(xx)
        return one(Double64)
    elseif xx > 2.0
        return nothing     
    end

    z = xx * xx

    p = z * z * besselj0_kernel(z)
    p -= 0.25 * z
    p += 1.0
    return p
end

end # module
heltonmc commented 2 years ago

^^^^ For some reason I was thinking (hoping) I could get the constants to be evaluated at compile time if the sizes were known and fixed, but I definitely misjudged this. I will have to check the Float32 and Float64 implementations as well. It seems that setting the polynomial coefficients as module-level constants, as you showed above, is the best way to go. With this change the Double64 version is 5x faster than the BigFloat implementation. Thank you!!
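For the hardware-float paths, a rough sketch (coefficients truncated to Float64, hypothetical function name) of why the same hoisting probably isn't needed there: a tuple of plain numeric literals involves no string parsing, so it is cheap to build inside the function body.

function besselj0_kernel64(z::Float64)
    # same coefficients as above, truncated to Float64 literals
    J0_2N = (3.1332393769976636e16, -5.479944965767991e14, 6.290828903904724e12,
             -3.6337501768327697e10, 1.2077437575324296e8, -2.1074859999250746e5,
             1.5628268080206318e2)
    J0_2D = (2.0052732012785047e18, 2.0630385587932212e16, 1.053350447931128e14,
             3.4965565575587026e11, 8.249114511878616e8, 1.4029657824495718e6,
             1.6199107628534396e3, 1.0)
    return evalpoly(z, J0_2N) / evalpoly(z, J0_2D)
end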

uniment commented 1 year ago

You can also evaluate constants outside the function body and then capture them with one of the following:

# local namespace; declare globals explicitly
let π = Double64(π)
    global c(r::Double64) = 2π*r
end
# global namespace; declare locals explicitly
begin local π = Double64(pi)
    c(r::Double64) = 2π*r
end

What's nice about this is that whatever computation is involved in calculating the constant can be expressed in the code for clarity without polluting the global namespace. Values can be computed in high precision with BigFloat or BigInt and then converted to the target precision, all outside the function body.
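For example (a small sketch in the same spirit; the constant and the function name circumference are made up), a constant can be computed at BigFloat precision outside the function body, converted to Double64, and captured by the closure:

using DoubleFloats

let twoπ = Double64(2 * BigFloat(π))
    global circumference(r::Double64) = twoπ * r   # the closure sees only the converted constant
end

circumference(Double64(1))   # 2π as a Double64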

heltonmc commented 1 year ago

This is an interesting application, thank you. https://github.com/JuliaMath/DoubleFloats.jl/pull/162 should close this issue once merged, since that was the original problem.

I think my general stumbling block is writing a module that doesn't have to explicitly load these other packages (DoubleFloats.jl, Quadmath.jl, etc.), so I can keep the constants as generic as possible (there are usually hundreds across many functions). It would be nice to just store them as strings or BigFloats and let them be converted to whatever type the user wants (preferably at compile time). I have yet to find a completely generic way of doing that.
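One possible pattern (just a sketch, not something from this thread; the names J0_2N_STR and j0_coeffs are made up) is to store the coefficients once as strings and convert them to the caller's element type inside an @generated function, so the conversion happens when the method is compiled for that type rather than on every call:

using DoubleFloats

const J0_2N_STR = (
    "3.133239376997663645548490085151484674892E16",
    "-5.479944965767990821079467311839107722107E14",
    "6.290828903904724265980249871997551894090E12",
)

@generated function j0_coeffs(::Type{T}) where {T<:AbstractFloat}
    # runs once per element type T: parse through BigFloat, then convert to T
    vals = Tuple(T(parse(BigFloat, s)) for s in J0_2N_STR)
    return :($vals)
end

j0_kernel(x::T) where {T<:AbstractFloat} = evalpoly(x, j0_coeffs(T))

j0_kernel(Double64(0.5))   # coefficients materialized once per type

One caveat: the conversion above depends on the BigFloat precision in effect when the method is first generated for a given type.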