
simd performance regression in 1.10+rc1 relative to 1.9.4 #52307

Open lmiq opened 11 months ago

lmiq commented 11 months ago

Related to this discourse thread:

https://discourse.julialang.org/t/trivial-port-from-c-rust-to-julia/105787/46?u=lmiq

The code posted here runs much faster in 1.9.4 than in 1.10+rc1:

% time julia +1.9 --startup-file=no lv2_c.jl 0 1000 0.01 200
typeof(rng) = MarsagliaRng
Mean IS=0.04462564329047784  OOS=-0.0031496723033457423  Bias=0.047775315593823586
 11.083806 seconds (169.93 k allocations: 16.234 MiB, 1.22% compilation time)

real    0m11,553s
user    0m11,605s
sys 0m0,318s
% time julia +1.10 --startup-file=no lv2_c.jl 0 1000 0.01 200
typeof(rng) = MarsagliaRng
Mean IS=0.04462564329047784  OOS=-0.0031496723033457423  Bias=0.047775315593823586
 42.888539 seconds (168.42 k allocations: 16.271 MiB, 0.38% compilation time)

real    0m43,412s
user    0m43,523s
sys 0m0,291s

The issue is related to how the loop that starts with for i in (ilong:ncases-2) is lowered. If one adds @simd to that loop, the timings become:

% time julia +1.9 --startup-file=no lv2_d.jl 0 1000 0.01 200
typeof(rng) = MarsagliaRng
Mean IS=0.04462564329047785  OOS=-0.0031496723033457423  Bias=0.04777531559382359
  4.091737 seconds (179.15 k allocations: 16.861 MiB, 3.62% compilation time)

real    0m4,544s
user    0m4,618s
sys 0m0,314s
% time julia +1.10 --startup-file=no lv2_d.jl 0 1000 0.01 200
typeof(rng) = MarsagliaRng
Mean IS=0.04462564329047785  OOS=-0.0031496723033457423  Bias=0.04777531559382359
  3.953665 seconds (177.21 k allocations: 16.855 MiB, 4.40% compilation time)

real    0m4,380s
user    0m4,486s
sys 0m0,297s
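For anyone who wants to experiment without the full script, a minimal self-contained kernel with the same loop shape (my sketch; names are illustrative and not from the benchmark) is:

# Sketch of the hot loop: a Float64 reduction over windowed means with
# data-dependent branches, using prefix sums like the real code.
function total_ret(x::Vector{Float64}, xs::Vector{Float64}, ishort::Int, ilong::Int)
    total = 0.0
    for i in ilong:length(x)-2   # the loop that regresses; try adding @simd here
        short_mean = (xs[i+1] - xs[i-ishort+1]) / ishort
        long_mean = (xs[i+1] - xs[i-ilong+1]) / ilong
        ret = short_mean > long_mean ? x[i+2] - x[i+1] :
              short_mean < long_mean ? x[i+1] - x[i+2] : 0.0
        total += ret
    end
    return total
end

x = rand(10_000)
xs = cumsum(x)
@time total_ret(x, xs, 5, 50)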

The code:

using Random
using Printf

struct ParamsResult
    short_term::Int
    long_term::Int
    performance::Float64
end

mutable struct MarsagliaRng
    q::Vector{UInt32}
    carry::UInt32
    mwc256_initialized::Bool
    mwc256_seed::Int32
    i::UInt8

    function MarsagliaRng(seed::Vector{UInt8})
        q = zeros(UInt32, 256)
        carry = 362436
        mwc256_initialized = false
        mwc256_seed = reinterpret(Int32, seed)[1]
        i = 255
        new(q, carry, mwc256_initialized, mwc256_seed, i)
    end
end

# Default constructor using the system clock for seeding
function MarsagliaRng()
    ts = time_ns() % UInt32
    seed = UInt8[ts & 0xff, (ts >>> 8) & 0xff, (ts >>> 16) & 0xff, (ts >>> 24) & 0xff]
    MarsagliaRng(seed)
end

# MWC256 (Marsaglia multiply-with-carry) generator functions
function next_u32(rng::MarsagliaRng)::UInt32
    a = UInt64(809430660)

    if !rng.mwc256_initialized
        j = rng.mwc256_seed % UInt32  # wrap rather than throw for negative seeds
        c1 = UInt32(69069)
        c2 = UInt32(12345)
        rng.mwc256_initialized = true
        for k in 1:256
            j = (c1 * j + c2) % UInt32
            rng.q[k] = j
        end
    end

    rng.i = (rng.i + 1) % 256
    t = a * UInt64(rng.q[rng.i+1]) + UInt64(rng.carry)
    rng.carry = (t >> 32)
    rng.q[rng.i+1] = t % UInt32

    return rng.q[rng.i+1]
end

function next_u64(rng::MarsagliaRng)::UInt64
    UInt64(next_u32(rng))
end

# EDIT: extend Base.rand rather than shadowing it.
# Sample function for generating a Float64
function Base.rand(rng::MarsagliaRng)::Float64
    mult = 1.0 / typemax(UInt32)
    mult * next_u32(rng)
end

function opt_params(which, ncases::Integer, x::Vector{Float64})
    FloatT = eltype(x)
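    # Prefix sums: each windowed sum below is a difference of two xs entries.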
    xs = cumsum(x)
    best_perf = typemin(FloatT)
    ibestshort = 0
    ibestlong = 0

    small_float = 1e-60
    for ilong in 2:199
        for ishort in 1:(ilong-1)
            total_return = zero(FloatT)
            win_sum = small_float
            lose_sum = small_float
            sum_squares = small_float
            short_sum = zero(FloatT)
            long_sum = zero(FloatT)

            let i = ilong - 1
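                # First window (i == ilong - 1): the long window starts at
                # x[1], so long_sum is simply xs[i+1].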
                short_sum = xs[i+1] - xs[i-ishort+1]
                long_sum = xs[i+1]

                short_mean = short_sum / ishort
                long_mean = long_sum / ilong

                ret = (short_mean > long_mean) ? x[i+2] - x[i+1] :
                      (short_mean < long_mean) ? x[i+1] - x[i+2] : zero(FloatT)

                total_return += ret
                sum_squares += ret^2
                if ret > zero(FloatT)
                    win_sum += ret
                else
                    lose_sum -= ret
                end
            end

            # General case; i != ilong - 1
            # Incremental alternative to the prefix-sum updates below:
            #   short_sum += x1 - x[i-ishort+1]
            #   long_sum += x1 - x[i-ilong+1]
            # @simd for i in (ilong:ncases-2) # for fast code
            for i in (ilong:ncases-2)
                x1 = x[i+1]
                short_sum = xs[i+1] - xs[i-ishort+1]
                long_sum = xs[i+1] - xs[i-ilong+1]

                short_mean = short_sum / ishort
                long_mean = long_sum / ilong

                ret = (short_mean > long_mean) ? x[i+2] - x1 :
                      (short_mean < long_mean) ? x1 - x[i+2] : zero(FloatT)

                total_return += ret
                sum_squares += ret^2
                if ret > zero(FloatT)
                    win_sum += ret
                else
                    lose_sum -= ret
                end
            end

            if which == 0
                total_return /= (ncases - ilong)
                if total_return > best_perf
                    best_perf = total_return
                    ibestshort = ishort
                    ibestlong = ilong
                end
            elseif which == 1
                pf = win_sum / lose_sum
                if pf > best_perf
                    best_perf = pf
                    ibestshort = ishort
                    ibestlong = ilong
                end
            elseif which == 2
                total_return /= (ncases - ilong)
                sum_squares /= (ncases - ilong)
                sum_squares -= total_return^2
                sr = total_return / (sqrt(sum_squares) + 1e-8)
                if sr > best_perf
                    best_perf = sr
                    ibestshort = ishort
                    ibestlong = ilong
                end
            end
        end
    end 

    ParamsResult(ibestshort, ibestlong, best_perf)
end

function test_system(ncases::Integer, x::Vector{Float64}, short_term::Integer, long_term::Integer)
    FloatT = eltype(x)
    sum1 = zero(FloatT)

    for i in (long_term-1:ncases-2)
        short_mean = sum(@view x[i-short_term+2:i+1]) / short_term
        long_mean = sum(@view x[i-long_term+2:i+1]) / long_term

        if short_mean > long_mean
            sum1 += x[i+2] - x[i+1]
        elseif short_mean < long_mean
            sum1 -= x[i+2] - x[i+1]
        end

    end
    sum1 / (ncases - long_term)
end

function main()
    if length(ARGS) < 4
        println("Usage: julia script.jl <which> <ncases> <trend> <nreps>")
        return
    end

    which = parse(Int, ARGS[1])
    ncases = parse(Int, ARGS[2])
    save_trend = parse(Float64, ARGS[3])
    nreps = parse(Int, ARGS[4])

    optimize(which, ncases, save_trend, nreps)
end

function optimize(which::Integer, ncases::Integer, save_trend::Float64, nreps::Integer, rng=MarsagliaRng(UInt8.([33, 0, 0, 0])))
    @show typeof(rng)
    _optimize(rng, which, ncases, save_trend, nreps)
end

function _optimize(rng, which::Integer, ncases::Integer, save_trend::Float64, nreps::Integer)
    print_results = false

    FloatT = typeof(save_trend)
    x = zeros(FloatT, ncases)

    is_mean = zero(FloatT)
    oos_mean = zero(FloatT)
    futures = Task[]

    for irep in 1:nreps
        # Generate in-sample
        trend = save_trend
        x[1] = zero(FloatT)
        for i in 2:ncases
            if (i - 1) % 50 == 0
                trend = -trend
            end
            x[i] = x[i-1] + trend + rand(rng) + rand(rng) - rand(rng) - rand(rng)
        end
        x_optim = copy(x)

        # Generate out-of-sample
        trend = save_trend
        x[1] = zero(FloatT)
        for i in 2:ncases
            if (i - 1) % 50 == 0
                trend = -trend
            end
            x[i] = x[i-1] + trend + rand(rng) + rand(rng) - rand(rng) - rand(rng)
        end
        x_oos = copy(x)
        future = Threads.@spawn begin
            params_result = opt_params(which, ncases, x_optim)
            oos_perf = test_system(ncases, x_oos, params_result.short_term, params_result.long_term)
            (params_result, oos_perf)
        end
        push!(futures, future)

    end
    for (irep, future) in enumerate(futures)
        params_result::ParamsResult, oos_perf::Float64 = fetch(future)
        is_mean += params_result.performance
        oos_mean += oos_perf
        if print_results
            @printf("%3d: %3d %3d  %8.4f %8.4f (%8.4f)\n",
                    irep, params_result.short_term, params_result.long_term, params_result.performance, oos_perf, params_result.performance - oos_perf)
        end
    end

    is_mean /= nreps
    oos_mean /= nreps

    println("Mean IS=$is_mean  OOS=$oos_mean  Bias=$(is_mean - oos_mean)")

end

@time main()
lmiq commented 11 months ago

Version 1.10.0-alpha1 already had the regression:

% time julia +1.10.0-alpha1 --startup-file=no lv2_c.jl 0 1000 0.01 200
typeof(rng) = MarsagliaRng
Mean IS=0.04462564329047784  OOS=-0.0031496723033457423  Bias=0.047775315593823586
 43.162031 seconds (180.03 k allocations: 16.689 MiB, 0.38% compilation time)

real    0m43,579s
user    0m43,634s
sys 0m0,347s
lmiq commented 11 months ago
julia> versioninfo()
Julia Version 1.10.0-rc1
Commit 5aaa9485436 (2023-11-03 07:44 UTC)
Build Info:
  Official https://julialang.org/ release
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  CPU: 8 × Intel(R) Core(TM) i7-8550U CPU @ 1.80GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-15.0.7 (ORCJIT, skylake)
  Threads: 1 on 8 virtual cores
Environment:
  JULIA_EDITOR = vim
ericphanson commented 11 months ago

Maybe https://github.com/JuliaLang/julia/pull/49405? I see the first tag containing that PR is 1.10-alpha1, so that adds up.

  • The @simd macro now has more limited and clearer semantics: it only enables reordering and contraction of floating-point operations, instead of turning on all "fastmath" optimizations. If you observe performance regressions due to this change, you can recover previous behavior with @fastmath @simd, if you are OK with all the optimizations enabled by the @fastmath macro. ([#49405])
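If that PR is the cause, the quoted NEWS entry suggests the old behavior can be recovered along these lines (a sketch on an illustrative kernel, not the issue's code):

# Per the NEWS entry above, @fastmath @simd re-enables the fastmath flags
# that @simd alone no longer implies on 1.10. `acc` is a made-up example.
function acc(x::Vector{Float64})
    s = 0.0
    @fastmath @simd for i in eachindex(x)
        s += x[i]
    end
    return s
end

acc(rand(1000))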
gbaraldi commented 11 months ago

I would also check the LLVM bump.
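One way to check (a sketch, not from this thread) is to compare the IR emitted for a small kernel on 1.9 and 1.10 and look for vector instructions:

using InteractiveUtils  # provides @code_llvm

# Illustrative reduction kernel; vectorized IR contains <4 x double>
# (or wider) operations, scalar IR does not.
function kernel(x::Vector{Float64})
    s = 0.0
    @simd for i in eachindex(x)
        s += x[i]
    end
    return s
end

@code_llvm debuginfo=:none kernel(rand(1024))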

lmiq commented 11 months ago

I should add that on this newer machine I don't see the regression:


julia> versioninfo()
Julia Version 1.10.0-rc1
Commit 5aaa9485436 (2023-11-03 07:44 UTC)
Build Info:
  Official https://julialang.org/ release
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  CPU: 16 × 13th Gen Intel(R) Core(TM) i7-1360P
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-15.0.7 (ORCJIT, goldmont)
  Threads: 1 on 16 virtual cores
giordano commented 11 months ago

Can it be related to #51814?