JuliaSIMD / Polyester.jl

The cheapest threads you can find!

Performance regression on Julia 1.8 with `--check-bounds=no` and views #109

Closed kamesy closed 1 year ago

kamesy commented 1 year ago

On Julia 1.8 there seems to be an odd performance regression/issue when the session is started with --check-bounds=no and the @batch loop contains views:

using Polyester # v0.7.2
using BenchmarkTools

function mean2_batch!(y::AbstractVector, x::AbstractMatrix)
    m = size(x, 2)
    a = 1 / m
    @batch for i in eachindex(y)
        x̄ = x[i,1]
        for j in 2:m
            x̄ += x[i,j]
        end
        y[i] = a * x̄
    end
    return y
end

function mean2_batch_view!(y::AbstractVector, x::AbstractMatrix)
    m = size(x, 2)
    a = 1 / m
    @batch for i in eachindex(y)
        v = view(x, i, :)
        x̄ = v[1]
        for j in 2:m
            x̄ += v[j]
        end
        y[i] = a * x̄
    end
    return y
end

n, m = (300^3, 48)
x = randn(n, m)
y = randn(n)
$ julia +lts -t auto --check-bounds=no

1.6.7> versioninfo()
Julia Version 1.6.7
Commit 3b76b25b64 (2022-07-19 15:11 UTC)
Platform Info:
  OS: Linux (x86_64-pc-linux-gnu)
  CPU: Intel(R) Xeon(R) W-2195 CPU @ 2.30GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-11.0.1 (ORCJIT, skylake-avx512)

1.6.7> @benchmark mean2_batch!($y, $x)
BenchmarkTools.Trial: 27 samples with 1 evaluation.
 Range (min … max):  184.588 ms … 229.702 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     186.495 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   188.242 ms ±   8.316 ms  ┊ GC (mean ± σ):  0.00% ± 0.00%

    █                                                            
  ▃▄█▆▅▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▃ ▁
  185 ms           Histogram: frequency by time          230 ms <

 Memory estimate: 64 bytes, allocs estimate: 2.

1.6.7> @benchmark mean2_batch_view!($y, $x)
BenchmarkTools.Trial: 27 samples with 1 evaluation.
 Range (min … max):  185.563 ms … 189.317 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     186.459 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   186.654 ms ± 712.315 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

             ▂ ▂█                                                
  ▅▁▁▁▁▅▁▁▁▁▅██████▁▁▅▅▁▁▅▁▁▁▁▁▁▅▁▁▁▅▅▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▅ ▁
  186 ms           Histogram: frequency by time          189 ms <

 Memory estimate: 64 bytes, allocs estimate: 2.
$ julia +1.7 -t auto --check-bounds=no

1.7.3> versioninfo()
Julia Version 1.7.3
Commit 742b9abb4d (2022-05-06 12:58 UTC)
Platform Info:
  OS: Linux (x86_64-pc-linux-gnu)
  CPU: Intel(R) Xeon(R) W-2195 CPU @ 2.30GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-12.0.1 (ORCJIT, skylake-avx512)

1.7.3> @benchmark mean2_batch!($y, $x)
BenchmarkTools.Trial: 27 samples with 1 evaluation.
 Range (min … max):  184.075 ms … 187.465 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     186.109 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   186.218 ms ± 684.829 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

                                     █▂▂                         
  ▅▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▅▅▁▅▁▁▁▅█████▁▁▅▁▅▁▁▁▁▁▁▁▁▁▅▁▁▁▁█▅▅ ▁
  184 ms           Histogram: frequency by time          187 ms <

 Memory estimate: 64 bytes, allocs estimate: 2.

1.7.3> @benchmark mean2_batch_view!($y, $x)
BenchmarkTools.Trial: 27 samples with 1 evaluation.
 Range (min … max):  185.316 ms … 187.682 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     186.149 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   186.246 ms ± 589.697 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

                  ▁▄   ▁█ ▁                              ▁       
  ▆▁▁▁▆▁▁▁▆▆▁▁▁▁▁▆██▆▆▆██▁█▁▁▆▁▁▆▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▆▁▁▁█▁▁▁▁▆ ▁
  185 ms           Histogram: frequency by time          188 ms <

 Memory estimate: 64 bytes, allocs estimate: 2.
$ julia +1.8 -t auto --check-bounds=auto

1.8.5> versioninfo()
Julia Version 1.8.5
Commit 17cfb8e65ea (2023-01-08 06:45 UTC)
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  CPU: 36 × Intel(R) Xeon(R) W-2195 CPU @ 2.30GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-13.0.1 (ORCJIT, skylake-avx512)
  Threads: 36 on 36 virtual cores

1.8.5> @benchmark mean2_batch!($y, $x)
BenchmarkTools.Trial: 27 samples with 1 evaluation.
 Range (min … max):  189.473 ms … 206.379 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     189.790 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   191.797 ms ±   4.838 ms  ┊ GC (mean ± σ):  0.00% ± 0.00%

  █▁                                                             
  ██▁▁▃▄▄▁▁▁▁▁▁▁▁▁▁▁▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▄ ▁
  189 ms           Histogram: frequency by time          206 ms <

 Memory estimate: 64 bytes, allocs estimate: 2.

1.8.5> @benchmark mean2_batch_view!($y, $x)
BenchmarkTools.Trial: 27 samples with 1 evaluation.
 Range (min … max):  184.251 ms … 201.577 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     186.193 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   187.893 ms ±   4.647 ms  ┊ GC (mean ± σ):  0.00% ± 0.00%

       ▂█                                                        
  ▄▁▁▁▁███▄▁▇▁▁▁▁▁▁▁▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▄▁▁▁▁▁▄▁▄ ▁
  184 ms           Histogram: frequency by time          202 ms <

 Memory estimate: 64 bytes, allocs estimate: 2.
$ julia +1.8 -t auto --check-bounds=no

1.8.5> versioninfo()
Julia Version 1.8.5
Commit 17cfb8e65ea (2023-01-08 06:45 UTC)
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  CPU: 36 × Intel(R) Xeon(R) W-2195 CPU @ 2.30GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-13.0.1 (ORCJIT, skylake-avx512)
  Threads: 36 on 36 virtual cores

1.8.5> @benchmark mean2_batch!($y, $x)
BenchmarkTools.Trial: 26 samples with 1 evaluation.
 Range (min … max):  189.297 ms … 236.241 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     189.997 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   192.468 ms ±   9.116 ms  ┊ GC (mean ± σ):  0.00% ± 0.00%

  █                                                              
  █▅▅▃▁▁▃▁▁▁▁▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▃ ▁
  189 ms           Histogram: frequency by time          236 ms <

 Memory estimate: 64 bytes, allocs estimate: 2.

1.8.5> @benchmark mean2_batch_view!($y, $x)
BenchmarkTools.Trial: 9 samples with 1 evaluation.
 Range (min … max):  614.619 ms … 625.852 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     615.136 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   618.755 ms ±   5.137 ms  ┊ GC (mean ± σ):  0.00% ± 0.00%

  ▁██            ▁                                        ▁  ▁▁  
  ███▁▁▁▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█▁▁██ ▁
  615 ms           Histogram: frequency by time          626 ms <

 Memory estimate: 64 bytes, allocs estimate: 2.

For comparison, the same two loops with Threads.@threads in place of @batch (as sketched below; both full definitions also appear in the script in a later comment):
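
A sketch of the non-view variant, matching the mean2_batch! kernel above with only the macro swapped (mean2_threads_view! is the analogous change to mean2_batch_view!):

function mean2_threads!(y::AbstractVector, x::AbstractMatrix)
    m = size(x, 2)
    a = 1 / m
    Threads.@threads for i in eachindex(y)
        x̄ = x[i,1]
        for j in 2:m
            x̄ += x[i,j]
        end
        y[i] = a * x̄
    end
    return y
end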

$ julia +1.8 -t auto --check-bounds=no

1.8.5> @benchmark mean2_threads!($y, $x)
BenchmarkTools.Trial: 31 samples with 1 evaluation.
 Range (min … max):  159.370 ms … 190.527 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     160.378 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   166.033 ms ±   9.451 ms  ┊ GC (mean ± σ):  0.00% ± 0.00%

  ▂█                                                             
  ██▃▃▃▁▃▁▃▁▁▁▁▃▁▁▁▁▃▁▁▁▁▁▁▃▁▁▁▃▁▁▃▃▁▁▁▁▃▁▁▁▁▁▁▁▁▁▁▁▁▁▃▁▁▁▃▁▁▁▃ ▁
  159 ms           Histogram: frequency by time          191 ms <

 Memory estimate: 19.45 KiB, allocs estimate: 217.

1.8.5> @benchmark mean2_threads_view!($y, $x)
BenchmarkTools.Trial: 31 samples with 1 evaluation.
 Range (min … max):  159.572 ms … 191.850 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     160.891 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   164.210 ms ±   7.682 ms  ┊ GC (mean ± σ):  0.00% ± 0.00%

  █▆█▁                                                           
  ████▇▁▁▄▄▁▁▁▄▄▁▁▁▁▁▁▁▁▁▁▇▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▄▁▁▁▁▁▁▁▁▁▄ ▁
  160 ms           Histogram: frequency by time          192 ms <

 Memory estimate: 19.45 KiB, allocs estimate: 217.
chriselrod commented 1 year ago

If @batch is giving worse performance than Threads.@threads, you could try @batch per=thread. Also, @simd would help on the innermost loop.
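
A rough sketch of those two suggestions combined (the function name here is hypothetical, not taken from the thread; the zero-initialized accumulator follows the note further down):

# Sketch: @batch per=thread splits the iteration space into one chunk per
# thread, and @simd hints vectorization of the inner reduction loop.
function mean2_batch_simd!(y::AbstractVector, x::AbstractMatrix)
    m = size(x, 2)
    a = 1 / m
    @batch per=thread for i in eachindex(y)
        x̄ = zero(eltype(x))
        @simd for j in 1:m
            x̄ += x[i,j]
        end
        y[i] = a * x̄
    end
    return y
end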

For this particular example

using LoopVectorization

function mean2_tturbo!(y::AbstractVector, x::AbstractMatrix)
    m = size(x, 2)
    a = 1 / m
    @tturbo for i in eachindex(y)
        x̄ = zero(eltype(x))
        for j in 1:m
            x̄ += x[i,j]
        end
        y[i] = a * x̄
    end
    return y
end

should be much faster. Note that

for i in eachindex(y)
    x̄ = zero(eltype(x))
    for j in 1:m
        x̄ += x[i,j]

should generally be faster than

for i in eachindex(y)
    x̄ = x[i,1]
    for j in 2:m
        x̄ += x[i,j]

when using @simd, @fastmath, or @turbo.

chriselrod commented 1 year ago

I can reproduce the view version being slow with --check-bounds=no. In fact, it hasn't terminated for me yet. EDIT: I just got a segfault, so my current guess is that it is slow because it is wrong and is working with junk data.
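
One quick way to test the junk-data hypothesis is to compare the threaded result against a plain column-wise mean; a minimal sketch (this reference check is not from the original thread):

using Statistics

yref = vec(mean(x, dims=2))   # single-threaded reference result
mean2_batch_view!(y, x)       # threaded version under suspicion
maximum(abs.(y .- yref))      # should be at rounding-error level if the kernel is correct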

chriselrod commented 1 year ago

For convenience, a script to time them all:

using Polyester, LoopVectorization
using BenchmarkTools  # for the @benchmark calls at the end

function mean2_batch!(y::AbstractVector, x::AbstractMatrix)
    m = size(x, 2)
    a = 1 / m
    @batch for i in eachindex(y)
        x̄ = x[i,1]
        for j in 2:m
            x̄ += x[i,j]
        end
        y[i] = a * x̄
    end
    return y
end

function mean2_batch_view!(y::AbstractVector, x::AbstractMatrix)
    m = size(x, 2)
    a = 1 / m
    @batch for i in eachindex(y)
        v = view(x, i, :)
        x̄ = v[1]
        for j in 2:m
            x̄ += v[j]
        end
        y[i] = a * x̄
    end
    return y
end

function mean2_threads!(y::AbstractVector, x::AbstractMatrix)
    m = size(x, 2)
    a = 1 / m
    Threads.@threads for i in eachindex(y)
        x̄ = x[i,1]
        for j in 2:m
            x̄ += x[i,j]
        end
        y[i] = a * x̄
    end
    return y
end

function mean2_threads_view!(y::AbstractVector, x::AbstractMatrix)
    m = size(x, 2)
    a = 1 / m
    Threads.@threads for i in eachindex(y)
        v = view(x, i, :)
        x̄ = v[1]
        for j in 2:m
            x̄ += v[j]
        end
        y[i] = a * x̄
    end
    return y
end

function mean2_tturbo!(y::AbstractVector, x::AbstractMatrix)
    m = size(x, 2)
    a = 1 / m
    @tturbo for i in eachindex(y)
        x̄ = zero(eltype(x))
        for j in 1:m
            x̄ += x[i,j]
        end
        y[i] = a * x̄
    end
    return y
end

n, m = (300^3, 48);
x = randn(n, m);
y = randn(n);

@time mean2_batch!(y, x);
@time mean2_batch_view!(y, x);
@time mean2_threads!(y, x);
@time mean2_threads_view!(y, x);
@time mean2_tturbo!(y, x);

@time mean2_batch!(y, x);
@time mean2_batch!(y, x);
@time mean2_batch_view!(y, x);
@time mean2_batch_view!(y, x);
@time mean2_threads!(y, x);
@time mean2_threads!(y, x);
@time mean2_threads_view!(y, x);
@time mean2_threads_view!(y, x);
@time mean2_tturbo!(y, x);
@time mean2_tturbo!(y, x);

@benchmark mean2_batch!($y, $x)
@benchmark mean2_batch_view!($y, $x)
@benchmark mean2_threads!($y, $x)
@benchmark mean2_threads_view!($y, $x)
@benchmark mean2_tturbo!($y, $x)

FWIW, using --check-bounds=no also gives me really bad performance for mean2_batch!.

Anyway, just using Cthulhu.@descend shows the problem is that type inference fails when --check-bounds=no is set.
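
For reference, a minimal sketch of that kind of inspection (assuming Cthulhu.jl is installed; the exact invocation is not quoted in the thread):

using Cthulhu

# Descend interactively into the view-based kernel and look for
# non-concrete types in the closure Polyester builds for the loop body.
@descend mean2_batch_view!(y, x)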

I'd say it's a base Julia issue, not a Polyester issue.

chriselrod commented 1 year ago

Closing in favor of: https://github.com/JuliaLang/julia/issues/49472

kamesy commented 1 year ago

Awesome. Thanks!