JuliaSIMD / LoopVectorization.jl

Macro(s) for vectorizing loops.
MIT License
745 stars 67 forks source link

Invalid index error & no method matching vrem_fast #244

Open carstenbauer opened 3 years ago

carstenbauer commented 3 years ago

MWE:

using LoopVectorization

spin_conf = rand((-1,1),64,64);

"""
    get(spin_conf, x, y)

Read the spin at lattice position `(x, y)` with periodic (wrap-around)
boundary conditions: indices outside `1:Nx` / `1:Ny` wrap to the opposite
edge. Note this shadows `Base.get` when defined at top level.
"""
function get(spin_conf, x, y)
    (Nx, Ny) = size(spin_conf)
    # `mod1` maps any integer onto 1:N (one-based modulus). It is the
    # idiomatic spelling of the original `(x-1+Nx)%Nx + 1` and, unlike that
    # form, is also correct for offsets below `1 - N`.
    # `@inbounds` is safe: mod1 guarantees the indices are in range.
    @inbounds spin_conf[mod1(x, Nx), mod1(y, Ny)]
end

"""
    energy(spin_conf)

Total Ising energy of `spin_conf` under periodic boundaries: for each site,
couple it to its right and down neighbours (via the wrapping `get`), so each
bond is counted exactly once. This is the MWE that triggers the reported
`@avx` failure: `get` is not inlined by the macro, so a vector index reaches
`Base.getindex` (the "invalid index: Vec" error in the stacktrace below).
"""
function energy(spin_conf)
    (Nx, Ny) = size(spin_conf)
    # Int accumulator: spins are ±1, so the sum stays type-stable.
    res = 0
    @avx for i = 1:Nx
        for j = 1:Ny
            res += -get(spin_conf,i,j)*(get(spin_conf,i+1,j) + get(spin_conf,i,j+1))
        end
    end
    return res
end

Trying to run energy(spin_conf) I get

julia> energy(spin_conf)
ERROR: ArgumentError: invalid index: VectorizationBase.Vec{4, Int64}<1, 2, 3, 4> of type VectorizationBase.Vec{4, Int64}
Stacktrace:
  [1] to_index(i::VectorizationBase.Vec{4, Int64})
    @ Base ./indices.jl:300
  [2] to_index(A::Matrix{Int64}, i::VectorizationBase.Vec{4, Int64})
    @ Base ./indices.jl:277
  [3] to_indices
    @ ./indices.jl:333 [inlined]
  [4] to_indices
    @ ./indices.jl:324 [inlined]
  [5] getindex
    @ ./abstractarray.jl:1170 [inlined]
  [6] get(spin_conf::Matrix{Int64}, x::VectorizationBase.MM{4, 1, Int64}, y::Int64)
    @ Main ./REPL[3]:3
  [7] macro expansion
    @ ~/.julia/packages/LoopVectorization/O8WW6/src/reconstruct_loopset.jl:630 [inlined]
  [8] _avx_!
    @ ~/.julia/packages/LoopVectorization/O8WW6/src/reconstruct_loopset.jl:630 [inlined]
  [9] energy(spin_conf::Matrix{Int64})
    @ Main ./REPL[4]:4
 [10] top-level scope
    @ REPL[6]:1
 [11] eval_user_input(ast::Any, backend::REPL.REPLBackend)
    @ REPL ~/.julia/sysimgs/std.so:-1
 [12] repl_backend_loop(backend::REPL.REPLBackend)
    @ REPL ~/.julia/sysimgs/std.so:-1
 [13] start_repl_backend(backend::REPL.REPLBackend, consumer::Any)
    @ REPL ~/.julia/sysimgs/std.so:-1
 [14] run_repl(repl::REPL.AbstractREPL, consumer::Any; backend_on_current_task::Bool)
    @ REPL ~/.julia/sysimgs/std.so:-1
 [15] run_repl(repl::REPL.AbstractREPL, consumer::Any)
    @ REPL ~/.julia/sysimgs/std.so:-1
 [16] (::Base.var"#874#876"{Bool, Bool, Bool})(REPL::Module)
    @ Base ~/.julia/sysimgs/std.so:-1
 [17] run_main_repl(interactive::Bool, quiet::Bool, banner::Bool, history_file::Bool, color_set::Bool)
    @ Base ~/.julia/sysimgs/std.so:-1
 [18] exec_options(opts::Base.JLOptions)
    @ Base ~/.julia/sysimgs/std.so:-1
 [19] _start()
    @ Base ~/.julia/sysimgs/std.so:-1

"Manually inlining" the get function:

"""
    energy(spin_conf)

Variant of `energy` with the wrapping-index helper "manually inlined" so that
`@avx` can see the arithmetic; this is the form that hits the missing
`vrem_fast` method reported below.

NOTE(review): the inlining dropped the `+ 1` from `get`'s
`(x-1+Nx)%Nx + 1`, so `i0`/`i1`/`j0`/`j1` are 0-based (e.g. for `i == 1`,
`i0 = (0 + Nx) % Nx == 0`), and `spin_conf[i0, j0]` indexes out of bounds.
A faithful inlining would use `spin_conf[i0+1, j0+1]` etc.
"""
function energy(spin_conf)
    (Nx, Ny) = size(spin_conf)
    res = 0
    @avx for i = 1:Nx
        for j = 1:Ny
            # Wrapped neighbour indices (off by one — see NOTE above).
            i0 = (i-1+Nx)%Nx
            i1 = (i+Nx)%Nx
            j0 = (j-1+Ny)%Ny
            j1 = (j+Ny)%Ny
            res += -spin_conf[i0,j0]*(spin_conf[i1,j0] + spin_conf[i0,j1])
        end
    end
    return res
end

I obtain

julia> energy(spin_conf)
ERROR: MethodError: no method matching vrem_fast(::VectorizationBase.Vec{4, Int64}, ::VectorizationBase.Vec{4, Int64})
Stacktrace:
  [1] rem_fast
    @ ~/.julia/packages/VectorizationBase/ALPNu/src/base_defs.jl:87 [inlined]
  [2] macro expansion
    @ ~/.julia/packages/LoopVectorization/O8WW6/src/reconstruct_loopset.jl:630 [inlined]
  [3] _avx_!(#unused#::Val{(false, 0, 0, false, 4, 32, 15, 64, 32768, 262144, 12582912, 0x0000000000000001)}, #unused#::Val{(:i, :i, LoopVectorization.OperationStruct(0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, LoopVectorization.loopvalue, 0x00, 0x01), :LoopVectorization, :LOOPCONSTANTINSTRUCTION, LoopVectorization.OperationStruct(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, LoopVectorization.constant, 0x00, 0x02), :LoopVectorization, :-, LoopVectorization.OperationStruct(0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000102, LoopVectorization.compute, 0x00, 0x03), :LoopVectorization, :LOOPCONSTANTINSTRUCTION, LoopVectorization.OperationStruct(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, LoopVectorization.constant, 0x00, 0x04), :LoopVectorization, :+, LoopVectorization.OperationStruct(0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000304, LoopVectorization.compute, 0x00, 0x05), :LoopVectorization, :rem_fast, LoopVectorization.OperationStruct(0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000504, LoopVectorization.compute, 0x00, 0x06), :LoopVectorization, :+, LoopVectorization.OperationStruct(0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000104, LoopVectorization.compute, 0x00, 0x07), :LoopVectorization, :rem_fast, LoopVectorization.OperationStruct(0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000704, LoopVectorization.compute, 0x00, 0x08), :j, :j, LoopVectorization.OperationStruct(0x0000000000000002, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, LoopVectorization.loopvalue, 0x00, 0x09), :LoopVectorization, :LOOPCONSTANTINSTRUCTION, LoopVectorization.OperationStruct(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, LoopVectorization.constant, 0x00, 0x0a), :LoopVectorization, :-, 
LoopVectorization.OperationStruct(0x0000000000000002, 0x0000000000000000, 0x0000000000000000, 0x0000000000000902, LoopVectorization.compute, 0x00, 0x0b), :LoopVectorization, :LOOPCONSTANTINSTRUCTION, LoopVectorization.OperationStruct(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, LoopVectorization.constant, 0x00, 0x0c), :LoopVectorization, :+, LoopVectorization.OperationStruct(0x0000000000000002, 0x0000000000000000, 0x0000000000000000, 0x0000000000000b0c, LoopVectorization.compute, 0x00, 0x0d), :LoopVectorization, :rem_fast, LoopVectorization.OperationStruct(0x0000000000000002, 0x0000000000000000, 0x0000000000000000, 0x0000000000000d0c, LoopVectorization.compute, 0x00, 0x0e), :LoopVectorization, :+, LoopVectorization.OperationStruct(0x0000000000000002, 0x0000000000000000, 0x0000000000000000, 0x000000000000090c, LoopVectorization.compute, 0x00, 0x0f), :LoopVectorization, :rem_fast, LoopVectorization.OperationStruct(0x0000000000000002, 0x0000000000000000, 0x0000000000000000, 0x0000000000000f0c, LoopVectorization.compute, 0x00, 0x10), :LoopVectorization, :getindex, LoopVectorization.OperationStruct(0x0000000000000012, 0x0000000000000000, 0x0000000000000000, 0x000000000000060e, LoopVectorization.memload, 0x01, 0x11), :LoopVectorization, :sub_fast, LoopVectorization.OperationStruct(0x0000000000000012, 0x0000000000000000, 0x0000000000000000, 0x0000000000000011, LoopVectorization.compute, 0x00, 0x12), :LoopVectorization, :getindex, LoopVectorization.OperationStruct(0x0000000000000012, 0x0000000000000000, 0x0000000000000000, 0x000000000000080e, LoopVectorization.memload, 0x02, 0x13), :LoopVectorization, :getindex, LoopVectorization.OperationStruct(0x0000000000000012, 0x0000000000000000, 0x0000000000000000, 0x0000000000000610, LoopVectorization.memload, 0x03, 0x14), :LoopVectorization, :+, LoopVectorization.OperationStruct(0x0000000000000012, 0x0000000000000000, 0x0000000000000000, 0x0000000000001314, LoopVectorization.compute, 0x00, 0x15), 
:LoopVectorization, :LOOPCONSTANTINSTRUCTION, LoopVectorization.OperationStruct(0x0000000000000000, 0x0000000000000000, 0x0000000000000012, 0x0000000000000000, LoopVectorization.constant, 0x00, 0x16), :LoopVectorization, :vfmadd_fast, LoopVectorization.OperationStruct(0x0000000000000012, 0x0000000000000012, 0x0000000000000000, 0x0000000000121516, LoopVectorization.compute, 0x00, 0x16))}, #unused#::Val{(LoopVectorization.ArrayRefStruct{:spin_conf, Symbol("##vptr##_spin_conf")}(0x0000000000000202, 0x000000000000060e, 0x0000000000000000, 0x0000000000000101), LoopVectorization.ArrayRefStruct{:spin_conf, Symbol("##vptr##_spin_conf")}(0x0000000000000202, 0x000000000000080e, 0x0000000000000000, 0x0000000000000101), LoopVectorization.ArrayRefStruct{:spin_conf, Symbol("##vptr##_spin_conf")}(0x0000000000000202, 0x0000000000000610, 0x0000000000000000, 0x0000000000000101))}, #unused#::Val{(0, (23,), (4, 12, 22), ((2, (1, 64, true)),), (), (), ())}, #unused#::Val{(:i, :j)}, tuple#args#::Tuple{Tuple{ArrayInterface.OptionallyStaticUnitRange{Static.StaticInt{1}, Int64}, ArrayInterface.OptionallyStaticUnitRange{Static.StaticInt{1}, Int64}}, Tuple{VectorizationBase.GroupedStridedPointers{Tuple{Ptr{Int64}}, (1,), (0,), ((1, 2),), ((1, 2),), Tuple{Static.StaticInt{8}, Int64}, Tuple{Static.StaticInt{1}, Static.StaticInt{1}}}, Int64, Int64, Int64}})
    @ LoopVectorization ~/.julia/packages/LoopVectorization/O8WW6/src/reconstruct_loopset.jl:630
  [4] energy(spin_conf::Matrix{Int64})
    @ Main ./REPL[4]:4
  [5] top-level scope
    @ REPL[5]:1
  [6] eval_user_input(ast::Any, backend::REPL.REPLBackend)
    @ REPL ~/.julia/sysimgs/std.so:-1
  [7] repl_backend_loop(backend::REPL.REPLBackend)
    @ REPL ~/.julia/sysimgs/std.so:-1
  [8] start_repl_backend(backend::REPL.REPLBackend, consumer::Any)
    @ REPL ~/.julia/sysimgs/std.so:-1
  [9] run_repl(repl::REPL.AbstractREPL, consumer::Any; backend_on_current_task::Bool)
    @ REPL ~/.julia/sysimgs/std.so:-1
 [10] run_repl(repl::REPL.AbstractREPL, consumer::Any)
    @ REPL ~/.julia/sysimgs/std.so:-1
 [11] (::Base.var"#874#876"{Bool, Bool, Bool})(REPL::Module)
    @ Base ~/.julia/sysimgs/std.so:-1
 [12] run_main_repl(interactive::Bool, quiet::Bool, banner::Bool, history_file::Bool, color_set::Bool)
    @ Base ~/.julia/sysimgs/std.so:-1
 [13] exec_options(opts::Base.JLOptions)
    @ Base ~/.julia/sysimgs/std.so:-1
 [14] _start()
    @ Base ~/.julia/sysimgs/std.so:-1
carstenbauer commented 3 years ago

Since I know that you're interested in these things, maybe a bit of background. A C++ friend gave me this code (.txt because GitHub doesn't allow .cpp):

Ising2D.txt

and told me that his Julia code was much slower (>5x) (again .txt because .jl isn't allowed):

2dising.txt

I went ahead and tried to reproduce the timings and (using icc -O3 -I ~/.local/lib/eigen/ Ising2D.cpp -o Ising2D) got this entirely different picture:

➜  ~/Desktop/ising julia 2dising.jl
-0.4428173828125
-0.4118935546875
-0.436130859375
-0.460140625
-0.4394482421875
-0.4422109375
-0.4049609375
  3.088 s (14 allocations: 822.06 KiB)

➜  ~/Desktop/ising time ./Ising2D
-0.423299
./Ising2D  9.22s user 0.01s system 99% cpu 9.234 total

I asked him what he's doing differently and he told me that he is using icpx instead of icc or clang. For him, this makes an interestingly HUGE difference:

cecri@cecri-Mint:~/work/Practices/2dIsing$ clang++-9 -O3 -march=native Ising2D.cpp 
cecri@cecri-Mint:~/work/Practices/2dIsing$ time ./a.out 
-0.434271
real    0m3.337s
user    0m3.330s
sys 0m0.004s

cecri@cecri-Mint:~/work/Practices/2dIsing$ icpx -Ofast -xHost Ising2D.cpp
cecri@cecri-Mint:~/work/Practices/2dIsing$ time ./a.out
-0.406395
real    0m0.667s
user    0m0.663s
sys 0m0.004s

So, while Julia is competitive to clang/icc, perhaps even faster, icpx crushes everyone else! My thought then was, alright I know a package that is known for automagical performance speedups, which is why I tried LoopVectorization.jl after optimising the code a bit (40% speedup on my machine):

using Random
using BenchmarkTools

const rng = MersenneTwister()

"""
    get(spin_conf, x, y)

Read the spin at lattice position `(x, y)` with periodic (wrap-around)
boundary conditions: indices outside `1:Nx` / `1:Ny` wrap to the opposite
edge. Note this shadows `Base.get` when defined at top level.
"""
function get(spin_conf, x, y)
    (Nx, Ny) = size(spin_conf)
    # `mod1` maps any integer onto 1:N (one-based modulus). It is the
    # idiomatic spelling of the original `(x-1+Nx)%Nx + 1` and, unlike that
    # form, is also correct for offsets below `1 - N`.
    # `@inbounds` is safe: mod1 guarantees the indices are in range.
    @inbounds spin_conf[mod1(x, Nx), mod1(y, Ny)]
end

"""
    deltaE(spin_conf, i, j)

Energy change from flipping the spin at `(i, j)`: twice the product of that
spin with the sum of its four wrapped nearest neighbours. Returns a
`Float64` (the `2.0` factor promotes the integer spins).
"""
function deltaE(spin_conf, i, j)
    # Sum of the four nearest neighbours under periodic boundaries.
    neighbors = get(spin_conf, i - 1, j) + get(spin_conf, i + 1, j) +
                get(spin_conf, i, j + 1) + get(spin_conf, i, j - 1)
    return 2.0 * get(spin_conf, i, j) * neighbors
end

"""
    energy(spin_conf)

Total Ising energy of `spin_conf` under periodic boundaries: each site is
coupled to its right and down neighbours (via the wrapping `get`), so every
bond is counted exactly once. The inner loop carries `@simd`.
"""
function energy(spin_conf)
    (Nx, Ny) = size(spin_conf)
    # Integer accumulator; spins are ±1 so the reduction stays in Int.
    total = 0
    for i = 1:Nx
        @simd for j = 1:Ny
            total -= get(spin_conf, i, j) * (get(spin_conf, i + 1, j) + get(spin_conf, i, j + 1))
        end
    end
    return total
end

"""
    magnetism(spin_conf)

Net magnetisation of the configuration: the sum of all (±1) spins.
"""
function magnetism(spin_conf)
    return sum(spin_conf)
end

"""
    sweep!(rec, spin_conf, beta, M)

Run `M` Metropolis single-spin-flip attempts on `spin_conf` at inverse
temperature `beta`, recording the total energy after each attempt into
`rec[i]`. Mutates both `rec` and `spin_conf`; returns `rec`.

NOTE(review): `@simd` asserts the iterations are independent, but each
iteration reads `spin_conf` as mutated by earlier flips and draws from the
shared `rng` — the independence claim does not hold here; confirm intended.

NOTE(review): calling `energy(spin_conf)` (a full O(Nx*Ny) scan) inside the
loop makes the sweep O(M*Nx*Ny); updating the energy incrementally from `dE`
would be far cheaper.
"""
function sweep!(rec, spin_conf, beta, M)
    (Nx, Ny) = size(spin_conf)
    @inbounds @simd for i = 1:M
        # Pick a random site and a uniform variate for the accept test.
        x = rand(rng, 1:Nx)
        y = rand(rng, 1:Ny)
        r = rand(rng)
        dE = deltaE(spin_conf, x, y)
        # Metropolis acceptance: always accept dE <= 0, else with prob exp(-beta*dE).
        if exp(-beta*dE) > r
            spin_conf[x,y] *= -1
        end
        rec[i] = energy(spin_conf)
    end
    return rec
end

"""
    main()

Benchmark driver: build a random 64x64 ±1 spin lattice, run 100_000
Metropolis steps via `sweep!`, then print the mean energy per spin over the
final 1000 recorded samples. Returns `nothing`.
"""
function main()
    beta = 0.2
    L = 64
    total_iteration = 100_000

    # Random ±1 configuration plus a buffer for the per-step energies.
    spins = rand(rng, (-1, 1), L, L)
    record = Vector{Int64}(undef, total_iteration)

    sweep!(record, spins, beta, total_iteration)

    # Average over the last 1000 samples, then normalise per lattice site.
    avg = sum(last(record, 1000)) / 1000
    avg /= L^2
    println(avg)
    return nothing
end

@btime main()

This is the background for this issue and any help in getting the magical performance of icpx in Julia would be appreciated :)

(Update: g++ can also give the "magic" when run with the -fwhole-program compiler flag)

carstenbauer commented 3 years ago

I realised that when I use the somewhat outdated @unroll macro from https://github.com/StephenVavasis/Unroll.jl/blob/master/src/Unroll.jl and hardcode the loop length (i.e. replace Ny by 64) like so

"""
    energy(spin_conf)

`energy` variant using `@unroll` from the (unmaintained) Unroll.jl package,
which requires a literal trip count — hence the hardcoded `1:64`.

NOTE(review): the hardcoded 64 must equal `Ny`; for any other lattice width
this silently computes the wrong energy (or errors), since `Ny` from `size`
is ignored by the inner loop.
"""
function energy(spin_conf)
    (Nx, Ny) = size(spin_conf)
    res = 0
    for i = 1:Nx
        @unroll for j = 1:64
            res += -get(spin_conf,i,j)*(get(spin_conf,i+1,j) + get(spin_conf,i,j+1))
        end
    end
    return res
end

I get similar "magical" performance in Julia:

julia> @btime main();
  459.902 ms (5 allocations: 821.34 KiB)

While this is great, I had to use a macro from an unmaintained package and hardcode the iterator length. LoopVectorization would hopefully make this experience nicer. :)

C++ code compiled with g++ and -fwhole-program for comparison:

➜  ~/Desktop/ising cat compile_gcc.sh
g++-10 -O3 -march=native -funroll-loops -fwhole-program -I ~/.local/lib/eigen/ Ising2D.cpp -o Ising2D

➜  ~/Desktop/ising time ./Ising2D
-0.370169
./Ising2D  0.43s user 0.00s system 99% cpu 0.430 total

(Update: Unfortunately, my friend told me that if he does the same and puts a #pragma GCC unroll 64 in front of the loop he gets

cecri@cecri-Mint:~/work/Practices/2dIsing$ g++ -std=gnu++17 -O3 -march=native -ffast-math -fwhole-program Ising2D.cpp 
cecri@cecri-Mint:~/work/Practices/2dIsing$ time ./a.out 
-0.410533
real    0m0.179s
user    0m0.176s
sys 0m0.004s

So still more room for optimisation on the Julia side....)

chriselrod commented 3 years ago

Regarding the vrem_fast, that's a VectorizationBase bug.

julia> vxi = Vec(ntuple(Int,VectorizationBase.pick_vector_width(Int))...)
Vec{8, Int64}<1, 2, 3, 4, 5, 6, 7, 8>

julia> vxi2 = Vec(ntuple(_ -> rand(Int),VectorizationBase.pick_vector_width(Int))...)
Vec{8, Int64}<3175761247964132865, -7335689000808951881, 577807031437413038, -6325572139043877891, -55170765264911246, -6016006685313364129, -5181912736038640374, 6721328960625882679>

julia> vxi2 % vxi
Vec{8, Int64}<0, -1, 2, -3, -1, -1, -3, 7>

julia> @fastmath vxi2 % vxi
Vec{8, Int64}<0, -1, 2, -3, -1, -1, -3, 7>

julia> VectorizationBase.vrem_fast(vxi2, vxi)
ERROR: MethodError: no method matching vrem_fast(::Vec{8, Int64}, ::Vec{8, Int64})
Stacktrace:
 [1] top-level scope
   @ REPL[8]:1

julia> VectorizationBase.vrem(vxi2, vxi)
Vec{8, Int64}<0, -1, 2, -3, -1, -1, -3, 7>

Something's going wrong in the dispatch pipeline, but an obvious fix is to just define the fallback method in VectorizationBase:

@inline vrem_fast(a,b) = vrem(a,b)

The indexing issue would be a little trickier, but I should define getindex and setindex! methods for some AbstractArray types (those that support stridedpointer).

I'll try and take a look at everything else later. Definitely a fan of winning benchmarks. =)

chriselrod commented 3 years ago

However, I don't think LoopVectorization will do the right thing here, and it may be that gcc is more clever.

"""
    energy(spin_conf)

Manually-inlined `@avx` energy kernel under discussion: the `%Nx` / `%Ny`
remainders in the loop body are what `@avx` cannot currently optimise away.

NOTE(review): compared with the `get` helper's `(x-1+Nx)%Nx + 1`, these
indices are missing the trailing `+ 1`, so they are 0-based (for `i == 1`,
`i0 = (0 + Nx) % Nx == 0`) and `spin_conf[i0, j0]` is out of bounds; a
faithful inlining would index `spin_conf[i0+1, j0+1]` etc.
"""
function energy(spin_conf)
    (Nx, Ny) = size(spin_conf)
    res = 0
    @avx for i = 1:Nx
        for j = 1:Ny
            # Wrapped neighbour indices (off by one — see NOTE above).
            i0 = (i-1+Nx)%Nx
            i1 = (i+Nx)%Nx
            j0 = (j-1+Ny)%Ny
            j1 = (j+Ny)%Ny
            res += -spin_conf[i0,j0]*(spin_conf[i1,j0] + spin_conf[i0,j1])
        end
    end
    return res
end

Basically, you need to get rid of the %Nx and %Ny by breaking up the iteration space. I'm guessing this is what gcc and icpx do, but is not an optimization I've implemented in LoopVectorization yet.

The splitting should be really straightforward to do manually.

carstenbauer commented 3 years ago

Thanks for the hints. Perhaps a better formulation of the benchmark challenge 😄 : https://discourse.julialang.org/t/performance-optimisation-julia-vs-c/59689

DNF2 commented 3 years ago

I got into a somewhat similar situation with x % UInt8 when x is an Int8. I got this error message:

ERROR: MethodError: no method matching vrem_fast(::VectorizationBase.Vec{32, Int8}, ::Type{UInt8})
Stacktrace:
 [1] rem_fast(v::VectorizationBase.Vec{32, Int8}, #unused#::Type{UInt8})

I guess this is a missing method, but when I extended vrem_fast as you suggested upthread, @chriselrod , I got an ERROR: InexactError: trunc(UInt8, 256) instead. Without @avx it works fine.