CliMA / ClimaCore.jl

CliMA model dycore
https://clima.github.io/ClimaCore.jl/dev
Apache License 2.0
85 stars 8 forks source link

`copyto!(::Field, ::Field)` performance is 2x slower than parent array #1403

Open charleskawczynski opened 1 year ago

charleskawczynski commented 1 year ago

MWE:

using Revise
import ClimaCore;
import ClimaCore.Fields as Fields
@isdefined(TU) || include(joinpath(pkgdir(ClimaCore), "test", "TestUtilities", "TestUtilities.jl"));
import .TestUtilities as TU;
FT = Float64;
space = TU.CenterExtrudedFiniteDifferenceSpace(FT;zelem=100, helem=25); # requires adding helem to kwarg.
Base.@kwdef struct BigStruct{T}
    x1::T = 0
    x2::T = 0
end;
function copy_arr!(x, y)
    Base.copyto!(x, y)
    return nothing
end;
function copy_arr_parent!(x, y)
    Base.copyto!(parent(x), parent(y))
    return nothing
end;
using BenchmarkTools;
x = fill((;s=BigStruct{FT}()), space);
y = fill((;s=BigStruct{FT}()), space);
trial = BenchmarkTools.@benchmark copy_arr!($x, $y);
show(stdout, MIME("text/plain"), trial);
trial = BenchmarkTools.@benchmark copy_arr_parent!($x, $y);
show(stdout, MIME("text/plain"), trial);
nothing
charleskawczynski commented 1 year ago

Same is true for Broadcasted paths:

using Revise
import ClimaCore;
import ClimaCore.Fields as Fields
using BenchmarkTools;
@isdefined(TU) || include(joinpath(pkgdir(ClimaCore), "test", "TestUtilities", "TestUtilities.jl"));
import .TestUtilities as TU;
FT = Float64;
space = TU.CenterExtrudedFiniteDifferenceSpace(FT;zelem=25, helem=10);
Base.@kwdef struct BigStruct{T}
    x1::T = 0
    x2::T = 0
end;
x = fill((;s=BigStruct{FT}()), space);
y = fill((;s=BigStruct{FT}()), space);
function bc_arr!(x, y)
    Base.copyto!(x, Base.Broadcast.broadcasted(identity, y))
    return nothing
end;
function copy_arr_parent!(x, y)
    Base.copyto!(parent(x), parent(y))
    return nothing
end;
trial = BenchmarkTools.@benchmark bc_arr!($x, $y);
show(stdout, MIME("text/plain"), trial);
trial = BenchmarkTools.@benchmark copy_arr_parent!($x, $y);
show(stdout, MIME("text/plain"), trial);
nothing
charleskawczynski commented 1 year ago

local notes:

charleskawczynski commented 1 year ago

This is related to cartesian indexing:

using Revise
import ClimaCore;
import ClimaCore.Fields as Fields
import ClimaCore.DataLayouts as DataLayouts
using BenchmarkTools;
@isdefined(TU) || include(joinpath(pkgdir(ClimaCore), "test", "TestUtilities", "TestUtilities.jl"));
import .TestUtilities as TU;
FT = Float64;
space = TU.CenterExtrudedFiniteDifferenceSpace(FT;zelem=25, helem=10);
Base.@kwdef struct BigStruct{T}
    x1::T = 0
    x2::T = 0
end;
x = fill((;s=BigStruct{FT}()), space);
y = fill((;s=BigStruct{FT}()), space);
function bc_arr!(x, y)
    Base.copyto!(x, Base.Broadcast.broadcasted(identity, y))
    return nothing
end;
function copy_arr_parent!(x, y)
    Base.copyto!(parent(x), parent(y))
    return nothing
end;
function cart_copyto!(x, y)
    xp = parent(x)
    yp = parent(y)
    (Nij, _, _, Nv, Nh) = size(xp)
    @inbounds for h in 1:Nh, v in 1:Nv, j in 1:Nij, i in 1:Nij
        idx = CartesianIndex((i, j, 1, v, h))
        xp[idx] = yp[idx]
    end
    return nothing
end;
function linear_index_copyto!(x, y)
    xp = parent(x)
    yp = parent(y)
    @inbounds for i in eachindex(xp, yp)
        xp[i] = yp[i]
    end
    return nothing
end;
function reshape_loop!(x, y)
    xp = parent(x)
    yp = parent(y)
    xr = reshape(xp, length(xp))
    yr = reshape(yp, length(yp))
    @inbounds for idx in 1:length(xr)
        xp[idx] = yp[idx]
    end
    return nothing
end;
trial = BenchmarkTools.@benchmark bc_arr!($x, $y);
show(stdout, MIME("text/plain"), trial);
trial = BenchmarkTools.@benchmark copy_arr_parent!($x, $y);
show(stdout, MIME("text/plain"), trial);
trial = BenchmarkTools.@benchmark cart_copyto!($x, $y);
show(stdout, MIME("text/plain"), trial);
trial = BenchmarkTools.@benchmark linear_index_copyto!($x, $y);
show(stdout, MIME("text/plain"), trial);
trial = BenchmarkTools.@benchmark reshape_loop!($x, $y);
show(stdout, MIME("text/plain"), trial);
nothing
charleskawczynski commented 1 year ago

More notes:

using Revise
import ClimaCore;
import ClimaCore.Fields as Fields
import ClimaCore.DataLayouts as DataLayouts
using BenchmarkTools;
@isdefined(TU) || include(joinpath(pkgdir(ClimaCore), "test", "TestUtilities", "TestUtilities.jl"));
import .TestUtilities as TU;
FT = Float64;
has_fast_indexing(::SubArray{T,N,P,I,L}) where {T,N,P,I,L} = L
space = TU.CenterExtrudedFiniteDifferenceSpace(FT;zelem=25, helem=10);
Base.@kwdef struct BigStruct{T}
    x1::T = 0
    x2::T = 0
    x3::T = 0
    x4::T = 0
    x5::T = 0
    x6::T = 0
    x7::T = 0
    x8::T = 0
    x9::T = 0
    x10::T = 0
    x11::T = 0
    x12::T = 0
    x13::T = 0
    x14::T = 0
    x15::T = 0
end;
x = fill((;s=BigStruct{FT}()), space);
y = fill((;s=BigStruct{FT}()), space);

function oranges(a, i)
    pa = parent(a)
    return ( # Specific to VIJFH
        Base.Slice(Base.OneTo(size(pa, 1))),
        Base.Slice(Base.OneTo(size(pa, 2))),
        Base.Slice(Base.OneTo(size(pa, 3))),
        i,
        Base.Slice(Base.OneTo(size(pa, 5))),
    );
end

px = parent(x);
py = parent(y);
perm = (
    1, # Nv
    2, # Nij
    3, # Nij
    5, # Nh
    4, # fields
)
fi = findfirst(j->j==4, perm)
pda(a, perm) = permutedims(parent(a), perm)
vpda(a, perm, i) = view(pda(a, perm), ntuple(j->oranges(a, i)[perm[j]], Val(5))...)
pdx = pda(x, perm)
pdy = pda(y, perm)
# @show has_fast_indexing(vpda(x, perm, 10))
# error("Done")

function bc_arr_unfused!(x, y) # Unfused Cartesian
    Base.copyto!(x.s.x1, Base.Broadcast.broadcasted(identity, y.s.x1))
    Base.copyto!(x.s.x2, Base.Broadcast.broadcasted(identity, y.s.x2))
    Base.copyto!(x.s.x3, Base.Broadcast.broadcasted(identity, y.s.x3))
    Base.copyto!(x.s.x4, Base.Broadcast.broadcasted(identity, y.s.x4))
    Base.copyto!(x.s.x5, Base.Broadcast.broadcasted(identity, y.s.x5))
    Base.copyto!(x.s.x6, Base.Broadcast.broadcasted(identity, y.s.x6))
    Base.copyto!(x.s.x7, Base.Broadcast.broadcasted(identity, y.s.x7))
    Base.copyto!(x.s.x8, Base.Broadcast.broadcasted(identity, y.s.x8))
    Base.copyto!(x.s.x9, Base.Broadcast.broadcasted(identity, y.s.x9))
    Base.copyto!(x.s.x10, Base.Broadcast.broadcasted(identity, y.s.x10))
    Base.copyto!(x.s.x11, Base.Broadcast.broadcasted(identity, y.s.x11))
    Base.copyto!(x.s.x12, Base.Broadcast.broadcasted(identity, y.s.x12))
    Base.copyto!(x.s.x13, Base.Broadcast.broadcasted(identity, y.s.x13))
    Base.copyto!(x.s.x14, Base.Broadcast.broadcasted(identity, y.s.x14))
    Base.copyto!(x.s.x15, Base.Broadcast.broadcasted(identity, y.s.x15))
    return nothing
end;
function bc_arr_fused!(x, y) # Fused Cartesian
    Base.copyto!(x, Base.Broadcast.broadcasted(identity, y))
    return nothing
end;
reshaped_sa(a) = reshape(parent(a), length(parent(a)))
function copy_arr_parent_unfused_reshaped_sa!(x, y) # Unfused reshaped subarray (linear, but slow)
    Base.copyto!(reshaped_sa(x.s.x1), reshaped_sa(y.s.x1))
    Base.copyto!(reshaped_sa(x.s.x2), reshaped_sa(y.s.x2))
    Base.copyto!(reshaped_sa(x.s.x3), reshaped_sa(y.s.x3))
    Base.copyto!(reshaped_sa(x.s.x4), reshaped_sa(y.s.x4))
    Base.copyto!(reshaped_sa(x.s.x5), reshaped_sa(y.s.x5))
    Base.copyto!(reshaped_sa(x.s.x6), reshaped_sa(y.s.x6))
    Base.copyto!(reshaped_sa(x.s.x7), reshaped_sa(y.s.x7))
    Base.copyto!(reshaped_sa(x.s.x8), reshaped_sa(y.s.x8))
    Base.copyto!(reshaped_sa(x.s.x9), reshaped_sa(y.s.x9))
    Base.copyto!(reshaped_sa(x.s.x10), reshaped_sa(y.s.x10))
    Base.copyto!(reshaped_sa(x.s.x11), reshaped_sa(y.s.x11))
    Base.copyto!(reshaped_sa(x.s.x12), reshaped_sa(y.s.x12))
    Base.copyto!(reshaped_sa(x.s.x13), reshaped_sa(y.s.x13))
    Base.copyto!(reshaped_sa(x.s.x14), reshaped_sa(y.s.x14))
    Base.copyto!(reshaped_sa(x.s.x15), reshaped_sa(y.s.x15))
    return nothing
end;

function copy_arr_parent_unfused_sa!(x, y) # Unfused SubArray
    Base.copyto!(parent(x.s.x1), parent(y.s.x1))
    Base.copyto!(parent(x.s.x2), parent(y.s.x2))
    Base.copyto!(parent(x.s.x3), parent(y.s.x3))
    Base.copyto!(parent(x.s.x4), parent(y.s.x4))
    Base.copyto!(parent(x.s.x5), parent(y.s.x5))
    Base.copyto!(parent(x.s.x6), parent(y.s.x6))
    Base.copyto!(parent(x.s.x7), parent(y.s.x7))
    Base.copyto!(parent(x.s.x8), parent(y.s.x8))
    Base.copyto!(parent(x.s.x9), parent(y.s.x9))
    Base.copyto!(parent(x.s.x10), parent(y.s.x10))
    Base.copyto!(parent(x.s.x11), parent(y.s.x11))
    Base.copyto!(parent(x.s.x12), parent(y.s.x12))
    Base.copyto!(parent(x.s.x13), parent(y.s.x13))
    Base.copyto!(parent(x.s.x14), parent(y.s.x14))
    Base.copyto!(parent(x.s.x15), parent(y.s.x15))
    return nothing
end;
function sa_permuted_dims(a, i, perm, pda)
    o = oranges(a, i)
    return view(pda, ntuple(i->o[perm[i]], Val(5))...)
end
function reshaped_sa_permuted_dims(a, i, perm, pda)
    pd = sa_permuted_dims(a, i, perm, pda)
    reshape(pd, length(pd))
end
function copy_arr_parent_unfused_sa_permuted_dims!(x, y, perm, pdx, pdy) # Unfused linear
    # subarray of reshaped
    Base.copyto!(sa_permuted_dims(x, 1, perm, pdx), sa_permuted_dims(y, 1, perm, pdy))
    Base.copyto!(sa_permuted_dims(x, 2, perm, pdx), sa_permuted_dims(y, 2, perm, pdy))
    Base.copyto!(sa_permuted_dims(x, 3, perm, pdx), sa_permuted_dims(y, 3, perm, pdy))
    Base.copyto!(sa_permuted_dims(x, 4, perm, pdx), sa_permuted_dims(y, 4, perm, pdy))
    Base.copyto!(sa_permuted_dims(x, 5, perm, pdx), sa_permuted_dims(y, 5, perm, pdy))
    Base.copyto!(sa_permuted_dims(x, 6, perm, pdx), sa_permuted_dims(y, 6, perm, pdy))
    Base.copyto!(sa_permuted_dims(x, 7, perm, pdx), sa_permuted_dims(y, 7, perm, pdy))
    Base.copyto!(sa_permuted_dims(x, 8, perm, pdx), sa_permuted_dims(y, 8, perm, pdy))
    Base.copyto!(sa_permuted_dims(x, 9, perm, pdx), sa_permuted_dims(y, 9, perm, pdy))
    Base.copyto!(sa_permuted_dims(x, 10, perm, pdx), sa_permuted_dims(y, 10, perm, pdy))
    Base.copyto!(sa_permuted_dims(x, 11, perm, pdx), sa_permuted_dims(y, 11, perm, pdy))
    Base.copyto!(sa_permuted_dims(x, 12, perm, pdx), sa_permuted_dims(y, 12, perm, pdy))
    Base.copyto!(sa_permuted_dims(x, 13, perm, pdx), sa_permuted_dims(y, 13, perm, pdy))
    Base.copyto!(sa_permuted_dims(x, 14, perm, pdx), sa_permuted_dims(y, 14, perm, pdy))
    Base.copyto!(sa_permuted_dims(x, 15, perm, pdx), sa_permuted_dims(y, 15, perm, pdy))
    return nothing
end;
function copy_arr_parent_unfused_reshaped_sa_permuted_dims!(x, y, perm, pdx, pdy) # Unfused linear
    # subarray of reshaped
    Base.copyto!(reshaped_sa_permuted_dims(x, 1, perm, pdx), reshaped_sa_permuted_dims(y, 1, perm, pdy))
    Base.copyto!(reshaped_sa_permuted_dims(x, 2, perm, pdx), reshaped_sa_permuted_dims(y, 2, perm, pdy))
    Base.copyto!(reshaped_sa_permuted_dims(x, 3, perm, pdx), reshaped_sa_permuted_dims(y, 3, perm, pdy))
    Base.copyto!(reshaped_sa_permuted_dims(x, 4, perm, pdx), reshaped_sa_permuted_dims(y, 4, perm, pdy))
    Base.copyto!(reshaped_sa_permuted_dims(x, 5, perm, pdx), reshaped_sa_permuted_dims(y, 5, perm, pdy))
    Base.copyto!(reshaped_sa_permuted_dims(x, 6, perm, pdx), reshaped_sa_permuted_dims(y, 6, perm, pdy))
    Base.copyto!(reshaped_sa_permuted_dims(x, 7, perm, pdx), reshaped_sa_permuted_dims(y, 7, perm, pdy))
    Base.copyto!(reshaped_sa_permuted_dims(x, 8, perm, pdx), reshaped_sa_permuted_dims(y, 8, perm, pdy))
    Base.copyto!(reshaped_sa_permuted_dims(x, 9, perm, pdx), reshaped_sa_permuted_dims(y, 9, perm, pdy))
    Base.copyto!(reshaped_sa_permuted_dims(x, 10, perm, pdx), reshaped_sa_permuted_dims(y, 10, perm, pdy))
    Base.copyto!(reshaped_sa_permuted_dims(x, 11, perm, pdx), reshaped_sa_permuted_dims(y, 11, perm, pdy))
    Base.copyto!(reshaped_sa_permuted_dims(x, 12, perm, pdx), reshaped_sa_permuted_dims(y, 12, perm, pdy))
    Base.copyto!(reshaped_sa_permuted_dims(x, 13, perm, pdx), reshaped_sa_permuted_dims(y, 13, perm, pdy))
    Base.copyto!(reshaped_sa_permuted_dims(x, 14, perm, pdx), reshaped_sa_permuted_dims(y, 14, perm, pdy))
    Base.copyto!(reshaped_sa_permuted_dims(x, 15, perm, pdx), reshaped_sa_permuted_dims(y, 15, perm, pdy))
    return nothing
end;
reshaped(a) = reshape(parent(a), length(parent(a)))
function copy_arr_parent_fused!(x, y) # Fused linear
    Base.copyto!(reshaped(x), reshaped(y))
    return nothing
end;

bc_arr_unfused!(x, y)
bc_arr_fused!(x, y)
copy_arr_parent_unfused_reshaped_sa!(x, y)
copy_arr_parent_unfused_sa!(x, y)
copy_arr_parent_unfused_sa_permuted_dims!(x, y, perm, pdx, pdy)
copy_arr_parent_unfused_reshaped_sa_permuted_dims!(x, y, perm, pdx, pdy)
copy_arr_parent_fused!(x, y)

# println("\n******************* bc_arr_unfused")
# trial = BenchmarkTools.@benchmark bc_arr_unfused!($x, $y);
# show(stdout, MIME("text/plain"), trial);
# println("\n******************* bc_arr_fused")
# trial = BenchmarkTools.@benchmark bc_arr_fused!($x, $y);
# show(stdout, MIME("text/plain"), trial);
println("\n******************* copy_arr_parent_unfused_reshaped_sa")
trial = BenchmarkTools.@benchmark copy_arr_parent_unfused_reshaped_sa!($x, $y);
show(stdout, MIME("text/plain"), trial);
println("\n******************* copy_arr_parent_unfused_sa")
trial = BenchmarkTools.@benchmark copy_arr_parent_unfused_sa!($x, $y);
show(stdout, MIME("text/plain"), trial);
println("\n******************* copy_arr_parent_unfused_sa_permuted_dims")
trial = BenchmarkTools.@benchmark copy_arr_parent_unfused_sa_permuted_dims!($x, $y, $perm, $pdx, $pdy);
show(stdout, MIME("text/plain"), trial);
println("\n******************* copy_arr_parent_unfused_reshaped_sa_permuted_dims")
trial = BenchmarkTools.@benchmark copy_arr_parent_unfused_reshaped_sa_permuted_dims!($x, $y, $perm, $pdx, $pdy);
show(stdout, MIME("text/plain"), trial);
println("\n******************* copy_arr_parent_fused")
trial = BenchmarkTools.@benchmark copy_arr_parent_fused!($x, $y);
show(stdout, MIME("text/plain"), trial);
# nothing
#=
@code_typed bc_arr!(x, y)
copy_arr_parent!(x, y)

@code_typed DataLayouts._serial_copyto!(Fields.field_values(x), Base.Broadcast.broadcasted(identity, Fields.field_values(y)))
@code_typed Base.copyto!(parent(x), parent(y))
=#
charleskawczynski commented 1 year ago

For printing permutations:

using Revise
import ClimaCore;
import ClimaCore.Fields as Fields
import ClimaCore.DataLayouts as DataLayouts
using BenchmarkTools;
@isdefined(TU) || include(joinpath(pkgdir(ClimaCore), "test", "TestUtilities", "TestUtilities.jl"));
import .TestUtilities as TU;
FT = Float64;
space = TU.CenterExtrudedFiniteDifferenceSpace(FT;zelem=25, helem=10);
Base.@kwdef struct BigStruct{T}
    x1::T = 0
    x2::T = 0
end;
x = fill((;s=BigStruct{FT}()), space);
y = fill((;s=BigStruct{FT}()), space);

function oranges(a, i)
    pa = parent(a)
    return ( # Specific to VIJFH
        Base.Slice(Base.OneTo(size(pa, 1))),
        Base.Slice(Base.OneTo(size(pa, 2))),
        Base.Slice(Base.OneTo(size(pa, 3))),
        i,
        Base.Slice(Base.OneTo(size(pa, 5))),
    );
end

px = parent(x);
py = parent(y);

has_fast_indexing(::SubArray{T,N,P,I,L}) where {T,N,P,I,L} = L

pda(a, perm) = permutedims(parent(a), perm);
vpda(a, perm, i) = view(pda(a, perm), ntuple(j->oranges(a, i)[perm[j]], Val(5))...);

##########################################################
##########################################################
##########################################################
# Move dimensions around to see which order supports fast indexing
# perm = (
#     1, # Nv
#     2, # Nij
#     3, # Nij
#     5, # Nh
#     4, # fields
# )
perm = (1,2,3,4,5); println(has_fast_indexing(vpda(x, perm, 10 #=field index 10=#))); # false (what we use)
perm = (1,2,3,5,4); println(has_fast_indexing(vpda(x, perm, 10 #=field index 10=#))); # true (swap field and Nh dims)
perm = (4,1,2,3,5); println(has_fast_indexing(vpda(x, perm, 10 #=field index 10=#))); # true (put field first)
# All other permutations are false
##########################################################
##########################################################
##########################################################

This (has_fast_indexing = true) doesn't necessarily mean that a particular order is fast (benchmarks are needed).