eGPU runs slower than without only when using multiprocessing

ChrisRackauckas commented 4 years ago

using DiffEqGPU, CuArrays, OrdinaryDiffEq, Test
CuArrays.device!(0)

using Distributed
addprocs(2)
@everywhere using DiffEqGPU, CuArrays, OrdinaryDiffEq, Test, Random

@everywhere begin
    # In-place right-hand side of the Lorenz system.
    # Writes the three derivative components into `du` from the state `u` and the
    # parameters `p` (indexed 1..3); `t` is accepted but not used. Returns `nothing`.
    function lorenz_distributed(du, u, p, t)
        @inbounds du[1] = p[1] * (u[2] - u[1])
        @inbounds du[2] = u[1] * (p[2] - u[3]) - u[2]
        @inbounds du[3] = u[1] * u[2] - p[3] * u[3]
        return nothing
    end
    # Disallow scalar indexing of GPU arrays on this process.
    CuArrays.allowscalar(false)

    # Base problem data: initial state, time span and nominal parameters.
    u0 = Float32[1.0, 0.0, 0.0]
    tspan = (0.0f0, 100.0f0)
    p = (10.0f0, 28.0f0, 8 / 3.0f0)

    # Seed the RNG before drawing so every process that runs this block
    # builds its scaling factors from the same seed.
    Random.seed!(1)
    pre_p_distributed = [rand(Float32, 3) for _ in 1:100_000]
    # Build the problem for trajectory `i`: the nominal parameters `p` scaled
    # element-wise by the pre-drawn factors `pre_p_distributed[i]`.
    # `repeat` is accepted but not used.
    function prob_func_distributed(prob, i, repeat)
        scaled_p = pre_p_distributed[i] .* p
        return remake(prob; p = scaled_p)
    end
end

# Select device 0 on worker 2 and device 1 on worker 3; `@sync` blocks until
# both remote calls have completed.
@sync for (worker, device) in ((2, 0), (3, 1))
    @spawnat worker CuArrays.device!(device)
end

CuArrays.allowscalar(false)
prob = ODEProblem(lorenz_distributed, u0, tspan, p)
monteprob = EnsembleProblem(prob; prob_func = prob_func_distributed)

# Timed ensemble solve with the GPU-array backend.
@time sol = solve(
    monteprob, Tsit5(), EnsembleGPUArray();
    trajectories = 100_000, batch_size = 50_000, saveat = 1.0f0,
)
# Timed ensemble solve with the threaded backend, for comparison.
@time solve(
    monteprob, Tsit5(), EnsembleThreads();
    trajectories = 100_000, saveat = 1.0f0,
)

@vchuravy

ChrisRackauckas commented 4 years ago

using OrdinaryDiffEq, DiffEqGPU, CuArrays, LinearAlgebra
CuArrays.device!(0)
"""
    lorenz(du, u, p, t)

In-place right-hand side of the Lorenz system. Writes the three derivative
components into `du` from the state `u` and the parameters `p` (indexed 1..3).
`t` is accepted but not used. Returns `nothing`.
"""
function lorenz(du, u, p, t)
    @inbounds du[1] = p[1] * (u[2] - u[1])
    @inbounds du[2] = u[1] * (p[2] - u[3]) - u[2]
    @inbounds du[3] = u[1] * u[2] - p[3] * u[3]
    return nothing
end

# Base problem data: initial state, time span and nominal parameters.
u0 = Float32[1.0, 0.0, 0.0]
tspan = (0.0f0, 100.0f0)
p = (10.0f0, 28.0f0, 8 / 3.0f0)
prob = ODEProblem(lorenz, u0, tspan, p)

# Each trajectory scales the nominal parameters by fresh random factors.
prob_func = function (prob, i, repeat)
    return remake(prob; p = rand(Float32, 3) .* p)
end
monteprob = EnsembleProblem(prob; prob_func = prob_func)

# Timed ensemble solve with the GPU-array backend.
@time sol = solve(
    monteprob, Tsit5(), EnsembleGPUArray();
    trajectories = 100_000, batch_size = 50_000, saveat = 1.0f0,
)

takes 2.5 seconds,

using OrdinaryDiffEq, DiffEqGPU, CuArrays, LinearAlgebra
CuArrays.device!(1)
"""
    lorenz(du, u, p, t)

In-place right-hand side of the Lorenz system. Writes the three derivative
components into `du` from the state `u` and the parameters `p` (indexed 1..3).
`t` is accepted but not used. Returns `nothing`.
"""
function lorenz(du, u, p, t)
    @inbounds du[1] = p[1] * (u[2] - u[1])
    @inbounds du[2] = u[1] * (p[2] - u[3]) - u[2]
    @inbounds du[3] = u[1] * u[2] - p[3] * u[3]
    return nothing
end

# Base problem data: initial state, time span and nominal parameters.
u0 = Float32[1.0, 0.0, 0.0]
tspan = (0.0f0, 100.0f0)
p = (10.0f0, 28.0f0, 8 / 3.0f0)
prob = ODEProblem(lorenz, u0, tspan, p)

# Each trajectory scales the nominal parameters by fresh random factors.
prob_func = function (prob, i, repeat)
    return remake(prob; p = rand(Float32, 3) .* p)
end
monteprob = EnsembleProblem(prob; prob_func = prob_func)

# Timed ensemble solve with the GPU-array backend.
@time sol = solve(
    monteprob, Tsit5(), EnsembleGPUArray();
    trajectories = 100_000, batch_size = 50_000, saveat = 1.0f0,
)

takes 3 seconds, but the multi-GPU version takes 94 seconds on master! While trying to figure out why, we added

https://github.com/JuliaDiffEq/DiffEqGPU.jl/commit/748b7d18a7c91cc8c7bdfbbc8c39f75684262cea

which was sufficient to bring the time down to 1.5 seconds, making multi-GPU the fastest option. However, this is clearly not the ideal strategy.

ChrisRackauckas commented 4 years ago

Fixed with `yield()`: https://github.com/JuliaDiffEq/DiffEqGPU.jl/commit/90d10d0adbc3dcb174b600495f2d760cb67cd2c2

ChrisRackauckas commented 4 years ago

Turns out that only happened when the return was broken...

SciML / DiffEqGPU.jl

eGPU runs slower than without only when using multiprocessing #48