[GitHub issue — status: Open] ChrisRackauckas opened this issue 2 years ago.
Full attempted tutorial code for future reference:
using OrdinaryDiffEq, LinearAlgebra, SparseArrays, BenchmarkTools, LinearSolve
# Grid resolution and the shared 1D coordinate range used for both spatial axes.
const N = 32
const xyd_brusselator = range(0,stop=1,length=N)
# Time-gated localized forcing: contributes 5.0 inside the disk of radius 0.1
# centered at (0.3, 0.6) once t >= 1.1, and 0.0 elsewhere or earlier.
function brusselator_f(x, y, t)
    inside_disk = (x - 0.3)^2 + (y - 0.6)^2 <= 0.1^2
    gate_open = t >= 1.1
    return (inside_disk & gate_open) * 5.
end
# Periodic wrap of index `a` onto 1:N: N+1 wraps to 1, 0 wraps to N,
# in-range indices pass through unchanged.
function limit(a, N)
    if a == N + 1
        return 1
    elseif a == 0
        return N
    else
        return a
    end
end
# In-place update of the u (activator) component at grid point I. Intended to
# be broadcast over CartesianIndices((N, N)); `II` maps (i, j, species) to a
# linear index into the flat state. Periodic boundaries come from `limit`.
# Fix: dropped the unused `dx` let-capture (the kernel never reads it).
kernel_u! = let N=N, xyd=xyd_brusselator
@inline function (du, u, A, B, α, II, I, t)
    i, j = Tuple(I)
    # Physical coordinates, needed only for the localized forcing term.
    x = xyd[I[1]]
    y = xyd[I[2]]
    # Periodic neighbor indices.
    ip1 = limit(i+1, N); im1 = limit(i-1, N)
    jp1 = limit(j+1, N); jm1 = limit(j-1, N)
    # 5-point Laplacian (scaled by α) + Brusselator reaction + forcing.
    du[II[i,j,1]] = α*(u[II[im1,j,1]] + u[II[ip1,j,1]] + u[II[i,jp1,1]] + u[II[i,jm1,1]] - 4u[II[i,j,1]]) +
                    B + u[II[i,j,1]]^2*u[II[i,j,2]] - (A + 1)*u[II[i,j,1]] + brusselator_f(x, y, t)
end
end
# In-place update of the v (inhibitor) component at grid point I. Broadcast
# over CartesianIndices((N, N)); periodic boundaries via `limit`.
# Fix: dropped the unused `xyd` and `dx` let-captures (v has no forcing term,
# so the kernel never needs coordinates or the grid spacing).
kernel_v! = let N=N
@inline function (du, u, A, B, α, II, I, t)
    i, j = Tuple(I)
    # Periodic neighbor indices.
    ip1 = limit(i+1, N); im1 = limit(i-1, N)
    jp1 = limit(j+1, N); jm1 = limit(j-1, N)
    # 5-point Laplacian (scaled by α) + Brusselator reaction (no forcing on v).
    du[II[i,j,2]] = α*(u[II[im1,j,2]] + u[II[ip1,j,2]] + u[II[i,jp1,2]] + u[II[i,jm1,2]] - 4u[II[i,j,2]]) +
                    A*u[II[i,j,1]] - u[II[i,j,1]]^2*u[II[i,j,2]]
end
end
# In-place RHS f!(du, u, p, t) for the 2D Brusselator with periodic BCs.
# p = (A, B, diffusion coefficient, dx). The two kernels are broadcast over
# every grid point; Ref(...) shields the shared arrays from being iterated
# by the broadcast so each kernel call sees the whole array.
# Fixes: removed the dead locals ii1/ii2/ii3 (computed but never read) and
# the unused `xyd` let-capture.
brusselator_2d = let N=N, dx=step(xyd_brusselator)
function (du, u, p, t)
    @inbounds begin
        A = p[1]
        B = p[2]
        α = p[3]/dx^2  # diffusion coefficient scaled by grid spacing
        II = LinearIndices((N, N, 2))
        kernel_u!.(Ref(du), Ref(u), A, B, α, Ref(II), CartesianIndices((N, N)), t)
        kernel_v!.(Ref(du), Ref(u), A, B, α, Ref(II), CartesianIndices((N, N)), t)
        return nothing
    end
end
end
# Parameters (A, B, diffusion coefficient, dx). NOTE(review): only p[1:3] are
# read by brusselator_2d — dx comes from its closure — so p[4] appears
# redundant; confirm nothing else consumes it.
p = (3.4, 1., 10., step(xyd_brusselator))
"""
    init_brusselator_2d(xyd)

Build the initial condition on the `xyd` × `xyd` grid: an N×N×2 array with
`u[:, :, 1] = 22(y(1-y))^(3/2)` and `u[:, :, 2] = 27(x(1-x))^(3/2)`.
"""
function init_brusselator_2d(xyd)
    n = length(xyd)
    state = zeros(n, n, 2)
    for ci in CartesianIndices((n, n))
        xv = xyd[ci[1]]
        yv = xyd[ci[2]]
        state[ci, 1] = 22 * (yv * (1 - yv))^(3 / 2)
        state[ci, 2] = 27 * (xv * (1 - xv))^(3 / 2)
    end
    return state
end
# Smoke-test the RHS: spot-check entries at t = 0 (forcing off) and t = 1.3
# (forcing on). Only entries inside the forced disk change, e.g. index 521.
u0 = init_brusselator_2d(xyd_brusselator)
prob_ode_brusselator_2d = ODEProblem(brusselator_2d,u0,(0.,11.5),p)
du = similar(u0)
brusselator_2d(du, u0, p, 0.0)
du[34] # 802.9807693762164
du[1058] # 985.3120721709204
du[2000] # -403.5817880634729
du[end] # 1431.1460373522068
du[521] # -323.1677459142322
du2 = similar(u0)
brusselator_2d(du2, u0, p, 1.3)
du2[34] # 802.9807693762164
du2[1058] # 985.3120721709204
du2[2000] # -403.5817880634729
du2[end] # 1431.1460373522068
du2[521] # -318.1677459142322
using Symbolics, CUDA, SparseDiffTools
CUDA.allowscalar(false) # Makes slow behavior throw an error
du0 = copy(u0)
# Detect the Jacobian sparsity pattern symbolically, then materialize it.
# NOTE(review): float.() yields a Float64 sparse matrix while the GPU states
# below are Float32 — confirm the eltype mismatch is intentional.
jac_sparsity = float.(Symbolics.jacobian_sparsity((du,u)->brusselator_2d(du,u,p,0.0),du0,u0))
jac_cusparse = cu(jac_sparsity)
colorvec = matrix_colors(jac_sparsity)
f = ODEFunction(brusselator_2d;jac_prototype=jac_sparsity,colorvec = colorvec)
cuf = ODEFunction(brusselator_2d;jac_prototype=jac_cusparse,colorvec = colorvec)
# Four problem variants: dense CPU, sparse CPU, dense GPU, sparse GPU.
# tstops hits the discontinuity introduced by the t >= 1.1 forcing gate.
prob_ode_brusselator_2d = ODEProblem(brusselator_2d,u0,(0f0,11.5f0),p,tstops=[1.1f0])
prob_ode_brusselator_2d_sparse = ODEProblem(f,u0,(0f0,11.5f0),p,tstops=[1.1f0])
prob_ode_brusselator_2d_cuda = ODEProblem(brusselator_2d,cu(u0),(0f0,11.5f0),p,tstops=[1.1f0])
prob_ode_brusselator_2d_cusparse = ODEProblem(cuf,cu(u0),(0f0,11.5f0),p,tstops=[1.1f0])
@time solve(prob_ode_brusselator_2d,Rosenbrock23(),save_everystep=false);
@time solve(prob_ode_brusselator_2d_cuda,Rosenbrock23(),save_everystep=false);
# 11.015065 seconds (6.93 M allocations: 386.634 MiB, 1.08% gc time)
using AlgebraicMultigrid
# `precs` callback for LinearSolve: build (or reuse) a Ruge–Stüben AMG left
# preconditioner from the current W operator; no right preconditioner.
# newW === nothing means "first call", newW == true means W was rebuilt —
# either way a fresh preconditioner is factored; otherwise reuse the old one.
function algebraicmultigrid(W,du,u,p,t,newW,Plprev,Prprev,solverdata)
    rebuild = newW === nothing || newW
    Pl = rebuild ? aspreconditioner(ruge_stuben(convert(AbstractMatrix,W))) : Plprev
    return Pl,nothing
end
# Required due to a bug in Krylov.jl: https://github.com/JuliaSmoothOptimizers/Krylov.jl/pull/477
# NOTE(review): this is type piracy on a type owned by AlgebraicMultigrid and
# it hard-codes Float64 — remove once the upstream Krylov.jl fix is released.
Base.eltype(::AlgebraicMultigrid.Preconditioner) = Float64
# Benchmark CPU-sparse GMRES, GMRES + AMG preconditioning, then the GPU-sparse
# variants; finally the default stiff-solver heuristics via DifferentialEquations.
@time solve(prob_ode_brusselator_2d_sparse,Rosenbrock23(linsolve=KrylovJL_GMRES()),save_everystep=false);
# 1.732695 seconds (2.71 M allocations: 931.278 MiB, 28.04% gc time)
@time solve(prob_ode_brusselator_2d_sparse,Rosenbrock23(linsolve=KrylovJL_GMRES(),precs=algebraicmultigrid,concrete_jac=true),save_everystep=false);
# 0.864588 seconds (366.19 k allocations: 911.324 MiB, 9.27% gc time)
@time solve(prob_ode_brusselator_2d_cusparse,Rosenbrock23(),save_everystep=false);
@time solve(prob_ode_brusselator_2d_cusparse,Rosenbrock23(linsolve=KrylovJL_GMRES(),precs=algebraicmultigrid,concrete_jac=true),save_everystep=false);
using DifferentialEquations
@btime solve(prob_ode_brusselator_2d_cuda,alg_hints=[:stiff],save_everystep=false,abstol=1e-8,reltol=1e-8);
@btime solve(prob_ode_brusselator_2d_cusparse,alg_hints=[:stiff],save_everystep=false,abstol=1e-8,reltol=1e-8);
If `fill!(cuA, true)` is implemented, then https://github.com/JuliaGPU/CUDA.jl/issues/101 is the last remaining piece.
Needs https://github.com/JuliaArrays/ArrayInterface.jl/pull/242 for the sparse matrix tools to be generic enough.
Okay, current state is this. Pure GMRES works:
using OrdinaryDiffEq, LinearAlgebra, SparseArrays, BenchmarkTools, LinearSolve
# Grid resolution and the shared 1D coordinate range used for both spatial axes.
const N = 32
const xyd_brusselator = range(0,stop=1,length=N)
# Time-gated localized forcing: contributes 5.0 inside the disk of radius 0.1
# centered at (0.3, 0.6) once t >= 1.1, and 0.0 elsewhere or earlier.
function brusselator_f(x, y, t)
    inside_disk = (x - 0.3)^2 + (y - 0.6)^2 <= 0.1^2
    gate_open = t >= 1.1
    return (inside_disk & gate_open) * 5.
end
# Periodic wrap of index `a` onto 1:N: N+1 wraps to 1, 0 wraps to N,
# in-range indices pass through unchanged.
function limit(a, N)
    if a == N + 1
        return 1
    elseif a == 0
        return N
    else
        return a
    end
end
# In-place update of the u (activator) component at grid point I. Intended to
# be broadcast over CartesianIndices((N, N)); `II` maps (i, j, species) to a
# linear index into the flat state. Periodic boundaries come from `limit`.
# Fix: dropped the unused `dx` let-capture (the kernel never reads it).
kernel_u! = let N=N, xyd=xyd_brusselator
@inline function (du, u, A, B, α, II, I, t)
    i, j = Tuple(I)
    # Physical coordinates, needed only for the localized forcing term.
    x = xyd[I[1]]
    y = xyd[I[2]]
    # Periodic neighbor indices.
    ip1 = limit(i+1, N); im1 = limit(i-1, N)
    jp1 = limit(j+1, N); jm1 = limit(j-1, N)
    # 5-point Laplacian (scaled by α) + Brusselator reaction + forcing.
    du[II[i,j,1]] = α*(u[II[im1,j,1]] + u[II[ip1,j,1]] + u[II[i,jp1,1]] + u[II[i,jm1,1]] - 4u[II[i,j,1]]) +
                    B + u[II[i,j,1]]^2*u[II[i,j,2]] - (A + 1)*u[II[i,j,1]] + brusselator_f(x, y, t)
end
end
# In-place update of the v (inhibitor) component at grid point I. Broadcast
# over CartesianIndices((N, N)); periodic boundaries via `limit`.
# Fix: dropped the unused `xyd` and `dx` let-captures (v has no forcing term,
# so the kernel never needs coordinates or the grid spacing).
kernel_v! = let N=N
@inline function (du, u, A, B, α, II, I, t)
    i, j = Tuple(I)
    # Periodic neighbor indices.
    ip1 = limit(i+1, N); im1 = limit(i-1, N)
    jp1 = limit(j+1, N); jm1 = limit(j-1, N)
    # 5-point Laplacian (scaled by α) + Brusselator reaction (no forcing on v).
    du[II[i,j,2]] = α*(u[II[im1,j,2]] + u[II[ip1,j,2]] + u[II[i,jp1,2]] + u[II[i,jm1,2]] - 4u[II[i,j,2]]) +
                    A*u[II[i,j,1]] - u[II[i,j,1]]^2*u[II[i,j,2]]
end
end
# In-place RHS f!(du, u, p, t) for the 2D Brusselator with periodic BCs.
# p = (A, B, diffusion coefficient, dx). The two kernels are broadcast over
# every grid point; Ref(...) shields the shared arrays from being iterated
# by the broadcast so each kernel call sees the whole array.
# Fixes: removed the dead locals ii1/ii2/ii3 (computed but never read) and
# the unused `xyd` let-capture.
brusselator_2d = let N=N, dx=step(xyd_brusselator)
function (du, u, p, t)
    @inbounds begin
        A = p[1]
        B = p[2]
        α = p[3]/dx^2  # diffusion coefficient scaled by grid spacing
        II = LinearIndices((N, N, 2))
        kernel_u!.(Ref(du), Ref(u), A, B, α, Ref(II), CartesianIndices((N, N)), t)
        kernel_v!.(Ref(du), Ref(u), A, B, α, Ref(II), CartesianIndices((N, N)), t)
        return nothing
    end
end
end
# Parameters (A, B, diffusion coefficient, dx). NOTE(review): only p[1:3] are
# read by brusselator_2d — dx comes from its closure — so p[4] appears
# redundant; confirm nothing else consumes it.
p = (3.4, 1., 10., step(xyd_brusselator))
"""
    init_brusselator_2d(xyd)

Build the initial condition on the `xyd` × `xyd` grid: an N×N×2 array with
`u[:, :, 1] = 22(y(1-y))^(3/2)` and `u[:, :, 2] = 27(x(1-x))^(3/2)`.
"""
function init_brusselator_2d(xyd)
    n = length(xyd)
    state = zeros(n, n, 2)
    for ci in CartesianIndices((n, n))
        xv = xyd[ci[1]]
        yv = xyd[ci[2]]
        state[ci, 1] = 22 * (yv * (1 - yv))^(3 / 2)
        state[ci, 2] = 27 * (xv * (1 - xv))^(3 / 2)
    end
    return state
end
# Smoke-test the RHS at t = 0 (forcing off) and t = 1.3 (forcing on); only
# entries inside the forced disk change, e.g. index 521.
u0 = init_brusselator_2d(xyd_brusselator)
prob_ode_brusselator_2d = ODEProblem(brusselator_2d,u0,(0.,11.5),p)
du = similar(u0)
brusselator_2d(du, u0, p, 0.0)
du[34] # 802.9807693762164
du[1058] # 985.3120721709204
du[2000] # -403.5817880634729
du[end] # 1431.1460373522068
du[521] # -323.1677459142322
du2 = similar(u0)
brusselator_2d(du2, u0, p, 1.3)
du2[34] # 802.9807693762164
du2[1058] # 985.3120721709204
du2[2000] # -403.5817880634729
du2[end] # 1431.1460373522068
du2[521] # -318.1677459142322
using Symbolics, CUDA, SparseDiffTools
CUDA.allowscalar(false) # Makes slow behavior throw an error
du0 = copy(u0)
# Symbolic sparsity detection; this copy materializes the GPU Jacobian
# directly as a CuSparseMatrixCSR instead of cu(...).
# NOTE(review): float.() gives Float64 while GPU states are Float32 — the
# Float64 CSR prototype shows up in the stacktraces below; confirm intended.
jac_sparsity = float.(Symbolics.jacobian_sparsity((du,u)->brusselator_2d(du,u,p,0.0),du0,u0))
jac_cusparse = CUDA.CUSPARSE.CuSparseMatrixCSR(jac_sparsity)
colorvec = matrix_colors(jac_sparsity)
f = ODEFunction(brusselator_2d;jac_prototype=jac_sparsity,colorvec = colorvec)
cuf = ODEFunction(brusselator_2d;jac_prototype=jac_cusparse,colorvec = colorvec)
prob_ode_brusselator_2d = ODEProblem(brusselator_2d,u0,(0f0,11.5f0),p,tstops=[1.1f0])
prob_ode_brusselator_2d_sparse = ODEProblem(f,u0,(0f0,11.5f0),p,tstops=[1.1f0])
prob_ode_brusselator_2d_cuda = ODEProblem(brusselator_2d,cu(u0),(0f0,11.5f0),p,tstops=[1.1f0])
prob_ode_brusselator_2d_cusparse = ODEProblem(cuf,cu(u0),(0f0,11.5f0),p,tstops=[1.1f0])
@time solve(prob_ode_brusselator_2d,Rosenbrock23(),save_everystep=false);
@time solve(prob_ode_brusselator_2d_cuda,Rosenbrock23(),save_everystep=false);
@time solve(prob_ode_brusselator_2d_cusparse,Rosenbrock23(
linsolve=KrylovJL_GMRES()),save_everystep=false);
Using sparse factorizations is almost fixed via https://github.com/SciML/OrdinaryDiffEq.jl/pull/1599, but the factorization overloads themselves are missing (documented in https://github.com/JuliaGPU/CUDA.jl/issues/1396).
@time solve(prob_ode_brusselator_2d_cusparse,Rosenbrock23(),save_everystep=false);
ArgumentError: cannot take the CPU address of a CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}
unsafe_convert(#unused#::Type{Ptr{Float32}}, x::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}) at array.jl:315
gemqrt!(side::Char, trans::Char, V::Matrix{Float32}, T::Matrix{Float32}, C::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}) at lapack.jl:2949
lmul! at qr.jl:670 [inlined]
ldiv!(A::LinearAlgebra.QRCompactWY{Float32, Matrix{Float32}}, b::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}) at qr.jl:855
ldiv!(Y::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, A::LinearAlgebra.QRCompactWY{Float32, Matrix{Float32}}, B::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}) at factorization.jl:130
_ldiv!(x::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, A::LinearAlgebra.QRCompactWY{Float32, Matrix{Float32}}, b::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}) at factorization.jl:1
#solve#6 at factorization.jl:14 [inlined]
solve at factorization.jl:10 [inlined]
#solve#5 at common.jl:137 [inlined]
solve at common.jl:137 [inlined]
#dolinsolve#3 at misc_utils.jl:105 [inlined]
dolinsolve at misc_utils.jl:86 [inlined]
perform_step!(integrator::OrdinaryDiffEq.ODEIntegrator{Rosenbrock23{12, true, QRFactorization{NoPivot}, typeof(OrdinaryDiffEq.DEFAULT_PRECS), Val{:forward}, true, nothing}, true, CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, Nothing, Float32, NTuple{4, Float64}, Float32, Float32, Float32, Float32, Vector{CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}}, ODESolution{Float32, 4, Vector{CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}}, Nothing, Nothing, Vector{Float32}, Vector{Vector{CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}}}, ODEProblem{CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}, true, NTuple{4, Float64}, ODEFunction{true, var"#11#12"{Int64, Float64}, UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, CUDA.CUSPARSE.CuSparseMatrixCSR{Float64, Int32}, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Vector{Int64}}, Base.Pairs{Symbol, Vector{Float32}, Tuple{Symbol}, NamedTuple{(:tstops,), Tuple{Vector{Float32}}}}, SciMLBase.StandardODEProblem}, Rosenbrock23{12, true, QRFactorization{NoPivot}, typeof(OrdinaryDiffEq.DEFAULT_PRECS), Val{:forward}, true, nothing}, OrdinaryDiffEq.InterpolationData{ODEFunction{true, var"#11#12"{Int64, Float64}, UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, CUDA.CUSPARSE.CuSparseMatrixCSR{Float64, Int32}, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Vector{Int64}}, Vector{CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}}, Vector{Float32}, Vector{Vector{CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}}}, OrdinaryDiffEq.Rosenbrock23Cache{CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, OrdinaryDiffEq.Rosenbrock23Tableau{Float32}, 
SciMLBase.TimeGradientWrapper{ODEFunction{true, var"#11#12"{Int64, Float64}, UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, CUDA.CUSPARSE.CuSparseMatrixCSR{Float64, Int32}, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Vector{Int64}}, CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, NTuple{4, Float64}}, SciMLBase.UJacobianWrapper{ODEFunction{true, var"#11#12"{Int64, Float64}, UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, CUDA.CUSPARSE.CuSparseMatrixCSR{Float64, Int32}, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Vector{Int64}}, Float32, NTuple{4, Float64}}, LinearSolve.LinearCache{CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, SciMLBase.NullParameters, QRFactorization{NoPivot}, LinearAlgebra.QRCompactWY{Float32, Matrix{Float32}}, LinearSolve.InvPreconditioner{Diagonal{Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, Diagonal{Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Float32}, ForwardColorJacCache{CuArray{ForwardDiff.Dual{ForwardDiff.Tag{OrdinaryDiffEq.OrdinaryDiffEqTag, Float32}, Float32, 12}, 3, CUDA.Mem.DeviceBuffer}, CuArray{ForwardDiff.Dual{ForwardDiff.Tag{OrdinaryDiffEq.OrdinaryDiffEqTag, Float32}, Float32, 12}, 3, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, Vector{CuArray{NTuple{12, Float32}, 1, CUDA.Mem.DeviceBuffer}}, Vector{Int64}, CUDA.CUSPARSE.CuSparseMatrixCSR{Float64, Int32}}, CuArray{ForwardDiff.Dual{ForwardDiff.Tag{OrdinaryDiffEq.OrdinaryDiffEqTag, Float32}, Float32, 1}, 3, CUDA.Mem.DeviceBuffer}, Float32, Rosenbrock23{12, true, QRFactorization{NoPivot}, typeof(OrdinaryDiffEq.DEFAULT_PRECS), Val{:forward}, true, nothing}}}, DiffEqBase.DEStats}, ODEFunction{true, var"#11#12"...
Using preconditioners with Krylov almost works.
using AlgebraicMultigrid
function algebraicmultigrid(W,du,u,p,t,newW,Plprev,Prprev,solverdata)
if newW === nothing || newW
Pl = aspreconditioner(ruge_stuben(convert(AbstractMatrix,W)))
else
Pl = Plprev
end
Pl,nothing
end
@time solve(prob_ode_brusselator_2d_cusparse,Rosenbrock23(
linsolve=KrylovJL_GMRES(),precs=algebraicmultigrid,
concrete_jac=true),save_everystep=false);
MethodError: no method matching ruge_stuben(::CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32})
Closest candidates are:
ruge_stuben(!Matched::Union{Hermitian{Ti, TA}, Symmetric{Ti, TA}, TA}) where {Ti, Tv, TA<:SparseMatrixCSC{Ti, Tv}} at C:\Users\accou\.julia\packages\AlgebraicMultigrid\ASpK7\src\classical.jl:10
ruge_stuben(!Matched::Union{Hermitian{Ti, TA}, Symmetric{Ti, TA}, TA}, !Matched::Type{Val{bs}}; strength, CF, presmoother, postsmoother, max_levels, max_coarse, coarse_solver, kwargs...) where {Ti, Tv, bs, TA<:SparseMatrixCSC{Ti, Tv}} at C:\Users\accou\.julia\packages\AlgebraicMultigrid\ASpK7\src\classical.jl:10
algebraicmultigrid(W::OrdinaryDiffEq.WOperator{true, Float32, UniformScaling{Bool}, Float32, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, JacVec{SciMLBase.UJacobianWrapper{ODEFunction{true, var"#11#12"{Int64, Float64}, UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, CUDA.CUSPARSE.CuSparseMatrixCSR{Float64, Int32}, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Vector{Int64}}, Float32, NTuple{4, Float64}}, CuArray{ForwardDiff.Dual{ForwardDiff.Tag{SparseDiffTools.DeivVecTag, Float32}, Float32, 1}, 3, CUDA.Mem.DeviceBuffer}, CuArray{ForwardDiff.Dual{ForwardDiff.Tag{SparseDiffTools.DeivVecTag, Float32}, Float32, 1}, 3, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}}}, du::Nothing, u::CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, p::NTuple{4, Float64}, t::Float32, newW::Nothing, Plprev::Nothing, Prprev::Nothing, solverdata::Nothing) at test.jl:105
alg_cache(alg::Rosenbrock23{12, true, KrylovJL{typeof(Krylov.gmres!), Int64, Tuple{}, Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}}, typeof(algebraicmultigrid), Val{:forward}, true, true}, u::CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, rate_prototype::CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, #unused#::Type{Float32}, #unused#::Type{Float32}, #unused#::Type{Float32}, uprev::CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, uprev2::CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, f::ODEFunction{true, var"#11#12"{Int64, Float64}, UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, CUDA.CUSPARSE.CuSparseMatrixCSR{Float64, Int32}, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Vector{Int64}}, t::Float32, dt::Float32, reltol::Float32, p::NTuple{4, Float64}, calck::Bool, #unused#::Val{true}) at rosenbrock_caches.jl:84
__init(prob::ODEProblem{CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}, true, NTuple{4, Float64}, ODEFunction{true, var"#11#12"{Int64, Float64}, UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, CUDA.CUSPARSE.CuSparseMatrixCSR{Float64, Int32}, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Vector{Int64}}, Base.Pairs{Symbol, Vector{Float32}, Tuple{Symbol}, NamedTuple{(:tstops,), Tuple{Vector{Float32}}}}, SciMLBase.StandardODEProblem}, alg::Rosenbrock23{12, true, KrylovJL{typeof(Krylov.gmres!), Int64, Tuple{}, Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}}, typeof(algebraicmultigrid), Val{:forward}, true, true}, timeseries_init::Tuple{}, ts_init::Tuple{}, ks_init::Tuple{}, recompile::Type{Val{true}}; saveat::Tuple{}, tstops::Vector{Float32}, d_discontinuities::Tuple{}, save_idxs::Nothing, save_everystep::Bool, save_on::Bool, save_start::Bool, save_end::Nothing, callback::Nothing, dense::Bool, calck::Bool, dt::Float32, dtmin::Nothing, dtmax::Float32, force_dtmin::Bool, adaptive::Bool, gamma::Rational{Int64}, abstol::Nothing, reltol::Nothing, qmin::Rational{Int64}, qmax::Int64, qsteady_min::Int64, qsteady_max::Rational{Int64}, beta1::Nothing, beta2::Nothing, qoldinit::Rational{Int64}, controller::Nothing, fullnormalize::Bool, failfactor::Int64, maxiters::Int64, internalnorm::typeof(DiffEqBase.ODE_DEFAULT_NORM), internalopnorm::typeof(opnorm), isoutofdomain::typeof(DiffEqBase.ODE_DEFAULT_ISOUTOFDOMAIN), unstable_check::typeof(DiffEqBase.ODE_DEFAULT_UNSTABLE_CHECK), verbose::Bool, timeseries_errors::Bool, dense_errors::Bool, advance_to_tstop::Bool, stop_at_next_tstop::Bool, initialize_save::Bool, progress::Bool, progress_steps::Int64, progress_name::String, progress_message::typeof(DiffEqBase.ODE_DEFAULT_PROG_MESSAGE), userdata::Nothing, allow_extrapolation::Bool, initialize_integrator::Bool, alias_u0::Bool, alias_du0::Bool, 
initializealg::OrdinaryDiffEq.DefaultInit, kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}) at solve.jl:295
(::SciMLBase.var"#__init##kw")(::NamedTuple{(:tstops, :save_everystep), Tuple{Vector{Float32}, Bool}}, ::typeof(SciMLBase.__init), prob::ODEProblem{CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}, true, NTuple{4, Float64}, ODEFun...
@ranjanan is there a reason that ruge_stuben
couldn't be changed to handle sparse GPU matrices? I thought GPU was handled? https://github.com/JuliaLinearAlgebra/AlgebraicMultigrid.jl/issues/93
We can do an ILU!
using OrdinaryDiffEq, LinearAlgebra, SparseArrays, BenchmarkTools, LinearSolve
# Grid resolution and the shared 1D coordinate range used for both spatial axes.
const N = 32
const xyd_brusselator = range(0, stop = 1, length = N)
# Time-gated localized forcing: contributes 5.0 inside the disk of radius 0.1
# centered at (0.3, 0.6) once t >= 1.1, and 0.0 elsewhere or earlier.
function brusselator_f(x, y, t)
    inside_disk = (x - 0.3)^2 + (y - 0.6)^2 <= 0.1^2
    gate_open = t >= 1.1
    return (inside_disk & gate_open) * 5.0
end
# Periodic wrap of index `a` onto 1:N: N+1 wraps to 1, 0 wraps to N,
# in-range indices pass through unchanged.
function limit(a, N)
    if a == N + 1
        return 1
    elseif a == 0
        return N
    else
        return a
    end
end
# In-place update of the u (activator) component at grid point I. Intended to
# be broadcast over CartesianIndices((N, N)); `II` maps (i, j, species) to a
# linear index into the flat state. Periodic boundaries come from `limit`.
# Fix: dropped the unused `dx` let-capture (the kernel never reads it).
kernel_u! = let N = N, xyd = xyd_brusselator
@inline function (du, u, A, B, α, II, I, t)
    i, j = Tuple(I)
    # Physical coordinates, needed only for the localized forcing term.
    x = xyd[I[1]]
    y = xyd[I[2]]
    # Periodic neighbor indices.
    ip1 = limit(i + 1, N)
    im1 = limit(i - 1, N)
    jp1 = limit(j + 1, N)
    jm1 = limit(j - 1, N)
    # 5-point Laplacian (scaled by α) + Brusselator reaction + forcing.
    du[II[i, j, 1]] = α * (u[II[im1, j, 1]] + u[II[ip1, j, 1]] + u[II[i, jp1, 1]] + u[II[i, jm1, 1]] - 4u[II[i, j, 1]]) +
                      B + u[II[i, j, 1]]^2 * u[II[i, j, 2]] - (A + 1) * u[II[i, j, 1]] + brusselator_f(x, y, t)
end
end
# In-place update of the v (inhibitor) component at grid point I. Broadcast
# over CartesianIndices((N, N)); periodic boundaries via `limit`.
# Fix: dropped the unused `xyd` and `dx` let-captures (v has no forcing term,
# so the kernel never needs coordinates or the grid spacing).
kernel_v! = let N = N
@inline function (du, u, A, B, α, II, I, t)
    i, j = Tuple(I)
    # Periodic neighbor indices.
    ip1 = limit(i + 1, N)
    im1 = limit(i - 1, N)
    jp1 = limit(j + 1, N)
    jm1 = limit(j - 1, N)
    # 5-point Laplacian (scaled by α) + Brusselator reaction (no forcing on v).
    du[II[i, j, 2]] = α * (u[II[im1, j, 2]] + u[II[ip1, j, 2]] + u[II[i, jp1, 2]] + u[II[i, jm1, 2]] - 4u[II[i, j, 2]]) +
                      A * u[II[i, j, 1]] - u[II[i, j, 1]]^2 * u[II[i, j, 2]]
end
end
# In-place RHS f!(du, u, p, t) for the 2D Brusselator with periodic BCs.
# p = (A, B, diffusion coefficient, dx). The two kernels are broadcast over
# every grid point; Ref(...) shields the shared arrays from being iterated
# by the broadcast so each kernel call sees the whole array.
# Fixes: removed the dead locals ii1/ii2/ii3 (computed but never read) and
# the unused `xyd` let-capture.
brusselator_2d = let N = N, dx = step(xyd_brusselator)
function (du, u, p, t)
    @inbounds begin
        A = p[1]
        B = p[2]
        α = p[3] / dx^2  # diffusion coefficient scaled by grid spacing
        II = LinearIndices((N, N, 2))
        kernel_u!.(Ref(du), Ref(u), A, B, α, Ref(II), CartesianIndices((N, N)), t)
        kernel_v!.(Ref(du), Ref(u), A, B, α, Ref(II), CartesianIndices((N, N)), t)
        return nothing
    end
end
end
# Parameters (A, B, diffusion coefficient, dx). NOTE(review): only p[1:3] are
# read by brusselator_2d — dx comes from its closure — so p[4] appears
# redundant; confirm nothing else consumes it.
p = (3.4, 1.0, 10.0, step(xyd_brusselator))
"""
    init_brusselator_2d(xyd)

Build the initial condition on the `xyd` × `xyd` grid: an N×N×2 array with
`u[:, :, 1] = 22(y(1-y))^(3/2)` and `u[:, :, 2] = 27(x(1-x))^(3/2)`.
"""
function init_brusselator_2d(xyd)
    n = length(xyd)
    state = zeros(n, n, 2)
    for ci in CartesianIndices((n, n))
        xv = xyd[ci[1]]
        yv = xyd[ci[2]]
        state[ci, 1] = 22 * (yv * (1 - yv))^(3 / 2)
        state[ci, 2] = 27 * (xv * (1 - xv))^(3 / 2)
    end
    return state
end
# Smoke-test the RHS at t = 0 (forcing off) and t = 1.3 (forcing on); only
# entries inside the forced disk change, e.g. index 521.
u0 = init_brusselator_2d(xyd_brusselator)
prob_ode_brusselator_2d = ODEProblem(brusselator_2d, u0, (0.0, 11.5), p)
du = similar(u0)
brusselator_2d(du, u0, p, 0.0)
du[34] # 802.9807693762164
du[1058] # 985.3120721709204
du[2000] # -403.5817880634729
du[end] # 1431.1460373522068
du[521] # -323.1677459142322
du2 = similar(u0)
brusselator_2d(du2, u0, p, 1.3)
du2[34] # 802.9807693762164
du2[1058] # 985.3120721709204
du2[2000] # -403.5817880634729
du2[end] # 1431.1460373522068
du2[521] # -318.1677459142322
using Symbolics, CUDA, SparseDiffTools
CUDA.allowscalar(false) # Makes slow behavior throw an error
du0 = copy(u0)
# This copy fixes the eltype mismatch of the earlier attempts: the sparsity
# prototype is built as Float32 to match the Float32 GPU state.
jac_sparsity = Float32.(Symbolics.jacobian_sparsity((du, u) -> brusselator_2d(du, u, p, 0.0), du0, u0))
jac_cusparse = CUDA.CUSPARSE.CuSparseMatrixCSR(jac_sparsity)
colorvec = matrix_colors(jac_sparsity)
f = ODEFunction(brusselator_2d; jac_prototype = jac_sparsity, colorvec = colorvec)
cuf = ODEFunction(brusselator_2d; jac_prototype = jac_cusparse, colorvec = colorvec)
prob_ode_brusselator_2d_cusparse = ODEProblem(cuf, cu(u0), (0.0f0, 11.5f0), p, tstops = [1.1f0])
# Pure (unpreconditioned) GMRES on the GPU-sparse problem.
@time solve(prob_ode_brusselator_2d_cusparse, Rosenbrock23(
linsolve = KrylovJL_GMRES()), save_everystep = false);
# `precs` callback for LinearSolve: compute an in-place incomplete LU
# factorization of W on the GPU (cusparse ilu02!, one-based indexing 'O') and
# hand it back as the left preconditioner; no right preconditioner.
# newW === nothing means "first call", newW == true means W was rebuilt.
function ilu(W, du, u, p, t, newW, Plprev, Prprev, solverdata)
    if newW === nothing || newW
        fac = convert(AbstractMatrix, W)
        CUDA.CUSPARSE.ilu02!(fac, 'O')  # overwrites fac with its ILU(0) factors
        Pl = fac
    else
        Pl = Plprev
    end
    return Pl, nothing
end
# GMRES with the GPU ILU(0) preconditioner — fails at ldiv! (see error below).
@time solve(prob_ode_brusselator_2d_cusparse, Rosenbrock23(
linsolve = KrylovJL_GMRES(), precs = ilu,
concrete_jac = true), save_everystep = false)
But can't use it?
ERROR: LoadError: MethodError: no method matching ldiv!(::CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, ::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer})
Closest candidates are:
ldiv!(::Any, ::ChainRulesCore.AbstractThunk) at C:\Users\accou\.julia\packages\ChainRulesCore\IzITE\src\tangent_types\thunks.jl:88
ldiv!(::Any, ::LinearSolve.InvPreconditioner, ::Any) at C:\Users\accou\.julia\dev\LinearSolve\src\preconditioners.jl:30
ldiv!(::Any, ::LinearSolve.ComposePreconditioner, ::Any) at C:\Users\accou\.julia\dev\LinearSolve\src\preconditioners.jl:17
...
Stacktrace:
[1] ldiv!(y::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, A::LinearSolve.ComposePreconditioner{LinearSolve.InvPreconditioner{Diagonal{Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}}, x::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer})
@ LinearSolve C:\Users\accou\.julia\dev\LinearSolve\src\preconditioners.jl:21
[2] mul!(y::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, A::LinearSolve.InvPreconditioner{LinearSolve.ComposePreconditioner{LinearSolve.InvPreconditioner{Diagonal{Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}}}, x::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer})
@ LinearSolve C:\Users\accou\.julia\dev\LinearSolve\src\preconditioners.jl:31
[3] gmres!(solver::Krylov.GmresSolver{Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, A::OrdinaryDiffEq.WOperator{true, Float32, UniformScaling{Bool}, Float32, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, JacVec{SciMLBase.UJacobianWrapper{ODEFunction{true, var"#9#10"{Int64, Float64}, UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Vector{Int64}}, Float32, NTuple{4, Float64}}, CuArray{ForwardDiff.Dual{ForwardDiff.Tag{SparseDiffTools.DeivVecTag, Float32}, Float32, 1}, 3, CUDA.Mem.DeviceBuffer}, CuArray{ForwardDiff.Dual{ForwardDiff.Tag{SparseDiffTools.DeivVecTag, Float32}, Float32, 1}, 3, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}}}, b::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}; M::LinearSolve.InvPreconditioner{LinearSolve.ComposePreconditioner{LinearSolve.InvPreconditioner{Diagonal{Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}}}, N::LinearSolve.InvPreconditioner{Diagonal{Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, atol::Float32, rtol::Float32, reorthogonalization::Bool, itmax::Int64, restart::Bool, verbose::Int64, history::Bool)
@ Krylov C:\Users\accou\.julia\packages\Krylov\73zDt\src\gmres.jl:88
[4] solve!(::Krylov.GmresSolver{Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, ::OrdinaryDiffEq.WOperator{true, Float32, UniformScaling{Bool}, Float32, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, JacVec{SciMLBase.UJacobianWrapper{ODEFunction{true, var"#9#10"{Int64, Float64}, UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Vector{Int64}}, Float32, NTuple{4, Float64}}, CuArray{ForwardDiff.Dual{ForwardDiff.Tag{SparseDiffTools.DeivVecTag, Float32}, Float32, 1}, 3, CUDA.Mem.DeviceBuffer}, CuArray{ForwardDiff.Dual{ForwardDiff.Tag{SparseDiffTools.DeivVecTag, Float32}, Float32, 1}, 3, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}}}, ::Vararg{Any}; kwargs::Base.Pairs{Symbol, Any, NTuple{6, Symbol}, NamedTuple{(:M, :N, :atol, :rtol, :itmax, :verbose), Tuple{LinearSolve.InvPreconditioner{LinearSolve.ComposePreconditioner{LinearSolve.InvPreconditioner{Diagonal{Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}}}, LinearSolve.InvPreconditioner{Diagonal{Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, Float32, Float32, Int64, Int64}}})
@ Krylov C:\Users\accou\.julia\packages\Krylov\73zDt\src\krylov_solvers.jl:1632
@ChrisRackauckas It seems there might have been a change to SparseDiffTools that broke the solve(prob_ode_brusselator_2d_cusparse, Rosenbrock23(linsolve = KrylovJL_GMRES()), save_everystep = false);
call. Running this I get the following error:
ERROR: MethodError: no method matching get_tag(::CuArray{ForwardDiff.Dual{ForwardDiff.Tag{DiffEqBase.OrdinaryDiffEqTag, Float32}, Float32, 1}, 3, CUDA.Mem.DeviceBuffer})
Closest candidates are:
get_tag(::Array{ForwardDiff.Dual{T, V, N}}) where {T, V, N}
@ SparseDiffTools C:\Users\Sam\.julia\packages\SparseDiffTools\6bm6M\src\differentiation\jaches_products.jl:3
get_tag(::ForwardDiff.Dual{T, V, N}) where {T, V, N}
@ SparseDiffTools C:\Users\Sam\.julia\packages\SparseDiffTools\6bm6M\src\differentiation\jaches_products.jl:4
Stacktrace:
[1] auto_jacvec!(dy::CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, f::SciMLBase.UJacobianWrapper{ODEFunction{true, SciMLBase.FullSpecialize, var"#11#12"{Int64, Float64}, UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, CUDA.CUSPARSE.CuSparseMatrixCSR{Float64, Int32}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), CuArray{Int64, 1, CUDA.Mem.DeviceBuffer}, Nothing}, Float32, NTuple{4, Float64}}, x::CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, v::CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, cache1::CuArray{ForwardDiff.Dual{ForwardDiff.Tag{DiffEqBase.OrdinaryDiffEqTag, Float32}, Float32, 1}, 3, CUDA.Mem.DeviceBuffer}, cache2::CuArray{ForwardDiff.Dual{ForwardDiff.Tag{DiffEqBase.OrdinaryDiffEqTag, Float32}, Float32, 1}, 3, CUDA.Mem.DeviceBuffer})
@ SparseDiffTools C:\Users\Sam\.julia\packages\SparseDiffTools\6bm6M\src\differentiation\jaches_products.jl:12
[2] (::SparseDiffTools.FwdModeAutoDiffVecProd{SciMLBase.UJacobianWrapper{ODEFunction{true, SciMLBase.FullSpecialize, var"#11#12"{Int64, Float64}, UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, CUDA.CUSPARSE.CuSparseMatrixCSR{Float64, Int32}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), CuArray{Int64, 1, CUDA.Mem.DeviceBuffer}, Nothing}, Float32, NTuple{4, Float64}}, CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, Tuple{CuArray{ForwardDiff.Dual{ForwardDiff.Tag{DiffEqBase.OrdinaryDiffEqTag, Float32}, Float32, 1}, 3, CUDA.Mem.DeviceBuffer}, CuArray{ForwardDiff.Dual{ForwardDiff.Tag{DiffEqBase.OrdinaryDiffEqTag, Float32}, Float32, 1}, 3, CUDA.Mem.DeviceBuffer}}, typeof(auto_jacvec), typeof(auto_jacvec!)})(dv::CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, v::CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, p::NTuple{4, Float64}, t::Float32)
@ SparseDiffTools C:\Users\Sam\.julia\packages\SparseDiffTools\6bm6M\src\differentiation\jaches_products.jl:214
[3] mul!(v::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, L::FunctionOperator{true, true, false, Float32, SparseDiffTools.FwdModeAutoDiffVecProd{SciMLBase.UJacobianWrapper{ODEFunction{true, SciMLBase.FullSpecialize, var"#11#12"{Int64, Float64}, UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, CUDA.CUSPARSE.CuSparseMatrixCSR{Float64, Int32}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), CuArray{Int64, 1, CUDA.Mem.DeviceBuffer}, Nothing}, Float32, NTuple{4, Float64}}, CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, Tuple{CuArray{ForwardDiff.Dual{ForwardDiff.Tag{DiffEqBase.OrdinaryDiffEqTag, Float32}, Float32, 1}, 3, CUDA.Mem.DeviceBuffer}, CuArray{ForwardDiff.Dual{ForwardDiff.Tag{DiffEqBase.OrdinaryDiffEqTag, Float32}, Float32, 1}, 3, CUDA.Mem.DeviceBuffer}}, typeof(auto_jacvec), typeof(auto_jacvec!)}, Nothing, Nothing, Nothing, NamedTuple{(:islinear, :isconvertible, :isconstant, :opnorm, :issymmetric, :ishermitian, :isposdef, :isinplace, :outofplace, :has_mul5, :ifcache, :T, :batch, :size, :sizes, :eltypes, :accepted_kwargs, :kwargs), Tuple{Bool, Bool, Bool, Nothing, Bool, Bool, Bool, Bool, Bool, Bool, Bool, DataType, Bool, Tuple{Int64, Int64}, Tuple{Tuple{Int64, Int64, Int64}, Tuple{Int64, Int64, Int64}}, Tuple{DataType, DataType}, Tuple{}, Dict{Symbol, Any}}}, NTuple{4, Float64}, Float32, Tuple{CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}}}, u::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer})
Update: I'm going to put up a separate issue over on SparseDiffTools, so this can hopefully be resolved.
Thanks for the report, that issue got solved. So we're at least back to the starting point here.
I was trying to write a tutorial on using CUDA and CuSparse for PDEs but found that CuSparse was a bit too incomplete. The setup was fine:
Using a dense Jacobian was also fine:
but CUDA doesn't help dense. But this case is all about sparse. And that's where I ran into issues. The first was https://github.com/JuliaGPU/CUDA.jl/issues/1316 which was turning the sparse GPU Jacobian into a dense CPU Jacobian. That was easy enough to workaround https://github.com/SciML/DiffEqBase.jl/pull/727 . But even with that,
hit lack of broadcast support https://github.com/JuliaGPU/CUDA.jl/issues/1317, while
hit a different case of https://github.com/JuliaGPU/CUDA.jl/issues/1317. So I think that's a no-go until broadcast support is completed.