SciML / OrdinaryDiffEq.jl

High performance ordinary differential equation (ODE) and differential-algebraic equation (DAE) solvers, including neural ordinary differential equations (neural ODEs) and scientific machine learning (SciML)
https://diffeq.sciml.ai/latest/
Other
529 stars 200 forks source link

CuSparse Completion for ODEs #1566

Open ChrisRackauckas opened 2 years ago

ChrisRackauckas commented 2 years ago

I was trying to write a tutorial on using CUDA and CuSparse for PDEs but found that CuSparse was a bit too incomplete. The setup was fine:

using OrdinaryDiffEq, LinearAlgebra, SparseArrays, BenchmarkTools, LinearSolve

const N = 32
const xyd_brusselator = range(0,stop=1,length=N)
function brusselator_f(x, y, t)
    # Localized forcing term: value 5 inside the disk of radius 0.1 centered at
    # (0.3, 0.6), switched on once t >= 1.1; zero elsewhere/earlier.
    in_disk = (x - 0.3)^2 + (y - 0.6)^2 <= 0.1^2
    switched_on = t >= 1.1
    return in_disk * switched_on * 5.0
end
function limit(a, N)
    # Wrap an index for periodic boundaries on a 1-based grid of size N:
    # N+1 maps to 1, 0 maps to N, everything else is returned unchanged.
    if a == N + 1
        return 1
    elseif a == 0
        return N
    else
        return a
    end
end
kernel_u! = let N=N, xyd=xyd_brusselator, dx=step(xyd_brusselator)
  # Per-grid-point update of the first (u) component of the 2D Brusselator.
  # Intended to be broadcast over CartesianIndices((N, N)) — see brusselator_2d.
  #   du, u : flat state arrays addressed through the linear index map II
  #   A, B  : Brusselator reaction parameters
  #   α     : diffusion coefficient already divided by dx^2 by the caller
  #   II    : LinearIndices((N, N, 2)) mapping (i, j, component) -> flat index
  #   I     : CartesianIndex (i, j) of the point being updated
  #   t     : current time, fed to the localized forcing brusselator_f
  @inline function (du, u, A, B, α, II, I, t)
    i, j = Tuple(I)
    x = xyd[I[1]]
    y = xyd[I[2]]
    # Periodic neighbor indices: limit wraps 0 -> N and N+1 -> 1.
    ip1 = limit(i+1, N); im1 = limit(i-1, N)
    jp1 = limit(j+1, N); jm1 = limit(j-1, N)
    # 5-point Laplacian of u plus Brusselator reaction terms and the forcing.
    du[II[i,j,1]] = α*(u[II[im1,j,1]] + u[II[ip1,j,1]] + u[II[i,jp1,1]] + u[II[i,jm1,1]] - 4u[II[i,j,1]]) +
    B + u[II[i,j,1]]^2*u[II[i,j,2]] - (A + 1)*u[II[i,j,1]] + brusselator_f(x, y, t)
  end
end
kernel_v! = let N=N, xyd=xyd_brusselator, dx=step(xyd_brusselator)
  # Per-grid-point update of the second (v) component of the 2D Brusselator.
  # Same calling convention as kernel_u!; broadcast over CartesianIndices((N, N)).
  # Note: no forcing term and no x/y lookup here — v only diffuses and reacts.
  @inline function (du, u, A, B, α, II, I, t)
    i, j = Tuple(I)
    # Periodic neighbor indices: limit wraps 0 -> N and N+1 -> 1.
    ip1 = limit(i+1, N)
    im1 = limit(i-1, N)
    jp1 = limit(j+1, N)
    jm1 = limit(j-1, N)
    # 5-point Laplacian of v plus the A*u - u^2*v reaction terms.
    du[II[i,j,2]] = α*(u[II[im1,j,2]] + u[II[ip1,j,2]] + u[II[i,jp1,2]] + u[II[i,jm1,2]] - 4u[II[i,j,2]]) +
    A*u[II[i,j,1]] - u[II[i,j,1]]^2*u[II[i,j,2]]
  end
end
brusselator_2d = let N=N, xyd=xyd_brusselator, dx=step(xyd_brusselator)
  # In-place ODE right-hand side f!(du, u, p, t) for the 2D Brusselator.
  # p = (A, B, diffusion coefficient, dx). The two kernels are broadcast over
  # every grid point; Ref(...) shields the arrays and the index map from
  # participating in the broadcast so only CartesianIndices varies.
  function (du, u, p, t)
    @inbounds begin
      # NOTE(review): ii1/ii2/ii3 are computed but never read in this body —
      # they look like leftovers from an earlier indexing scheme.
      ii1 = N^2
      ii2 = ii1+N^2
      ii3 = ii2+2(N^2)
      A = p[1]
      B = p[2]
      # Fold the grid spacing into the diffusion coefficient once per call.
      α = p[3]/dx^2
      II = LinearIndices((N, N, 2))
      kernel_u!.(Ref(du), Ref(u), A, B, α, Ref(II), CartesianIndices((N, N)), t)
      kernel_v!.(Ref(du), Ref(u), A, B, α, Ref(II), CartesianIndices((N, N)), t)
      return nothing
    end
  end
end
p = (3.4, 1., 10., step(xyd_brusselator))

function init_brusselator_2d(xyd)
  # Build the initial (N, N, 2) state for the 2D Brusselator on the grid `xyd`:
  # component 1 depends only on the y coordinate, component 2 only on x.
  n = length(xyd)
  state = zeros(n, n, 2)
  for j in 1:n, i in 1:n
    x = xyd[i]
    y = xyd[j]
    state[i, j, 1] = 22 * (y * (1 - y))^(3 / 2)
    state[i, j, 2] = 27 * (x * (1 - x))^(3 / 2)
  end
  return state
end
u0 = init_brusselator_2d(xyd_brusselator)
prob_ode_brusselator_2d = ODEProblem(brusselator_2d,u0,(0.,11.5),p)

du = similar(u0)
brusselator_2d(du, u0, p, 0.0)
du[34] # 802.9807693762164
du[1058] # 985.3120721709204
du[2000] # -403.5817880634729
du[end] # 1431.1460373522068
du[521] # -323.1677459142322

du2 = similar(u0)
brusselator_2d(du2, u0, p, 1.3)
du2[34] # 802.9807693762164
du2[1058] # 985.3120721709204
du2[2000] # -403.5817880634729
du2[end] # 1431.1460373522068
du2[521] # -318.1677459142322

using Symbolics, CUDA, SparseDiffTools
CUDA.allowscalar(false) # Makes slow behavior throw an error
du0 = copy(u0)
jac_sparsity = float.(Symbolics.jacobian_sparsity((du,u)->brusselator_2d(du,u,p,0.0),du0,u0))
jac_cusparse = cu(jac_sparsity)
colorvec = matrix_colors(jac_sparsity)
f = ODEFunction(brusselator_2d;jac_prototype=jac_sparsity,colorvec = colorvec)
cuf = ODEFunction(brusselator_2d;jac_prototype=jac_cusparse,colorvec = colorvec)

prob_ode_brusselator_2d = ODEProblem(brusselator_2d,u0,(0f0,11.5f0),p,tstops=[1.1f0])
prob_ode_brusselator_2d_sparse = ODEProblem(f,u0,(0f0,11.5f0),p,tstops=[1.1f0])

prob_ode_brusselator_2d_cuda = ODEProblem(brusselator_2d,cu(u0),(0f0,11.5f0),p,tstops=[1.1f0])
prob_ode_brusselator_2d_cusparse = ODEProblem(cuf,cu(u0),(0f0,11.5f0),p,tstops=[1.1f0])

Using dense matrices was fine:

@time solve(prob_ode_brusselator_2d,Rosenbrock23(),save_everystep=false);
# 9.630198 seconds (2.52 M allocations: 4.514 GiB, 10.03% gc time)
@time solve(prob_ode_brusselator_2d_cuda,Rosenbrock23(),save_everystep=false);
# 11.015065 seconds (6.93 M allocations: 386.634 MiB, 1.08% gc time)

but CUDA doesn't help dense. But this case is all about sparse. And that's where I ran into issues. The first was https://github.com/JuliaGPU/CUDA.jl/issues/1316 which was turning the sparse GPU Jacobian into a dense CPU Jacobian. That was easy enough to workaround https://github.com/SciML/DiffEqBase.jl/pull/727 . But even with that,

@time solve(prob_ode_brusselator_2d_cusparse,Rosenbrock23(),save_everystep=false);

hit lack of broadcast support https://github.com/JuliaGPU/CUDA.jl/issues/1317, while

@time solve(prob_ode_brusselator_2d_cusparse,Rosenbrock23(linsolve=KrylovJL_GMRES(),precs=algebraicmultigrid,concrete_jac=true),save_everystep=false);

hit a different case of https://github.com/JuliaGPU/CUDA.jl/issues/1317. So I think that's a no-go until broadcast support is completed.

ChrisRackauckas commented 2 years ago

Full attempted tutorial code for future reference:

using OrdinaryDiffEq, LinearAlgebra, SparseArrays, BenchmarkTools, LinearSolve

const N = 32
const xyd_brusselator = range(0,stop=1,length=N)
brusselator_f(x, y, t) = (((x-0.3)^2 + (y-0.6)^2) <= 0.1^2) * (t >= 1.1) * 5.
limit(a, N) = a == N+1 ? 1 : a == 0 ? N : a
kernel_u! = let N=N, xyd=xyd_brusselator, dx=step(xyd_brusselator)
  @inline function (du, u, A, B, α, II, I, t)
    i, j = Tuple(I)
    x = xyd[I[1]]
    y = xyd[I[2]]
    ip1 = limit(i+1, N); im1 = limit(i-1, N)
    jp1 = limit(j+1, N); jm1 = limit(j-1, N)
    du[II[i,j,1]] = α*(u[II[im1,j,1]] + u[II[ip1,j,1]] + u[II[i,jp1,1]] + u[II[i,jm1,1]] - 4u[II[i,j,1]]) +
    B + u[II[i,j,1]]^2*u[II[i,j,2]] - (A + 1)*u[II[i,j,1]] + brusselator_f(x, y, t)
  end
end
kernel_v! = let N=N, xyd=xyd_brusselator, dx=step(xyd_brusselator)
  @inline function (du, u, A, B, α, II, I, t)
    i, j = Tuple(I)
    ip1 = limit(i+1, N)
    im1 = limit(i-1, N)
    jp1 = limit(j+1, N)
    jm1 = limit(j-1, N)
    du[II[i,j,2]] = α*(u[II[im1,j,2]] + u[II[ip1,j,2]] + u[II[i,jp1,2]] + u[II[i,jm1,2]] - 4u[II[i,j,2]]) +
    A*u[II[i,j,1]] - u[II[i,j,1]]^2*u[II[i,j,2]]
  end
end
brusselator_2d = let N=N, xyd=xyd_brusselator, dx=step(xyd_brusselator)
  function (du, u, p, t)
    @inbounds begin
      ii1 = N^2
      ii2 = ii1+N^2
      ii3 = ii2+2(N^2)
      A = p[1]
      B = p[2]
      α = p[3]/dx^2
      II = LinearIndices((N, N, 2))
      kernel_u!.(Ref(du), Ref(u), A, B, α, Ref(II), CartesianIndices((N, N)), t)
      kernel_v!.(Ref(du), Ref(u), A, B, α, Ref(II), CartesianIndices((N, N)), t)
      return nothing
    end
  end
end
p = (3.4, 1., 10., step(xyd_brusselator))

function init_brusselator_2d(xyd)
  N = length(xyd)
  u = zeros(N, N, 2)
  for I in CartesianIndices((N, N))
    x = xyd[I[1]]
    y = xyd[I[2]]
    u[I,1] = 22*(y*(1-y))^(3/2)
    u[I,2] = 27*(x*(1-x))^(3/2)
  end
  u
end
u0 = init_brusselator_2d(xyd_brusselator)
prob_ode_brusselator_2d = ODEProblem(brusselator_2d,u0,(0.,11.5),p)

du = similar(u0)
brusselator_2d(du, u0, p, 0.0)
du[34] # 802.9807693762164
du[1058] # 985.3120721709204
du[2000] # -403.5817880634729
du[end] # 1431.1460373522068
du[521] # -323.1677459142322

du2 = similar(u0)
brusselator_2d(du2, u0, p, 1.3)
du2[34] # 802.9807693762164
du2[1058] # 985.3120721709204
du2[2000] # -403.5817880634729
du2[end] # 1431.1460373522068
du2[521] # -318.1677459142322

using Symbolics, CUDA, SparseDiffTools
CUDA.allowscalar(false) # Makes slow behavior throw an error
du0 = copy(u0)
jac_sparsity = float.(Symbolics.jacobian_sparsity((du,u)->brusselator_2d(du,u,p,0.0),du0,u0))
jac_cusparse = cu(jac_sparsity)
colorvec = matrix_colors(jac_sparsity)
f = ODEFunction(brusselator_2d;jac_prototype=jac_sparsity,colorvec = colorvec)
cuf = ODEFunction(brusselator_2d;jac_prototype=jac_cusparse,colorvec = colorvec)

prob_ode_brusselator_2d = ODEProblem(brusselator_2d,u0,(0f0,11.5f0),p,tstops=[1.1f0])
prob_ode_brusselator_2d_sparse = ODEProblem(f,u0,(0f0,11.5f0),p,tstops=[1.1f0])

prob_ode_brusselator_2d_cuda = ODEProblem(brusselator_2d,cu(u0),(0f0,11.5f0),p,tstops=[1.1f0])
prob_ode_brusselator_2d_cusparse = ODEProblem(cuf,cu(u0),(0f0,11.5f0),p,tstops=[1.1f0])

@time solve(prob_ode_brusselator_2d,Rosenbrock23(),save_everystep=false);
@time solve(prob_ode_brusselator_2d_cuda,Rosenbrock23(),save_everystep=false);
# 11.015065 seconds (6.93 M allocations: 386.634 MiB, 1.08% gc time)

using AlgebraicMultigrid
# `precs` callback for OrdinaryDiffEq/LinearSolve: build a left preconditioner
# from the current W = M - gamma*J operator. The Ruge–Stüben AMG hierarchy is
# rebuilt only when W has changed (newW is true or unknown); otherwise the
# previous left preconditioner is reused.
function algebraicmultigrid(W,du,u,p,t,newW,Plprev,Prprev,solverdata)
  if newW === nothing || newW
    Pl = aspreconditioner(ruge_stuben(convert(AbstractMatrix,W)))
  else
    Pl = Plprev
  end
  # Return the (left, right) preconditioner pair; no right preconditioner.
  Pl,nothing
end

# Required due to a bug in Krylov.jl: https://github.com/JuliaSmoothOptimizers/Krylov.jl/pull/477
# NOTE(review): this is type piracy (a method on Base.eltype for a type owned by
# AlgebraicMultigrid) and hard-codes Float64 — remove once the upstream fix lands.
Base.eltype(::AlgebraicMultigrid.Preconditioner) = Float64

@time solve(prob_ode_brusselator_2d_sparse,Rosenbrock23(linsolve=KrylovJL_GMRES()),save_everystep=false);
# 1.732695 seconds (2.71 M allocations: 931.278 MiB, 28.04% gc time)

@time solve(prob_ode_brusselator_2d_sparse,Rosenbrock23(linsolve=KrylovJL_GMRES(),precs=algebraicmultigrid,concrete_jac=true),save_everystep=false);
# 0.864588 seconds (366.19 k allocations: 911.324 MiB, 9.27% gc time)

@time solve(prob_ode_brusselator_2d_cusparse,Rosenbrock23(),save_everystep=false);

@time solve(prob_ode_brusselator_2d_cusparse,Rosenbrock23(linsolve=KrylovJL_GMRES(),precs=algebraicmultigrid,concrete_jac=true),save_everystep=false);

using DifferentialEquations
@btime solve(prob_ode_brusselator_2d_cuda,alg_hints=[:stiff],save_everystep=false,abstol=1e-8,reltol=1e-8);
@btime solve(prob_ode_brusselator_2d_cusparse,alg_hints=[:stiff],save_everystep=false,abstol=1e-8,reltol=1e-8);
ChrisRackauckas commented 2 years ago

If fill!(cuA,true) is implemented, then https://github.com/JuliaGPU/CUDA.jl/issues/101 is the last piece.

ChrisRackauckas commented 2 years ago

Needs https://github.com/JuliaArrays/ArrayInterface.jl/pull/242 for the sparse matrix tools to be generic enough.

ChrisRackauckas commented 2 years ago

Okay, current state is this. Pure GMRES works:

using OrdinaryDiffEq, LinearAlgebra, SparseArrays, BenchmarkTools, LinearSolve

const N = 32
const xyd_brusselator = range(0,stop=1,length=N)
brusselator_f(x, y, t) = (((x-0.3)^2 + (y-0.6)^2) <= 0.1^2) * (t >= 1.1) * 5.
limit(a, N) = a == N+1 ? 1 : a == 0 ? N : a
kernel_u! = let N=N, xyd=xyd_brusselator, dx=step(xyd_brusselator)
  @inline function (du, u, A, B, α, II, I, t)
    i, j = Tuple(I)
    x = xyd[I[1]]
    y = xyd[I[2]]
    ip1 = limit(i+1, N); im1 = limit(i-1, N)
    jp1 = limit(j+1, N); jm1 = limit(j-1, N)
    du[II[i,j,1]] = α*(u[II[im1,j,1]] + u[II[ip1,j,1]] + u[II[i,jp1,1]] + u[II[i,jm1,1]] - 4u[II[i,j,1]]) +
    B + u[II[i,j,1]]^2*u[II[i,j,2]] - (A + 1)*u[II[i,j,1]] + brusselator_f(x, y, t)
  end
end
kernel_v! = let N=N, xyd=xyd_brusselator, dx=step(xyd_brusselator)
  @inline function (du, u, A, B, α, II, I, t)
    i, j = Tuple(I)
    ip1 = limit(i+1, N)
    im1 = limit(i-1, N)
    jp1 = limit(j+1, N)
    jm1 = limit(j-1, N)
    du[II[i,j,2]] = α*(u[II[im1,j,2]] + u[II[ip1,j,2]] + u[II[i,jp1,2]] + u[II[i,jm1,2]] - 4u[II[i,j,2]]) +
    A*u[II[i,j,1]] - u[II[i,j,1]]^2*u[II[i,j,2]]
  end
end
brusselator_2d = let N=N, xyd=xyd_brusselator, dx=step(xyd_brusselator)
  function (du, u, p, t)
    @inbounds begin
      ii1 = N^2
      ii2 = ii1+N^2
      ii3 = ii2+2(N^2)
      A = p[1]
      B = p[2]
      α = p[3]/dx^2
      II = LinearIndices((N, N, 2))
      kernel_u!.(Ref(du), Ref(u), A, B, α, Ref(II), CartesianIndices((N, N)), t)
      kernel_v!.(Ref(du), Ref(u), A, B, α, Ref(II), CartesianIndices((N, N)), t)
      return nothing
    end
  end
end
p = (3.4, 1., 10., step(xyd_brusselator))

function init_brusselator_2d(xyd)
  N = length(xyd)
  u = zeros(N, N, 2)
  for I in CartesianIndices((N, N))
    x = xyd[I[1]]
    y = xyd[I[2]]
    u[I,1] = 22*(y*(1-y))^(3/2)
    u[I,2] = 27*(x*(1-x))^(3/2)
  end
  u
end
u0 = init_brusselator_2d(xyd_brusselator)
prob_ode_brusselator_2d = ODEProblem(brusselator_2d,u0,(0.,11.5),p)

du = similar(u0)
brusselator_2d(du, u0, p, 0.0)
du[34] # 802.9807693762164
du[1058] # 985.3120721709204
du[2000] # -403.5817880634729
du[end] # 1431.1460373522068
du[521] # -323.1677459142322

du2 = similar(u0)
brusselator_2d(du2, u0, p, 1.3)
du2[34] # 802.9807693762164
du2[1058] # 985.3120721709204
du2[2000] # -403.5817880634729
du2[end] # 1431.1460373522068
du2[521] # -318.1677459142322

using Symbolics, CUDA, SparseDiffTools
CUDA.allowscalar(false) # Makes slow behavior throw an error
du0 = copy(u0)
jac_sparsity = float.(Symbolics.jacobian_sparsity((du,u)->brusselator_2d(du,u,p,0.0),du0,u0))
jac_cusparse = CUDA.CUSPARSE.CuSparseMatrixCSR(jac_sparsity)
colorvec = matrix_colors(jac_sparsity)
f = ODEFunction(brusselator_2d;jac_prototype=jac_sparsity,colorvec = colorvec)
cuf = ODEFunction(brusselator_2d;jac_prototype=jac_cusparse,colorvec = colorvec)

prob_ode_brusselator_2d = ODEProblem(brusselator_2d,u0,(0f0,11.5f0),p,tstops=[1.1f0])
prob_ode_brusselator_2d_sparse = ODEProblem(f,u0,(0f0,11.5f0),p,tstops=[1.1f0])

prob_ode_brusselator_2d_cuda = ODEProblem(brusselator_2d,cu(u0),(0f0,11.5f0),p,tstops=[1.1f0])
prob_ode_brusselator_2d_cusparse = ODEProblem(cuf,cu(u0),(0f0,11.5f0),p,tstops=[1.1f0])

@time solve(prob_ode_brusselator_2d,Rosenbrock23(),save_everystep=false);
@time solve(prob_ode_brusselator_2d_cuda,Rosenbrock23(),save_everystep=false);

@time solve(prob_ode_brusselator_2d_cusparse,Rosenbrock23(
            linsolve=KrylovJL_GMRES()),save_everystep=false);

Using sparse factorizations is almost fixed via https://github.com/SciML/OrdinaryDiffEq.jl/pull/1599, but the factorization overloads themselves are missing (documented in https://github.com/JuliaGPU/CUDA.jl/issues/1396).

@time solve(prob_ode_brusselator_2d_cusparse,Rosenbrock23(),save_everystep=false);

ArgumentError: cannot take the CPU address of a CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}
unsafe_convert(#unused#::Type{Ptr{Float32}}, x::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}) at array.jl:315
gemqrt!(side::Char, trans::Char, V::Matrix{Float32}, T::Matrix{Float32}, C::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}) at lapack.jl:2949
lmul! at qr.jl:670 [inlined]
ldiv!(A::LinearAlgebra.QRCompactWY{Float32, Matrix{Float32}}, b::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}) at qr.jl:855
ldiv!(Y::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, A::LinearAlgebra.QRCompactWY{Float32, Matrix{Float32}}, B::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}) at factorization.jl:130
_ldiv!(x::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, A::LinearAlgebra.QRCompactWY{Float32, Matrix{Float32}}, b::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}) at factorization.jl:1
#solve#6 at factorization.jl:14 [inlined]
solve at factorization.jl:10 [inlined]
#solve#5 at common.jl:137 [inlined]
solve at common.jl:137 [inlined]
#dolinsolve#3 at misc_utils.jl:105 [inlined]
dolinsolve at misc_utils.jl:86 [inlined]
perform_step!(integrator::OrdinaryDiffEq.ODEIntegrator{Rosenbrock23{12, true, QRFactorization{NoPivot}, typeof(OrdinaryDiffEq.DEFAULT_PRECS), Val{:forward}, true, nothing}, true, CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, Nothing, Float32, NTuple{4, Float64}, Float32, Float32, Float32, Float32, Vector{CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}}, ODESolution{Float32, 4, Vector{CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}}, Nothing, Nothing, Vector{Float32}, Vector{Vector{CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}}}, ODEProblem{CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}, true, NTuple{4, Float64}, ODEFunction{true, var"#11#12"{Int64, Float64}, UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, CUDA.CUSPARSE.CuSparseMatrixCSR{Float64, Int32}, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Vector{Int64}}, Base.Pairs{Symbol, Vector{Float32}, Tuple{Symbol}, NamedTuple{(:tstops,), Tuple{Vector{Float32}}}}, SciMLBase.StandardODEProblem}, Rosenbrock23{12, true, QRFactorization{NoPivot}, typeof(OrdinaryDiffEq.DEFAULT_PRECS), Val{:forward}, true, nothing}, OrdinaryDiffEq.InterpolationData{ODEFunction{true, var"#11#12"{Int64, Float64}, UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, CUDA.CUSPARSE.CuSparseMatrixCSR{Float64, Int32}, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Vector{Int64}}, Vector{CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}}, Vector{Float32}, Vector{Vector{CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}}}, OrdinaryDiffEq.Rosenbrock23Cache{CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, OrdinaryDiffEq.Rosenbrock23Tableau{Float32}, 
SciMLBase.TimeGradientWrapper{ODEFunction{true, var"#11#12"{Int64, Float64}, UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, CUDA.CUSPARSE.CuSparseMatrixCSR{Float64, Int32}, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Vector{Int64}}, CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, NTuple{4, Float64}}, SciMLBase.UJacobianWrapper{ODEFunction{true, var"#11#12"{Int64, Float64}, UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, CUDA.CUSPARSE.CuSparseMatrixCSR{Float64, Int32}, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Vector{Int64}}, Float32, NTuple{4, Float64}}, LinearSolve.LinearCache{CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, SciMLBase.NullParameters, QRFactorization{NoPivot}, LinearAlgebra.QRCompactWY{Float32, Matrix{Float32}}, LinearSolve.InvPreconditioner{Diagonal{Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, Diagonal{Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Float32}, ForwardColorJacCache{CuArray{ForwardDiff.Dual{ForwardDiff.Tag{OrdinaryDiffEq.OrdinaryDiffEqTag, Float32}, Float32, 12}, 3, CUDA.Mem.DeviceBuffer}, CuArray{ForwardDiff.Dual{ForwardDiff.Tag{OrdinaryDiffEq.OrdinaryDiffEqTag, Float32}, Float32, 12}, 3, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, Vector{CuArray{NTuple{12, Float32}, 1, CUDA.Mem.DeviceBuffer}}, Vector{Int64}, CUDA.CUSPARSE.CuSparseMatrixCSR{Float64, Int32}}, CuArray{ForwardDiff.Dual{ForwardDiff.Tag{OrdinaryDiffEq.OrdinaryDiffEqTag, Float32}, Float32, 1}, 3, CUDA.Mem.DeviceBuffer}, Float32, Rosenbrock23{12, true, QRFactorization{NoPivot}, typeof(OrdinaryDiffEq.DEFAULT_PRECS), Val{:forward}, true, nothing}}}, DiffEqBase.DEStats}, ODEFunction{true, var"#11#12"...

Using preconditioners with Krylov almost works.

using AlgebraicMultigrid
# `precs` callback: rebuild the Ruge–Stüben AMG left preconditioner only when
# the W operator has changed (newW true or unknown); otherwise reuse Plprev.
function algebraicmultigrid(W,du,u,p,t,newW,Plprev,Prprev,solverdata)
  if newW === nothing || newW
    Pl = aspreconditioner(ruge_stuben(convert(AbstractMatrix,W)))
  else
    Pl = Plprev
  end
  # (left, right) pair; no right preconditioner is supplied.
  Pl,nothing
end
@time solve(prob_ode_brusselator_2d_cusparse,Rosenbrock23(
            linsolve=KrylovJL_GMRES(),precs=algebraicmultigrid,
            concrete_jac=true),save_everystep=false);
MethodError: no method matching ruge_stuben(::CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32})
Closest candidates are:
  ruge_stuben(!Matched::Union{Hermitian{Ti, TA}, Symmetric{Ti, TA}, TA}) where {Ti, Tv, TA<:SparseMatrixCSC{Ti, Tv}} at C:\Users\accou\.julia\packages\AlgebraicMultigrid\ASpK7\src\classical.jl:10
  ruge_stuben(!Matched::Union{Hermitian{Ti, TA}, Symmetric{Ti, TA}, TA}, !Matched::Type{Val{bs}}; strength, CF, presmoother, postsmoother, max_levels, max_coarse, coarse_solver, kwargs...) where {Ti, Tv, bs, TA<:SparseMatrixCSC{Ti, Tv}} at C:\Users\accou\.julia\packages\AlgebraicMultigrid\ASpK7\src\classical.jl:10
algebraicmultigrid(W::OrdinaryDiffEq.WOperator{true, Float32, UniformScaling{Bool}, Float32, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, JacVec{SciMLBase.UJacobianWrapper{ODEFunction{true, var"#11#12"{Int64, Float64}, UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, CUDA.CUSPARSE.CuSparseMatrixCSR{Float64, Int32}, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Vector{Int64}}, Float32, NTuple{4, Float64}}, CuArray{ForwardDiff.Dual{ForwardDiff.Tag{SparseDiffTools.DeivVecTag, Float32}, Float32, 1}, 3, CUDA.Mem.DeviceBuffer}, CuArray{ForwardDiff.Dual{ForwardDiff.Tag{SparseDiffTools.DeivVecTag, Float32}, Float32, 1}, 3, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}}}, du::Nothing, u::CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, p::NTuple{4, Float64}, t::Float32, newW::Nothing, Plprev::Nothing, Prprev::Nothing, solverdata::Nothing) at test.jl:105
alg_cache(alg::Rosenbrock23{12, true, KrylovJL{typeof(Krylov.gmres!), Int64, Tuple{}, Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}}, typeof(algebraicmultigrid), Val{:forward}, true, true}, u::CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, rate_prototype::CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, #unused#::Type{Float32}, #unused#::Type{Float32}, #unused#::Type{Float32}, uprev::CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, uprev2::CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, f::ODEFunction{true, var"#11#12"{Int64, Float64}, UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, CUDA.CUSPARSE.CuSparseMatrixCSR{Float64, Int32}, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Vector{Int64}}, t::Float32, dt::Float32, reltol::Float32, p::NTuple{4, Float64}, calck::Bool, #unused#::Val{true}) at rosenbrock_caches.jl:84
__init(prob::ODEProblem{CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}, true, NTuple{4, Float64}, ODEFunction{true, var"#11#12"{Int64, Float64}, UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, CUDA.CUSPARSE.CuSparseMatrixCSR{Float64, Int32}, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Vector{Int64}}, Base.Pairs{Symbol, Vector{Float32}, Tuple{Symbol}, NamedTuple{(:tstops,), Tuple{Vector{Float32}}}}, SciMLBase.StandardODEProblem}, alg::Rosenbrock23{12, true, KrylovJL{typeof(Krylov.gmres!), Int64, Tuple{}, Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}}, typeof(algebraicmultigrid), Val{:forward}, true, true}, timeseries_init::Tuple{}, ts_init::Tuple{}, ks_init::Tuple{}, recompile::Type{Val{true}}; saveat::Tuple{}, tstops::Vector{Float32}, d_discontinuities::Tuple{}, save_idxs::Nothing, save_everystep::Bool, save_on::Bool, save_start::Bool, save_end::Nothing, callback::Nothing, dense::Bool, calck::Bool, dt::Float32, dtmin::Nothing, dtmax::Float32, force_dtmin::Bool, adaptive::Bool, gamma::Rational{Int64}, abstol::Nothing, reltol::Nothing, qmin::Rational{Int64}, qmax::Int64, qsteady_min::Int64, qsteady_max::Rational{Int64}, beta1::Nothing, beta2::Nothing, qoldinit::Rational{Int64}, controller::Nothing, fullnormalize::Bool, failfactor::Int64, maxiters::Int64, internalnorm::typeof(DiffEqBase.ODE_DEFAULT_NORM), internalopnorm::typeof(opnorm), isoutofdomain::typeof(DiffEqBase.ODE_DEFAULT_ISOUTOFDOMAIN), unstable_check::typeof(DiffEqBase.ODE_DEFAULT_UNSTABLE_CHECK), verbose::Bool, timeseries_errors::Bool, dense_errors::Bool, advance_to_tstop::Bool, stop_at_next_tstop::Bool, initialize_save::Bool, progress::Bool, progress_steps::Int64, progress_name::String, progress_message::typeof(DiffEqBase.ODE_DEFAULT_PROG_MESSAGE), userdata::Nothing, allow_extrapolation::Bool, initialize_integrator::Bool, alias_u0::Bool, alias_du0::Bool, 
initializealg::OrdinaryDiffEq.DefaultInit, kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}) at solve.jl:295
(::SciMLBase.var"#__init##kw")(::NamedTuple{(:tstops, :save_everystep), Tuple{Vector{Float32}, Bool}}, ::typeof(SciMLBase.__init), prob::ODEProblem{CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}, true, NTuple{4, Float64}, ODEFun...

@ranjanan is there a reason that ruge_stuben couldn't be changed to handle sparse GPU matrices? I thought GPU was handled? https://github.com/JuliaLinearAlgebra/AlgebraicMultigrid.jl/issues/93

ChrisRackauckas commented 2 years ago

We can do an ILU!

using OrdinaryDiffEq, LinearAlgebra, SparseArrays, BenchmarkTools, LinearSolve

const N = 32
const xyd_brusselator = range(0, stop = 1, length = N)
brusselator_f(x, y, t) = (((x - 0.3)^2 + (y - 0.6)^2) <= 0.1^2) * (t >= 1.1) * 5.0
limit(a, N) = a == N + 1 ? 1 : a == 0 ? N : a
kernel_u! = let N = N, xyd = xyd_brusselator, dx = step(xyd_brusselator)
    @inline function (du, u, A, B, α, II, I, t)
        i, j = Tuple(I)
        x = xyd[I[1]]
        y = xyd[I[2]]
        ip1 = limit(i + 1, N)
        im1 = limit(i - 1, N)
        jp1 = limit(j + 1, N)
        jm1 = limit(j - 1, N)
        du[II[i, j, 1]] = α * (u[II[im1, j, 1]] + u[II[ip1, j, 1]] + u[II[i, jp1, 1]] + u[II[i, jm1, 1]] - 4u[II[i, j, 1]]) +
                          B + u[II[i, j, 1]]^2 * u[II[i, j, 2]] - (A + 1) * u[II[i, j, 1]] + brusselator_f(x, y, t)
    end
end
kernel_v! = let N = N, xyd = xyd_brusselator, dx = step(xyd_brusselator)
    @inline function (du, u, A, B, α, II, I, t)
        i, j = Tuple(I)
        ip1 = limit(i + 1, N)
        im1 = limit(i - 1, N)
        jp1 = limit(j + 1, N)
        jm1 = limit(j - 1, N)
        du[II[i, j, 2]] = α * (u[II[im1, j, 2]] + u[II[ip1, j, 2]] + u[II[i, jp1, 2]] + u[II[i, jm1, 2]] - 4u[II[i, j, 2]]) +
                          A * u[II[i, j, 1]] - u[II[i, j, 1]]^2 * u[II[i, j, 2]]
    end
end
brusselator_2d = let N = N, xyd = xyd_brusselator, dx = step(xyd_brusselator)
    function (du, u, p, t)
        @inbounds begin
            ii1 = N^2
            ii2 = ii1 + N^2
            ii3 = ii2 + 2(N^2)
            A = p[1]
            B = p[2]
            α = p[3] / dx^2
            II = LinearIndices((N, N, 2))
            kernel_u!.(Ref(du), Ref(u), A, B, α, Ref(II), CartesianIndices((N, N)), t)
            kernel_v!.(Ref(du), Ref(u), A, B, α, Ref(II), CartesianIndices((N, N)), t)
            return nothing
        end
    end
end
p = (3.4, 1.0, 10.0, step(xyd_brusselator))

function init_brusselator_2d(xyd)
    N = length(xyd)
    u = zeros(N, N, 2)
    for I in CartesianIndices((N, N))
        x = xyd[I[1]]
        y = xyd[I[2]]
        u[I, 1] = 22 * (y * (1 - y))^(3 / 2)
        u[I, 2] = 27 * (x * (1 - x))^(3 / 2)
    end
    u
end
u0 = init_brusselator_2d(xyd_brusselator)
prob_ode_brusselator_2d = ODEProblem(brusselator_2d, u0, (0.0, 11.5), p)

du = similar(u0)
brusselator_2d(du, u0, p, 0.0)
du[34] # 802.9807693762164
du[1058] # 985.3120721709204
du[2000] # -403.5817880634729
du[end] # 1431.1460373522068
du[521] # -323.1677459142322

du2 = similar(u0)
brusselator_2d(du2, u0, p, 1.3)
du2[34] # 802.9807693762164
du2[1058] # 985.3120721709204
du2[2000] # -403.5817880634729
du2[end] # 1431.1460373522068
du2[521] # -318.1677459142322

using Symbolics, CUDA, SparseDiffTools
CUDA.allowscalar(false) # Makes slow behavior throw an error
du0 = copy(u0)
jac_sparsity = Float32.(Symbolics.jacobian_sparsity((du, u) -> brusselator_2d(du, u, p, 0.0), du0, u0))
jac_cusparse = CUDA.CUSPARSE.CuSparseMatrixCSR(jac_sparsity)
colorvec = matrix_colors(jac_sparsity)
f = ODEFunction(brusselator_2d; jac_prototype = jac_sparsity, colorvec = colorvec)
cuf = ODEFunction(brusselator_2d; jac_prototype = jac_cusparse, colorvec = colorvec)
prob_ode_brusselator_2d_cusparse = ODEProblem(cuf, cu(u0), (0.0f0, 11.5f0), p, tstops = [1.1f0])

@time solve(prob_ode_brusselator_2d_cusparse, Rosenbrock23(
        linsolve = KrylovJL_GMRES()), save_everystep = false);

# `precs` callback: zero-fill-in incomplete LU (ILU(0)) preconditioner computed
# on the GPU via CUSPARSE, rebuilt only when the W operator has changed.
function ilu(W, du, u, p, t, newW, Plprev, Prprev, solverdata)
    if newW === nothing || newW
        # NOTE(review): `convert(AbstractMatrix, W)` may return W's own matrix
        # rather than a copy, in which case the in-place ilu02! below would
        # overwrite the live W — TODO confirm and copy first if so.
        Pl = convert(AbstractMatrix, W)
        # In-place ILU(0) factorization; 'O' selects one-based indexing.
        CUDA.CUSPARSE.ilu02!(Pl, 'O')
    else
        Pl = Plprev
    end
    # Left preconditioner only.
    Pl, nothing
end

@time solve(prob_ode_brusselator_2d_cusparse, Rosenbrock23(
        linsolve = KrylovJL_GMRES(), precs = ilu,
        concrete_jac = true), save_everystep = false)

But can't use it?

ERROR: LoadError: MethodError: no method matching ldiv!(::CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, ::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer})
Closest candidates are:
  ldiv!(::Any, ::ChainRulesCore.AbstractThunk) at C:\Users\accou\.julia\packages\ChainRulesCore\IzITE\src\tangent_types\thunks.jl:88
  ldiv!(::Any, ::LinearSolve.InvPreconditioner, ::Any) at C:\Users\accou\.julia\dev\LinearSolve\src\preconditioners.jl:30
  ldiv!(::Any, ::LinearSolve.ComposePreconditioner, ::Any) at C:\Users\accou\.julia\dev\LinearSolve\src\preconditioners.jl:17
  ...
Stacktrace:
  [1] ldiv!(y::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, A::LinearSolve.ComposePreconditioner{LinearSolve.InvPreconditioner{Diagonal{Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}}, x::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer})
    @ LinearSolve C:\Users\accou\.julia\dev\LinearSolve\src\preconditioners.jl:21
  [2] mul!(y::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, A::LinearSolve.InvPreconditioner{LinearSolve.ComposePreconditioner{LinearSolve.InvPreconditioner{Diagonal{Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}}}, x::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer})
    @ LinearSolve C:\Users\accou\.julia\dev\LinearSolve\src\preconditioners.jl:31
  [3] gmres!(solver::Krylov.GmresSolver{Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, A::OrdinaryDiffEq.WOperator{true, Float32, UniformScaling{Bool}, Float32, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, JacVec{SciMLBase.UJacobianWrapper{ODEFunction{true, var"#9#10"{Int64, Float64}, UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Vector{Int64}}, Float32, NTuple{4, Float64}}, CuArray{ForwardDiff.Dual{ForwardDiff.Tag{SparseDiffTools.DeivVecTag, Float32}, Float32, 1}, 3, CUDA.Mem.DeviceBuffer}, CuArray{ForwardDiff.Dual{ForwardDiff.Tag{SparseDiffTools.DeivVecTag, Float32}, Float32, 1}, 3, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}}}, b::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}; M::LinearSolve.InvPreconditioner{LinearSolve.ComposePreconditioner{LinearSolve.InvPreconditioner{Diagonal{Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}}}, N::LinearSolve.InvPreconditioner{Diagonal{Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, atol::Float32, rtol::Float32, reorthogonalization::Bool, itmax::Int64, restart::Bool, verbose::Int64, history::Bool)
    @ Krylov C:\Users\accou\.julia\packages\Krylov\73zDt\src\gmres.jl:88
  [4] solve!(::Krylov.GmresSolver{Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, ::OrdinaryDiffEq.WOperator{true, Float32, UniformScaling{Bool}, Float32, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, JacVec{SciMLBase.UJacobianWrapper{ODEFunction{true, var"#9#10"{Int64, Float64}, UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Vector{Int64}}, Float32, NTuple{4, Float64}}, CuArray{ForwardDiff.Dual{ForwardDiff.Tag{SparseDiffTools.DeivVecTag, Float32}, Float32, 1}, 3, CUDA.Mem.DeviceBuffer}, CuArray{ForwardDiff.Dual{ForwardDiff.Tag{SparseDiffTools.DeivVecTag, Float32}, Float32, 1}, 3, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}}}, ::Vararg{Any}; kwargs::Base.Pairs{Symbol, Any, NTuple{6, Symbol}, NamedTuple{(:M, :N, :atol, :rtol, :itmax, :verbose), Tuple{LinearSolve.InvPreconditioner{LinearSolve.ComposePreconditioner{LinearSolve.InvPreconditioner{Diagonal{Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}}}, LinearSolve.InvPreconditioner{Diagonal{Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, Float32, Float32, Int64, Int64}}})
    @ Krylov C:\Users\accou\.julia\packages\Krylov\73zDt\src\krylov_solvers.jl:1632
stmorgenstern commented 11 months ago

@ChrisRackauckas It seems there might have been a change to SparseDiffTools that broke the solve(prob_ode_brusselator_2d_cusparse, Rosenbrock23(linsolve = KrylovJL_GMRES()), save_everystep = false); call. Running this, I get the following error:

ERROR: MethodError: no method matching get_tag(::CuArray{ForwardDiff.Dual{ForwardDiff.Tag{DiffEqBase.OrdinaryDiffEqTag, Float32}, Float32, 1}, 3, CUDA.Mem.DeviceBuffer})
Closest candidates are:
  get_tag(::Array{ForwardDiff.Dual{T, V, N}}) where {T, V, N}
   @ SparseDiffTools C:\Users\Sam\.julia\packages\SparseDiffTools\6bm6M\src\differentiation\jaches_products.jl:3
  get_tag(::ForwardDiff.Dual{T, V, N}) where {T, V, N}
   @ SparseDiffTools C:\Users\Sam\.julia\packages\SparseDiffTools\6bm6M\src\differentiation\jaches_products.jl:4

Stacktrace:
  [1] auto_jacvec!(dy::CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, f::SciMLBase.UJacobianWrapper{ODEFunction{true, SciMLBase.FullSpecialize, var"#11#12"{Int64, Float64}, UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, CUDA.CUSPARSE.CuSparseMatrixCSR{Float64, Int32}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), CuArray{Int64, 1, CUDA.Mem.DeviceBuffer}, Nothing}, Float32, NTuple{4, Float64}}, x::CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, v::CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, cache1::CuArray{ForwardDiff.Dual{ForwardDiff.Tag{DiffEqBase.OrdinaryDiffEqTag, Float32}, Float32, 1}, 3, CUDA.Mem.DeviceBuffer}, cache2::CuArray{ForwardDiff.Dual{ForwardDiff.Tag{DiffEqBase.OrdinaryDiffEqTag, Float32}, Float32, 1}, 3, CUDA.Mem.DeviceBuffer})
    @ SparseDiffTools C:\Users\Sam\.julia\packages\SparseDiffTools\6bm6M\src\differentiation\jaches_products.jl:12
  [2] (::SparseDiffTools.FwdModeAutoDiffVecProd{SciMLBase.UJacobianWrapper{ODEFunction{true, SciMLBase.FullSpecialize, var"#11#12"{Int64, Float64}, UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, CUDA.CUSPARSE.CuSparseMatrixCSR{Float64, Int32}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), CuArray{Int64, 1, CUDA.Mem.DeviceBuffer}, Nothing}, Float32, NTuple{4, Float64}}, CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, Tuple{CuArray{ForwardDiff.Dual{ForwardDiff.Tag{DiffEqBase.OrdinaryDiffEqTag, Float32}, Float32, 1}, 3, CUDA.Mem.DeviceBuffer}, CuArray{ForwardDiff.Dual{ForwardDiff.Tag{DiffEqBase.OrdinaryDiffEqTag, Float32}, Float32, 1}, 3, CUDA.Mem.DeviceBuffer}}, typeof(auto_jacvec), typeof(auto_jacvec!)})(dv::CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, v::CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, p::NTuple{4, Float64}, t::Float32)
    @ SparseDiffTools C:\Users\Sam\.julia\packages\SparseDiffTools\6bm6M\src\differentiation\jaches_products.jl:214
  [3] mul!(v::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, L::FunctionOperator{true, true, false, Float32, SparseDiffTools.FwdModeAutoDiffVecProd{SciMLBase.UJacobianWrapper{ODEFunction{true, SciMLBase.FullSpecialize, var"#11#12"{Int64, Float64}, UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, CUDA.CUSPARSE.CuSparseMatrixCSR{Float64, Int32}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), CuArray{Int64, 1, CUDA.Mem.DeviceBuffer}, Nothing}, Float32, NTuple{4, Float64}}, CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, Tuple{CuArray{ForwardDiff.Dual{ForwardDiff.Tag{DiffEqBase.OrdinaryDiffEqTag, Float32}, Float32, 1}, 3, CUDA.Mem.DeviceBuffer}, CuArray{ForwardDiff.Dual{ForwardDiff.Tag{DiffEqBase.OrdinaryDiffEqTag, Float32}, Float32, 1}, 3, CUDA.Mem.DeviceBuffer}}, typeof(auto_jacvec), typeof(auto_jacvec!)}, Nothing, Nothing, Nothing, NamedTuple{(:islinear, :isconvertible, :isconstant, :opnorm, :issymmetric, :ishermitian, :isposdef, :isinplace, :outofplace, :has_mul5, :ifcache, :T, :batch, :size, :sizes, :eltypes, :accepted_kwargs, :kwargs), Tuple{Bool, Bool, Bool, Nothing, Bool, Bool, Bool, Bool, Bool, Bool, Bool, DataType, Bool, Tuple{Int64, Int64}, Tuple{Tuple{Int64, Int64, Int64}, Tuple{Int64, Int64, Int64}}, Tuple{DataType, DataType}, Tuple{}, Dict{Symbol, Any}}}, NTuple{4, Float64}, Float32, Tuple{CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}}}, u::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer})

Update: I'm going to put up a separate issue over on SparseDiffTools, so this can hopefully be resolved.

ChrisRackauckas commented 10 months ago

Thanks for the report — that upstream issue got solved, so we're at least back to the starting point here.