JuliaDiff / SparseDiffTools.jl

Fast jacobian computation through sparsity exploitation and matrix coloring
MIT License

Failed get_tag breaking GPU ODE solve with KrylovJL_GMRES() linear solver #262

Closed: stmorgenstern closed this issue 1 year ago

stmorgenstern commented 1 year ago

Running some of the previously successful code @ChrisRackauckas set up over in OrdinaryDiffEq.jl/issues/1566, it seems that GPU ODE solves using the Rosenbrock23(linsolve = KrylovJL_GMRES()) solver raise an error from get_tag(). Here's a full MWE:

using OrdinaryDiffEq, LinearAlgebra, SparseArrays, BenchmarkTools, LinearSolve, SparseDiffTools, CUDA

const N = 32
const xyd_brusselator = range(0,stop=1,length=N)
brusselator_f(x, y, t) = (((x-0.3)^2 + (y-0.6)^2) <= 0.1^2) * (t >= 1.1) * 5.
limit(a, N) = a == N+1 ? 1 : a == 0 ? N : a
kernel_u! = let N=N, xyd=xyd_brusselator, dx=step(xyd_brusselator)
  @inline function (du, u, A, B, α, II, I, t)
    i, j = Tuple(I)
    x = xyd[I[1]]
    y = xyd[I[2]]
    ip1 = limit(i+1, N); im1 = limit(i-1, N)
    jp1 = limit(j+1, N); jm1 = limit(j-1, N)
    du[II[i,j,1]] = α*(u[II[im1,j,1]] + u[II[ip1,j,1]] + u[II[i,jp1,1]] + u[II[i,jm1,1]] - 4u[II[i,j,1]]) +
    B + u[II[i,j,1]]^2*u[II[i,j,2]] - (A + 1)*u[II[i,j,1]] + brusselator_f(x, y, t)
  end
end
kernel_v! = let N=N, xyd=xyd_brusselator, dx=step(xyd_brusselator)
  @inline function (du, u, A, B, α, II, I, t)
    i, j = Tuple(I)
    ip1 = limit(i+1, N)
    im1 = limit(i-1, N)
    jp1 = limit(j+1, N)
    jm1 = limit(j-1, N)
    du[II[i,j,2]] = α*(u[II[im1,j,2]] + u[II[ip1,j,2]] + u[II[i,jp1,2]] + u[II[i,jm1,2]] - 4u[II[i,j,2]]) +
    A*u[II[i,j,1]] - u[II[i,j,1]]^2*u[II[i,j,2]]
  end
end
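# The RHS broadcasts the two kernels over CartesianIndices((N, N)), so the same code runs on CPU Arrays and on CuArrays.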
brusselator_2d = let N=N, xyd=xyd_brusselator, dx=step(xyd_brusselator)
  function (du, u, p, t)
    @inbounds begin
      ii1 = N^2
      ii2 = ii1+N^2
      ii3 = ii2+2(N^2)
      A = p[1]
      B = p[2]
      α = p[3]/dx^2
      II = LinearIndices((N, N, 2))
      kernel_u!.(Ref(du), Ref(u), A, B, α, Ref(II), CartesianIndices((N, N)), t)
      kernel_v!.(Ref(du), Ref(u), A, B, α, Ref(II), CartesianIndices((N, N)), t)
      return nothing
    end
  end
end
p = (3.4, 1., 10., step(xyd_brusselator))

function init_brusselator_2d(xyd)
  N = length(xyd)
  u = zeros(N, N, 2)
  for I in CartesianIndices((N, N))
    x = xyd[I[1]]
    y = xyd[I[2]]
    u[I,1] = 22*(y*(1-y))^(3/2)
    u[I,2] = 27*(x*(1-x))^(3/2)
  end
  u
end
u0 = init_brusselator_2d(xyd_brusselator)
prob_ode_brusselator_2d = ODEProblem(brusselator_2d,u0,(0.,11.5),p)

du = similar(u0)
brusselator_2d(du, u0, p, 0.0)
du[34] # 802.9807693762164
du[1058] # 985.3120721709204
du[2000] # -403.5817880634729
du[end] # 1431.1460373522068
du[521] # -323.1677459142322

du2 = similar(u0)
brusselator_2d(du2, u0, p, 1.3)
du2[34] # 802.9807693762164
du2[1058] # 985.3120721709204
du2[2000] # -403.5817880634729
du2[end] # 1431.1460373522068
du2[521] # -318.1677459142322

CUDA.@time brusselator_2d(cu(du),cu(u0),p,0.0)

using Symbolics, CUDA, SparseDiffTools
CUDA.allowscalar(false) # Makes slow behavior throw an error
du0 = copy(u0)
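# Detect the Jacobian sparsity pattern symbolically, color it, and build GPU copies of the sparse prototype and color vector.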
jac_sparsity = float.(Symbolics.jacobian_sparsity((du,u)->brusselator_2d(du,u,p,0.0),du0,u0))
jac_cusparse = CUDA.CUSPARSE.CuSparseMatrixCSR(jac_sparsity)
colorvec = matrix_colors(jac_sparsity)
f = ODEFunction(brusselator_2d;jac_prototype=jac_sparsity,colorvec = colorvec)
cuf = ODEFunction(brusselator_2d;jac_prototype=jac_cusparse,colorvec = cu(colorvec))

prob_ode_brusselator_2d = ODEProblem(brusselator_2d,u0,(0f0,11.5f0),p,tstops=[1.1f0])
prob_ode_brusselator_2d_sparse = ODEProblem(f,u0,(0f0,11.5f0),p,tstops=[1.1f0])

prob_ode_brusselator_2d_cuda = ODEProblem(brusselator_2d,cu(u0),(0f0,11.5f0),p,tstops=[1.1f0])
prob_ode_brusselator_2d_cusparse = ODEProblem(cuf,cu(u0),(0f0,11.5f0),p,tstops=[1.1f0])
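# Below: CPU reference solve, dense GPU solve, and the sparse GPU + Krylov solve that triggers the error.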

@time solve(prob_ode_brusselator_2d,Rosenbrock23(),save_everystep=false);
#
CUDA.@time solve(prob_ode_brusselator_2d_cuda,Rosenbrock23(),save_everystep=false);
#
CUDA.@time solve(prob_ode_brusselator_2d_cusparse,Rosenbrock23(linsolve=KrylovJL_GMRES()),save_everystep=false);

This produces the following error:

ERROR: MethodError: no method matching get_tag(::CuArray{ForwardDiff.Dual{ForwardDiff.Tag{DiffEqBase.OrdinaryDiffEqTag, Float32}, Float32, 1}, 3, CUDA.Mem.DeviceBuffer})
Closest candidates are:
  get_tag(::Array{ForwardDiff.Dual{T, V, N}}) where {T, V, N}
   @ SparseDiffTools C:\Users\Sam\.julia\packages\SparseDiffTools\6bm6M\src\differentiation\jaches_products.jl:3
  get_tag(::ForwardDiff.Dual{T, V, N}) where {T, V, N}
   @ SparseDiffTools C:\Users\Sam\.julia\packages\SparseDiffTools\6bm6M\src\differentiation\jaches_products.jl:4

Stacktrace:
  [1] auto_jacvec!(dy::CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, f::SciMLBase.UJacobianWrapper{ODEFunction{true, SciMLBase.FullSpecialize, var"#11#12"{Int64, Float64}, UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, CUDA.CUSPARSE.CuSparseMatrixCSR{Float64, Int32}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), CuArray{Int64, 1, CUDA.Mem.DeviceBuffer}, Nothing}, Float32, NTuple{4, Float64}}, x::CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, v::CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, cache1::CuArray{ForwardDiff.Dual{ForwardDiff.Tag{DiffEqBase.OrdinaryDiffEqTag, Float32}, Float32, 1}, 3, CUDA.Mem.DeviceBuffer}, cache2::CuArray{ForwardDiff.Dual{ForwardDiff.Tag{DiffEqBase.OrdinaryDiffEqTag, Float32}, Float32, 1}, 3, CUDA.Mem.DeviceBuffer})
    @ SparseDiffTools C:\Users\Sam\.julia\packages\SparseDiffTools\6bm6M\src\differentiation\jaches_products.jl:12
  [2] (::SparseDiffTools.FwdModeAutoDiffVecProd{SciMLBase.UJacobianWrapper{ODEFunction{true, SciMLBase.FullSpecialize, var"#11#12"{Int64, Float64}, UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, CUDA.CUSPARSE.CuSparseMatrixCSR{Float64, Int32}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), CuArray{Int64, 1, CUDA.Mem.DeviceBuffer}, Nothing}, Float32, NTuple{4, Float64}}, CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, Tuple{CuArray{ForwardDiff.Dual{ForwardDiff.Tag{DiffEqBase.OrdinaryDiffEqTag, Float32}, Float32, 1}, 3, CUDA.Mem.DeviceBuffer}, CuArray{ForwardDiff.Dual{ForwardDiff.Tag{DiffEqBase.OrdinaryDiffEqTag, Float32}, Float32, 1}, 3, CUDA.Mem.DeviceBuffer}}, typeof(auto_jacvec), typeof(auto_jacvec!)})(dv::CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, v::CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, p::NTuple{4, Float64}, t::Float32)
    @ SparseDiffTools C:\Users\Sam\.julia\packages\SparseDiffTools\6bm6M\src\differentiation\jaches_products.jl:214
  [3] mul!(v::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, L::FunctionOperator{true, true, false, Float32, SparseDiffTools.FwdModeAutoDiffVecProd{SciMLBase.UJacobianWrapper{ODEFunction{true, SciMLBase.FullSpecialize, var"#11#12"{Int64, Float64}, UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int32}, CUDA.CUSPARSE.CuSparseMatrixCSR{Float64, Int32}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), CuArray{Int64, 1, CUDA.Mem.DeviceBuffer}, Nothing}, Float32, NTuple{4, Float64}}, CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, Tuple{CuArray{ForwardDiff.Dual{ForwardDiff.Tag{DiffEqBase.OrdinaryDiffEqTag, Float32}, Float32, 1}, 3, CUDA.Mem.DeviceBuffer}, CuArray{ForwardDiff.Dual{ForwardDiff.Tag{DiffEqBase.OrdinaryDiffEqTag, Float32}, Float32, 1}, 3, CUDA.Mem.DeviceBuffer}}, typeof(auto_jacvec), typeof(auto_jacvec!)}, Nothing, Nothing, Nothing, NamedTuple{(:islinear, :isconvertible, :isconstant, :opnorm, :issymmetric, :ishermitian, :isposdef, :isinplace, :outofplace, :has_mul5, :ifcache, :T, :batch, :size, :sizes, :eltypes, :accepted_kwargs, :kwargs), Tuple{Bool, Bool, Bool, Nothing, Bool, Bool, Bool, Bool, Bool, Bool, Bool, DataType, Bool, Tuple{Int64, Int64}, Tuple{Tuple{Int64, Int64, Int64}, Tuple{Int64, Int64, Int64}}, Tuple{DataType, DataType}, Tuple{}, Dict{Symbol, Any}}}, NTuple{4, Float64}, Float32, Tuple{CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}}}, u::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer})
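
Looking at the "Closest candidates" above, get_tag only has methods for plain Arrays of Duals (and scalar Duals), so the CuArray-of-Dual caches that auto_jacvec! allocates fall through. As a sketch of a possible fix, assuming the existing methods simply return the tag type T, the Array method could be broadened to AbstractArray (shown here as a user-side override purely for illustration; the real change would belong inside SparseDiffTools):

using ForwardDiff, SparseDiffTools

# Sketch only: broaden the Array-of-Dual method so GPU arrays of Duals (e.g. CuArray) dispatch too.
# Assumes the existing get_tag methods just return the tag type T, as the signatures above suggest.
SparseDiffTools.get_tag(::AbstractArray{ForwardDiff.Dual{T, V, N}}) where {T, V, N} = T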