out of place Jacobian decomposition mutates

clintonTE commented 4 years ago

Performance is highly variable when using the cached out-of-place methods. I've also gotten segfaults, although I cannot reliably reproduce that part of the issue. Using CuArrays seems to amplify the problem.

using Revise
using Flux, BenchmarkTools, CuArrays, CUDAnative, ForwardDiff, LinearAlgebra, Random
using SparseDiffTools

# Minimal working example: compute the Jacobian of an elementwise map on an N×N
# matrix with SparseDiffTools' cached forward-mode routines, four ways — CPU
# in-place, CPU out-of-place, GPU in-place, GPU out-of-place.
#
# Arguments:
#   N - side length of the square input; the Jacobian buffer is N^2 × N^2.
#   T - element type of the CPU input (defaults to Float32).
#
# For N < 5 each Jacobian is logged with @info; for N >= 5 each cached call is
# timed with @btime. The function's value is that of the final guard: `false`
# when N < 5, otherwise whatever @btime returns for the last call.
function mwe(N, ::Type{T}=Float32) where T<:Real
  A::Matrix{T} = rand(T, N, N)
  cuA = A |> gpu

  # Scalar kernel broadcast by the CuMatrix{Float32} methods below.
  krn(x) = x + x*x + 1f0

  # In-place map: generic method, then the CuMatrix{Float32} method.
  function f!(out, A)
    out .= A .+ A .* A .+ 1f0
  end
  function f!(out, A::CuMatrix{Float32})
    out .= krn.(A)
  end

  # Out-of-place counterparts of the two methods above.
  function f(A)
    return A .+ A .* A .+ 1f0
  end
  function f(A::CuMatrix{Float32})
    return krn.(A)
  end

  # Output buffer for the in-place CPU run; overwritten by the Jacobian.
  J = rand(T, N^2, N^2)

  @info "test cpu (inplace)"
  cache = SparseDiffTools.ForwardColorJacCache(f!, A, dx = similar(A))
  SparseDiffTools.forwarddiff_color_jacobian!(J, f!, A, cache)
  N < 5 && @info "test ∇f cpu inplace: $(J)"
  # `>=` so that N == 5 is benchmarked instead of matching neither guard.
  N >= 5 && @btime SparseDiffTools.forwarddiff_color_jacobian!($J, $f!, $A, $cache)

  @info "test cpu (out of place)"
  cacheoop = SparseDiffTools.ForwardColorJacCache(f, A, dx = similar(A))
  J = SparseDiffTools.forwarddiff_color_jacobian(f, A, cacheoop)
  N < 5 && @info "test ∇f cpu oop: $(J)"
  N >= 5 && @btime SparseDiffTools.forwarddiff_color_jacobian($f, $A, $cacheoop)

  @info "test gpu (inplace)"
  # The GPU output buffer starts as a copy of the CPU out-of-place result.
  cuJ = J |> gpu
  cucache = SparseDiffTools.ForwardColorJacCache(f!, cuA, dx = similar(cuA))
  SparseDiffTools.forwarddiff_color_jacobian!(cuJ, f!, cuA, cucache)
  N < 5 && @info "test ∇f gpu inplace: $(cuJ)"
  N >= 5 && @btime SparseDiffTools.forwarddiff_color_jacobian!($cuJ, $f!, $cuA, $cucache)

  @info "test gpu (outofplace)"
  cucacheoop = SparseDiffTools.ForwardColorJacCache(f, cuA, dx = similar(cuA))
  cuJ = SparseDiffTools.forwarddiff_color_jacobian(f, cuA, cucacheoop)
  N < 5 && @info "test ∇f  gpu oop: $(cuJ)"
  N >= 5 && @btime SparseDiffTools.forwarddiff_color_jacobian($f, $cuA, $cucacheoop)

end

# Run the reproducer on a 12×12 input (a 144×144 Jacobian buffer); with N = 12
# every section of `mwe` takes its @btime branch rather than logging the matrix.
mwe(12)

Output:

[ Info: test cpu (inplace)
  46.500 μs (8 allocations: 320 bytes)
[ Info: test cpu (out of place)
  181.946 ms (80271 allocations: 853.10 MiB)
[ Info: test gpu (inplace)
  12.860 ms (10965 allocations: 402.14 KiB)
[ Info: test gpu (outofplace)
  3.110 s (3516919 allocations: 122.65 MiB)

ChrisRackauckas commented 4 years ago

Confirmed to not be due to scalar indexing:

using Flux, SparseDiffTools, BenchmarkTools, CuArrays, ForwardDiff, LinearAlgebra, Random
# Disable scalar indexing of GPU arrays so it can be ruled out as the cause of
# the slowdown.
CuArrays.allowscalar(false)

# Problem size and element type; the CPU input is a random N×N matrix of T.
N, T = 10, Float32
A = rand(T, N, N)

# GPU counterpart of `A`, produced by Flux's `gpu`.
cuA = gpu(A)

# In-place elementwise map: stores a + a*a + 1 for every element a of `A` into
# `out` and returns the result of that broadcast assignment.
function f!(out, A)
  return @. out = A + A * A + 1f0
end

# Scalar kernel x + x*x + 1 (the constant is a Float32 literal); the
# CuMatrix{Float32} methods of `f!` and `f` broadcast it over their input.
function krn(x)
  return x + x * x + 1f0
end

# CuMatrix{Float32} method of `f!`: broadcasts the scalar kernel `krn` over `A`
# and writes the result into `out`.
function f!(out, A::CuMatrix{Float32})
  return @. out = krn(A)
end

# Out-of-place elementwise map: returns a new array holding a + a*a + 1 for
# every element a of `A`.
f(A) = @. A + A * A + 1f0

# CuMatrix{Float32} method of `f`: returns the scalar kernel `krn` broadcast
# over `A`.
f(A::CuMatrix{Float32}) = krn.(A)

# Output buffer for the in-place CPU run; overwritten by the Jacobian.
J = rand(T, N^2, N^2)

# Each section below builds a ForwardColorJacCache, computes the Jacobian once,
# then logs it when N < 5 or benchmarks the cached call when N >= 5.
@info "test cpu (inplace)"
cache = SparseDiffTools.ForwardColorJacCache(f!, A, dx = similar(A))
SparseDiffTools.forwarddiff_color_jacobian!(J, f!, A, cache)
N < 5 && @info "test ∇f cpu inplace: $(J)"
# `>=` so that N == 5 is benchmarked instead of matching neither guard.
N >= 5 && @btime SparseDiffTools.forwarddiff_color_jacobian!($J, $f!, $A, $cache)

@info "test cpu (out of place)"
cacheoos = SparseDiffTools.ForwardColorJacCache(f, A, dx = similar(A))
J = SparseDiffTools.forwarddiff_color_jacobian(f, A, cacheoos)
N < 5 && @info "test ∇f cpu oop: $(J)"
N >= 5 && @btime SparseDiffTools.forwarddiff_color_jacobian($f, $A, $cacheoos)

@info "test gpu (inplace)"
# The GPU output buffer starts as a copy of the CPU out-of-place result.
cuJ = J |> gpu
cucache = SparseDiffTools.ForwardColorJacCache(f!, cuA, dx = similar(cuA))
SparseDiffTools.forwarddiff_color_jacobian!(cuJ, f!, cuA, cucache)
N < 5 && @info "test ∇f gpu inplace: $(cuJ)"
N >= 5 && @btime SparseDiffTools.forwarddiff_color_jacobian!($cuJ, $f!, $cuA, $cucache)

@info "test gpu (outofplace)"
cucacheoop = SparseDiffTools.ForwardColorJacCache(f, cuA, dx = similar(cuA))
cuJ = SparseDiffTools.forwarddiff_color_jacobian(f, cuA, cucacheoop)
N < 5 && @info "test ∇f  gpu oop: $(cuJ)"
N >= 5 && @btime SparseDiffTools.forwarddiff_color_jacobian($f, $cuA, $cucacheoop)
The problem is likely that https://github.com/JuliaDiff/SparseDiffTools.jl/blob/v1.8.0/src/differentiation/compute_jacobian_ad.jl#L123-L126 produces too many kernels. It's somewhat tied up with https://github.com/JuliaDiff/SparseDiffTools.jl/pull/106 and the upstream issue https://github.com/JuliaGPU/CuArrays.jl/issues/571.

ChrisRackauckas commented 4 years ago

https://github.com/JuliaDiff/SparseDiffTools.jl/pull/115 makes this much faster. I'll leave this open because it's not perfect (it now mutates), but the speed boost will essentially make this issue go away for most people.

JuliaDiff / SparseDiffTools.jl

out of place Jacobian decomposition mutates #108