batched_jacobian fails on GPU for arrays above a certain size

Hi, I appreciate that this package is experimental, but I came across the following strange behaviour on a V100 GPU. Once the first dimension exceeds a certain size (in the MWE below, 50 works but 51 fails), `batched_jacobian` no longer works on the GPU, while it continues to work on the CPU. The size of the batch dimension doesn't appear to make a difference.

Minimal working example (MWE)

using BatchedRoutines, ForwardDiff, CUDA, Random

f(u) = u .^ 2

u = rand(50, 128)
BatchedRoutines.batched_jacobian(AutoForwardDiff(), f, u);  # CPU, works

u = CuArray(u)
BatchedRoutines.batched_jacobian(AutoForwardDiff(), f, u);  # GPU, works

u = rand(51, 128)
BatchedRoutines.batched_jacobian(AutoForwardDiff(), f, u);  # CPU, works

u = CuArray(u)
BatchedRoutines.batched_jacobian(AutoForwardDiff(), f, u);  # GPU, fails

Stack trace

InvalidIRError: compiling MethodInstance for (::GPUArrays.var"#34#36")(::CUDA.CuKernelContext, ::CuDeviceVector{ForwardDiff.Partials{11, Float64}, 1}, ::Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1, CUDA.Mem.DeviceBuffer}, Tuple{Base.OneTo{Int64}}, BatchedRoutinesForwardDiffExt.var"#1#8"{ForwardDiff.Partials{11, Float64}, Float64, 11}, Tuple{Base.Broadcast.Extruded{CuDeviceVector{Int64, 1}, Tuple{Bool}, Tuple{Int64}}}}, ::Int64) resulted in invalid LLVM IR
Reason: unsupported call to an unknown function (call to julia.new_gc_frame)
Stacktrace:
 [1] ntuple
   @ ./ntuple.jl:19
 [2] #1
   @ ~/.julia/packages/BatchedRoutines/kEhIl/ext/BatchedRoutinesForwardDiffExt/jacobian.jl:23
 [3] _broadcast_getindex_evalf
   @ ./broadcast.jl:709
 [4] _broadcast_getindex
   @ ./broadcast.jl:682
 [5] getindex
   @ ./broadcast.jl:636
 [6] #34
   @ ~/.julia/packages/GPUArrays/OKkAu/src/host/broadcast.jl:59
Reason: unsupported call to an unknown function (call to julia.push_gc_frame)
Stacktrace:
 [1] ntuple
   @ ./ntuple.jl:19
 [2] #1
   @ ~/.julia/packages/BatchedRoutines/kEhIl/ext/BatchedRoutinesForwardDiffExt/jacobian.jl:23
 [3] _broadcast_getindex_evalf
   @ ./broadcast.jl:709
 [4] _broadcast_getindex
   @ ./broadcast.jl:682
 [5] getindex
   @ ./broadcast.jl:636
 [6] #34
   @ ~/.julia/packages/GPUArrays/OKkAu/src/host/broadcast.jl:59
Reason: unsupported call through a literal pointer (call to ijl_alloc_array_1d)
Stacktrace:
  [1] Array
    @ ./boot.jl:477
  [2] Array
    @ ./boot.jl:486
  [3] similar
    @ ./abstractarray.jl:877
  [4] similar
    @ ./abstractarray.jl:876
  [5] _array_for
    @ ./array.jl:723
  [6] collect
    @ ./array.jl:836
  [7] _ntuple
    @ ./ntuple.jl:37
  [8] ntuple
    @ ./ntuple.jl:19
  [9] #1
    @ ~/.julia/packages/BatchedRoutines/kEhIl/ext/BatchedRoutinesForwardDiffExt/jacobian.jl:23
 [10] _broadcast_getindex_evalf
    @ ./broadcast.jl:709
 [11] _broadcast_getindex
    @ ./broadcast.jl:682
 [12] getindex
    @ ./broadcast.jl:636
 [13] #34
    @ ~/.julia/packages/GPUArrays/OKkAu/src/host/broadcast.jl:59
Reason: unsupported call through a literal pointer (call to ijl_alloc_array_1d)
Stacktrace:
  [1] Array
    @ ./boot.jl:477
  [2] Array
    @ ./boot.jl:486
  [3] similar
    @ ./abstractarray.jl:877
  [4] similar
    @ ./abstractarray.jl:876
  [5] _array_for
    @ ./array.jl:723
  [6] collect
    @ ./array.jl:839
  [7] _ntuple
    @ ./ntuple.jl:37
  [8] ntuple
    @ ./ntuple.jl:19
  [9] #1
    @ ~/.julia/packages/BatchedRoutines/kEhIl/ext/BatchedRoutinesForwardDiffExt/jacobian.jl:23
 [10] _broadcast_getindex_evalf
    @ ./broadcast.jl:709
 [11] _broadcast_getindex
    @ ./broadcast.jl:682
 [12] getindex
    @ ./broadcast.jl:636
 [13] #34
    @ ~/.julia/packages/GPUArrays/OKkAu/src/host/broadcast.jl:59
Reason: unsupported call to an unknown function (call to julia.get_gc_frame_slot)
Stacktrace:
 [1] ntuple
   @ ./ntuple.jl:19
 [2] #1
   @ ~/.julia/packages/BatchedRoutines/kEhIl/ext/BatchedRoutinesForwardDiffExt/jacobian.jl:23
 [3] _broadcast_getindex_evalf
   @ ./broadcast.jl:709
 [4] _broadcast_getindex
   @ ./broadcast.jl:682
 [5] getindex
   @ ./broadcast.jl:636
 [6] #34
   @ ~/.julia/packages/GPUArrays/OKkAu/src/host/broadcast.jl:59
Reason: unsupported call to an unknown function (call to jl_f__apply_iterate)
Stacktrace:
 [1] _ntuple
   @ ./ntuple.jl:37
 [2] ntuple
   @ ./ntuple.jl:19
 [3] #1
   @ ~/.julia/packages/BatchedRoutines/kEhIl/ext/BatchedRoutinesForwardDiffExt/jacobian.jl:23
 [4] _broadcast_getindex_evalf
   @ ./broadcast.jl:709
 [5] _broadcast_getindex
   @ ./broadcast.jl:682
 [6] getindex
   @ ./broadcast.jl:636
 [7] #34
   @ ~/.julia/packages/GPUArrays/OKkAu/src/host/broadcast.jl:59
Reason: unsupported call to an unknown function (call to julia.pop_gc_frame)
Stacktrace:
 [1] ntuple
   @ ./ntuple.jl:19
 [2] #1
   @ ~/.julia/packages/BatchedRoutines/kEhIl/ext/BatchedRoutinesForwardDiffExt/jacobian.jl:23
 [3] _broadcast_getindex_evalf
   @ ./broadcast.jl:709
 [4] _broadcast_getindex
   @ ./broadcast.jl:682
 [5] getindex
   @ ./broadcast.jl:636
 [6] #34
   @ ~/.julia/packages/GPUArrays/OKkAu/src/host/broadcast.jl:59
Reason: unsupported call to an unknown function (call to julia.new_gc_frame)
Stacktrace:
 [1] _broadcast_getindex_evalf
   @ ./broadcast.jl:709
 [2] _broadcast_getindex
   @ ./broadcast.jl:682
 [3] getindex
   @ ./broadcast.jl:636
 [4] #34
   @ ~/.julia/packages/GPUArrays/OKkAu/src/host/broadcast.jl:59
Reason: unsupported call to an unknown function (call to julia.push_gc_frame)
Stacktrace:
 [1] _broadcast_getindex_evalf
   @ ./broadcast.jl:709
 [2] _broadcast_getindex
   @ ./broadcast.jl:682
 [3] getindex
   @ ./broadcast.jl:636
 [4] #34
   @ ~/.julia/packages/GPUArrays/OKkAu/src/host/broadcast.jl:59
Reason: unsupported call to an unknown function (call to julia.get_gc_frame_slot)
Stacktrace:
 [1] _broadcast_getindex_evalf
   @ ./broadcast.jl:709
 [2] _broadcast_getindex
   @ ./broadcast.jl:682
 [3] getindex
   @ ./broadcast.jl:636
 [4] #34
   @ ~/.julia/packages/GPUArrays/OKkAu/src/host/broadcast.jl:59
Reason: unsupported dynamic function invocation (call to ForwardDiff.Partials{11, Float64})
Stacktrace:
 [1] #1
   @ ~/.julia/packages/BatchedRoutines/kEhIl/ext/BatchedRoutinesForwardDiffExt/jacobian.jl:23
 [2] _broadcast_getindex_evalf
   @ ./broadcast.jl:709
 [3] _broadcast_getindex
   @ ./broadcast.jl:682
 [4] getindex
   @ ./broadcast.jl:636
 [5] #34
   @ ~/.julia/packages/GPUArrays/OKkAu/src/host/broadcast.jl:59
Reason: unsupported call to an unknown function (call to julia.pop_gc_frame)
Stacktrace:
 [1] _broadcast_getindex_evalf
   @ ./broadcast.jl:709
 [2] _broadcast_getindex
   @ ./broadcast.jl:682
 [3] getindex
   @ ./broadcast.jl:636
 [4] #34
   @ ~/.julia/packages/GPUArrays/OKkAu/src/host/broadcast.jl:59
Hint: catch this exception as `err` and call `code_typed(err; interactive = true)` to introspect the erronous code with Cthulhu.jl

Stacktrace:
  [1] check_ir(job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, args::LLVM.Module)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/U36Ed/src/validation.jl:147
  [2] macro expansion
    @ ~/.julia/packages/GPUCompiler/U36Ed/src/driver.jl:440 [inlined]
  [3] macro expansion
    @ ~/.julia/packages/TimerOutputs/RsWnF/src/TimerOutput.jl:253 [inlined]
  [4] macro expansion
    @ ~/.julia/packages/GPUCompiler/U36Ed/src/driver.jl:439 [inlined]
  [5] emit_llvm(job::GPUCompiler.CompilerJob; libraries::Bool, toplevel::Bool, optimize::Bool, cleanup::Bool, only_entry::Bool, validate::Bool)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/U36Ed/src/utils.jl:92
  [6] emit_llvm
    @ ~/.julia/packages/GPUCompiler/U36Ed/src/utils.jl:86 [inlined]
  [7] codegen(output::Symbol, job::GPUCompiler.CompilerJob; libraries::Bool, toplevel::Bool, optimize::Bool, cleanup::Bool, strip::Bool, validate::Bool, only_entry::Bool, parent_job::Nothing)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/U36Ed/src/driver.jl:129
  [8] codegen
    @ ~/.julia/packages/GPUCompiler/U36Ed/src/driver.jl:110 [inlined]
  [9] compile(target::Symbol, job::GPUCompiler.CompilerJob; libraries::Bool, toplevel::Bool, optimize::Bool, cleanup::Bool, strip::Bool, validate::Bool, only_entry::Bool)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/U36Ed/src/driver.jl:106
 [10] compile
    @ ~/.julia/packages/GPUCompiler/U36Ed/src/driver.jl:98 [inlined]
 [11] #1116
    @ ~/.julia/packages/CUDA/jdJ7Z/src/compiler/compilation.jl:247 [inlined]
 [12] JuliaContext(f::CUDA.var"#1116#1119"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}})
    @ GPUCompiler ~/.julia/packages/GPUCompiler/U36Ed/src/driver.jl:47
 [13] compile(job::GPUCompiler.CompilerJob)
    @ CUDA ~/.julia/packages/CUDA/jdJ7Z/src/compiler/compilation.jl:246
 [14] actual_compilation(cache::Dict{Any, CuFunction}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, compiler::typeof(CUDA.compile), linker::typeof(CUDA.link))
    @ GPUCompiler ~/.julia/packages/GPUCompiler/U36Ed/src/execution.jl:125
 [15] cached_compilation(cache::Dict{Any, CuFunction}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, compiler::Function, linker::Function)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/U36Ed/src/execution.jl:103
 [16] macro expansion
    @ ~/.julia/packages/CUDA/jdJ7Z/src/compiler/execution.jl:367 [inlined]
 [17] macro expansion
    @ ./lock.jl:267 [inlined]
 [18] cufunction(f::GPUArrays.var"#34#36", tt::Type{Tuple{CUDA.CuKernelContext, CuDeviceVector{ForwardDiff.Partials{11, Float64}, 1}, Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1, CUDA.Mem.DeviceBuffer}, Tuple{Base.OneTo{Int64}}, BatchedRoutinesForwardDiffExt.var"#1#8"{ForwardDiff.Partials{11, Float64}, Float64, 11}, Tuple{Base.Broadcast.Extruded{CuDeviceVector{Int64, 1}, Tuple{Bool}, Tuple{Int64}}}}, Int64}}; kwargs::@Kwargs{})
    @ CUDA ~/.julia/packages/CUDA/jdJ7Z/src/compiler/execution.jl:362
 [19] cufunction
    @ ~/.julia/packages/CUDA/jdJ7Z/src/compiler/execution.jl:359 [inlined]
 [20] macro expansion
    @ ~/.julia/packages/CUDA/jdJ7Z/src/compiler/execution.jl:112 [inlined]
 [21] #launch_heuristic#1173
    @ ~/.julia/packages/CUDA/jdJ7Z/src/gpuarrays.jl:17 [inlined]
 [22] launch_heuristic
    @ ~/.julia/packages/CUDA/jdJ7Z/src/gpuarrays.jl:15 [inlined]
 [23] _copyto!
    @ ~/.julia/packages/GPUArrays/OKkAu/src/host/broadcast.jl:78 [inlined]
 [24] copyto!
    @ ~/.julia/packages/GPUArrays/OKkAu/src/host/broadcast.jl:44 [inlined]
 [25] copy
    @ ~/.julia/packages/GPUArrays/OKkAu/src/host/broadcast.jl:29 [inlined]
 [26] materialize(bc::Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1, CUDA.Mem.DeviceBuffer}, Nothing, BatchedRoutinesForwardDiffExt.var"#1#8"{ForwardDiff.Partials{11, Float64}, Float64, 11}, Tuple{CuArray{Int64, 1, CUDA.Mem.DeviceBuffer}}})
    @ Base.Broadcast ./broadcast.jl:903
 [27] map(::Function, ::CuArray{Int64, 1, CUDA.Mem.DeviceBuffer})
    @ GPUArrays ~/.julia/packages/GPUArrays/OKkAu/src/host/broadcast.jl:102
 [28] __batched_forwarddiff_value_and_jacobian_chunk(idx::Int64, ::Val{11}, ::Type{ForwardDiff.Tag{typeof(f), Float64}}, ::Type{ForwardDiff.Dual{ForwardDiff.Tag{typeof(f), Float64}, Float64, 11}}, ::Type{ForwardDiff.Partials{11, Float64}}, f::typeof(f), u::CuArray{Float64, 2, CUDA.Mem.DeviceBuffer})
    @ BatchedRoutinesForwardDiffExt ~/.julia/packages/BatchedRoutines/kEhIl/ext/BatchedRoutinesForwardDiffExt/jacobian.jl:22
 [29] __batched_value_and_jacobian(ad::AutoForwardDiff{nothing, Nothing}, f::typeof(f), u::CuArray{Float64, 2, CUDA.Mem.DeviceBuffer}, ck::Val{11})
    @ BatchedRoutinesForwardDiffExt ~/.julia/packages/BatchedRoutines/kEhIl/ext/BatchedRoutinesForwardDiffExt/jacobian.jl:64
 [30] macro expansion
    @ ~/.julia/packages/BatchedRoutines/kEhIl/ext/BatchedRoutinesForwardDiffExt/jacobian.jl:0 [inlined]
 [31] __batched_value_and_jacobian(ad::AutoForwardDiff{nothing, Nothing}, f::typeof(f), u::CuArray{Float64, 2, CUDA.Mem.DeviceBuffer})
    @ BatchedRoutinesForwardDiffExt ~/.julia/packages/BatchedRoutines/kEhIl/ext/BatchedRoutinesForwardDiffExt/jacobian.jl:91
 [32] _batched_jacobian
    @ ~/.julia/packages/BatchedRoutines/kEhIl/ext/BatchedRoutinesForwardDiffExt/jacobian.jl:126 [inlined]
 [33] batched_jacobian(ad::AutoForwardDiff{nothing, Nothing}, f::typeof(f), u::CuArray{Float64, 2, CUDA.Mem.DeviceBuffer})
    @ BatchedRoutines ~/.julia/packages/BatchedRoutines/kEhIl/src/api.jl:15
 [34] top-level scope
    @ In[52]:14

LuxDL / BatchedRoutines.jl

batched_jacobian fails on GPU for arrays above a certain size #14