FluxML / Flux.jl


OneHotArray scalar indexing problem in a gradient context #1703

Open · CarloLucibello opened this issue 3 years ago

CarloLucibello commented 3 years ago

This came up when trying to do semi-supervised learning with GNNs:

julia> using Flux, CUDA

julia> CUDA.allowscalar(false)

julia> gradient(y -> sum(y[:,1:2]), y)
(nothing,)

julia> gradient(y -> sum(y[:,1:2]), y |> gpu)
ERROR: Scalar indexing is disallowed.
Invocation of getindex resulted in scalar indexing of a GPU array.
This is typically caused by calling an iterating implementation of a method.
Such implementations *do not* execute on the GPU, but very slowly on the CPU,
and therefore are only permitted from the REPL for prototyping purposes.
If you did intend to index this array, annotate the caller with @allowscalar.
Stacktrace:
  [1] error(s::String)
    @ Base ./error.jl:33
  [2] assertscalar(op::String)
    @ GPUArrays ~/.julia/packages/GPUArrays/Tebtl/src/host/indexing.jl:53
  [3] getindex
    @ ~/.julia/packages/GPUArrays/Tebtl/src/host/indexing.jl:86 [inlined]
  [4] getindex
    @ ~/.julia/packages/Flux/Zz9RI/src/onehot.jl:45 [inlined]
  [5] getindex
    @ ~/.julia/packages/Flux/Zz9RI/src/onehot.jl:48 [inlined]
  [6] iterate
    @ ./abstractarray.jl:1096 [inlined]
  [7] iterate
    @ ./abstractarray.jl:1094 [inlined]
  [8] _foldl_impl(op::Base.BottomRF{typeof(Base.add_sum)}, init::Base._InitialValue, itr::Flux.OneHotArray{UInt32, 7, 1, 2, CuArray{UInt32, 1, CUDA.Mem.DeviceBuffer}})
    @ Base ./reduce.jl:56
  [9] foldl_impl
    @ ./reduce.jl:48 [inlined]
 [10] mapfoldl_impl
    @ ./reduce.jl:44 [inlined]
 [11] #mapfoldl#214
    @ ./reduce.jl:160 [inlined]
 [12] mapfoldl
    @ ./reduce.jl:160 [inlined]
 [13] _mapreduce
    @ ./reduce.jl:421 [inlined]
 [14] _mapreduce_dim
    @ ./reducedim.jl:318 [inlined]
 [15] #mapreduce#672
    @ ./reducedim.jl:310 [inlined]
 [16] #reduce#674
    @ ./reducedim.jl:359 [inlined]
 [17] #sum#223
    @ ./reduce.jl:529 [inlined]
 [18] #adjoint#624
    @ ~/.julia/packages/Zygote/l3aNG/src/lib/array.jl:289 [inlined]
 [19] adjoint
    @ ./none:0 [inlined]
 [20] _pullback
    @ ~/.julia/packages/ZygoteRules/OjfTt/src/adjoint.jl:57 [inlined]
 [21] _pullback
    @ ./REPL[12]:1 [inlined]
 [22] _pullback(ctx::Zygote.Context, f::var"#7#8", args::Flux.OneHotArray{UInt32, 7, 1, 2, CuArray{UInt32, 1, CUDA.Mem.DeviceBuffer}})
    @ Zygote ~/.julia/packages/Zygote/l3aNG/src/compiler/interface2.jl:0
 [23] _pullback(f::Function, args::Flux.OneHotArray{UInt32, 7, 1, 2, CuArray{UInt32, 1, CUDA.Mem.DeviceBuffer}})
    @ Zygote ~/.julia/packages/Zygote/l3aNG/src/compiler/interface.jl:34
 [24] pullback(f::Function, args::Flux.OneHotArray{UInt32, 7, 1, 2, CuArray{UInt32, 1, CUDA.Mem.DeviceBuffer}})
    @ Zygote ~/.julia/packages/Zygote/l3aNG/src/compiler/interface.jl:40
 [25] gradient(f::Function, args::Flux.OneHotArray{UInt32, 7, 1, 2, CuArray{UInt32, 1, CUDA.Mem.DeviceBuffer}})
    @ Zygote ~/.julia/packages/Zygote/l3aNG/src/compiler/interface.jl:75
 [26] top-level scope
    @ REPL[12]:1
 [27] top-level scope
    @ ~/.julia/dev/CUDA/src/initialization.jl:65
mcabbott commented 3 years ago

Where presumably you define something like:

julia> y = Flux.onehotbatch([1,2,3,1,2], 1:3)
3×5 OneHotMatrix(::Vector{UInt32}) with eltype Bool:
 1  ⋅  ⋅  1  ⋅
 ⋅  1  ⋅  ⋅  1
 ⋅  ⋅  1  ⋅  ⋅

julia> gpu(y)
3×5 OneHotMatrix(::CuArray{UInt32, 1, CUDA.Mem.DeviceBuffer}) with eltype Bool:
 1  ⋅  ⋅  1  ⋅
 ⋅  1  ⋅  ⋅  1
 ⋅  ⋅  1  ⋅  ⋅

The gradient of sum seems not to notice that the eltype of this is Bool?

Sometimes CuArray bypasses that rule, because of this (intended to avoid Fill, I think): https://github.com/FluxML/Zygote.jl/blob/master/src/lib/broadcast.jl#L269

julia> gradient(sum, randn(3).>0)
(nothing,)

julia> gradient(sum, cu(randn(3).>0))
(Bool[1, 1, 1],)

julia> y isa CUDA.AbstractGPUArray  # hence this rule shouldn't apply to y
false

julia> gradient(sum, gpu(y))  # still fails
ERROR: Scalar indexing is disallowed.
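
That suggests the convention at play is that a Bool eltype marks an array as non-differentiable, so the pullback of sum should return nothing for it, and the CuArray branch of the broadcast adjoint skips that check. A minimal sketch of the convention (hedged: sum_pullback_sketch is a made-up name for illustration, not Zygote's actual code):

# Hypothetical helper, not part of Zygote: Bool-eltype arrays get a `nothing`
# cotangent from `sum`, while float arrays get an array filled with the scalar
# cotangent.
sum_pullback_sketch(x::AbstractArray) =
    dy -> eltype(x) <: Bool ? nothing : fill(dy, size(x))

sum_pullback_sketch(randn(3))(1.0)       # 3-element Vector{Float64}: [1.0, 1.0, 1.0]
sum_pullback_sketch(randn(3) .> 0)(1.0)  # nothing, matching the CPU result above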

If we pick a function whose gradient isn't zero on the CPU (because not every operation projects), then I presume that scalar indexing is unavoidable:

julia> gradient(y -> sum(y[:,1:3] .+ y[:, 1:3]'), y)
([2 2 … 0 0; 2 2 … 0 0; 2 2 … 0 0],)

julia> gradient(y -> sum(y[:,1:3] .+ y[:, 1:3]'), gpu(y))
ERROR: Scalar indexing is disallowed.
Invocation of getindex resulted in scalar indexing of a GPU array.
This is typically caused by calling an iterating implementation of a method.
Such implementations *do not* execute on the GPU, but very slowly on the CPU,
and therefore are only permitted from the REPL for prototyping purposes.
If you did intend to index this array, annotate the caller with @allowscalar.
Stacktrace:
  [1] error(s::String)
    @ Base ./error.jl:33
  [2] assertscalar(op::String)
    @ GPUArrays ~/.julia/packages/GPUArrays/UBzTm/src/host/indexing.jl:53
  [3] getindex(::CuArray{Int64, 2, CUDA.Mem.DeviceBuffer}, ::Int64, ::Int64)
    @ GPUArrays ~/.julia/packages/GPUArrays/UBzTm/src/host/indexing.jl:86
  [4] getindex
    @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.7/LinearAlgebra/src/adjtrans.jl:179 [inlined]
  [5] _getindex
    @ ./abstractarray.jl:1265 [inlined]
  [6] getindex
    @ ./abstractarray.jl:1221 [inlined]
  [7] _broadcast_getindex
    @ ./broadcast.jl:636 [inlined]
  [8] _getindex
    @ ./broadcast.jl:667 [inlined]
  [9] _getindex
    @ ./broadcast.jl:666 [inlined]
 [10] _broadcast_getindex
    @ ./broadcast.jl:642 [inlined]
 [11] getindex
    @ ./broadcast.jl:597 [inlined]
 [12] macro expansion
    @ ./broadcast.jl:1005 [inlined]
 [13] macro expansion
    @ ./simdloop.jl:77 [inlined]
 [14] copyto!
    @ ./broadcast.jl:1004 [inlined]
 [15] copyto!
    @ ./broadcast.jl:957 [inlined]
 [16] materialize!
    @ ./broadcast.jl:915 [inlined]
 [17] materialize!
    @ ./broadcast.jl:912 [inlined]
 [18] (::Zygote.var"#430#432"{2, Bool, Flux.OneHotArray{UInt32, 3, 1, 2, CuArray{UInt32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Colon, UnitRange{Int64}}})(dy::LinearAlgebra.Adjoint{Int64, CuArray{Int64, 2, CUDA.Mem.DeviceBuffer}})
    @ Zygote ~/.julia/packages/Zygote/nsu1Y/src/lib/array.jl:39
 [19] #2309#back
    @ ~/.julia/packages/ZygoteRules/OjfTt/src/adjoint.jl:59 [inlined]
 [20] Pullback
    @ ./REPL[28]:1 [inlined]
 [21] (::Zygote.var"#52#53"{typeof(∂(#23))})(Δ::Int64)
    @ Zygote ~/.julia/packages/Zygote/nsu1Y/src/compiler/interface.jl:41
 [22] gradient(f::Function, args::Flux.OneHotArray{UInt32, 3, 1, 2, CuArray{UInt32, 1, CUDA.Mem.DeviceBuffer}})
    @ Zygote ~/.julia/packages/Zygote/nsu1Y/src/compiler/interface.jl:76
DhairyaLGandhi commented 3 years ago

You can then annotate that piece of user code with CUDA.@allowscalar
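
For example, wrapping the failing call from above (a hedged sketch: this only permits the scalar getindex, so the slice pullback runs element-by-element on the CPU; acceptable for prototyping, not a performance fix):

julia> using Flux, CUDA

julia> y = Flux.onehotbatch([1,2,3,1,2], 1:3);

julia> CUDA.@allowscalar gradient(y -> sum(y[:,1:3] .+ y[:, 1:3]'), gpu(y))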