JuliaGPU / oneAPI.jl

Julia support for the oneAPI programming toolkit.
https://juliagpu.org/oneapi/
Other
179 stars 22 forks source link

Support for logical indexing #461

Open rkierulf opened 4 weeks ago

rkierulf commented 4 weeks ago

Logical indexing of a matrix using .== doesn't work for oneAPI:

using oneAPI
A = oneArray(rand(Float32, (1000, 1000)))
A[A .== 0] .= 1.0f0
ERROR: LoadError: GPU compilation of MethodInstance for (::GPUArrays.var"#34#36")(::oneAPI.oneKernelContext, ::SubArray{Float32, 1, oneAPI.oneDeviceMatrix{Float32, 1}, Tuple{Vector{CartesianIndex{2}}}, false}, ::Base.Broadcast.Broadcasted{oneAPI.oneArrayStyle{1, oneAPI.oneL0.DeviceBuffer}, Tuple{Base.OneTo{Int64}}, typeof(identity), Tuple{Float32}}, ::Int64) failed
KernelError: passing and using non-bitstype argument
Argument 3 to your kernel function is of type SubArray{Float32, 1, oneAPI.oneDeviceMatrix{Float32, 1}, Tuple{Vector{CartesianIndex{2}}}, false}, which is not isbits:
  .indices is of type Tuple{Vector{CartesianIndex{2}}} which is not isbits.
    .1 is of type Vector{CartesianIndex{2}} which is not isbits.
Stacktrace:
  [1] check_invocation(job::GPUCompiler.CompilerJob)
    @ GPUCompiler ~/.cache/julia-buildkite-plugin/depots/018dadfb-f4d5-466e-8c3f-55b5f9a75025/packages/GPUCompiler/Y4hSX/src/validation.jl:92
  [2] macro expansion
    @ ~/.cache/julia-buildkite-plugin/depots/018dadfb-f4d5-466e-8c3f-55b5f9a75025/packages/GPUCompiler/Y4hSX/src/driver.jl:128 [inlined]
  [3] macro expansion
    @ ~/.cache/julia-buildkite-plugin/depots/018dadfb-f4d5-466e-8c3f-55b5f9a75025/packages/TimerOutputs/Lw5SP/src/TimerOutput.jl:253 [inlined]
  [4] codegen(output::Symbol, job::GPUCompiler.CompilerJob; libraries::Bool, toplevel::Bool, optimize::Bool, cleanup::Bool, strip::Bool, validate::Bool, only_entry::Bool, parent_job::Nothing)
    @ GPUCompiler ~/.cache/julia-buildkite-plugin/depots/018dadfb-f4d5-466e-8c3f-55b5f9a75025/packages/GPUCompiler/Y4hSX/src/driver.jl:126
  [5] codegen
    @ ~/.cache/julia-buildkite-plugin/depots/018dadfb-f4d5-466e-8c3f-55b5f9a75025/packages/GPUCompiler/Y4hSX/src/driver.jl:115 [inlined]
  [6] compile(target::Symbol, job::GPUCompiler.CompilerJob; libraries::Bool, toplevel::Bool, optimize::Bool, cleanup::Bool, strip::Bool, validate::Bool, only_entry::Bool)
    @ GPUCompiler ~/.cache/julia-buildkite-plugin/depots/018dadfb-f4d5-466e-8c3f-55b5f9a75025/packages/GPUCompiler/Y4hSX/src/driver.jl:111
  [7] compile
    @ ~/.cache/julia-buildkite-plugin/depots/018dadfb-f4d5-466e-8c3f-55b5f9a75025/packages/GPUCompiler/Y4hSX/src/driver.jl:103 [inlined]
  [8] #58
    @ ~/.cache/julia-buildkite-plugin/depots/018dadfb-f4d5-466e-8c3f-55b5f9a75025/packages/oneAPI/z4Axk/src/compiler/compilation.jl:81 [inlined]
  [9] JuliaContext(f::oneAPI.var"#58#59"{GPUCompiler.CompilerJob{GPUCompiler.SPIRVCompilerTarget, oneAPI.oneAPICompilerParams}}; kwargs::@Kwargs{})
    @ GPUCompiler ~/.cache/julia-buildkite-plugin/depots/018dadfb-f4d5-466e-8c3f-55b5f9a75025/packages/GPUCompiler/Y4hSX/src/driver.jl:52
 [10] JuliaContext(f::Function)
    @ GPUCompiler ~/.cache/julia-buildkite-plugin/depots/018dadfb-f4d5-466e-8c3f-55b5f9a75025/packages/GPUCompiler/Y4hSX/src/driver.jl:42
 [11] compile(job::GPUCompiler.CompilerJob)
    @ oneAPI ~/.cache/julia-buildkite-plugin/depots/018dadfb-f4d5-466e-8c3f-55b5f9a75025/packages/oneAPI/z4Axk/src/compiler/compilation.jl:80
 [12] actual_compilation(cache::Dict{Any, Any}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{GPUCompiler.SPIRVCompilerTarget, oneAPI.oneAPICompilerParams}, compiler::typeof(oneAPI.compile), linker::typeof(oneAPI.link))
    @ GPUCompiler ~/.cache/julia-buildkite-plugin/depots/018dadfb-f4d5-466e-8c3f-55b5f9a75025/packages/GPUCompiler/Y4hSX/src/execution.jl:237
 [13] cached_compilation(cache::Dict{Any, Any}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{GPUCompiler.SPIRVCompilerTarget, oneAPI.oneAPICompilerParams}, compiler::Function, linker::Function)
    @ GPUCompiler ~/.cache/julia-buildkite-plugin/depots/018dadfb-f4d5-466e-8c3f-55b5f9a75025/packages/GPUCompiler/Y4hSX/src/execution.jl:151
 [14] macro expansion
    @ ~/.cache/julia-buildkite-plugin/depots/018dadfb-f4d5-466e-8c3f-55b5f9a75025/packages/oneAPI/z4Axk/src/compiler/execution.jl:203 [inlined]
 [15] macro expansion
    @ ./lock.jl:267 [inlined]
 [16] zefunction(f::GPUArrays.var"#34#36", tt::Type{Tuple{oneAPI.oneKernelContext, SubArray{Float32, 1, oneAPI.oneDeviceMatrix{Float32, 1}, Tuple{Vector{CartesianIndex{2}}}, false}, Base.Broadcast.Broadcasted{oneAPI.oneArrayStyle{1, oneAPI.oneL0.DeviceBuffer}, Tuple{Base.OneTo{Int64}}, typeof(identity), Tuple{Float32}}, Int64}}; kwargs::@Kwargs{})
    @ oneAPI ~/.cache/julia-buildkite-plugin/depots/018dadfb-f4d5-466e-8c3f-55b5f9a75025/packages/oneAPI/z4Axk/src/compiler/execution.jl:198
 [17] zefunction
    @ ~/.cache/julia-buildkite-plugin/depots/018dadfb-f4d5-466e-8c3f-55b5f9a75025/packages/oneAPI/z4Axk/src/compiler/execution.jl:195 [inlined]
 [18] macro expansion
    @ ~/.cache/julia-buildkite-plugin/depots/018dadfb-f4d5-466e-8c3f-55b5f9a75025/packages/oneAPI/z4Axk/src/compiler/execution.jl:66 [inlined]
 [19] #launch_heuristic#93
    @ ~/.cache/julia-buildkite-plugin/depots/018dadfb-f4d5-466e-8c3f-55b5f9a75025/packages/oneAPI/z4Axk/src/gpuarrays.jl:17 [inlined]
 [20] launch_heuristic
    @ ~/.cache/julia-buildkite-plugin/depots/018dadfb-f4d5-466e-8c3f-55b5f9a75025/packages/oneAPI/z4Axk/src/gpuarrays.jl:15 [inlined]
 [21] _copyto!
    @ ~/.cache/julia-buildkite-plugin/depots/018dadfb-f4d5-466e-8c3f-55b5f9a75025/packages/GPUArrays/bbZD0/src/host/broadcast.jl:78 [inlined]
 [22] materialize!
    @ ~/.cache/julia-buildkite-plugin/depots/018dadfb-f4d5-466e-8c3f-55b5f9a75025/packages/GPUArrays/bbZD0/src/host/broadcast.jl:38 [inlined]
 [23] materialize!(dest::SubArray{Float32, 1, oneAPI.oneArray{Float32, 2, oneAPI.oneL0.DeviceBuffer}, Tuple{Vector{CartesianIndex{2}}}, false}, bc::Base.Broadcast.Broadcasted{Base.Broadcast.DefaultArrayStyle{0}, Nothing, typeof(identity), Tuple{Float32}})
    @ Base.Broadcast ./broadcast.jl:911

It does work for CUDA, AMDGPU, and Metal based on the results of this build: https://buildkite.com/julialang/komamri-dot-jl/builds/913

maleadt commented 4 days ago

The core of the issue:

julia> typeof(view(CuArray{Float32}(undef, 2, 2), CuArray{Bool}(undef, 2, 2)))
SubArray{Float32, 1, CuArray{Float32, 2, CUDA.DeviceMemory}, Tuple{CuArray{CartesianIndex{2}, 1, CUDA.DeviceMemory}}, false}

julia> typeof(view(oneArray{Float32}(undef, 2, 2), oneArray{Bool}(undef, 2, 2)))
SubArray{Float32, 1, oneArray{Float32, 2, oneAPI.oneL0.DeviceBuffer}, Tuple{Vector{CartesianIndex{2}}}, false}
maleadt commented 4 days ago

Okay, so the issue is that we keep Base.LogicalIndex around, which materializes to a host-side Vector; that Vector is not isbits and therefore cannot be passed to a GPU kernel. We should port the following code from CUDA.jl: https://github.com/JuliaGPU/CUDA.jl/blob/master/src/indexing.jl

However, if this isn't a blocker, I'd like to postpone that until we migrate GPUArrays over to KernelAbstractions.jl, so that we can "simply" have a single definition of that functionality over there instead of duplicating it across GPU back-ends.