JuliaGPU / KernelAbstractions.jl

Heterogeneous programming in Julia
MIT License
350 stars 61 forks source link

issue with using enzyme on kernel abstractions on CUDA backend #480

Open jakubMitura14 opened 1 month ago

jakubMitura14 commented 1 month ago

I want to design an `rrule` (from ChainRules) for my kernel. Below is a simple reproducible example. System info: Julia 1.10; CUDA v5.4.0; ChainRulesCore v1.23.0; ChainRulesTestUtils v1.13.0; Enzyme v0.12.9 (https://github.com/EnzymeAD/Enzyme.jl.git#main); EnzymeTestUtils v0.1.7; KernelAbstractions v0.9.20 (https://github.com/JuliaGPU/KernelAbstractions.jl#main).

GPU: Nvidia RTX 3090

code

using KernelAbstractions
using ChainRulesCore, Zygote, CUDA, Enzyme, Test

# Copy kernel used as a minimal reproducer: every work-item stages one element of `A`
# in a workgroup-local Float32 buffer and then stores it at the same index of `A_out`.
#
# Arguments:
#   A     -- input array; marked `@Const`, so it is only read inside the kernel.
#   A_out -- output array; `A_out[i]` is overwritten with `A[i]` for each global index `i`.
@kernel function example_kenr(@Const(A), A_out)
    index = @index(Global)
    local_index = @index(Local, Linear)

    # Scratch buffer with one row per work-item of the group and a single column.
    shared_arr = @localmem Float32 (@groupsize()[1], 1)

    # Each work-item reads back only the slot it wrote itself.
    shared_arr[local_index, 1] = A[index]
    A_out[index] = shared_arr[local_index, 1]
end

# Run `example_kenr` on the backend that owns `A`, covering the first dimension of `A`,
# then block until the backend has finished. `A_out` is overwritten in place and the
# function itself returns `nothing`.
function call_example(A, A_out)
    backend = get_backend(A)
    # Instantiate the kernel for this backend with a workgroup size of 256.
    kernel = example_kenr(backend, 256)
    kernel(A, A_out; ndrange=size(A, 1))
    KernelAbstractions.synchronize(backend)
    return nothing
end

# Smoke test without any AD: after the call, A_out must hold the same values as A.
A = 2 .* CUDA.ones(10)
A_out = CUDA.ones(10)
call_example(A, A_out)
@test A_out == 2 .* CUDA.ones(10)

# Reverse rule for `call_example`: runs the primal once (mutating `A_out`) and returns
# `A_out` together with a pullback that delegates the gradient computation to Enzyme.
#
# Returns `(A_out, pullback)`, where `pullback(d_A_out)` yields
# `(NoTangent(), d_A, d_A_out)`.
#
# NOTE(review): `call_example` itself returns `nothing`, while this rule reports `A_out`
# as the primal result -- confirm that this mismatch is intended.
function ChainRulesCore.rrule(::typeof(call_example), A,A_out)

    # Primal pass: fills A_out in place.
    call_example(A,A_out)

    function call_test_kernel1_pullback(d_A_out)
        # Rebuild the incoming cotangent as a new CuArray; it is used as the shadow of
        # A_out below.
        d_A_out = CuArray(collect(d_A_out))
        # Zero-initialised shadow for A.
        # NOTE(review): no element type is passed to CUDA.zeros -- presumably Float32;
        # verify that it matches eltype(A).
        d_A = CUDA.zeros(size(A)...)

        # `Const` is the return annotation; each Duplicated pairs a primal array with
        # its shadow. This is the call from which the EnzymeRuntimeException in the
        # reported stack trace originates.
        Enzyme.autodiff_deferred(Enzyme.Reverse, call_example, Const, Duplicated(A,d_A), Duplicated(A_out, d_A_out))

        # NoTangent() is the tangent for the function `call_example` itself; the
        # remaining entries are the shadows handed to Enzyme for A and A_out.
        return NoTangent(), d_A,d_A_out
    end

    return A_out, call_test_kernel1_pullback

end

# Build the rule for the arrays defined above and pull back an all-ones cotangent;
# the second call is the one that raises the error shown below.
out, pull_back = ChainRulesCore.rrule(call_example, A, A_out)
pull_back(CUDA.ones(10))

error

ERROR: Enzyme.Compiler.EnzymeRuntimeException(Cstring(0x00007d0b6b39b32c))
Stacktrace:
  [1] throwerr(cstr::Cstring)
    @ Enzyme.Compiler ~/.julia/packages/Enzyme/sDjFs/src/compiler.jl:1338
  [2] getindex
    @ ./essentials.jl:13 [inlined]
  [3] get
    @ ./dict.jl:525 [inlined]
  [4] compiler_cache
    @ ~/.julia/packages/CUDA/DS19C/src/compiler/compilation.jl:166 [inlined]
  [5] macro expansion
    @ ~/.julia/packages/CUDA/DS19C/src/compiler/execution.jl:366 [inlined]
  [6] macro expansion
    @ ./lock.jl:267 [inlined]
  [7] #cufunction#1169
    @ ~/.julia/packages/CUDA/DS19C/src/compiler/execution.jl:364
  [8] cufunction
    @ ~/.julia/packages/CUDA/DS19C/src/compiler/execution.jl:361 [inlined]
  [9] macro expansion
    @ ~/.julia/packages/CUDA/DS19C/src/compiler/execution.jl:112 [inlined]
 [10] #_#4
    @ ~/.julia/packages/CUDA/DS19C/src/CUDAKernels.jl:103 [inlined]
 [11] augmented_julia____4_10560_inner_1wrap
    @ ~/.julia/packages/CUDA/DS19C/src/CUDAKernels.jl:0
 [12] macro expansion
    @ ~/.julia/packages/Enzyme/sDjFs/src/compiler.jl:5916 [inlined]
 [13] enzyme_call
    @ ~/.julia/packages/Enzyme/sDjFs/src/compiler.jl:5566 [inlined]
 [14] AugmentedForwardThunk
    @ ~/.julia/packages/Enzyme/sDjFs/src/compiler.jl:5454 [inlined]
 [15] runtime_generic_augfwd(activity::Type{…}, width::Val{…}, ModifiedBetween::Val{…}, RT::Val{…}, f::CUDA.CUDAKernels.var"##_#4", df::Nothing, primal_1::Int64, shadow_1_1::Nothing, primal_2::Nothing, shadow_2_1::Nothing, primal_3::KernelAbstractions.Kernel{…}, shadow_3_1::Nothing, primal_4::CuArray{…}, shadow_4_1::CuArray{…}, primal_5::CuArray{…}, shadow_5_1::CuArray{…})
    @ Enzyme.Compiler ~/.julia/packages/Enzyme/sDjFs/src/rules/jitrules.jl:179
 [16] Kernel
    @ ~/.julia/packages/CUDA/DS19C/src/CUDAKernels.jl:89 [inlined]
 [17] call_example
    @ ~/projects_new/superVoxelJuliaCode/superVoxelJuliaCode/src/old/cuda_enzyme_kern_ans_test.jl:15 [inlined]
 [18] diffejulia_call_example_4822wrap
    @ ~/projects_new/superVoxelJuliaCode/superVoxelJuliaCode/src/old/cuda_enzyme_kern_ans_test.jl:0
 [19] macro expansion
    @ ~/.julia/packages/Enzyme/sDjFs/src/compiler.jl:5916 [inlined]
 [20] enzyme_call
    @ ~/.julia/packages/Enzyme/sDjFs/src/compiler.jl:5566 [inlined]
 [21] CombinedAdjointThunk
    @ ~/.julia/packages/Enzyme/sDjFs/src/compiler.jl:5443 [inlined]
 [22] autodiff_deferred
    @ ~/.julia/packages/Enzyme/sDjFs/src/Enzyme.jl:440 [inlined]
 [23] autodiff_deferred
    @ ~/.julia/packages/Enzyme/sDjFs/src/Enzyme.jl:510 [inlined]
 [24] (::var"#call_test_kernel1_pullback#5"{CuArray{…}, CuArray{…}})(d_A_out::CuArray{Float32, 1, CUDA.DeviceMemory})
    @ Main ~/projects_new/superVoxelJuliaCode/superVoxelJuliaCode/src/old/cuda_enzyme_kern_ans_test.jl:36
 [25] top-level scope
    @ ~/projects_new/superVoxelJuliaCode/superVoxelJuliaCode/src/old/cuda_enzyme_kern_ans_test.jl:48
Some type information was truncated. Use `show(err)` to see complete types.
vchuravy commented 1 month ago

X-ref #454

jakubMitura14 commented 1 month ago

OK, thanks for the reference. So, if I understand correctly, this issue is close to being solved — fantastic!