EnzymeAD / Enzyme.jl

Julia bindings for the Enzyme automatic differentiator
https://enzyme.mit.edu
MIT License

No augmented forward pass found for `cuOccupancyMaxPotentialBlockSize` #1061

Closed pxl-th closed 3 months ago

pxl-th commented 1 year ago

Hi! I'm trying to use a fused kernel, `compute_α_fused`, to compute alpha-compositing weights and have Enzyme generate the gradient kernel in `Reverse` mode, instead of using the unfused `compute_α`.

But the compilation fails. Is this an issue with CUDA.jl?

Error:

No augmented forward pass found for cuOccupancyMaxPotentialBlockSize
declare i32 @cuOccupancyMaxPotentialBlockSize(i64, i64, i64, i64, i64, i32) local_unnamed_addr

Stacktrace:
  [1] julia_error(cstr::Cstring, val::Ptr{…}, errtype::Enzyme.API.ErrorType, data::Ptr{…}, data2::Ptr{…}, B::Ptr{…})
    @ Enzyme.Compiler ~/.julia/packages/Enzyme/0SYwj/src/compiler.jl:5768
  [2] EnzymeCreateAugmentedPrimal(logic::Enzyme.Logic, todiff::LLVM.Function, retType::Enzyme.API.CDIFFE_TYPE, constant_args::Vector{…}, TA::Enzyme.TypeAnalysis, returnUsed::Bool, shadowReturnUsed::Bool, typeInfo::Enzyme.FnTypeInfo, uncacheable_args::Vector{…}, forceAnonymousTape::Bool, width::Int64, atomicAdd::Bool)
    @ Enzyme.API ~/.julia/packages/Enzyme/0SYwj/src/api.jl:164
  [3] enzyme!(job::GPUCompiler.CompilerJob{…}, mod::LLVM.Module, primalf::LLVM.Function, TT::Type, mode::Enzyme.API.CDerivativeMode, width::Int64, parallel::Bool, actualRetType::Type, wrap::Bool, modifiedBetween::NTuple{…}, returnPrimal::Bool, jlrules::Vector{…}, expectedTapeType::Type)
    @ Enzyme.Compiler ~/.julia/packages/Enzyme/0SYwj/src/compiler.jl:7541
  [4] codegen(output::Symbol, job::GPUCompiler.CompilerJob{…}; libraries::Bool, deferred_codegen::Bool, optimize::Bool, toplevel::Bool, strip::Bool, validate::Bool, only_entry::Bool, parent_job::Nothing)
    @ Enzyme.Compiler ~/.julia/packages/Enzyme/0SYwj/src/compiler.jl:9119
  [5] codegen
    @ Enzyme.Compiler ~/.julia/packages/Enzyme/0SYwj/src/compiler.jl:8723 [inlined]
  [6] _thunk(job::GPUCompiler.CompilerJob{Enzyme.Compiler.EnzymeTarget, Enzyme.Compiler.EnzymeCompilerParams}, postopt::Bool) (repeats 2 times)
    @ Enzyme.Compiler ~/.julia/packages/Enzyme/0SYwj/src/compiler.jl:9671
  [7] cached_compilation
    @ Enzyme.Compiler ~/.julia/packages/Enzyme/0SYwj/src/compiler.jl:9705 [inlined]
  [8] (::Enzyme.Compiler.var"#475#476"{…})(ctx::LLVM.Context)
    @ Enzyme.Compiler ~/.julia/packages/Enzyme/0SYwj/src/compiler.jl:9768
  [9] JuliaContext(f::Enzyme.Compiler.var"#475#476"{…})
    @ GPUCompiler ~/.julia/packages/GPUCompiler/YO8Uj/src/driver.jl:47
 [10] #s292#474
    @ Enzyme.Compiler ~/.julia/packages/Enzyme/0SYwj/src/compiler.jl:9723 [inlined]
 [11] 
    @ Enzyme.Compiler ./none:0
 [12] (::Core.GeneratedFunctionStub)(::UInt64, ::LineNumberNode, ::Any, ::Vararg{Any})
    @ Core ./boot.jl:600
 [13] runtime_generic_augfwd(activity::Type{…}, width::Val{…}, ModifiedBetween::Val{…}, RT::Val{…}, f::CUDA.CUDAKernels.var"##_#6", df::Nothing, primal_1::Int64, shadow_1_1::Nothing, primal_2::Nothing, shadow_2_1::Nothing, primal_3::KernelAbstractions.Kernel{…}, shadow_3_1::Nothing, primal_4::CuArray{…}, shadow_4_1::CuArray{…}, primal_5::CuArray{…}, shadow_5_1::CuArray{…}, primal_6::CuArray{…}, shadow_6_1::Nothing, primal_7::CuArray{…}, shadow_7_1::Nothing)
    @ Enzyme.Compiler ~/.julia/packages/Enzyme/0SYwj/src/compiler.jl:1361
 [14] Kernel
    @ ~/.julia/packages/CUDA/35NC6/src/CUDAKernels.jl:103 [inlined]
 [15] _compute_α_fused!
    @ ~/code/ZipNerf.jl/src/t.jl:56 [inlined]
 [16] _compute_α_fused!
    @ ~/code/ZipNerf.jl/src/t.jl:0 [inlined]
 [17] diffejulia__compute___fused__2929_inner_1wrap
    @ ~/code/ZipNerf.jl/src/t.jl:0
 [18] macro expansion
    @ Enzyme.Compiler ~/.julia/packages/Enzyme/0SYwj/src/compiler.jl:9619 [inlined]
 [19] enzyme_call
    @ Enzyme.Compiler ~/.julia/packages/Enzyme/0SYwj/src/compiler.jl:9297 [inlined]
 [20] CombinedAdjointThunk
    @ Enzyme.Compiler ~/.julia/packages/Enzyme/0SYwj/src/compiler.jl:9260 [inlined]
 [21] autodiff
    @ Enzyme ~/.julia/packages/Enzyme/0SYwj/src/Enzyme.jl:213 [inlined]
 [22] autodiff
    @ Enzyme ~/.julia/packages/Enzyme/0SYwj/src/Enzyme.jl:236 [inlined]
 [23] autodiff
    @ Enzyme ~/.julia/packages/Enzyme/0SYwj/src/Enzyme.jl:222 [inlined]
 [24] _pullback
    @ Main ~/code/ZipNerf.jl/src/t.jl:88 [inlined]
 [25] ZBack
    @ Zygote ~/.julia/packages/Zygote/4SSHS/src/compiler/chainrules.jl:211 [inlined]
 [26] kw_zpullback
    @ Zygote ~/.julia/packages/Zygote/4SSHS/src/compiler/chainrules.jl:237 [inlined]
 [27] #2
    @ Main ~/code/ZipNerf.jl/src/t.jl:25 [inlined]
 [28] (::Zygote.var"#75#76"{Zygote.Pullback{Tuple{…}, Tuple{…}}})(Δ::Float32)
    @ Zygote ~/.julia/packages/Zygote/4SSHS/src/compiler/interface.jl:45
 [29] gradient(f::Function, args::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer})
    @ Zygote ~/.julia/packages/Zygote/4SSHS/src/compiler/interface.jl:97
 [30] main()
    @ Main ~/code/ZipNerf.jl/src/t.jl:24
 [31] top-level scope
    @ REPL[2]:1
 [32] top-level scope
    @ ~/.julia/packages/CUDA/35NC6/src/initialization.jl:190
Some type information was truncated. Use `show(err)` to see complete types.

Code:

using Adapt
using ChainRulesCore
using CUDA
using KernelAbstractions
using Zygote
using Enzyme

import KernelAbstractions as KA

function main()
    kab = CUDABackend()

    tdist = adapt(kab, reshape(collect(range(0f0, 1f0, 65)), :, 1))
    directions = adapt(kab, reshape([0f0, 0f0, 1f0], 3, 1))

    σ = adapt(kab, ones(Float32, 64, 1))
    ω = compute_α(σ; tdist, directions)
    ω2 = compute_α_fused(σ; tdist, directions)
    @assert all(ω .≈ ω2)

    g1 = Zygote.gradient(σ) do σ
        sum(compute_α(σ; tdist, directions))
    end
    g2 = Zygote.gradient(σ) do σ
        sum(compute_α_fused(σ; tdist, directions))
    end
    @assert all(g1[1] .≈ g2[1])
    return
end

function compute_α(σ; tdist, directions)
    kab = get_backend(σ)
    N = size(σ, 2)

    tδ = tdist[2:end, :] .- tdist[1:end - 1, :]
    δ = tδ .* sqrt.(sum(directions.^2; dims=1))
    σδ = σ .* δ

    α = 1f0 .- exp.(-σδ)
    T = vcat(
        @ignore_derivatives(KA.ones(kab, Float32, 1, N)),
        exp.(-cumsum(σδ[1:end - 1, :]; dims=1)))
    ω = α .* T
    return ω
end

function compute_α_fused(
    σ::AbstractMatrix{Float32}; tdist::AbstractMatrix{Float32},
    directions::AbstractMatrix{Float32},
)
    ω = KA.allocate(get_backend(σ), Float32, size(σ))
    _compute_α_fused!(ω, σ, tdist, directions)
    return ω
end

_compute_α_fused!(ω, σ, tdist, directions) =
    _compute_α!(get_backend(ω))(ω, σ, tdist, directions; ndrange=size(ω, 2))

@kernel function _compute_α!(
    # Output.
    ω::AbstractMatrix{Float32},
    # Input.
    Σ::AbstractMatrix{Float32},
    tdist::AbstractMatrix{Float32},
    directions::AbstractMatrix{Float32},
)
    @uniform K = size(ω, 1)

    i = @index(Global)
    δ_scale = sqrt(directions[1, i]^2 + directions[2, i]^2 + directions[3, i]^2)

    T::Float32 = 1f0
    for k in 1:K
        σ = Σ[k, i]
        tδ = tdist[k + 1, i] - tdist[k, i]
        δ = tδ * δ_scale

        α = 1f0 - exp(-σ * δ)
        ω[k, i] = α * T
        T *= 1f0 - α
    end
end

function ChainRulesCore.rrule(::typeof(compute_α_fused), σ; tdist, directions)
    ω = compute_α_fused(σ; tdist, directions)
    function _pullback(Δ)
        ∂σ = KA.allocate(get_backend(ω), eltype(ω), size(ω))
        Enzyme.autodiff(Reverse, _compute_α_fused!,
            Duplicated(ω, Δ), Duplicated(σ, ∂σ),
            Const(tdist), Const(directions))
        return NoTangent(), ∂σ
    end
    ω, _pullback
end
vchuravy commented 1 year ago

I think this is due to the EnzymeRules for KernelAbstractions not supporting reverse mode yet

pxl-th commented 1 year ago

Oh, I see. I saw tests in KernelAbstractions for reverse mode and thought that it worked.

wsmoses commented 1 year ago

the KA custom rule is implemented for any backend in forward mode, and the CPU backend in reverse
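
For reference, a rough sketch of what the forward-mode path through the KA custom rule looks like, modeled on the pattern in the KernelAbstractions Enzyme extension tests (the kernel and the `square_caller` / `forward_demo` names are illustrative, not from this issue):

using CUDA
using Enzyme
using KernelAbstractions
import KernelAbstractions as KA

@kernel function square!(A)
    i = @index(Global)
    @inbounds A[i] *= A[i]
end

# Host-side launcher; Enzyme differentiates through the kernel launch via the KA rule.
function square_caller(A, backend)
    square!(backend)(A; ndrange=length(A))
    KA.synchronize(backend)
    return nothing
end

function forward_demo()
    kab = CUDABackend()
    A = KA.ones(kab, Float32, 16)
    dA = KA.ones(kab, Float32, 16)  # tangent seed
    # Forward mode is supported for any backend through the custom rule.
    Enzyme.autodiff(Forward, square_caller, Duplicated(A, dA), Const(kab))
    return dA
end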

vchuravy commented 1 year ago

I don't actually remember what was needed for reverse GPU support

wsmoses commented 1 year ago

We needed to precompute the GPU-relevant/interpreted tape size from outside the kernel.

So we need a variant of thunk tape computation that allows for a different device

pxl-th commented 1 year ago

I think this is due to the EnzymeRules for KernelAbstractions not supporting reverse mode yet

Actually, is this also the case if I want to differentiate just the kernel (no host code involved)?

wsmoses commented 1 year ago

nope that would be fine

pxl-th commented 1 year ago

I see there are tests for reverse mode with CUDA.jl: https://github.com/EnzymeAD/Enzyme.jl/blob/7d99eec57328329eba693f04aefcdd45f9420e3e/test/cuda.jl#L14

But when I try the same with KA, it errors:

ERROR: return type is Union{}, giving up.
Stacktrace:
 [1] error(s::String)
   @ Base ./error.jl:35
 [2] autodiff_deferred
   @ Main ~/.julia/packages/Enzyme/0SYwj/src/Enzyme.jl:456 [inlined]
 [3] autodiff_deferred
   @ Main ~/.julia/packages/Enzyme/0SYwj/src/Enzyme.jl:442 [inlined]
 [4] main2()
   @ Main ~/code/t.jl:110
 [5] top-level scope
   @ REPL[3]:1
 [6] top-level scope
   @ ~/.julia/packages/CUDA/35NC6/src/initialization.jl:190

Code:

using CUDA
using KernelAbstractions
using Enzyme
import KernelAbstractions as KA

@kernel function ker(x)
    i = @index(Global)
    x[i] *= x[i]
end

function main()
    kab = CUDABackend()
    x = KA.ones(kab, Float32, 16)
    dx = KA.ones(kab, Float32, 16)
    Enzyme.autodiff_deferred(Reverse, ker(kab), Duplicated(x, dx))
    return
end
main()

I'm probably doing things incorrectly, but I haven't found an example with KA that uses just a single kernel... :/

pxl-th commented 1 year ago

Actually, the CUDA.jl test also gives this error:

function mul_kernel(A)
    i = threadIdx().x
    if i <= length(A)
        A[i] *= A[i]
    end
    return nothing
end

function main()
    A = CUDA.ones(64,)
    dA = CUDA.ones(64,)
    autodiff_deferred(Reverse, mul_kernel, Const, Duplicated(A, dA))
    return
end

I'm using CUDA.jl 4.4.1, Enzyme.jl 0.11.7, and Julia 1.10-beta2.

pxl-th commented 1 year ago

So I got confused: with CUDA.jl, if you wrap the call in

function mul_kernel(A)
    i = threadIdx().x
    A[i] *= A[i]
    return nothing
end

function grad(A, dA)
    autodiff_deferred(Reverse, mul_kernel, Duplicated(A, dA))
    return nothing
end

and call `@cuda threads=length(A) grad(A, dA)`, then it works (which is still a bit confusing).
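
For completeness, a minimal sketch of that host-side launch, assuming the `mul_kernel` and `grad` definitions above (the `run_grad` wrapper and array sizes just mirror the earlier snippet, they are not taken from the CUDA.jl tests verbatim):

using CUDA
using Enzyme

function run_grad()
    A = CUDA.ones(64)
    dA = CUDA.ones(64)  # adjoint seed
    # grad itself runs as the kernel; the autodiff_deferred call is inside device code.
    @cuda threads=length(A) grad(A, dA)
    return dA
end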

But with KernelAbstractions I cannot figure out how to do this. The only example involves host code: https://github.com/JuliaGPU/KernelAbstractions.jl/blob/3165d35b9b707e73d19e7f8fc9f442bafaf415ac/test/extensions/enzyme.jl#L10

Is there a way to AD just the kernel?

pxl-th commented 1 year ago

@wsmoses, sorry for spamming, but are there any examples with KA not involving host code (just the kernel)?

wsmoses commented 1 year ago

You should be able to use `autodiff_deferred` inside the kernel itself (like your `grad` case). The KA example you showed uses the nicer custom-rules support, but that is only enabled for forward mode in KA.jl right now.

For reverse mode, you'll have to set it up manually, like your `mul_kernel` above, where the autodiff call is inside the device code entirely.
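
Roughly, such a manual setup with KA might look like the sketch below, using a simple squaring device function (the names `dev_ker`, `grad_ker`, and `run_reverse` are illustrative; in newer Enzyme versions the index argument may need to be wrapped in `Const`):

using CUDA
using Enzyme
using KernelAbstractions
import KernelAbstractions as KA

# Device function that gets differentiated per index.
@inline function dev_ker(x, i)
    x[i] *= x[i]
    return nothing
end

# Gradient kernel: the autodiff_deferred call lives entirely inside device code.
@kernel function grad_ker(x, dx)
    i = @index(Global)
    Enzyme.autodiff_deferred(Reverse, dev_ker, Duplicated(x, dx), i)
end

function run_reverse()
    kab = CUDABackend()
    x = KA.ones(kab, Float32, 16)
    dx = KA.ones(kab, Float32, 16)
    grad_ker(kab)(x, dx; ndrange=length(x))
    KA.synchronize(kab)
    return dx
end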

pxl-th commented 12 months ago

autodiff call is inside the device code entirely

Oh, I see! Now it works! A note somewhere in the docs might be useful (unless I missed one). Thanks for the help!

pxl-th commented 12 months ago

It works for `mul_kernel`, however it fails with more complex kernels, for example ones that use the `sin` function.

Error:

ERROR: InvalidIRError: compiling MethodInstance for gpu_gker(::KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}}}, ::AMDGPU.Device.ROCDeviceVector{Float32, 1}, ::AMDGPU.Device.ROCDeviceVector{Float32, 1}) resulted in invalid LLVM IR
Reason: unsupported call through a literal pointer (call to )
Stacktrace:
  [1] #sin
    @ ~/.julia/dev/AMDGPU/src/device/gcn/math.jl:32
  [2] ker
    @ ~/code/ZipNerf.jl/t.jl:7
  [3] ker
    @ ~/code/ZipNerf.jl/t.jl:0
  [4] diffejulia_ker_5228_inner_1wrap
    @ ~/code/ZipNerf.jl/t.jl:0
  [5] macro expansion
    @ ~/.julia/packages/Enzyme/VS5jo/src/compiler.jl:9774
  [6] enzyme_call
    @ ~/.julia/packages/Enzyme/VS5jo/src/compiler.jl:9452
  [7] CombinedAdjointThunk
    @ ~/.julia/packages/Enzyme/VS5jo/src/compiler.jl:9415
  [8] autodiff_deferred
    @ ~/.julia/packages/Enzyme/VS5jo/src/Enzyme.jl:372
  [9] autodiff_deferred
    @ ~/.julia/packages/Enzyme/VS5jo/src/Enzyme.jl:459
 [10] autodiff_deferred
    @ ~/.julia/packages/Enzyme/VS5jo/src/Enzyme.jl:442
 [11] macro expansion
    @ ~/code/ZipNerf.jl/t.jl:18
 [12] gpu_gker
    @ ~/.julia/packages/KernelAbstractions/cWlFz/src/macros.jl:90
 [13] gpu_gker
    @ ./none:0
Reason: unsupported call through a literal pointer (call to )
Stacktrace:
 [1] #sin
   @ ~/.julia/dev/AMDGPU/src/device/gcn/math.jl:32
 [2] ker
   @ ~/code/ZipNerf.jl/t.jl:7
 [3] ker
   @ ~/code/ZipNerf.jl/t.jl:0
 [4] diffejulia_ker_5228_inner_1wrap
   @ ~/code/ZipNerf.jl/t.jl:0
...

Code:

using AMDGPU
using KernelAbstractions
using Enzyme
import KernelAbstractions as KA

@inline function ker(x, i)
    x[i] *= sin(x[i])
    return
end

@kernel function fker(x)
    i = @index(Global)
    ker(x, i)
end

@kernel function gker(x, dx)
    i = @index(Global)
    Enzyme.autodiff_deferred(Reverse, ker, Duplicated(x, dx), i)
end

function main()
    kab = ROCBackend()
    x = KA.ones(kab, Float32, 16)
    dx = KA.ones(kab, Float32, 16)

    fker(kab)(x; ndrange=length(x))
    @show x
    gker(kab)(x, dx; ndrange=length(x))
    @show dx
    return
end
wsmoses commented 12 months ago

Yeah that's the same as https://github.com/EnzymeAD/Enzyme.jl/issues/683

pxl-th commented 12 months ago

Yeah that's the same as #683

Just curious: is the fix coming relatively soon, or is it more involved?

wsmoses commented 11 months ago

It's unfortunately more involved.

@aviatesk do you have cycles to help us with the nested abstract interpreter issues?

cc @ChrisRackauckas

wsmoses commented 3 months ago

@pxl-th the AMDGPU issues are resolved by https://github.com/EnzymeAD/Enzyme.jl/pull/1537