JuliaGPU / CUDA.jl

CUDA programming in Julia.
https://juliagpu.org/cuda/
Other
1.17k stars 207 forks source link

Support for Julia 1.11 #2241

Open maleadt opened 5 months ago

maleadt commented 5 months ago

Blocked on: https://github.com/JuliaLang/julia/issues/52938

MWEs for that issue (in descending order of abstraction):

# MWE 1: `@cuda dynamic=true` launch with explicit `stream` and `shmem` keywords.
using CUDA

# No-op function used as the launch target.
inner() = nothing

# Launches `inner` via `@cuda dynamic=true`, passing a `CuDeviceStream` as
# `stream` and `shmem=1` as launch keyword arguments.
function outer()
    s = CuDeviceStream()
    @cuda dynamic=true stream=s shmem=1 inner()
    return
end

using InteractiveUtils
# Requests the LLVM IR of `outer()` twice: once through
# `InteractiveUtils.code_llvm` and once through `CUDA.code_llvm`.
function main()
    InteractiveUtils.code_llvm(outer, Tuple{})
    CUDA.code_llvm(outer, Tuple{})
end
# Only run `main` automatically when not in an interactive session.
isinteractive() || main()
# MWE 2: `@cuda dynamic=true` launch of a seven-argument function, no extra keywords.
using CUDA

# No-op launch target taking seven positional arguments.
inner(a, b, c, d, e, f, h) = nothing

# Launches `inner` via `@cuda dynamic=true` with seven literal `1` arguments.
function outer()
    @cuda dynamic=true inner(1, 1, 1, 1, 1, 1, 1)
    return
end

using InteractiveUtils
# Requests the LLVM IR of `outer()` twice: once through
# `InteractiveUtils.code_llvm` and once through `CUDA.code_llvm`.
function main()
    InteractiveUtils.code_llvm(outer, Tuple{})
    CUDA.code_llvm(outer, Tuple{})
end
# Only run `main` automatically when not in an interactive session.
isinteractive() || main()
# MWE 3: no `@cuda` at all; only an `@inline` call with keyword arguments.
using CUDA

# No-op `cudacall` method that accepts any positional and keyword arguments.
cudacall(f, types::Type, args...; kwargs...) = nothing

# Calls `cudacall` under `@inline` with an empty tuple type and two keyword
# arguments, one of which is a `Ref`.
function outer(f)
    @inline cudacall(f, Tuple{}; stream=Ref(42), shmem=1)
    return
end

using InteractiveUtils
# Requests the LLVM IR of `outer(::Nothing)` twice: once through
# `InteractiveUtils.code_llvm` and once through `CUDA.code_llvm`.
function main()
    InteractiveUtils.code_llvm(outer, Tuple{Nothing})
    CUDA.code_llvm(outer, Tuple{Nothing})
end
# Only run `main` automatically when not in an interactive session.
isinteractive() || main()
maleadt commented 1 month ago

Testing on backports-release-1.11@https://github.com/JuliaLang/julia/commit/b69fc5786331a60c5357bafb901be077e5e3a90d, which includes https://github.com/JuliaLang/julia/pull/54323, this does not seem to fix the third MWE here:

julia> InteractiveUtils.code_llvm(outer, Tuple{Nothing})
; Function Signature: outer(Nothing)
;  @ REPL[16]:1 within `outer`
define void @julia_outer_15341() #0 {
top:
  ret void
}

julia> CUDA.code_llvm(outer, Tuple{Nothing})
warning: linking module flags 'Dwarf Version': IDs have conflicting values ('i32 4' from globals with 'i32 2' from start)
;  @ REPL[16]:1 within `outer`
define void @julia_outer_15455() local_unnamed_addr {
top:
  %jlcallframe1 = alloca [4 x {}*], align 8
  %jlcallframe1.sub = getelementptr inbounds [4 x {}*], [4 x {}*]* %jlcallframe1, i64 0, i64 0
;  @ REPL[16]:2 within `outer`
; ┌ @ REPL[15]:1 within `cudacall`
; │┌ @ iterators.jl:279 within `pairs`
; ││┌ @ essentials.jl:459 within `Pairs`
; │││┌ @ namedtuple.jl:234 within `eltype`
; ││││┌ @ namedtuple.jl:236 within `nteltype`
; │││││┌ @ tuple.jl:271 within `eltype`
; ││││││┌ @ tuple.jl:291 within `_compute_eltype`
; │││││││┌ @ promotion.jl:175 within `promote_typejoin`
          %0 = load {}*, {}** bitcast (i8* getelementptr (i8, i8* @jl_small_typeof, i64 256) to {}**), align 8
          %1 = call fastcc nonnull {}* @julia_typejoin_15462({}* readonly %0, {}* readonly inttoptr (i64 126395047196048 to {}*))
; ││││││││ @ promotion.jl:176 within `promote_typejoin`
          %2 = load {}*, {}** bitcast (i8* getelementptr (i8, i8* @jl_small_typeof, i64 64) to {}**), align 8
          store {}* %2, {}** %jlcallframe1.sub, align 8
          %3 = getelementptr inbounds [4 x {}*], [4 x {}*]* %jlcallframe1, i64 0, i64 1
          store {}* %0, {}** %3, align 8
          %4 = getelementptr inbounds [4 x {}*], [4 x {}*]* %jlcallframe1, i64 0, i64 2
          store {}* inttoptr (i64 126395047196048 to {}*), {}** %4, align 8
          %5 = getelementptr inbounds [4 x {}*], [4 x {}*]* %jlcallframe1, i64 0, i64 3
          store {}* %1, {}** %5, align 8
          %6 = call nonnull {}* @jl_f_apply_type({}* null, {}** nonnull %jlcallframe1.sub, i32 4)
; └└└└└└└└
;  @ REPL[16]:3 within `outer`
  ret void
}

@aviatesk I thought you mentioned otherwise in https://github.com/JuliaLang/julia/pull/54322#issuecomment-212689504?

aviatesk commented 1 month ago

I have confirmed that the original issue (https://github.com/JuliaLang/julia/issues/52938) has been fixed in backports-release-1.11 with the following code:

const CC = Core.Compiler
using Core: MethodInstance, CodeInstance, CodeInfo, MethodTable

## interpreter

# Select the method-table view used by the interpreter: wrap the overlay table in
# `CC.CachedMethodTable` when `CC` defines it, otherwise use the
# `CC.OverlayMethodTable` directly. `ExternalMethodTableView` names whichever
# type `get_method_table_view` returns.
if isdefined(CC, :CachedMethodTable)
    const ExternalMethodTableView = CC.CachedMethodTable{CC.OverlayMethodTable}
    get_method_table_view(world::UInt, mt::MethodTable) =
        CC.CachedMethodTable(CC.OverlayMethodTable(world, mt))
else
    const ExternalMethodTableView = CC.OverlayMethodTable
    get_method_table_view(world::UInt, mt::MethodTable) = CC.OverlayMethodTable(world, mt)
end

# `CC.AbstractInterpreter` subtype that carries a world age, a method-table view
# and its own vector of inference results.
struct ExternalInterpreter <: CC.AbstractInterpreter
    world::UInt
    method_table::ExternalMethodTableView

    # code_cache
    inf_cache::Vector{CC.InferenceResult}
end

# Keyword constructor. `world` defaults to the current world counter and is
# asserted not to exceed it; the required `method_table` keyword is wrapped with
# `get_method_table_view`; the inference cache starts out empty.
function ExternalInterpreter(world::UInt=Base.get_world_counter(); method_table)
    @assert world <= Base.get_world_counter()
    method_table = get_method_table_view(world, method_table)
    inf_cache = Vector{CC.InferenceResult}()

    return ExternalInterpreter(world, method_table, inf_cache)
end

# `CC` interface methods for `ExternalInterpreter`.

# Inference and optimization use the default parameter sets.
CC.InferenceParams(interp::ExternalInterpreter) = CC.InferenceParams()
CC.OptimizationParams(interp::ExternalInterpreter) = CC.OptimizationParams()
# World age and inference cache come straight from the interpreter's fields.
CC.get_inference_world(interp::ExternalInterpreter) = interp.world
CC.get_inference_cache(interp::ExternalInterpreter) = interp.inf_cache
# Cache owner token: a fixed `Symbol` named after the upstream issue.
CC.cache_owner(interp::ExternalInterpreter) = Symbol("JuliaLang/julia#52938")

# No need to do any locking since we're not putting our results into the runtime cache
CC.lock_mi_inference(interp::ExternalInterpreter, mi::MethodInstance) = nothing
CC.unlock_mi_inference(interp::ExternalInterpreter, mi::MethodInstance) = nothing

# Inference remarks are only emitted as `@debug` log messages.
function CC.add_remark!(interp::ExternalInterpreter, sv::CC.InferenceState, msg)
    @debug "Inference remark during External compilation of $(sv.linfo): $msg"
end

# Option hooks: everything is enabled except verbose statement info.
CC.may_optimize(interp::ExternalInterpreter) = true
CC.may_compress(interp::ExternalInterpreter) = true
CC.may_discard_trees(interp::ExternalInterpreter) = true
CC.verbose_stmt_info(interp::ExternalInterpreter) = false
# Method lookup goes through the interpreter's method-table view.
CC.method_table(interp::ExternalInterpreter) = interp.method_table

# main

# Declare the method table handed to the interpreter below.
Base.Experimental.@MethodTable(GLOBAL_METHOD_TABLE)

# Same call shape as the third MWE (an `@inline` call with an empty tuple type
# and a `Ref` keyword argument), written without any CUDA dependency.
inner(f, types::Type, args...; kwargs...) = nothing
outer(f) = @inline inner(f, Tuple{}; foo=Ref(42), bar=1)

# Build the interpreter over that table and request the IR of `outer(::Nothing)`;
# `only` requires exactly one result.
interp = ExternalInterpreter(; method_table=GLOBAL_METHOD_TABLE)
only(Base.code_ircode(outer, Tuple{Nothing}; interp))
1 ─     return nothing                                                             │
  => Nothing

So, it seems likely that there is some interaction with the implementation of CUDA's external abstract interpreter?

aviatesk commented 1 month ago

@maleadt Can you share an example that runs the external abstract interpreter used by CUDA.jl in a way that doesn't require a CUDA driver to be installed?

maleadt commented 4 weeks ago

Looks like one of the quirks is a contributing factor here. Adding the following overlay mimics that:

# Overlay `Base.throw_boundserror` in GLOBAL_METHOD_TABLE with an `@inline` method that only calls `error()`.
Base.Experimental.@overlay GLOBAL_METHOD_TABLE @inline Base.throw_boundserror(A, I) = error()
aviatesk commented 4 weeks ago

I see, in that case, it seems necessary to use https://github.com/JuliaLang/julia/pull/54322 and mark throw_boundserror as @consistent_overlay. Since throw_boundserror always just throws, it should be @consistent_overlay. I'm checking it locally.

aviatesk commented 4 weeks ago

Yeah, I confirmed it fixes the issue if we overlay it as @consistent_overlay CUDA_2241_MT @inline Base.throw_boundserror(A, I) = error().

maleadt commented 4 weeks ago

Since throw_boundserror always just throws, it should be @consistent_overlay.

FWIW, the actual implementation does some more work: https://github.com/JuliaGPU/CUDA.jl/blob/e1e5be2b6bf17f03a367cebeb18c4645e593f80d/src/device/quirks.jl#L10-L17

But it still ends in an unconditional throw, so I guess it's still consistent.

aviatesk commented 4 weeks ago

Ah I see, so our situation is trickier than I thought... In this case, both the original and @overlayed throw_boundserror will unconditionally throw, so it is indeed @consistent_overlay, but the @overlayed version modifies the global state of GPUCompiler.jl, so it is not effect_free and is actually not eligible for concrete evaluation.

In this case, the original version is :effect_free and eligible for concrete evaluation, so technically it's possible to concrete-evaluate it (which had been happening due to the bug we aimed to fix for v1.11), but allowing that doesn't seem like the correct approach. This is because it would require us to do something like

# Hypothetical variant that applies `Base.@assume_effects :effect_free` to a body
# which writes to `kernel_state().exception_info` before throwing.
macro gputhrow(subtype, reason) 
    quote Base.@assume_effects :effect_free begin
        # These two assignments mutate the object returned by `kernel_state()`.
        info = kernel_state().exception_info 
        info.subtype = @strptr $subtype 
        info.reason = @strptr $reason 
        throw(nothing) 
    end end
end 

, which is a totally wrong usage of Base.@assume_effects.

In this case I think it might be a good solution to incorporate an idea like LazyString into the implementation of @gputhrow. Is it possible to implement @gputhrow as follows?

"""
    GPUError(subtype, reason)

Plain container for the two pieces of information that `@gputhrow` receives.
Constructing it performs no work beyond storing the two values.
"""
struct GPUError
    subtype
    reason
end

"""
    @gputhrow subtype reason

Expand to `GPUError(subtype, reason)`.

Both arguments are escaped so that they are evaluated in the caller's scope;
without `esc`, a non-literal argument such as a local variable would be looked
up as a global of the module that defines this macro.
"""
macro gputhrow(subtype, reason)
    return :(GPUError($(esc(subtype)), $(esc(reason))))
end
# Delay the effect of `@gputhrow`: the kernel state is only written when the
# error is displayed. The hook Base calls to display an exception is
# `Base.showerror`; Base has no `show_error` function to extend.
function Base.showerror(io::IO, x::GPUError)
    info = kernel_state().exception_info
    info.subtype = @strptr x.subtype
    info.reason = @strptr x.reason
    # [...] show the proper exception using the updated kernel state here?
    return nothing
end

The goal of this implementation is to delay the computational effects of @gputhrow as much as possible and defer them until the error is actually shown.

maleadt commented 4 weeks ago

In this case I think it might be a good solution to incorporate an idea like LazyString into the implementation of @gputhrow. Is it possible to implement @gputhrow as follows?

Sadly not, we don't support try/catch in GPU code (stack unwinding, and setjmp/longjmp are not supported), so we can't actually throw an error object for evaluation outside of the code generated by @gputhrow. In fact, the throw(nothing) that's generated there is lowered to what's basically @llvm.trap(), aborting execution of the GPU kernel after the exception has been reported. In addition, this pattern would require dynamic allocation of a GPUError object, which CUDA.jl happens to support, but other back-ends don't.

aviatesk commented 3 weeks ago

@maleadt After further investigation, it seems that simply using @consistent_overlay should suffice (there's no need to modify the implementation of the overlayed throw_boundserror and refine its effects). CUDA's throw_boundserror does have effects, but those effects are ignored for this call graph at the @assume_effects :foldable annotation on typejoin. However, if @consistent_overlay is not used, the :nonoverlayed-bit of throw_boundserror gets tainted, which prevents typejoin from being concretely evaluated by GPUInterpreter. So just using @consistent_overlay would be sufficient and justified.

maleadt commented 3 weeks ago

CUDA's throw_boundserror does have effects, but those effects are ignored for this call graph at the @assume_effects :foldable annotation on typejoin.

I see. I hope effect mismatches from our other overrides (i.e. outside of the typejoin context, such as our math intrinsics) don't pose problems. For example, with a simple llvmcall many effects are different, which I assume is fine:

julia> Base.infer_effects(Base.cos, (Float32,))
(+c,+e,!n,+t,+s,+m,+i)

julia> cuda_cos(x::Float32) = ccall("extern __nv_cosf", llvmcall, Cfloat, (Cfloat,), x)  # `ccall` to the external symbol `__nv_cosf` using the `llvmcall` convention
cuda_cos (generic function with 1 method)

julia> Base.infer_effects(cuda_cos, (Float32,))
(!c,!e,!n,!t,!s,!m,+i)

Or our ^ implementation, which differs in :nothrow:

julia> Base.infer_effects(Base.:(^), (Float64, Int64))
(+c,+e,+n,+t,+s,+m,+i)

julia> function cuda_pow(x::Float64, y::Int64)
           # Exponents -1, 0, 1, 2 and 3 are handled by dedicated early returns.
           y == -1 && return inv(x)
           y == 0 && return one(x)
           y == 1 && return x
           y == 2 && return x*x
           y == 3 && return x*x*x
           # Any other exponent is converted to Float64 and passed to `^`.
           x ^ Float64(y)
       end
cuda_pow (generic function with 1 method)

julia> Base.infer_effects(cuda_pow, (Float64, Int64))
(+c,+e,!n,+t,+s,+m,+i)

Again, my doubts here are purely due to my not fully understanding the effects analysis, which is why I expressed some reservations in https://github.com/JuliaLang/julia/pull/54322#issuecomment-2126437421.

aviatesk commented 3 weeks ago

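Even if the effects mismatch, there is no problem using @consistent_overlay. To summarize the usage conditions of @consistent_overlay explained in JuliaLang/julia#54322 concisely:

[Editor's note: the list of conditions is missing from this copy of the thread. Per the `Base.Experimental.@consistent_overlay` docstring, they are: (1) if the original method returns a value, the overlay method returns an egal (`===`) value; (2) if the original method throws, the overlay method also throws, though the exception itself may differ.]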

As long as these conditions are met, there is no problem using @consistent_overlay f, and we don't need to care about the finer details of effect analysis.

maleadt commented 3 weeks ago

Thanks, that really helps!