Closed leios closed 1 year ago
Putting this here because I don't know where else to put it for now, but I need to use a workaround for code like

for i = 1:length(fxs)
    fxs[i](args...; kwargs[i]...)
end

due to a known compiler bug. I don't know if this has been mentioned upstream as of yet...
# Repro kernel for the reported compiler bug: iterates over a tuple of
# functions `t`, calling each with its matching entry from the tuples
# `t_args` (positional args) and `t_kwargs` (keyword args), and storing the
# result into `a[tid]`.
# NOTE(review): every loop iteration overwrites a[tid], so only the last
# call's result survives — presumably intentional here, since this kernel
# only exists to demonstrate the tuple-indexing failure, not to compute
# anything meaningful.
@kernel function check_kernel(a, t, t_args, t_kwargs)
tid = @index(Global, Linear)
# Indexing a heterogeneous tuple with a runtime index `i` forces a dynamic
# getfield (jl_f_getfield / ijl_get_nth_field_checked in the stacktraces
# below), which GPU codegen cannot compile once the tuple elements have
# differing types — hence the InvalidIRError with more than one entry.
for i = 1:length(t)
a[tid] = t[i](t_args[i]...; t_kwargs[i]...)
end
end
...
# Shorthand for "CuArray(ones(10)) on CUDA, or ROCArray(ones(10)) on AMD" —
# the failure reproduces identically on both backends (see the two
# stacktraces below).
a = Cu/ROCArray(ones(10))
# works
# A single-element tuple compiles fine: the element type is uniform, so
# t[i] / t_args[i] / t_kwargs[i] never need a dynamic getfield.
t = (f1,)
t_args = ((1,2),)
t_kwargs = ((c = 3,),)
check(a, t, t_args, t_kwargs)
# Doesn't work
# Two elements with heterogeneous kwarg NamedTuple types ((c=...,) vs
# (d=...,)) make the runtime tuple index un-inferrable -> InvalidIRError.
t = (f1,f1)
t_args = ((1,2),(2,2))
t_kwargs = ((c = 3,),(d = 2,))
check(a, t, t_args, t_kwargs)
The solution is just to unroll the loop via a generated function, but that might not work for more complicated expressions, like those planned in #2
CUDA Errors:
ERROR: InvalidIRError: compiling kernel #gpu_check_kernel(KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(256,)}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Nothing}}, CuDeviceVector{Float64, 1}, Tuple{typeof(f1), typeof(f1)}, Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}, Tuple{NamedTuple{(:c,), Tuple{Int64}}, NamedTuple{(:d,), Tuple{Int64}}}) resulted in invalid LLVM IR
Reason: unsupported call to an unknown function (call to jl_f_getfield)
Stacktrace:
[1] getindex
@ ./tuple.jl:29
[2] macro expansion
@ ~/projects/sketches/tuple_test.jl:29
[3] gpu_check_kernel
@ ~/.julia/packages/KernelAbstractions/C8flJ/src/macros.jl:81
[4] gpu_check_kernel
@ ./none:0
Hint: catch this exception as `err` and call `code_typed(err; interactive = true)` to introspect the erronous code with Cthulhu.jl
Stacktrace:
[1] check_ir(job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams, GPUCompiler.FunctionSpec{typeof(gpu_check_kernel), Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(256,)}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Nothing}}, CuDeviceVector{Float64, 1}, Tuple{typeof(f1), typeof(f1)}, Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}, Tuple{NamedTuple{(:c,), Tuple{Int64}}, NamedTuple{(:d,), Tuple{Int64}}}}}}, args::LLVM.Module)
@ GPUCompiler ~/.julia/packages/GPUCompiler/S3TWf/src/validation.jl:141
[2] macro expansion
@ ~/.julia/packages/GPUCompiler/S3TWf/src/driver.jl:418 [inlined]
[3] macro expansion
@ ~/.julia/packages/TimerOutputs/LHjFw/src/TimerOutput.jl:253 [inlined]
[4] macro expansion
@ ~/.julia/packages/GPUCompiler/S3TWf/src/driver.jl:417 [inlined]
[5] emit_llvm(job::GPUCompiler.CompilerJob, method_instance::Any; libraries::Bool, deferred_codegen::Bool, optimize::Bool, cleanup::Bool, only_entry::Bool, validate::Bool, ctx::LLVM.ThreadSafeContext)
@ GPUCompiler ~/.julia/packages/GPUCompiler/S3TWf/src/utils.jl:83
[6] cufunction_compile(job::GPUCompiler.CompilerJob, ctx::LLVM.ThreadSafeContext)
@ CUDA ~/.julia/packages/CUDA/ZdCxS/src/compiler/execution.jl:360
[7] #221
@ ~/.julia/packages/CUDA/ZdCxS/src/compiler/execution.jl:354 [inlined]
[8] LLVM.ThreadSafeContext(f::CUDA.var"#221#222"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams, GPUCompiler.FunctionSpec{typeof(gpu_check_kernel), Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(256,)}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Nothing}}, CuDeviceVector{Float64, 1}, Tuple{typeof(f1), typeof(f1)}, Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}, Tuple{NamedTuple{(:c,), Tuple{Int64}}, NamedTuple{(:d,), Tuple{Int64}}}}}}})
@ LLVM ~/.julia/packages/LLVM/HykgZ/src/executionengine/ts_module.jl:14
[9] JuliaContext(f::CUDA.var"#221#222"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams, GPUCompiler.FunctionSpec{typeof(gpu_check_kernel), Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(256,)}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Nothing}}, CuDeviceVector{Float64, 1}, Tuple{typeof(f1), typeof(f1)}, Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}, Tuple{NamedTuple{(:c,), Tuple{Int64}}, NamedTuple{(:d,), Tuple{Int64}}}}}}})
@ GPUCompiler ~/.julia/packages/GPUCompiler/S3TWf/src/driver.jl:74
[10] cufunction_compile(job::GPUCompiler.CompilerJob)
@ CUDA ~/.julia/packages/CUDA/ZdCxS/src/compiler/execution.jl:353
[11] cached_compilation(cache::Dict{UInt64, Any}, job::GPUCompiler.CompilerJob, compiler::typeof(CUDA.cufunction_compile), linker::typeof(CUDA.cufunction_link))
@ GPUCompiler ~/.julia/packages/GPUCompiler/S3TWf/src/cache.jl:90
[12] cufunction(f::typeof(gpu_check_kernel), tt::Type{Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(256,)}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Nothing}}, CuDeviceVector{Float64, 1}, Tuple{typeof(f1), typeof(f1)}, Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}, Tuple{NamedTuple{(:c,), Tuple{Int64}}, NamedTuple{(:d,), Tuple{Int64}}}}}; name::Nothing, always_inline::Bool, kwargs::Base.Pairs{Symbol, Int64, Tuple{Symbol}, NamedTuple{(:maxthreads,), Tuple{Int64}}})
@ CUDA ~/.julia/packages/CUDA/ZdCxS/src/compiler/execution.jl:306
[13] macro expansion
@ ~/.julia/packages/CUDA/ZdCxS/src/compiler/execution.jl:102 [inlined]
[14] (::KernelAbstractions.Kernel{CUDADevice{false, false}, KernelAbstractions.NDIteration.StaticSize{(256,)}, KernelAbstractions.NDIteration.DynamicSize, typeof(gpu_check_kernel)})(::CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, ::Vararg{Any}; ndrange::Int64, dependencies::CUDAKernels.CudaEvent, workgroupsize::Nothing, progress::Function)
@ CUDAKernels ~/.julia/packages/CUDAKernels/3IKLV/src/CUDAKernels.jl:283
[15] check(a::CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, t::Tuple{typeof(f1), typeof(f1)}, t_args::Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}, t_kwargs::Tuple{NamedTuple{(:c,), Tuple{Int64}}, NamedTuple{(:d,), Tuple{Int64}}})
@ Main ~/projects/sketches/tuple_test.jl:21
[16] top-level scope
@ REPL[9]:1
[17] top-level scope
@ ~/.julia/packages/CUDA/ZdCxS/src/initialization.jl:155
AMD Error:
julia> check(a, t, t_args, t_kwargs)
ERROR: InvalidIRError: compiling kernel gpu_check_kernel(KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(256,)}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Nothing}}, AMDGPU.Device.ROCDeviceVector{Float64, 1}, Tuple{typeof(f1), typeof(f1)}, Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}, Tuple{NamedTuple{(:c,), Tuple{Int64}}, NamedTuple{(:d,), Tuple{Int64}}}) resulted in invalid LLVM IR
Reason: unsupported call to an unknown function (call to ijl_get_nth_field_checked)
Stacktrace:
[1] getindex
@ ./tuple.jl:29
[2] macro expansion
@ ~/projects/Fae.jl/sketches/KA_test.jl:28
[3] gpu_check_kernel
@ ~/.julia/packages/KernelAbstractions/C8flJ/src/macros.jl:81
[4] gpu_check_kernel
@ ./none:0
Hint: catch this exception as `err` and call `code_typed(err; interactive = true)` to introspect the erronous code with Cthulhu.jl
Stacktrace:
[1] check_ir(job::GPUCompiler.CompilerJob{GPUCompiler.GCNCompilerTarget, AMDGPU.Compiler.ROCCompilerParams, GPUCompiler.FunctionSpec{typeof(gpu_check_kernel), Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(256,)}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Nothing}}, AMDGPU.Device.ROCDeviceVector{Float64, 1}, Tuple{typeof(f1), typeof(f1)}, Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}, Tuple{NamedTuple{(:c,), Tuple{Int64}}, NamedTuple{(:d,), Tuple{Int64}}}}}}, args::LLVM.Module)
@ GPUCompiler ~/.julia/packages/GPUCompiler/S3TWf/src/validation.jl:141
[2] macro expansion
@ ~/.julia/packages/GPUCompiler/S3TWf/src/driver.jl:418 [inlined]
[3] macro expansion
@ ~/.julia/packages/TimerOutputs/LHjFw/src/TimerOutput.jl:253 [inlined]
[4] macro expansion
@ ~/.julia/packages/GPUCompiler/S3TWf/src/driver.jl:417 [inlined]
[5] emit_llvm(job::GPUCompiler.CompilerJob, method_instance::Any; libraries::Bool, deferred_codegen::Bool, optimize::Bool, cleanup::Bool, only_entry::Bool, validate::Bool, ctx::LLVM.ThreadSafeContext)
@ GPUCompiler ~/.julia/packages/GPUCompiler/S3TWf/src/utils.jl:83
[6] emit_llvm
@ ~/.julia/packages/GPUCompiler/S3TWf/src/utils.jl:77 [inlined]
[7] (::AMDGPU.Compiler.var"#59#62"{GPUCompiler.CompilerJob{GPUCompiler.GCNCompilerTarget, AMDGPU.Compiler.ROCCompilerParams, GPUCompiler.FunctionSpec{typeof(gpu_check_kernel), Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(256,)}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Nothing}}, AMDGPU.Device.ROCDeviceVector{Float64, 1}, Tuple{typeof(f1), typeof(f1)}, Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}, Tuple{NamedTuple{(:c,), Tuple{Int64}}, NamedTuple{(:d,), Tuple{Int64}}}}}}, Core.MethodInstance})(ctx::LLVM.ThreadSafeContext)
@ AMDGPU.Compiler ~/.julia/packages/AMDGPU/bzHD4/src/compiler/codegen.jl:183
[8] LLVM.ThreadSafeContext(f::AMDGPU.Compiler.var"#59#62"{GPUCompiler.CompilerJob{GPUCompiler.GCNCompilerTarget, AMDGPU.Compiler.ROCCompilerParams, GPUCompiler.FunctionSpec{typeof(gpu_check_kernel), Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(256,)}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Nothing}}, AMDGPU.Device.ROCDeviceVector{Float64, 1}, Tuple{typeof(f1), typeof(f1)}, Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}, Tuple{NamedTuple{(:c,), Tuple{Int64}}, NamedTuple{(:d,), Tuple{Int64}}}}}}, Core.MethodInstance})
@ LLVM ~/.julia/packages/LLVM/HykgZ/src/executionengine/ts_module.jl:14
[9] JuliaContext(f::AMDGPU.Compiler.var"#59#62"{GPUCompiler.CompilerJob{GPUCompiler.GCNCompilerTarget, AMDGPU.Compiler.ROCCompilerParams, GPUCompiler.FunctionSpec{typeof(gpu_check_kernel), Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(256,)}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Nothing}}, AMDGPU.Device.ROCDeviceVector{Float64, 1}, Tuple{typeof(f1), typeof(f1)}, Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}, Tuple{NamedTuple{(:c,), Tuple{Int64}}, NamedTuple{(:d,), Tuple{Int64}}}}}}, Core.MethodInstance})
@ GPUCompiler ~/.julia/packages/GPUCompiler/S3TWf/src/driver.jl:74
[10] rocfunction_compile(job::GPUCompiler.CompilerJob)
@ AMDGPU.Compiler ~/.julia/packages/AMDGPU/bzHD4/src/compiler/codegen.jl:182
[11] cached_compilation(cache::Dict{UInt64, Any}, job::GPUCompiler.CompilerJob, compiler::typeof(AMDGPU.Compiler.rocfunction_compile), linker::typeof(AMDGPU.Compiler.rocfunction_link))
@ GPUCompiler ~/.julia/packages/GPUCompiler/S3TWf/src/cache.jl:90
[12] rocfunction(f::typeof(gpu_check_kernel), tt::Type; name::String, device::ROCDevice, global_hooks::NamedTuple{(), Tuple{}})
@ AMDGPU.Compiler ~/.julia/packages/AMDGPU/bzHD4/src/compiler/codegen.jl:165
[13] rocfunction
@ ~/.julia/packages/AMDGPU/bzHD4/src/compiler/codegen.jl:154 [inlined]
[14] macro expansion
@ ~/.julia/packages/AMDGPU/bzHD4/src/highlevel.jl:430 [inlined]
[15] (::KernelAbstractions.Kernel{ROCDevice, KernelAbstractions.NDIteration.StaticSize{(256,)}, KernelAbstractions.NDIteration.DynamicSize, typeof(gpu_check_kernel)})(::ROCVector{Float64}, ::Vararg{Any}; ndrange::Int64, dependencies::Nothing, workgroupsize::Nothing, progress::Nothing)
@ ROCKernels ~/.julia/packages/ROCKernels/TyQpD/src/ROCKernels.jl:197
[16] check(a::ROCVector{Float64}, t::Tuple{typeof(f1), typeof(f1)}, t_args::Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}, t_kwargs::Tuple{NamedTuple{(:c,), Tuple{Int64}}, NamedTuple{(:d,), Tuple{Int64}}})
@ Main ~/projects/Fae.jl/sketches/KA_test.jl:21
[17] top-level scope
@ REPL[9]:1
Fae currently works by creating custom `fx_strings` defined by the user; however, this leads to the error in #61 on Julia 1.9 and newer. It also creates an unnecessary dynamic function call when we know the functions, kwargs, etc. well before the kernels are called. Here is a rough list of things to do:

- Support `f_dynamic = f(a = userfx(1, 10, frame))`, where `1` and `10` could be start and end frames.
- Support `Hutchinson(kwargs...)`; note this should error if there are multiple fums with the same kwargs. Otherwise, you might set all 4 points of the square to `P = (0,0)` or something.
- Support `Hutchinson(fum_name(kwargs))`, which will search for the right fum first?
- Support `((fum_1, fum_2), fum_3, ...)`, where the tuples define the tree of possible actions at each step (so choose `fum_1` or `fum_2`, then use `fum_3`).
- Address the `@invokelatest` calls currently in use at kernel launches.