Closed leios closed 1 year ago
Putting this here because I don't know where else to put it for now, but I need to use a workaround for code like

for i = 1:length(fxs)
    fxs[i](args...; kwargs[i]...)
end

due to a known compiler bug. I don't know if this has been mentioned upstream as of yet...
# Repro kernel for the reported compiler bug: iterates over a tuple of
# functions `t`, calling each with its matching entry from the tuples
# `t_args` (positional args) and `t_kwargs` (keyword args), and storing the
# result into `a[tid]`.
# NOTE(review): every loop iteration overwrites a[tid], so only the last
# call's result survives — presumably intentional here, since this kernel
# only exists to demonstrate the tuple-indexing failure, not to compute
# anything meaningful.
@kernel function check_kernel(a, t, t_args, t_kwargs)
tid = @index(Global, Linear)
# Indexing a heterogeneous tuple with a runtime index `i` forces a dynamic
# getfield (jl_f_getfield / ijl_get_nth_field_checked in the stacktraces
# below), which GPU codegen cannot compile once the tuple elements have
# differing types — hence the InvalidIRError with more than one entry.
for i = 1:length(t)
a[tid] = t[i](t_args[i]...; t_kwargs[i]...)
end
end
...
# Shorthand for "CuArray(ones(10)) on CUDA, or ROCArray(ones(10)) on AMD" —
# the failure reproduces identically on both backends (see the two
# stacktraces below).
a = Cu/ROCArray(ones(10))
# works
# A single-element tuple compiles fine: the element type is uniform, so
# t[i] / t_args[i] / t_kwargs[i] never need a dynamic getfield.
t = (f1,)
t_args = ((1,2),)
t_kwargs = ((c = 3,),)
check(a, t, t_args, t_kwargs)
# Doesn't work
# Two elements with heterogeneous kwarg NamedTuple types ((c=...,) vs
# (d=...,)) make the runtime tuple index un-inferrable -> InvalidIRError.
t = (f1,f1)
t_args = ((1,2),(2,2))
t_kwargs = ((c = 3,),(d = 2,))
check(a, t, t_args, t_kwargs)
The solution is just to unroll the loop via a generated function, but that might not work for more complicated expressions, like those planned in #2
CUDA Errors:
ERROR: InvalidIRError: compiling kernel #gpu_check_kernel(KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(256,)}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Nothing}}, CuDeviceVector{Float64, 1}, Tuple{typeof(f1), typeof(f1)}, Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}, Tuple{NamedTuple{(:c,), Tuple{Int64}}, NamedTuple{(:d,), Tuple{Int64}}}) resulted in invalid LLVM IR
Reason: unsupported call to an unknown function (call to jl_f_getfield)
Stacktrace:
[1] getindex
@ ./tuple.jl:29
[2] macro expansion
@ ~/projects/sketches/tuple_test.jl:29
[3] gpu_check_kernel
@ ~/.julia/packages/KernelAbstractions/C8flJ/src/macros.jl:81
[4] gpu_check_kernel
@ ./none:0
Hint: catch this exception as `err` and call `code_typed(err; interactive = true)` to introspect the erronous code with Cthulhu.jl
Stacktrace:
[1] check_ir(job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams, GPUCompiler.FunctionSpec{typeof(gpu_check_kernel), Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(256,)}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Nothing}}, CuDeviceVector{Float64, 1}, Tuple{typeof(f1), typeof(f1)}, Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}, Tuple{NamedTuple{(:c,), Tuple{Int64}}, NamedTuple{(:d,), Tuple{Int64}}}}}}, args::LLVM.Module)
@ GPUCompiler ~/.julia/packages/GPUCompiler/S3TWf/src/validation.jl:141
[2] macro expansion
@ ~/.julia/packages/GPUCompiler/S3TWf/src/driver.jl:418 [inlined]
[3] macro expansion
@ ~/.julia/packages/TimerOutputs/LHjFw/src/TimerOutput.jl:253 [inlined]
[4] macro expansion
@ ~/.julia/packages/GPUCompiler/S3TWf/src/driver.jl:417 [inlined]
[5] emit_llvm(job::GPUCompiler.CompilerJob, method_instance::Any; libraries::Bool, deferred_codegen::Bool, optimize::Bool, cleanup::Bool, only_entry::Bool, validate::Bool, ctx::LLVM.ThreadSafeContext)
@ GPUCompiler ~/.julia/packages/GPUCompiler/S3TWf/src/utils.jl:83
[6] cufunction_compile(job::GPUCompiler.CompilerJob, ctx::LLVM.ThreadSafeContext)
@ CUDA ~/.julia/packages/CUDA/ZdCxS/src/compiler/execution.jl:360
[7] #221
@ ~/.julia/packages/CUDA/ZdCxS/src/compiler/execution.jl:354 [inlined]
[8] LLVM.ThreadSafeContext(f::CUDA.var"#221#222"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams, GPUCompiler.FunctionSpec{typeof(gpu_check_kernel), Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(256,)}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Nothing}}, CuDeviceVector{Float64, 1}, Tuple{typeof(f1), typeof(f1)}, Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}, Tuple{NamedTuple{(:c,), Tuple{Int64}}, NamedTuple{(:d,), Tuple{Int64}}}}}}})
@ LLVM ~/.julia/packages/LLVM/HykgZ/src/executionengine/ts_module.jl:14
[9] JuliaContext(f::CUDA.var"#221#222"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams, GPUCompiler.FunctionSpec{typeof(gpu_check_kernel), Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(256,)}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Nothing}}, CuDeviceVector{Float64, 1}, Tuple{typeof(f1), typeof(f1)}, Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}, Tuple{NamedTuple{(:c,), Tuple{Int64}}, NamedTuple{(:d,), Tuple{Int64}}}}}}})
@ GPUCompiler ~/.julia/packages/GPUCompiler/S3TWf/src/driver.jl:74
[10] cufunction_compile(job::GPUCompiler.CompilerJob)
@ CUDA ~/.julia/packages/CUDA/ZdCxS/src/compiler/execution.jl:353
[11] cached_compilation(cache::Dict{UInt64, Any}, job::GPUCompiler.CompilerJob, compiler::typeof(CUDA.cufunction_compile), linker::typeof(CUDA.cufunction_link))
@ GPUCompiler ~/.julia/packages/GPUCompiler/S3TWf/src/cache.jl:90
[12] cufunction(f::typeof(gpu_check_kernel), tt::Type{Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(256,)}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Nothing}}, CuDeviceVector{Float64, 1}, Tuple{typeof(f1), typeof(f1)}, Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}, Tuple{NamedTuple{(:c,), Tuple{Int64}}, NamedTuple{(:d,), Tuple{Int64}}}}}; name::Nothing, always_inline::Bool, kwargs::Base.Pairs{Symbol, Int64, Tuple{Symbol}, NamedTuple{(:maxthreads,), Tuple{Int64}}})
@ CUDA ~/.julia/packages/CUDA/ZdCxS/src/compiler/execution.jl:306
[13] macro expansion
@ ~/.julia/packages/CUDA/ZdCxS/src/compiler/execution.jl:102 [inlined]
[14] (::KernelAbstractions.Kernel{CUDADevice{false, false}, KernelAbstractions.NDIteration.StaticSize{(256,)}, KernelAbstractions.NDIteration.DynamicSize, typeof(gpu_check_kernel)})(::CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, ::Vararg{Any}; ndrange::Int64, dependencies::CUDAKernels.CudaEvent, workgroupsize::Nothing, progress::Function)
@ CUDAKernels ~/.julia/packages/CUDAKernels/3IKLV/src/CUDAKernels.jl:283
[15] check(a::CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, t::Tuple{typeof(f1), typeof(f1)}, t_args::Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}, t_kwargs::Tuple{NamedTuple{(:c,), Tuple{Int64}}, NamedTuple{(:d,), Tuple{Int64}}})
@ Main ~/projects/sketches/tuple_test.jl:21
[16] top-level scope
@ REPL[9]:1
[17] top-level scope
@ ~/.julia/packages/CUDA/ZdCxS/src/initialization.jl:155
AMD Error:
julia> check(a, t, t_args, t_kwargs)
ERROR: InvalidIRError: compiling kernel gpu_check_kernel(KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(256,)}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Nothing}}, AMDGPU.Device.ROCDeviceVector{Float64, 1}, Tuple{typeof(f1), typeof(f1)}, Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}, Tuple{NamedTuple{(:c,), Tuple{Int64}}, NamedTuple{(:d,), Tuple{Int64}}}) resulted in invalid LLVM IR
Reason: unsupported call to an unknown function (call to ijl_get_nth_field_checked)
Stacktrace:
[1] getindex
@ ./tuple.jl:29
[2] macro expansion
@ ~/projects/Fae.jl/sketches/KA_test.jl:28
[3] gpu_check_kernel
@ ~/.julia/packages/KernelAbstractions/C8flJ/src/macros.jl:81
[4] gpu_check_kernel
@ ./none:0
Hint: catch this exception as `err` and call `code_typed(err; interactive = true)` to introspect the erronous code with Cthulhu.jl
Stacktrace:
[1] check_ir(job::GPUCompiler.CompilerJob{GPUCompiler.GCNCompilerTarget, AMDGPU.Compiler.ROCCompilerParams, GPUCompiler.FunctionSpec{typeof(gpu_check_kernel), Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(256,)}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Nothing}}, AMDGPU.Device.ROCDeviceVector{Float64, 1}, Tuple{typeof(f1), typeof(f1)}, Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}, Tuple{NamedTuple{(:c,), Tuple{Int64}}, NamedTuple{(:d,), Tuple{Int64}}}}}}, args::LLVM.Module)
@ GPUCompiler ~/.julia/packages/GPUCompiler/S3TWf/src/validation.jl:141
[2] macro expansion
@ ~/.julia/packages/GPUCompiler/S3TWf/src/driver.jl:418 [inlined]
[3] macro expansion
@ ~/.julia/packages/TimerOutputs/LHjFw/src/TimerOutput.jl:253 [inlined]
[4] macro expansion
@ ~/.julia/packages/GPUCompiler/S3TWf/src/driver.jl:417 [inlined]
[5] emit_llvm(job::GPUCompiler.CompilerJob, method_instance::Any; libraries::Bool, deferred_codegen::Bool, optimize::Bool, cleanup::Bool, only_entry::Bool, validate::Bool, ctx::LLVM.ThreadSafeContext)
@ GPUCompiler ~/.julia/packages/GPUCompiler/S3TWf/src/utils.jl:83
[6] emit_llvm
@ ~/.julia/packages/GPUCompiler/S3TWf/src/utils.jl:77 [inlined]
[7] (::AMDGPU.Compiler.var"#59#62"{GPUCompiler.CompilerJob{GPUCompiler.GCNCompilerTarget, AMDGPU.Compiler.ROCCompilerParams, GPUCompiler.FunctionSpec{typeof(gpu_check_kernel), Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(256,)}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Nothing}}, AMDGPU.Device.ROCDeviceVector{Float64, 1}, Tuple{typeof(f1), typeof(f1)}, Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}, Tuple{NamedTuple{(:c,), Tuple{Int64}}, NamedTuple{(:d,), Tuple{Int64}}}}}}, Core.MethodInstance})(ctx::LLVM.ThreadSafeContext)
@ AMDGPU.Compiler ~/.julia/packages/AMDGPU/bzHD4/src/compiler/codegen.jl:183
[8] LLVM.ThreadSafeContext(f::AMDGPU.Compiler.var"#59#62"{GPUCompiler.CompilerJob{GPUCompiler.GCNCompilerTarget, AMDGPU.Compiler.ROCCompilerParams, GPUCompiler.FunctionSpec{typeof(gpu_check_kernel), Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(256,)}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Nothing}}, AMDGPU.Device.ROCDeviceVector{Float64, 1}, Tuple{typeof(f1), typeof(f1)}, Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}, Tuple{NamedTuple{(:c,), Tuple{Int64}}, NamedTuple{(:d,), Tuple{Int64}}}}}}, Core.MethodInstance})
@ LLVM ~/.julia/packages/LLVM/HykgZ/src/executionengine/ts_module.jl:14
[9] JuliaContext(f::AMDGPU.Compiler.var"#59#62"{GPUCompiler.CompilerJob{GPUCompiler.GCNCompilerTarget, AMDGPU.Compiler.ROCCompilerParams, GPUCompiler.FunctionSpec{typeof(gpu_check_kernel), Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(256,)}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Nothing}}, AMDGPU.Device.ROCDeviceVector{Float64, 1}, Tuple{typeof(f1), typeof(f1)}, Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}, Tuple{NamedTuple{(:c,), Tuple{Int64}}, NamedTuple{(:d,), Tuple{Int64}}}}}}, Core.MethodInstance})
@ GPUCompiler ~/.julia/packages/GPUCompiler/S3TWf/src/driver.jl:74
[10] rocfunction_compile(job::GPUCompiler.CompilerJob)
@ AMDGPU.Compiler ~/.julia/packages/AMDGPU/bzHD4/src/compiler/codegen.jl:182
[11] cached_compilation(cache::Dict{UInt64, Any}, job::GPUCompiler.CompilerJob, compiler::typeof(AMDGPU.Compiler.rocfunction_compile), linker::typeof(AMDGPU.Compiler.rocfunction_link))
@ GPUCompiler ~/.julia/packages/GPUCompiler/S3TWf/src/cache.jl:90
[12] rocfunction(f::typeof(gpu_check_kernel), tt::Type; name::String, device::ROCDevice, global_hooks::NamedTuple{(), Tuple{}})
@ AMDGPU.Compiler ~/.julia/packages/AMDGPU/bzHD4/src/compiler/codegen.jl:165
[13] rocfunction
@ ~/.julia/packages/AMDGPU/bzHD4/src/compiler/codegen.jl:154 [inlined]
[14] macro expansion
@ ~/.julia/packages/AMDGPU/bzHD4/src/highlevel.jl:430 [inlined]
[15] (::KernelAbstractions.Kernel{ROCDevice, KernelAbstractions.NDIteration.StaticSize{(256,)}, KernelAbstractions.NDIteration.DynamicSize, typeof(gpu_check_kernel)})(::ROCVector{Float64}, ::Vararg{Any}; ndrange::Int64, dependencies::Nothing, workgroupsize::Nothing, progress::Nothing)
@ ROCKernels ~/.julia/packages/ROCKernels/TyQpD/src/ROCKernels.jl:197
[16] check(a::ROCVector{Float64}, t::Tuple{typeof(f1), typeof(f1)}, t_args::Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}, t_kwargs::Tuple{NamedTuple{(:c,), Tuple{Int64}}, NamedTuple{(:d,), Tuple{Int64}}})
@ Main ~/projects/Fae.jl/sketches/KA_test.jl:21
[17] top-level scope
@ REPL[9]:1
Fae currently works by creating custom `fx_strings` defined by the user; however, this leads to the error in #61 on Julia 1.9 and newer. It also creates an unnecessary dynamic function call when we know the functions, kwargs, etc. well before the kernels are called. Here is a rough list of things to do:

- Support `f_dynamic = f(a = userfx(1, 10, frame))`, where `1` and `10` could be start and end frames.
- Support `Hutchinson(kwargs...)`; note this should error if there are multiple fums with the same kwargs. Otherwise, you might set all 4 points of the square to `P = (0,0)` or something.
- Support `Hutchinson(fum_name(kwargs))`, which will search for the right fum first?
- Support `((fum_1, fum_2), fum_3, ...)`, where the tuples define the tree of possible actions at each step (so choose `fum_1` or `fum_2`, then use `fum_3`).
- Address the `@invokelatest` calls currently in use at kernel launches.