DhruvDh opened this issue 6 months ago
I tried to run this on AMDGPU and Metal; on both backends I get something very similar to the following -
ERROR: LoadError: InvalidIRError: compiling MethodInstance for gpu_self_dot_kernel!(::KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(32,)}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Nothing}}, ::AMDGPU.Device.ROCDeviceVector{Float16, 1}, ::AMDGPU.Device.ROCDeviceVector{Float16, 1}) resulted in invalid LLVM IR
Reason: unsupported call through a literal pointer (call to ijl_alloc_array_1d)
Stacktrace:
[1] Array
@ ./boot.jl:477
[2] vect
@ ./array.jl:163
[3] macro expansion
@ /media/dhruvdh/External/Dropbox/documents/repositories/Llama2.jl/test.jl:19
[4] gpu_self_dot_kernel!
@ ~/.julia/packages/KernelAbstractions/zPAn3/src/macros.jl:95
[5] gpu_self_dot_kernel!
@ ./none:0
Hint: catch this exception as `err` and call `code_typed(err; interactive = true)` to introspect the erronous code with Cthulhu.jl
Stacktrace:
[1] check_ir(job::GPUCompiler.CompilerJob{GPUCompiler.GCNCompilerTarget, AMDGPU.Compiler.HIPCompilerParams}, args::LLVM.Module)
@ GPUCompiler ~/.julia/packages/GPUCompiler/C55Eh/src/validation.jl:147
[2] macro expansion
@ ~/.julia/packages/GPUCompiler/C55Eh/src/driver.jl:445 [inlined]
[3] macro expansion
@ ~/.julia/packages/TimerOutputs/RsWnF/src/TimerOutput.jl:253 [inlined]
[4] macro expansion
@ ~/.julia/packages/GPUCompiler/C55Eh/src/driver.jl:444 [inlined]
[5] emit_llvm(job::GPUCompiler.CompilerJob; libraries::Bool, toplevel::Bool, optimize::Bool, cleanup::Bool, only_entry::Bool, validate::Bool)
@ GPUCompiler ~/.julia/packages/GPUCompiler/C55Eh/src/utils.jl:92
[6] emit_llvm
@ ~/.julia/packages/GPUCompiler/C55Eh/src/utils.jl:86 [inlined]
[7] codegen(output::Symbol, job::GPUCompiler.CompilerJob; libraries::Bool, toplevel::Bool, optimize::Bool, cleanup::Bool, strip::Bool, validate::Bool, only_entry::Bool, parent_job::Nothing)
@ GPUCompiler ~/.julia/packages/GPUCompiler/C55Eh/src/driver.jl:134
[8] codegen
@ ~/.julia/packages/GPUCompiler/C55Eh/src/driver.jl:115 [inlined]
[9] compile(target::Symbol, job::GPUCompiler.CompilerJob; libraries::Bool, toplevel::Bool, optimize::Bool, cleanup::Bool, strip::Bool, validate::Bool, only_entry::Bool)
@ GPUCompiler ~/.julia/packages/GPUCompiler/C55Eh/src/driver.jl:111
[10] compile
@ ~/.julia/packages/GPUCompiler/C55Eh/src/driver.jl:103 [inlined]
[11] #40
@ ~/.julia/packages/AMDGPU/gtxsf/src/compiler/codegen.jl:172 [inlined]
[12] JuliaContext(f::AMDGPU.Compiler.var"#40#41"{GPUCompiler.CompilerJob{GPUCompiler.GCNCompilerTarget, AMDGPU.Compiler.HIPCompilerParams}}; kwargs::@Kwargs{})
@ GPUCompiler ~/.julia/packages/GPUCompiler/C55Eh/src/driver.jl:52
[13] JuliaContext(f::Function)
@ GPUCompiler ~/.julia/packages/GPUCompiler/C55Eh/src/driver.jl:42
[14] hipcompile(job::GPUCompiler.CompilerJob)
@ AMDGPU.Compiler ~/.julia/packages/AMDGPU/gtxsf/src/compiler/codegen.jl:171
[15] actual_compilation(cache::Dict{Any, AMDGPU.HIP.HIPFunction}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{GPUCompiler.GCNCompilerTarget, AMDGPU.Compiler.HIPCompilerParams}, compiler::typeof(AMDGPU.Compiler.hipcompile), linker::typeof(AMDGPU.Compiler.hiplink))
@ GPUCompiler ~/.julia/packages/GPUCompiler/C55Eh/src/execution.jl:125
[16] cached_compilation(cache::Dict{Any, AMDGPU.HIP.HIPFunction}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{GPUCompiler.GCNCompilerTarget, AMDGPU.Compiler.HIPCompilerParams}, compiler::Function, linker::Function)
@ GPUCompiler ~/.julia/packages/GPUCompiler/C55Eh/src/execution.jl:103
[17] macro expansion
@ ~/.julia/packages/AMDGPU/gtxsf/src/compiler/codegen.jl:139 [inlined]
[18] macro expansion
@ ./lock.jl:267 [inlined]
[19] hipfunction(f::typeof(gpu_self_dot_kernel!), tt::Type{Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(32,)}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Nothing}}, AMDGPU.Device.ROCDeviceVector{Float16, 1}, AMDGPU.Device.ROCDeviceVector{Float16, 1}}}; kwargs::@Kwargs{})
@ AMDGPU.Compiler ~/.julia/packages/AMDGPU/gtxsf/src/compiler/codegen.jl:133
[20] hipfunction(f::typeof(gpu_self_dot_kernel!), tt::Type{Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(32,)}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Nothing}}, AMDGPU.Device.ROCDeviceVector{Float16, 1}, AMDGPU.Device.ROCDeviceVector{Float16, 1}}})
@ AMDGPU.Compiler ~/.julia/packages/AMDGPU/gtxsf/src/compiler/codegen.jl:132
[21] macro expansion
@ ~/.julia/packages/AMDGPU/gtxsf/src/highlevel.jl:172 [inlined]
[22] (::KernelAbstractions.Kernel{ROCBackend, KernelAbstractions.NDIteration.StaticSize{(32,)}, KernelAbstractions.NDIteration.DynamicSize, typeof(gpu_self_dot_kernel!)})(::ROCArray{Float16, 1, AMDGPU.Runtime.Mem.HIPBuffer}, ::Vararg{ROCArray{Float16, 1, AMDGPU.Runtime.Mem.HIPBuffer}}; ndrange::Int64, workgroupsize::Nothing)
@ AMDGPU.ROCKernels ~/.julia/packages/AMDGPU/gtxsf/src/ROCKernels.jl:86
[23] top-level scope
@ /media/dhruvdh/External/Dropbox/documents/repositories/Llama2.jl/test.jl:44
in expression starting at /media/dhruvdh/External/Dropbox/documents/repositories/Llama2.jl/test.jl:38
The only change I made was adding using AMDGPU and setting backend = ROCBackend() (or, for the Metal run, using Metal and MetalBackend()).
for i in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
    if li + i <= BLOCK_SIZE
        intermediateresults[li] += intermediateresults[li+i]
    end
    @synchronize()
end
This is #330 I think. On the CPU this currently won't work.
The (call to ijl_alloc_array_1d) comes from this line:
for i in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
Use a tuple () instead.
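A minimal sketch of the suggested change (the array literal is what triggers the ijl_alloc_array_1d call, since a Vector cannot be allocated inside device code, whereas a tuple literal is a plain isbits value that needs no allocation):
# for i in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]   # allocates a Vector on the device -> invalid IR
for i in (1, 2, 4, 8, 16, 32, 64, 128, 256, 512)      # tuple literal, no allocation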
Thanks! That works but I am still getting inconsistent results on GPU. I'll close this issue in favor of #330.
Hm, on the GPU you shouldn't get inconsistent results; on the CPU, yes.
So, on AMDGPU, using the following snippet, here is what I get -
using Pkg;
Pkg.add("AMDGPU");
Pkg.add("KernelAbstractions");
Pkg.add("Atomix");
using AMDGPU
using KernelAbstractions
using Atomix: @atomic
@kernel function self_dot_kernel!(result, @Const(vec))
    BLOCK_SIZE = @groupsize()[1]
    I = @index(Global, Linear)
    li = @index(Local, Linear)
    intermediateresults = @localmem eltype(vec) BLOCK_SIZE
    if I <= length(vec)
        intermediateresults[li] = vec[I] * vec[I]
    else
        intermediateresults[li] = 0
    end
    @synchronize()
    s = 1
    while (li + s) <= BLOCK_SIZE
        intermediateresults[li] += intermediateresults[li+s]
        @synchronize()
        s *= 2
    end
    @synchronize()
    if li == 1
        @atomic result[1] += intermediateresults[1]
    end
end
backend = ROCBackend()
for T in [Float32, Float64, Int32, Int64]
    println("===== Testing $T =====")
    for dims in [100, 24354, 12323, 13454, 32512, 314, 235, 51234, 5312]
        x = KernelAbstractions.ones(backend, T, dims)
        dotresult = KernelAbstractions.zeros(backend, T, 1)
        for i in 1:6
            kernel! = self_dot_kernel!(backend, 32)
            kernel!(dotresult, x; ndrange=length(x))
            KernelAbstractions.synchronize(backend)
            println(dotresult)
            dotresult .= 0
        end
    end
end
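(For reference: since x is filled with ones, the exact answer for every run is simply dims. A hypothetical host-side check, dropped into the innermost loop above, could look like this:)
host = Array(dotresult)                          # copy the device result back to the host
host[1] == T(dims) || @warn "mismatch" got=host[1] expected=T(dims)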
output -
===== Testing Float32 =====
Float32[100.0]
Float32[98.20209]
Float32[100.00625]
Float32[98.097916]
Float32[99.902084]
Float32[100.0]
Float32[24354.0]
Float32[24452.0]
Float32[24452.0]
Float32[24452.0]
Float32[24452.0]
Float32[24452.0]
Float32[12420.0]
Float32[12420.0]
Float32[12420.0]
Float32[12420.0]
Float32[12420.0]
Float32[12420.0]
Float32[13488.0]
Float32[13488.0]
Float32[13488.0]
Float32[13488.0]
Float32[13488.0]
Float32[13488.0]
Float32[32512.0]
Float32[32512.0]
Float32[32512.0]
Float32[32512.0]
Float32[32512.0]
Float32[32512.0]
Float32[324.0]
Float32[324.0]
Float32[324.0]
Float32[324.0]
Float32[324.0]
Float32[324.0]
Float32[292.0]
Float32[292.0]
Float32[292.0]
Float32[292.0]
Float32[292.0]
Float32[292.0]
Float32[51232.902]
Float32[51236.902]
Float32[51332.0]
Float32[51332.0]
Float32[51332.0]
Float32[51332.0]
Float32[5312.0]
Float32[5312.0]
Float32[5312.0]
Float32[5312.0]
Float32[5312.0]
Float32[5312.0]
===== Testing Float64 =====
[4.1943056984375e8]
[100.0]
[99.9991455078125]
[7.909956653047252e22]
[100.0]
[100.00803324091248]
[24353.999145507812]
[24452.0]
[24452.0]
[24452.0]
[24452.0]
[24452.0]
[12420.0]
[12420.0]
[12420.0]
[12420.0]
[12420.0]
[12420.0]
[13488.0]
[13488.0]
[13488.0]
[13488.0]
[13488.0]
[13488.0]
[32512.0]
[32512.0]
[32512.0]
[32512.0]
[32512.0]
[32512.0]
[324.0]
[324.0]
[324.0]
[324.0]
[324.0]
[324.0]
[292.0]
[292.0]
[292.0]
[292.0]
[292.0]
[292.0]
[51234.0]
[51234.0]
[51332.0]
[51332.0]
[51332.0]
[51332.0]
[5312.0]
[5312.0]
[5312.0]
[5312.0]
[5312.0]
[5312.0]
===== Testing Int32 =====
Int32[100]
Int32[100]
Int32[100]
Int32[100]
Int32[100]
Int32[-18419886]
Int32[24354]
Int32[24452]
Int32[24452]
Int32[24452]
Int32[24452]
Int32[24452]
Int32[12420]
Int32[12420]
Int32[12420]
Int32[12420]
Int32[12420]
Int32[12420]
Int32[13488]
Int32[13488]
Int32[13488]
Int32[13488]
Int32[13488]
Int32[13488]
Int32[32512]
Int32[32512]
Int32[32512]
Int32[32512]
Int32[32512]
Int32[32512]
Int32[324]
Int32[324]
Int32[324]
Int32[324]
Int32[324]
Int32[324]
Int32[292]
Int32[292]
Int32[292]
Int32[292]
Int32[292]
Int32[292]
Int32[51234]
Int32[2128985074]
Int32[51332]
Int32[51332]
Int32[51332]
Int32[51332]
Int32[5312]
Int32[5312]
Int32[5312]
Int32[5312]
Int32[5312]
Int32[5312]
===== Testing Int64 =====
[163208757388]
[4625196980518256780]
[4949667255626498188]
[4948752461952188556]
[163208757388]
[4625196980518256780]
[4811252051793174374]
[24452]
[24452]
[24452]
[24452]
[24452]
[12420]
[12420]
[12420]
[12420]
[12420]
[12420]
[13488]
[13488]
[13488]
[13488]
[13488]
[13488]
[32512]
[32512]
[32512]
[32512]
[32512]
[32512]
[324]
[324]
[324]
[324]
[324]
[324]
[292]
[292]
[292]
[292]
[292]
[292]
[279172925542]
[4625197096482424934]
[51332]
[51332]
[51332]
[51332]
[5312]
[5312]
[5312]
[5312]
[5312]
[5312]
On Metal, I get -
===== Testing Float32 =====
Float32[5.4712238f17]
Float32[2.7071126f17]
Float32[100.0]
Float32[100.0]
Float32[100.0]
Float32[100.0]
Float32[24354.0]
Float32[24354.0]
Float32[24354.0]
Float32[24428.0]
Float32[24444.0]
Float32[24420.0]
Float32[12323.0]
Float32[12323.0]
Float32[12323.0]
Float32[12323.0]
Float32[12398.0]
Float32[12400.0]
Float32[-Inf]
Float32[13454.0]
Float32[13454.0]
Float32[-Inf]
Float32[-Inf]
Float32[13488.0]
Float32[32512.0]
Float32[32512.0]
Float32[32512.0]
Float32[32512.0]
Float32[32512.0]
Float32[32512.0]
Float32[314.0]
Float32[-Inf]
Float32[-Inf]
Float32[-Inf]
Float32[-Inf]
Float32[314.0]
Float32[235.0]
Float32[235.0]
Float32[235.0]
Float32[235.0]
Float32[235.0]
Float32[-Inf]
Float32[51234.0]
Float32[51234.0]
Float32[-Inf]
Float32[51234.016]
Float32[-Inf]
Float32[-Inf]
Float32[5312.0]
Float32[5312.0]
Float32[5312.0]
Float32[5312.0]
Float32[5312.0]
Float32[5312.0]
===== Testing Int32 =====
Int32[-1677848871]
Int32[-1979950382]
Int32[1387148965]
Int32[-1337915697]
Int32[100]
Int32[100]
Int32[24354]
Int32[24428]
Int32[24424]
Int32[24424]
Int32[24432]
Int32[24424]
Int32[12400]
Int32[12418]
Int32[12400]
Int32[12323]
Int32[12323]
Int32[-1324293837]
Int32[13488]
Int32[-29856570]
Int32[-29856570]
Int32[13488]
Int32[13488]
Int32[13488]
Int32[32512]
Int32[32512]
Int32[32512]
Int32[32512]
Int32[32512]
Int32[32512]
Int32[1342722370]
Int32[-29869710]
Int32[-29869710]
Int32[-29869710]
Int32[-29869710]
Int32[-29869710]
Int32[-44804801]
Int32[235]
Int32[235]
Int32[235]
Int32[235]
Int32[-44804801]
Int32[-59688814]
Int32[51312]
Int32[51332]
Int32[-59688814]
Int32[51324]
Int32[51332]
Int32[5312]
Int32[5312]
Int32[5312]
Int32[5312]
Int32[5312]
Int32[5312]
So the only thing that seems fishy to me is this loop:
while (li + s) <= BLOCK_SIZE
    intermediateresults[li] += intermediateresults[li+s]
    @synchronize()
    s *= 2
end
Do you not have a read-write race here? E.g., the work-item with li = n+1 may or may not have updated its value yet when li = n accesses it.
You are trying to do something similar to https://github.com/JuliaGPU/CUDA.jl/blob/cb14a637e0b7b7be9ae01005ea9bdcf79b320189/src/mapreduce.jl#L62 IIUC?
My long-term goal is to provide primitives for these kinds of block reductions (#234).
So the difference I see is that in your first iteration all work-items in the block are active and are accessing their neighbors' data (and being racy about it). In the CUDA code, only half the work-items are active in the first iteration.
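For reference, the block-reduction loop in the linked CUDA.jl code has roughly the following shape when hand-translated to this kernel's names (a sketch only, assuming the workgroup size of 32 used here and a GPU backend, since the barrier sits inside the loop; not tested):
d = 1
while d < BLOCK_SIZE
    @synchronize()
    index = 2 * d * (li - 1) + 1                  # only every (2d)-th work-item stays active
    if index + d <= BLOCK_SIZE
        intermediateresults[index] += intermediateresults[index + d]
    end
    d *= 2
end
# afterwards intermediateresults[1] holds the workgroup's partial sum
In each step the active work-items write slots that no other active work-item reads in that same step, which is what avoids the race described above.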
Yes, thanks! There is overlap in how I was indexing, which does not occur the CUDA way. I can confirm it works -
d = 1
while d < BLOCK_SIZE
    @synchronize()
    if 2 * d * (li - 1) + li <= BLOCK_SIZE
        intermediateresults[li] += intermediateresults[2*d*(li-1)+li]
    end
    d *= 2
end
Some things still don't make sense to me -
This works -
d = 1
while d < BLOCK_SIZE
    @synchronize()
    if 2 * d * (li - 1) + li <= BLOCK_SIZE
        intermediateresults[li] += intermediateresults[2*d*(li-1)+li]
    end
    d *= 2
end
This outputs only zeros -
d = 1
while d < BLOCK_SIZE
    @synchronize()
    lhsIndex = 2 * d * (li - 1) # or @private lhsIndex = 2 * d * (li - 1)
    if lhsIndex + li <= BLOCK_SIZE
        intermediateresults[lhsIndex] += intermediateresults[lhsIndex+li]
    end
    d *= 2
end
Assigning variables outside the loop is fine, but inside the loop it is troublesome.
Are you back to testing on the CPU? I should add a pass that errors when synchronize is used inside control flow; KA's current design simply can't support that.
Sorry, I forgot to mention: my previous message was based on the Metal backend. I tried it again on the AMD backend, and the full snippet follows, where it crashes.
# using Pkg;
# Pkg.add("AMDGPU");
# Pkg.add("KernelAbstractions");
# Pkg.add("Atomix");
using AMDGPU
using KernelAbstractions
using Atomix: @atomic
@kernel function self_dot_kernel!(result, @Const(vec))
    BLOCK_SIZE = @groupsize()[1]
    I = @index(Global, Linear)
    li = @index(Local, Linear)
    intermediateresults = @localmem eltype(vec) BLOCK_SIZE
    if I <= length(vec)
        intermediateresults[li] = vec[I] * vec[I]
    else
        intermediateresults[li] = 0
    end
    d = 1
    while d < BLOCK_SIZE
        @synchronize()
        lhsIndex = 2 * d * (li - 1) # or @private lhsIndex = 2 * d * (li - 1)
        if lhsIndex + li <= BLOCK_SIZE
            intermediateresults[lhsIndex] += intermediateresults[lhsIndex+li]
        end
        d *= 2
    end
    @synchronize()
    if li == 1
        @atomic result[1] += intermediateresults[1]
    end
end
backend = ROCBackend()
for T in [Float32, Float64, Int32, Int64]
    println("===== Testing $T =====")
    for dims in [100, 24354, 12323, 13454, 32512, 314, 235, 51234, 5312]
        x = KernelAbstractions.ones(backend, T, dims)
        dotresult = KernelAbstractions.zeros(backend, T, 1)
        for i in 1:6
            kernel! = self_dot_kernel!(backend, 32)
            kernel!(dotresult, x; ndrange=length(x))
            KernelAbstractions.synchronize(backend)
            println(dotresult)
            dotresult .= 0
        end
    end
end
===== Testing Float32 =====
ERROR: LoadError: GPU Kernel Exception
Stacktrace:
[1] error(s::String)
@ Base ./error.jl:35
[2] throw_if_exception(dev::HIPDevice)
@ AMDGPU ~/.julia/packages/AMDGPU/gtxsf/src/exception_handler.jl:123
[3] synchronize(stm::HIPStream; blocking::Bool, stop_hostcalls::Bool)
@ AMDGPU ~/.julia/packages/AMDGPU/gtxsf/src/highlevel.jl:53
[4] synchronize (repeats 2 times)
@ ~/.julia/packages/AMDGPU/gtxsf/src/highlevel.jl:49 [inlined]
[5] synchronize
@ ~/.julia/packages/AMDGPU/gtxsf/src/ROCKernels.jl:29 [inlined]
[6] top-level scope
@ /media/dhruvdh/External/Dropbox/documents/repositories/Llama2.jl/amdgpu-test.jl:52
in expression starting at /media/dhruvdh/External/Dropbox/documents/repositories/Llama2.jl/amdgpu-test.jl:42
On Metal, the same kernel outputs only 0.0s
Are you not missing a +1 in lhsIndex = 2 * d * (li - 1)? For li = 1, lhsIndex will be 0, which is out of bounds.
No, actually, it was supposed to be intermediateresults[li] += intermediateresults[lhsIndex+li]. Sorry, and thanks for all your responses!
A pass that errors when synchronize is used inside control flow would be great!
I am trying to write a simple reduction kernel, which seems to work only for certain datatypes, and the set of datatypes it works for is inconsistent across platforms.
Here is the kernel, which computes the sum of squares for a vector of 6000 ones -
And here are some outputs, run using julia 1.10.2 -
Float32 and Float64 always work on apple-aarch64, making me think that there's nothing wrong with the algorithm. On windows-x64, only Float64 consistently works. What is going on?