Open ww1g11 opened 1 week ago
Can you also print the output?
Can you also print the output?
The output is
cpu: [18.0; 18.0; 18.0; 18.0; 18.0; 18.0; 18.0; 18.0; 18.0; 18.0;;;]
cuda: [18.0; 18.0; 18.0; 18.0; 18.0; 18.0; 18.0; 18.0; 18.0; 18.0;;;]
amd: [6.0; 6.0; 6.0; 6.0; 6.0; 6.0; 6.0; 6.0; 6.0; 6.0;;;]
julia> versioninfo()
Julia Version 1.10.0
Commit 3120989f39 (2023-12-25 18:01 UTC)
Build Info:
Official https://julialang.org/ release
Platform Info:
OS: Windows (x86_64-w64-mingw32)
CPU: 12 × Intel(R) Core(TM) i7-9850H CPU @ 2.60GHz
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-15.0.7 (ORCJIT, skylake)
Threads: 1 on 12 virtual cores
[ Info: AMDGPU versioninfo
┌───────────┬──────────────────┬───────────┬───────────────────────────────────────────────────────────────────────────────────────────────┐
│ Available │ Name │ Version │ Path │
├───────────┼──────────────────┼───────────┼───────────────────────────────────────────────────────────────────────────────────────────────┤
│ + │ LLD │ - │ C:\\Program Files\\AMD\\ROCm\\5.7\\bin\\ld.lld.exe │
│ + │ Device Libraries │ - │ D:\\Softwares\\julia_pkg\\artifacts\\5ad5ecb46e3c334821f54c1feecc6c152b7b6a45\\amdgcn/bitcode │
│ + │ HIP │ 5.7.32000 │ C:\\WINDOWS\\SYSTEM32\\amdhip64.DLL │
│ + │ rocBLAS │ 3.1.0 │ C:\\Program Files\\AMD\\ROCm\\5.7\\bin\\rocblas.dll │
│ + │ rocSOLVER │ 3.23.0 │ C:\\Program Files\\AMD\\ROCm\\5.7\\bin\\rocsolver.dll │
│ + │ rocALUTION │ - │ C:\\Program Files\\AMD\\ROCm\\5.7\\bin\\rocalution.dll │
│ + │ rocSPARSE │ - │ C:\\Program Files\\AMD\\ROCm\\5.7\\bin\\rocsparse.dll │
│ + │ rocRAND │ 2.10.5 │ C:\\Program Files\\AMD\\ROCm\\5.7\\bin\\rocrand.dll │
│ + │ rocFFT │ 1.0.27 │ C:\\Program Files\\AMD\\ROCm\\5.7\\bin\\rocfft.dll │
│ - │ MIOpen │ - │ - │
└───────────┴──────────────────┴───────────┴───────────────────────────────────────────────────────────────────────────────────────────────┘
[ Info: AMDGPU devices
┌────┬────────────────────────┬──────────┬───────────┬───────────┐
│ Id │ Name │ GCN arch │ Wavefront │ Memory │
├────┼────────────────────────┼──────────┼───────────┼───────────┤
│ 1 │ AMD Radeon RX 7600M XT │ gfx1102 │ 32 │ 7.984 GiB │
└────┴────────────────────────┴──────────┴───────────┴───────────┘
Yeah that's concerning... @pxl-th any ideas?
AMDGPU on Windows, right? Could one compute the sum differently to discern whether the p loop or the q loop is not being executed?
An error occurs if the kernel function changes to
# Reproducer kernel (KernelAbstractions `@kernel`): every work-item runs the same two
# nested loops and stores the accumulated value at its own (i, j, k) position.
# Nz is accepted but never read in the body.
@kernel function kernel_xx!(tensor, Nx::Int64, Ny::Int64, Nz::Int64)
# Global index of this work-item, unpacked as a 3-tuple.
i, j, k = @index(Global, NTuple)
# NOTE: the local name `sum` shadows Base.sum inside this kernel.
sum = zero(eltype(tensor))
# p and q are never read; the loops only count iterations. For non-negative Nx, Ny
# the body runs (2Nx + 1) * (2Ny + 1) times, adding 2.0 each time.
for p in (-Nx):Nx
for q in (-Ny):Ny
sum += 2.0
end
end
# Bounds checking is disabled: assumes (i, j, k) is a valid index of `tensor` — TODO confirm
@inbounds tensor[i, j, k] = sum
end
the error msg:
julia> include("amd.jl")
cpu:[18.0; 18.0; 18.0; 18.0; 18.0; 18.0; 18.0; 18.0; 18.0; 18.0;;;]
cuda:[18.0; 18.0; 18.0; 18.0; 18.0; 18.0; 18.0; 18.0; 18.0; 18.0;;;]
ERROR: LoadError: Not implemented
Stacktrace:
[1] error(s::String)
@ Base .\error.jl:35
[2] runtime_module(job::GPUCompiler.CompilerJob)
@ GPUCompiler D:\Softwares\julia_pkg\packages\GPUCompiler\nWT2N\src\interface.jl:176
[3] build_runtime(job::GPUCompiler.CompilerJob)
@ GPUCompiler D:\Softwares\julia_pkg\packages\GPUCompiler\nWT2N\src\rtlib.jl:102
[4] (::GPUCompiler.var"#140#142"{GPUCompiler.CompilerJob{…}})()
@ GPUCompiler D:\Softwares\julia_pkg\packages\GPUCompiler\nWT2N\src\rtlib.jl:143
[5] lock(f::GPUCompiler.var"#140#142"{GPUCompiler.CompilerJob{…}}, l::ReentrantLock)
@ Base .\lock.jl:229
[6] macro expansion
@ D:\Softwares\julia_pkg\packages\GPUCompiler\nWT2N\src\rtlib.jl:121 [inlined]
[7] load_runtime(job::GPUCompiler.CompilerJob)
@ GPUCompiler D:\Softwares\julia_pkg\packages\GPUCompiler\nWT2N\src\utils.jl:103
[8] macro expansion
@ D:\Softwares\julia_pkg\packages\GPUCompiler\nWT2N\src\driver.jl:304 [inlined]
[9]
@ GPUCompiler D:\Softwares\julia_pkg\packages\GPUCompiler\nWT2N\src\utils.jl:103
[10] emit_llvm
@ D:\Softwares\julia_pkg\packages\GPUCompiler\nWT2N\src\utils.jl:97 [inlined]
[11]
@ GPUCompiler D:\Softwares\julia_pkg\packages\GPUCompiler\nWT2N\src\driver.jl:136
[12] codegen
@ D:\Softwares\julia_pkg\packages\GPUCompiler\nWT2N\src\driver.jl:115 [inlined]
[13]
@ GPUCompiler D:\Softwares\julia_pkg\packages\GPUCompiler\nWT2N\src\driver.jl:111
[14] compile
@ D:\Softwares\julia_pkg\packages\GPUCompiler\nWT2N\src\driver.jl:103 [inlined]
[15] #40
@ D:\Softwares\julia_pkg\packages\AMDGPU\WqMSe\src\compiler\codegen.jl:170 [inlined]
[16] JuliaContext(f::AMDGPU.Compiler.var"#40#41"{GPUCompiler.CompilerJob{…}}; kwargs::@Kwargs{})
@ GPUCompiler D:\Softwares\julia_pkg\packages\GPUCompiler\nWT2N\src\driver.jl:52
[17] JuliaContext(f::Function)
@ GPUCompiler D:\Softwares\julia_pkg\packages\GPUCompiler\nWT2N\src\driver.jl:42
[18] hipcompile(job::GPUCompiler.CompilerJob)
@ AMDGPU.Compiler D:\Softwares\julia_pkg\packages\AMDGPU\WqMSe\src\compiler\codegen.jl:169
[19] actual_compilation(cache::Dict{…}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{…}, compiler::typeof(AMDGPU.Compiler.hipcompile), linker::typeof(AMDGPU.Compiler.hiplink))
@ GPUCompiler D:\Softwares\julia_pkg\packages\GPUCompiler\nWT2N\src\execution.jl:128
[20] cached_compilation(cache::Dict{…}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{…}, compiler::Function, linker::Function)
@ GPUCompiler D:\Softwares\julia_pkg\packages\GPUCompiler\nWT2N\src\execution.jl:103
[21] macro expansion
@ D:\Softwares\julia_pkg\packages\AMDGPU\WqMSe\src\compiler\codegen.jl:137 [inlined]
[22] macro expansion
@ .\lock.jl:267 [inlined]
[23] hipfunction(f::GPUArrays.var"#6#7", tt::Type{Tuple{…}}; kwargs::@Kwargs{name::Nothing})
@ AMDGPU.Compiler D:\Softwares\julia_pkg\packages\AMDGPU\WqMSe\src\compiler\codegen.jl:131
[24] hipfunction
@ D:\Softwares\julia_pkg\packages\AMDGPU\WqMSe\src\compiler\codegen.jl:130 [inlined]
[25] macro expansion
@ D:\Softwares\julia_pkg\packages\AMDGPU\WqMSe\src\highlevel.jl:172 [inlined]
[26] #gpu_call#55
@ D:\Softwares\julia_pkg\packages\AMDGPU\WqMSe\src\gpuarrays.jl:8 [inlined]
[27] gpu_call
@ D:\Softwares\julia_pkg\packages\AMDGPU\WqMSe\src\gpuarrays.jl:5 [inlined]
[28] gpu_call(::GPUArrays.var"#6#7", ::ROCArray{…}, ::Float64; target::ROCArray{…}, elements::Nothing, threads::Nothing, blocks::Nothing, name::Nothing)
@ GPUArrays D:\Softwares\julia_pkg\packages\GPUArrays\qt4ax\src\device\execution.jl:65
[29] gpu_call
@ GPUArrays D:\Softwares\julia_pkg\packages\GPUArrays\qt4ax\src\device\execution.jl:34 [inlined]
[30] fill!(A::ROCArray{Float64, 3, AMDGPU.Runtime.Mem.HIPBuffer}, x::Float64)
@ GPUArrays D:\Softwares\julia_pkg\packages\GPUArrays\qt4ax\src\host\construction.jl:14
[31] zeros(::Type, ::Int64, ::Vararg{Int64})
@ AMDGPU D:\Softwares\julia_pkg\packages\AMDGPU\WqMSe\src\array.jl:244
Testing on RX 7800 XT Ubuntu 22.04 I can reproduce the issue.
The following does not error for me, though:
# Variant of the reproducer kernel: accumulate 2.0 once per (p, q) pair and write the
# total to this work-item's element of `tensor`. Nz is unused.
@kernel function kernel_xx!(tensor, Nx::Int64, Ny::Int64, Nz::Int64)
# 3-D global index of the current work-item.
i, j, k = @index(Global, NTuple)
# Accumulator with the tensor's element type (the name shadows Base.sum locally).
sum = zero(eltype(tensor))
# Outer loop over -Nx:Nx, inner loop over -Ny:Ny; the loop variables are not used.
for p in (-Nx):Nx
for q in (-Ny):Ny
sum += 2.0
end
end
# Unchecked store; relies on the launch range matching size(tensor) — TODO confirm
@inbounds tensor[i, j, k] = sum
end
and produces the same wrong results as the original version.
From testing, it seems that -Ny
breaks things, as the following produces the correct result:
# Control variant of the reproducer: identical to the failing kernel except that the
# inner loop starts at the literal -1 instead of -Ny. Nz is unused.
@kernel function kernel_xx!(tensor, Nx::Int64, Ny::Int64, Nz::Int64)
# 3-D global index of the current work-item.
i, j, k = @index(Global, NTuple)
# Accumulator with the tensor's element type (the name shadows Base.sum locally).
sum = zero(eltype(tensor))
for p in (-Nx):Nx
# Lower bound is the constant -1 here, so it only equals -Ny when Ny == 1.
for q in (-1):Ny
sum += 2.0
end
end
# Unchecked store; assumes (i, j, k) is in bounds for `tensor` — TODO confirm
@inbounds tensor[i, j, k] = sum
end
EDIT: (-Ny):Ny
seems to be the deal-breaker in both loop-variants.
Also the "plain" AMDGPU version works fine:
using AMDGPU
# Launch `kernel_fun(tensor, Nx, Ny, Nz)` with `@roc` and wait for it to finish.
# The group size is fixed at (16, 4, 2); the grid size is the per-dimension ceiling
# division of `size(tensor)` by that group size. Always returns `nothing`.
function compute_amdgpu(tensor, kernel_fun, Nx, Ny, Nz)
    nthreads = (16, 4, 2)
    nblocks = cld.(size(tensor), nthreads)
    @roc groupsize=nthreads gridsize=nblocks kernel_fun(tensor, Nx, Ny, Nz)
    AMDGPU.synchronize()
    return nothing
end
# Hand-written AMDGPU kernel (no KernelAbstractions) performing the same accumulation:
# adds 2.0 once per (p, q) pair and stores the total at this work-item's position.
# Nz is accepted but unused. Returns `nothing`.
function kernel_xx2!(tensor, Nx, Ny, Nz)
# 1-based global index per axis: (group index - 1) * group size + index within group.
i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x
j = (workgroupIdx().y - 1) * workgroupDim().y + workitemIdx().y
k = (workgroupIdx().z - 1) * workgroupDim().z + workitemIdx().z
# Accumulator with the tensor's element type (the name shadows Base.sum locally).
sum = zero(eltype(tensor))
# Single `for` with two iteration specs; p and q are not read in the body.
for p in (-Nx):Nx, q in (-Ny):Ny
sum += 2.0
end
# Only work-items whose index falls inside the tensor's axes perform the store,
# which is what makes the following @inbounds access valid.
if i ∈ axes(tensor, 1) && j ∈ axes(tensor, 2) && k ∈ axes(tensor, 3)
@inbounds tensor[i, j, k] = sum
end
return nothing
end
# Tensor extents: 10 × 1 × 1.
nx = 10
ny = 1
nz = 1
# Loop half-widths passed to the kernel.
Nx = 1
Ny = 1
Nz = 1
# Zero-initialised device tensor, filled by the plain AMDGPU kernel and then printed.
tensor2 = AMDGPU.zeros(Float64, nx, ny, nz)
compute_amdgpu(tensor2, kernel_xx2!, Nx, Ny, Nz)
println("amdgpu:", tensor2)
This seems to appear only when using NTuple
index type.
Changing to linear or cartesian works fine:
# Linear-index variant of the reproducer kernel: each work-item accumulates 2.0 once
# per (p, q) pair and writes the total to `tensor[i]`. Nz is unused.
@kernel function kernel_xx!(tensor, Nx::Int64, Ny::Int64, Nz::Int64)
# Global index of this work-item; used below as a linear index into `tensor`.
i = @index(Global)
s = zero(eltype(tensor))
# Single `for` with two iteration specs; p and q are not read in the body.
for p in (-Nx):Nx, q in (-Ny):Ny
s += 2.0
end
# Unchecked linear store; assumes i is within 1:length(tensor) — TODO confirm
@inbounds tensor[i] = s
end
Or, more generally, it appears when passing size(x)
to ndrange
instead of length(x)
.
Here's optimized LLVM IR for:
# Kernel used to generate the LLVM IR listings: linear global index, explicitly
# nested loops. Adds 2.0 once per (p, q) pair and stores the total. Nz is unused.
@kernel function kernel_xx!(tensor, Nx::Int64, Ny::Int64, Nz::Int64)
# Global index of this work-item; used below as a linear index into `tensor`.
idx = @index(Global)
res = zero(eltype(tensor))
# p and q are never read; the loops only determine how many times 2.0 is added.
for p in (-Nx):Nx
for q in (-Ny):Ny
res += 2.0
end
end
# Unchecked linear store; assumes idx is within 1:length(tensor) — TODO confirm
@inbounds tensor[idx] = res
end
@vchuravy in the size(x)
case, there are lots of > 0 comparisons (icmp sgt ..., 0)
and terminations in case of false
. I suspect that's why it fails:
%29 = icmp sgt i64 %.fca.1.1.0.1.0.extract, 0
br i1 %29, label %pass6, label %fail5
- `ndrange=size(x)`:
; ModuleID = 'start'
source_filename = "start"
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:10:11:12:13"
target triple = "amdgcn-amd-amdhsa"
; Function Attrs: nounwind readnone speculatable willreturn declare i32 @llvm.amdgcn.workgroup.id.x() #0
; Function Attrs: nounwind readnone speculatable willreturn declare i32 @llvm.amdgcn.workitem.id.x() #0
; Function Attrs: cold noreturn nounwind declare void @llvm.amdgcn.endpgm() #1
; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn declare i64 @llvm.smax.i64(i64, i64) #2
define amdgpu_kernel void @_Z14gpu_kernel_xx_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILl3E5TupleI5OneToI5Int64ES4_IS5_ES4_IS5_EEE7NDRangeILl3ES0_S0_S2_ILl3ES3_IS4_IS5_ES4_IS5_ES4_IS5_EEES2_ILl3ES3_IS4_IS5_ES4_IS5_ES4_IS5_EEEEE14ROCDeviceArrayI7Float64Ll3ELl1EES5_S5S5({ i64, i64, i64, i64, i64, i64, i32, i32, i64, i64, i64, i64 } %state, { [1 x [3 x [1 x i64]]], [2 x [1 x [3 x [1 x i64]]]] } %0, { [3 x i64], i8 addrspace(1), i64 } %1, i64 signext %2, i64 signext %3, i64 signext %4) local_unnamed_addr #3 { conversion: %.fca.0.0.0.0.extract = extractvalue { [1 x [3 x [1 x i64]]], [2 x [1 x [3 x [1 x i64]]]] } %0, 0, 0, 0, 0 %.fca.0.0.1.0.extract = extractvalue { [1 x [3 x [1 x i64]]], [2 x [1 x [3 x [1 x i64]]]] } %0, 0, 0, 1, 0 %.fca.0.0.2.0.extract = extractvalue { [1 x [3 x [1 x i64]]], [2 x [1 x [3 x [1 x i64]]]] } %0, 0, 0, 2, 0 %.fca.1.0.0.0.0.extract = extractvalue { [1 x [3 x [1 x i64]]], [2 x [1 x [3 x [1 x i64]]]] } %0, 1, 0, 0, 0, 0 %.fca.1.0.0.1.0.extract = extractvalue { [1 x [3 x [1 x i64]]], [2 x [1 x [3 x [1 x i64]]]] } %0, 1, 0, 0, 1, 0 %.fca.1.1.0.0.0.extract = extractvalue { [1 x [3 x [1 x i64]]], [2 x [1 x [3 x [1 x i64]]]] } %0, 1, 1, 0, 0, 0 %.fca.1.1.0.1.0.extract = extractvalue { [1 x [3 x [1 x i64]]], [2 x [1 x [3 x [1 x i64]]]] } %0, 1, 1, 0, 1, 0 %.fca.1.1.0.2.0.extract = extractvalue { [1 x [3 x [1 x i64]]], [2 x [1 x [3 x [1 x i64]]]] } %0, 1, 1, 0, 2, 0 %.fca.1.extract = extractvalue { [3 x i64], i8 addrspace(1), i64 } %1, 1 %5 = call i32 @llvm.amdgcn.workitem.id.x(), !range !7 %6 = call i64 @llvm.smax.i64(i64 %.fca.1.0.0.1.0.extract, i64 0) %7 = icmp sgt i64 %.fca.1.0.0.0.0.extract, 0 br i1 %7, label %pass, label %fail
L674: ; preds = %L674.preheader, %L706 %value_phi17 = phi i64 [ %10, %L706 ], [ %65, %L674.preheader ] %value_phi18 = phi double [ %value_phi26, %L706 ], [ 0.000000e+00, %L674.preheader ] br i1 %.not127.not, label %L706, label %L693
L693: ; preds = %L693, %L674 %value_phi22 = phi double [ %8, %L693 ], [ %value_phi18, %L674 ] %value_phi23 = phi i64 [ %9, %L693 ], [ %67, %L674 ] %8 = fadd double %value_phi22, 2.000000e+00 %.not128 = icmp eq i64 %value_phi23, %value_phi19 %9 = add i64 %value_phi23, 1 br i1 %.not128, label %L706, label %L693
L706: ; preds = %L693, %L674 %value_phi26 = phi double [ %value_phi18, %L674 ], [ %8, %L693 ] %.not129 = icmp eq i64 %value_phi17, %value_phi %10 = add i64 %value_phi17, 1 br i1 %.not129, label %L732, label %L674
L732: ; preds = %pass10, %L706 %value_phi29 = phi double [ 0.000000e+00, %pass10 ], [ %value_phi26, %L706 ] %11 = add i64 %64, %32 %reass.add137 = add i64 %11, %reass.mul135 %reass.mul138 = mul i64 %reass.add137, %60 %12 = add i64 %reass.mul138, %31 %13 = add i64 %12, %reass.mul %14 = bitcast i8 addrspace(1) %.fca.1.extract to double addrspace(1) %15 = getelementptr inbounds double, double addrspace(1) %14, i64 %13 store double %value_phi29, double addrspace(1) %15, align 8, !tbaa !8 br label %L738
L738: ; preds = %pass6, %L732 ret void
fail: ; preds = %conversion %state.i.fca.0.extract.i = extractvalue { i64, i64, i64, i64, i64, i64, i32, i32, i64, i64, i64, i64 } %state, 0 %16 = inttoptr i64 %state.i.fca.0.extract.i to i32 store i32 1, i32 %16, align 1 call void @llvm.amdgcn.endpgm() unreachable
pass: ; preds = %conversion %17 = call i32 @llvm.amdgcn.workgroup.id.x(), !range !11 %18 = zext i32 %17 to i64 %19 = udiv i64 %18, %.fca.1.0.0.0.0.extract %20 = mul i64 %19, %.fca.1.0.0.0.0.extract %21 = icmp sgt i64 %.fca.1.0.0.1.0.extract, 0 br i1 %21, label %pass2, label %fail1
fail1: ; preds = %pass %state.i.fca.0.extract.i28 = extractvalue { i64, i64, i64, i64, i64, i64, i32, i32, i64, i64, i64, i64 } %state, 0 %22 = inttoptr i64 %state.i.fca.0.extract.i28 to i32 store i32 1, i32 %22, align 1 call void @llvm.amdgcn.endpgm() unreachable
pass2: ; preds = %pass %23 = udiv i64 %19, %6 %24 = mul i64 %23, %6 %25 = sub i64 %19, %24 %26 = call i64 @llvm.smax.i64(i64 %.fca.1.1.0.1.0.extract, i64 0) %27 = icmp sgt i64 %.fca.1.1.0.0.0.extract, 0 br i1 %27, label %pass4, label %fail3
fail3: ; preds = %pass2 %state.i.fca.0.extract.i42 = extractvalue { i64, i64, i64, i64, i64, i64, i32, i32, i64, i64, i64, i64 } %state, 0 %28 = inttoptr i64 %state.i.fca.0.extract.i42 to i32 store i32 1, i32 %28, align 1 call void @llvm.amdgcn.endpgm() unreachable
pass4: ; preds = %pass2 %29 = icmp sgt i64 %.fca.1.1.0.1.0.extract, 0 br i1 %29, label %pass6, label %fail5
fail5: ; preds = %pass4 %state.i.fca.0.extract.i56 = extractvalue { i64, i64, i64, i64, i64, i64, i32, i32, i64, i64, i64, i64 } %state, 0 %30 = inttoptr i64 %state.i.fca.0.extract.i56 to i32 store i32 1, i32 %30, align 1 call void @llvm.amdgcn.endpgm() unreachable
pass6: ; preds = %pass4 %31 = zext i32 %5 to i64 %32 = udiv i64 %31, %.fca.1.1.0.0.0.extract %33 = udiv i64 %32, %26 %34 = mul i64 %33, %26 %35 = add i64 %20, %32 %reass.add = sub i64 %18, %35 %reass.mul = mul i64 %reass.add, %.fca.1.1.0.0.0.extract %36 = add nuw nsw i64 %31, 1 %37 = add i64 %36, %reass.mul %38 = mul i64 %25, %.fca.1.1.0.1.0.extract %39 = add i64 %38, 1 %40 = add i64 %39, %32 %41 = sub i64 %40, %34 %42 = mul i64 %23, %.fca.1.1.0.2.0.extract %43 = add i64 %42, 1 %44 = add i64 %43, %33 %45 = icmp sgt i64 %37, 0 %46 = icmp sle i64 %37, %.fca.0.0.0.0.extract %47 = and i1 %45, %46 %48 = icmp sgt i64 %41, 0 %49 = icmp sle i64 %41, %.fca.0.0.1.0.extract %50 = and i1 %48, %49 %51 = icmp sgt i64 %44, 0 %52 = icmp sle i64 %44, %.fca.0.0.2.0.extract %53 = and i1 %51, %52 %54 = and i1 %47, %50 %55 = and i1 %53, %54 br i1 %55, label %pass10, label %L738
pass10: ; preds = %pass6 %56 = udiv i64 %19, %.fca.1.0.0.1.0.extract %57 = mul i64 %56, %.fca.1.0.0.1.0.extract %58 = udiv i64 %32, %.fca.1.1.0.1.0.extract %59 = mul i64 %56, %.fca.1.1.0.2.0.extract %60 = call i64 @llvm.smax.i64(i64 %.fca.0.0.0.0.extract, i64 0) %61 = call i64 @llvm.smax.i64(i64 %.fca.0.0.1.0.extract, i64 0) %62 = add i64 %57, %58 %reass.add134 = sub i64 %19, %62 %reass.mul135 = mul i64 %reass.add134, %.fca.1.1.0.1.0.extract %63 = add i64 %58, %59 %64 = mul i64 %63, %61 %65 = sub i64 0, %2 %.not = icmp sgt i64 %65, %2 %66 = sext i1 %.not to i64 %value_phi = xor i64 %66, %2 %.not125.not = icmp slt i64 %value_phi, %65 br i1 %.not125.not, label %L732, label %L674.preheader
L674.preheader: ; preds = %pass10 %67 = sub i64 0, %3 %.not126 = icmp sgt i64 %67, %3 %68 = sext i1 %.not126 to i64 %value_phi19 = xor i64 %68, %3 %.not127.not = icmp slt i64 %value_phi19, %67 br label %L674 }
attributes #0 = { nounwind readnone speculatable willreturn } attributes #1 = { cold noreturn nounwind } attributes #2 = { nocallback nofree nosync nounwind readnone speculatable willreturn } attributes #3 = { "amdgpu-unsafe-fp-atomics"="true" "target-cpu"="gfx1100" "target-features"="+wavefrontsize32,-wavefrontsize64" }
!llvm.module.flags = !{!0, !1, !2, !3} !opencl.ocl.version = !{!4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4} !llvm.ident = !{!5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5} !julia.kernel = !{!6}
!0 = !{i32 2, !"Dwarf Version", i32 4} !1 = !{i32 2, !"Debug Info Version", i32 3} !2 = !{i32 1, !"wchar_size", i32 4} !3 = !{i32 7, !"PIC Level", i32 1} !4 = !{i32 2, i32 0} !5 = !{!"clang version 15.0.0 (/cache/yggdrasil/downloads/clones/llvm-project.git-974efd367bc513231526d317489c66cb27727ef3caa41108e3819c131a8acf57 f3d695fc2985a8dfdd5f4219d351fdeac3038867)"} !6 = !{void ({ i64, i64, i64, i64, i64, i64, i32, i32, i64, i64, i64, i64 }, { [1 x [3 x [1 x i64]]], [2 x [1 x [3 x [1 x i64]]]] }, { [3 x i64], i8 addrspace(1), i64 }, i64, i64, i64) @_Z14gpu_kernel_xx_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILl3E5TupleI5OneToI5Int64ES4_IS5_ES4_IS5_EEE7NDRangeILl3ES0_S0_S2_ILl3ES3_IS4_IS5_ES4_IS5_ES4_IS5_EEES2_ILl3ES3_IS4_IS5_ES4_IS5_ES4_IS5_EEEEE14ROCDeviceArrayI7Float64Ll3ELl1EES5_S5S5} !7 = !{i32 0, i32 1023} !8 = !{!9, !9, i64 0, i64 0} !9 = !{!"custom_tbaa_addrspace(1)", !10, i64 0} !10 = !{!"custom_tbaa"} !11 = !{i32 0, i32 -2}
- `ndrange=length(x)`:
```llvm
; ModuleID = 'start'
source_filename = "start"
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:10:11:12:13"
target triple = "amdgcn-amd-amdhsa"
; Function Attrs: nounwind readnone speculatable willreturn
declare i32 @llvm.amdgcn.workgroup.id.x() #0
; Function Attrs: nounwind readnone speculatable willreturn
declare i32 @llvm.amdgcn.workitem.id.x() #0
define amdgpu_kernel void @_Z14gpu_kernel_xx_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILl1E5TupleI5OneToI5Int64EEE7NDRangeILl1ES0_S0_S2_ILl1ES3_IS4_IS5_EEES2_ILl1ES3_IS4_IS5_EEEEE14ROCDeviceArrayI7Float64Ll3ELl1EES5_S5_S5_({ i64, i64, i64, i64, i64, i64, i32, i32, i64, i64, i64, i64 } %state, { [1 x [1 x [1 x i64]]], [2 x [1 x [1 x [1 x i64]]]] } %0, { [3 x i64], i8 addrspace(1)*, i64 } %1, i64 signext %2, i64 signext %3, i64 signext %4) local_unnamed_addr #1 {
conversion:
%.fca.0.0.0.0.extract = extractvalue { [1 x [1 x [1 x i64]]], [2 x [1 x [1 x [1 x i64]]]] } %0, 0, 0, 0, 0
%.fca.1.1.0.0.0.extract = extractvalue { [1 x [1 x [1 x i64]]], [2 x [1 x [1 x [1 x i64]]]] } %0, 1, 1, 0, 0, 0
%.fca.1.extract = extractvalue { [3 x i64], i8 addrspace(1)*, i64 } %1, 1
%5 = call i32 @llvm.amdgcn.workgroup.id.x(), !range !7
%6 = call i32 @llvm.amdgcn.workitem.id.x(), !range !8
%7 = add nuw nsw i32 %6, 1
%8 = zext i32 %7 to i64
%9 = zext i32 %5 to i64
%10 = mul i64 %.fca.1.1.0.0.0.extract, %9
%11 = add i64 %10, %8
%12 = icmp slt i64 %11, 1
%13 = icmp sgt i64 %11, %.fca.0.0.0.0.extract
%14 = or i1 %12, %13
br i1 %14, label %L299, label %L103
L103: ; preds = %conversion
%15 = sub i64 0, %2
%.not = icmp sgt i64 %15, %2
%16 = sext i1 %.not to i64
%value_phi = xor i64 %16, %2
%.not6.not = icmp slt i64 %value_phi, %15
br i1 %.not6.not, label %L293, label %L235.preheader
L235.preheader: ; preds = %L103
%17 = sub i64 0, %3
%.not7 = icmp sgt i64 %17, %3
%18 = sext i1 %.not7 to i64
%value_phi5 = xor i64 %18, %3
%.not8.not = icmp slt i64 %value_phi5, %17
br label %L235
L235: ; preds = %L267, %L235.preheader
%value_phi3 = phi i64 [ %21, %L267 ], [ %15, %L235.preheader ]
%value_phi4 = phi double [ %value_phi12, %L267 ], [ 0.000000e+00, %L235.preheader ]
br i1 %.not8.not, label %L267, label %L254
L254: ; preds = %L254, %L235
%value_phi8 = phi double [ %19, %L254 ], [ %value_phi4, %L235 ]
%value_phi9 = phi i64 [ %20, %L254 ], [ %17, %L235 ]
%19 = fadd double %value_phi8, 2.000000e+00
%.not9 = icmp eq i64 %value_phi9, %value_phi5
%20 = add i64 %value_phi9, 1
br i1 %.not9, label %L267, label %L254
L267: ; preds = %L254, %L235
%value_phi12 = phi double [ %value_phi4, %L235 ], [ %19, %L254 ]
%.not10 = icmp eq i64 %value_phi3, %value_phi
%21 = add i64 %value_phi3, 1
br i1 %.not10, label %L293, label %L235
L293: ; preds = %L267, %L103
%value_phi15 = phi double [ 0.000000e+00, %L103 ], [ %value_phi12, %L267 ]
%22 = add nsw i64 %8, -1
%23 = add i64 %22, %10
%24 = bitcast i8 addrspace(1)* %.fca.1.extract to double addrspace(1)*
%25 = getelementptr inbounds double, double addrspace(1)* %24, i64 %23
store double %value_phi15, double addrspace(1)* %25, align 8, !tbaa !9
br label %L299
L299: ; preds = %L293, %conversion
ret void
}
attributes #0 = { nounwind readnone speculatable willreturn }
attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "target-cpu"="gfx1100" "target-features"="+wavefrontsize32,-wavefrontsize64" }
!llvm.module.flags = !{!0, !1, !2, !3}
!opencl.ocl.version = !{!4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4}
!llvm.ident = !{!5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5}
!julia.kernel = !{!6}
!0 = !{i32 2, !"Dwarf Version", i32 4}
!1 = !{i32 2, !"Debug Info Version", i32 3}
!2 = !{i32 1, !"wchar_size", i32 4}
!3 = !{i32 7, !"PIC Level", i32 1}
!4 = !{i32 2, i32 0}
!5 = !{!"clang version 15.0.0 (/cache/yggdrasil/downloads/clones/llvm-project.git-974efd367bc513231526d317489c66cb27727ef3caa41108e3819c131a8acf57 f3d695fc2985a8dfdd5f4219d351fdeac3038867)"}
!6 = !{void ({ i64, i64, i64, i64, i64, i64, i32, i32, i64, i64, i64, i64 }, { [1 x [1 x [1 x i64]]], [2 x [1 x [1 x [1 x i64]]]] }, { [3 x i64], i8 addrspace(1)*, i64 }, i64, i64, i64)* @_Z14gpu_kernel_xx_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILl1E5TupleI5OneToI5Int64EEE7NDRangeILl1ES0_S0_S2_ILl1ES3_IS4_IS5_EEES2_ILl1ES3_IS4_IS5_EEEEE14ROCDeviceArrayI7Float64Ll3ELl1EES5_S5_S5_}
!7 = !{i32 0, i32 -2}
!8 = !{i32 0, i32 1023}
!9 = !{!10, !10, i64 0, i64 0}
!10 = !{!"custom_tbaa_addrspace(1)", !11, i64 0}
!11 = !{!"custom_tbaa"}
In my computer, the linear or cartesian also give the wrong results:
cpu:[18.0; 18.0; 18.0; 18.0; 18.0; 18.0; 18.0; 18.0; 18.0; 18.0;;;]
amd:[6.0; 6.0; 6.0; 6.0; 6.0; 6.0; 6.0; 6.0; 6.0; 6.0;;;]
This seems to appear only when using
NTuple
index type. Changing to linear or cartesian works fine:@kernel function kernel_xx!(tensor, Nx::Int64, Ny::Int64, Nz::Int64) i = @index(Global) s = zero(eltype(tensor)) for p in (-Nx):Nx, q in (-Ny):Ny s += 2.0 end @inbounds tensor[i] = s end
This time this kernel works well (the results are still wrong).
An error occurs if the kernel function changes to
@kernel function kernel_xx!(tensor, Nx::Int64, Ny::Int64, Nz::Int64) i, j, k = @index(Global, NTuple) sum = zero(eltype(tensor)) for p in (-Nx):Nx for q in (-Ny):Ny sum += 2.0 end end @inbounds tensor[i, j, k] = sum end
the error msg:
julia> include("amd.jl") cpu:[18.0; 18.0; 18.0; 18.0; 18.0; 18.0; 18.0; 18.0; 18.0; 18.0;;;] cuda:[18.0; 18.0; 18.0; 18.0; 18.0; 18.0; 18.0; 18.0; 18.0; 18.0;;;] ERROR: LoadError: Not implemented Stacktrace: [1] error(s::String) @ Base .\error.jl:35 [2] runtime_module(job::GPUCompiler.CompilerJob) @ GPUCompiler D:\Softwares\julia_pkg\packages\GPUCompiler\nWT2N\src\interface.jl:176 [3] build_runtime(job::GPUCompiler.CompilerJob) @ GPUCompiler D:\Softwares\julia_pkg\packages\GPUCompiler\nWT2N\src\rtlib.jl:102 [4] (::GPUCompiler.var"#140#142"{GPUCompiler.CompilerJob{…}})() @ GPUCompiler D:\Softwares\julia_pkg\packages\GPUCompiler\nWT2N\src\rtlib.jl:143 [5] lock(f::GPUCompiler.var"#140#142"{GPUCompiler.CompilerJob{…}}, l::ReentrantLock) @ Base .\lock.jl:229 [6] macro expansion @ D:\Softwares\julia_pkg\packages\GPUCompiler\nWT2N\src\rtlib.jl:121 [inlined] [7] load_runtime(job::GPUCompiler.CompilerJob) @ GPUCompiler D:\Softwares\julia_pkg\packages\GPUCompiler\nWT2N\src\utils.jl:103 [8] macro expansion @ D:\Softwares\julia_pkg\packages\GPUCompiler\nWT2N\src\driver.jl:304 [inlined] [9] @ GPUCompiler D:\Softwares\julia_pkg\packages\GPUCompiler\nWT2N\src\utils.jl:103 [10] emit_llvm @ D:\Softwares\julia_pkg\packages\GPUCompiler\nWT2N\src\utils.jl:97 [inlined] [11] @ GPUCompiler D:\Softwares\julia_pkg\packages\GPUCompiler\nWT2N\src\driver.jl:136 [12] codegen @ D:\Softwares\julia_pkg\packages\GPUCompiler\nWT2N\src\driver.jl:115 [inlined] [13] @ GPUCompiler D:\Softwares\julia_pkg\packages\GPUCompiler\nWT2N\src\driver.jl:111 [14] compile @ D:\Softwares\julia_pkg\packages\GPUCompiler\nWT2N\src\driver.jl:103 [inlined] [15] #40 @ D:\Softwares\julia_pkg\packages\AMDGPU\WqMSe\src\compiler\codegen.jl:170 [inlined] [16] JuliaContext(f::AMDGPU.Compiler.var"#40#41"{GPUCompiler.CompilerJob{…}}; kwargs::@Kwargs{}) @ GPUCompiler D:\Softwares\julia_pkg\packages\GPUCompiler\nWT2N\src\driver.jl:52 [17] JuliaContext(f::Function) @ GPUCompiler 
D:\Softwares\julia_pkg\packages\GPUCompiler\nWT2N\src\driver.jl:42 [18] hipcompile(job::GPUCompiler.CompilerJob) @ AMDGPU.Compiler D:\Softwares\julia_pkg\packages\AMDGPU\WqMSe\src\compiler\codegen.jl:169 [19] actual_compilation(cache::Dict{…}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{…}, compiler::typeof(AMDGPU.Compiler.hipcompile), linker::typeof(AMDGPU.Compiler.hiplink)) @ GPUCompiler D:\Softwares\julia_pkg\packages\GPUCompiler\nWT2N\src\execution.jl:128 [20] cached_compilation(cache::Dict{…}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{…}, compiler::Function, linker::Function) @ GPUCompiler D:\Softwares\julia_pkg\packages\GPUCompiler\nWT2N\src\execution.jl:103 [21] macro expansion @ D:\Softwares\julia_pkg\packages\AMDGPU\WqMSe\src\compiler\codegen.jl:137 [inlined] [22] macro expansion @ .\lock.jl:267 [inlined] [23] hipfunction(f::GPUArrays.var"#6#7", tt::Type{Tuple{…}}; kwargs::@Kwargs{name::Nothing}) @ AMDGPU.Compiler D:\Softwares\julia_pkg\packages\AMDGPU\WqMSe\src\compiler\codegen.jl:131 [24] hipfunction @ D:\Softwares\julia_pkg\packages\AMDGPU\WqMSe\src\compiler\codegen.jl:130 [inlined] [25] macro expansion @ D:\Softwares\julia_pkg\packages\AMDGPU\WqMSe\src\highlevel.jl:172 [inlined] [26] #gpu_call#55 @ D:\Softwares\julia_pkg\packages\AMDGPU\WqMSe\src\gpuarrays.jl:8 [inlined] [27] gpu_call @ D:\Softwares\julia_pkg\packages\AMDGPU\WqMSe\src\gpuarrays.jl:5 [inlined] [28] gpu_call(::GPUArrays.var"#6#7", ::ROCArray{…}, ::Float64; target::ROCArray{…}, elements::Nothing, threads::Nothing, blocks::Nothing, name::Nothing) @ GPUArrays D:\Softwares\julia_pkg\packages\GPUArrays\qt4ax\src\device\execution.jl:65 [29] gpu_call @ GPUArrays D:\Softwares\julia_pkg\packages\GPUArrays\qt4ax\src\device\execution.jl:34 [inlined] [30] fill!(A::ROCArray{Float64, 3, AMDGPU.Runtime.Mem.HIPBuffer}, x::Float64) @ GPUArrays D:\Softwares\julia_pkg\packages\GPUArrays\qt4ax\src\host\construction.jl:14 [31] zeros(::Type, ::Int64, 
::Vararg{Int64}) @ AMDGPU D:\Softwares\julia_pkg\packages\AMDGPU\WqMSe\src\array.jl:244
In my computer, the linear or cartesian also give the wrong results:
Did you change how you launch the code?
You need to specify ndrange=length(x)
instead of ndrange=size(x)
.
Because that's the main issue, passing a tuple to ndrange
.
In my computer, the linear or cartesian also give the wrong results:
Did you change how you launch the code? You need to specify
ndrange=length(x)
instead of ndrange=size(x)
. Because that's the main issue, passing a tuple to ndrange
.
I just tried ndrange=length(x), and it gives the correct results.
I was using ndrange=size(x) because I want to know the Cartesian indices, but it seems I can convert the linear index to a Cartesian index manually:
# Global index of this work-item (meant to be used inside a kernel body).
idx = @index(Global)
# Look up the Cartesian index of `tensor` at position idx and destructure it; only the
# first two components are bound (i, j), any further components are discarded.
i, j = Tuple(CartesianIndices(tensor)[idx])
so using ndrange=length(x) solves the problem.
You can also access size of the array within the kernel and compute i, j
indices from there.
And if you do only element-wise operations, you can just index with idx
since it will iterate all elements of x
anyway:
x[2, 2] == x[4]
You can also access size of the array within the kernel and compute
i, j
indices from there. And if you do only element-wise operations, you can just index with idx
since it will iterate all elements of x
anyway: x[2, 2] == x[4]
thanks~
So this is smelling more and more like a compiler bug...
The code below is my attempt to remove the KA syntactic sugar. The next step is to inline expand,
and then we should have a KA-free reproducer.
import KernelAbstractions as KA
# Reproducer with the `@kernel` sugar removed: the global index is computed explicitly
# with KA.expand and the kernel is meant to be launched directly with `@roc`.
# Writes the loop count (as a float) to this work-item's element; returns `nothing`.
function gpu_kernel_xx!(ndrange, iterspace, tensor, Nx::Int64, Ny::Int64; )
# Global Cartesian index derived from the x components of the block and thread index.
I = @inbounds KA.expand(iterspace, AMDGPU.blockIdx().x, AMDGPU.threadIdx().x)
# Work-items whose index lies outside `ndrange` skip the body entirely.
if I in ndrange
# Same expand call as above, evaluated a second time and unpacked into three indices.
(i, j, k) = @inbounds KA.expand(iterspace, AMDGPU.blockIdx().x, AMDGPU.threadIdx().x).I
# Accumulator with the tensor's element type (the name shadows Base.sum locally).
sum = zero(eltype(tensor))
# `1:Nx + 2` parses as 1:(Nx + 2); the inner loop runs over -Ny:Ny.
# p and q are not read, so the result is just the iteration count times 1.0.
for p = 1:Nx + 2
for q = -Ny:Ny
sum += 1.0
end
end
# Unchecked store; assumes every index in `ndrange` is valid for `tensor` — TODO confirm
@inbounds tensor[i, j, k] = sum
end
return nothing
end
# Instantiate the KA kernel for the ROC backend only to obtain its launch configuration.
obj = kernel_xx!(ROCBackend())
# Launch configuration for an ndrange of (10, 1, 1) with no explicit workgroup size.
# Only `iterspace` (and `ndrange` on the next line) is used afterwards in this snippet.
ndrange, wgsize, iterspace, dynamic = KernelAbstractions.launch_config(obj, (10, 1, 1), nothing)
# NOTE(review): `ctx` is not used by the rest of this snippet.
ctx = KernelAbstractions.mkcontext(obj, ndrange, iterspace)
# nx, ny, nz, Nx, Ny are presumably the globals set up by the earlier script — verify
tensor = AMDGPU.zeros(Float64, nx, ny, nz)
# Direct launch of the de-sugared kernel; the kernel's ndrange argument is rebuilt here
# as CartesianIndices((10, 1, 1)) rather than reusing the `ndrange` value from above.
@roc groupsize=512 gridsize=size(tensor) gpu_kernel_xx!(CartesianIndices((10, 1, 1)), iterspace, tensor, Nx, Ny)
println("ka_direct:", tensor)
This is now KA free:
"""
assume(cond::Bool)
Assume that the condition `cond` is true. This is a hint to the compiler, possibly enabling
it to optimize more aggressively.
"""
@inline assume(cond::Bool) = Base.llvmcall(("""
declare void @llvm.assume(i1)
define void @entry(i8) #0 {
%cond = icmp eq i8 %0, 1
call void @llvm.assume(i1 %cond)
ret void
}
attributes #0 = { alwaysinline }""", "entry"),  # tuple of (LLVM IR module text, name of the entry function)
Nothing, Tuple{Bool}, cond)  # return type Nothing; one Bool argument, received by the IR as i8 and compared with 1
# Tell the compiler, via `assume`, that every axis of `CI` has a `stop` greater than zero.
# Returns the tuple of the per-dimension `assume` results.
@inline function assume_nonzero(CI::CartesianIndices)
    return ntuple(Val(ndims(CI))) do dim
        @inline
        assume(CI.indices[dim].stop > 0)
    end
end
# Convert a linear group index and a linear work-item index into a global index:
# both are looked up in their index spaces (`blocks[groupidx]`, `workitems[idx]`) and
# the resulting values are passed on to another `expand` method.
Base.@propagate_inbounds function expand(blocks, workitems, groupidx::Integer, idx::Integer)
# This causes an exception branch and a div. NOTE(review): "this" presumably refers to
# the linear indexing below, with the assumes added to help the compiler — confirm.
assume_nonzero(blocks)
assume_nonzero(workitems)
expand(blocks, workitems, blocks[groupidx], workitems[idx])
end
# Combine an N-dimensional group index and an N-dimensional index within the group into
# a global CartesianIndex: per dimension, (group - 1) * size(workitems, dim) + local.
# `blocks` is accepted for signature compatibility but not read here.
@inline function expand(blocks, workitems, groupidx::CartesianIndex{N}, idx::CartesianIndex{N}) where {N}
    global_idx = ntuple(Val(N)) do dim
        Base.@_inline_meta
        (groupidx.I[dim] - 1) * size(workitems, dim) + idx.I[dim]
    end
    return CartesianIndex(global_idx)
end
# KernelAbstractions-free reproducer kernel: computes its global index with the local
# `expand` helper, counts loop iterations, and stores the count for in-range work-items.
# Returns `nothing`.
function gpu_kernel_xx!(ndrange, blocks, workitems, tensor, Nx::Int64, Ny::Int64; )
# Global index derived from the x components of the block and thread index.
I = @inbounds expand(blocks, workitems, AMDGPU.blockIdx().x, AMDGPU.threadIdx().x)
# Work-items whose index lies outside `ndrange` skip the body entirely.
if I in ndrange
# Same expand call as above, evaluated a second time and unpacked into three indices.
(i, j, k) = @inbounds expand(blocks, workitems, AMDGPU.blockIdx().x, AMDGPU.threadIdx().x).I
# Accumulator with the tensor's element type (the name shadows Base.sum locally).
sum = zero(eltype(tensor))
# `1:Nx + 2` parses as 1:(Nx + 2); the inner loop runs over -Ny:Ny.
# p and q are not read, so the result is just the iteration count times 1.0.
for p = 1:Nx + 2
for q = -Ny:Ny
sum += 1.0
end
end
# Unchecked store; assumes every index in `ndrange` is valid for `tensor` — TODO confirm
@inbounds tensor[i, j, k] = sum
end
return nothing
end
# nx, ny, nz, Nx, Ny are presumably the globals set up by the earlier script — verify
tensor = AMDGPU.zeros(Float64, nx, ny, nz)
# Hand-built index spaces replacing what KernelAbstractions computed before:
# the valid index range, a single block, and 10 × 1 × 1 work-items per block.
ndrange = CartesianIndices((10,1,1))
blocks = CartesianIndices((1, 1, 1))
workitems = CartesianIndices((10, 1, 1))
# Direct launch of the KA-free kernel; the result is printed for comparison with the
# other backends' output.
@roc groupsize=512 gridsize=size(tensor) gpu_kernel_xx!(ndrange, blocks, workitems, tensor, Nx, Ny)
println("ka_direct:", tensor)
Hi, I noticed that the following script produces different results depending on the backend. On my machine, the output is:
Is there a mistake in the kernel function?