JuliaGPU / KernelAbstractions.jl

Heterogeneous programming in Julia
MIT License
369 stars 65 forks source link

LLVM error when using Atomix.@atomic with FP16 #486

Open ymtoo opened 3 months ago

ymtoo commented 3 months ago

With an FP16 input, the example

using CUDA, KernelAbstractions, Atomix

# Launch `my_kernel_fixed!` over every (i, j) index of `arr`, accumulating
# into an output array of the same shape/eltype that starts out all-zero.
# The backend defaults to whichever device `arr` lives on.
function index_fun_fixed(arr; backend=get_backend(arr))
    result = similar(arr)
    fill!(result, 0)
    launch! = my_kernel_fixed!(backend)
    launch!(result, arr, ndrange=(size(arr, 1), size(arr, 2)))
    return result
end

# Reproducer kernel: each work-item (i, j) atomically adds arr[i, j] into
# every element of column i of `out`. The Atomix.@atomic read-modify-write
# below is what lowers to the unselectable `AtomicLoadFAdd` f16 node in the
# LLVM error reported in this issue when the arrays are Float16 on CUDA.
@kernel function my_kernel_fixed!(out, arr)
    i, j = @index(Global, NTuple)
    for k in 1:size(out, 1)
        Atomix.@atomic out[k, i] += arr[i, j]
    end
end

# A Float16 input triggers the compile-time LLVM error shown below;
# the same call works fine with Float32 (noted later in this report).
img_f16 = zeros(Float16, (50, 50))
index_fun_fixed(CuArray(img_f16))

throws an error.

ERROR: LLVM error: Cannot select: 0x434d1250: f16,ch = AtomicLoadFAdd<(load store seq_cst (s16) on %ir.15, addrspace 1)> 0x43223230:1, 0x434d0fe0, 0x43223230, /home/arl/.julia/packages/LLVM/6cDbl/src/interop/base.jl:38 @[ /home/arl/.julia/packages/UnsafeAtomicsLLVM/R7s6h/src/atomics.jl:259 @[ /home/arl/.julia/packages/UnsafeAtomicsLLVM/R7s6h/src/atomics.jl:259 @[ /home/arl/.julia/packages/UnsafeAtomicsLLVM/R7s6h/src/atomics.jl:363 @[ /home/arl/.julia/packages/UnsafeAtomicsLLVM/R7s6h/src/internal.jl:20 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:33 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ]
  0x434d0fe0: i64 = add 0x43222f58, Constant:i64<-2>, /home/arl/.julia/packages/LLVM/6cDbl/src/interop/base.jl:38 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:114 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:114 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:147 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:52 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/references.jl:103 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ]
    0x43222f58: i64 = add 0x434cd998, 0x43222598, /home/arl/.julia/packages/LLVM/6cDbl/src/interop/base.jl:38 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:114 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:114 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:147 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:52 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/references.jl:103 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ]
      0x434cd998: i64,ch = CopyFromReg 0x45ac35c8, Register:i64 %5, /home/arl/.julia/packages/LLVM/6cDbl/src/interop/base.jl:38 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:114 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:114 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:147 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:52 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/references.jl:103 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ]
        0x41eb8eb8: i64 = Register %5
      0x43222598: i64 = shl 0x43222600, Constant:i32<1>, int.jl:88 @[ abstractarray.jl:1244 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:52 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/references.jl:103 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ]
        0x43222600: i64 = add 0x43222bb0, 0x434d0a98, int.jl:87 @[ abstractarray.jl:2989 @[ abstractarray.jl:2989 @[ abstractarray.jl:2973 @[ abstractarray.jl:2957 @[ abstractarray.jl:1330 @[ abstractarray.jl:1324 @[ abstractarray.jl:1291 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/references.jl:102 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ] ] ] ]
          0x43222bb0: i64 = mul 0x434cde78, 0x41eb9538, int.jl:88 @[ abstractarray.jl:2989 @[ abstractarray.jl:2989 @[ abstractarray.jl:2973 @[ abstractarray.jl:2957 @[ abstractarray.jl:1330 @[ abstractarray.jl:1324 @[ abstractarray.jl:1291 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/references.jl:102 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ] ] ] ]
            0x434cde78: i64 = AssertZext 0x41eb97a8, ValueType:ch:i63, int.jl:88 @[ abstractarray.jl:2989 @[ abstractarray.jl:2989 @[ abstractarray.jl:2973 @[ abstractarray.jl:2957 @[ abstractarray.jl:1330 @[ abstractarray.jl:1324 @[ abstractarray.jl:1291 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/references.jl:102 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ] ] ] ]
              0x41eb97a8: i64,ch = CopyFromReg 0x45ac35c8, Register:i64 %23, int.jl:88 @[ abstractarray.jl:2989 @[ abstractarray.jl:2989 @[ abstractarray.jl:2973 @[ abstractarray.jl:2957 @[ abstractarray.jl:1330 @[ abstractarray.jl:1324 @[ abstractarray.jl:1291 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/references.jl:102 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ] ] ] ]
                0x434d08f8: i64 = Register %23
            0x41eb9538: i64,ch = CopyFromReg 0x45ac35c8, Register:i64 %24, int.jl:88 @[ abstractarray.jl:2989 @[ abstractarray.jl:2989 @[ abstractarray.jl:2973 @[ abstractarray.jl:2957 @[ abstractarray.jl:1330 @[ abstractarray.jl:1324 @[ abstractarray.jl:1291 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/references.jl:102 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ] ] ] ]
              0x43222c18: i64 = Register %24
          0x434d0a98: i64,ch = CopyFromReg 0x45ac35c8, Register:i64 %12, int.jl:87 @[ abstractarray.jl:2989 @[ abstractarray.jl:2989 @[ abstractarray.jl:2973 @[ abstractarray.jl:2957 @[ abstractarray.jl:1330 @[ abstractarray.jl:1324 @[ abstractarray.jl:1291 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/references.jl:102 @[ /home/arl/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ] ] ] ]
            0x434cde10: i64 = Register %12
        0x41eb9190: i32 = Constant<1>
    0x41eb8aa8: i64 = Constant<-2>
  0x43223230: f16,ch = load<(load (s16) from %ir.64, !tbaa !511, addrspace 1)> 0x45ac35c8, 0x434d1180, undef:i64, /home/arl/.julia/packages/LLVM/6cDbl/src/interop/base.jl:38 @[ none:0 @[ none:0 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:85 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:91 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:85 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:164 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:175 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ] ]
    0x434d1180: i64,ch = CopyFromReg 0x45ac35c8, Register:i64 %27, /home/arl/.julia/packages/LLVM/6cDbl/src/interop/base.jl:38 @[ none:0 @[ none:0 @[ /home/arl/.julia/packages/LLVM/6cDbl/src/interop/pointer.jl:85 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:91 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:85 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:164 @[ /home/arl/.julia/packages/CUDA/75aiI/src/device/array.jl:175 @[ REPL[101]:4 @[ /home/arl/.julia/packages/KernelAbstractions/vMaNm/src/macros.jl:95 @[ none:0 ] ] ] ] ] ] ] ] ] ]
      0x43222a78: i64 = Register %27
    0x43222ce8: i64 = undef
In function: _Z20gpu_my_kernel_fixed_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi2E5TupleI5OneToI5Int64ES4_IS5_EEE7NDRangeILi2ES0_S0_S2_ILi2ES3_IS4_IS5_ES4_IS5_EEES2_ILi2ES3_IS4_IS5_ES4_IS5_EEEEE13CuDeviceArrayI7Float16Li2ELi1EES7_IS8_Li2ELi1EE
Stacktrace:
  [1] handle_error(reason::Cstring)
    @ LLVM ~/.julia/packages/LLVM/6cDbl/src/core/context.jl:168
  [2] LLVMTargetMachineEmitToMemoryBuffer(T::LLVM.TargetMachine, M::LLVM.Module, codegen::LLVM.API.LLVMCodeGenFileType, ErrorMessage::Base.RefValue{…}, OutMemBuf::Base.RefValue{…})
    @ LLVM.API ~/.julia/packages/LLVM/6cDbl/lib/15/libLLVM.jl:5318
  [3] emit(tm::LLVM.TargetMachine, mod::LLVM.Module, filetype::LLVM.API.LLVMCodeGenFileType)
    @ LLVM ~/.julia/packages/LLVM/6cDbl/src/targetmachine.jl:45
  [4] mcgen
    @ ~/.julia/packages/GPUCompiler/nWT2N/src/mcgen.jl:84 [inlined]
  [5] mcgen(job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, mod::LLVM.Module, format::LLVM.API.LLVMCodeGenFileType)
    @ CUDA ~/.julia/packages/CUDA/75aiI/src/compiler/compilation.jl:131
  [6] macro expansion
    @ ~/.julia/packages/TimerOutputs/Lw5SP/src/TimerOutput.jl:253 [inlined]
  [7] macro expansion
    @ ~/.julia/packages/GPUCompiler/nWT2N/src/driver.jl:481 [inlined]
  [8] macro expansion
    @ ~/.julia/packages/TimerOutputs/Lw5SP/src/TimerOutput.jl:253 [inlined]
  [9] macro expansion
    @ ~/.julia/packages/GPUCompiler/nWT2N/src/driver.jl:478 [inlined]
 [10] emit_asm(job::GPUCompiler.CompilerJob, ir::LLVM.Module; strip::Bool, validate::Bool, format::LLVM.API.LLVMCodeGenFileType)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/nWT2N/src/utils.jl:103
 [11] emit_asm
    @ ~/.julia/packages/GPUCompiler/nWT2N/src/utils.jl:97 [inlined]
 [12] 
    @ GPUCompiler ~/.julia/packages/GPUCompiler/nWT2N/src/driver.jl:156
 [13] codegen
    @ ~/.julia/packages/GPUCompiler/nWT2N/src/driver.jl:115 [inlined]
 [14] 
    @ GPUCompiler ~/.julia/packages/GPUCompiler/nWT2N/src/driver.jl:111
 [15] compile
    @ ~/.julia/packages/GPUCompiler/nWT2N/src/driver.jl:103 [inlined]
 [16] #1145
    @ ~/.julia/packages/CUDA/75aiI/src/compiler/compilation.jl:254 [inlined]
 [17] JuliaContext(f::CUDA.var"#1145#1148"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}}; kwargs::@Kwargs{})
    @ GPUCompiler ~/.julia/packages/GPUCompiler/nWT2N/src/driver.jl:52
 [18] JuliaContext(f::Function)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/nWT2N/src/driver.jl:42
 [19] compile(job::GPUCompiler.CompilerJob)
    @ CUDA ~/.julia/packages/CUDA/75aiI/src/compiler/compilation.jl:253
 [20] actual_compilation(cache::Dict{…}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{…}, compiler::typeof(CUDA.compile), linker::typeof(CUDA.link))
    @ GPUCompiler ~/.julia/packages/GPUCompiler/nWT2N/src/execution.jl:128
 [21] cached_compilation(cache::Dict{…}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{…}, compiler::Function, linker::Function)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/nWT2N/src/execution.jl:103
 [22] macro expansion
    @ ~/.julia/packages/CUDA/75aiI/src/compiler/execution.jl:369 [inlined]
 [23] macro expansion
    @ ./lock.jl:267 [inlined]
 [24] cufunction(f::typeof(gpu_my_kernel_fixed!), tt::Type{Tuple{…}}; kwargs::@Kwargs{always_inline::Bool, maxthreads::Nothing})
    @ CUDA ~/.julia/packages/CUDA/75aiI/src/compiler/execution.jl:364
 [25] macro expansion
    @ ~/.julia/packages/CUDA/75aiI/src/compiler/execution.jl:112 [inlined]
 [26] (::KernelAbstractions.Kernel{…})(::CuArray{…}, ::Vararg{…}; ndrange::Tuple{…}, workgroupsize::Nothing)
    @ CUDA.CUDAKernels ~/.julia/packages/CUDA/75aiI/src/CUDAKernels.jl:103
 [27] Kernel
    @ ~/.julia/packages/CUDA/75aiI/src/CUDAKernels.jl:89 [inlined]
 [28] #index_fun_fixed#1
    @ ./REPL[96]:5 [inlined]
 [29] index_fun_fixed(arr::CuArray{Float16, 2, CUDA.DeviceMemory})
    @ Main ./REPL[96]:1
 [30] top-level scope
    @ REPL[110]:1
Some type information was truncated. Use `show(err)` to see complete types.

It works fine on FP32 inputs.

Julia and package version:

julia> versioninfo()
Julia Version 1.10.4
Commit 48d4fd48430 (2024-06-04 10:41 UTC)
Build Info:
  Official https://julialang.org/ release
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  CPU: 24 × AMD Ryzen 9 5900X 12-Core Processor
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-15.0.7 (ORCJIT, znver3)
Threads: 1 default, 0 interactive, 1 GC (on 24 virtual cores)

(TestKA) pkg> st
Project TestKA v0.1.10
Status `~/Projects/TestKA.jl/Project.toml`
  [a9b6321e] Atomix v0.1.0
  [052768ef] CUDA v5.4.2
  [63c18a36] KernelAbstractions v0.9.21
vchuravy commented 3 months ago

This is tricky to fix.

IIRC one needs to perform a CAS loop for < 4-byte atomics.

Atomix should use https://github.com/JuliaConcurrent/Atomix.jl/blob/e60c518e3ffd2c9d4e96104f16f2a970a69e4289/lib/AtomixCUDA/src/AtomixCUDA.jl#L38

Which does claim to support Float16: https://github.com/JuliaGPU/CUDA.jl/blob/14de0097ff7c26932cc4a175840961cc7d3f396e/src/device/intrinsics/atomics.jl#L195

What is ]status -m

x-ref: https://github.com/JuliaGPU/CUDA.jl/pull/1790

vchuravy commented 3 months ago

It might be that we end up in https://github.com/JuliaConcurrent/UnsafeAtomicsLLVM.jl instead of UnsafeAtomicsCUDA.jl

ymtoo commented 3 months ago

This is tricky to fix.

IIRC one needs to perform a cas loop for < 4 byte atomics.

Atomix should use https://github.com/JuliaConcurrent/Atomix.jl/blob/e60c518e3ffd2c9d4e96104f16f2a970a69e4289/lib/AtomixCUDA/src/AtomixCUDA.jl#L38

Which does claim to support Float16: https://github.com/JuliaGPU/CUDA.jl/blob/14de0097ff7c26932cc4a175840961cc7d3f396e/src/device/intrinsics/atomics.jl#L195

What is ]status -m

x-ref: JuliaGPU/CUDA.jl#1790

(jl_hHeJiL) pkg> status -m
Status `/tmp/jl_hHeJiL/Manifest.toml`
[621f4979] AbstractFFTs v1.5.0
[79e6a3ab] Adapt v4.0.4
[a9b6321e] Atomix v0.1.0
[ab4f0b2a] BFloat16s v0.5.0
[fa961155] CEnum v0.5.0
[052768ef] CUDA v5.4.2
[1af6417a] CUDA_Runtime_Discovery v0.3.4
[3da002f7] ColorTypes v0.11.5
[5ae59095] Colors v0.12.11
[34da2185] Compat v4.15.0
[a8cc5b0e] Crayons v4.1.1
[9a962f9c] DataAPI v1.16.0
[a93c6f00] DataFrames v1.6.1
[864edb3b] DataStructures v0.18.20
[e2d170a0] DataValueInterfaces v1.0.0
[e2ba6199] ExprTools v0.1.10
[53c48c17] FixedPointNumbers v0.8.5
[0c68f7d7] GPUArrays v10.2.1
[46192b85] GPUArraysCore v0.1.6
⌃ [61eb1bfa] GPUCompiler v0.26.5
[842dd82b] InlineStrings v1.4.1
[41ab1584] InvertedIndices v1.3.0
[82899510] IteratorInterfaceExtensions v1.0.0
[692b3bcd] JLLWrappers v1.5.0
[63c18a36] KernelAbstractions v0.9.21
⌅ [929cbde3] LLVM v7.2.1
[8b046642] LLVMLoopInfo v1.0.0
[b964fa9f] LaTeXStrings v1.3.1
[1914dd2f] MacroTools v0.5.13
[e1d29d7a] Missings v1.2.0
[5da4648a] NVTX v0.3.4
[bac558e1] OrderedCollections v1.6.3
[69de0a69] Parsers v2.8.1
[2dfb63ee] PooledArrays v1.4.3
[aea7be01] PrecompileTools v1.2.1
[21216c6a] Preferences v1.4.3
[08abe8d2] PrettyTables v2.3.2
[74087812] Random123 v1.7.0
[e6cf234a] RandomNumbers v1.5.3
[189a3867] Reexport v1.2.2
[ae029012] Requires v1.3.0
[6c6a2e73] Scratch v1.2.1
[91c51154] SentinelArrays v1.4.3
[a2af1166] SortingAlgorithms v1.2.1
[90137ffa] StaticArrays v1.9.6
[1e83bf80] StaticArraysCore v1.4.3
[892a3eda] StringManipulation v0.3.4
[3783bdb8] TableTraits v1.0.1
[bd369af6] Tables v1.11.1
[a759f4b9] TimerOutputs v0.5.24
[013be700] UnsafeAtomics v0.2.1
[d80eeb9a] UnsafeAtomicsLLVM v0.1.5
[4ee394cb] CUDA_Driver_jll v0.9.0+0
[76a88914] CUDA_Runtime_jll v0.14.0+1
[9c1d0b0a] JuliaNVTXCallbacks_jll v0.2.1+0
⌅ [dad2f222] LLVMExtra_jll v0.0.29+0
[e98f9f5b] NVTX_jll v3.1.0+2
[0dad84c5] ArgTools v1.1.1
[56f22d72] Artifacts
[2a0f44e3] Base64
[ade2ca70] Dates
[f43a241f] Downloads v1.6.0
[7b1f6079] FileWatching
[9fa8497b] Future
[b77e0a4c] InteractiveUtils
[4af54fe1] LazyArtifacts
[b27032c2] LibCURL v0.6.4
[76f85450] LibGit2
[8f399da3] Libdl
[37e2e46d] LinearAlgebra
[56ddb016] Logging
[d6f4376e] Markdown
[ca575930] NetworkOptions v1.2.0
[44cfe95a] Pkg v1.10.0
[de0858da] Printf
[3fa0cd96] REPL
[9a3f8284] Random
[ea8e919c] SHA v0.7.0
[9e88b42a] Serialization
[6462fe0b] Sockets
[2f01184e] SparseArrays v1.10.0
[10745b16] Statistics v1.10.0
[fa267f1f] TOML v1.0.3
[a4e569a6] Tar v1.10.0
[8dfed614] Test
[cf7118a7] UUIDs
[4ec0a83e] Unicode
[e66e0078] CompilerSupportLibraries_jll v1.1.1+0
[deac9b47] LibCURL_jll v8.4.0+0
[e37daf67] LibGit2_jll v1.6.4+0
[29816b5a] LibSSH2_jll v1.11.0+1
[c8ffd9c3] MbedTLS_jll v2.28.2+1
[14a3606d] MozillaCACerts_jll v2023.1.10
[4536629a] OpenBLAS_jll v0.3.23+4
[bea87d4a] SuiteSparse_jll v7.2.1+1
[83775a58] Zlib_jll v1.2.13+1
[8e850b90] libblastrampoline_jll v5.8.0+1
[8e850ede] nghttp2_jll v1.52.0+1
[3f19e933] p7zip_jll v17.4.0+2
Info Packages marked with ⌃ and ⌅ have new versions available. Those with ⌃ may be upgradable, but those with ⌅ are restricted by compatibility constraints from upgrading. To see why use `status --outdated -m`
vchuravy commented 3 months ago

What happens when you load AtomixCUDA?

ymtoo commented 3 months ago

The error still occurs after adding and loading AtomixCUDA.

(jl_OV6Zim) pkg> st
Status `/tmp/jl_OV6Zim/Project.toml`
  [a9b6321e] Atomix v0.1.0
  [6171a885] AtomixCUDA v0.1.0-DEV `https://github.com/JuliaConcurrent/Atomix.jl#main:lib/AtomixCUDA`
  [052768ef] CUDA v5.4.2
  [63c18a36] KernelAbstractions v0.9.22

julia> using AtomixCUDA
vchuravy commented 3 months ago

I won't be able to look at this in detail until August.

For now I would recommend just writing a CUDA.jl kernel and using CUDA.@atomic