EnzymeAD / Enzyme.jl

Julia bindings for the Enzyme automatic differentiator
https://enzyme.mit.edu
MIT License
440 stars 62 forks source link

CUDA.@atomic error in GPU kernel #511

Open jgreener64 opened 1 year ago

jgreener64 commented 1 year ago

I am on Julia 1.8.1 and Enzyme at commit e452f8932fc602989df23d96e5039a3268e5e965. The following works:

using CUDA, Enzyme, StaticArrays, LinearAlgebra

# Set the CUDA device-side malloc heap size limit to 1 GiB (1 * 1024^3 bytes).
# NOTE(review): presumably required because the differentiated kernel allocates
# on the device — confirm.
CUDA.limit!(CUDA.CU_LIMIT_MALLOC_HEAP_SIZE, 1*1024^3)

# Per-atom interaction parameters read by `force`, which combines the values of
# the two interacting atoms: σ by arithmetic mean, ϵ by geometric mean.
struct Atom
    σ::Float32
    ϵ::Float32
end

"""
    find_neighbors(coords; cutoff=1.0)

Return every index pair `(i, j)` with `i < j` whose coordinates are at most
`cutoff` apart, measured with `norm(coords[i] - coords[j])`.

# Arguments
- `coords`: 1-based indexable collection of coordinate vectors.

# Keywords
- `cutoff`: largest distance (inclusive) at which two entries count as neighbors.

# Returns
A `Vector{Tuple{Int, Int}}`, ordered by increasing `i`, then increasing `j`.
It is empty when `coords` has fewer than two entries.
"""
function find_neighbors(coords; cutoff=1.0)
    n_atoms = length(coords)
    neighbors = Tuple{Int, Int}[]
    for i in 1:n_atoms
        # Start at i + 1 so each unordered pair is visited exactly once.
        for j in (i + 1):n_atoms
            if norm(coords[i] - coords[j]) <= cutoff
                push!(neighbors, (i, j))
            end
        end
    end
    return neighbors
end

# Problem setup: random coordinates, identical atom parameters, and a neighbor
# list built on the host. Each of these is also copied into a CuArray for the kernel.
n_atoms = 1024
# Random 3-component Float32 coordinates, each scaled by 2.7f0.
coords = rand(SVector{3, Float32}, n_atoms) .* 2.7f0
# Every atom gets the same σ and ϵ.
atoms = [Atom(0.02f0, 0.02f0) for _ in 1:n_atoms]
cu_coords = CuArray(coords)
cu_atoms = CuArray(atoms)
# Pair list computed from the host copy of the coordinates.
neighbors = find_neighbors(coords)
cu_neighbors = CuArray(neighbors)

"""
    force(c1, c2, a1, a2)

Compute the pairwise force vector between two atoms at coordinates `c1` and
`c2` with parameters `a1` and `a2` (anything exposing `σ` and `ϵ` fields).

The two σ values are combined by arithmetic mean and the two ϵ values by
geometric mean. With `r² = |c2 - c1|²`, the result is

    (24ϵ / r²) * (2(σ²/r²)⁶ - (σ²/r²)³) * (c2 - c1)

which has the form of a 12-6 Lennard-Jones interaction. The returned vector
points along `c2 - c1` when the scalar factor is positive.
"""
function force(c1, c2, a1, a2)
    # Mixed per-pair parameters.
    sigma_mix = (a1.σ + a2.σ) / 2
    eps_mix = sqrt(a1.ϵ * a2.ϵ)

    # Separation vector and inverse squared distance.
    displacement = c2 - c1
    inv_dist_sq = inv(sum(abs2, displacement))

    # (σ² / r²)³, i.e. the sixth-power term of the interaction.
    sr6 = (sigma_mix^2 * inv_dist_sq) ^ 3
    prefactor = 24 * eps_mix * inv_dist_sq
    magnitude = prefactor * (2 * sr6 ^ 2 - sr6)
    return magnitude * displacement
end

# GPU kernel that accumulates pairwise forces into `forces`.
#
# Arguments:
#   forces        - device matrix updated in place; indexed as forces[component, atom]
#                   with components 1..3.
#   coords_var    - device array of coordinates, indexed by atom.
#   atoms_var     - device array of per-atom parameters, indexed by atom.
#   neighbors_var - device array of (i, j) atom index pairs.
#   ::Val{M}      - number of slots in the per-block staging arrays.
#   shared_fs     - caller-provided staging array, indexed as shared_fs[component, slot].
#
# Each thread walks the neighbor list in a grid-stride loop and stages the force
# and the two atom indices of every pair it handles. After a block-wide barrier,
# thread 1 of the block applies all staged entries to `forces`.
function kernel!(forces::CuDeviceMatrix{T}, coords_var, atoms_var, neighbors_var,
                 ::Val{M}, shared_fs) where {T, M}
    # The inputs are only read in this kernel; wrap them in CUDA.Const.
    coords = CUDA.Const(coords_var)
    atoms = CUDA.Const(atoms_var)
    neighbors = CUDA.Const(neighbors_var)

    tidx = threadIdx().x
    # 1-based global thread index, and the total number of threads in the grid.
    inter_ig = (blockIdx().x - 1) * blockDim().x + tidx
    stride = gridDim().x * blockDim().x
    # Per-block staging arrays holding the atom indices of each staged pair.
    shared_is = CuStaticSharedArray(Int32, M)
    shared_js = CuStaticSharedArray(Int32, M)

    # Thread 1 clears shared_is. A zero entry marks a slot that holds no pair
    # (see the `iszero(i)` check in the accumulation loop below).
    if tidx == 1
        for si in 1:M
            shared_is[si] = zero(Int32)
        end
    end
    sync_threads()

    # thread_i counts this thread's iterations; inter_i is the neighbor-list index.
    for (thread_i, inter_i) in enumerate(inter_ig:stride:length(neighbors))
        # Staging slot: iteration k of thread t uses slot (k - 1) * blockDim().x + t.
        # NOTE(review): assumes this never exceeds M for the launch configuration — confirm.
        si = (thread_i - 1) * blockDim().x + tidx
        i, j = neighbors[inter_i]
        f = force(coords[i], coords[j], atoms[i], atoms[j])
        shared_fs[1, si] = f[1]
        shared_fs[2, si] = f[2]
        shared_fs[3, si] = f[3]
        shared_is[si] = i
        shared_js[si] = j
    end
    # All staged values must be written before thread 1 reads them.
    sync_threads()

    if tidx == 1
        for si in 1:M
            i = shared_is[si]
            # Stop at the first slot that holds no pair.
            if iszero(i)
                break
            end
            j = shared_js[si]
            dx, dy, dz = shared_fs[1, si], shared_fs[2, si], shared_fs[3, si]
            # The staged force is subtracted from atom i and added to atom j.
            # These are plain (non-atomic) read-modify-write updates.
            forces[1, i] -= dx
            forces[2, i] -= dy
            forces[3, i] -= dz
            forces[1, j] += dx
            forces[2, j] += dy
            forces[3, j] += dz
            # The commented-out block below is the CUDA.@atomic variant of the
            # six updates above.
            #=CUDA.@atomic forces[1, i] -= dx
            CUDA.@atomic forces[2, i] -= dy
            CUDA.@atomic forces[3, i] -= dz
            CUDA.@atomic forces[1, j] += dx
            CUDA.@atomic forces[2, j] += dy
            CUDA.@atomic forces[3, j] += dz=#
        end
    end
    return
end

"""
    grad_kernel!(forces, d_forces, coords, d_coords, atoms, d_atoms,
                 neighbors, shared_mem_size::Val{M})

Device-side wrapper that differentiates `kernel!` with
`Enzyme.autodiff_deferred`.

`forces`, `coords` and `atoms` are passed as `Duplicated` together with their
shadows `d_forces`, `d_coords` and `d_atoms`; `neighbors` and `shared_mem_size`
are passed as `Const`. The `3 × M` staging buffer used by `kernel!` and its
shadow are allocated here with `CuStaticSharedArray` and passed as `Duplicated`.

The shadow staging buffer is zeroed before differentiation: shared memory is
not initialised on allocation, and the reverse pass accumulates into the
shadow, so stale contents would leak into the computed gradients.
"""
function grad_kernel!(forces::CuDeviceMatrix{T}, d_forces, coords, d_coords, atoms, d_atoms,
                      neighbors, shared_mem_size::Val{M}) where {T, M}
    shared_fs = CuStaticSharedArray(T, (3, M))
    d_shared_fs = CuStaticSharedArray(T, (3, M))

    # Cooperatively zero the shadow buffer: each thread clears the linear
    # indices congruent to its thread index modulo the block size.
    slot = threadIdx().x
    while slot <= length(d_shared_fs)
        d_shared_fs[slot] = zero(T)
        slot += blockDim().x
    end
    # Make the zeroed shadow visible to the whole block before it is used.
    sync_threads()

    Enzyme.autodiff_deferred(
        kernel!,
        Duplicated(forces, d_forces),
        Duplicated(coords, d_coords),
        Duplicated(atoms, d_atoms),
        Const(neighbors),
        Const(shared_mem_size),
        Duplicated(shared_fs, d_shared_fs),
    )
    return
end

# Primal force output, zero-initialised, with shape 3 × n_atoms.
cu_forces_mat = CuArray(zeros(Float32, 3, n_atoms))
# Shadow of the force output, filled with random values; passed to
# grad_kernel! as `d_forces`.
d_cu_forces_mat = CuArray(rand(Float32, 3, n_atoms))
# Zero-initialised shadows for the coordinates and the atom parameters.
d_cu_coords = zero(cu_coords)
d_cu_atoms = CuArray([Atom(0.0f0, 0.0f0) for _ in 1:n_atoms])
n_threads = 256
n_blocks = 800
# Passed as Val(shared_mem_size); becomes M, the number of staging slots per
# block in kernel!. With 256 threads per block this is two slots per thread.
shared_mem_size = 512

# Launch the differentiated kernel and wait for it to finish.
CUDA.@sync @cuda threads=n_threads blocks=n_blocks grad_kernel!(cu_forces_mat, d_cu_forces_mat,
        cu_coords, d_cu_coords, cu_atoms, d_cu_atoms, cu_neighbors, Val(shared_mem_size))

However, it errors when I change the forces[1, i] -= dx lines to use CUDA.@atomic (the commented-out variant in kernel! above). The truncated error message is:

Instruction does not dominate all uses!
  %"'ipc88" = bitcast i8 addrspace(1)* %"'ipg87" to float addrspace(1)*, !dbg !1622
  %346 = load atomic float, float addrspace(1)* %"'ipc88" acquire, align 4, !dbg !1622
Instruction does not dominate all uses!
  %"'ipc91" = bitcast i8 addrspace(1)* %"'ipg90" to float addrspace(1)*, !dbg !1610
  %349 = load atomic float, float addrspace(1)* %"'ipc91" acquire, align 4, !dbg !1610
Instruction does not dominate all uses!
  %"'ipc93" = bitcast i8 addrspace(1)* %"'ipg92" to float addrspace(1)*, !dbg !1598
  %352 = load atomic float, float addrspace(1)* %"'ipc93" acquire, align 4, !dbg !1598
Instruction does not dominate all uses!
  %"'ipc95" = bitcast i8 addrspace(1)* %"'ipg94" to float addrspace(1)*, !dbg !1576
  %355 = load atomic float, float addrspace(1)* %"'ipc95" acquire, align 4, !dbg !1576
Instruction does not dominate all uses!
  %"'ipc98" = bitcast i8 addrspace(1)* %"'ipg97" to float addrspace(1)*, !dbg !1562
  %362 = load atomic float, float addrspace(1)* %"'ipc98" acquire, align 4, !dbg !1562
Instruction does not dominate all uses!
  %"'ipc101" = bitcast i8 addrspace(1)* %"'ipg100" to float addrspace(1)*, !dbg !1548
  %369 = load atomic float, float addrspace(1)* %"'ipc101" acquire, align 4, !dbg !1548
; Function Attrs: mustprogress willreturn
define void @preprocess_julia_kernel__4472_inner20({ i8 addrspace(1)*, i64, [2 x i64], i64 } %0, { i8 addrspace(1)*, i64, [1 x i64], i64 } %1, { i8 addrspace(1)*, i64, [1 x i64], i64 } %2, { i8 addrspace(1)*, i64, [1 x i64], i64 } %3, { i8 addrspace(3)*, i64, [2 x i64], i64 } %4) local_unnamed_addr #139 !dbg !842 {
entry:
  %5 = alloca [1 x [3 x float]], align 4
  %6 = alloca [1 x [3 x float]], align 4
  %7 = alloca [2 x float], align 4
  %8 = alloca [2 x float], align 4
  %9 = alloca [1 x [3 x float]], align 4
  %.fca.0.extract26 = extractvalue { i8 addrspace(1)*, i64, [2 x i64], i64 } %0, 0, !dbg !843
  %.fca.2.0.extract30 = extractvalue { i8 addrspace(1)*, i64, [2 x i64], i64 } %0, 2, 0, !dbg !843
  %.fca.0.extract18 = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %1, 0, !dbg !843
  %.fca.2.0.extract22 = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %1, 2, 0, !dbg !843
  %.fca.0.extract10 = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %2, 0, !dbg !843
  %.fca.2.0.extract14 = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %2, 2, 0, !dbg !843
  %.fca.0.extract2 = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %3, 0, !dbg !843
  %.fca.2.0.extract6 = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %3, 2, 0, !dbg !843
  %.fca.0.extract = extractvalue { i8 addrspace(3)*, i64, [2 x i64], i64 } %4, 0, !dbg !843
  %.fca.2.0.extract = extractvalue { i8 addrspace(3)*, i64, [2 x i64], i64 } %4, 2, 0, !dbg !843
  %.fca.3.extract = extractvalue { i8 addrspace(3)*, i64, [2 x i64], i64 } %4, 3, !dbg !843
  %10 = bitcast [1 x [3 x float]]* %5 to i8*
  call void @llvm.lifetime.start.p0i8(i64 noundef 12, i8* noundef nonnull align 4 dereferenceable(12) %10) #143
  %11 = bitcast [1 x [3 x float]]* %6 to i8*
  call void @llvm.lifetime.start.p0i8(i64 noundef 12, i8* noundef nonnull align 4 dereferenceable(12) %11) #143
  %12 = bitcast [2 x float]* %7 to i8*
  call void @llvm.lifetime.start.p0i8(i64 noundef 8, i8* noundef nonnull align 4 dereferenceable(8) %12) #143
  %13 = bitcast [2 x float]* %8 to i8*
  call void @llvm.lifetime.start.p0i8(i64 noundef 8, i8* noundef nonnull align 4 dereferenceable(8) %13) #143
  %14 = bitcast [1 x [3 x float]]* %9 to i8*
  call void @llvm.lifetime.start.p0i8(i64 noundef 12, i8* noundef nonnull align 4 dereferenceable(12) %14) #143
  %15 = call {}*** @julia.get_pgcstack() #143
  %16 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #143, !dbg !844, !range !362
  %17 = add nuw nsw i32 %16, 1, !dbg !851
  %18 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #143, !dbg !852, !range !373
  %19 = zext i32 %18 to i64, !dbg !858
  %20 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #143, !dbg !860, !range !384
  %21 = zext i32 %20 to i64, !dbg !865
  %22 = mul nuw nsw i64 %21, %19, !dbg !867
  %23 = zext i32 %17 to i64, !dbg !869
  %24 = add nuw nsw i64 %22, %23, !dbg !871
  %25 = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() #143, !dbg !873, !range !403
  %26 = mul i32 %20, %25, !dbg !879
  %.not = icmp eq i32 %16, 0, !dbg !880
  %27 = bitcast i8 addrspace(1)* %.fca.0.extract2 to [2 x i64] addrspace(1)*, !dbg !883
  %28 = bitcast i8 addrspace(3)* %.fca.0.extract to float addrspace(3)*, !dbg !883
  br i1 %.not, label %L67.i.preheader, label %L102.i, !dbg !883

L67.i.preheader:                                  ; preds = %entry
  br label %L67.i, !dbg !884

L67.i:                                            ; preds = %L67.i.preheader, %L67.i
  %iv = phi i64 [ %iv.next, %L67.i ], [ 0, %L67.i.preheader ]
  %29 = shl i64 %iv, 2, !dbg !885
  %iv.next = add nuw nsw i64 %iv, 1, !dbg !885
  %30 = add i64 %29, 1, !dbg !885
  %value_phi.i.off = add nsw i64 %30, -1, !dbg !885
  %31 = getelementptr inbounds i32, i32 addrspace(3)* bitcast ([2048 x i8] addrspace(3)* @shmem to i32 addrspace(3)*), i64 %value_phi.i.off, !dbg !892
  store i32 0, i32 addrspace(3)* %31, align 4, !dbg !892, !tbaa !444
  %32 = add nuw nsw i64 %30, 1, !dbg !898
  %33 = getelementptr inbounds i32, i32 addrspace(3)* bitcast ([2048 x i8] addrspace(3)* @shmem to i32 addrspace(3)*), i64 %30, !dbg !892
  store i32 0, i32 addrspace(3)* %33, align 4, !dbg !892, !tbaa !444
  %34 = add nuw nsw i64 %30, 2, !dbg !898
  %35 = getelementptr inbounds i32, i32 addrspace(3)* bitcast ([2048 x i8] addrspace(3)* @shmem to i32 addrspace(3)*), i64 %32, !dbg !892
  store i32 0, i32 addrspace(3)* %35, align 4, !dbg !892, !tbaa !444
  %36 = getelementptr inbounds i32, i32 addrspace(3)* bitcast ([2048 x i8] addrspace(3)* @shmem to i32 addrspace(3)*), i64 %34, !dbg !892
  store i32 0, i32 addrspace(3)* %36, align 4, !dbg !892, !tbaa !444
  %.not95.3 = icmp eq i64 %30, 509, !dbg !900
  %37 = add nuw nsw i64 %30, 4, !dbg !898
  br i1 %.not95.3, label %L102.i.loopexit, label %L67.i, !dbg !884

L102.i.loopexit:                                  ; preds = %L67.i
  br label %L102.i, !dbg !902

L102.i:                                           ; preds = %L102.i.loopexit, %entry
  call void @llvm.nvvm.barrier0() #143, !dbg !902
  %38 = call fastcc i64 @julia_steprange_last_4556(i64 signext %24, i32 signext %26, i64 signext %.fca.2.0.extract6) #139, !dbg !904
  %39 = icmp ne i64 %24, %38, !dbg !909
  %40 = icmp sgt i32 %26, 0, !dbg !915
  %41 = icmp slt i64 %24, %38, !dbg !915
  %42 = xor i1 %40, %41, !dbg !917
  %43 = and i1 %39, %42, !dbg !918
  br i1 %43, label %L465.i, label %L132.i.preheader, !dbg !919

L132.i.preheader:                                 ; preds = %L102.i
  %44 = icmp sgt i64 %.fca.2.0.extract6, 0
  %45 = select i1 %44, i64 %.fca.2.0.extract6, i64 0
  %46 = icmp sgt i64 %.fca.2.0.extract22, 0
  %47 = select i1 %46, i64 %.fca.2.0.extract22, i64 0
  %48 = bitcast i8 addrspace(1)* %.fca.0.extract18 to [1 x [3 x float]] addrspace(1)*
  %.fca.0.0.gep61 = getelementptr inbounds [1 x [3 x float]], [1 x [3 x float]]* %5, i64 0, i64 0, i64 0
  %.fca.0.1.gep63 = getelementptr inbounds [1 x [3 x float]], [1 x [3 x float]]* %5, i64 0, i64 0, i64 1
  %.fca.0.2.gep65 = getelementptr inbounds [1 x [3 x float]], [1 x [3 x float]]* %5, i64 0, i64 0, i64 2
  %.fca.0.0.gep = getelementptr inbounds [1 x [3 x float]], [1 x [3 x float]]* %6, i64 0, i64 0, i64 0
  %.fca.0.1.gep = getelementptr inbounds [1 x [3 x float]], [1 x [3 x float]]* %6, i64 0, i64 0, i64 1
  %.fca.0.2.gep = getelementptr inbounds [1 x [3 x float]], [1 x [3 x float]]* %6, i64 0, i64 0, i64 2
  %49 = icmp sgt i64 %.fca.2.0.extract14, 0
  %50 = select i1 %49, i64 %.fca.2.0.extract14, i64 0
  %51 = bitcast i8 addrspace(1)* %.fca.0.extract10 to [2 x float] addrspace(1)*
  %.fca.0.gep57 = getelementptr inbounds [2 x float], [2 x float]* %7, i64 0, i64 0
  %.fca.1.gep59 = getelementptr inbounds [2 x float], [2 x float]* %7, i64 0, i64 1
  %.fca.0.gep53 = getelementptr inbounds [2 x float], [2 x float]* %8, i64 0, i64 0
  %.fca.1.gep55 = getelementptr inbounds [2 x float], [2 x float]* %8, i64 0, i64 1
  %52 = addrspacecast [1 x [3 x float]]* %5 to [1 x [3 x float]] addrspace(11)*
  %53 = addrspacecast [1 x [3 x float]]* %6 to [1 x [3 x float]] addrspace(11)*
  %54 = addrspacecast [2 x float]* %7 to [2 x float] addrspace(11)*
  %55 = addrspacecast [2 x float]* %8 to [2 x float] addrspace(11)*
  %56 = icmp sgt i64 %.fca.2.0.extract, 0
  %57 = select i1 %56, i64 %.fca.2.0.extract, i64 0
  %58 = icmp sgt i64 %.fca.3.extract, 0
  %59 = select i1 %58, i64 %.fca.3.extract, i64 0
  %60 = getelementptr inbounds [1 x [3 x float]], [1 x [3 x float]]* %9, i64 0, i64 0, i64 0
  %61 = getelementptr inbounds [1 x [3 x float]], [1 x [3 x float]]* %9, i64 0, i64 0, i64 1
  %62 = getelementptr inbounds [1 x [3 x float]], [1 x [3 x float]]* %9, i64 0, i64 0, i64 2
  %63 = sext i32 %26 to i64
  %64 = mul i64 %21, %19, !dbg !920
  %65 = sub i64 %23, 1, !dbg !920
  %66 = add i64 %64, %65, !dbg !920
  %67 = add i64 %66, 1, !dbg !920
  br label %L132.i, !dbg !920

L132.i:                                           ; preds = %L428.i, %L132.i.preheader
  %iv27 = phi i64 [ %iv.next28, %L428.i ], [ 0, %L132.i.preheader ]
  %68 = add i64 %iv27, 2, !dbg !924
  %iv.next28 = add nuw nsw i64 %iv27, 1, !dbg !924
  %69 = mul i64 %63, %iv27, !dbg !924
  %70 = add i64 %67, %69, !dbg !924
  %71 = add i64 %iv.next28, -1, !dbg !924
  %72 = mul i64 %71, %21, !dbg !926
  %73 = add i64 %72, %23, !dbg !928
  %74 = icmp slt i64 %70, 1, !dbg !930
  %75 = icmp sgt i64 %70, %45, !dbg !930
  %76 = or i1 %74, %75, !dbg !920
  br i1 %76, label %L163.i, label %L161.i, !dbg !920

L161.i:                                           ; preds = %L132.i
  %77 = add nsw i64 %70, -1, !dbg !934
  %.elt = getelementptr inbounds [2 x i64], [2 x i64] addrspace(1)* %27, i64 %77, i64 0, !dbg !941
  %.unpack = load i64, i64 addrspace(1)* %.elt, align 8, !dbg !941, !tbaa !511
  %.elt98 = getelementptr inbounds [2 x i64], [2 x i64] addrspace(1)* %27, i64 %77, i64 1, !dbg !941
  %.unpack99 = load i64, i64 addrspace(1)* %.elt98, align 8, !dbg !941, !tbaa !511
  %78 = icmp slt i64 %.unpack, 1, !dbg !942
  %79 = icmp sgt i64 %.unpack, %47, !dbg !942
  %80 = or i1 %78, %79, !dbg !949
  br i1 %80, label %L188.i, label %L186.i, !dbg !949

L163.i:                                           ; preds = %L132.i
  %81 = call fastcc nonnull {} addrspace(10)* @julia__throw_boundserror_4539() #144, !dbg !920
  unreachable, !dbg !920

L186.i:                                           ; preds = %L161.i
  %82 = add nsw i64 %.unpack, -1, !dbg !950
  %.unpack100.elt = getelementptr inbounds [1 x [3 x float]], [1 x [3 x float]] addrspace(1)* %48, i64 %82, i64 0, i64 0, !dbg !957
  %.unpack100.unpack = load float, float addrspace(1)* %.unpack100.elt, align 4, !dbg !957, !tbaa !511
  %.unpack100.elt101 = getelementptr inbounds [1 x [3 x float]], [1 x [3 x float]] addrspace(1)* %48, i64 %82, i64 0, i64 1, !dbg !957
  %.unpack100.unpack102 = load float, float addrspace(1)* %.unpack100.elt101, align 4, !dbg !957, !tbaa !511
  %.unpack100.elt103 = getelementptr inbounds [1 x [3 x float]], [1 x [3 x float]] addrspace(1)* %48, i64 %82, i64 0, i64 2, !dbg !957
  %.unpack100.unpack104 = load float, float addrspace(1)* %.unpack100.elt103, align 4, !dbg !957, !tbaa !511
  store float %.unpack100.unpack, float* %.fca.0.0.gep61, align 4, !dbg !957
  store float %.unpack100.unpack102, float* %.fca.0.1.gep63, align 4, !dbg !957
  store float %.unpack100.unpack104, float* %.fca.0.2.gep65, align 4, !dbg !957
  %83 = icmp slt i64 %.unpack99, 1, !dbg !942
  %84 = icmp sgt i64 %.unpack99, %47, !dbg !942
  %85 = or i1 %83, %84, !dbg !949
  br i1 %85, label %L211.i, label %L209.i, !dbg !949

L188.i:                                           ; preds = %L161.i
  %86 = call fastcc nonnull {} addrspace(10)* @julia__throw_boundserror_4542() #144, !dbg !949
  unreachable, !dbg !949

L209.i:                                           ; preds = %L186.i
  %87 = add nsw i64 %.unpack99, -1, !dbg !950
  %.unpack106.elt = getelementptr inbounds [1 x [3 x float]], [1 x [3 x float]] addrspace(1)* %48, i64 %87, i64 0, i64 0, !dbg !957
  %.unpack106.unpack = load float, float addrspace(1)* %.unpack106.elt, align 4, !dbg !957, !tbaa !511
  %.unpack106.elt107 = getelementptr inbounds [1 x [3 x float]], [1 x [3 x float]] addrspace(1)* %48, i64 %87, i64 0, i64 1, !dbg !957
  %.unpack106.unpack108 = load float, float addrspace(1)* %.unpack106.elt107, align 4, !dbg !957, !tbaa !511
  %.unpack106.elt109 = getelementptr inbounds [1 x [3 x float]], [1 x [3 x float]] addrspace(1)* %48, i64 %87, i64 0, i64 2, !dbg !957
  %.unpack106.unpack110 = load float, float addrspace(1)* %.unpack106.elt109, align 4, !dbg !957, !tbaa !511
  store float %.unpack106.unpack, float* %.fca.0.0.gep, align 4, !dbg !957
  store float %.unpack106.unpack108, float* %.fca.0.1.gep, align 4, !dbg !957
  store float %.unpack106.unpack110, float* %.fca.0.2.gep, align 4, !dbg !957
  %88 = icmp sgt i64 %.unpack, %50, !dbg !942
  br i1 %88, label %L234.i, label %L232.i, !dbg !949

L211.i:                                           ; preds = %L186.i
  %89 = call fastcc nonnull {} addrspace(10)* @julia__throw_boundserror_4542() #144, !dbg !949
  unreachable, !dbg !949

L232.i:                                           ; preds = %L209.i
  %.elt112 = getelementptr inbounds [2 x float], [2 x float] addrspace(1)* %51, i64 %82, i64 0, !dbg !957
  %.unpack113 = load float, float addrspace(1)* %.elt112, align 4, !dbg !957, !tbaa !511
  %.elt114 = getelementptr inbounds [2 x float], [2 x float] addrspace(1)* %51, i64 %82, i64 1, !dbg !957
  %.unpack115 = load float, float addrspace(1)* %.elt114, align 4, !dbg !957, !tbaa !511
  store float %.unpack113, float* %.fca.0.gep57, align 4, !dbg !957
  store float %.unpack115, float* %.fca.1.gep59, align 4, !dbg !957
  %90 = icmp sgt i64 %.unpack99, %50, !dbg !942
  br i1 %90, label %L257.i, label %L255.i, !dbg !949

L234.i:                                           ; preds = %L209.i
  %91 = call fastcc nonnull {} addrspace(10)* @julia__throw_boundserror_4545() #144, !dbg !949
  unreachable, !dbg !949

L255.i:                                           ; preds = %L232.i
  %.elt116 = getelementptr inbounds [2 x float], [2 x float] addrspace(1)* %51, i64 %87, i64 0, !dbg !957
  %.unpack117 = load float, float addrspace(1)* %.elt116, align 4, !dbg !957, !tbaa !511
  %.elt118 = getelementptr inbounds [2 x float], [2 x float] addrspace(1)* %51, i64 %87, i64 1, !dbg !957
  %.unpack119 = load float, float addrspace(1)* %.elt118, align 4, !dbg !957, !tbaa !511
  store float %.unpack117, float* %.fca.0.gep53, align 4, !dbg !957
  store float %.unpack119, float* %.fca.1.gep55, align 4, !dbg !957
  call fastcc void @julia_force_4554([1 x [3 x float]]* noalias nocapture nofree noundef nonnull writeonly sret([1 x [3 x float]]) align 4 dereferenceable(12) %9, [1 x [3 x float]] addrspace(11)* nocapture nofree noundef nonnull readonly align 4 dereferenceable(12) %52, [1 x [3 x float]] addrspace(11)* nocapture nofree noundef nonnull readonly align 4 dereferenceable(12) %53, [2 x float] addrspace(11)* nocapture nofree noundef nonnull readonly align 4 dereferenceable(8) %54, [2 x float] addrspace(11)* nocapture nofree noundef nonnull readonly align 4 dereferenceable(8) %55) #139, !dbg !958
  %92 = add i64 %73, -1, !dbg !959
  %93 = mul i64 %57, %92, !dbg !968
  %94 = add i64 %93, 1, !dbg !969
  %95 = icmp ugt i64 %93, 9223372036854775806, !dbg !970
  %96 = icmp sgt i64 %94, %59, !dbg !970
  %97 = or i1 %95, %96, !dbg !976
  br i1 %97, label %L294.i, label %L292.i, !dbg !976

L257.i:                                           ; preds = %L232.i
  %98 = call fastcc nonnull {} addrspace(10)* @julia__throw_boundserror_4545() #144, !dbg !949
  unreachable, !dbg !949

L292.i:                                           ; preds = %L255.i
  %99 = load float, float* %60, align 4, !dbg !977, !tbaa !522
  %100 = getelementptr inbounds float, float addrspace(3)* %28, i64 %93, !dbg !977
  store float %99, float addrspace(3)* %100, align 4, !dbg !977, !tbaa !444
  %101 = add nuw i64 %93, 2, !dbg !983
  %102 = icmp slt i64 %101, 1, !dbg !991
  %103 = icmp sgt i64 %101, %59, !dbg !991
  %104 = or i1 %102, %103, !dbg !997
  br i1 %104, label %L331.i, label %L329.i, !dbg !997

L294.i:                                           ; preds = %L255.i
  %105 = call fastcc nonnull {} addrspace(10)* @julia__throw_boundserror_4548() #144, !dbg !976
  unreachable, !dbg !976

L329.i:                                           ; preds = %L292.i
  %106 = load float, float* %61, align 4, !dbg !998, !tbaa !522
  %107 = getelementptr inbounds float, float addrspace(3)* %28, i64 %94, !dbg !998
  store float %106, float addrspace(3)* %107, align 4, !dbg !998, !tbaa !444
  %108 = add nuw i64 %93, 3, !dbg !1004
  %109 = icmp slt i64 %108, 1, !dbg !1012
  %110 = icmp sgt i64 %108, %59, !dbg !1012
  %111 = or i1 %109, %110, !dbg !1018
  br i1 %111, label %L368.i, label %L366.i, !dbg !1018

L331.i:                                           ; preds = %L292.i
  %112 = call fastcc nonnull {} addrspace(10)* @julia__throw_boundserror_4548() #144, !dbg !997
  unreachable, !dbg !997

L366.i:                                           ; preds = %L329.i
  %113 = load float, float* %62, align 4, !dbg !1019, !tbaa !522
  %114 = getelementptr inbounds float, float addrspace(3)* %28, i64 %101, !dbg !1019
  store float %113, float addrspace(3)* %114, align 4, !dbg !1019, !tbaa !444
  %115 = trunc i64 %.unpack to i32, !dbg !1025
  %116 = icmp ugt i64 %.unpack, 2147483647, !dbg !1031
  br i1 %116, label %L384.i, label %L383.i, !dbg !1031

L368.i:                                           ; preds = %L329.i
  %117 = call fastcc nonnull {} addrspace(10)* @julia__throw_boundserror_4548() #144, !dbg !1018
  unreachable, !dbg !1018

L383.i:                                           ; preds = %L366.i
  %118 = icmp ugt i64 %92, 511, !dbg !1032
  br i1 %118, label %L400.i, label %L398.i, !dbg !1037

L384.i:                                           ; preds = %L366.i
  %119 = call fastcc nonnull {} addrspace(10)* @julia__throw_inexacterror_4551() #144, !dbg !1031
  unreachable, !dbg !1031

L398.i:                                           ; preds = %L383.i
  %120 = getelementptr inbounds i32, i32 addrspace(3)* bitcast ([2048 x i8] addrspace(3)* @shmem to i32 addrspace(3)*), i64 %92, !dbg !1038
  store i32 %115, i32 addrspace(3)* %120, align 4, !dbg !1038, !tbaa !444
  %121 = icmp ugt i64 %.unpack99, 2147483647, !dbg !1044
  br i1 %121, label %L414.i, label %L428.i, !dbg !1044

L400.i:                                           ; preds = %L383.i
  %122 = call fastcc nonnull {} addrspace(10)* @julia__throw_boundserror_4536() #144, !dbg !1037
  unreachable, !dbg !1037

L414.i:                                           ; preds = %L398.i
  %123 = call fastcc nonnull {} addrspace(10)* @julia__throw_inexacterror_4551() #144, !dbg !1044
  unreachable, !dbg !1044

L428.i:                                           ; preds = %L398.i
  %124 = trunc i64 %.unpack99 to i32, !dbg !1050
  %125 = getelementptr inbounds i32, i32 addrspace(3)* bitcast ([2048 x i8] addrspace(3)* @shmem.35 to i32 addrspace(3)*), i64 %92, !dbg !1051
  store i32 %124, i32 addrspace(3)* %125, align 4, !dbg !1051, !tbaa !444
  %.not121 = icmp eq i64 %70, %38, !dbg !1057
  %126 = add i64 %70, %63, !dbg !1061
  %127 = add i64 %68, 1, !dbg !1061
  br i1 %.not121, label %L465.i.loopexit, label %L132.i, !dbg !1062

L465.i.loopexit:                                  ; preds = %L428.i
  br label %L465.i, !dbg !1063

L465.i:                                           ; preds = %L465.i.loopexit, %L102.i
  call void @llvm.nvvm.barrier0() #143, !dbg !1063
  br i1 %.not, label %L470.i.preheader, label %julia_kernel__4472_inner.exit, !dbg !1065

L470.i.preheader:                                 ; preds = %L465.i
  %128 = icmp sgt i64 %.fca.2.0.extract30, 0
  %129 = select i1 %128, i64 %.fca.2.0.extract30, i64 0
  %130 = shl i64 %129, 2
  br label %L470.i, !dbg !1066

L470.i:                                           ; preds = %L607.i, %L470.i.preheader
  %iv29 = phi i64 [ %iv.next30, %L607.i ], [ 0, %L470.i.preheader ]
  %iv.next30 = add nuw nsw i64 %iv29, 1, !dbg !1070
  %value_phi27.i.off = add nsw i64 %iv.next30, -1, !dbg !1070
  %131 = getelementptr inbounds i32, i32 addrspace(3)* bitcast ([2048 x i8] addrspace(3)* @shmem to i32 addrspace(3)*), i64 %value_phi27.i.off, !dbg !1074
  %132 = load i32, i32 addrspace(3)* %131, align 4, !dbg !1074, !tbaa !444
  %.not123 = icmp eq i32 %132, 0, !dbg !1080
  br i1 %.not123, label %julia_kernel__4472_inner.exit.loopexit, label %L504.i, !dbg !1083

L504.i:                                           ; preds = %L470.i
  %133 = getelementptr inbounds i32, i32 addrspace(3)* bitcast ([2048 x i8] addrspace(3)* @shmem.35 to i32 addrspace(3)*), i64 %value_phi27.i.off, !dbg !1084
  %134 = load i32, i32 addrspace(3)* %133, align 4, !dbg !1084, !tbaa !444
  %135 = icmp sgt i64 %.fca.2.0.extract, 0, !dbg !1092
  %136 = select i1 %135, i64 %.fca.2.0.extract, i64 0, !dbg !1092
  %137 = mul i64 %136, %value_phi27.i.off, !dbg !1103
  %138 = add i64 %137, 1, !dbg !1107
  %139 = icmp sgt i64 %.fca.3.extract, 0, !dbg !1108
  %140 = select i1 %139, i64 %.fca.3.extract, i64 0, !dbg !1108
  %141 = icmp ugt i64 %137, 9223372036854775806, !dbg !1118
  %142 = icmp sgt i64 %138, %140, !dbg !1118
  %143 = or i1 %141, %142, !dbg !1120
  br i1 %143, label %L539.i, label %L537.i, !dbg !1120

L537.i:                                           ; preds = %L504.i
  %144 = bitcast i8 addrspace(3)* %.fca.0.extract to float addrspace(3)*, !dbg !1121
  %145 = getelementptr inbounds float, float addrspace(3)* %144, i64 %137, !dbg !1121
  %146 = load float, float addrspace(3)* %145, align 4, !dbg !1121, !tbaa !444
  %147 = add nuw i64 %137, 2, !dbg !1107
  %148 = icmp slt i64 %147, 1, !dbg !1118
  %149 = icmp sgt i64 %147, %140, !dbg !1118
  %150 = or i1 %148, %149, !dbg !1120
  br i1 %150, label %L574.i, label %L572.i, !dbg !1120

L539.i:                                           ; preds = %L504.i
  %151 = call fastcc nonnull {} addrspace(10)* @julia__throw_boundserror_4548() #144, !dbg !1120
  unreachable, !dbg !1120

L572.i:                                           ; preds = %L537.i
  %152 = getelementptr inbounds float, float addrspace(3)* %144, i64 %138, !dbg !1121
  %153 = load float, float addrspace(3)* %152, align 4, !dbg !1121, !tbaa !444
  %154 = add nuw i64 %137, 3, !dbg !1107
  %155 = icmp slt i64 %154, 1, !dbg !1118
  %156 = icmp sgt i64 %154, %140, !dbg !1118
  %157 = or i1 %155, %156, !dbg !1120
  br i1 %157, label %L609.i, label %L607.i, !dbg !1120

L574.i:                                           ; preds = %L537.i
  %158 = call fastcc nonnull {} addrspace(10)* @julia__throw_boundserror_4548() #144, !dbg !1120
  unreachable, !dbg !1120

L607.i:                                           ; preds = %L572.i
  %159 = getelementptr inbounds float, float addrspace(3)* %144, i64 %147, !dbg !1121
  %160 = load float, float addrspace(3)* %159, align 4, !dbg !1121, !tbaa !444
  %161 = sext i32 %132 to i64, !dbg !1127
  %162 = add nsw i64 %161, -1, !dbg !1138
  %163 = mul i64 %130, %162, !dbg !1140
  %164 = getelementptr i8, i8 addrspace(1)* %.fca.0.extract26, i64 %163, !dbg !1144
  %165 = fneg float %146, !dbg !1145
  %166 = bitcast i8 addrspace(1)* %164 to float addrspace(1)*, !dbg !1147
  %167 = atomicrmw fadd float addrspace(1)* %166, float %165 acq_rel, align 4, !dbg !1147
  %168 = add i64 %163, 4, !dbg !1151
  %169 = getelementptr i8, i8 addrspace(1)* %.fca.0.extract26, i64 %168, !dbg !1158
  %170 = fneg float %153, !dbg !1159
  %171 = bitcast i8 addrspace(1)* %169 to float addrspace(1)*, !dbg !1161
  %172 = atomicrmw fadd float addrspace(1)* %171, float %170 acq_rel, align 4, !dbg !1161
  %173 = add i64 %163, 8, !dbg !1165
  %174 = getelementptr i8, i8 addrspace(1)* %.fca.0.extract26, i64 %173, !dbg !1172
  %175 = fneg float %160, !dbg !1173
  %176 = bitcast i8 addrspace(1)* %174 to float addrspace(1)*, !dbg !1175
  %177 = atomicrmw fadd float addrspace(1)* %176, float %175 acq_rel, align 4, !dbg !1175
  %178 = sext i32 %134 to i64, !dbg !1179
  %179 = add nsw i64 %178, -1, !dbg !1190
  %180 = mul i64 %130, %179, !dbg !1192
  %181 = getelementptr i8, i8 addrspace(1)* %.fca.0.extract26, i64 %180, !dbg !1196
  %182 = bitcast i8 addrspace(1)* %181 to float addrspace(1)*, !dbg !1197
  %183 = atomicrmw fadd float addrspace(1)* %182, float %146 acq_rel, align 4, !dbg !1197
  %184 = add i64 %180, 4, !dbg !1201
  %185 = getelementptr i8, i8 addrspace(1)* %.fca.0.extract26, i64 %184, !dbg !1208
  %186 = bitcast i8 addrspace(1)* %185 to float addrspace(1)*, !dbg !1209
  %187 = atomicrmw fadd float addrspace(1)* %186, float %153 acq_rel, align 4, !dbg !1209
  %188 = add i64 %180, 8, !dbg !1213
  %189 = getelementptr i8, i8 addrspace(1)* %.fca.0.extract26, i64 %188, !dbg !1220
  %190 = bitcast i8 addrspace(1)* %189 to float addrspace(1)*, !dbg !1221
  %191 = atomicrmw fadd float addrspace(1)* %190, float %160 acq_rel, align 4, !dbg !1221
  %.not125 = icmp eq i64 %iv.next30, 512, !dbg !1225
  %192 = add nuw nsw i64 %iv.next30, 1, !dbg !1228
  br i1 %.not125, label %julia_kernel__4472_inner.exit.loopexit, label %L470.i, !dbg !1229

L609.i:                                           ; preds = %L572.i
  %193 = call fastcc nonnull {} addrspace(10)* @julia__throw_boundserror_4548() #144, !dbg !1120
  unreachable, !dbg !1120

julia_kernel__4472_inner.exit.loopexit:           ; preds = %L470.i, %L607.i
  br label %julia_kernel__4472_inner.exit, !dbg !1230

[...]

invertjulia_kernel__4472_inner.exit.loopexit:     ; preds = %invertjulia_kernel__4472_inner.exit
  %379 = load i64, i64* %loopLimit_cache4, align 8, !invariant.group !1632
  %380 = load i64, i64* %loopLimit_cache4, align 8, !invariant.group !1632
  %381 = load i32, i32* %"!manual_lcssa_cache", align 4, !invariant.group !1631
  %.not123_unwrap = icmp eq i32 %381, 0
  br i1 %.not123_unwrap, label %mergeinvertL470.i_julia_kernel__4472_inner.exit.loopexit111, label %mergeinvertL470.i_julia_kernel__4472_inner.exit.loopexit

mergeinvertL470.i_julia_kernel__4472_inner.exit.loopexit111: ; preds = %invertjulia_kernel__4472_inner.exit.loopexit
  store i64 %380, i64* %"iv29'ac", align 8
  br label %invertL470.i

mergeinvertL470.i_julia_kernel__4472_inner.exit.loopexit: ; preds = %invertjulia_kernel__4472_inner.exit.loopexit
  store i64 %379, i64* %"iv29'ac", align 8
  br label %invertL607.i

invertjulia_kernel__4472_inner.exit:              ; preds = %julia_kernel__4472_inner.exit
  br i1 %.not, label %invertjulia_kernel__4472_inner.exit.loopexit, label %invertL465.i
}

ERROR: LoadError: LLVM error: function failed verification (4)
Stacktrace:
  [1] handle_error(reason::Cstring)
    @ LLVM ~/.julia/packages/LLVM/WjSQG/src/core/context.jl:105
  [2] EnzymeCreatePrimalAndGradient(logic::Enzyme.Logic, todiff::LLVM.Function, retType::Enzyme.API.CDIFFE_TYPE, constant_args::Vector{Enzyme.API.CDIFFE_TYPE}, TA::Enzyme.TypeAnalysis, returnValue::Bool, dretUsed::Bool, mode::Enzyme.API.CDerivativeMode, width::Int64, additionalArg::Ptr{Nothing}, typeInfo::Enzyme.FnTypeInfo, uncacheable_args::Vector{Bool}, augmented::Ptr{Nothing}, atomicAdd::Bool)
    @ Enzyme.API ~/.julia/dev/Enzyme/src/api.jl:118
  [3] enzyme!(job::GPUCompiler.CompilerJob{Enzyme.Compiler.EnzymeTarget, Enzyme.Compiler.EnzymeCompilerParams, GPUCompiler.FunctionSpec{typeof(kernel!), Tuple{CuDeviceMatrix{Float32, 1}, CuDeviceVector{SVector{3, Float32}, 1}, CuDeviceVector{Atom, 1}, CuDeviceVector{Tuple{Int64, Int64}, 1}, Val{512}, CuDeviceMatrix{Float32, 3}}}}, mod::LLVM.Module, primalf::LLVM.Function, adjoint::GPUCompiler.FunctionSpec{typeof(kernel!), Tuple{Duplicated{CuDeviceMatrix{Float32, 1}}, Duplicated{CuDeviceVector{SVector{3, Float32}, 1}}, Duplicated{CuDeviceVector{Atom, 1}}, Const{CuDeviceVector{Tuple{Int64, Int64}, 1}}, Const{Val{512}}, Duplicated{CuDeviceMatrix{Float32, 3}}}}, mode::Enzyme.API.CDerivativeMode, width::Int64, parallel::Bool, actualRetType::Type, dupClosure::Bool, wrap::Bool, modifiedBetween::Bool, returnPrimal::Bool, jlrules::Vector{String})
    @ Enzyme.Compiler ~/.julia/dev/Enzyme/src/compiler.jl:4617
  [4] codegen(output::Symbol, job::GPUCompiler.CompilerJob{Enzyme.Compiler.EnzymeTarget, Enzyme.Compiler.EnzymeCompilerParams, GPUCompiler.FunctionSpec{typeof(kernel!), Tuple{CuDeviceMatrix{Float32, 1}, CuDeviceVector{SVector{3, Float32}, 1}, CuDeviceVector{Atom, 1}, CuDeviceVector{Tuple{Int64, Int64}, 1}, Val{512}, CuDeviceMatrix{Float32, 3}}}}; libraries::Bool, deferred_codegen::Bool, optimize::Bool, ctx::LLVM.Context, strip::Bool, validate::Bool, only_entry::Bool, parent_job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams, GPUCompiler.FunctionSpec{typeof(grad_kernel!), Tuple{CuDeviceMatrix{Float32, 1}, CuDeviceMatrix{Float32, 1}, CuDeviceVector{SVector{3, Float32}, 1}, CuDeviceVector{SVector{3, Float32}, 1}, CuDeviceVector{Atom, 1}, CuDeviceVector{Atom, 1}, CuDeviceVector{Tuple{Int64, Int64}, 1}, Val{512}}}})
    @ Enzyme.Compiler ~/.julia/dev/Enzyme/src/compiler.jl:5709
  [5] (::GPUCompiler.var"#114#117"{LLVM.Context, GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams, GPUCompiler.FunctionSpec{typeof(grad_kernel!), Tuple{CuDeviceMatrix{Float32, 1}, CuDeviceMatrix{Float32, 1}, CuDeviceVector{SVector{3, Float32}, 1}, CuDeviceVector{SVector{3, Float32}, 1}, CuDeviceVector{Atom, 1}, CuDeviceVector{Atom, 1}, CuDeviceVector{Tuple{Int64, Int64}, 1}, Val{512}}}}, GPUCompiler.CompilerJob{Enzyme.Compiler.EnzymeTarget, Enzyme.Compiler.EnzymeCompilerParams, GPUCompiler.FunctionSpec{typeof(kernel!), Tuple{CuDeviceMatrix{Float32, 1}, CuDeviceVector{SVector{3, Float32}, 1}, CuDeviceVector{Atom, 1}, CuDeviceVector{Tuple{Int64, Int64}, 1}, Val{512}, CuDeviceMatrix{Float32, 3}}}}})()
    @ GPUCompiler ~/.julia/packages/GPUCompiler/07qaN/src/driver.jl:296
  [6] get!(default::GPUCompiler.var"#114#117"{LLVM.Context, GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams, GPUCompiler.FunctionSpec{typeof(grad_kernel!), Tuple{CuDeviceMatrix{Float32, 1}, CuDeviceMatrix{Float32, 1}, CuDeviceVector{SVector{3, Float32}, 1}, CuDeviceVector{SVector{3, Float32}, 1}, CuDeviceVector{Atom, 1}, CuDeviceVector{Atom, 1}, CuDeviceVector{Tuple{Int64, Int64}, 1}, Val{512}}}}, GPUCompiler.CompilerJob{Enzyme.Compiler.EnzymeTarget, Enzyme.Compiler.EnzymeCompilerParams, GPUCompiler.FunctionSpec{typeof(kernel!), Tuple{CuDeviceMatrix{Float32, 1}, CuDeviceVector{SVector{3, Float32}, 1}, CuDeviceVector{Atom, 1}, CuDeviceVector{Tuple{Int64, Int64}, 1}, Val{512}, CuDeviceMatrix{Float32, 3}}}}}, h::Dict{GPUCompiler.CompilerJob, String}, key::GPUCompiler.CompilerJob{Enzyme.Compiler.EnzymeTarget, Enzyme.Compiler.EnzymeCompilerParams, GPUCompiler.FunctionSpec{typeof(kernel!), Tuple{CuDeviceMatrix{Float32, 1}, CuDeviceVector{SVector{3, Float32}, 1}, CuDeviceVector{Atom, 1}, CuDeviceVector{Tuple{Int64, Int64}, 1}, Val{512}, CuDeviceMatrix{Float32, 3}}}})
    @ Base ./dict.jl:481
  [7] macro expansion
    @ ~/.julia/packages/GPUCompiler/07qaN/src/driver.jl:295 [inlined]
  [8] emit_llvm(job::GPUCompiler.CompilerJob, method_instance::Any; libraries::Bool, deferred_codegen::Bool, optimize::Bool, cleanup::Bool, only_entry::Bool, ctx::LLVM.Context)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/07qaN/src/utils.jl:68
  [9] cufunction_compile(job::GPUCompiler.CompilerJob, ctx::LLVM.Context)
    @ CUDA ~/.julia/packages/CUDA/DfvRa/src/compiler/execution.jl:353
 [10] #224
    @ ~/.julia/packages/CUDA/DfvRa/src/compiler/execution.jl:347 [inlined]
 [11] JuliaContext(f::CUDA.var"#224#225"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams, GPUCompiler.FunctionSpec{typeof(grad_kernel!), Tuple{CuDeviceMatrix{Float32, 1}, CuDeviceMatrix{Float32, 1}, CuDeviceVector{SVector{3, Float32}, 1}, CuDeviceVector{SVector{3, Float32}, 1}, CuDeviceVector{Atom, 1}, CuDeviceVector{Atom, 1}, CuDeviceVector{Tuple{Int64, Int64}, 1}, Val{512}}}}})
    @ GPUCompiler ~/.julia/packages/GPUCompiler/07qaN/src/driver.jl:76
 [12] cufunction_compile(job::GPUCompiler.CompilerJob)
    @ CUDA ~/.julia/packages/CUDA/DfvRa/src/compiler/execution.jl:346
 [13] cached_compilation(cache::Dict{UInt64, Any}, job::GPUCompiler.CompilerJob, compiler::typeof(CUDA.cufunction_compile), linker::typeof(CUDA.cufunction_link))
    @ GPUCompiler ~/.julia/packages/GPUCompiler/07qaN/src/cache.jl:90
 [14] cufunction(f::typeof(grad_kernel!), tt::Type{Tuple{CuDeviceMatrix{Float32, 1}, CuDeviceMatrix{Float32, 1}, CuDeviceVector{SVector{3, Float32}, 1}, CuDeviceVector{SVector{3, Float32}, 1}, CuDeviceVector{Atom, 1}, CuDeviceVector{Atom, 1}, CuDeviceVector{Tuple{Int64, Int64}, 1}, Val{512}}}; name::Nothing, kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
    @ CUDA ~/.julia/packages/CUDA/DfvRa/src/compiler/execution.jl:299
 [15] cufunction(f::typeof(grad_kernel!), tt::Type{Tuple{CuDeviceMatrix{Float32, 1}, CuDeviceMatrix{Float32, 1}, CuDeviceVector{SVector{3, Float32}, 1}, CuDeviceVector{SVector{3, Float32}, 1}, CuDeviceVector{Atom, 1}, CuDeviceVector{Atom, 1}, CuDeviceVector{Tuple{Int64, Int64}, 1}, Val{512}}})
    @ CUDA ~/.julia/packages/CUDA/DfvRa/src/compiler/execution.jl:292
 [16] macro expansion
    @ ~/.julia/packages/CUDA/DfvRa/src/compiler/execution.jl:102 [inlined]
 [17] top-level scope
    @ ~/.julia/packages/CUDA/DfvRa/src/utilities.jl:25
in expression starting at /home/jgreener/dms/molly_dev/enzyme_err2d.jl:125
wsmoses commented 1 year ago

@jgreener64 can you post the whole log?

wsmoses commented 1 year ago

I also cannot reproduce this on my system (1.8.1, NVIDIA 3090), latest Enzyme.jl and Enzyme proper.

jgreener64 commented 1 year ago

I have attached the error.txt and error with Enzyme.API.printall!(true) since they are over the text box size limit.

My setup (Julia updated since the top post) is Enzyme e452f8932fc602989df23d96e5039a3268e5e965, Enzyme_jll 0.0.42, an NVIDIA RTX A6000 GPU and

Julia Version 1.8.2
Commit 36034abf260 (2022-09-29 15:21 UTC)
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  CPU: 36 × Intel(R) Core(TM) i9-10980XE CPU @ 3.00GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-13.0.1 (ORCJIT, cascadelake)
  Threads: 16 on 36 virtual cores
Environment:
  LD_LIBRARY_PATH = /usr/local/gromacs/lib
CUDA toolkit 11.7, artifact installation
NVIDIA driver 470.141.3, for CUDA 11.4
CUDA driver 11.7

Libraries: 
- CUBLAS: 11.10.1
- CURAND: 10.2.10
- CUFFT: 10.7.2
- CUSOLVER: 11.3.5
- CUSPARSE: 11.7.3
- CUPTI: 17.0.0
- NVML: 11.0.0+470.141.3
  Downloaded artifact: CUDNN
- CUDNN: 8.30.2 (for CUDA 11.5.0)
  Downloaded artifact: CUTENSOR
- CUTENSOR: 1.4.0 (for CUDA 11.5.0)

Toolchain:
- Julia: 1.8.2
- LLVM: 13.0.1
- PTX ISA support: 3.2, 4.0, 4.1, 4.2, 4.3, 5.0, 6.0, 6.1, 6.3, 6.4, 6.5, 7.0, 7.1, 7.2
- Device capability support: sm_35, sm_37, sm_50, sm_52, sm_53, sm_60, sm_61, sm_62, sm_70, sm_72, sm_75, sm_80, sm_86

2 devices:
  0: NVIDIA RTX A6000 (sm_86, 47.531 GiB / 47.544 GiB available)
  1: NVIDIA RTX A6000 (sm_86, 19.788 GiB / 47.541 GiB available)
^ [21141c5a] AMDGPU v0.4.2
  [4c88cf16] Aqua v0.5.5
  [a963bdd2] AtomsBase v0.2.2 `~/.julia/dev/AtomsBase`
  [6e4b80f9] BenchmarkTools v1.3.1
  [99c8bb3a] Bio3DView v0.1.4 `~/.julia/dev/Bio3DView`
  [de9282ab] BioStructures v1.2.1 `~/.julia/dev/BioStructures`
  [052768ef] CUDA v3.12.0
  [69e1c6dd] CellListMap v0.8.4
⌃ [082447d4] ChainRules v1.42.0
  [d360d2e6] ChainRulesCore v1.15.6
  [46823bd8] Chemfiles v0.10.3
⌃ [31c24e10] Distributions v0.25.75
  [7da242da] Enzyme v0.10.11 `~/.julia/dev/Enzyme`
  [8f5d6c58] EzXML v1.1.0
⌃ [26cc04aa] FiniteDifferences v0.12.24
  [1fa38f19] Format v1.3.2
  [f6369f11] ForwardDiff v0.10.32
  [e9467ef8] GLMakie v0.6.13
  [7073ff75] IJulia v1.23.3
  [63c18a36] KernelAbstractions v0.8.4
  [259c3a9c] MMTF v1.0.0 `~/.julia/dev/MMTF`
⌅ [ee78f7c6] Makie v0.17.13
  [aa0f7f06] Molly v0.13.0 `~/.julia/dev/Molly`
  [5fb14364] OhMyREPL v0.5.12
  [32113eaa] PkgBenchmark v0.2.12
⌃ [91a5bcdd] Plots v1.34.3
  [c46f51b8] ProfileView v1.5.2
  [186d2b2d] ProteinEnsembles v0.3.1 `~/.julia/dev/ProteinEnsembles`
  [295af30f] Revise v3.4.0
  [90137ffa] StaticArrays v1.5.9
⌃ [f3b207a7] StatsPlots v0.15.3
  [1986cc42] Unitful v1.12.0
  [f31437dd] UnitfulChainRules v0.1.2
  [e88e6eb3] Zygote v0.6.44 `~/.julia/dev/Zygote`
  [7cc45869] Enzyme_jll v0.0.42+0
wsmoses commented 1 year ago

Can you retry latest main

jgreener64 commented 1 year ago

On d37ce7247b9cabd910c5aa73ed8bd6f5d73bb7d2 with Enzyme_jll 0.0.43 it still errors, but the error changes:

ERROR: LoadError: LLVM error: Cannot select: 0x9f6b278: f32,ch = AtomicLoad<(load acquire (s32) from %ir."'ipc123_unwrap.i.i", addrspace 1)> 0x76f0260:1, 0x97e5870, /home/jgreener/.julia/packages/LLVM/WjSQG/src/interop/base.jl:40 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:28 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:28 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:99 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:106 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:456 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:444 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:439 @[ /home/jgreener/dms/molly_dev/enzyme_err2d.jl:90 @[ /home/jgreener/dms/molly_dev/enzyme_err2d.jl:0 @[ /home/jgreener/.julia/dev/Enzyme/src/compiler.jl:6270 @[ /home/jgreener/.julia/dev/Enzyme/src/compiler.jl:6001 @[ /home/jgreener/.julia/dev/Enzyme/src/compiler.jl:5978 @[ /home/jgreener/.julia/dev/Enzyme/src/Enzyme.jl:384 @[ /home/jgreener/.julia/dev/Enzyme/src/Enzyme.jl:398 @[ /home/jgreener/dms/molly_dev/enzyme_err2d.jl:107 ] ] ] ] ] ] ] ] ] ] ] ] ] ] ]
  0x97e5870: i64 = add 0x9be74b0, 0x9f6a8b8, /home/jgreener/.julia/packages/LLVM/WjSQG/src/interop/base.jl:40 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:28 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:28 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:99 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:106 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:456 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:444 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:439 @[ /home/jgreener/dms/molly_dev/enzyme_err2d.jl:92 @[ /home/jgreener/dms/molly_dev/enzyme_err2d.jl:0 @[ /home/jgreener/.julia/dev/Enzyme/src/compiler.jl:6270 @[ /home/jgreener/.julia/dev/Enzyme/src/compiler.jl:6001 @[ /home/jgreener/.julia/dev/Enzyme/src/compiler.jl:5978 @[ /home/jgreener/.julia/dev/Enzyme/src/Enzyme.jl:384 @[ /home/jgreener/.julia/dev/Enzyme/src/Enzyme.jl:398 @[ /home/jgreener/dms/molly_dev/enzyme_err2d.jl:107 ] ] ] ] ] ] ] ] ] ] ] ] ] ] ]
    0x9be74b0: i64,ch = CopyFromReg 0xa0e1498, Register:i64 %12, /home/jgreener/.julia/packages/LLVM/WjSQG/src/interop/base.jl:40 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:28 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:28 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:99 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:456 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:444 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:439 @[ /home/jgreener/dms/molly_dev/enzyme_err2d.jl:95 @[ /home/jgreener/dms/molly_dev/enzyme_err2d.jl:0 @[ /home/jgreener/.julia/dev/Enzyme/src/compiler.jl:6270 @[ /home/jgreener/.julia/dev/Enzyme/src/compiler.jl:6001 @[ /home/jgreener/.julia/dev/Enzyme/src/compiler.jl:5978 @[ /home/jgreener/.julia/dev/Enzyme/src/Enzyme.jl:384 @[ /home/jgreener/.julia/dev/Enzyme/src/Enzyme.jl:398 @[ /home/jgreener/dms/molly_dev/enzyme_err2d.jl:107 ] ] ] ] ] ] ] ] ] ] ] ] ] ]
      0x974c3b0: i64 = Register %12
    0x9f6a8b8: i64 = mul 0x72d09c8, 0x9f6a648, /home/jgreener/.julia/packages/LLVM/WjSQG/src/interop/base.jl:40 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:28 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:28 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:99 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:106 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:456 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:444 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:439 @[ /home/jgreener/dms/molly_dev/enzyme_err2d.jl:92 @[ /home/jgreener/dms/molly_dev/enzyme_err2d.jl:0 @[ /home/jgreener/.julia/dev/Enzyme/src/compiler.jl:6270 @[ /home/jgreener/.julia/dev/Enzyme/src/compiler.jl:6001 @[ /home/jgreener/.julia/dev/Enzyme/src/compiler.jl:5978 @[ /home/jgreener/.julia/dev/Enzyme/src/Enzyme.jl:384 @[ /home/jgreener/.julia/dev/Enzyme/src/Enzyme.jl:398 @[ /home/jgreener/dms/molly_dev/enzyme_err2d.jl:107 ] ] ] ] ] ] ] ] ] ] ] ] ] ] ]
      0x72d09c8: i64 = add nsw 0x72d1118, Constant:i64<-1>, /home/jgreener/.julia/packages/LLVM/WjSQG/src/interop/base.jl:40 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:28 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:28 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:99 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:106 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:456 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:444 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:439 @[ /home/jgreener/dms/molly_dev/enzyme_err2d.jl:92 @[ /home/jgreener/dms/molly_dev/enzyme_err2d.jl:0 @[ /home/jgreener/.julia/dev/Enzyme/src/compiler.jl:6270 @[ /home/jgreener/.julia/dev/Enzyme/src/compiler.jl:6001 @[ /home/jgreener/.julia/dev/Enzyme/src/compiler.jl:5978 @[ /home/jgreener/.julia/dev/Enzyme/src/Enzyme.jl:384 @[ /home/jgreener/.julia/dev/Enzyme/src/Enzyme.jl:398 @[ /home/jgreener/dms/molly_dev/enzyme_err2d.jl:107 ] ] ] ] ] ] ] ] ] ] ] ] ] ] ]
        0x72d1118: i64,ch = load<(load (s32) from %ir.375, !tbaa !305), sext from i32> 0x9f6b140:1, 0x9f6af38, undef:i64, /home/jgreener/.julia/packages/LLVM/WjSQG/src/interop/base.jl:40 @[ /home/jgreener/.julia/packages/LLVM/WjSQG/src/interop/pointer.jl:9 @[ /home/jgreener/.julia/packages/LLVM/WjSQG/src/interop/pointer.jl:9 @[ /home/jgreener/.julia/packages/LLVM/WjSQG/src/interop/pointer.jl:81 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/array.jl:119 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/array.jl:111 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/array.jl:192 @[ /home/jgreener/dms/molly_dev/enzyme_err2d.jl:78 @[ /home/jgreener/dms/molly_dev/enzyme_err2d.jl:0 @[ /home/jgreener/.julia/dev/Enzyme/src/compiler.jl:6270 @[ /home/jgreener/.julia/dev/Enzyme/src/compiler.jl:6001 @[ /home/jgreener/.julia/dev/Enzyme/src/compiler.jl:5978 @[ /home/jgreener/.julia/dev/Enzyme/src/Enzyme.jl:384 @[ /home/jgreener/.julia/dev/Enzyme/src/Enzyme.jl:398 @[ /home/jgreener/dms/molly_dev/enzyme_err2d.jl:107 ] ] ] ] ] ] ] ] ] ] ] ] ] ]
          0x9f6af38: i64 = add 0x9be7928, 0x97e5530, /home/jgreener/.julia/packages/LLVM/WjSQG/src/interop/base.jl:40 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:28 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:28 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:99 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:106 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:456 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:444 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:439 @[ /home/jgreener/dms/molly_dev/enzyme_err2d.jl:92 @[ /home/jgreener/dms/molly_dev/enzyme_err2d.jl:0 @[ /home/jgreener/.julia/dev/Enzyme/src/compiler.jl:6270 @[ /home/jgreener/.julia/dev/Enzyme/src/compiler.jl:6001 @[ /home/jgreener/.julia/dev/Enzyme/src/compiler.jl:5978 @[ /home/jgreener/.julia/dev/Enzyme/src/Enzyme.jl:384 @[ /home/jgreener/.julia/dev/Enzyme/src/Enzyme.jl:398 @[ /home/jgreener/dms/molly_dev/enzyme_err2d.jl:107 ] ] ] ] ] ] ] ] ] ] ] ] ] ] ]
            0x9be7928: i64,ch = CopyFromReg 0xa0e1498, Register:i64 %156, /home/jgreener/.julia/packages/LLVM/WjSQG/src/interop/base.jl:40 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:28 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:28 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:99 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:106 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:456 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:444 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:439 @[ /home/jgreener/dms/molly_dev/enzyme_err2d.jl:92 @[ /home/jgreener/dms/molly_dev/enzyme_err2d.jl:0 @[ /home/jgreener/.julia/dev/Enzyme/src/compiler.jl:6270 @[ /home/jgreener/.julia/dev/Enzyme/src/compiler.jl:6001 @[ /home/jgreener/.julia/dev/Enzyme/src/compiler.jl:5978 @[ /home/jgreener/.julia/dev/Enzyme/src/Enzyme.jl:384 @[ /home/jgreener/.julia/dev/Enzyme/src/Enzyme.jl:398 @[ /home/jgreener/dms/molly_dev/enzyme_err2d.jl:107 ] ] ] ] ] ] ] ] ] ] ] ] ] ] ]
              0x974bf38: i64 = Register %156
            0x97e5530: i64 = shl 0x974c1a8, Constant:i32<2>, /home/jgreener/.julia/packages/LLVM/WjSQG/src/interop/base.jl:40 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:28 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:28 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:99 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:456 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:444 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:439 @[ /home/jgreener/dms/molly_dev/enzyme_err2d.jl:95 @[ /home/jgreener/dms/molly_dev/enzyme_err2d.jl:0 @[ /home/jgreener/.julia/dev/Enzyme/src/compiler.jl:6270 @[ /home/jgreener/.julia/dev/Enzyme/src/compiler.jl:6001 @[ /home/jgreener/.julia/dev/Enzyme/src/compiler.jl:5978 @[ /home/jgreener/.julia/dev/Enzyme/src/Enzyme.jl:384 @[ /home/jgreener/.julia/dev/Enzyme/src/Enzyme.jl:398 @[ /home/jgreener/dms/molly_dev/enzyme_err2d.jl:107 ] ] ] ] ] ] ] ] ] ] ] ] ] ]
              0x974c1a8: i64,ch = CopyFromReg 0xa0e1498, Register:i64 %171, /home/jgreener/.julia/packages/LLVM/WjSQG/src/interop/base.jl:40 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:28 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:28 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:99 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:456 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:444 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:439 @[ /home/jgreener/dms/molly_dev/enzyme_err2d.jl:95 @[ /home/jgreener/dms/molly_dev/enzyme_err2d.jl:0 @[ /home/jgreener/.julia/dev/Enzyme/src/compiler.jl:6270 @[ /home/jgreener/.julia/dev/Enzyme/src/compiler.jl:6001 @[ /home/jgreener/.julia/dev/Enzyme/src/compiler.jl:5978 @[ /home/jgreener/.julia/dev/Enzyme/src/Enzyme.jl:384 @[ /home/jgreener/.julia/dev/Enzyme/src/Enzyme.jl:398 @[ /home/jgreener/dms/molly_dev/enzyme_err2d.jl:107 ] ] ] ] ] ] ] ] ] ] ] ] ] ]
                0x93fab10: i64 = Register %171
              0x9be7ed8: i32 = Constant<2>
          0x9be77f0: i64 = undef
        0x76ef8a0: i64 = Constant<-1>
      0x9f6a648: i64 = shl 0x72d11e8, Constant:i32<2>, /home/jgreener/.julia/packages/LLVM/WjSQG/src/interop/base.jl:40 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:28 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:28 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:99 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:456 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:444 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:439 @[ /home/jgreener/dms/molly_dev/enzyme_err2d.jl:95 @[ /home/jgreener/dms/molly_dev/enzyme_err2d.jl:0 @[ /home/jgreener/.julia/dev/Enzyme/src/compiler.jl:6270 @[ /home/jgreener/.julia/dev/Enzyme/src/compiler.jl:6001 @[ /home/jgreener/.julia/dev/Enzyme/src/compiler.jl:5978 @[ /home/jgreener/.julia/dev/Enzyme/src/Enzyme.jl:384 @[ /home/jgreener/.julia/dev/Enzyme/src/Enzyme.jl:398 @[ /home/jgreener/dms/molly_dev/enzyme_err2d.jl:107 ] ] ] ] ] ] ] ] ] ] ] ] ] ]
        0x72d11e8: i64 = smax 0x9be7990, Constant:i64<0>, /home/jgreener/.julia/packages/LLVM/WjSQG/src/interop/base.jl:40 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:28 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:28 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:99 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:456 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:444 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:439 @[ /home/jgreener/dms/molly_dev/enzyme_err2d.jl:95 @[ /home/jgreener/dms/molly_dev/enzyme_err2d.jl:0 @[ /home/jgreener/.julia/dev/Enzyme/src/compiler.jl:6270 @[ /home/jgreener/.julia/dev/Enzyme/src/compiler.jl:6001 @[ /home/jgreener/.julia/dev/Enzyme/src/compiler.jl:5978 @[ /home/jgreener/.julia/dev/Enzyme/src/Enzyme.jl:384 @[ /home/jgreener/.julia/dev/Enzyme/src/Enzyme.jl:398 @[ /home/jgreener/dms/molly_dev/enzyme_err2d.jl:107 ] ] ] ] ] ] ] ] ] ] ] ] ] ]
          0x9be7990: i64,ch = CopyFromReg 0xa0e1498, Register:i64 %11, /home/jgreener/.julia/packages/LLVM/WjSQG/src/interop/base.jl:40 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:28 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:28 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:99 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:456 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:444 @[ /home/jgreener/.julia/packages/CUDA/DfvRa/src/device/intrinsics/atomics.jl:439 @[ /home/jgreener/dms/molly_dev/enzyme_err2d.jl:95 @[ /home/jgreener/dms/molly_dev/enzyme_err2d.jl:0 @[ /home/jgreener/.julia/dev/Enzyme/src/compiler.jl:6270 @[ /home/jgreener/.julia/dev/Enzyme/src/compiler.jl:6001 @[ /home/jgreener/.julia/dev/Enzyme/src/compiler.jl:5978 @[ /home/jgreener/.julia/dev/Enzyme/src/Enzyme.jl:384 @[ /home/jgreener/.julia/dev/Enzyme/src/Enzyme.jl:398 @[ /home/jgreener/dms/molly_dev/enzyme_err2d.jl:107 ] ] ] ] ] ] ] ] ] ] ] ] ] ]
            0x97e53f8: i64 = Register %11
          0x97e51f0: i64 = Constant<0>
        0x9be7ed8: i32 = Constant<2>
In function: _Z23julia_grad_kernel__418213CuDeviceArrayI7Float32Li2ELi1EES_IS0_Li2ELi1EES_I6SArrayI5TupleILi3EES0_Li1ELi3EELi1ELi1EES_IS1_IS2_ILi3EES0_Li1ELi3EELi1ELi1EES_I4AtomLi1ELi1EES_IS3_Li1ELi1EES_IS2_I5Int64S4_ELi1ELi1EE3ValILi512EE
Stacktrace:
  [1] handle_error(reason::Cstring)
    @ LLVM ~/.julia/packages/LLVM/WjSQG/src/core/context.jl:105
  [2] LLVMTargetMachineEmitToMemoryBuffer
    @ ~/.julia/packages/LLVM/WjSQG/lib/13/libLLVM_h.jl:947 [inlined]
  [3] emit(tm::LLVM.TargetMachine, mod::LLVM.Module, filetype::LLVM.API.LLVMCodeGenFileType)
    @ LLVM ~/.julia/packages/LLVM/WjSQG/src/targetmachine.jl:45
  [4] mcgen(job::GPUCompiler.CompilerJob, mod::LLVM.Module, format::LLVM.API.LLVMCodeGenFileType)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/07qaN/src/mcgen.jl:73
  [5] macro expansion
    @ ~/.julia/packages/TimerOutputs/4yHI4/src/TimerOutput.jl:253 [inlined]
  [6] macro expansion
    @ ~/.julia/packages/GPUCompiler/07qaN/src/driver.jl:430 [inlined]
  [7] macro expansion
    @ ~/.julia/packages/TimerOutputs/4yHI4/src/TimerOutput.jl:253 [inlined]
  [8] macro expansion
    @ ~/.julia/packages/GPUCompiler/07qaN/src/driver.jl:427 [inlined]
  [9] emit_asm(job::GPUCompiler.CompilerJob, ir::LLVM.Module; strip::Bool, validate::Bool, format::LLVM.API.LLVMCodeGenFileType)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/07qaN/src/utils.jl:68
 [10] cufunction_compile(job::GPUCompiler.CompilerJob, ctx::LLVM.Context)
    @ CUDA ~/.julia/packages/CUDA/DfvRa/src/compiler/execution.jl:354
 [11] #224
    @ ~/.julia/packages/CUDA/DfvRa/src/compiler/execution.jl:347 [inlined]
 [12] JuliaContext(f::CUDA.var"#224#225"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams, GPUCompiler.FunctionSpec{typeof(grad_kernel!), Tuple{CuDeviceMatrix{Float32, 1}, CuDeviceMatrix{Float32, 1}, CuDeviceVector{SVector{3, Float32}, 1}, CuDeviceVector{SVector{3, Float32}, 1}, CuDeviceVector{Atom, 1}, CuDeviceVector{Atom, 1}, CuDeviceVector{Tuple{Int64, Int64}, 1}, Val{512}}}}})
    @ GPUCompiler ~/.julia/packages/GPUCompiler/07qaN/src/driver.jl:76
 [13] cufunction_compile(job::GPUCompiler.CompilerJob)
    @ CUDA ~/.julia/packages/CUDA/DfvRa/src/compiler/execution.jl:346
 [14] cached_compilation(cache::Dict{UInt64, Any}, job::GPUCompiler.CompilerJob, compiler::typeof(CUDA.cufunction_compile), linker::typeof(CUDA.cufunction_link))
    @ GPUCompiler ~/.julia/packages/GPUCompiler/07qaN/src/cache.jl:90
 [15] cufunction(f::typeof(grad_kernel!), tt::Type{Tuple{CuDeviceMatrix{Float32, 1}, CuDeviceMatrix{Float32, 1}, CuDeviceVector{SVector{3, Float32}, 1}, CuDeviceVector{SVector{3, Float32}, 1}, CuDeviceVector{Atom, 1}, CuDeviceVector{Atom, 1}, CuDeviceVector{Tuple{Int64, Int64}, 1}, Val{512}}}; name::Nothing, kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
    @ CUDA ~/.julia/packages/CUDA/DfvRa/src/compiler/execution.jl:299
 [16] cufunction(f::typeof(grad_kernel!), tt::Type{Tuple{CuDeviceMatrix{Float32, 1}, CuDeviceMatrix{Float32, 1}, CuDeviceVector{SVector{3, Float32}, 1}, CuDeviceVector{SVector{3, Float32}, 1}, CuDeviceVector{Atom, 1}, CuDeviceVector{Atom, 1}, CuDeviceVector{Tuple{Int64, Int64}, 1}, Val{512}}})
    @ CUDA ~/.julia/packages/CUDA/DfvRa/src/compiler/execution.jl:292
 [17] macro expansion
    @ ~/.julia/packages/CUDA/DfvRa/src/compiler/execution.jl:102 [inlined]
 [18] top-level scope
    @ ~/.julia/packages/CUDA/DfvRa/src/utilities.jl:25
in expression starting at /home/jgreener/dms/molly_dev/enzyme_err2d.jl:127

The printall error is attached.

vchuravy commented 1 year ago

Oh that's much more exciting!

vchuravy commented 1 year ago

I wonder if the comment in https://reviews.llvm.org/D50391 is still true.

Higher levels of atomicity (like acquire and release) need additional synchronization properties which were added with PTX ISA 6.0 / sm_70. So using these instructions still results in an error.

We are trying to emit an atomic load acquire.

tkf commented 1 year ago

Yeah, looks similar to what I had in https://github.com/JuliaConcurrent/Atomix.jl/issues/33

It's weird that LLVM is trying to select AtomicLoad though. I only see RMW in the Julia code. Maybe Enzyme inserts some loads given some RMW in the user code? If so, I wonder if you can use Atomix.@atomic :monotonic forces[1, i] -= dx etc. to avoid it (provided that Enzyme copies the ordering).

(Note: you'd need Atomix for now since CUDA.jl uses acq_rel https://github.com/JuliaGPU/CUDA.jl/blob/0cd30cbed3d084cede39db1a9959630ddae904a1/src/device/intrinsics/atomics.jl#L43-L46)

Somewhat relevant https://github.com/JuliaGPU/CUDA.jl/pull/1393

wsmoses commented 1 year ago

Yeah the derivative of an atomicadd can create an atomic load. Presently we preserve the same ordering -- hence the above

tkf commented 1 year ago

Not sure how many workarounds you'd want to add in Enzyme, but maybe you can use fetch-and-add with 0 for load (and swap for store) when the ordering is stronger than monotonic?

> I wonder if the comment in https://reviews.llvm.org/D50391 is still true.

Yeah, I still see the comment in the main branch https://github.com/llvm/llvm-project/blob/de6dfbbb300e552efa1cd86a023063a39d408b06/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp#L854-L859

I guess LLVM needs to do whatever NVCC does with libcu++ https://godbolt.org/z/aoM6477T4

vchuravy commented 1 year ago

Interesting to see the differences to sm_60 https://godbolt.org/z/Y7Pj5G7sK

jgreener64 commented 1 year ago

If this is the issue, is there a way to update other software to get around it? My device is sm_86 and I am on CUDA 11.7.

vchuravy commented 1 year ago

Could you try Takafumi's suggestion in https://github.com/EnzymeAD/Enzyme.jl/issues/511#issuecomment-1279187387?

jgreener64 commented 1 year ago

Replacing the forces[1, i] -= dx lines with Atomix.@atomic :monotonic forces[1, i] -= dx means it runs without throwing an error.

However d_cu_coords and d_cu_atoms remain zero, i.e. it doesn't seem like the gradients are recorded.

vchuravy commented 1 year ago

> However d_cu_coords and d_cu_atoms remain zero, i.e. it doesn't seem like the gradients are recorded.

Could you open a new issue with that and a complete reproducer, as minimal as you can get it :)

jgreener64 commented 1 year ago

Looking into it but running into some segfaults that have appeared with recent commits: https://github.com/EnzymeAD/Enzyme.jl/issues/533.

jgreener64 commented 1 year ago

I am looking into a minimal example with Atomix but running into some non-Enzyme issues on the GPU so reported them at https://github.com/JuliaConcurrent/Atomix.jl/issues/33.

leios commented 1 year ago

@jgreener64 Is there some equivalent C / CUDA code (in GROMACS, for example) we could look at to see if we can reproduce this issue there? We are trying to see if this is a Julia issue or an Enzyme issue.

jgreener64 commented 1 year ago

The kernels in the fastest software are more complicated, using warp reductions and clever ordering of pairs to get high speed. See for example the CUDA kernel in OpenMM, which uses some atomics: https://github.com/openmm/openmm/blob/master/platforms/cuda/src/kernels/nonbonded.cu. There are likely some simpler implementations around but I don't know of any off the top of my head.

I think this issue may be solved though based on @vchuravy's comment in https://github.com/EnzymeAD/Enzyme.jl/issues/576? In particular when I run that code (which differs from the top code here by using UnsafeAtomicsLLVM and += for all forces) on Enzyme 0.10.15 with https://github.com/JuliaGPU/CUDA.jl/pull/1644 and the -g0 Julia flag, it seems to work. By work I mean that d_cu_coords is not zero like it was before, I can test for correctness later.

leios commented 1 year ago

Great! Thanks for the reference, and yes. @vchuravy and I were in a meeting discussing this and we think things "work" (tm) now, but also did not check for correctness.

jgreener64 commented 1 year ago

Brilliant, thanks for all the help on this. If it's helpful I can make a PR adding this as a regression test to Enzyme once https://github.com/JuliaGPU/CUDA.jl/pull/1644 is in a release and https://github.com/EnzymeAD/Enzyme.jl/issues/576 is fixed.