EnzymeAD / Enzyme.jl

Julia bindings for the Enzyme automatic differentiator
https://enzyme.mit.edu
MIT License
459 stars 66 forks source link

Better error for GPU codes #1102

Closed jgreener64 closed 6 months ago

jgreener64 commented 1 year ago

I am on Julia 1.9.2, Enzyme main (bf3415d5b032eacd4a924ba09ec725798129d007), CUDA 4.4.1, Atomix 0.1.0, StaticArrays 1.6.5. The following used to work but now errors:

using Enzyme, CUDA, Atomix, StaticArrays, LinearAlgebra

struct HarmonicBond
    k::Float64
    r0::Float64
end

Base.zero(::HarmonicBond) = HarmonicBond(0.0, 0.0)

Base.:+(b1::HarmonicBond, b2::HarmonicBond) = HarmonicBond(b1.k + b2.k, b1.r0 + b2.r0)

@inline @inbounds function f(b::HarmonicBond, coord_i, coord_j)
    dr = coord_i - coord_j
    r = norm(dr)
    return (b.k / 2) * (r - b.r0) ^ 2
end

function kernel!(energy, coords_var, is_var, js_var, inters_var)
    coords = CUDA.Const(coords_var)
    is = CUDA.Const(is_var)
    js = CUDA.Const(js_var)
    inters = CUDA.Const(inters_var)

    inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x

    @inbounds if inter_i <= length(is)
        i, j = is[inter_i], js[inter_i]
        pe = f(inters[inter_i], coords[i], coords[j])
        Atomix.@atomic :monotonic energy[1] += pe
    end
    return nothing
end

function grad_kernel!(energy, d_energy, coords, d_coords, is, js, inters, d_inters)
    Enzyme.autodiff_deferred(
        Enzyme.Reverse,
        kernel!,
        Const,
        Duplicated(energy, d_energy),
        Duplicated(coords, d_coords),
        Const(is),
        Const(js),
        Duplicated(inters, d_inters),
    )
    return nothing
end

pe_vec = CuArray([0.0])
d_pe_vec = CuArray([1.0])
coords = CuArray([
    SVector(1.0, 1.0, 1.0),
    SVector(2.0, 2.0, 2.0),
    SVector(3.0, 3.0, 3.0),
])
d_coords = zero(coords)
is = CuArray([1, 2])
js = CuArray([2, 3])
inters = CuArray([
    HarmonicBond(100.0, 1.4),
    HarmonicBond(100.0, 1.4),
])
d_inters = CuArray([
    HarmonicBond(0.0, 0.0),
    HarmonicBond(0.0, 0.0),
])

CUDA.@sync @cuda threads=128 kernel!(pe_vec, coords, is, js, inters) # Works

CUDA.@sync @cuda threads=128 grad_kernel!(pe_vec, d_pe_vec, coords, d_coords, is, js, inters, d_inters)
ERROR: LoadError: InvalidIRError: compiling MethodInstance for grad_kernel!(::CuDeviceVector{Float64, 1}, ::CuDeviceVector{Float64, 1}, ::CuDeviceVector{SVector{3, Float64}, 1}, ::CuDeviceVector{SVector{3, Float64}, 1}, ::CuDeviceVector{Int64, 1}, ::CuDeviceVector{Int64, 1}, ::CuDeviceVector{HarmonicBond, 1}, ::CuDeviceVector{HarmonicBond, 1}) resulted in invalid LLVM IR
Reason: unsupported call through a literal pointer (call to )
Stacktrace:
 [1] #isfinite
   @ ~/.julia/dev/CUDA/src/device/intrinsics/math.jl:190
 [2] macro expansion
   @ ~/.julia/packages/StaticArrays/cZ1ET/src/linalg.jl:278
 [3] _norm
   @ ~/.julia/packages/StaticArrays/cZ1ET/src/linalg.jl:266
 [4] norm
   @ ~/.julia/packages/StaticArrays/cZ1ET/src/linalg.jl:265
 [5] f
   @ ~/dms/molly_dev/enzyme_err28.jl:16
 [6] kernel!
   @ ~/dms/molly_dev/enzyme_err28.jl:30
 [7] kernel!
   @ ~/dms/molly_dev/enzyme_err28.jl:0
 [8] diffejulia_kernel__3358_inner5wrap
   @ ~/dms/molly_dev/enzyme_err28.jl:0
Reason: unsupported call through a literal pointer (call to )
Stacktrace:
 [1] multiple call sites
   @ unknown:0
Reason: unsupported call through a literal pointer (call to )
Stacktrace:
  [1] #isfinite
    @ ~/.julia/dev/CUDA/src/device/intrinsics/math.jl:190
  [2] macro expansion
    @ ~/.julia/packages/StaticArrays/cZ1ET/src/linalg.jl:278
  [3] _norm
    @ ~/.julia/packages/StaticArrays/cZ1ET/src/linalg.jl:266
  [4] norm
    @ ~/.julia/packages/StaticArrays/cZ1ET/src/linalg.jl:265
  [5] f
    @ ~/dms/molly_dev/enzyme_err28.jl:16
  [6] kernel!
    @ ~/dms/molly_dev/enzyme_err28.jl:30
  [7] kernel!
    @ ~/dms/molly_dev/enzyme_err28.jl:0
  [8] diffejulia_kernel__3358_inner5wrap
    @ ~/dms/molly_dev/enzyme_err28.jl:0
  [9] macro expansion
    @ ~/.julia/dev/Enzyme/src/compiler.jl:9872
 [10] enzyme_call
    @ ~/.julia/dev/Enzyme/src/compiler.jl:9550
 [11] CombinedAdjointThunk
    @ ~/.julia/dev/Enzyme/src/compiler.jl:9513
 [12] autodiff_deferred
    @ ~/.julia/dev/Enzyme/src/Enzyme.jl:372
 [13] autodiff_deferred
    @ ~/.julia/dev/Enzyme/src/Enzyme.jl:442
 [14] grad_kernel!
    @ ~/dms/molly_dev/enzyme_err28.jl:37
Hint: catch this exception as `err` and call `code_typed(err; interactive = true)` to introspect the erronous code with Cthulhu.jl
Stacktrace:
  [1] check_ir(job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, args::LLVM.Module)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/YO8Uj/src/validation.jl:149
  [2] macro expansion
    @ ~/.julia/packages/GPUCompiler/YO8Uj/src/driver.jl:415 [inlined]
  [3] macro expansion
    @ ~/.julia/packages/TimerOutputs/RsWnF/src/TimerOutput.jl:253 [inlined]
  [4] macro expansion
    @ ~/.julia/packages/GPUCompiler/YO8Uj/src/driver.jl:414 [inlined]
  [5] emit_llvm(job::GPUCompiler.CompilerJob; libraries::Bool, toplevel::Bool, optimize::Bool, cleanup::Bool, only_entry::Bool, validate::Bool)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/YO8Uj/src/utils.jl:89
  [6] emit_llvm
    @ ~/.julia/packages/GPUCompiler/YO8Uj/src/utils.jl:83 [inlined]
  [7] codegen(output::Symbol, job::GPUCompiler.CompilerJob; libraries::Bool, toplevel::Bool, optimize::Bool, cleanup::Bool, strip::Bool, validate::Bool, only_entry::Bool, parent_job::Nothing)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/YO8Uj/src/driver.jl:129
  [8] compile(target::Symbol, job::GPUCompiler.CompilerJob; libraries::Bool, toplevel::Bool, optimize::Bool, cleanup::Bool, strip::Bool, validate::Bool, only_entry::Bool)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/YO8Uj/src/driver.jl:106
  [9] compile
    @ ~/.julia/packages/GPUCompiler/YO8Uj/src/driver.jl:98 [inlined]
 [10] #1037
    @ ~/.julia/dev/CUDA/src/compiler/compilation.jl:104 [inlined]
 [11] JuliaContext(f::CUDA.var"#1037#1040"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}})
    @ GPUCompiler ~/.julia/packages/GPUCompiler/YO8Uj/src/driver.jl:47
 [12] compile(job::GPUCompiler.CompilerJob)
    @ CUDA ~/.julia/dev/CUDA/src/compiler/compilation.jl:103
 [13] actual_compilation(cache::Dict{Any, CuFunction}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, compiler::typeof(CUDA.compile), linker::typeof(CUDA.link))
    @ GPUCompiler ~/.julia/packages/GPUCompiler/YO8Uj/src/execution.jl:125
 [14] cached_compilation(cache::Dict{Any, CuFunction}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, compiler::Function, linker::Function)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/YO8Uj/src/execution.jl:103
 [15] macro expansion
    @ ~/.julia/dev/CUDA/src/compiler/execution.jl:318 [inlined]
 [16] macro expansion
    @ ./lock.jl:267 [inlined]
 [17] cufunction(f::typeof(grad_kernel!), tt::Type{Tuple{CuDeviceVector{Float64, 1}, CuDeviceVector{Float64, 1}, CuDeviceVector{SVector{3, Float64}, 1}, CuDeviceVector{SVector{3, Float64}, 1}, CuDeviceVector{Int64, 1}, CuDeviceVector{Int64, 1}, CuDeviceVector{HarmonicBond, 1}, CuDeviceVector{HarmonicBond, 1}}}; kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
    @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:313
 [18] cufunction(f::typeof(grad_kernel!), tt::Type{Tuple{CuDeviceVector{Float64, 1}, CuDeviceVector{Float64, 1}, CuDeviceVector{SVector{3, Float64}, 1}, CuDeviceVector{SVector{3, Float64}, 1}, CuDeviceVector{Int64, 1}, CuDeviceVector{Int64, 1}, CuDeviceVector{HarmonicBond, 1}, CuDeviceVector{HarmonicBond, 1}}})
    @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:310
 [19] macro expansion
    @ ~/.julia/dev/CUDA/src/compiler/execution.jl:104 [inlined]
 [20] top-level scope
    @ ~/.julia/dev/CUDA/src/utilities.jl:25
wsmoses commented 1 year ago

Can you add a log with Enzyme.API.printall!(true) ?

jgreener64 commented 1 year ago

With Enzyme.API.printall!(true):

after simplification :
; Function Attrs: mustprogress willreturn
define void @preprocess_julia_kernel__3362_inner5({ i8 addrspace(1)*, i64, [1 x i64], i64 } %0, { i8 addrspace(1)*, i64, [1 x i64], i64 } %1, { i8 addrspace(1)*, i64, [1 x i64], i64 } %2, { i8 addrspace(1)*, i64, [1 x i64], i64 } %3, { i8 addrspace(1)*, i64, [1 x i64], i64 } %4) local_unnamed_addr #5 !dbg !224 {
entry:
  %.fca.0.extract25 = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %0, 0, !dbg !225
  %.fca.2.0.extract11 = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %2, 2, 0, !dbg !225
  %5 = call {}*** @julia.get_pgcstack() #6
  %6 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #6, !dbg !226, !range !27
  %7 = zext i32 %6 to i64, !dbg !233
  %8 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #6, !dbg !235, !range !39
  %9 = zext i32 %8 to i64, !dbg !240
  %10 = mul nuw nsw i64 %7, %9, !dbg !242
  %11 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #6, !dbg !244, !range !53
  %12 = add nuw nsw i32 %11, 1, !dbg !249
  %13 = zext i32 %12 to i64, !dbg !250
  %14 = add nuw nsw i64 %10, %13, !dbg !252
  %.not = icmp sgt i64 %14, %.fca.2.0.extract11, !dbg !254
  br i1 %.not, label %julia_kernel__3362_inner.exit, label %L31.i, !dbg !256

L31.i:                                            ; preds = %entry
  %.fca.0.extract = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %4, 0, !dbg !225
  %.fca.0.extract1 = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %3, 0, !dbg !225
  %.fca.0.extract9 = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %2, 0, !dbg !225
  %.fca.0.extract16 = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %1, 0, !dbg !225
  %15 = add nsw i64 %14, -1, !dbg !257
  %16 = shl nuw nsw i64 %15, 3, !dbg !263
  %17 = getelementptr i8, i8 addrspace(1)* %.fca.0.extract9, i64 %16, !dbg !265
  %18 = bitcast i8 addrspace(1)* %17 to i64 addrspace(1)*, !dbg !266
  %19 = call i64 @llvm.nvvm.ldg.global.i.i64.p1i64(i64 addrspace(1)* readonly %18, i32 noundef 8) #7, !dbg !266
  %20 = getelementptr i8, i8 addrspace(1)* %.fca.0.extract1, i64 %16, !dbg !265
  %21 = bitcast i8 addrspace(1)* %20 to i64 addrspace(1)*, !dbg !266
  %22 = call i64 @llvm.nvvm.ldg.global.i.i64.p1i64(i64 addrspace(1)* readonly %21, i32 noundef 8) #7, !dbg !266
  %23 = bitcast i8 addrspace(1)* %.fca.0.extract to [2 x double] addrspace(1)*, !dbg !271
  %.elt = getelementptr inbounds [2 x double], [2 x double] addrspace(1)* %23, i64 %15, i64 0, !dbg !271
  %.unpack = load double, double addrspace(1)* %.elt, align 8, !dbg !271, !tbaa !98
  %.elt53 = getelementptr inbounds [2 x double], [2 x double] addrspace(1)* %23, i64 %15, i64 1, !dbg !271
  %.unpack54 = load double, double addrspace(1)* %.elt53, align 8, !dbg !271, !tbaa !98
  %24 = add i64 %19, -1, !dbg !279
  %25 = bitcast i8 addrspace(1)* %.fca.0.extract16 to [1 x [3 x double]] addrspace(1)*, !dbg !271
  %.unpack55.elt = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %25, i64 %24, i64 0, i64 0, !dbg !271
  %.unpack55.unpack = load double, double addrspace(1)* %.unpack55.elt, align 8, !dbg !271, !tbaa !98
  %.unpack55.elt56 = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %25, i64 %24, i64 0, i64 1, !dbg !271
  %.unpack55.unpack57 = load double, double addrspace(1)* %.unpack55.elt56, align 8, !dbg !271, !tbaa !98
  %.unpack55.elt58 = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %25, i64 %24, i64 0, i64 2, !dbg !271
  %.unpack55.unpack59 = load double, double addrspace(1)* %.unpack55.elt58, align 8, !dbg !271, !tbaa !98
  %26 = add i64 %22, -1, !dbg !279
  %.unpack61.elt = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %25, i64 %26, i64 0, i64 0, !dbg !271
  %.unpack61.unpack = load double, double addrspace(1)* %.unpack61.elt, align 8, !dbg !271, !tbaa !98
  %.unpack61.elt62 = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %25, i64 %26, i64 0, i64 1, !dbg !271
  %.unpack61.unpack63 = load double, double addrspace(1)* %.unpack61.elt62, align 8, !dbg !271, !tbaa !98
  %.unpack61.elt64 = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %25, i64 %26, i64 0, i64 2, !dbg !271
  %.unpack61.unpack65 = load double, double addrspace(1)* %.unpack61.elt64, align 8, !dbg !271, !tbaa !98
  %27 = fsub double %.unpack55.unpack, %.unpack61.unpack, !dbg !281
  %28 = fsub double %.unpack55.unpack57, %.unpack61.unpack63, !dbg !281
  %29 = fsub double %.unpack55.unpack59, %.unpack61.unpack65, !dbg !281
  %30 = fmul double %27, %27, !dbg !287
  %31 = fmul double %28, %28, !dbg !287
  %32 = fadd double %30, %31, !dbg !294
  %33 = fmul double %29, %29, !dbg !287
  %34 = fadd double %32, %33, !dbg !294
  %35 = call double @__nv_sqrt(double %34) #6, !dbg !295
  %36 = fcmp ule double %35, 0.000000e+00, !dbg !296
  br i1 %36, label %L176.i, label %L169.i, !dbg !298

L169.i:                                           ; preds = %L31.i
  %37 = call i32 @__nv_isfinited(double %35) #6, !dbg !299
  %38 = icmp eq i32 %37, 0, !dbg !300
  br i1 %38, label %L176.i, label %L221.i, !dbg !298

L176.i:                                           ; preds = %L169.i, %L31.i
  %39 = call double @__nv_fabs(double %27) #6, !dbg !304
  %40 = call double @__nv_fabs(double %28) #6, !dbg !310
  %41 = call double @__nv_fmax(double %39, double %40) #6, !dbg !313
  %42 = call double @__nv_fabs(double %29) #6, !dbg !310
  %43 = call double @__nv_fmax(double %41, double %42) #6, !dbg !313
  %44 = call i32 @__nv_isfinited(double %43) #6, !dbg !314
  %.not68 = icmp eq i32 %44, 0, !dbg !316
  br i1 %.not68, label %L221.i, label %L204.i, !dbg !320

L204.i:                                           ; preds = %L176.i
  %45 = fcmp une double %43, 0.000000e+00, !dbg !321
  br i1 %45, label %L208.i, label %L206.i, !dbg !324

L206.i:                                           ; preds = %L204.i
  %46 = call double @__nv_fabs(double noundef 0.000000e+00) #6, !dbg !325
  br label %L221.i, !dbg !329

L208.i:                                           ; preds = %L204.i
  %47 = fdiv double %27, %43, !dbg !331
  %48 = fmul double %47, %47, !dbg !333
  %49 = fdiv double %28, %43, !dbg !331
  %50 = fmul double %49, %49, !dbg !333
  %51 = fadd double %48, %50, !dbg !336
  %52 = fdiv double %29, %43, !dbg !331
  %53 = fmul double %52, %52, !dbg !333
  %54 = fadd double %53, %51, !dbg !336
  %55 = call double @__nv_sqrt(double %54) #6, !dbg !337
  %56 = fmul double %43, %55, !dbg !338
  br label %L221.i, !dbg !329

L221.i:                                           ; preds = %L208.i, %L206.i, %L176.i, %L169.i
  %value_phi.i = phi double [ %35, %L169.i ], [ %46, %L206.i ], [ %56, %L208.i ], [ %43, %L176.i ]
  %57 = fmul double %.unpack, 5.000000e-01, !dbg !339
  %58 = fsub double %value_phi.i, %.unpack54, !dbg !342
  %59 = fmul double %58, %58, !dbg !343
  %60 = fmul double %57, %59, !dbg !345
  %61 = bitcast i8 addrspace(1)* %.fca.0.extract25 to double addrspace(1)*, !dbg !346
  %62 = atomicrmw fadd double addrspace(1)* %61, double %60 monotonic, align 8, !dbg !346
  br label %julia_kernel__3362_inner.exit, !dbg !353

julia_kernel__3362_inner.exit:                    ; preds = %L221.i, %entry
  ret void, !dbg !225
}

; Function Attrs: mustprogress willreturn
define internal void @diffejulia_kernel__3362_inner5({ i8 addrspace(1)*, i64, [1 x i64], i64 } %0, { i8 addrspace(1)*, i64, [1 x i64], i64 } %"'", { i8 addrspace(1)*, i64, [1 x i64], i64 } %1, { i8 addrspace(1)*, i64, [1 x i64], i64 } %"'1", { i8 addrspace(1)*, i64, [1 x i64], i64 } %2, { i8 addrspace(1)*, i64, [1 x i64], i64 } %3, { i8 addrspace(1)*, i64, [1 x i64], i64 } %4, { i8 addrspace(1)*, i64, [1 x i64], i64 } %"'2") local_unnamed_addr #5 !dbg !354 {
entry:
  %"'de" = alloca double, align 8
  %5 = getelementptr double, double* %"'de", i64 0
  store double 0.000000e+00, double* %5, align 8
  %"'de9" = alloca double, align 8
  %6 = getelementptr double, double* %"'de9", i64 0
  store double 0.000000e+00, double* %6, align 8
  %"'de10" = alloca double, align 8
  %7 = getelementptr double, double* %"'de10", i64 0
  store double 0.000000e+00, double* %7, align 8
  %"'de11" = alloca double, align 8
  %8 = getelementptr double, double* %"'de11", i64 0
  store double 0.000000e+00, double* %8, align 8
  %"'de12" = alloca double, align 8
  %9 = getelementptr double, double* %"'de12", i64 0
  store double 0.000000e+00, double* %9, align 8
  %"'de13" = alloca double, align 8
  %10 = getelementptr double, double* %"'de13", i64 0
  store double 0.000000e+00, double* %10, align 8
  %"'de14" = alloca double, align 8
  %11 = getelementptr double, double* %"'de14", i64 0
  store double 0.000000e+00, double* %11, align 8
  %"'de15" = alloca double, align 8
  %12 = getelementptr double, double* %"'de15", i64 0
  store double 0.000000e+00, double* %12, align 8
  %"'de16" = alloca double, align 8
  %13 = getelementptr double, double* %"'de16", i64 0
  store double 0.000000e+00, double* %13, align 8
  %".unpack55.unpack59'de" = alloca double, align 8
  %14 = getelementptr double, double* %".unpack55.unpack59'de", i64 0
  store double 0.000000e+00, double* %14, align 8
  %".unpack61.unpack65'de" = alloca double, align 8
  %15 = getelementptr double, double* %".unpack61.unpack65'de", i64 0
  store double 0.000000e+00, double* %15, align 8
  %".unpack55.unpack57'de" = alloca double, align 8
  %16 = getelementptr double, double* %".unpack55.unpack57'de", i64 0
  store double 0.000000e+00, double* %16, align 8
  %".unpack61.unpack63'de" = alloca double, align 8
  %17 = getelementptr double, double* %".unpack61.unpack63'de", i64 0
  store double 0.000000e+00, double* %17, align 8
  %".unpack55.unpack'de" = alloca double, align 8
  %18 = getelementptr double, double* %".unpack55.unpack'de", i64 0
  store double 0.000000e+00, double* %18, align 8
  %".unpack61.unpack'de" = alloca double, align 8
  %19 = getelementptr double, double* %".unpack61.unpack'de", i64 0
  store double 0.000000e+00, double* %19, align 8
  %".unpack54'de" = alloca double, align 8
  %20 = getelementptr double, double* %".unpack54'de", i64 0
  store double 0.000000e+00, double* %20, align 8
  %".unpack'de" = alloca double, align 8
  %21 = getelementptr double, double* %".unpack'de", i64 0
  store double 0.000000e+00, double* %21, align 8
  %"'de33" = alloca double, align 8
  %22 = getelementptr double, double* %"'de33", i64 0
  store double 0.000000e+00, double* %22, align 8
  %"'de34" = alloca double, align 8
  %23 = getelementptr double, double* %"'de34", i64 0
  store double 0.000000e+00, double* %23, align 8
  %"'de35" = alloca double, align 8
  %24 = getelementptr double, double* %"'de35", i64 0
  store double 0.000000e+00, double* %24, align 8
  %"'de36" = alloca double, align 8
  %25 = getelementptr double, double* %"'de36", i64 0
  store double 0.000000e+00, double* %25, align 8
  %"'de37" = alloca double, align 8
  %26 = getelementptr double, double* %"'de37", i64 0
  store double 0.000000e+00, double* %26, align 8
  %"'de44" = alloca double, align 8
  %27 = getelementptr double, double* %"'de44", i64 0
  store double 0.000000e+00, double* %27, align 8
  %"'de53" = alloca double, align 8
  %28 = getelementptr double, double* %"'de53", i64 0
  store double 0.000000e+00, double* %28, align 8
  %"'de54" = alloca double, align 8
  %29 = getelementptr double, double* %"'de54", i64 0
  store double 0.000000e+00, double* %29, align 8
  %"'de55" = alloca double, align 8
  %30 = getelementptr double, double* %"'de55", i64 0
  store double 0.000000e+00, double* %30, align 8
  %"'de56" = alloca double, align 8
  %31 = getelementptr double, double* %"'de56", i64 0
  store double 0.000000e+00, double* %31, align 8
  %"'de57" = alloca double, align 8
  %32 = getelementptr double, double* %"'de57", i64 0
  store double 0.000000e+00, double* %32, align 8
  %"'de58" = alloca double, align 8
  %33 = getelementptr double, double* %"'de58", i64 0
  store double 0.000000e+00, double* %33, align 8
  %"'de59" = alloca double, align 8
  %34 = getelementptr double, double* %"'de59", i64 0
  store double 0.000000e+00, double* %34, align 8
  %"'de60" = alloca double, align 8
  %35 = getelementptr double, double* %"'de60", i64 0
  store double 0.000000e+00, double* %35, align 8
  %"'de61" = alloca double, align 8
  %36 = getelementptr double, double* %"'de61", i64 0
  store double 0.000000e+00, double* %36, align 8
  %"'de63" = alloca double, align 8
  %37 = getelementptr double, double* %"'de63", i64 0
  store double 0.000000e+00, double* %37, align 8
  %"'de66" = alloca double, align 8
  %38 = getelementptr double, double* %"'de66", i64 0
  store double 0.000000e+00, double* %38, align 8
  %"'de68" = alloca double, align 8
  %39 = getelementptr double, double* %"'de68", i64 0
  store double 0.000000e+00, double* %39, align 8
  %"'de69" = alloca double, align 8
  %40 = getelementptr double, double* %"'de69", i64 0
  store double 0.000000e+00, double* %40, align 8
  %"value_phi.i'de" = alloca double, align 8
  %41 = getelementptr double, double* %"value_phi.i'de", i64 0
  store double 0.000000e+00, double* %41, align 8
  %.fca.0.extract25 = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %0, 0, !dbg !355
  %".fca.0.extract25'ipev" = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %"'", 0, !dbg !355
  %.fca.2.0.extract11 = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %2, 2, 0, !dbg !355
  %42 = call {}*** @julia.get_pgcstack() #6
  %43 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #6, !dbg !356, !range !27
  %44 = zext i32 %43 to i64, !dbg !363
  %45 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #6, !dbg !365, !range !39
  %46 = zext i32 %45 to i64, !dbg !370
  %47 = mul nuw nsw i64 %44, %46, !dbg !372
  %48 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #6, !dbg !374, !range !53
  %49 = add nuw nsw i32 %48, 1, !dbg !379
  %50 = zext i32 %49 to i64, !dbg !380
  %51 = add nuw nsw i64 %47, %50, !dbg !382
  %.not = icmp sgt i64 %51, %.fca.2.0.extract11, !dbg !384
  br i1 %.not, label %julia_kernel__3362_inner.exit, label %L31.i, !dbg !386

L31.i:                                            ; preds = %entry
  %.fca.0.extract = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %4, 0, !dbg !355
  %".fca.0.extract'ipev" = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %"'2", 0, !dbg !355
  %.fca.0.extract1 = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %3, 0, !dbg !355
  %.fca.0.extract9 = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %2, 0, !dbg !355
  %.fca.0.extract16 = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %1, 0, !dbg !355
  %".fca.0.extract16'ipev" = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %"'1", 0, !dbg !387
  %52 = add nsw i64 %51, -1, !dbg !387
  %53 = shl nuw nsw i64 %52, 3, !dbg !393
  %54 = getelementptr i8, i8 addrspace(1)* %.fca.0.extract9, i64 %53, !dbg !395
  %55 = bitcast i8 addrspace(1)* %54 to i64 addrspace(1)*, !dbg !396
  %56 = call i64 @llvm.nvvm.ldg.global.i.i64.p1i64(i64 addrspace(1)* readonly %55, i32 noundef 8) #7, !dbg !396, !alias.scope !401, !noalias !404
  %57 = getelementptr i8, i8 addrspace(1)* %.fca.0.extract1, i64 %53, !dbg !395
  %58 = bitcast i8 addrspace(1)* %57 to i64 addrspace(1)*, !dbg !396
  %59 = call i64 @llvm.nvvm.ldg.global.i.i64.p1i64(i64 addrspace(1)* readonly %58, i32 noundef 8) #7, !dbg !396, !alias.scope !406, !noalias !409
  %"'ipc25" = bitcast i8 addrspace(1)* %".fca.0.extract'ipev" to [2 x double] addrspace(1)*, !dbg !411
  %60 = bitcast i8 addrspace(1)* %.fca.0.extract to [2 x double] addrspace(1)*, !dbg !411
  %".elt'ipg" = getelementptr inbounds [2 x double], [2 x double] addrspace(1)* %"'ipc25", i64 %52, i64 0, !dbg !411
  %.elt = getelementptr inbounds [2 x double], [2 x double] addrspace(1)* %60, i64 %52, i64 0, !dbg !411
  %.unpack = load double, double addrspace(1)* %.elt, align 8, !dbg !411, !tbaa !98, !alias.scope !419, !noalias !422
  %".elt53'ipg" = getelementptr inbounds [2 x double], [2 x double] addrspace(1)* %"'ipc25", i64 %52, i64 1, !dbg !411
  %.elt53 = getelementptr inbounds [2 x double], [2 x double] addrspace(1)* %60, i64 %52, i64 1, !dbg !411
  %.unpack54 = load double, double addrspace(1)* %.elt53, align 8, !dbg !411, !tbaa !98, !alias.scope !419, !noalias !422
  %61 = add i64 %56, -1, !dbg !424
  %"'ipc" = bitcast i8 addrspace(1)* %".fca.0.extract16'ipev" to [1 x [3 x double]] addrspace(1)*, !dbg !411
  %62 = bitcast i8 addrspace(1)* %.fca.0.extract16 to [1 x [3 x double]] addrspace(1)*, !dbg !411
  %".unpack55.elt'ipg" = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %"'ipc", i64 %61, i64 0, i64 0, !dbg !411
  %.unpack55.elt = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %62, i64 %61, i64 0, i64 0, !dbg !411
  %.unpack55.unpack = load double, double addrspace(1)* %.unpack55.elt, align 8, !dbg !411, !tbaa !98, !alias.scope !426, !noalias !429
  %".unpack55.elt56'ipg" = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %"'ipc", i64 %61, i64 0, i64 1, !dbg !411
  %.unpack55.elt56 = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %62, i64 %61, i64 0, i64 1, !dbg !411
  %.unpack55.unpack57 = load double, double addrspace(1)* %.unpack55.elt56, align 8, !dbg !411, !tbaa !98, !alias.scope !426, !noalias !429
  %".unpack55.elt58'ipg" = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %"'ipc", i64 %61, i64 0, i64 2, !dbg !411
  %.unpack55.elt58 = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %62, i64 %61, i64 0, i64 2, !dbg !411
  %.unpack55.unpack59 = load double, double addrspace(1)* %.unpack55.elt58, align 8, !dbg !411, !tbaa !98, !alias.scope !426, !noalias !429
  %63 = add i64 %59, -1, !dbg !424
  %".unpack61.elt'ipg" = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %"'ipc", i64 %63, i64 0, i64 0, !dbg !411
  %.unpack61.elt = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %62, i64 %63, i64 0, i64 0, !dbg !411
  %.unpack61.unpack = load double, double addrspace(1)* %.unpack61.elt, align 8, !dbg !411, !tbaa !98, !alias.scope !426, !noalias !429
  %".unpack61.elt62'ipg" = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %"'ipc", i64 %63, i64 0, i64 1, !dbg !411
  %.unpack61.elt62 = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %62, i64 %63, i64 0, i64 1, !dbg !411
  %.unpack61.unpack63 = load double, double addrspace(1)* %.unpack61.elt62, align 8, !dbg !411, !tbaa !98, !alias.scope !426, !noalias !429
  %".unpack61.elt64'ipg" = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %"'ipc", i64 %63, i64 0, i64 2, !dbg !411
  %.unpack61.elt64 = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %62, i64 %63, i64 0, i64 2, !dbg !411
  %.unpack61.unpack65 = load double, double addrspace(1)* %.unpack61.elt64, align 8, !dbg !411, !tbaa !98, !alias.scope !426, !noalias !429
  %64 = fsub double %.unpack55.unpack, %.unpack61.unpack, !dbg !431
  %65 = fsub double %.unpack55.unpack57, %.unpack61.unpack63, !dbg !431
  %66 = fsub double %.unpack55.unpack59, %.unpack61.unpack65, !dbg !431
  %67 = fmul double %64, %64, !dbg !437
  %68 = fmul double %65, %65, !dbg !437
  %69 = fadd double %67, %68, !dbg !444
  %70 = fmul double %66, %66, !dbg !437
  %71 = fadd double %69, %70, !dbg !444
  %72 = call double @__nv_sqrt(double %71) #6, !dbg !445
  %73 = fcmp ule double %72, 0.000000e+00, !dbg !446
  br i1 %73, label %L176.i, label %L169.i, !dbg !448

L169.i:                                           ; preds = %L31.i
  call void inttoptr (i64 139880611672816 to void (i8*)*)(i8* getelementptr inbounds ([7867 x i8], [7867 x i8]* @0, i32 0, i32 0)) #8, !dbg !449
  %_augmented = call { {} addrspace(10)*, i32 } @fakeaugmented___nv_isfinited(double %72), !dbg !449
  %subcache = extractvalue { {} addrspace(10)*, i32 } %_augmented, 0, !dbg !449
  %74 = extractvalue { {} addrspace(10)*, i32 } %_augmented, 1, !dbg !449
  %75 = icmp eq i32 %74, 0, !dbg !450
  br i1 %75, label %L176.i, label %L221.i, !dbg !448

L176.i:                                           ; preds = %L169.i, %L31.i
  %subcache_cache.0 = phi {} addrspace(10)* [ addrspacecast ({}* inttoptr (i64 139887560998920 to {}*) to {} addrspace(10)*), %L31.i ], [ %subcache, %L169.i ]
  %76 = call double @__nv_fabs(double %64) #6, !dbg !454
  %77 = call double @__nv_fabs(double %65) #6, !dbg !460
  %78 = call double @__nv_fmax(double %76, double %77) #6, !dbg !463
  %79 = call double @__nv_fabs(double %66) #6, !dbg !460
  %80 = call double @__nv_fmax(double %78, double %79) #6, !dbg !463
  %_augmented31 = call { {} addrspace(10)*, i32 } @fakeaugmented___nv_isfinited(double %80), !dbg !464
  %subcache32 = extractvalue { {} addrspace(10)*, i32 } %_augmented31, 0, !dbg !464
  %81 = extractvalue { {} addrspace(10)*, i32 } %_augmented31, 1, !dbg !464
  %.not68 = icmp eq i32 %81, 0, !dbg !466
  br i1 %.not68, label %L221.i, label %L204.i, !dbg !470

L204.i:                                           ; preds = %L176.i
  %82 = fcmp une double %80, 0.000000e+00, !dbg !471
  br i1 %82, label %L208.i, label %L206.i, !dbg !474

L206.i:                                           ; preds = %L204.i
  %83 = call double @__nv_fabs(double noundef 0.000000e+00) #6, !dbg !475
  br label %L221.i, !dbg !479

L208.i:                                           ; preds = %L204.i
  %84 = fdiv double %64, %80, !dbg !481
  %85 = fmul double %84, %84, !dbg !483
  %86 = fdiv double %65, %80, !dbg !481
  %87 = fmul double %86, %86, !dbg !483
  %88 = fadd double %85, %87, !dbg !486
  %89 = fdiv double %66, %80, !dbg !481
  %90 = fmul double %89, %89, !dbg !483
  %91 = fadd double %90, %88, !dbg !486
  %92 = call double @__nv_sqrt(double %91) #6, !dbg !487
  %93 = fmul double %80, %92, !dbg !488
  br label %L221.i, !dbg !479

L221.i:                                           ; preds = %L208.i, %L206.i, %L176.i, %L169.i
  %_cache73.0 = phi i8 [ 2, %L176.i ], [ 0, %L208.i ], [ 3, %L206.i ], [ 1, %L169.i ], !dbg !489
  %_cache72.0 = phi i8 [ 0, %L176.i ], [ 2, %L208.i ], [ 3, %L206.i ], [ 1, %L169.i ], !dbg !489
  %subcache32_cache.0 = phi {} addrspace(10)* [ %subcache32, %L176.i ], [ %subcache32, %L208.i ], [ %subcache32, %L206.i ], [ addrspacecast ({}* inttoptr (i64 139887560998920 to {}*) to {} addrspace(10)*), %L169.i ]
  %subcache_cache.1 = phi {} addrspace(10)* [ %subcache_cache.0, %L176.i ], [ %subcache_cache.0, %L208.i ], [ %subcache_cache.0, %L206.i ], [ %subcache, %L169.i ]
  %value_phi.i = phi double [ %72, %L169.i ], [ %83, %L206.i ], [ %93, %L208.i ], [ %80, %L176.i ]
  %94 = fmul double %.unpack, 5.000000e-01, !dbg !490
  %95 = fsub double %value_phi.i, %.unpack54, !dbg !493
  %96 = fmul double %95, %95, !dbg !494
  %97 = fmul double %94, %96, !dbg !496
  %"'ipc62" = bitcast i8 addrspace(1)* %".fca.0.extract25'ipev" to double addrspace(1)*, !dbg !497
  %98 = bitcast i8 addrspace(1)* %.fca.0.extract25 to double addrspace(1)*, !dbg !497
  %99 = atomicrmw fadd double addrspace(1)* %98, double %97 monotonic, align 8, !dbg !497
  br label %julia_kernel__3362_inner.exit, !dbg !504

julia_kernel__3362_inner.exit:                    ; preds = %L221.i, %entry
  %_cache73.1 = phi i8 [ undef, %entry ], [ %_cache73.0, %L221.i ]
  %_cache72.1 = phi i8 [ undef, %entry ], [ %_cache72.0, %L221.i ]
  %_cache67.0 = phi double [ undef, %entry ], [ %94, %L221.i ]
  %_cache64.0 = phi double [ undef, %entry ], [ %95, %L221.i ]
  %subcache32_cache.1 = phi {} addrspace(10)* [ addrspacecast ({}* inttoptr (i64 139887560998920 to {}*) to {} addrspace(10)*), %entry ], [ %subcache32_cache.0, %L221.i ]
  %subcache_cache.2 = phi {} addrspace(10)* [ addrspacecast ({}* inttoptr (i64 139887560998920 to {}*) to {} addrspace(10)*), %entry ], [ %subcache_cache.1, %L221.i ]
  %_cache6.0 = phi double [ undef, %entry ], [ %66, %L221.i ]
  %_cache3.0 = phi double [ undef, %entry ], [ %65, %L221.i ]
  %_cache.0 = phi double [ undef, %entry ], [ %64, %L221.i ]
  br label %invertjulia_kernel__3362_inner.exit, !dbg !355

invertentry:                                      ; preds = %invertjulia_kernel__3362_inner.exit, %invertL31.i
  ret void

invertL31.i:                                      ; preds = %invertL176.i, %invertL169.i
  %100 = load double, double* %"'de", align 8, !dbg !445
  store double 0.000000e+00, double* %"'de", align 8, !dbg !445
  %_unwrap = fmul double %_cache.0, %_cache.0, !dbg !445
  %_unwrap4 = fmul double %_cache3.0, %_cache3.0, !dbg !445
  %_unwrap5 = fadd double %_unwrap, %_unwrap4, !dbg !445
  %_unwrap7 = fmul double %_cache6.0, %_cache6.0, !dbg !445
  %_unwrap8 = fadd double %_unwrap5, %_unwrap7, !dbg !445
  %101 = fcmp fast ueq double %_unwrap8, 0.000000e+00, !dbg !445
  %102 = call fast double @__nv_sqrt(double %_unwrap8) #9, !dbg !445
  %103 = fmul fast double 2.000000e+00, %102, !dbg !445
  %104 = fdiv fast double %100, %103, !dbg !445
  %105 = select fast i1 %101, double 0.000000e+00, double %104, !dbg !445
  %106 = load double, double* %"'de9", align 8, !dbg !445
  %107 = fadd fast double %106, %104, !dbg !445
  %108 = select fast i1 %101, double %106, double %107, !dbg !445
  store double %108, double* %"'de9", align 8, !dbg !445
  %109 = load double, double* %"'de9", align 8, !dbg !444
  store double 0.000000e+00, double* %"'de9", align 8, !dbg !444
  %110 = load double, double* %"'de10", align 8, !dbg !444
  %111 = fadd fast double %110, %109, !dbg !444
  store double %111, double* %"'de10", align 8, !dbg !444
  %112 = load double, double* %"'de11", align 8, !dbg !444
  %113 = fadd fast double %112, %109, !dbg !444
  store double %113, double* %"'de11", align 8, !dbg !444
  %114 = load double, double* %"'de11", align 8, !dbg !437
  store double 0.000000e+00, double* %"'de11", align 8, !dbg !437
  %115 = fmul fast double %114, %_cache6.0, !dbg !437
  %116 = load double, double* %"'de12", align 8, !dbg !437
  %117 = fadd fast double %116, %115, !dbg !437
  store double %117, double* %"'de12", align 8, !dbg !437
  %118 = fmul fast double %114, %_cache6.0, !dbg !437
  %119 = load double, double* %"'de12", align 8, !dbg !437
  %120 = fadd fast double %119, %118, !dbg !437
  store double %120, double* %"'de12", align 8, !dbg !437
  %121 = load double, double* %"'de10", align 8, !dbg !444
  store double 0.000000e+00, double* %"'de10", align 8, !dbg !444
  %122 = load double, double* %"'de13", align 8, !dbg !444
  %123 = fadd fast double %122, %121, !dbg !444
  store double %123, double* %"'de13", align 8, !dbg !444
  %124 = load double, double* %"'de14", align 8, !dbg !444
  %125 = fadd fast double %124, %121, !dbg !444
  store double %125, double* %"'de14", align 8, !dbg !444
  %126 = load double, double* %"'de14", align 8, !dbg !437
  store double 0.000000e+00, double* %"'de14", align 8, !dbg !437
  %127 = fmul fast double %126, %_cache3.0, !dbg !437
  %128 = load double, double* %"'de15", align 8, !dbg !437
  %129 = fadd fast double %128, %127, !dbg !437
  store double %129, double* %"'de15", align 8, !dbg !437
  %130 = fmul fast double %126, %_cache3.0, !dbg !437
  %131 = load double, double* %"'de15", align 8, !dbg !437
  %132 = fadd fast double %131, %130, !dbg !437
  store double %132, double* %"'de15", align 8, !dbg !437
  %133 = load double, double* %"'de13", align 8, !dbg !437
  store double 0.000000e+00, double* %"'de13", align 8, !dbg !437
  %134 = fmul fast double %133, %_cache.0, !dbg !437
  %135 = load double, double* %"'de16", align 8, !dbg !437
  %136 = fadd fast double %135, %134, !dbg !437
  store double %136, double* %"'de16", align 8, !dbg !437
  %137 = fmul fast double %133, %_cache.0, !dbg !437
  %138 = load double, double* %"'de16", align 8, !dbg !437
  %139 = fadd fast double %138, %137, !dbg !437
  store double %139, double* %"'de16", align 8, !dbg !437
  %140 = load double, double* %"'de12", align 8, !dbg !431
  store double 0.000000e+00, double* %"'de12", align 8, !dbg !431
  %141 = load double, double* %".unpack55.unpack59'de", align 8, !dbg !431
  %142 = fadd fast double %141, %140, !dbg !431
  store double %142, double* %".unpack55.unpack59'de", align 8, !dbg !431
  %143 = fneg fast double %140, !dbg !431
  %144 = load double, double* %".unpack61.unpack65'de", align 8, !dbg !431
  %145 = fadd fast double %144, %143, !dbg !431
  store double %145, double* %".unpack61.unpack65'de", align 8, !dbg !431
  %146 = load double, double* %"'de15", align 8, !dbg !431
  store double 0.000000e+00, double* %"'de15", align 8, !dbg !431
  %147 = load double, double* %".unpack55.unpack57'de", align 8, !dbg !431
  %148 = fadd fast double %147, %146, !dbg !431
  store double %148, double* %".unpack55.unpack57'de", align 8, !dbg !431
  %149 = fneg fast double %146, !dbg !431
  %150 = load double, double* %".unpack61.unpack63'de", align 8, !dbg !431
  %151 = fadd fast double %150, %149, !dbg !431
  store double %151, double* %".unpack61.unpack63'de", align 8, !dbg !431
  %152 = load double, double* %"'de16", align 8, !dbg !431
  store double 0.000000e+00, double* %"'de16", align 8, !dbg !431
  %153 = load double, double* %".unpack55.unpack'de", align 8, !dbg !431
  %154 = fadd fast double %153, %152, !dbg !431
  store double %154, double* %".unpack55.unpack'de", align 8, !dbg !431
  %155 = fneg fast double %152, !dbg !431
  %156 = load double, double* %".unpack61.unpack'de", align 8, !dbg !431
  %157 = fadd fast double %156, %155, !dbg !431
  store double %157, double* %".unpack61.unpack'de", align 8, !dbg !431
  %158 = load double, double* %".unpack61.unpack65'de", align 8, !dbg !411
  store double 0.000000e+00, double* %".unpack61.unpack65'de", align 8, !dbg !411
  %".fca.0.extract16'ipev_unwrap" = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %"'1", 0, !dbg !411
  %"'ipc_unwrap" = bitcast i8 addrspace(1)* %".fca.0.extract16'ipev_unwrap" to [1 x [3 x double]] addrspace(1)*, !dbg !411
  %.fca.0.extract1_unwrap = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %3, 0, !dbg !411
  %_unwrap17 = add nsw i64 %51, -1, !dbg !411
  %_unwrap18 = shl nuw nsw i64 %_unwrap17, 3, !dbg !411
  %_unwrap19 = getelementptr i8, i8 addrspace(1)* %.fca.0.extract1_unwrap, i64 %_unwrap18, !dbg !411
  %_unwrap20 = bitcast i8 addrspace(1)* %_unwrap19 to i64 addrspace(1)*, !dbg !411
  %159 = call i64 @llvm.nvvm.ldg.global.i.i64.p1i64(i64 addrspace(1)* readonly %_unwrap20, i32 noundef 8) #7, !dbg !396
  %_unwrap21 = add i64 %159, -1, !dbg !411
  %".unpack61.elt64'ipg_unwrap" = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %"'ipc_unwrap", i64 %_unwrap21, i64 0, i64 2, !dbg !411
  %160 = atomicrmw fadd double addrspace(1)* %".unpack61.elt64'ipg_unwrap", double %158 monotonic, align 8, !dbg !411
  %161 = load double, double* %".unpack61.unpack63'de", align 8, !dbg !411
  store double 0.000000e+00, double* %".unpack61.unpack63'de", align 8, !dbg !411
  %".unpack61.elt62'ipg_unwrap" = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %"'ipc_unwrap", i64 %_unwrap21, i64 0, i64 1, !dbg !411
  %162 = atomicrmw fadd double addrspace(1)* %".unpack61.elt62'ipg_unwrap", double %161 monotonic, align 8, !dbg !411
  %163 = load double, double* %".unpack61.unpack'de", align 8, !dbg !411
  store double 0.000000e+00, double* %".unpack61.unpack'de", align 8, !dbg !411
  %".unpack61.elt'ipg_unwrap" = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %"'ipc_unwrap", i64 %_unwrap21, i64 0, i64 0, !dbg !411
  %164 = atomicrmw fadd double addrspace(1)* %".unpack61.elt'ipg_unwrap", double %163 monotonic, align 8, !dbg !411
  %165 = load double, double* %".unpack55.unpack59'de", align 8, !dbg !411
  store double 0.000000e+00, double* %".unpack55.unpack59'de", align 8, !dbg !411
  %.fca.0.extract9_unwrap = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %2, 0, !dbg !411
  %_unwrap22 = getelementptr i8, i8 addrspace(1)* %.fca.0.extract9_unwrap, i64 %_unwrap18, !dbg !411
  %_unwrap23 = bitcast i8 addrspace(1)* %_unwrap22 to i64 addrspace(1)*, !dbg !411
  %166 = call i64 @llvm.nvvm.ldg.global.i.i64.p1i64(i64 addrspace(1)* readonly %_unwrap23, i32 noundef 8) #7, !dbg !396
  %_unwrap24 = add i64 %166, -1, !dbg !411
  %".unpack55.elt58'ipg_unwrap" = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %"'ipc_unwrap", i64 %_unwrap24, i64 0, i64 2, !dbg !411
  %167 = atomicrmw fadd double addrspace(1)* %".unpack55.elt58'ipg_unwrap", double %165 monotonic, align 8, !dbg !411
  %168 = load double, double* %".unpack55.unpack57'de", align 8, !dbg !411
  store double 0.000000e+00, double* %".unpack55.unpack57'de", align 8, !dbg !411
  %".unpack55.elt56'ipg_unwrap" = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %"'ipc_unwrap", i64 %_unwrap24, i64 0, i64 1, !dbg !411
  %169 = atomicrmw fadd double addrspace(1)* %".unpack55.elt56'ipg_unwrap", double %168 monotonic, align 8, !dbg !411
  %170 = load double, double* %".unpack55.unpack'de", align 8, !dbg !411
  store double 0.000000e+00, double* %".unpack55.unpack'de", align 8, !dbg !411
  %".unpack55.elt'ipg_unwrap" = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %"'ipc_unwrap", i64 %_unwrap24, i64 0, i64 0, !dbg !411
  %171 = atomicrmw fadd double addrspace(1)* %".unpack55.elt'ipg_unwrap", double %170 monotonic, align 8, !dbg !411
  %172 = load double, double* %".unpack54'de", align 8, !dbg !411
  store double 0.000000e+00, double* %".unpack54'de", align 8, !dbg !411
  %".fca.0.extract'ipev_unwrap" = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %"'2", 0, !dbg !411
  %"'ipc25_unwrap" = bitcast i8 addrspace(1)* %".fca.0.extract'ipev_unwrap" to [2 x double] addrspace(1)*, !dbg !411
  %".elt53'ipg_unwrap" = getelementptr inbounds [2 x double], [2 x double] addrspace(1)* %"'ipc25_unwrap", i64 %_unwrap17, i64 1, !dbg !411
  %173 = atomicrmw fadd double addrspace(1)* %".elt53'ipg_unwrap", double %172 monotonic, align 8, !dbg !411
  %174 = load double, double* %".unpack'de", align 8, !dbg !411
  store double 0.000000e+00, double* %".unpack'de", align 8, !dbg !411
  %".elt'ipg_unwrap" = getelementptr inbounds [2 x double], [2 x double] addrspace(1)* %"'ipc25_unwrap", i64 %_unwrap17, i64 0, !dbg !411
  %175 = atomicrmw fadd double addrspace(1)* %".elt'ipg_unwrap", double %174 monotonic, align 8, !dbg !411
  br label %invertentry

invertL169.i:                                     ; preds = %invertL221.i, %invertL176.i
  %_unwrap26 = fmul double %_cache.0, %_cache.0, !dbg !449
  %_unwrap27 = fmul double %_cache3.0, %_cache3.0, !dbg !449
  %_unwrap28 = fadd double %_unwrap26, %_unwrap27, !dbg !449
  %_unwrap29 = fmul double %_cache6.0, %_cache6.0, !dbg !449
  %_unwrap30 = fadd double %_unwrap28, %_unwrap29, !dbg !449
  %176 = call double @__nv_sqrt(double %_unwrap30) #6, !dbg !445
  call void inttoptr (i64 139880611672816 to void (i8*)*)(i8* getelementptr inbounds ([7857 x i8], [7857 x i8]* @1, i32 0, i32 0)) #8, !dbg !449
  %177 = call { double } @diffe__nv_isfinited(double %176, {} addrspace(10)* %subcache_cache.2), !dbg !449
  %178 = extractvalue { double } %177, 0, !dbg !449
  %179 = load double, double* %"'de", align 8, !dbg !449
  %180 = fadd fast double %179, %178, !dbg !449
  store double %180, double* %"'de", align 8, !dbg !449
  br label %invertL31.i

invertL176.i:                                     ; preds = %invertL221.i, %invertL204.i
  %181 = call double @__nv_fabs(double %_cache.0) #6, !dbg !454
  %182 = call double @__nv_fabs(double %_cache3.0) #6, !dbg !460
  %183 = call double @__nv_fmax(double %181, double %182) #6, !dbg !463
  %184 = call double @__nv_fabs(double %_cache6.0) #6, !dbg !460
  %185 = call double @__nv_fmax(double %183, double %184) #6, !dbg !463
  %186 = call { double } @diffe__nv_isfinited(double %185, {} addrspace(10)* %subcache32_cache.1), !dbg !464
  %187 = extractvalue { double } %186, 0, !dbg !464
  %188 = load double, double* %"'de33", align 8, !dbg !464
  %189 = fadd fast double %188, %187, !dbg !464
  store double %189, double* %"'de33", align 8, !dbg !464
  %190 = load double, double* %"'de33", align 8, !dbg !463
  store double 0.000000e+00, double* %"'de33", align 8, !dbg !463
  %191 = fcmp fast olt double %183, %184, !dbg !463
  %192 = select fast i1 %191, double 0.000000e+00, double %190, !dbg !463
  %193 = load double, double* %"'de34", align 8, !dbg !463
  %194 = fadd fast double %193, %190, !dbg !463
  %195 = select fast i1 %191, double %193, double %194, !dbg !463
  store double %195, double* %"'de34", align 8, !dbg !463
  %196 = fcmp fast olt double %183, %184, !dbg !463
  %197 = select fast i1 %196, double %190, double 0.000000e+00, !dbg !463
  %198 = load double, double* %"'de35", align 8, !dbg !463
  %199 = fadd fast double %198, %190, !dbg !463
  %200 = select fast i1 %196, double %199, double %198, !dbg !463
  store double %200, double* %"'de35", align 8, !dbg !463
  %201 = load double, double* %"'de35", align 8, !dbg !460
  store double 0.000000e+00, double* %"'de35", align 8, !dbg !460
  %202 = fcmp fast olt double %_cache6.0, 0.000000e+00, !dbg !460
  %203 = select fast i1 %202, double -1.000000e+00, double 1.000000e+00, !dbg !460
  %204 = fmul fast double %201, %203, !dbg !460
  %205 = load double, double* %"'de12", align 8, !dbg !460
  %206 = fadd fast double %205, %204, !dbg !460
  store double %206, double* %"'de12", align 8, !dbg !460
  %207 = load double, double* %"'de34", align 8, !dbg !463
  store double 0.000000e+00, double* %"'de34", align 8, !dbg !463
  %208 = fcmp fast olt double %181, %182, !dbg !463
  %209 = select fast i1 %208, double 0.000000e+00, double %207, !dbg !463
  %210 = load double, double* %"'de36", align 8, !dbg !463
  %211 = fadd fast double %210, %207, !dbg !463
  %212 = select fast i1 %208, double %210, double %211, !dbg !463
  store double %212, double* %"'de36", align 8, !dbg !463
  %213 = fcmp fast olt double %181, %182, !dbg !463
  %214 = select fast i1 %213, double %207, double 0.000000e+00, !dbg !463
  %215 = load double, double* %"'de37", align 8, !dbg !463
  %216 = fadd fast double %215, %207, !dbg !463
  %217 = select fast i1 %213, double %216, double %215, !dbg !463
  store double %217, double* %"'de37", align 8, !dbg !463
  %218 = load double, double* %"'de37", align 8, !dbg !460
  store double 0.000000e+00, double* %"'de37", align 8, !dbg !460
  %219 = fcmp fast olt double %_cache3.0, 0.000000e+00, !dbg !460
  %220 = select fast i1 %219, double -1.000000e+00, double 1.000000e+00, !dbg !460
  %221 = fmul fast double %218, %220, !dbg !460
  %222 = load double, double* %"'de15", align 8, !dbg !460
  %223 = fadd fast double %222, %221, !dbg !460
  store double %223, double* %"'de15", align 8, !dbg !460
  %224 = load double, double* %"'de36", align 8, !dbg !454
  store double 0.000000e+00, double* %"'de36", align 8, !dbg !454
  %225 = fcmp fast olt double %_cache.0, 0.000000e+00, !dbg !454
  %226 = select fast i1 %225, double -1.000000e+00, double 1.000000e+00, !dbg !454
  %227 = fmul fast double %224, %226, !dbg !454
  %228 = load double, double* %"'de16", align 8, !dbg !454
  %229 = fadd fast double %228, %227, !dbg !454
  store double %229, double* %"'de16", align 8, !dbg !454
  %_unwrap38 = fmul double %_cache.0, %_cache.0
  %_unwrap39 = fmul double %_cache3.0, %_cache3.0
  %_unwrap40 = fadd double %_unwrap38, %_unwrap39
  %_unwrap41 = fmul double %_cache6.0, %_cache6.0
  %_unwrap42 = fadd double %_unwrap40, %_unwrap41
  %230 = call double @__nv_sqrt(double %_unwrap42) #6, !dbg !445
  %_unwrap43 = fcmp ule double %230, 0.000000e+00
  br i1 %_unwrap43, label %invertL31.i, label %invertL169.i

invertL204.i:                                     ; preds = %invertL208.i, %invertL206.i
  br label %invertL176.i

invertL206.i:                                     ; preds = %invertL221.i
  br label %invertL204.i

invertL208.i:                                     ; preds = %invertL221.i
  %231 = load double, double* %"'de44", align 8, !dbg !488
  store double 0.000000e+00, double* %"'de44", align 8, !dbg !488
  %232 = call double @__nv_fabs(double %_cache.0) #6, !dbg !454
  %233 = call double @__nv_fabs(double %_cache3.0) #6, !dbg !460
  %234 = call double @__nv_fmax(double %232, double %233) #6, !dbg !463
  %235 = call double @__nv_fabs(double %_cache6.0) #6, !dbg !460
  %236 = call double @__nv_fmax(double %234, double %235) #6, !dbg !463
  %_unwrap45 = fdiv double %_cache6.0, %236, !dbg !488
  %_unwrap46 = fmul double %_unwrap45, %_unwrap45, !dbg !488
  %_unwrap47 = fdiv double %_cache.0, %236, !dbg !488
  %_unwrap48 = fmul double %_unwrap47, %_unwrap47, !dbg !488
  %_unwrap49 = fdiv double %_cache3.0, %236, !dbg !488
  %_unwrap50 = fmul double %_unwrap49, %_unwrap49, !dbg !488
  %_unwrap51 = fadd double %_unwrap48, %_unwrap50, !dbg !488
  %_unwrap52 = fadd double %_unwrap46, %_unwrap51, !dbg !488
  %237 = call double @__nv_sqrt(double %_unwrap52) #6, !dbg !487
  %238 = fmul fast double %231, %237, !dbg !488
  %239 = load double, double* %"'de33", align 8, !dbg !488
  %240 = fadd fast double %239, %238, !dbg !488
  store double %240, double* %"'de33", align 8, !dbg !488
  %241 = fmul fast double %231, %236, !dbg !488
  %242 = load double, double* %"'de53", align 8, !dbg !488
  %243 = fadd fast double %242, %241, !dbg !488
  store double %243, double* %"'de53", align 8, !dbg !488
  %244 = load double, double* %"'de53", align 8, !dbg !487
  store double 0.000000e+00, double* %"'de53", align 8, !dbg !487
  %245 = fcmp fast ueq double %_unwrap52, 0.000000e+00, !dbg !487
  %246 = call fast double @__nv_sqrt(double %_unwrap52) #9, !dbg !487
  %247 = fmul fast double 2.000000e+00, %246, !dbg !487
  %248 = fdiv fast double %244, %247, !dbg !487
  %249 = select fast i1 %245, double 0.000000e+00, double %248, !dbg !487
  %250 = load double, double* %"'de54", align 8, !dbg !487
  %251 = fadd fast double %250, %248, !dbg !487
  %252 = select fast i1 %245, double %250, double %251, !dbg !487
  store double %252, double* %"'de54", align 8, !dbg !487
  %253 = load double, double* %"'de54", align 8, !dbg !486
  store double 0.000000e+00, double* %"'de54", align 8, !dbg !486
  %254 = load double, double* %"'de55", align 8, !dbg !486
  %255 = fadd fast double %254, %253, !dbg !486
  store double %255, double* %"'de55", align 8, !dbg !486
  %256 = load double, double* %"'de56", align 8, !dbg !486
  %257 = fadd fast double %256, %253, !dbg !486
  store double %257, double* %"'de56", align 8, !dbg !486
  %258 = load double, double* %"'de55", align 8, !dbg !483
  store double 0.000000e+00, double* %"'de55", align 8, !dbg !483
  %259 = fmul fast double %258, %_unwrap45, !dbg !483
  %260 = load double, double* %"'de57", align 8, !dbg !483
  %261 = fadd fast double %260, %259, !dbg !483
  store double %261, double* %"'de57", align 8, !dbg !483
  %262 = fmul fast double %258, %_unwrap45, !dbg !483
  %263 = load double, double* %"'de57", align 8, !dbg !483
  %264 = fadd fast double %263, %262, !dbg !483
  store double %264, double* %"'de57", align 8, !dbg !483
  %265 = load double, double* %"'de57", align 8, !dbg !481
  store double 0.000000e+00, double* %"'de57", align 8, !dbg !481
  %266 = fdiv fast double %265, %236, !dbg !481
  %267 = load double, double* %"'de12", align 8, !dbg !481
  %268 = fadd fast double %267, %266, !dbg !481
  store double %268, double* %"'de12", align 8, !dbg !481
  %269 = fdiv fast double %265, %236, !dbg !481
  %270 = fdiv fast double %_cache6.0, %236, !dbg !481
  %271 = fmul fast double %269, %270, !dbg !481
  %272 = fneg fast double %271, !dbg !481
  %273 = load double, double* %"'de33", align 8, !dbg !481
  %274 = fadd fast double %273, %272, !dbg !481
  store double %274, double* %"'de33", align 8, !dbg !481
  %275 = load double, double* %"'de56", align 8, !dbg !486
  store double 0.000000e+00, double* %"'de56", align 8, !dbg !486
  %276 = load double, double* %"'de58", align 8, !dbg !486
  %277 = fadd fast double %276, %275, !dbg !486
  store double %277, double* %"'de58", align 8, !dbg !486
  %278 = load double, double* %"'de59", align 8, !dbg !486
  %279 = fadd fast double %278, %275, !dbg !486
  store double %279, double* %"'de59", align 8, !dbg !486
  %280 = load double, double* %"'de59", align 8, !dbg !483
  store double 0.000000e+00, double* %"'de59", align 8, !dbg !483
  %281 = fmul fast double %280, %_unwrap49, !dbg !483
  %282 = load double, double* %"'de60", align 8, !dbg !483
  %283 = fadd fast double %282, %281, !dbg !483
  store double %283, double* %"'de60", align 8, !dbg !483
  %284 = fmul fast double %280, %_unwrap49, !dbg !483
  %285 = load double, double* %"'de60", align 8, !dbg !483
  %286 = fadd fast double %285, %284, !dbg !483
  store double %286, double* %"'de60", align 8, !dbg !483
  %287 = load double, double* %"'de60", align 8, !dbg !481
  store double 0.000000e+00, double* %"'de60", align 8, !dbg !481
  %288 = fdiv fast double %287, %236, !dbg !481
  %289 = load double, double* %"'de15", align 8, !dbg !481
  %290 = fadd fast double %289, %288, !dbg !481
  store double %290, double* %"'de15", align 8, !dbg !481
  %291 = fdiv fast double %287, %236, !dbg !481
  %292 = fdiv fast double %_cache3.0, %236, !dbg !481
  %293 = fmul fast double %291, %292, !dbg !481
  %294 = fneg fast double %293, !dbg !481
  %295 = load double, double* %"'de33", align 8, !dbg !481
  %296 = fadd fast double %295, %294, !dbg !481
  store double %296, double* %"'de33", align 8, !dbg !481
  %297 = load double, double* %"'de58", align 8, !dbg !483
  store double 0.000000e+00, double* %"'de58", align 8, !dbg !483
  %298 = fmul fast double %297, %_unwrap47, !dbg !483
  %299 = load double, double* %"'de61", align 8, !dbg !483
  %300 = fadd fast double %299, %298, !dbg !483
  store double %300, double* %"'de61", align 8, !dbg !483
  %301 = fmul fast double %297, %_unwrap47, !dbg !483
  %302 = load double, double* %"'de61", align 8, !dbg !483
  %303 = fadd fast double %302, %301, !dbg !483
  store double %303, double* %"'de61", align 8, !dbg !483
  %304 = load double, double* %"'de61", align 8, !dbg !481
  store double 0.000000e+00, double* %"'de61", align 8, !dbg !481
  %305 = fdiv fast double %304, %236, !dbg !481
  %306 = load double, double* %"'de16", align 8, !dbg !481
  %307 = fadd fast double %306, %305, !dbg !481
  store double %307, double* %"'de16", align 8, !dbg !481
  %308 = fdiv fast double %304, %236, !dbg !481
  %309 = fdiv fast double %_cache.0, %236, !dbg !481
  %310 = fmul fast double %308, %309, !dbg !481
  %311 = fneg fast double %310, !dbg !481
  %312 = load double, double* %"'de33", align 8, !dbg !481
  %313 = fadd fast double %312, %311, !dbg !481
  store double %313, double* %"'de33", align 8, !dbg !481
  br label %invertL204.i

invertL221.i:                                     ; preds = %invertjulia_kernel__3362_inner.exit
  %"'ipc62_unwrap" = bitcast i8 addrspace(1)* %".fca.0.extract25'ipev" to double addrspace(1)*, !dbg !497
  %314 = load atomic double, double addrspace(1)* %"'ipc62_unwrap" monotonic, align 8, !dbg !497
  %315 = load double, double* %"'de63", align 8, !dbg !497
  %316 = fadd fast double %315, %314, !dbg !497
  store double %316, double* %"'de63", align 8, !dbg !497
  %317 = load double, double* %"'de63", align 8, !dbg !496
  store double 0.000000e+00, double* %"'de63", align 8, !dbg !496
  %_unwrap65 = fmul double %_cache64.0, %_cache64.0, !dbg !496
  %318 = fmul fast double %317, %_unwrap65, !dbg !496
  %319 = load double, double* %"'de66", align 8, !dbg !496
  %320 = fadd fast double %319, %318, !dbg !496
  store double %320, double* %"'de66", align 8, !dbg !496
  %321 = fmul fast double %317, %_cache67.0, !dbg !496
  %322 = load double, double* %"'de68", align 8, !dbg !496
  %323 = fadd fast double %322, %321, !dbg !496
  store double %323, double* %"'de68", align 8, !dbg !496
  %324 = load double, double* %"'de68", align 8, !dbg !494
  store double 0.000000e+00, double* %"'de68", align 8, !dbg !494
  %325 = fmul fast double %324, %_cache64.0, !dbg !494
  %326 = load double, double* %"'de69", align 8, !dbg !494
  %327 = fadd fast double %326, %325, !dbg !494
  store double %327, double* %"'de69", align 8, !dbg !494
  %328 = fmul fast double %324, %_cache64.0, !dbg !494
  %329 = load double, double* %"'de69", align 8, !dbg !494
  %330 = fadd fast double %329, %328, !dbg !494
  store double %330, double* %"'de69", align 8, !dbg !494
  %331 = load double, double* %"'de69", align 8, !dbg !493
  store double 0.000000e+00, double* %"'de69", align 8, !dbg !493
  %332 = load double, double* %"value_phi.i'de", align 8, !dbg !493
  %333 = fadd fast double %332, %331, !dbg !493
  store double %333, double* %"value_phi.i'de", align 8, !dbg !493
  %334 = fneg fast double %331, !dbg !493
  %335 = load double, double* %".unpack54'de", align 8, !dbg !493
  %336 = fadd fast double %335, %334, !dbg !493
  store double %336, double* %".unpack54'de", align 8, !dbg !493
  %337 = load double, double* %"'de66", align 8, !dbg !490
  store double 0.000000e+00, double* %"'de66", align 8, !dbg !490
  %338 = fmul fast double %337, 5.000000e-01, !dbg !490
  %339 = load double, double* %".unpack'de", align 8, !dbg !490
  %340 = fadd fast double %339, %338, !dbg !490
  store double %340, double* %".unpack'de", align 8, !dbg !490
  %341 = load double, double* %"value_phi.i'de", align 8
  store double 0.000000e+00, double* %"value_phi.i'de", align 8
  %342 = icmp eq i8 0, %_cache72.1
  %343 = icmp eq i8 1, %_cache72.1
  %344 = icmp eq i8 2, %_cache72.1
  %345 = select fast i1 %344, double %341, double 0.000000e+00
  %346 = load double, double* %"'de44", align 8
  %347 = fadd fast double %346, %341
  %348 = select fast i1 %344, double %347, double %346
  store double %348, double* %"'de44", align 8
  %349 = select fast i1 %342, double %341, double 0.000000e+00
  %350 = load double, double* %"'de33", align 8
  %351 = fadd fast double %350, %341
  %352 = select fast i1 %342, double %351, double %350
  store double %352, double* %"'de33", align 8
  %353 = select fast i1 %343, double %341, double 0.000000e+00
  %354 = load double, double* %"'de", align 8
  %355 = fadd fast double %354, %341
  %356 = select fast i1 %343, double %355, double %354
  store double %356, double* %"'de", align 8
  switch i8 %_cache73.1, label %invertL206.i [
    i8 0, label %invertL208.i
    i8 1, label %invertL169.i
    i8 2, label %invertL176.i
  ]

invertjulia_kernel__3362_inner.exit:              ; preds = %julia_kernel__3362_inner.exit
  br i1 %.not, label %invertentry, label %invertL221.i
}

ERROR: LoadError: InvalidIRError: compiling MethodInstance for grad_kernel!(::CuDeviceVector{Float64, 1}, ::CuDeviceVector{Float64, 1}, ::CuDeviceVector{SVector{3, Float64}, 1}, ::CuDeviceVector{SVector{3, Float64}, 1}, ::CuDeviceVector{Int64, 1}, ::CuDeviceVector{Int64, 1}, ::CuDeviceVector{HarmonicBond, 1}, ::CuDeviceVector{HarmonicBond, 1}) resulted in invalid LLVM IR
Reason: unsupported call through a literal pointer (call to )
Stacktrace:
 [1] #isfinite
   @ ~/.julia/dev/CUDA/src/device/intrinsics/math.jl:190
 [2] macro expansion
   @ ~/.julia/packages/StaticArrays/cZ1ET/src/linalg.jl:278
 [3] _norm
   @ ~/.julia/packages/StaticArrays/cZ1ET/src/linalg.jl:266
 [4] norm
   @ ~/.julia/packages/StaticArrays/cZ1ET/src/linalg.jl:265
 [5] f
   @ ~/dms/molly_dev/enzyme_err28.jl:19
 [6] kernel!
   @ ~/dms/molly_dev/enzyme_err28.jl:33
 [7] kernel!
   @ ~/dms/molly_dev/enzyme_err28.jl:0
 [8] diffejulia_kernel__3362_inner5wrap
   @ ~/dms/molly_dev/enzyme_err28.jl:0
Reason: unsupported call through a literal pointer (call to )
Stacktrace:
 [1] multiple call sites
   @ unknown:0
Reason: unsupported call through a literal pointer (call to )
Stacktrace:
  [1] #isfinite
    @ ~/.julia/dev/CUDA/src/device/intrinsics/math.jl:190
  [2] macro expansion
    @ ~/.julia/packages/StaticArrays/cZ1ET/src/linalg.jl:278
  [3] _norm
    @ ~/.julia/packages/StaticArrays/cZ1ET/src/linalg.jl:266
  [4] norm
    @ ~/.julia/packages/StaticArrays/cZ1ET/src/linalg.jl:265
  [5] f
    @ ~/dms/molly_dev/enzyme_err28.jl:19
  [6] kernel!
    @ ~/dms/molly_dev/enzyme_err28.jl:33
  [7] kernel!
    @ ~/dms/molly_dev/enzyme_err28.jl:0
  [8] diffejulia_kernel__3362_inner5wrap
    @ ~/dms/molly_dev/enzyme_err28.jl:0
  [9] macro expansion
    @ ~/.julia/dev/Enzyme/src/compiler.jl:9872
 [10] enzyme_call
    @ ~/.julia/dev/Enzyme/src/compiler.jl:9550
 [11] CombinedAdjointThunk
    @ ~/.julia/dev/Enzyme/src/compiler.jl:9513
 [12] autodiff_deferred
    @ ~/.julia/dev/Enzyme/src/Enzyme.jl:372
 [13] autodiff_deferred
    @ ~/.julia/dev/Enzyme/src/Enzyme.jl:442
 [14] grad_kernel!
    @ ~/dms/molly_dev/enzyme_err28.jl:40
Hint: catch this exception as `err` and call `code_typed(err; interactive = true)` to introspect the erronous code with Cthulhu.jl
Stacktrace:
  [1] check_ir(job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, args::LLVM.Module)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/YO8Uj/src/validation.jl:149
  [2] macro expansion
    @ ~/.julia/packages/GPUCompiler/YO8Uj/src/driver.jl:415 [inlined]
  [3] macro expansion
    @ ~/.julia/packages/TimerOutputs/RsWnF/src/TimerOutput.jl:253 [inlined]
  [4] macro expansion
    @ ~/.julia/packages/GPUCompiler/YO8Uj/src/driver.jl:414 [inlined]
  [5] emit_llvm(job::GPUCompiler.CompilerJob; libraries::Bool, toplevel::Bool, optimize::Bool, cleanup::Bool, only_entry::Bool, validate::Bool)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/YO8Uj/src/utils.jl:89
  [6] emit_llvm
    @ ~/.julia/packages/GPUCompiler/YO8Uj/src/utils.jl:83 [inlined]
  [7] codegen(output::Symbol, job::GPUCompiler.CompilerJob; libraries::Bool, toplevel::Bool, optimize::Bool, cleanup::Bool, strip::Bool, validate::Bool, only_entry::Bool, parent_job::Nothing)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/YO8Uj/src/driver.jl:129
  [8] compile(target::Symbol, job::GPUCompiler.CompilerJob; libraries::Bool, toplevel::Bool, optimize::Bool, cleanup::Bool, strip::Bool, validate::Bool, only_entry::Bool)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/YO8Uj/src/driver.jl:106
  [9] compile
    @ ~/.julia/packages/GPUCompiler/YO8Uj/src/driver.jl:98 [inlined]
 [10] #1037
    @ ~/.julia/dev/CUDA/src/compiler/compilation.jl:104 [inlined]
 [11] JuliaContext(f::CUDA.var"#1037#1040"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}})
    @ GPUCompiler ~/.julia/packages/GPUCompiler/YO8Uj/src/driver.jl:47
 [12] compile(job::GPUCompiler.CompilerJob)
    @ CUDA ~/.julia/dev/CUDA/src/compiler/compilation.jl:103
 [13] actual_compilation(cache::Dict{Any, CuFunction}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, compiler::typeof(CUDA.compile), linker::typeof(CUDA.link))
    @ GPUCompiler ~/.julia/packages/GPUCompiler/YO8Uj/src/execution.jl:125
 [14] cached_compilation(cache::Dict{Any, CuFunction}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, compiler::Function, linker::Function)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/YO8Uj/src/execution.jl:103
 [15] macro expansion
    @ ~/.julia/dev/CUDA/src/compiler/execution.jl:318 [inlined]
 [16] macro expansion
    @ ./lock.jl:267 [inlined]
 [17] cufunction(f::typeof(grad_kernel!), tt::Type{Tuple{CuDeviceVector{Float64, 1}, CuDeviceVector{Float64, 1}, CuDeviceVector{SVector{3, Float64}, 1}, CuDeviceVector{SVector{3, Float64}, 1}, CuDeviceVector{Int64, 1}, CuDeviceVector{Int64, 1}, CuDeviceVector{HarmonicBond, 1}, CuDeviceVector{HarmonicBond, 1}}}; kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
    @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:313
 [18] cufunction(f::typeof(grad_kernel!), tt::Type{Tuple{CuDeviceVector{Float64, 1}, CuDeviceVector{Float64, 1}, CuDeviceVector{SVector{3, Float64}, 1}, CuDeviceVector{SVector{3, Float64}, 1}, CuDeviceVector{Int64, 1}, CuDeviceVector{Int64, 1}, CuDeviceVector{HarmonicBond, 1}, CuDeviceVector{HarmonicBond, 1}}})
    @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:310
 [19] macro expansion
    @ ~/.julia/dev/CUDA/src/compiler/execution.jl:104 [inlined]
 [20] top-level scope
    @ ~/.julia/dev/CUDA/src/utilities.jl:25
wsmoses commented 1 year ago

@vchuravy any ideas here?

vchuravy commented 1 year ago

We will need to look at @device_code_dump, but I am wondering if your libm function replacement is inserting calls to pointers.

Given that we are looking at isfinite

jgreener64 commented 1 year ago

The output of

@device_code dir="kernel_dump" @cuda threads=128 kernel!(pe_vec, coords, is, js, inters)

and

@device_code dir="grad_kernel_dump" @cuda threads=128 grad_kernel!(pe_vec, d_pe_vec, coords, d_coords, is, js, inters, d_inters)

are in device_code.zip.

vchuravy commented 1 year ago

So I immediatly see:

@0 = private unnamed_addr constant [7808 x i8] c"Enzyme compilation failed.\0ACurrent scope: \0A; Function Attrs: mustprogress willreturn\0Adefine void @preprocess_julia_kernel__5774_inner5({ i8 addrspace(1)*, i64, [1 x i64], i64 } %0, { i8 addrspace(1)*, i64, [1 x i64], i64 } %1, { i8 addrspace(1)*, i64, [1 x i64], i64 } %2, { i8 addrspace(1)*, i64, [1 x i64], i64 } %3, { i8 addrspace(1)*, i64, [1 x i64], i64 } %4) local_unnamed_addr #5 !dbg !227 {\0Aentry:\0A  %.fca.0.extract25 = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %0, 0, !dbg !228\0A  %.fca.2.0.extract11 = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %2, 2, 0, !dbg !228\0A  %5 = call {}*** @julia.get_pgcstack() #6\0A  %6 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #6, !dbg !229, !range !27\0A  %7 = zext i32 %6 to i64, !dbg !236\0A  %8 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #6, !dbg !238, !range !39\0A  %9 = zext i32 %8 to i64, !dbg !243\0A  %10 = mul nuw nsw i64 %7, %9, !dbg !245\0A  %11 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #6, !dbg !247, !range !53\0A  %12 = add nuw nsw i32 %11, 1, !dbg !252\0A  %13 = zext i32 %12 to i64, !dbg !253\0A  %14 = add nuw nsw i64 %10, %13, !dbg !255\0A  %.not = icmp sgt i64 %14, %.fca.2.0.extract11, !dbg !257\0A  br i1 %.not, label %julia_kernel__5774_inner.exit, label %L31.i, !dbg !259\0A\0AL31.i:                                            ; preds = %entry\0A  %.fca.0.extract = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %4, 0, !dbg !228\0A  %.fca.0.extract1 = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %3, 0, !dbg !228\0A  %.fca.0.extract9 = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %2, 0, !dbg !228\0A  %.fca.0.extract16 = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %1, 0, !dbg !228\0A  %15 = add nsw i64 %14, -1, !dbg !260\0A  %16 = shl nuw nsw i64 %15, 3, !dbg !266\0A  %17 = getelementptr i8, i8 addrspace(1)* %.fca.0.extract9, i64 %16, !dbg !268\0A  %18 = bitcast i8 addrspace(1)* %17 to i64 addrspace(1)*, !dbg !269\0A  %19 = call i64 @llvm.nvvm.ldg.global.i.i64.p1i64(i64 addrspace(1)* readonly %18, i32 noundef 8) #7, !dbg !269\0A  %20 = getelementptr i8, i8 addrspace(1)* %.fca.0.extract1, i64 %16, !dbg !268\0A  %21 = bitcast i8 addrspace(1)* %20 to i64 addrspace(1)*, !dbg !269\0A  %22 = call i64 @llvm.nvvm.ldg.global.i.i64.p1i64(i64 addrspace(1)* readonly %21, i32 noundef 8) #7, !dbg !269\0A  %23 = bitcast i8 addrspace(1)* %.fca.0.extract to [2 x double] addrspace(1)*, !dbg !274\0A  %.elt = getelementptr inbounds [2 x double], [2 x double] addrspace(1)* %23, i64 %15, i64 0, !dbg !274\0A  %.unpack = load double, double addrspace(1)* %.elt, align 8, !dbg !274, !tbaa !100\0A  %.elt53 = getelementptr inbounds [2 x double], [2 x double] addrspace(1)* %23, i64 %15, i64 1, !dbg !274\0A  %.unpack54 = load double, double addrspace(1)* %.elt53, align 8, !dbg !274, !tbaa !100\0A  %24 = add i64 %19, -1, !dbg !282\0A  %25 = bitcast i8 addrspace(1)* %.fca.0.extract16 to [1 x [3 x double]] addrspace(1)*, !dbg !274\0A  %.unpack55.elt = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %25, i64 %24, i64 0, i64 0, !dbg !274\0A  %.unpack55.unpack = load double, double addrspace(1)* %.unpack55.elt, align 8, !dbg !274, !tbaa !100\0A  %.unpack55.elt56 = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %25, i64 %24, i64 0, i64 1, !dbg !274\0A  %.unpack55.unpack57 = load double, double addrspace(1)* %.unpack55.elt56, align 8, !dbg !274, !tbaa !100\0A  %.unpack55.elt58 = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %25, i64 %24, i64 0, i64 2, !dbg !274\0A  %.unpack55.unpack59 = load double, double addrspace(1)* %.unpack55.elt58, align 8, !dbg !274, !tbaa !100\0A  %26 = add i64 %22, -1, !dbg !282\0A  %.unpack61.elt = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %25, i64 %26, i64 0, i64 0, !dbg !274\0A  %.unpack61.unpack = load double, double addrspace(1)* %.unpack61.elt, align 8, !dbg !274, !tbaa !100\0A  %.unpack61.elt62 = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %25, i64 %26, i64 0, i64 1, !dbg !274\0A  %.unpack61.unpack63 = load double, double addrspace(1)* %.unpack61.elt62, align 8, !dbg !274, !tbaa !100\0A  %.unpack61.elt64 = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %25, i64 %26, i64 0, i64 2, !dbg !274\0A  %.unpack61.unpack65 = load double, double addrspace(1)* %.unpack61.elt64, align 8, !dbg !274, !tbaa !100\0A  %27 = fsub double %.unpack55.unpack, %.unpack61.unpack, !dbg !284\0A  %28 = fsub double %.unpack55.unpack57, %.unpack61.unpack63, !dbg !284\0A  %29 = fsub double %.unpack55.unpack59, %.unpack61.unpack65, !dbg !284\0A  %30 = fmul double %27, %27, !dbg !290\0A  %31 = fmul double %28, %28, !dbg !290\0A  %32 = fadd double %30, %31, !dbg !297\0A  %33 = fmul double %29, %29, !dbg !290\0A  %34 = fadd double %32, %33, !dbg !297\0A  %35 = call double @__nv_sqrt(double %34) #6, !dbg !298\0A  %36 = fcmp ule double %35, 0.000000e+00, !dbg !299\0A  br i1 %36, label %L176.i, label %L169.i, !dbg !301\0A\0AL169.i:                                           ; preds = %L31.i\0A  %37 = call i32 @__nv_isfinited(double %35) #6, !dbg !302\0A  %38 = icmp eq i32 %37, 0, !dbg !303\0A  br i1 %38, label %L176.i, label %L221.i, !dbg !301\0A\0AL176.i:                                           ; preds = %L169.i, %L31.i\0A  %39 = call double @__nv_fabs(double %27) #6, !dbg !307\0A  %40 = call double @__nv_fabs(double %28) #6, !dbg !313\0A  %41 = call double @__nv_fmax(double %39, double %40) #6, !dbg !316\0A  %42 = call double @__nv_fabs(double %29) #6, !dbg !313\0A  %43 = call double @__nv_fmax(double %41, double %42) #6, !dbg !316\0A  %44 = call i32 @__nv_isfinited(double %43) #6, !dbg !317\0A  %.not68 = icmp eq i32 %44, 0, !dbg !319\0A  br i1 %.not68, label %L221.i, label %L204.i, !dbg !323\0A\0AL204.i:                                           ; preds = %L176.i\0A  %45 = fcmp une double %43, 0.000000e+00, !dbg !324\0A  br i1 %45, label %L208.i, label %L206.i, !dbg !327\0A\0AL206.i:                                           ; preds = %L204.i\0A  %46 = call double @__nv_fabs(double noundef 0.000000e+00) #6, !dbg !328\0A  br label %L221.i, !dbg !332\0A\0AL208.i:                                           ; preds = %L204.i\0A  %47 = fdiv double %27, %43, !dbg !334\0A  %48 = fmul double %47, %47, !dbg !336\0A  %49 = fdiv double %28, %43, !dbg !334\0A  %50 = fmul double %49, %49, !dbg !336\0A  %51 = fadd double %48, %50, !dbg !339\0A  %52 = fdiv double %29, %43, !dbg !334\0A  %53 = fmul double %52, %52, !dbg !336\0A  %54 = fadd double %53, %51, !dbg !339\0A  %55 = call double @__nv_sqrt(double %54) #6, !dbg !340\0A  %56 = fmul double %43, %55, !dbg !341\0A  br label %L221.i, !dbg !332\0A\0AL221.i:                                           ; preds = %L208.i, %L206.i, %L176.i, %L169.i\0A  %value_phi.i = phi double [ %35, %L169.i ], [ %46, %L206.i ], [ %56, %L208.i ], [ %43, %L176.i ]\0A  %57 = fmul double %.unpack, 5.000000e-01, !dbg !342\0A  %58 = fsub double %value_phi.i, %.unpack54, !dbg !345\0A  %59 = fmul double %58, %58, !dbg !346\0A  %60 = fmul double %57, %59, !dbg !348\0A  %61 = bitcast i8 addrspace(1)* %.fca.0.extract25 to double addrspace(1)*, !dbg !349\0A  %62 = atomicrmw fadd double addrspace(1)* %61, double %60 monotonic, align 8, !dbg !349\0A  br label %julia_kernel__5774_inner.exit, !dbg !356\0A\0Ajulia_kernel__5774_inner.exit:                    ; preds = %L221.i, %entry\0A  ret void, !dbg !228\0A}\0A\0ANo augmented forward pass found for __nv_isfinited\0A at context:   %37 = call i32 @__nv_isfinited(double %35) #6, !dbg !145\0A\0AStacktrace:\0A [1] #isfinite\0A   @ ~/.julia/dev/CUDA/src/device/intrinsics/math.jl:190\0A [2] macro expansion\0A   @ ~/.julia/packages/StaticArrays/cZ1ET/src/linalg.jl:278\0A [3] _norm\0A   @ ~/.julia/packages/StaticArrays/cZ1ET/src/linalg.jl:266\0A [4] norm\0A   @ ~/.julia/packages/StaticArrays/cZ1ET/src/linalg.jl:265\0A [5] f\0A   @ ./REPL[5]:3\0A [6] kernel!\0A   @ ./REPL[6]:11\0A [7] kernel!\0A   @ ./REPL[6]:0\0A\00", align 1
@1 = private unnamed_addr constant [156 x i8] c"Enzyme: The original primal code hits this error condition, thus differentiating it does not make sense\0AStacktrace:\0A [1] multiple call sites\0A   @ unknown:0\00", align 1

cc: @wsmoses

vchuravy commented 1 year ago

ANd the verifier complains about:

; │││││┌ @ /home/jgreener/.julia/dev/CUDA/src/device/intrinsics/math.jl:190 within `#isfinite`
        call void inttoptr (i64 140245794510672 to void (i8*)*)(i8* getelementptr inbounds ([7808 x i8], [7808 x i8]* @0, i64 0, i64 0)) #10, !dbg !189
        call void asm sideeffect "exit;", ""(), !dbg !189
        unreachable, !dbg !189
vchuravy commented 1 year ago

So the short answer is:

No augmented forward pass found for __nv_isfinited
at context:   %37 = call i32 @__nv_isfinited(double %35) #6, !dbg !145\0A\0AStacktrace:
 [1] #isfinite 
      @ ~/.julia/dev/CUDA/src/device/intrinsics/math.jl:190
 [2] macro expansion
      @ ~/.julia/packages/StaticArrays/cZ1ET/src/linalg.jl:278
[3] _norm\0A   @ ~/.julia/packages/StaticArrays/cZ1ET/src/linalg.jl:266#1122 

So three work items:

  1. Add support for isfinite
  2. improve error reporting
  3. Make julia_error callback aware of its target environment
wsmoses commented 1 year ago

I can add support isfinite quickly without issue, 2/3 seem a lot harder, do you have any ideas?

vchuravy commented 1 year ago

the julia_error callback could query the target (from the data-layout?) and if it's nvptx use GPUCompiler's report exception https://github.com/JuliaGPU/GPUCompiler.jl/blob/e5cd575dd44bec8d2914f3ce2cd1ae83dcd9ac91/src/irgen.jl#L220

jgreener64 commented 1 year ago

Can confirm that the MWE at the top of the issue is fixed on main (31012f8b4b5f6d5699ded0bb9e7c184c4991312b).

I ran into an issue with some similar GPU code, see https://github.com/EnzymeAD/Enzyme.jl/issues/1129.