Closed jgreener64 closed 6 months ago
Can you add a log with Enzyme.API.printall!(true) ?
With Enzyme.API.printall!(true)
:
after simplification :
; Function Attrs: mustprogress willreturn
define void @preprocess_julia_kernel__3362_inner5({ i8 addrspace(1)*, i64, [1 x i64], i64 } %0, { i8 addrspace(1)*, i64, [1 x i64], i64 } %1, { i8 addrspace(1)*, i64, [1 x i64], i64 } %2, { i8 addrspace(1)*, i64, [1 x i64], i64 } %3, { i8 addrspace(1)*, i64, [1 x i64], i64 } %4) local_unnamed_addr #5 !dbg !224 {
entry:
%.fca.0.extract25 = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %0, 0, !dbg !225
%.fca.2.0.extract11 = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %2, 2, 0, !dbg !225
%5 = call {}*** @julia.get_pgcstack() #6
%6 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #6, !dbg !226, !range !27
%7 = zext i32 %6 to i64, !dbg !233
%8 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #6, !dbg !235, !range !39
%9 = zext i32 %8 to i64, !dbg !240
%10 = mul nuw nsw i64 %7, %9, !dbg !242
%11 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #6, !dbg !244, !range !53
%12 = add nuw nsw i32 %11, 1, !dbg !249
%13 = zext i32 %12 to i64, !dbg !250
%14 = add nuw nsw i64 %10, %13, !dbg !252
%.not = icmp sgt i64 %14, %.fca.2.0.extract11, !dbg !254
br i1 %.not, label %julia_kernel__3362_inner.exit, label %L31.i, !dbg !256
L31.i: ; preds = %entry
%.fca.0.extract = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %4, 0, !dbg !225
%.fca.0.extract1 = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %3, 0, !dbg !225
%.fca.0.extract9 = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %2, 0, !dbg !225
%.fca.0.extract16 = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %1, 0, !dbg !225
%15 = add nsw i64 %14, -1, !dbg !257
%16 = shl nuw nsw i64 %15, 3, !dbg !263
%17 = getelementptr i8, i8 addrspace(1)* %.fca.0.extract9, i64 %16, !dbg !265
%18 = bitcast i8 addrspace(1)* %17 to i64 addrspace(1)*, !dbg !266
%19 = call i64 @llvm.nvvm.ldg.global.i.i64.p1i64(i64 addrspace(1)* readonly %18, i32 noundef 8) #7, !dbg !266
%20 = getelementptr i8, i8 addrspace(1)* %.fca.0.extract1, i64 %16, !dbg !265
%21 = bitcast i8 addrspace(1)* %20 to i64 addrspace(1)*, !dbg !266
%22 = call i64 @llvm.nvvm.ldg.global.i.i64.p1i64(i64 addrspace(1)* readonly %21, i32 noundef 8) #7, !dbg !266
%23 = bitcast i8 addrspace(1)* %.fca.0.extract to [2 x double] addrspace(1)*, !dbg !271
%.elt = getelementptr inbounds [2 x double], [2 x double] addrspace(1)* %23, i64 %15, i64 0, !dbg !271
%.unpack = load double, double addrspace(1)* %.elt, align 8, !dbg !271, !tbaa !98
%.elt53 = getelementptr inbounds [2 x double], [2 x double] addrspace(1)* %23, i64 %15, i64 1, !dbg !271
%.unpack54 = load double, double addrspace(1)* %.elt53, align 8, !dbg !271, !tbaa !98
%24 = add i64 %19, -1, !dbg !279
%25 = bitcast i8 addrspace(1)* %.fca.0.extract16 to [1 x [3 x double]] addrspace(1)*, !dbg !271
%.unpack55.elt = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %25, i64 %24, i64 0, i64 0, !dbg !271
%.unpack55.unpack = load double, double addrspace(1)* %.unpack55.elt, align 8, !dbg !271, !tbaa !98
%.unpack55.elt56 = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %25, i64 %24, i64 0, i64 1, !dbg !271
%.unpack55.unpack57 = load double, double addrspace(1)* %.unpack55.elt56, align 8, !dbg !271, !tbaa !98
%.unpack55.elt58 = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %25, i64 %24, i64 0, i64 2, !dbg !271
%.unpack55.unpack59 = load double, double addrspace(1)* %.unpack55.elt58, align 8, !dbg !271, !tbaa !98
%26 = add i64 %22, -1, !dbg !279
%.unpack61.elt = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %25, i64 %26, i64 0, i64 0, !dbg !271
%.unpack61.unpack = load double, double addrspace(1)* %.unpack61.elt, align 8, !dbg !271, !tbaa !98
%.unpack61.elt62 = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %25, i64 %26, i64 0, i64 1, !dbg !271
%.unpack61.unpack63 = load double, double addrspace(1)* %.unpack61.elt62, align 8, !dbg !271, !tbaa !98
%.unpack61.elt64 = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %25, i64 %26, i64 0, i64 2, !dbg !271
%.unpack61.unpack65 = load double, double addrspace(1)* %.unpack61.elt64, align 8, !dbg !271, !tbaa !98
%27 = fsub double %.unpack55.unpack, %.unpack61.unpack, !dbg !281
%28 = fsub double %.unpack55.unpack57, %.unpack61.unpack63, !dbg !281
%29 = fsub double %.unpack55.unpack59, %.unpack61.unpack65, !dbg !281
%30 = fmul double %27, %27, !dbg !287
%31 = fmul double %28, %28, !dbg !287
%32 = fadd double %30, %31, !dbg !294
%33 = fmul double %29, %29, !dbg !287
%34 = fadd double %32, %33, !dbg !294
%35 = call double @__nv_sqrt(double %34) #6, !dbg !295
%36 = fcmp ule double %35, 0.000000e+00, !dbg !296
br i1 %36, label %L176.i, label %L169.i, !dbg !298
L169.i: ; preds = %L31.i
%37 = call i32 @__nv_isfinited(double %35) #6, !dbg !299
%38 = icmp eq i32 %37, 0, !dbg !300
br i1 %38, label %L176.i, label %L221.i, !dbg !298
L176.i: ; preds = %L169.i, %L31.i
%39 = call double @__nv_fabs(double %27) #6, !dbg !304
%40 = call double @__nv_fabs(double %28) #6, !dbg !310
%41 = call double @__nv_fmax(double %39, double %40) #6, !dbg !313
%42 = call double @__nv_fabs(double %29) #6, !dbg !310
%43 = call double @__nv_fmax(double %41, double %42) #6, !dbg !313
%44 = call i32 @__nv_isfinited(double %43) #6, !dbg !314
%.not68 = icmp eq i32 %44, 0, !dbg !316
br i1 %.not68, label %L221.i, label %L204.i, !dbg !320
L204.i: ; preds = %L176.i
%45 = fcmp une double %43, 0.000000e+00, !dbg !321
br i1 %45, label %L208.i, label %L206.i, !dbg !324
L206.i: ; preds = %L204.i
%46 = call double @__nv_fabs(double noundef 0.000000e+00) #6, !dbg !325
br label %L221.i, !dbg !329
L208.i: ; preds = %L204.i
%47 = fdiv double %27, %43, !dbg !331
%48 = fmul double %47, %47, !dbg !333
%49 = fdiv double %28, %43, !dbg !331
%50 = fmul double %49, %49, !dbg !333
%51 = fadd double %48, %50, !dbg !336
%52 = fdiv double %29, %43, !dbg !331
%53 = fmul double %52, %52, !dbg !333
%54 = fadd double %53, %51, !dbg !336
%55 = call double @__nv_sqrt(double %54) #6, !dbg !337
%56 = fmul double %43, %55, !dbg !338
br label %L221.i, !dbg !329
L221.i: ; preds = %L208.i, %L206.i, %L176.i, %L169.i
%value_phi.i = phi double [ %35, %L169.i ], [ %46, %L206.i ], [ %56, %L208.i ], [ %43, %L176.i ]
%57 = fmul double %.unpack, 5.000000e-01, !dbg !339
%58 = fsub double %value_phi.i, %.unpack54, !dbg !342
%59 = fmul double %58, %58, !dbg !343
%60 = fmul double %57, %59, !dbg !345
%61 = bitcast i8 addrspace(1)* %.fca.0.extract25 to double addrspace(1)*, !dbg !346
%62 = atomicrmw fadd double addrspace(1)* %61, double %60 monotonic, align 8, !dbg !346
br label %julia_kernel__3362_inner.exit, !dbg !353
julia_kernel__3362_inner.exit: ; preds = %L221.i, %entry
ret void, !dbg !225
}
; Function Attrs: mustprogress willreturn
define internal void @diffejulia_kernel__3362_inner5({ i8 addrspace(1)*, i64, [1 x i64], i64 } %0, { i8 addrspace(1)*, i64, [1 x i64], i64 } %"'", { i8 addrspace(1)*, i64, [1 x i64], i64 } %1, { i8 addrspace(1)*, i64, [1 x i64], i64 } %"'1", { i8 addrspace(1)*, i64, [1 x i64], i64 } %2, { i8 addrspace(1)*, i64, [1 x i64], i64 } %3, { i8 addrspace(1)*, i64, [1 x i64], i64 } %4, { i8 addrspace(1)*, i64, [1 x i64], i64 } %"'2") local_unnamed_addr #5 !dbg !354 {
entry:
%"'de" = alloca double, align 8
%5 = getelementptr double, double* %"'de", i64 0
store double 0.000000e+00, double* %5, align 8
%"'de9" = alloca double, align 8
%6 = getelementptr double, double* %"'de9", i64 0
store double 0.000000e+00, double* %6, align 8
%"'de10" = alloca double, align 8
%7 = getelementptr double, double* %"'de10", i64 0
store double 0.000000e+00, double* %7, align 8
%"'de11" = alloca double, align 8
%8 = getelementptr double, double* %"'de11", i64 0
store double 0.000000e+00, double* %8, align 8
%"'de12" = alloca double, align 8
%9 = getelementptr double, double* %"'de12", i64 0
store double 0.000000e+00, double* %9, align 8
%"'de13" = alloca double, align 8
%10 = getelementptr double, double* %"'de13", i64 0
store double 0.000000e+00, double* %10, align 8
%"'de14" = alloca double, align 8
%11 = getelementptr double, double* %"'de14", i64 0
store double 0.000000e+00, double* %11, align 8
%"'de15" = alloca double, align 8
%12 = getelementptr double, double* %"'de15", i64 0
store double 0.000000e+00, double* %12, align 8
%"'de16" = alloca double, align 8
%13 = getelementptr double, double* %"'de16", i64 0
store double 0.000000e+00, double* %13, align 8
%".unpack55.unpack59'de" = alloca double, align 8
%14 = getelementptr double, double* %".unpack55.unpack59'de", i64 0
store double 0.000000e+00, double* %14, align 8
%".unpack61.unpack65'de" = alloca double, align 8
%15 = getelementptr double, double* %".unpack61.unpack65'de", i64 0
store double 0.000000e+00, double* %15, align 8
%".unpack55.unpack57'de" = alloca double, align 8
%16 = getelementptr double, double* %".unpack55.unpack57'de", i64 0
store double 0.000000e+00, double* %16, align 8
%".unpack61.unpack63'de" = alloca double, align 8
%17 = getelementptr double, double* %".unpack61.unpack63'de", i64 0
store double 0.000000e+00, double* %17, align 8
%".unpack55.unpack'de" = alloca double, align 8
%18 = getelementptr double, double* %".unpack55.unpack'de", i64 0
store double 0.000000e+00, double* %18, align 8
%".unpack61.unpack'de" = alloca double, align 8
%19 = getelementptr double, double* %".unpack61.unpack'de", i64 0
store double 0.000000e+00, double* %19, align 8
%".unpack54'de" = alloca double, align 8
%20 = getelementptr double, double* %".unpack54'de", i64 0
store double 0.000000e+00, double* %20, align 8
%".unpack'de" = alloca double, align 8
%21 = getelementptr double, double* %".unpack'de", i64 0
store double 0.000000e+00, double* %21, align 8
%"'de33" = alloca double, align 8
%22 = getelementptr double, double* %"'de33", i64 0
store double 0.000000e+00, double* %22, align 8
%"'de34" = alloca double, align 8
%23 = getelementptr double, double* %"'de34", i64 0
store double 0.000000e+00, double* %23, align 8
%"'de35" = alloca double, align 8
%24 = getelementptr double, double* %"'de35", i64 0
store double 0.000000e+00, double* %24, align 8
%"'de36" = alloca double, align 8
%25 = getelementptr double, double* %"'de36", i64 0
store double 0.000000e+00, double* %25, align 8
%"'de37" = alloca double, align 8
%26 = getelementptr double, double* %"'de37", i64 0
store double 0.000000e+00, double* %26, align 8
%"'de44" = alloca double, align 8
%27 = getelementptr double, double* %"'de44", i64 0
store double 0.000000e+00, double* %27, align 8
%"'de53" = alloca double, align 8
%28 = getelementptr double, double* %"'de53", i64 0
store double 0.000000e+00, double* %28, align 8
%"'de54" = alloca double, align 8
%29 = getelementptr double, double* %"'de54", i64 0
store double 0.000000e+00, double* %29, align 8
%"'de55" = alloca double, align 8
%30 = getelementptr double, double* %"'de55", i64 0
store double 0.000000e+00, double* %30, align 8
%"'de56" = alloca double, align 8
%31 = getelementptr double, double* %"'de56", i64 0
store double 0.000000e+00, double* %31, align 8
%"'de57" = alloca double, align 8
%32 = getelementptr double, double* %"'de57", i64 0
store double 0.000000e+00, double* %32, align 8
%"'de58" = alloca double, align 8
%33 = getelementptr double, double* %"'de58", i64 0
store double 0.000000e+00, double* %33, align 8
%"'de59" = alloca double, align 8
%34 = getelementptr double, double* %"'de59", i64 0
store double 0.000000e+00, double* %34, align 8
%"'de60" = alloca double, align 8
%35 = getelementptr double, double* %"'de60", i64 0
store double 0.000000e+00, double* %35, align 8
%"'de61" = alloca double, align 8
%36 = getelementptr double, double* %"'de61", i64 0
store double 0.000000e+00, double* %36, align 8
%"'de63" = alloca double, align 8
%37 = getelementptr double, double* %"'de63", i64 0
store double 0.000000e+00, double* %37, align 8
%"'de66" = alloca double, align 8
%38 = getelementptr double, double* %"'de66", i64 0
store double 0.000000e+00, double* %38, align 8
%"'de68" = alloca double, align 8
%39 = getelementptr double, double* %"'de68", i64 0
store double 0.000000e+00, double* %39, align 8
%"'de69" = alloca double, align 8
%40 = getelementptr double, double* %"'de69", i64 0
store double 0.000000e+00, double* %40, align 8
%"value_phi.i'de" = alloca double, align 8
%41 = getelementptr double, double* %"value_phi.i'de", i64 0
store double 0.000000e+00, double* %41, align 8
%.fca.0.extract25 = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %0, 0, !dbg !355
%".fca.0.extract25'ipev" = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %"'", 0, !dbg !355
%.fca.2.0.extract11 = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %2, 2, 0, !dbg !355
%42 = call {}*** @julia.get_pgcstack() #6
%43 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #6, !dbg !356, !range !27
%44 = zext i32 %43 to i64, !dbg !363
%45 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #6, !dbg !365, !range !39
%46 = zext i32 %45 to i64, !dbg !370
%47 = mul nuw nsw i64 %44, %46, !dbg !372
%48 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #6, !dbg !374, !range !53
%49 = add nuw nsw i32 %48, 1, !dbg !379
%50 = zext i32 %49 to i64, !dbg !380
%51 = add nuw nsw i64 %47, %50, !dbg !382
%.not = icmp sgt i64 %51, %.fca.2.0.extract11, !dbg !384
br i1 %.not, label %julia_kernel__3362_inner.exit, label %L31.i, !dbg !386
L31.i: ; preds = %entry
%.fca.0.extract = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %4, 0, !dbg !355
%".fca.0.extract'ipev" = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %"'2", 0, !dbg !355
%.fca.0.extract1 = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %3, 0, !dbg !355
%.fca.0.extract9 = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %2, 0, !dbg !355
%.fca.0.extract16 = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %1, 0, !dbg !355
%".fca.0.extract16'ipev" = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %"'1", 0, !dbg !387
%52 = add nsw i64 %51, -1, !dbg !387
%53 = shl nuw nsw i64 %52, 3, !dbg !393
%54 = getelementptr i8, i8 addrspace(1)* %.fca.0.extract9, i64 %53, !dbg !395
%55 = bitcast i8 addrspace(1)* %54 to i64 addrspace(1)*, !dbg !396
%56 = call i64 @llvm.nvvm.ldg.global.i.i64.p1i64(i64 addrspace(1)* readonly %55, i32 noundef 8) #7, !dbg !396, !alias.scope !401, !noalias !404
%57 = getelementptr i8, i8 addrspace(1)* %.fca.0.extract1, i64 %53, !dbg !395
%58 = bitcast i8 addrspace(1)* %57 to i64 addrspace(1)*, !dbg !396
%59 = call i64 @llvm.nvvm.ldg.global.i.i64.p1i64(i64 addrspace(1)* readonly %58, i32 noundef 8) #7, !dbg !396, !alias.scope !406, !noalias !409
%"'ipc25" = bitcast i8 addrspace(1)* %".fca.0.extract'ipev" to [2 x double] addrspace(1)*, !dbg !411
%60 = bitcast i8 addrspace(1)* %.fca.0.extract to [2 x double] addrspace(1)*, !dbg !411
%".elt'ipg" = getelementptr inbounds [2 x double], [2 x double] addrspace(1)* %"'ipc25", i64 %52, i64 0, !dbg !411
%.elt = getelementptr inbounds [2 x double], [2 x double] addrspace(1)* %60, i64 %52, i64 0, !dbg !411
%.unpack = load double, double addrspace(1)* %.elt, align 8, !dbg !411, !tbaa !98, !alias.scope !419, !noalias !422
%".elt53'ipg" = getelementptr inbounds [2 x double], [2 x double] addrspace(1)* %"'ipc25", i64 %52, i64 1, !dbg !411
%.elt53 = getelementptr inbounds [2 x double], [2 x double] addrspace(1)* %60, i64 %52, i64 1, !dbg !411
%.unpack54 = load double, double addrspace(1)* %.elt53, align 8, !dbg !411, !tbaa !98, !alias.scope !419, !noalias !422
%61 = add i64 %56, -1, !dbg !424
%"'ipc" = bitcast i8 addrspace(1)* %".fca.0.extract16'ipev" to [1 x [3 x double]] addrspace(1)*, !dbg !411
%62 = bitcast i8 addrspace(1)* %.fca.0.extract16 to [1 x [3 x double]] addrspace(1)*, !dbg !411
%".unpack55.elt'ipg" = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %"'ipc", i64 %61, i64 0, i64 0, !dbg !411
%.unpack55.elt = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %62, i64 %61, i64 0, i64 0, !dbg !411
%.unpack55.unpack = load double, double addrspace(1)* %.unpack55.elt, align 8, !dbg !411, !tbaa !98, !alias.scope !426, !noalias !429
%".unpack55.elt56'ipg" = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %"'ipc", i64 %61, i64 0, i64 1, !dbg !411
%.unpack55.elt56 = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %62, i64 %61, i64 0, i64 1, !dbg !411
%.unpack55.unpack57 = load double, double addrspace(1)* %.unpack55.elt56, align 8, !dbg !411, !tbaa !98, !alias.scope !426, !noalias !429
%".unpack55.elt58'ipg" = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %"'ipc", i64 %61, i64 0, i64 2, !dbg !411
%.unpack55.elt58 = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %62, i64 %61, i64 0, i64 2, !dbg !411
%.unpack55.unpack59 = load double, double addrspace(1)* %.unpack55.elt58, align 8, !dbg !411, !tbaa !98, !alias.scope !426, !noalias !429
%63 = add i64 %59, -1, !dbg !424
%".unpack61.elt'ipg" = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %"'ipc", i64 %63, i64 0, i64 0, !dbg !411
%.unpack61.elt = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %62, i64 %63, i64 0, i64 0, !dbg !411
%.unpack61.unpack = load double, double addrspace(1)* %.unpack61.elt, align 8, !dbg !411, !tbaa !98, !alias.scope !426, !noalias !429
%".unpack61.elt62'ipg" = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %"'ipc", i64 %63, i64 0, i64 1, !dbg !411
%.unpack61.elt62 = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %62, i64 %63, i64 0, i64 1, !dbg !411
%.unpack61.unpack63 = load double, double addrspace(1)* %.unpack61.elt62, align 8, !dbg !411, !tbaa !98, !alias.scope !426, !noalias !429
%".unpack61.elt64'ipg" = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %"'ipc", i64 %63, i64 0, i64 2, !dbg !411
%.unpack61.elt64 = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %62, i64 %63, i64 0, i64 2, !dbg !411
%.unpack61.unpack65 = load double, double addrspace(1)* %.unpack61.elt64, align 8, !dbg !411, !tbaa !98, !alias.scope !426, !noalias !429
%64 = fsub double %.unpack55.unpack, %.unpack61.unpack, !dbg !431
%65 = fsub double %.unpack55.unpack57, %.unpack61.unpack63, !dbg !431
%66 = fsub double %.unpack55.unpack59, %.unpack61.unpack65, !dbg !431
%67 = fmul double %64, %64, !dbg !437
%68 = fmul double %65, %65, !dbg !437
%69 = fadd double %67, %68, !dbg !444
%70 = fmul double %66, %66, !dbg !437
%71 = fadd double %69, %70, !dbg !444
%72 = call double @__nv_sqrt(double %71) #6, !dbg !445
%73 = fcmp ule double %72, 0.000000e+00, !dbg !446
br i1 %73, label %L176.i, label %L169.i, !dbg !448
L169.i: ; preds = %L31.i
call void inttoptr (i64 139880611672816 to void (i8*)*)(i8* getelementptr inbounds ([7867 x i8], [7867 x i8]* @0, i32 0, i32 0)) #8, !dbg !449
%_augmented = call { {} addrspace(10)*, i32 } @fakeaugmented___nv_isfinited(double %72), !dbg !449
%subcache = extractvalue { {} addrspace(10)*, i32 } %_augmented, 0, !dbg !449
%74 = extractvalue { {} addrspace(10)*, i32 } %_augmented, 1, !dbg !449
%75 = icmp eq i32 %74, 0, !dbg !450
br i1 %75, label %L176.i, label %L221.i, !dbg !448
L176.i: ; preds = %L169.i, %L31.i
%subcache_cache.0 = phi {} addrspace(10)* [ addrspacecast ({}* inttoptr (i64 139887560998920 to {}*) to {} addrspace(10)*), %L31.i ], [ %subcache, %L169.i ]
%76 = call double @__nv_fabs(double %64) #6, !dbg !454
%77 = call double @__nv_fabs(double %65) #6, !dbg !460
%78 = call double @__nv_fmax(double %76, double %77) #6, !dbg !463
%79 = call double @__nv_fabs(double %66) #6, !dbg !460
%80 = call double @__nv_fmax(double %78, double %79) #6, !dbg !463
%_augmented31 = call { {} addrspace(10)*, i32 } @fakeaugmented___nv_isfinited(double %80), !dbg !464
%subcache32 = extractvalue { {} addrspace(10)*, i32 } %_augmented31, 0, !dbg !464
%81 = extractvalue { {} addrspace(10)*, i32 } %_augmented31, 1, !dbg !464
%.not68 = icmp eq i32 %81, 0, !dbg !466
br i1 %.not68, label %L221.i, label %L204.i, !dbg !470
L204.i: ; preds = %L176.i
%82 = fcmp une double %80, 0.000000e+00, !dbg !471
br i1 %82, label %L208.i, label %L206.i, !dbg !474
L206.i: ; preds = %L204.i
%83 = call double @__nv_fabs(double noundef 0.000000e+00) #6, !dbg !475
br label %L221.i, !dbg !479
L208.i: ; preds = %L204.i
%84 = fdiv double %64, %80, !dbg !481
%85 = fmul double %84, %84, !dbg !483
%86 = fdiv double %65, %80, !dbg !481
%87 = fmul double %86, %86, !dbg !483
%88 = fadd double %85, %87, !dbg !486
%89 = fdiv double %66, %80, !dbg !481
%90 = fmul double %89, %89, !dbg !483
%91 = fadd double %90, %88, !dbg !486
%92 = call double @__nv_sqrt(double %91) #6, !dbg !487
%93 = fmul double %80, %92, !dbg !488
br label %L221.i, !dbg !479
L221.i: ; preds = %L208.i, %L206.i, %L176.i, %L169.i
%_cache73.0 = phi i8 [ 2, %L176.i ], [ 0, %L208.i ], [ 3, %L206.i ], [ 1, %L169.i ], !dbg !489
%_cache72.0 = phi i8 [ 0, %L176.i ], [ 2, %L208.i ], [ 3, %L206.i ], [ 1, %L169.i ], !dbg !489
%subcache32_cache.0 = phi {} addrspace(10)* [ %subcache32, %L176.i ], [ %subcache32, %L208.i ], [ %subcache32, %L206.i ], [ addrspacecast ({}* inttoptr (i64 139887560998920 to {}*) to {} addrspace(10)*), %L169.i ]
%subcache_cache.1 = phi {} addrspace(10)* [ %subcache_cache.0, %L176.i ], [ %subcache_cache.0, %L208.i ], [ %subcache_cache.0, %L206.i ], [ %subcache, %L169.i ]
%value_phi.i = phi double [ %72, %L169.i ], [ %83, %L206.i ], [ %93, %L208.i ], [ %80, %L176.i ]
%94 = fmul double %.unpack, 5.000000e-01, !dbg !490
%95 = fsub double %value_phi.i, %.unpack54, !dbg !493
%96 = fmul double %95, %95, !dbg !494
%97 = fmul double %94, %96, !dbg !496
%"'ipc62" = bitcast i8 addrspace(1)* %".fca.0.extract25'ipev" to double addrspace(1)*, !dbg !497
%98 = bitcast i8 addrspace(1)* %.fca.0.extract25 to double addrspace(1)*, !dbg !497
%99 = atomicrmw fadd double addrspace(1)* %98, double %97 monotonic, align 8, !dbg !497
br label %julia_kernel__3362_inner.exit, !dbg !504
julia_kernel__3362_inner.exit: ; preds = %L221.i, %entry
%_cache73.1 = phi i8 [ undef, %entry ], [ %_cache73.0, %L221.i ]
%_cache72.1 = phi i8 [ undef, %entry ], [ %_cache72.0, %L221.i ]
%_cache67.0 = phi double [ undef, %entry ], [ %94, %L221.i ]
%_cache64.0 = phi double [ undef, %entry ], [ %95, %L221.i ]
%subcache32_cache.1 = phi {} addrspace(10)* [ addrspacecast ({}* inttoptr (i64 139887560998920 to {}*) to {} addrspace(10)*), %entry ], [ %subcache32_cache.0, %L221.i ]
%subcache_cache.2 = phi {} addrspace(10)* [ addrspacecast ({}* inttoptr (i64 139887560998920 to {}*) to {} addrspace(10)*), %entry ], [ %subcache_cache.1, %L221.i ]
%_cache6.0 = phi double [ undef, %entry ], [ %66, %L221.i ]
%_cache3.0 = phi double [ undef, %entry ], [ %65, %L221.i ]
%_cache.0 = phi double [ undef, %entry ], [ %64, %L221.i ]
br label %invertjulia_kernel__3362_inner.exit, !dbg !355
invertentry: ; preds = %invertjulia_kernel__3362_inner.exit, %invertL31.i
ret void
invertL31.i: ; preds = %invertL176.i, %invertL169.i
%100 = load double, double* %"'de", align 8, !dbg !445
store double 0.000000e+00, double* %"'de", align 8, !dbg !445
%_unwrap = fmul double %_cache.0, %_cache.0, !dbg !445
%_unwrap4 = fmul double %_cache3.0, %_cache3.0, !dbg !445
%_unwrap5 = fadd double %_unwrap, %_unwrap4, !dbg !445
%_unwrap7 = fmul double %_cache6.0, %_cache6.0, !dbg !445
%_unwrap8 = fadd double %_unwrap5, %_unwrap7, !dbg !445
%101 = fcmp fast ueq double %_unwrap8, 0.000000e+00, !dbg !445
%102 = call fast double @__nv_sqrt(double %_unwrap8) #9, !dbg !445
%103 = fmul fast double 2.000000e+00, %102, !dbg !445
%104 = fdiv fast double %100, %103, !dbg !445
%105 = select fast i1 %101, double 0.000000e+00, double %104, !dbg !445
%106 = load double, double* %"'de9", align 8, !dbg !445
%107 = fadd fast double %106, %104, !dbg !445
%108 = select fast i1 %101, double %106, double %107, !dbg !445
store double %108, double* %"'de9", align 8, !dbg !445
%109 = load double, double* %"'de9", align 8, !dbg !444
store double 0.000000e+00, double* %"'de9", align 8, !dbg !444
%110 = load double, double* %"'de10", align 8, !dbg !444
%111 = fadd fast double %110, %109, !dbg !444
store double %111, double* %"'de10", align 8, !dbg !444
%112 = load double, double* %"'de11", align 8, !dbg !444
%113 = fadd fast double %112, %109, !dbg !444
store double %113, double* %"'de11", align 8, !dbg !444
%114 = load double, double* %"'de11", align 8, !dbg !437
store double 0.000000e+00, double* %"'de11", align 8, !dbg !437
%115 = fmul fast double %114, %_cache6.0, !dbg !437
%116 = load double, double* %"'de12", align 8, !dbg !437
%117 = fadd fast double %116, %115, !dbg !437
store double %117, double* %"'de12", align 8, !dbg !437
%118 = fmul fast double %114, %_cache6.0, !dbg !437
%119 = load double, double* %"'de12", align 8, !dbg !437
%120 = fadd fast double %119, %118, !dbg !437
store double %120, double* %"'de12", align 8, !dbg !437
%121 = load double, double* %"'de10", align 8, !dbg !444
store double 0.000000e+00, double* %"'de10", align 8, !dbg !444
%122 = load double, double* %"'de13", align 8, !dbg !444
%123 = fadd fast double %122, %121, !dbg !444
store double %123, double* %"'de13", align 8, !dbg !444
%124 = load double, double* %"'de14", align 8, !dbg !444
%125 = fadd fast double %124, %121, !dbg !444
store double %125, double* %"'de14", align 8, !dbg !444
%126 = load double, double* %"'de14", align 8, !dbg !437
store double 0.000000e+00, double* %"'de14", align 8, !dbg !437
%127 = fmul fast double %126, %_cache3.0, !dbg !437
%128 = load double, double* %"'de15", align 8, !dbg !437
%129 = fadd fast double %128, %127, !dbg !437
store double %129, double* %"'de15", align 8, !dbg !437
%130 = fmul fast double %126, %_cache3.0, !dbg !437
%131 = load double, double* %"'de15", align 8, !dbg !437
%132 = fadd fast double %131, %130, !dbg !437
store double %132, double* %"'de15", align 8, !dbg !437
%133 = load double, double* %"'de13", align 8, !dbg !437
store double 0.000000e+00, double* %"'de13", align 8, !dbg !437
%134 = fmul fast double %133, %_cache.0, !dbg !437
%135 = load double, double* %"'de16", align 8, !dbg !437
%136 = fadd fast double %135, %134, !dbg !437
store double %136, double* %"'de16", align 8, !dbg !437
%137 = fmul fast double %133, %_cache.0, !dbg !437
%138 = load double, double* %"'de16", align 8, !dbg !437
%139 = fadd fast double %138, %137, !dbg !437
store double %139, double* %"'de16", align 8, !dbg !437
%140 = load double, double* %"'de12", align 8, !dbg !431
store double 0.000000e+00, double* %"'de12", align 8, !dbg !431
%141 = load double, double* %".unpack55.unpack59'de", align 8, !dbg !431
%142 = fadd fast double %141, %140, !dbg !431
store double %142, double* %".unpack55.unpack59'de", align 8, !dbg !431
%143 = fneg fast double %140, !dbg !431
%144 = load double, double* %".unpack61.unpack65'de", align 8, !dbg !431
%145 = fadd fast double %144, %143, !dbg !431
store double %145, double* %".unpack61.unpack65'de", align 8, !dbg !431
%146 = load double, double* %"'de15", align 8, !dbg !431
store double 0.000000e+00, double* %"'de15", align 8, !dbg !431
%147 = load double, double* %".unpack55.unpack57'de", align 8, !dbg !431
%148 = fadd fast double %147, %146, !dbg !431
store double %148, double* %".unpack55.unpack57'de", align 8, !dbg !431
%149 = fneg fast double %146, !dbg !431
%150 = load double, double* %".unpack61.unpack63'de", align 8, !dbg !431
%151 = fadd fast double %150, %149, !dbg !431
store double %151, double* %".unpack61.unpack63'de", align 8, !dbg !431
%152 = load double, double* %"'de16", align 8, !dbg !431
store double 0.000000e+00, double* %"'de16", align 8, !dbg !431
%153 = load double, double* %".unpack55.unpack'de", align 8, !dbg !431
%154 = fadd fast double %153, %152, !dbg !431
store double %154, double* %".unpack55.unpack'de", align 8, !dbg !431
%155 = fneg fast double %152, !dbg !431
%156 = load double, double* %".unpack61.unpack'de", align 8, !dbg !431
%157 = fadd fast double %156, %155, !dbg !431
store double %157, double* %".unpack61.unpack'de", align 8, !dbg !431
%158 = load double, double* %".unpack61.unpack65'de", align 8, !dbg !411
store double 0.000000e+00, double* %".unpack61.unpack65'de", align 8, !dbg !411
%".fca.0.extract16'ipev_unwrap" = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %"'1", 0, !dbg !411
%"'ipc_unwrap" = bitcast i8 addrspace(1)* %".fca.0.extract16'ipev_unwrap" to [1 x [3 x double]] addrspace(1)*, !dbg !411
%.fca.0.extract1_unwrap = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %3, 0, !dbg !411
%_unwrap17 = add nsw i64 %51, -1, !dbg !411
%_unwrap18 = shl nuw nsw i64 %_unwrap17, 3, !dbg !411
%_unwrap19 = getelementptr i8, i8 addrspace(1)* %.fca.0.extract1_unwrap, i64 %_unwrap18, !dbg !411
%_unwrap20 = bitcast i8 addrspace(1)* %_unwrap19 to i64 addrspace(1)*, !dbg !411
%159 = call i64 @llvm.nvvm.ldg.global.i.i64.p1i64(i64 addrspace(1)* readonly %_unwrap20, i32 noundef 8) #7, !dbg !396
%_unwrap21 = add i64 %159, -1, !dbg !411
%".unpack61.elt64'ipg_unwrap" = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %"'ipc_unwrap", i64 %_unwrap21, i64 0, i64 2, !dbg !411
%160 = atomicrmw fadd double addrspace(1)* %".unpack61.elt64'ipg_unwrap", double %158 monotonic, align 8, !dbg !411
%161 = load double, double* %".unpack61.unpack63'de", align 8, !dbg !411
store double 0.000000e+00, double* %".unpack61.unpack63'de", align 8, !dbg !411
%".unpack61.elt62'ipg_unwrap" = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %"'ipc_unwrap", i64 %_unwrap21, i64 0, i64 1, !dbg !411
%162 = atomicrmw fadd double addrspace(1)* %".unpack61.elt62'ipg_unwrap", double %161 monotonic, align 8, !dbg !411
%163 = load double, double* %".unpack61.unpack'de", align 8, !dbg !411
store double 0.000000e+00, double* %".unpack61.unpack'de", align 8, !dbg !411
%".unpack61.elt'ipg_unwrap" = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %"'ipc_unwrap", i64 %_unwrap21, i64 0, i64 0, !dbg !411
%164 = atomicrmw fadd double addrspace(1)* %".unpack61.elt'ipg_unwrap", double %163 monotonic, align 8, !dbg !411
%165 = load double, double* %".unpack55.unpack59'de", align 8, !dbg !411
store double 0.000000e+00, double* %".unpack55.unpack59'de", align 8, !dbg !411
%.fca.0.extract9_unwrap = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %2, 0, !dbg !411
%_unwrap22 = getelementptr i8, i8 addrspace(1)* %.fca.0.extract9_unwrap, i64 %_unwrap18, !dbg !411
%_unwrap23 = bitcast i8 addrspace(1)* %_unwrap22 to i64 addrspace(1)*, !dbg !411
%166 = call i64 @llvm.nvvm.ldg.global.i.i64.p1i64(i64 addrspace(1)* readonly %_unwrap23, i32 noundef 8) #7, !dbg !396
%_unwrap24 = add i64 %166, -1, !dbg !411
%".unpack55.elt58'ipg_unwrap" = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %"'ipc_unwrap", i64 %_unwrap24, i64 0, i64 2, !dbg !411
%167 = atomicrmw fadd double addrspace(1)* %".unpack55.elt58'ipg_unwrap", double %165 monotonic, align 8, !dbg !411
%168 = load double, double* %".unpack55.unpack57'de", align 8, !dbg !411
store double 0.000000e+00, double* %".unpack55.unpack57'de", align 8, !dbg !411
%".unpack55.elt56'ipg_unwrap" = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %"'ipc_unwrap", i64 %_unwrap24, i64 0, i64 1, !dbg !411
%169 = atomicrmw fadd double addrspace(1)* %".unpack55.elt56'ipg_unwrap", double %168 monotonic, align 8, !dbg !411
%170 = load double, double* %".unpack55.unpack'de", align 8, !dbg !411
store double 0.000000e+00, double* %".unpack55.unpack'de", align 8, !dbg !411
%".unpack55.elt'ipg_unwrap" = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %"'ipc_unwrap", i64 %_unwrap24, i64 0, i64 0, !dbg !411
%171 = atomicrmw fadd double addrspace(1)* %".unpack55.elt'ipg_unwrap", double %170 monotonic, align 8, !dbg !411
%172 = load double, double* %".unpack54'de", align 8, !dbg !411
store double 0.000000e+00, double* %".unpack54'de", align 8, !dbg !411
%".fca.0.extract'ipev_unwrap" = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %"'2", 0, !dbg !411
%"'ipc25_unwrap" = bitcast i8 addrspace(1)* %".fca.0.extract'ipev_unwrap" to [2 x double] addrspace(1)*, !dbg !411
%".elt53'ipg_unwrap" = getelementptr inbounds [2 x double], [2 x double] addrspace(1)* %"'ipc25_unwrap", i64 %_unwrap17, i64 1, !dbg !411
%173 = atomicrmw fadd double addrspace(1)* %".elt53'ipg_unwrap", double %172 monotonic, align 8, !dbg !411
%174 = load double, double* %".unpack'de", align 8, !dbg !411
store double 0.000000e+00, double* %".unpack'de", align 8, !dbg !411
%".elt'ipg_unwrap" = getelementptr inbounds [2 x double], [2 x double] addrspace(1)* %"'ipc25_unwrap", i64 %_unwrap17, i64 0, !dbg !411
%175 = atomicrmw fadd double addrspace(1)* %".elt'ipg_unwrap", double %174 monotonic, align 8, !dbg !411
br label %invertentry
invertL169.i: ; preds = %invertL221.i, %invertL176.i
%_unwrap26 = fmul double %_cache.0, %_cache.0, !dbg !449
%_unwrap27 = fmul double %_cache3.0, %_cache3.0, !dbg !449
%_unwrap28 = fadd double %_unwrap26, %_unwrap27, !dbg !449
%_unwrap29 = fmul double %_cache6.0, %_cache6.0, !dbg !449
%_unwrap30 = fadd double %_unwrap28, %_unwrap29, !dbg !449
%176 = call double @__nv_sqrt(double %_unwrap30) #6, !dbg !445
call void inttoptr (i64 139880611672816 to void (i8*)*)(i8* getelementptr inbounds ([7857 x i8], [7857 x i8]* @1, i32 0, i32 0)) #8, !dbg !449
%177 = call { double } @diffe__nv_isfinited(double %176, {} addrspace(10)* %subcache_cache.2), !dbg !449
%178 = extractvalue { double } %177, 0, !dbg !449
%179 = load double, double* %"'de", align 8, !dbg !449
%180 = fadd fast double %179, %178, !dbg !449
store double %180, double* %"'de", align 8, !dbg !449
br label %invertL31.i
invertL176.i: ; preds = %invertL221.i, %invertL204.i
%181 = call double @__nv_fabs(double %_cache.0) #6, !dbg !454
%182 = call double @__nv_fabs(double %_cache3.0) #6, !dbg !460
%183 = call double @__nv_fmax(double %181, double %182) #6, !dbg !463
%184 = call double @__nv_fabs(double %_cache6.0) #6, !dbg !460
%185 = call double @__nv_fmax(double %183, double %184) #6, !dbg !463
%186 = call { double } @diffe__nv_isfinited(double %185, {} addrspace(10)* %subcache32_cache.1), !dbg !464
%187 = extractvalue { double } %186, 0, !dbg !464
%188 = load double, double* %"'de33", align 8, !dbg !464
%189 = fadd fast double %188, %187, !dbg !464
store double %189, double* %"'de33", align 8, !dbg !464
%190 = load double, double* %"'de33", align 8, !dbg !463
store double 0.000000e+00, double* %"'de33", align 8, !dbg !463
%191 = fcmp fast olt double %183, %184, !dbg !463
%192 = select fast i1 %191, double 0.000000e+00, double %190, !dbg !463
%193 = load double, double* %"'de34", align 8, !dbg !463
%194 = fadd fast double %193, %190, !dbg !463
%195 = select fast i1 %191, double %193, double %194, !dbg !463
store double %195, double* %"'de34", align 8, !dbg !463
%196 = fcmp fast olt double %183, %184, !dbg !463
%197 = select fast i1 %196, double %190, double 0.000000e+00, !dbg !463
%198 = load double, double* %"'de35", align 8, !dbg !463
%199 = fadd fast double %198, %190, !dbg !463
%200 = select fast i1 %196, double %199, double %198, !dbg !463
store double %200, double* %"'de35", align 8, !dbg !463
%201 = load double, double* %"'de35", align 8, !dbg !460
store double 0.000000e+00, double* %"'de35", align 8, !dbg !460
%202 = fcmp fast olt double %_cache6.0, 0.000000e+00, !dbg !460
%203 = select fast i1 %202, double -1.000000e+00, double 1.000000e+00, !dbg !460
%204 = fmul fast double %201, %203, !dbg !460
%205 = load double, double* %"'de12", align 8, !dbg !460
%206 = fadd fast double %205, %204, !dbg !460
store double %206, double* %"'de12", align 8, !dbg !460
%207 = load double, double* %"'de34", align 8, !dbg !463
store double 0.000000e+00, double* %"'de34", align 8, !dbg !463
%208 = fcmp fast olt double %181, %182, !dbg !463
%209 = select fast i1 %208, double 0.000000e+00, double %207, !dbg !463
%210 = load double, double* %"'de36", align 8, !dbg !463
%211 = fadd fast double %210, %207, !dbg !463
%212 = select fast i1 %208, double %210, double %211, !dbg !463
store double %212, double* %"'de36", align 8, !dbg !463
%213 = fcmp fast olt double %181, %182, !dbg !463
%214 = select fast i1 %213, double %207, double 0.000000e+00, !dbg !463
%215 = load double, double* %"'de37", align 8, !dbg !463
%216 = fadd fast double %215, %207, !dbg !463
%217 = select fast i1 %213, double %216, double %215, !dbg !463
store double %217, double* %"'de37", align 8, !dbg !463
%218 = load double, double* %"'de37", align 8, !dbg !460
store double 0.000000e+00, double* %"'de37", align 8, !dbg !460
%219 = fcmp fast olt double %_cache3.0, 0.000000e+00, !dbg !460
%220 = select fast i1 %219, double -1.000000e+00, double 1.000000e+00, !dbg !460
%221 = fmul fast double %218, %220, !dbg !460
%222 = load double, double* %"'de15", align 8, !dbg !460
%223 = fadd fast double %222, %221, !dbg !460
store double %223, double* %"'de15", align 8, !dbg !460
%224 = load double, double* %"'de36", align 8, !dbg !454
store double 0.000000e+00, double* %"'de36", align 8, !dbg !454
%225 = fcmp fast olt double %_cache.0, 0.000000e+00, !dbg !454
%226 = select fast i1 %225, double -1.000000e+00, double 1.000000e+00, !dbg !454
%227 = fmul fast double %224, %226, !dbg !454
%228 = load double, double* %"'de16", align 8, !dbg !454
%229 = fadd fast double %228, %227, !dbg !454
store double %229, double* %"'de16", align 8, !dbg !454
%_unwrap38 = fmul double %_cache.0, %_cache.0
%_unwrap39 = fmul double %_cache3.0, %_cache3.0
%_unwrap40 = fadd double %_unwrap38, %_unwrap39
%_unwrap41 = fmul double %_cache6.0, %_cache6.0
%_unwrap42 = fadd double %_unwrap40, %_unwrap41
%230 = call double @__nv_sqrt(double %_unwrap42) #6, !dbg !445
%_unwrap43 = fcmp ule double %230, 0.000000e+00
br i1 %_unwrap43, label %invertL31.i, label %invertL169.i
invertL204.i: ; preds = %invertL208.i, %invertL206.i
br label %invertL176.i
invertL206.i: ; preds = %invertL221.i
br label %invertL204.i
invertL208.i: ; preds = %invertL221.i
%231 = load double, double* %"'de44", align 8, !dbg !488
store double 0.000000e+00, double* %"'de44", align 8, !dbg !488
%232 = call double @__nv_fabs(double %_cache.0) #6, !dbg !454
%233 = call double @__nv_fabs(double %_cache3.0) #6, !dbg !460
%234 = call double @__nv_fmax(double %232, double %233) #6, !dbg !463
%235 = call double @__nv_fabs(double %_cache6.0) #6, !dbg !460
%236 = call double @__nv_fmax(double %234, double %235) #6, !dbg !463
%_unwrap45 = fdiv double %_cache6.0, %236, !dbg !488
%_unwrap46 = fmul double %_unwrap45, %_unwrap45, !dbg !488
%_unwrap47 = fdiv double %_cache.0, %236, !dbg !488
%_unwrap48 = fmul double %_unwrap47, %_unwrap47, !dbg !488
%_unwrap49 = fdiv double %_cache3.0, %236, !dbg !488
%_unwrap50 = fmul double %_unwrap49, %_unwrap49, !dbg !488
%_unwrap51 = fadd double %_unwrap48, %_unwrap50, !dbg !488
%_unwrap52 = fadd double %_unwrap46, %_unwrap51, !dbg !488
%237 = call double @__nv_sqrt(double %_unwrap52) #6, !dbg !487
%238 = fmul fast double %231, %237, !dbg !488
%239 = load double, double* %"'de33", align 8, !dbg !488
%240 = fadd fast double %239, %238, !dbg !488
store double %240, double* %"'de33", align 8, !dbg !488
%241 = fmul fast double %231, %236, !dbg !488
%242 = load double, double* %"'de53", align 8, !dbg !488
%243 = fadd fast double %242, %241, !dbg !488
store double %243, double* %"'de53", align 8, !dbg !488
%244 = load double, double* %"'de53", align 8, !dbg !487
store double 0.000000e+00, double* %"'de53", align 8, !dbg !487
%245 = fcmp fast ueq double %_unwrap52, 0.000000e+00, !dbg !487
%246 = call fast double @__nv_sqrt(double %_unwrap52) #9, !dbg !487
%247 = fmul fast double 2.000000e+00, %246, !dbg !487
%248 = fdiv fast double %244, %247, !dbg !487
%249 = select fast i1 %245, double 0.000000e+00, double %248, !dbg !487
%250 = load double, double* %"'de54", align 8, !dbg !487
%251 = fadd fast double %250, %248, !dbg !487
%252 = select fast i1 %245, double %250, double %251, !dbg !487
store double %252, double* %"'de54", align 8, !dbg !487
%253 = load double, double* %"'de54", align 8, !dbg !486
store double 0.000000e+00, double* %"'de54", align 8, !dbg !486
%254 = load double, double* %"'de55", align 8, !dbg !486
%255 = fadd fast double %254, %253, !dbg !486
store double %255, double* %"'de55", align 8, !dbg !486
%256 = load double, double* %"'de56", align 8, !dbg !486
%257 = fadd fast double %256, %253, !dbg !486
store double %257, double* %"'de56", align 8, !dbg !486
%258 = load double, double* %"'de55", align 8, !dbg !483
store double 0.000000e+00, double* %"'de55", align 8, !dbg !483
%259 = fmul fast double %258, %_unwrap45, !dbg !483
%260 = load double, double* %"'de57", align 8, !dbg !483
%261 = fadd fast double %260, %259, !dbg !483
store double %261, double* %"'de57", align 8, !dbg !483
%262 = fmul fast double %258, %_unwrap45, !dbg !483
%263 = load double, double* %"'de57", align 8, !dbg !483
%264 = fadd fast double %263, %262, !dbg !483
store double %264, double* %"'de57", align 8, !dbg !483
%265 = load double, double* %"'de57", align 8, !dbg !481
store double 0.000000e+00, double* %"'de57", align 8, !dbg !481
%266 = fdiv fast double %265, %236, !dbg !481
%267 = load double, double* %"'de12", align 8, !dbg !481
%268 = fadd fast double %267, %266, !dbg !481
store double %268, double* %"'de12", align 8, !dbg !481
%269 = fdiv fast double %265, %236, !dbg !481
%270 = fdiv fast double %_cache6.0, %236, !dbg !481
%271 = fmul fast double %269, %270, !dbg !481
%272 = fneg fast double %271, !dbg !481
%273 = load double, double* %"'de33", align 8, !dbg !481
%274 = fadd fast double %273, %272, !dbg !481
store double %274, double* %"'de33", align 8, !dbg !481
%275 = load double, double* %"'de56", align 8, !dbg !486
store double 0.000000e+00, double* %"'de56", align 8, !dbg !486
%276 = load double, double* %"'de58", align 8, !dbg !486
%277 = fadd fast double %276, %275, !dbg !486
store double %277, double* %"'de58", align 8, !dbg !486
%278 = load double, double* %"'de59", align 8, !dbg !486
%279 = fadd fast double %278, %275, !dbg !486
store double %279, double* %"'de59", align 8, !dbg !486
%280 = load double, double* %"'de59", align 8, !dbg !483
store double 0.000000e+00, double* %"'de59", align 8, !dbg !483
%281 = fmul fast double %280, %_unwrap49, !dbg !483
%282 = load double, double* %"'de60", align 8, !dbg !483
%283 = fadd fast double %282, %281, !dbg !483
store double %283, double* %"'de60", align 8, !dbg !483
%284 = fmul fast double %280, %_unwrap49, !dbg !483
%285 = load double, double* %"'de60", align 8, !dbg !483
%286 = fadd fast double %285, %284, !dbg !483
store double %286, double* %"'de60", align 8, !dbg !483
%287 = load double, double* %"'de60", align 8, !dbg !481
store double 0.000000e+00, double* %"'de60", align 8, !dbg !481
%288 = fdiv fast double %287, %236, !dbg !481
%289 = load double, double* %"'de15", align 8, !dbg !481
%290 = fadd fast double %289, %288, !dbg !481
store double %290, double* %"'de15", align 8, !dbg !481
%291 = fdiv fast double %287, %236, !dbg !481
%292 = fdiv fast double %_cache3.0, %236, !dbg !481
%293 = fmul fast double %291, %292, !dbg !481
%294 = fneg fast double %293, !dbg !481
%295 = load double, double* %"'de33", align 8, !dbg !481
%296 = fadd fast double %295, %294, !dbg !481
store double %296, double* %"'de33", align 8, !dbg !481
%297 = load double, double* %"'de58", align 8, !dbg !483
store double 0.000000e+00, double* %"'de58", align 8, !dbg !483
%298 = fmul fast double %297, %_unwrap47, !dbg !483
%299 = load double, double* %"'de61", align 8, !dbg !483
%300 = fadd fast double %299, %298, !dbg !483
store double %300, double* %"'de61", align 8, !dbg !483
%301 = fmul fast double %297, %_unwrap47, !dbg !483
%302 = load double, double* %"'de61", align 8, !dbg !483
%303 = fadd fast double %302, %301, !dbg !483
store double %303, double* %"'de61", align 8, !dbg !483
%304 = load double, double* %"'de61", align 8, !dbg !481
store double 0.000000e+00, double* %"'de61", align 8, !dbg !481
%305 = fdiv fast double %304, %236, !dbg !481
%306 = load double, double* %"'de16", align 8, !dbg !481
%307 = fadd fast double %306, %305, !dbg !481
store double %307, double* %"'de16", align 8, !dbg !481
%308 = fdiv fast double %304, %236, !dbg !481
%309 = fdiv fast double %_cache.0, %236, !dbg !481
%310 = fmul fast double %308, %309, !dbg !481
%311 = fneg fast double %310, !dbg !481
%312 = load double, double* %"'de33", align 8, !dbg !481
%313 = fadd fast double %312, %311, !dbg !481
store double %313, double* %"'de33", align 8, !dbg !481
br label %invertL204.i
invertL221.i: ; preds = %invertjulia_kernel__3362_inner.exit
%"'ipc62_unwrap" = bitcast i8 addrspace(1)* %".fca.0.extract25'ipev" to double addrspace(1)*, !dbg !497
%314 = load atomic double, double addrspace(1)* %"'ipc62_unwrap" monotonic, align 8, !dbg !497
%315 = load double, double* %"'de63", align 8, !dbg !497
%316 = fadd fast double %315, %314, !dbg !497
store double %316, double* %"'de63", align 8, !dbg !497
%317 = load double, double* %"'de63", align 8, !dbg !496
store double 0.000000e+00, double* %"'de63", align 8, !dbg !496
%_unwrap65 = fmul double %_cache64.0, %_cache64.0, !dbg !496
%318 = fmul fast double %317, %_unwrap65, !dbg !496
%319 = load double, double* %"'de66", align 8, !dbg !496
%320 = fadd fast double %319, %318, !dbg !496
store double %320, double* %"'de66", align 8, !dbg !496
%321 = fmul fast double %317, %_cache67.0, !dbg !496
%322 = load double, double* %"'de68", align 8, !dbg !496
%323 = fadd fast double %322, %321, !dbg !496
store double %323, double* %"'de68", align 8, !dbg !496
%324 = load double, double* %"'de68", align 8, !dbg !494
store double 0.000000e+00, double* %"'de68", align 8, !dbg !494
%325 = fmul fast double %324, %_cache64.0, !dbg !494
%326 = load double, double* %"'de69", align 8, !dbg !494
%327 = fadd fast double %326, %325, !dbg !494
store double %327, double* %"'de69", align 8, !dbg !494
%328 = fmul fast double %324, %_cache64.0, !dbg !494
%329 = load double, double* %"'de69", align 8, !dbg !494
%330 = fadd fast double %329, %328, !dbg !494
store double %330, double* %"'de69", align 8, !dbg !494
%331 = load double, double* %"'de69", align 8, !dbg !493
store double 0.000000e+00, double* %"'de69", align 8, !dbg !493
%332 = load double, double* %"value_phi.i'de", align 8, !dbg !493
%333 = fadd fast double %332, %331, !dbg !493
store double %333, double* %"value_phi.i'de", align 8, !dbg !493
%334 = fneg fast double %331, !dbg !493
%335 = load double, double* %".unpack54'de", align 8, !dbg !493
%336 = fadd fast double %335, %334, !dbg !493
store double %336, double* %".unpack54'de", align 8, !dbg !493
%337 = load double, double* %"'de66", align 8, !dbg !490
store double 0.000000e+00, double* %"'de66", align 8, !dbg !490
%338 = fmul fast double %337, 5.000000e-01, !dbg !490
%339 = load double, double* %".unpack'de", align 8, !dbg !490
%340 = fadd fast double %339, %338, !dbg !490
store double %340, double* %".unpack'de", align 8, !dbg !490
%341 = load double, double* %"value_phi.i'de", align 8
store double 0.000000e+00, double* %"value_phi.i'de", align 8
%342 = icmp eq i8 0, %_cache72.1
%343 = icmp eq i8 1, %_cache72.1
%344 = icmp eq i8 2, %_cache72.1
%345 = select fast i1 %344, double %341, double 0.000000e+00
%346 = load double, double* %"'de44", align 8
%347 = fadd fast double %346, %341
%348 = select fast i1 %344, double %347, double %346
store double %348, double* %"'de44", align 8
%349 = select fast i1 %342, double %341, double 0.000000e+00
%350 = load double, double* %"'de33", align 8
%351 = fadd fast double %350, %341
%352 = select fast i1 %342, double %351, double %350
store double %352, double* %"'de33", align 8
%353 = select fast i1 %343, double %341, double 0.000000e+00
%354 = load double, double* %"'de", align 8
%355 = fadd fast double %354, %341
%356 = select fast i1 %343, double %355, double %354
store double %356, double* %"'de", align 8
switch i8 %_cache73.1, label %invertL206.i [
i8 0, label %invertL208.i
i8 1, label %invertL169.i
i8 2, label %invertL176.i
]
invertjulia_kernel__3362_inner.exit: ; preds = %julia_kernel__3362_inner.exit
br i1 %.not, label %invertentry, label %invertL221.i
}
ERROR: LoadError: InvalidIRError: compiling MethodInstance for grad_kernel!(::CuDeviceVector{Float64, 1}, ::CuDeviceVector{Float64, 1}, ::CuDeviceVector{SVector{3, Float64}, 1}, ::CuDeviceVector{SVector{3, Float64}, 1}, ::CuDeviceVector{Int64, 1}, ::CuDeviceVector{Int64, 1}, ::CuDeviceVector{HarmonicBond, 1}, ::CuDeviceVector{HarmonicBond, 1}) resulted in invalid LLVM IR
Reason: unsupported call through a literal pointer (call to )
Stacktrace:
[1] #isfinite
@ ~/.julia/dev/CUDA/src/device/intrinsics/math.jl:190
[2] macro expansion
@ ~/.julia/packages/StaticArrays/cZ1ET/src/linalg.jl:278
[3] _norm
@ ~/.julia/packages/StaticArrays/cZ1ET/src/linalg.jl:266
[4] norm
@ ~/.julia/packages/StaticArrays/cZ1ET/src/linalg.jl:265
[5] f
@ ~/dms/molly_dev/enzyme_err28.jl:19
[6] kernel!
@ ~/dms/molly_dev/enzyme_err28.jl:33
[7] kernel!
@ ~/dms/molly_dev/enzyme_err28.jl:0
[8] diffejulia_kernel__3362_inner5wrap
@ ~/dms/molly_dev/enzyme_err28.jl:0
Reason: unsupported call through a literal pointer (call to )
Stacktrace:
[1] multiple call sites
@ unknown:0
Reason: unsupported call through a literal pointer (call to )
Stacktrace:
[1] #isfinite
@ ~/.julia/dev/CUDA/src/device/intrinsics/math.jl:190
[2] macro expansion
@ ~/.julia/packages/StaticArrays/cZ1ET/src/linalg.jl:278
[3] _norm
@ ~/.julia/packages/StaticArrays/cZ1ET/src/linalg.jl:266
[4] norm
@ ~/.julia/packages/StaticArrays/cZ1ET/src/linalg.jl:265
[5] f
@ ~/dms/molly_dev/enzyme_err28.jl:19
[6] kernel!
@ ~/dms/molly_dev/enzyme_err28.jl:33
[7] kernel!
@ ~/dms/molly_dev/enzyme_err28.jl:0
[8] diffejulia_kernel__3362_inner5wrap
@ ~/dms/molly_dev/enzyme_err28.jl:0
[9] macro expansion
@ ~/.julia/dev/Enzyme/src/compiler.jl:9872
[10] enzyme_call
@ ~/.julia/dev/Enzyme/src/compiler.jl:9550
[11] CombinedAdjointThunk
@ ~/.julia/dev/Enzyme/src/compiler.jl:9513
[12] autodiff_deferred
@ ~/.julia/dev/Enzyme/src/Enzyme.jl:372
[13] autodiff_deferred
@ ~/.julia/dev/Enzyme/src/Enzyme.jl:442
[14] grad_kernel!
@ ~/dms/molly_dev/enzyme_err28.jl:40
Hint: catch this exception as `err` and call `code_typed(err; interactive = true)` to introspect the erronous code with Cthulhu.jl
Stacktrace:
[1] check_ir(job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, args::LLVM.Module)
@ GPUCompiler ~/.julia/packages/GPUCompiler/YO8Uj/src/validation.jl:149
[2] macro expansion
@ ~/.julia/packages/GPUCompiler/YO8Uj/src/driver.jl:415 [inlined]
[3] macro expansion
@ ~/.julia/packages/TimerOutputs/RsWnF/src/TimerOutput.jl:253 [inlined]
[4] macro expansion
@ ~/.julia/packages/GPUCompiler/YO8Uj/src/driver.jl:414 [inlined]
[5] emit_llvm(job::GPUCompiler.CompilerJob; libraries::Bool, toplevel::Bool, optimize::Bool, cleanup::Bool, only_entry::Bool, validate::Bool)
@ GPUCompiler ~/.julia/packages/GPUCompiler/YO8Uj/src/utils.jl:89
[6] emit_llvm
@ ~/.julia/packages/GPUCompiler/YO8Uj/src/utils.jl:83 [inlined]
[7] codegen(output::Symbol, job::GPUCompiler.CompilerJob; libraries::Bool, toplevel::Bool, optimize::Bool, cleanup::Bool, strip::Bool, validate::Bool, only_entry::Bool, parent_job::Nothing)
@ GPUCompiler ~/.julia/packages/GPUCompiler/YO8Uj/src/driver.jl:129
[8] compile(target::Symbol, job::GPUCompiler.CompilerJob; libraries::Bool, toplevel::Bool, optimize::Bool, cleanup::Bool, strip::Bool, validate::Bool, only_entry::Bool)
@ GPUCompiler ~/.julia/packages/GPUCompiler/YO8Uj/src/driver.jl:106
[9] compile
@ ~/.julia/packages/GPUCompiler/YO8Uj/src/driver.jl:98 [inlined]
[10] #1037
@ ~/.julia/dev/CUDA/src/compiler/compilation.jl:104 [inlined]
[11] JuliaContext(f::CUDA.var"#1037#1040"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}})
@ GPUCompiler ~/.julia/packages/GPUCompiler/YO8Uj/src/driver.jl:47
[12] compile(job::GPUCompiler.CompilerJob)
@ CUDA ~/.julia/dev/CUDA/src/compiler/compilation.jl:103
[13] actual_compilation(cache::Dict{Any, CuFunction}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, compiler::typeof(CUDA.compile), linker::typeof(CUDA.link))
@ GPUCompiler ~/.julia/packages/GPUCompiler/YO8Uj/src/execution.jl:125
[14] cached_compilation(cache::Dict{Any, CuFunction}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, compiler::Function, linker::Function)
@ GPUCompiler ~/.julia/packages/GPUCompiler/YO8Uj/src/execution.jl:103
[15] macro expansion
@ ~/.julia/dev/CUDA/src/compiler/execution.jl:318 [inlined]
[16] macro expansion
@ ./lock.jl:267 [inlined]
[17] cufunction(f::typeof(grad_kernel!), tt::Type{Tuple{CuDeviceVector{Float64, 1}, CuDeviceVector{Float64, 1}, CuDeviceVector{SVector{3, Float64}, 1}, CuDeviceVector{SVector{3, Float64}, 1}, CuDeviceVector{Int64, 1}, CuDeviceVector{Int64, 1}, CuDeviceVector{HarmonicBond, 1}, CuDeviceVector{HarmonicBond, 1}}}; kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:313
[18] cufunction(f::typeof(grad_kernel!), tt::Type{Tuple{CuDeviceVector{Float64, 1}, CuDeviceVector{Float64, 1}, CuDeviceVector{SVector{3, Float64}, 1}, CuDeviceVector{SVector{3, Float64}, 1}, CuDeviceVector{Int64, 1}, CuDeviceVector{Int64, 1}, CuDeviceVector{HarmonicBond, 1}, CuDeviceVector{HarmonicBond, 1}}})
@ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:310
[19] macro expansion
@ ~/.julia/dev/CUDA/src/compiler/execution.jl:104 [inlined]
[20] top-level scope
@ ~/.julia/dev/CUDA/src/utilities.jl:25
@vchuravy any ideas here?
We will need to look at @device_code_dump
, but I am wondering if your libm function replacement is inserting calls to pointers.
Given that we are looking at isfinite
The output of
@device_code dir="kernel_dump" @cuda threads=128 kernel!(pe_vec, coords, is, js, inters)
and
@device_code dir="grad_kernel_dump" @cuda threads=128 grad_kernel!(pe_vec, d_pe_vec, coords, d_coords, is, js, inters, d_inters)
are in device_code.zip.
So I immediatly see:
@0 = private unnamed_addr constant [7808 x i8] c"Enzyme compilation failed.\0ACurrent scope: \0A; Function Attrs: mustprogress willreturn\0Adefine void @preprocess_julia_kernel__5774_inner5({ i8 addrspace(1)*, i64, [1 x i64], i64 } %0, { i8 addrspace(1)*, i64, [1 x i64], i64 } %1, { i8 addrspace(1)*, i64, [1 x i64], i64 } %2, { i8 addrspace(1)*, i64, [1 x i64], i64 } %3, { i8 addrspace(1)*, i64, [1 x i64], i64 } %4) local_unnamed_addr #5 !dbg !227 {\0Aentry:\0A %.fca.0.extract25 = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %0, 0, !dbg !228\0A %.fca.2.0.extract11 = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %2, 2, 0, !dbg !228\0A %5 = call {}*** @julia.get_pgcstack() #6\0A %6 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #6, !dbg !229, !range !27\0A %7 = zext i32 %6 to i64, !dbg !236\0A %8 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #6, !dbg !238, !range !39\0A %9 = zext i32 %8 to i64, !dbg !243\0A %10 = mul nuw nsw i64 %7, %9, !dbg !245\0A %11 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #6, !dbg !247, !range !53\0A %12 = add nuw nsw i32 %11, 1, !dbg !252\0A %13 = zext i32 %12 to i64, !dbg !253\0A %14 = add nuw nsw i64 %10, %13, !dbg !255\0A %.not = icmp sgt i64 %14, %.fca.2.0.extract11, !dbg !257\0A br i1 %.not, label %julia_kernel__5774_inner.exit, label %L31.i, !dbg !259\0A\0AL31.i: ; preds = %entry\0A %.fca.0.extract = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %4, 0, !dbg !228\0A %.fca.0.extract1 = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %3, 0, !dbg !228\0A %.fca.0.extract9 = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %2, 0, !dbg !228\0A %.fca.0.extract16 = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %1, 0, !dbg !228\0A %15 = add nsw i64 %14, -1, !dbg !260\0A %16 = shl nuw nsw i64 %15, 3, !dbg !266\0A %17 = getelementptr i8, i8 addrspace(1)* %.fca.0.extract9, i64 %16, !dbg !268\0A %18 = bitcast i8 addrspace(1)* %17 to i64 addrspace(1)*, !dbg !269\0A %19 = call i64 @llvm.nvvm.ldg.global.i.i64.p1i64(i64 addrspace(1)* readonly %18, i32 noundef 8) #7, !dbg !269\0A %20 = getelementptr i8, i8 addrspace(1)* %.fca.0.extract1, i64 %16, !dbg !268\0A %21 = bitcast i8 addrspace(1)* %20 to i64 addrspace(1)*, !dbg !269\0A %22 = call i64 @llvm.nvvm.ldg.global.i.i64.p1i64(i64 addrspace(1)* readonly %21, i32 noundef 8) #7, !dbg !269\0A %23 = bitcast i8 addrspace(1)* %.fca.0.extract to [2 x double] addrspace(1)*, !dbg !274\0A %.elt = getelementptr inbounds [2 x double], [2 x double] addrspace(1)* %23, i64 %15, i64 0, !dbg !274\0A %.unpack = load double, double addrspace(1)* %.elt, align 8, !dbg !274, !tbaa !100\0A %.elt53 = getelementptr inbounds [2 x double], [2 x double] addrspace(1)* %23, i64 %15, i64 1, !dbg !274\0A %.unpack54 = load double, double addrspace(1)* %.elt53, align 8, !dbg !274, !tbaa !100\0A %24 = add i64 %19, -1, !dbg !282\0A %25 = bitcast i8 addrspace(1)* %.fca.0.extract16 to [1 x [3 x double]] addrspace(1)*, !dbg !274\0A %.unpack55.elt = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %25, i64 %24, i64 0, i64 0, !dbg !274\0A %.unpack55.unpack = load double, double addrspace(1)* %.unpack55.elt, align 8, !dbg !274, !tbaa !100\0A %.unpack55.elt56 = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %25, i64 %24, i64 0, i64 1, !dbg !274\0A %.unpack55.unpack57 = load double, double addrspace(1)* %.unpack55.elt56, align 8, !dbg !274, !tbaa !100\0A %.unpack55.elt58 = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %25, i64 %24, i64 0, i64 2, !dbg !274\0A %.unpack55.unpack59 = load double, double addrspace(1)* %.unpack55.elt58, align 8, !dbg !274, !tbaa !100\0A %26 = add i64 %22, -1, !dbg !282\0A %.unpack61.elt = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %25, i64 %26, i64 0, i64 0, !dbg !274\0A %.unpack61.unpack = load double, double addrspace(1)* %.unpack61.elt, align 8, !dbg !274, !tbaa !100\0A %.unpack61.elt62 = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %25, i64 %26, i64 0, i64 1, !dbg !274\0A %.unpack61.unpack63 = load double, double addrspace(1)* %.unpack61.elt62, align 8, !dbg !274, !tbaa !100\0A %.unpack61.elt64 = getelementptr inbounds [1 x [3 x double]], [1 x [3 x double]] addrspace(1)* %25, i64 %26, i64 0, i64 2, !dbg !274\0A %.unpack61.unpack65 = load double, double addrspace(1)* %.unpack61.elt64, align 8, !dbg !274, !tbaa !100\0A %27 = fsub double %.unpack55.unpack, %.unpack61.unpack, !dbg !284\0A %28 = fsub double %.unpack55.unpack57, %.unpack61.unpack63, !dbg !284\0A %29 = fsub double %.unpack55.unpack59, %.unpack61.unpack65, !dbg !284\0A %30 = fmul double %27, %27, !dbg !290\0A %31 = fmul double %28, %28, !dbg !290\0A %32 = fadd double %30, %31, !dbg !297\0A %33 = fmul double %29, %29, !dbg !290\0A %34 = fadd double %32, %33, !dbg !297\0A %35 = call double @__nv_sqrt(double %34) #6, !dbg !298\0A %36 = fcmp ule double %35, 0.000000e+00, !dbg !299\0A br i1 %36, label %L176.i, label %L169.i, !dbg !301\0A\0AL169.i: ; preds = %L31.i\0A %37 = call i32 @__nv_isfinited(double %35) #6, !dbg !302\0A %38 = icmp eq i32 %37, 0, !dbg !303\0A br i1 %38, label %L176.i, label %L221.i, !dbg !301\0A\0AL176.i: ; preds = %L169.i, %L31.i\0A %39 = call double @__nv_fabs(double %27) #6, !dbg !307\0A %40 = call double @__nv_fabs(double %28) #6, !dbg !313\0A %41 = call double @__nv_fmax(double %39, double %40) #6, !dbg !316\0A %42 = call double @__nv_fabs(double %29) #6, !dbg !313\0A %43 = call double @__nv_fmax(double %41, double %42) #6, !dbg !316\0A %44 = call i32 @__nv_isfinited(double %43) #6, !dbg !317\0A %.not68 = icmp eq i32 %44, 0, !dbg !319\0A br i1 %.not68, label %L221.i, label %L204.i, !dbg !323\0A\0AL204.i: ; preds = %L176.i\0A %45 = fcmp une double %43, 0.000000e+00, !dbg !324\0A br i1 %45, label %L208.i, label %L206.i, !dbg !327\0A\0AL206.i: ; preds = %L204.i\0A %46 = call double @__nv_fabs(double noundef 0.000000e+00) #6, !dbg !328\0A br label %L221.i, !dbg !332\0A\0AL208.i: ; preds = %L204.i\0A %47 = fdiv double %27, %43, !dbg !334\0A %48 = fmul double %47, %47, !dbg !336\0A %49 = fdiv double %28, %43, !dbg !334\0A %50 = fmul double %49, %49, !dbg !336\0A %51 = fadd double %48, %50, !dbg !339\0A %52 = fdiv double %29, %43, !dbg !334\0A %53 = fmul double %52, %52, !dbg !336\0A %54 = fadd double %53, %51, !dbg !339\0A %55 = call double @__nv_sqrt(double %54) #6, !dbg !340\0A %56 = fmul double %43, %55, !dbg !341\0A br label %L221.i, !dbg !332\0A\0AL221.i: ; preds = %L208.i, %L206.i, %L176.i, %L169.i\0A %value_phi.i = phi double [ %35, %L169.i ], [ %46, %L206.i ], [ %56, %L208.i ], [ %43, %L176.i ]\0A %57 = fmul double %.unpack, 5.000000e-01, !dbg !342\0A %58 = fsub double %value_phi.i, %.unpack54, !dbg !345\0A %59 = fmul double %58, %58, !dbg !346\0A %60 = fmul double %57, %59, !dbg !348\0A %61 = bitcast i8 addrspace(1)* %.fca.0.extract25 to double addrspace(1)*, !dbg !349\0A %62 = atomicrmw fadd double addrspace(1)* %61, double %60 monotonic, align 8, !dbg !349\0A br label %julia_kernel__5774_inner.exit, !dbg !356\0A\0Ajulia_kernel__5774_inner.exit: ; preds = %L221.i, %entry\0A ret void, !dbg !228\0A}\0A\0ANo augmented forward pass found for __nv_isfinited\0A at context: %37 = call i32 @__nv_isfinited(double %35) #6, !dbg !145\0A\0AStacktrace:\0A [1] #isfinite\0A @ ~/.julia/dev/CUDA/src/device/intrinsics/math.jl:190\0A [2] macro expansion\0A @ ~/.julia/packages/StaticArrays/cZ1ET/src/linalg.jl:278\0A [3] _norm\0A @ ~/.julia/packages/StaticArrays/cZ1ET/src/linalg.jl:266\0A [4] norm\0A @ ~/.julia/packages/StaticArrays/cZ1ET/src/linalg.jl:265\0A [5] f\0A @ ./REPL[5]:3\0A [6] kernel!\0A @ ./REPL[6]:11\0A [7] kernel!\0A @ ./REPL[6]:0\0A\00", align 1
@1 = private unnamed_addr constant [156 x i8] c"Enzyme: The original primal code hits this error condition, thus differentiating it does not make sense\0AStacktrace:\0A [1] multiple call sites\0A @ unknown:0\00", align 1
cc: @wsmoses
ANd the verifier complains about:
; │││││┌ @ /home/jgreener/.julia/dev/CUDA/src/device/intrinsics/math.jl:190 within `#isfinite`
call void inttoptr (i64 140245794510672 to void (i8*)*)(i8* getelementptr inbounds ([7808 x i8], [7808 x i8]* @0, i64 0, i64 0)) #10, !dbg !189
call void asm sideeffect "exit;", ""(), !dbg !189
unreachable, !dbg !189
So the short answer is:
No augmented forward pass found for __nv_isfinited
at context: %37 = call i32 @__nv_isfinited(double %35) #6, !dbg !145\0A\0AStacktrace:
[1] #isfinite
@ ~/.julia/dev/CUDA/src/device/intrinsics/math.jl:190
[2] macro expansion
@ ~/.julia/packages/StaticArrays/cZ1ET/src/linalg.jl:278
[3] _norm\0A @ ~/.julia/packages/StaticArrays/cZ1ET/src/linalg.jl:266#1122
So three work items:
I can add support isfinite quickly without issue, 2/3 seem a lot harder, do you have any ideas?
the julia_error callback
could query the target (from the data-layout?) and if it's nvptx use GPUCompiler's report exception https://github.com/JuliaGPU/GPUCompiler.jl/blob/e5cd575dd44bec8d2914f3ce2cd1ae83dcd9ac91/src/irgen.jl#L220
Can confirm that the MWE at the top of the issue is fixed on main (31012f8b4b5f6d5699ded0bb9e7c184c4991312b).
I ran into an issue with some similar GPU code, see https://github.com/EnzymeAD/Enzyme.jl/issues/1129.
I am on Julia 1.9.2, Enzyme main (bf3415d5b032eacd4a924ba09ec725798129d007), CUDA 4.4.1, Atomix 0.1.0, StaticArrays 1.6.5. The following used to work but now errors: