Closed leecannon closed 1 month ago
I think this has to do with softfloat. When I tried to do some math in my kernel it also had a similar recursion codegen bug. I don't remember if it is the same function or a different one.
Unfortunately we have to work with LLVM to get errors like this solved. Next step here is to get a bug report filed upstream.
I need to learn a bit more about LLVM IR, because I can't follow where things fall apart.
zig build-exe badcodegen.zig -target x86_64-freestanding-none -mcpu=x86_64+soft_float-sse-sse2 -femit-llvm-ir=badcodegen.ll
is the command to get the LLVM IR, but the exported function gets optimized out via:
zig build-exe badcodegen.zig -target x86_64-freestanding-none -mcpu=x86_64+soft_float-sse-sse2 -femit-llvm-ir=bad.ll
llc -filetype=asm bad.ll -o bad_ll.s
objdump -dxS -Mintel bad_ll.s &> badecodegen_ll_s.dump
Also, I have no clue why stuff like the following is emitted at all in the debug info:
!212 = !DIEnumerator(name: "win10_th2", value: 167772161, isUnsigned: true)
!213 = !DIEnumerator(name: "win10_rs1", value: 167772162, isUnsigned: true)
!214 = !DIEnumerator(name: "win10_rs2", value: 167772163, isUnsigned: true)
To me it also looks like -femit-llvm-ir
does not emit subdf3, since I can find only %4 = fsub double %0, %3, !dbg !270
when searching for sub
outside of the debug info below, and fsub is an intrinsic.
; badCodeGen(double) -> i64: converts the argument to an unsigned 64-bit integer
; with a range check. The value is converted with fptoui, converted back to
; double, and the difference from the original must lie strictly between -1.0
; and 1.0; otherwise @badcodegen.panic is called with the
; integer_part_out_of_bounds message.
; Function Attrs: nounwind
define dso_local i64 @badCodeGen(double %0) #0 !dbg !262 {
Entry:
; Spill the argument to a stack slot and attach it to debug variable !268.
%1 = alloca double, align 8
store double %0, ptr %1, align 8
call void @llvm.dbg.declare(metadata ptr %1, metadata !268, metadata !DIExpression()), !dbg !269
; %2 = argument as an unsigned i64; %3 = that integer converted back to double.
%2 = fptoui double %0 to i64, !dbg !270
%3 = uitofp i64 %2 to double, !dbg !270
; %4 = argument minus the round-tripped value. Per the discussion in this
; thread, the lowering of this fsub to a __subdf3 call happens in instruction
; selection, so no such call is visible in the IR.
%4 = fsub double %0, %3, !dbg !270
; %7 is true only when -1.0 < %4 < 1.0 (both compares are ordered, so a NaN
; difference yields false).
%5 = fcmp olt double %4, 1.000000e+00, !dbg !270
%6 = fcmp ogt double %4, -1.000000e+00, !dbg !270
%7 = and i1 %5, %6, !dbg !270
br i1 %7, label %Then, label %Else, !dbg !270
; Success path: return the converted integer %2.
Block: ; preds = %Then
ret i64 %2, !dbg !272
Then: ; preds = %Entry
br label %Block, !dbg !270
; Failure path: call the panic handler; control is marked unreachable afterwards.
; The i64 50 is presumably the message length (the callee pairs it with the
; message pointer in a { ptr, i64 }).
Else: ; preds = %Entry
call fastcc void @badcodegen.panic(ptr nonnull readonly align 1 @builtin.panic_messages.integer_part_out_of_bounds__anon_958, i64 50, ptr readonly align 8 null, ptr nonnull readonly align 8 @0), !dbg !270
unreachable, !dbg !270
}
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
; badcodegen.panic: packs the first two arguments into a { ptr, i64 } pair,
; records all four arguments for debug info, then spins in an infinite loop.
; The function contains no ret.
; Function Attrs: cold noreturn nounwind
define internal fastcc void @badcodegen.panic(ptr nonnull readonly align 1 %0, i64 %1, ptr readonly align 8 %2, ptr nonnull readonly align 8 %3) unnamed_addr #2 !dbg !273 {
Entry:
; Stack slots for %2 (a pointer) and for the { ptr, i64 } pair built below.
%4 = alloca ptr, align 8
%5 = alloca { ptr, i64 }, align 8
; Build the { ptr, i64 } aggregate from %0 and %1 and store it (debug variable !293).
%6 = insertvalue { ptr, i64 } poison, ptr %0, 0
%7 = insertvalue { ptr, i64 } %6, i64 %1, 1
store { ptr, i64 } %7, ptr %5, align 8
call void @llvm.dbg.declare(metadata ptr %5, metadata !293, metadata !DIExpression()), !dbg !296
; %2 is stored to its own slot (debug variable !294); %3 is declared directly (!295).
store ptr %2, ptr %4, align 8
call void @llvm.dbg.declare(metadata ptr %4, metadata !294, metadata !DIExpression()), !dbg !296
call void @llvm.dbg.declare(metadata ptr %3, metadata !295, metadata !DIExpression()), !dbg !296
br label %Loop, !dbg !297
; Infinite self-loop: the only successor of %Loop is %Loop itself.
Loop: ; preds = %Loop, %Entry
br label %Loop, !dbg !299
}
; _start: the whole body is a single infinite self-loop; there is no ret.
; Function Attrs: naked noreturn nounwind
define dso_local void @_start() #3 !dbg !301 {
Entry:
br label %Loop
; Branches back to itself forever.
Loop: ; preds = %Loop, %Entry
br label %Loop, !dbg !305
}
attributes #0 = { nounwind "frame-pointer"="all" "probe-stack"="__zig_probe_stack" "target-cpu"="x86-64" "target-features"="-16bit-mode,-32bit-mode,-3dnow,-3dnowa,+64bit,-adx,-aes,-allow-light-256-bit,-amx-bf16,-amx-fp16,-amx-int8,-amx-tile,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint8,-bmi,-bmi2,-branchfusion,-cldemote,-clflushopt,-clwb,-clzero,+cmov,-cmpccxadd,-crc32,-cx16,+cx8,-enqcmd,-ermsb,-f16c,-false-deps-getmant,-false-deps-lzcnt-tzcnt,-false-deps-mulc,-false-deps-mullq,-false-deps-perm,-false-deps-popcnt,-false-deps-range,-fast-11bytenop,-fast-15bytenop,-fast-7bytenop,-fast-bextr,-fast-gather,-fast-hops,-fast-lzcnt,-fast-movbe,-fast-scalar-fsqrt,-fast-scalar-shift-masks,-fast-shld-rotate,-fast-variable-crosslane-shuffle,-fast-variable-perlane-shuffle,-fast-vector-fsqrt,-fast-vector-shift-masks,-fma,-fma4,-fsgsbase,-fsrm,+fxsr,-gfni,-harden-sls-ijmp,-harden-sls-ret,-hreset,-idivl-to-divb,+idivq-to-divl,-invpcid,-kl,-lea-sp,-lea-uses-ag,-lvi-cfi,-lvi-load-hardening,-lwp,-lzcnt,+macrofusion,+mmx,-movbe,-movdir64b,-movdiri,-mwaitx,+nopl,-pad-short-functions,-pclmul,-pconfig,-pku,-popcnt,-prefer-128-bit,-prefer-256-bit,-prefer-mask-registers,-prefetchi,-prefetchwt1,-prfchw,-ptwrite,-raoint,-rdpid,-rdpru,-rdrnd,-rdseed,-retpoline,-retpoline-external-thunk,-retpoline-indirect-branches,-retpoline-indirect-calls,-rtm,-sahf,-sbb-dep-breaking,-serialize,-seses,-sgx,-sha,-shstk,+slow-3ops-lea,+slow-incdec,-slow-lea,-slow-pmaddwd,-slow-pmulld,-slow-shld,-slow-two-mem-ops,-slow-unaligned-mem-16,-slow-unaligned-mem-32,+soft-float,-sse,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-sse-unaligned-mem,-ssse3,-tagged-globals,-tbm,-tsxldtrk,-uintr,-use-glm-div-sqrt-costs,-use-slm-arith-costs,-vaes,-vpclmulqdq,+vzeroupper,-waitpkg,-wbnoinvd,-widekl,+x87,-xop,-xsave,-xsavec,-xsaveopt,-xsave
s" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #2 = { cold noreturn nounwind "frame-pointer"="all" "probe-stack"="__zig_probe_stack" "target-cpu"="x86-64" "target-features"="-16bit-mode,-32bit-mode,-3dnow,-3dnowa,+64bit,-adx,-aes,-allow-light-256-bit,-amx-bf16,-amx-fp16,-amx-int8,-amx-tile,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint8,-bmi,-bmi2,-branchfusion,-cldemote,-clflushopt,-clwb,-clzero,+cmov,-cmpccxadd,-crc32,-cx16,+cx8,-enqcmd,-ermsb,-f16c,-false-deps-getmant,-false-deps-lzcnt-tzcnt,-false-deps-mulc,-false-deps-mullq,-false-deps-perm,-false-deps-popcnt,-false-deps-range,-fast-11bytenop,-fast-15bytenop,-fast-7bytenop,-fast-bextr,-fast-gather,-fast-hops,-fast-lzcnt,-fast-movbe,-fast-scalar-fsqrt,-fast-scalar-shift-masks,-fast-shld-rotate,-fast-variable-crosslane-shuffle,-fast-variable-perlane-shuffle,-fast-vector-fsqrt,-fast-vector-shift-masks,-fma,-fma4,-fsgsbase,-fsrm,+fxsr,-gfni,-harden-sls-ijmp,-harden-sls-ret,-hreset,-idivl-to-divb,+idivq-to-divl,-invpcid,-kl,-lea-sp,-lea-uses-ag,-lvi-cfi,-lvi-load-hardening,-lwp,-lzcnt,+macrofusion,+mmx,-movbe,-movdir64b,-movdiri,-mwaitx,+nopl,-pad-short-functions,-pclmul,-pconfig,-pku,-popcnt,-prefer-128-bit,-prefer-256-bit,-prefer-mask-registers,-prefetchi,-prefetchwt1,-prfchw,-ptwrite,-raoint,-rdpid,-rdpru,-rdrnd,-rdseed,-retpoline,-retpoline-external-thunk,-retpoline-indirect-branches,-retpoline-indirect-calls,-rtm,-sahf,-sbb-dep-breaking,-serialize,-seses,-sgx,-sha,-shstk,+slow-3ops-lea,+slow-incdec,-slow-lea,-slow-pmaddwd,-slow-pmulld,-slow-shld,-slow-two-mem-ops,-slow-unaligned-mem-16,-slow-unaligned-mem-32,+soft-float,-sse,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-sse-unaligned-mem,-ssse3,-tagged-globals,-tbm,-tsxldtrk,-uintr,-use-glm-div-sqrt-costs,-use-slm-arith-costs,-vaes,-vpclmulqdq,+vzeroupper,-waitpkg,-wbnoinvd,-widekl,+x87,-xop,-xsave,-xsavec,-x
saveopt,-xsaves" }
attributes #3 = { naked noreturn nounwind "frame-pointer"="all" "probe-stack"="__zig_probe_stack" "target-cpu"="x86-64" "target-features"="-16bit-mode,-32bit-mode,-3dnow,-3dnowa,+64bit,-adx,-aes,-allow-light-256-bit,-amx-bf16,-amx-fp16,-amx-int8,-amx-tile,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint8,-bmi,-bmi2,-branchfusion,-cldemote,-clflushopt,-clwb,-clzero,+cmov,-cmpccxadd,-crc32,-cx16,+cx8,-enqcmd,-ermsb,-f16c,-false-deps-getmant,-false-deps-lzcnt-tzcnt,-false-deps-mulc,-false-deps-mullq,-false-deps-perm,-false-deps-popcnt,-false-deps-range,-fast-11bytenop,-fast-15bytenop,-fast-7bytenop,-fast-bextr,-fast-gather,-fast-hops,-fast-lzcnt,-fast-movbe,-fast-scalar-fsqrt,-fast-scalar-shift-masks,-fast-shld-rotate,-fast-variable-crosslane-shuffle,-fast-variable-perlane-shuffle,-fast-vector-fsqrt,-fast-vector-shift-masks,-fma,-fma4,-fsgsbase,-fsrm,+fxsr,-gfni,-harden-sls-ijmp,-harden-sls-ret,-hreset,-idivl-to-divb,+idivq-to-divl,-invpcid,-kl,-lea-sp,-lea-uses-ag,-lvi-cfi,-lvi-load-hardening,-lwp,-lzcnt,+macrofusion,+mmx,-movbe,-movdir64b,-movdiri,-mwaitx,+nopl,-pad-short-functions,-pclmul,-pconfig,-pku,-popcnt,-prefer-128-bit,-prefer-256-bit,-prefer-mask-registers,-prefetchi,-prefetchwt1,-prfchw,-ptwrite,-raoint,-rdpid,-rdpru,-rdrnd,-rdseed,-retpoline,-retpoline-external-thunk,-retpoline-indirect-branches,-retpoline-indirect-calls,-rtm,-sahf,-sbb-dep-breaking,-serialize,-seses,-sgx,-sha,-shstk,+slow-3ops-lea,+slow-incdec,-slow-lea,-slow-pmaddwd,-slow-pmulld,-slow-shld,-slow-two-mem-ops,-slow-unaligned-mem-16,-slow-unaligned-mem-32,+soft-float,-sse,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-sse-unaligned-mem,-ssse3,-tagged-globals,-tbm,-tsxldtrk,-uintr,-use-glm-div-sqrt-costs,-use-slm-arith-costs,-vaes,-vpclmulqdq,+vzeroupper,-waitpkg,-wbnoinvd,-widekl,+x87,-xop,-xsave,-xsavec,-
xsaveopt,-xsaves" }
Ideas how to proceed? Am I missing anything?
To me it also looks like
-femit-llvm-ir
does not emit subdf3, since I can find only %4 = fsub double %0, %3, !dbg !270
when searching for sub
outside of the debug info below, and fsub is an intrinsic.
The lowering to __subdf3
happens in instruction selection; you won't see it in the LLVM IR.
This no longer reproduces for me.
Zig Version
0.10.0-dev.4476+0f0076666
Steps to Reproduce
badcodegen.zig
build
zig build-exe badcodegen.zig -target x86_64-freestanding-none -mcpu=x86_64+soft_float-sse-sse2
codegen
The codegen for
__subdf3
is:
Expected Behavior
Valid code that isn't just infinite recursion.
Actual Behavior
Invalid code that will stack overflow.