llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.
http://llvm.org
Other
28.7k stars 11.87k forks source link

Miscompilation corrupts stack-allocated vectors #63475

Closed cbeuw closed 1 year ago

cbeuw commented 1 year ago

This should print 42 42 42 42 42 42 42, but prints 42 0 42 0 42 42 42 with clang or opt -O3 https://godbolt.org/z/8v3d7enK8

; Module-level target description: the data layout string and the target
; triple (x86-64 Linux, per the triple below).
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

; Function Attrs: noinline
; Reduced helper (llvm-reduce output): writes (%val0 == 0) and (%val1 == 0) as
; bytes to memory and calls empty inline asm that is marked sideeffect and
; clobbers memory. Attribute group #0 keeps it noinline.
define internal fastcc void @_ZN5repro11black_box_217h2c9aef2f201b1074E(i128 %val0, i64 %val1) #0 {
start:
  ; Zero-length array type with an element count of 0, i.e. a zero-byte
  ; alloca. Both i8 stores below write through this pointer.
  %0 = alloca [0 x [0 x [0 x i8]]], i32 0, align 1
  %_4 = icmp eq i128 %val0, 0
  %1 = zext i1 %_4 to i8
  store i8 %1, ptr %0, align 1
  ; Empty asm string; the "r" operand receives the slot address and
  ; "~{memory}" declares a memory clobber.
  call void asm sideeffect "", "r,~{memory}"(ptr %0)
  %_7 = icmp eq i64 %val1, 0
  %2 = zext i1 %_7 to i8
  store i8 %2, ptr %0, align 1
  ; Unlike the first asm call, this one is passed null rather than %0.
  call void asm sideeffect "", "r,~{memory}"(ptr null)
  ret void
}

; Fills a 7 x i32 stack array with 42s and passes its address to fn11.
; When %0 is true, fn11 is called a second time with null pointers; the @main
; in this module passes false, so only the first call executes there.
define void @fn1(i1 %0) #1 {
start:
  ; Zero-byte alloca (zero-length array type, element count 0); its address is
  ; passed as fn11's %_4 argument.
  %1 = alloca [0 x [0 x [0 x [5 x i32]]]], i32 0, align 4
  %2 = alloca [7 x i32], align 4
  ; Initialize all seven i32 elements with a single vector store.
  store <7 x i32> <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>, ptr %2, align 4
  call fastcc void @_ZN5repro4fn1117ha0d291cafd330a2bE(i64 0, i128 0, ptr %1, i128 0, ptr %2)
  br i1 %0, label %bb2.preheader.i, label %_ZN5repro3fn517h51e49bf383c47da1E.exit

bb2.preheader.i:                                  ; preds = %start
  ; Second call passes null for both pointer arguments.
  call fastcc void @_ZN5repro4fn1117ha0d291cafd330a2bE(i64 1, i128 1, ptr null, i128 1, ptr null)
  br label %_ZN5repro3fn517h51e49bf383c47da1E.exit

_ZN5repro3fn517h51e49bf383c47da1E.exit:           ; preds = %bb2.preheader.i, %start
  ret void
}

; Loads a <7 x i32> through %_7, copies it into a local array, runs a series of
; bcmp / inline-asm "black box" steps, then prints vector elements with printf.
; The issue title reports that the printed values are wrong at -O3.
define internal fastcc void @_ZN5repro4fn1117ha0d291cafd330a2bE(i64 %_1, i128 %_3, ptr %_4, i128 %_5.1, ptr %_7) #1 personality ptr null {
start:
  ; Zero-byte alloca; only its address is used (as an asm operand below).
  %0 = alloca [0 x [0 x [0 x i8]]], i32 0, align 1
  %fmt.i = alloca [4 x i8], align 1
  %1 = alloca [5 x i32], align 4
  %2 = alloca [4 x i128], align 8
  %_8 = alloca [7 x i32], align 4
  ; Load the whole 7-element vector from the caller's array and spill a copy
  ; into the local array %_8.
  %3 = load <7 x i32>, ptr %_7, align 4
  store <7 x i32> %3, ptr %_8, align 4
  tail call fastcc void @_ZN5repro11black_box_217h2c9aef2f201b1074E(i128 %_3, i64 %_1)
  store i128 %_5.1, ptr %2, align 8
  ; NOTE(review): @fn1 passes a zero-byte alloca (first call) or null (second
  ; call) as %_4, yet %_4 is loaded from, stored to, and read by the 28-, 64-
  ; and 20-byte bcmp calls below.
  %4 = load i8, ptr %_4, align 1
  store i8 %4, ptr %1, align 4
  %bcmp.i.i.i = call i32 @bcmp(ptr %_4, ptr %_8, i64 28)
  %5 = icmp eq i32 %bcmp.i.i.i, 0
  %6 = zext i1 %5 to i8
  store i8 %6, ptr %_4, align 1
  call void asm sideeffect "", "r,~{memory}"(ptr %0)
  %bcmp.i.i3.i = call i32 @bcmp(ptr %_4, ptr %2, i64 64)
  %7 = icmp eq i32 %bcmp.i.i3.i, 0
  %8 = zext i1 %7 to i8
  store i8 %8, ptr %_4, align 1
  call void asm sideeffect "", "r,~{memory}"(ptr null)
  %bcmp.i.i4.i = call i32 @bcmp(ptr %_4, ptr %1, i64 20)
  %9 = icmp eq i32 %bcmp.i.i4.i, 0
  %10 = zext i1 %9 to i8
  store i8 %10, ptr %_4, align 1
  call void asm sideeffect "", "r,~{memory}"(ptr null)
  ; Build the printf format in %fmt.i byte by byte: 37 = '%', 100 = 'd',
  ; 32 = ' '. No store to index 3 is present in this function, so the fourth
  ; byte of the buffer is never written here.
  store i8 37, ptr %fmt.i, align 1
  %11 = getelementptr [4 x i8], ptr %fmt.i, i64 0, i64 1
  store i8 100, ptr %11, align 1
  %12 = getelementptr [4 x i8], ptr %fmt.i, i64 0, i64 2
  store i8 32, ptr %12, align 1
  ; Print elements 0 through 5 of the loaded vector, one printf call each.
  %iter.i.sroa.10.16.vec.extract = extractelement <7 x i32> %3, i64 0
  %_44.i = call i32 (ptr, ...) @printf(ptr %fmt.i, i32 %iter.i.sroa.10.16.vec.extract)
  %iter.i.sroa.10.20.vec.extract = extractelement <7 x i32> %3, i64 1
  %_44.i.1 = call i32 (ptr, ...) @printf(ptr %fmt.i, i32 %iter.i.sroa.10.20.vec.extract)
  %iter.i.sroa.10.24.vec.extract = extractelement <7 x i32> %3, i64 2
  %_44.i.2 = call i32 (ptr, ...) @printf(ptr %fmt.i, i32 %iter.i.sroa.10.24.vec.extract)
  %iter.i.sroa.10.28.vec.extract = extractelement <7 x i32> %3, i64 3
  %_44.i.3 = call i32 (ptr, ...) @printf(ptr %fmt.i, i32 %iter.i.sroa.10.28.vec.extract)
  %iter.i.sroa.10.32.vec.extract = extractelement <7 x i32> %3, i64 4
  %_44.i.4 = call i32 (ptr, ...) @printf(ptr %fmt.i, i32 %iter.i.sroa.10.32.vec.extract)
  %iter.i.sroa.10.36.vec.extract = extractelement <7 x i32> %3, i64 5
  %_44.i.5 = call i32 (ptr, ...) @printf(ptr %fmt.i, i32 %iter.i.sroa.10.36.vec.extract)
  ; NOTE(review): the seventh printf reuses the element-0 extract; element 6
  ; of the vector is never extracted in this function.
  %_44.i.6 = call i32 (ptr, ...) @printf(ptr %fmt.i, i32 %iter.i.sroa.10.16.vec.extract)
  ret void
}

; External variadic printf; declared without any attributes in this module.
declare i32 @printf(ptr, ...)

; Program entry point: calls @fn1 once with its i1 argument set to false and
; returns 0.
define i32 @main() {
top:
  call void @fn1(i1 false)
  ret i32 0
}

; External bcmp(ptr, ptr, i64) returning i32; the callers in this module only
; test the result for equality with 0.
declare i32 @bcmp(ptr, ptr, i64)

; uselistorder directives
; Each directive names a value and a permutation of indexes for its use-list.
uselistorder ptr null, { 1, 2, 6, 7, 0, 3, 4, 5 }
uselistorder ptr @_ZN5repro4fn1117ha0d291cafd330a2bE, { 1, 0 }
uselistorder ptr @printf, { 6, 5, 4, 3, 2, 1, 0 }
uselistorder ptr @bcmp, { 2, 1, 0 }

; Attribute groups: #0 is used by the black_box_2 helper, #1 by @fn1 and fn11.
attributes #0 = { noinline }
attributes #1 = { "target-cpu"="x86-64" }

The above was from llvm-reduce. I don't know if it broke something, so I attached the original IR below. This is compiled from Rust, but I've patched out the symbols from Rust std so it has no dependency on Rust.

original IR ```llvm ; ModuleID = 'repro.46f743e1561fb24e-cgu.0' source_filename = "repro.46f743e1561fb24e-cgu.0" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" %Adt56 = type { { i128, i128, { i16, i128 }, i64, i32, [1 x i32] }, %Adt55 } %Adt55 = type { %Adt54 } %Adt54 = type { { i128, ptr }, { i128, i128, { i16, i128 }, i64, i32, [1 x i32] } } @vtable.0 = private unnamed_addr constant <{ ptr, [16 x i8], ptr, ptr, ptr }> <{ ptr @"_ZN4core3ptr85drop_in_place$LT$std..rt..lang_start$LT$$LP$$RP$$GT$..$u7b$$u7b$closure$u7d$$u7d$$GT$17h0eee5ecdc5932091E", [16 x i8] c"\08\00\00\00\00\00\00\00\08\00\00\00\00\00\00\00", ptr @"_ZN4core3ops8function6FnOnce40call_once$u7b$$u7b$vtable.shim$u7d$$u7d$17hf0de4a394f8e37a1E", ptr @"_ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17h991b85cf75f57f3aE", ptr @"_ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17h991b85cf75f57f3aE" }>, align 8 @alloc_a00f8a95864fc305bf508c11187211d8 = private unnamed_addr constant <{ [28 x i8] }> zeroinitializer, align 4 @alloc_4f40612ab7406a7d1f3f0640c8ea0fb4 = private unnamed_addr constant <{ [64 x i8] }> zeroinitializer, align 8 @alloc_ee0548ff1320ae5be168b83ab0b060cd = private unnamed_addr constant <{ [20 x i8] }> <{ [20 x i8] c"a\00\00\00a\00\00\00a\00\00\00a\00\00\00a\00\00\00" }>, align 4 ; std::sys_common::backtrace::__rust_begin_short_backtrace ; Function Attrs: noinline nonlazybind uwtable define internal fastcc void @_ZN3std10sys_common9backtrace28__rust_begin_short_backtrace17h3230664098c98715E(ptr nocapture noundef nonnull readonly %f) unnamed_addr #0 { start: tail call void %f() tail call void asm sideeffect "", "~{memory}"() #10, !srcloc !3 ret void } ; std::rt::lang_start::{{closure}} ; Function Attrs: inlinehint nonlazybind uwtable define internal noundef i32 @"_ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17h991b85cf75f57f3aE"(ptr noalias nocapture noundef readonly align 8 
dereferenceable(8) %_1) unnamed_addr #2 { start: %_4 = load ptr, ptr %_1, align 8, !nonnull !4, !noundef !4 ; call std::sys_common::backtrace::__rust_begin_short_backtrace tail call fastcc void @_ZN3std10sys_common9backtrace28__rust_begin_short_backtrace17h3230664098c98715E(ptr noundef nonnull %_4) ret i32 0 } ; core::ops::function::FnOnce::call_once{{vtable.shim}} ; Function Attrs: inlinehint nonlazybind uwtable define internal noundef i32 @"_ZN4core3ops8function6FnOnce40call_once$u7b$$u7b$vtable.shim$u7d$$u7d$17hf0de4a394f8e37a1E"(ptr nocapture noundef readonly %_1) unnamed_addr #2 personality ptr @rust_eh_personality { start: %0 = load ptr, ptr %_1, align 8, !nonnull !4, !noundef !4 ; call std::sys_common::backtrace::__rust_begin_short_backtrace tail call fastcc void @_ZN3std10sys_common9backtrace28__rust_begin_short_backtrace17h3230664098c98715E(ptr noundef nonnull %0), !noalias !5 ret i32 0 } ; core::ptr::drop_in_place<std::rt::lang_start<()>::{{closure}}> ; Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind nonlazybind willreturn memory(none) uwtable define internal void @"_ZN4core3ptr85drop_in_place$LT$std..rt..lang_start$LT$$LP$$RP$$GT$..$u7b$$u7b$closure$u7d$$u7d$$GT$17h0eee5ecdc5932091E"(ptr noalias nocapture readnone align 8 %_1) unnamed_addr #3 { start: ret void } ; repro::black_box_1 ; Function Attrs: noinline nonlazybind uwtable define internal fastcc void @_ZN5repro11black_box_117h2948a258b3403becE(ptr noalias nocapture noundef readonly dereferenceable(28) %val3) unnamed_addr #0 { start: %0 = alloca i8, align 1 %bcmp.i.i = tail call i32 @bcmp(ptr noundef nonnull dereferenceable(28) @alloc_a00f8a95864fc305bf508c11187211d8, ptr noundef nonnull dereferenceable(28) %val3, i64 28) %1 = icmp eq i32 %bcmp.i.i, 0 call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %0) %2 = zext i1 %1 to i8 store i8 %2, ptr %0, align 1 call void asm sideeffect "", "r,~{memory}"(ptr nonnull %0) #10, !srcloc !3 call void @llvm.lifetime.end.p0(i64 1, ptr nonnull %0) ret void } ; 
repro::black_box_2 ; Function Attrs: noinline nonlazybind uwtable define internal fastcc void @_ZN5repro11black_box_217h2c9aef2f201b1074E(i128 noundef %val0, i64 noundef %val1) unnamed_addr #0 { start: %0 = alloca i8, align 1 %1 = alloca i8, align 1 %_4 = icmp eq i128 %val0, 0 call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %1) %2 = zext i1 %_4 to i8 store i8 %2, ptr %1, align 1 call void asm sideeffect "", "r,~{memory}"(ptr nonnull %1) #10, !srcloc !3 call void @llvm.lifetime.end.p0(i64 1, ptr nonnull %1) %_7 = icmp eq i64 %val1, 0 call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %0) %3 = zext i1 %_7 to i8 store i8 %3, ptr %0, align 1 call void asm sideeffect "", "r,~{memory}"(ptr nonnull %0) #10, !srcloc !3 call void @llvm.lifetime.end.p0(i64 1, ptr nonnull %0) ret void } ; Function Attrs: nonlazybind uwtable define dso_local void @fn1() unnamed_addr #1 { start: %0 = alloca [7 x i32], align 4 %1 = alloca [5 x i32], align 4 %2 = alloca [7 x i32], align 4 %3 = alloca [5 x i32], align 4 %_2.i = alloca %Adt56, align 8 %_1 = alloca [7 x i32], align 4 store i32 42, ptr %_1, align 4 %4 = getelementptr inbounds i32, ptr %_1, i64 1 store i32 42, ptr %4, align 4 %5 = getelementptr inbounds i32, ptr %_1, i64 2 store i32 42, ptr %5, align 4 %6 = getelementptr inbounds i32, ptr %_1, i64 3 store i32 42, ptr %6, align 4 %7 = getelementptr inbounds i32, ptr %_1, i64 4 store i32 42, ptr %7, align 4 %8 = getelementptr inbounds i32, ptr %_1, i64 5 store i32 42, ptr %8, align 4 %9 = getelementptr inbounds i32, ptr %_1, i64 6 store i32 42, ptr %9, align 4 %10 = load <7 x i32>, ptr %_1, align 4 call void @llvm.lifetime.start.p0(i64 168, ptr nonnull %_2.i) %11 = getelementptr inbounds %Adt56, ptr %_2.i, i64 0, i32 1, i32 0, i32 1 %12 = getelementptr inbounds %Adt56, ptr %_2.i, i64 0, i32 1, i32 0, i32 1, i32 3 store i64 0, ptr %12, align 8, !noalias !8 %13 = getelementptr inbounds %Adt56, ptr %_2.i, i64 0, i32 1, i32 0, i32 1, i32 2 store i16 -21983, ptr %13, align 8, !noalias 
!8 %14 = getelementptr inbounds %Adt56, ptr %_2.i, i64 0, i32 1, i32 0, i32 1, i32 2, i32 1 store i128 0, ptr %14, align 8, !noalias !8 store i128 0, ptr %11, align 8, !noalias !8 %15 = getelementptr inbounds %Adt56, ptr %_2.i, i64 0, i32 1, i32 0, i32 0, i32 1 store ptr %_2.i, ptr %15, align 8, !noalias !8 call void @llvm.lifetime.start.p0(i64 20, ptr nonnull %3), !noalias !8 store i32 97, ptr %3, align 4, !noalias !8 %_3.sroa.3.0..sroa_idx.i = getelementptr inbounds i8, ptr %3, i64 4 store i32 97, ptr %_3.sroa.3.0..sroa_idx.i, align 4, !noalias !8 %_3.sroa.4.0..sroa_idx.i = getelementptr inbounds i8, ptr %3, i64 8 store i32 97, ptr %_3.sroa.4.0..sroa_idx.i, align 4, !noalias !8 %_3.sroa.5.0..sroa_idx.i = getelementptr inbounds i8, ptr %3, i64 12 store i32 97, ptr %_3.sroa.5.0..sroa_idx.i, align 4, !noalias !8 %_3.sroa.6.0..sroa_idx.i = getelementptr inbounds i8, ptr %3, i64 16 store i32 97, ptr %_3.sroa.6.0..sroa_idx.i, align 4, !noalias !8 call void @llvm.lifetime.start.p0(i64 28, ptr nonnull %2), !noalias !8 store <7 x i32> %10, ptr %2, align 4, !noalias !8 ; call repro::fn11 call fastcc void @_ZN5repro4fn1117ha0d291cafd330a2bE(i64 noundef 0, i128 noundef 0, ptr noalias nocapture noundef nonnull readonly dereferenceable(20) %3, i128 noundef 0, ptr noalias nocapture noundef nonnull readonly dereferenceable(28) %2) call void @llvm.lifetime.end.p0(i64 20, ptr nonnull %3), !noalias !8 call void @llvm.lifetime.end.p0(i64 28, ptr nonnull %2), !noalias !8 %16 = load i16, ptr %13, align 8, !noalias !8, !noundef !4 %17 = icmp eq i16 %16, 2 br i1 %17, label %bb2.preheader.i, label %_ZN5repro3fn517h51e49bf383c47da1E.exit bb2.preheader.i: ; preds = %start %_3.sroa.3.0..sroa_idx3.i = getelementptr inbounds i8, ptr %1, i64 4 %_3.sroa.4.0..sroa_idx5.i = getelementptr inbounds i8, ptr %1, i64 8 %_3.sroa.5.0..sroa_idx7.i = getelementptr inbounds i8, ptr %1, i64 12 %_3.sroa.6.0..sroa_idx9.i = getelementptr inbounds i8, ptr %1, i64 16 br label %bb2.i bb2.i: ; preds = %bb2.i, 
%bb2.preheader.i %18 = load i64, ptr %12, align 8, !noalias !8, !noundef !4 %19 = load i128, ptr %11, align 8, !noalias !8, !noundef !4 call void @llvm.lifetime.start.p0(i64 20, ptr nonnull %1), !noalias !8 store i32 97, ptr %1, align 4, !noalias !8 store i32 97, ptr %_3.sroa.3.0..sroa_idx3.i, align 4, !noalias !8 store i32 97, ptr %_3.sroa.4.0..sroa_idx5.i, align 4, !noalias !8 store i32 97, ptr %_3.sroa.5.0..sroa_idx7.i, align 4, !noalias !8 store i32 97, ptr %_3.sroa.6.0..sroa_idx9.i, align 4, !noalias !8 %20 = load i128, ptr %14, align 8, !noalias !8, !noundef !4 call void @llvm.lifetime.start.p0(i64 28, ptr nonnull %0), !noalias !8 store <7 x i32> %10, ptr %0, align 4, !noalias !8 ; call repro::fn11 call fastcc void @_ZN5repro4fn1117ha0d291cafd330a2bE(i64 noundef %18, i128 noundef %19, ptr noalias nocapture noundef nonnull readonly dereferenceable(20) %1, i128 noundef %20, ptr noalias nocapture noundef nonnull readonly dereferenceable(28) %0) call void @llvm.lifetime.end.p0(i64 20, ptr nonnull %1), !noalias !8 call void @llvm.lifetime.end.p0(i64 28, ptr nonnull %0), !noalias !8 %21 = load i16, ptr %13, align 8, !noalias !8, !noundef !4 %22 = icmp eq i16 %21, 2 br i1 %22, label %bb2.i, label %_ZN5repro3fn517h51e49bf383c47da1E.exit _ZN5repro3fn517h51e49bf383c47da1E.exit: ; preds = %bb2.i, %start call void @llvm.lifetime.end.p0(i64 168, ptr nonnull %_2.i) ; call repro::black_box_1 call fastcc void @_ZN5repro11black_box_117h2948a258b3403becE(ptr noalias nocapture noundef nonnull readonly dereferenceable(28) %_1) ret void } ; repro::fn11 ; Function Attrs: nonlazybind uwtable define internal fastcc void @_ZN5repro4fn1117ha0d291cafd330a2bE(i64 noundef %_1, i128 noundef %_3, ptr noalias nocapture noundef readonly dereferenceable(20) %_4, i128 noundef %_5.1, ptr noalias nocapture noundef readonly dereferenceable(28) %_7) unnamed_addr #1 personality ptr @rust_eh_personality { start: %0 = alloca i8, align 1 %1 = alloca i8, align 1 %2 = alloca i8, align 1 %3 = alloca i8, 
align 1 %4 = alloca i8, align 1 %5 = alloca i8, align 1 %lf.i = alloca [2 x i8], align 1 %fmt.i = alloca [4 x i8], align 1 %6 = alloca [5 x i32], align 4 %7 = alloca [4 x i128], align 8 %_8 = alloca [7 x i32], align 4 %8 = load <7 x i32>, ptr %_7, align 4 store <7 x i32> %8, ptr %_8, align 4 ; call repro::black_box_2 tail call fastcc void @_ZN5repro11black_box_217h2c9aef2f201b1074E(i128 noundef %_3, i64 noundef %_1) call void @llvm.lifetime.start.p0(i64 64, ptr nonnull %7) %_12.sroa.3.0..sroa_idx = getelementptr inbounds i8, ptr %7, i64 32 call void @llvm.memset.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) %7, i8 0, i64 32, i1 false) store i128 %_5.1, ptr %_12.sroa.3.0..sroa_idx, align 8 %_12.sroa.4.0..sroa_idx = getelementptr inbounds i8, ptr %7, i64 48 store i128 0, ptr %_12.sroa.4.0..sroa_idx, align 8 call void @llvm.lifetime.start.p0(i64 20, ptr nonnull %6) call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(20) %6, ptr noundef nonnull align 4 dereferenceable(20) %_4, i64 20, i1 false) call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %5), !noalias !11 store i8 0, ptr %5, align 1, !noalias !11 call void asm sideeffect "", "r,~{memory}"(ptr nonnull %5) #10, !noalias !11, !srcloc !3 call void @llvm.lifetime.end.p0(i64 1, ptr nonnull %5), !noalias !11 %bcmp.i.i.i = call i32 @bcmp(ptr noundef nonnull dereferenceable(28) @alloc_a00f8a95864fc305bf508c11187211d8, ptr noundef nonnull dereferenceable(28) %_8, i64 28), !noalias !16 %9 = icmp eq i32 %bcmp.i.i.i, 0 call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %4), !noalias !11 %10 = zext i1 %9 to i8 store i8 %10, ptr %4, align 1, !noalias !11 call void asm sideeffect "", "r,~{memory}"(ptr nonnull %4) #10, !noalias !11, !srcloc !3 call void @llvm.lifetime.end.p0(i64 1, ptr nonnull %4), !noalias !11 call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %3), !noalias !11 store i8 1, ptr %3, align 1, !noalias !11 call void asm sideeffect "", "r,~{memory}"(ptr nonnull %3) #10, 
!noalias !11, !srcloc !3 call void @llvm.lifetime.end.p0(i64 1, ptr nonnull %3), !noalias !11 call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %2), !noalias !11 store i8 1, ptr %2, align 1, !noalias !11 call void asm sideeffect "", "r,~{memory}"(ptr nonnull %2) #10, !noalias !11, !srcloc !3 call void @llvm.lifetime.end.p0(i64 1, ptr nonnull %2), !noalias !11 %bcmp.i.i3.i = call i32 @bcmp(ptr noundef nonnull dereferenceable(64) @alloc_4f40612ab7406a7d1f3f0640c8ea0fb4, ptr noundef nonnull dereferenceable(64) %7, i64 64), !noalias !17 %11 = icmp eq i32 %bcmp.i.i3.i, 0 call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %1), !noalias !11 %12 = zext i1 %11 to i8 store i8 %12, ptr %1, align 1, !noalias !11 call void asm sideeffect "", "r,~{memory}"(ptr nonnull %1) #10, !noalias !11, !srcloc !3 call void @llvm.lifetime.end.p0(i64 1, ptr nonnull %1), !noalias !11 %bcmp.i.i4.i = call i32 @bcmp(ptr noundef nonnull dereferenceable(20) @alloc_ee0548ff1320ae5be168b83ab0b060cd, ptr noundef nonnull dereferenceable(20) %6, i64 20), !noalias !18 %13 = icmp eq i32 %bcmp.i.i4.i, 0 call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %0), !noalias !11 %14 = zext i1 %13 to i8 store i8 %14, ptr %0, align 1, !noalias !11 call void asm sideeffect "", "r,~{memory}"(ptr nonnull %0) #10, !noalias !11, !srcloc !3 call void @llvm.lifetime.end.p0(i64 1, ptr nonnull %0), !noalias !11 call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %fmt.i), !noalias !11 store i8 37, ptr %fmt.i, align 1, !noalias !11 %15 = getelementptr inbounds [4 x i8], ptr %fmt.i, i64 0, i64 1 store i8 100, ptr %15, align 1, !noalias !11 %16 = getelementptr inbounds [4 x i8], ptr %fmt.i, i64 0, i64 2 store i8 32, ptr %16, align 1, !noalias !11 %17 = getelementptr inbounds [4 x i8], ptr %fmt.i, i64 0, i64 3 store i8 0, ptr %17, align 1, !noalias !11 %iter.i.sroa.10.16.vec.extract = extractelement <7 x i32> %8, i64 0 %_44.i = call noundef i32 (ptr, ...) 
@printf(ptr noundef nonnull dereferenceable(1) %fmt.i, i32 noundef %iter.i.sroa.10.16.vec.extract), !noalias !11 %iter.i.sroa.10.20.vec.extract = extractelement <7 x i32> %8, i64 1 %_44.i.1 = call noundef i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) %fmt.i, i32 noundef %iter.i.sroa.10.20.vec.extract), !noalias !11 %iter.i.sroa.10.24.vec.extract = extractelement <7 x i32> %8, i64 2 %_44.i.2 = call noundef i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) %fmt.i, i32 noundef %iter.i.sroa.10.24.vec.extract), !noalias !11 %iter.i.sroa.10.28.vec.extract = extractelement <7 x i32> %8, i64 3 %_44.i.3 = call noundef i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) %fmt.i, i32 noundef %iter.i.sroa.10.28.vec.extract), !noalias !11 %iter.i.sroa.10.32.vec.extract = extractelement <7 x i32> %8, i64 4 %_44.i.4 = call noundef i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) %fmt.i, i32 noundef %iter.i.sroa.10.32.vec.extract), !noalias !11 %iter.i.sroa.10.36.vec.extract = extractelement <7 x i32> %8, i64 5 %_44.i.5 = call noundef i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) %fmt.i, i32 noundef %iter.i.sroa.10.36.vec.extract), !noalias !11 %iter.i.sroa.10.40.vec.extract = extractelement <7 x i32> %8, i64 6 %_44.i.6 = call noundef i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) %fmt.i, i32 noundef %iter.i.sroa.10.40.vec.extract), !noalias !11 call void @llvm.lifetime.start.p0(i64 2, ptr nonnull %lf.i), !noalias !11 store i8 10, ptr %lf.i, align 1, !noalias !11 %18 = getelementptr inbounds [2 x i8], ptr %lf.i, i64 0, i64 1 store i8 0, ptr %18, align 1, !noalias !11 %_50.i = call noundef i32 (ptr, ...) 
@printf(ptr noundef nonnull dereferenceable(1) %lf.i), !noalias !11 call void @llvm.lifetime.end.p0(i64 2, ptr nonnull %lf.i), !noalias !11 call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %fmt.i), !noalias !11 call void @llvm.lifetime.end.p0(i64 64, ptr nonnull %7) call void @llvm.lifetime.end.p0(i64 20, ptr nonnull %6) ret void } ; repro::main ; Function Attrs: nonlazybind uwtable define internal void @_ZN5repro4main17hc1028cd349e9622cE() unnamed_addr #1 { start: tail call void @fn1() ret void } ; Function Attrs: nonlazybind uwtable define internal i32 @rust_eh_personality(i32 noundef, i32 noundef, i64 noundef, ptr noundef, ptr noundef) unnamed_addr #1 { start: ret i32 0 } ; Function Attrs: mustprogress nocallback nofree nounwind willreturn memory(argmem: readwrite) declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #4 ; Function Attrs: nofree nounwind nonlazybind uwtable declare noundef i32 @printf(ptr nocapture noundef readonly, ...) 
unnamed_addr #5 ; Function Attrs: nonlazybind define i32 @main(i32 %0, ptr %1) unnamed_addr #6 { top: call void @fn1() ret i32 0 } ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #7 ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #7 ; Function Attrs: nofree nounwind nonlazybind willreturn memory(argmem: read) declare i32 @bcmp(ptr nocapture, ptr nocapture, i64) local_unnamed_addr #8 ; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #9 attributes #0 = { noinline nonlazybind uwtable "probe-stack"="inline-asm" "target-cpu"="x86-64" } attributes #1 = { nonlazybind uwtable "probe-stack"="inline-asm" "target-cpu"="x86-64" } attributes #2 = { inlinehint nonlazybind uwtable "probe-stack"="inline-asm" "target-cpu"="x86-64" } attributes #3 = { inlinehint mustprogress nofree norecurse nosync nounwind nonlazybind willreturn memory(none) uwtable "probe-stack"="inline-asm" "target-cpu"="x86-64" } attributes #4 = { mustprogress nocallback nofree nounwind willreturn memory(argmem: readwrite) } attributes #5 = { nofree nounwind nonlazybind uwtable "probe-stack"="inline-asm" "target-cpu"="x86-64" } attributes #6 = { nonlazybind "probe-stack"="inline-asm" "target-cpu"="x86-64" } attributes #7 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } attributes #8 = { nofree nounwind nonlazybind willreturn memory(argmem: read) } attributes #9 = { nocallback nofree nounwind willreturn memory(argmem: write) } attributes #10 = { nounwind } !llvm.module.flags = !{!0, !1, !2} !0 = !{i32 8, !"PIC Level", i32 2} !1 = !{i32 7, !"PIE Level", i32 2} !2 = !{i32 2, !"RtLibUseGOT", i32 1} !3 = !{i32 704612} !4 = !{} !5 = !{!6} !6 = 
distinct !{!6, !7, !"_ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17h991b85cf75f57f3aE: %_1"} !7 = distinct !{!7, !"_ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17h991b85cf75f57f3aE"} !8 = !{!9} !9 = distinct !{!9, !10, !"_ZN5repro3fn517h51e49bf383c47da1E: %_1"} !10 = distinct !{!10, !"_ZN5repro3fn517h51e49bf383c47da1E"} !11 = !{!12, !14, !15} !12 = distinct !{!12, !13, !"_ZN5repro8dump_var17h8ee3b87d4b793436E: %val0"} !13 = distinct !{!13, !"_ZN5repro8dump_var17h8ee3b87d4b793436E"} !14 = distinct !{!14, !13, !"_ZN5repro8dump_var17h8ee3b87d4b793436E: %val2"} !15 = distinct !{!15, !13, !"_ZN5repro8dump_var17h8ee3b87d4b793436E: %val3"} !16 = !{!14, !15} !17 = !{!12, !15} !18 = !{!12, !14} ```
efriedma-quic commented 1 year ago

I don't think your reduction is correct; it looks like it involves accessing zero-byte allocations.

Generally, the first tool I reach for to reduce miscompiles is opt-bisect-limit (https://llvm.org/docs/OptBisect.html).

cbeuw commented 1 year ago

I removed all the zero-byte allocas: https://godbolt.org/z/jEbPc1P94

nikic commented 1 year ago

Looks like there is an ABI mismatch. The arguments are pushed via pushq at 8 byte offsets and then read via movl at 4 byte offsets.

nikic commented 1 year ago

Here's a reduction:

; Calls @callee with six null pointer arguments followed by a constant
; <7 x i32> whose elements are all 42.
; NOTE(review): presumably the six leading pointers are there to use up the
; integer argument registers so the vector elements are passed on the stack,
; which is what the pushq listing in the thread shows -- confirm.
define void @caller() nounwind {
  call void @callee(ptr null, ptr null, ptr null, ptr null, ptr null, ptr null, <7 x i32> <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>)
  ret void
}

; Receives a <7 x i32> by value after six pointer arguments, stores it to a
; local array, and passes each of its seven elements (indices 0..6) to @use.
; The store of the incoming argument to the alloca is the part that matters
; for the reported bug: the thread notes that dropping it yields correct
; stack offsets in the generated code.
define void @callee(ptr %p0, ptr %p1, ptr %p2, ptr %p3, ptr %p4, ptr %p5, <7 x i32> %arg) nounwind {
start:
  %alloca = alloca [7 x i32], align 4
  ; Spill the by-value vector argument into the local array.
  store <7 x i32> %arg, ptr %alloca, align 4
  ; Extract and use every element. Valid indices for <7 x i32> are 0..6; an
  ; index of 7 would be out of range and produce poison, so it is not used.
  %extract0 = extractelement <7 x i32> %arg, i64 0
  call void @use(i32 %extract0)
  %extract1 = extractelement <7 x i32> %arg, i64 1
  call void @use(i32 %extract1)
  %extract2 = extractelement <7 x i32> %arg, i64 2
  call void @use(i32 %extract2)
  %extract3 = extractelement <7 x i32> %arg, i64 3
  call void @use(i32 %extract3)
  %extract4 = extractelement <7 x i32> %arg, i64 4
  call void @use(i32 %extract4)
  %extract5 = extractelement <7 x i32> %arg, i64 5
  call void @use(i32 %extract5)
  %extract6 = extractelement <7 x i32> %arg, i64 6
  call void @use(i32 %extract6)
  ret void
}

; External sink taking one i32; @callee calls it once per extracted element.
declare void @use(i32)

The caller does:

    # Seven pushq instructions, one per vector element, followed by the call.
    # The thread describes these pushes as landing at 8-byte offsets.
    pushq   $42
    pushq   $42
    pushq   $42
    pushq   $42
    pushq   $42
    pushq   $42
    pushq   $42
    callq   callee@PLT

The callee does:

    # Seven 32-bit loads of the stack-passed elements. Offsets 96/104/112 are
    # 8 bytes apart, but 64/68/72/76 are only 4 bytes apart, so those four
    # loads do not match the 8-byte spacing the caller used.
    movl    112(%rsp), %ebx
    movl    104(%rsp), %ebp
    movl    96(%rsp), %r14d
    movl    76(%rsp), %r15d
    movl    72(%rsp), %r12d
    movl    64(%rsp), %edi
    movl    68(%rsp), %r13d

If we drop the store, then the offsets are correct (don't mind the different base):

    # Same seven loads with the store removed from the IR: every offset from
    # 96 through 144 is 8 bytes from its neighbour.
    movl    144(%rsp), %ebx
    movl    136(%rsp), %ebp
    movl    128(%rsp), %r14d
    movl    120(%rsp), %r15d
    movl    112(%rsp), %r12d
    movl    104(%rsp), %r13d
    movl    96(%rsp), %edi

So this is again in some way related to the arg copy elision optimization.

nikic commented 1 year ago

This seems to be related to the code in X86ISelLowering::LowerMemArgument() handling isCopyElisionCandidate(). It checks for ScalarizedAndExtendedVector, but does so by inspecting the size of the LocVT. However, if I'm understanding this right, in this case the LocVT is i32, matching the vector element size, but this doesn't match the size of the stack slot, which is 8 bytes.

I'm not sure if there's any easy way to access that stack slot size though... CCValAssign only stores the start offset.

nikic commented 1 year ago

Candidate patch: https://reviews.llvm.org/D154078

llvmbot commented 1 year ago

@llvm/issue-subscribers-backend-x86

nikic commented 1 year ago

@cbeuw Do you have the original Rust code that led to this issue? I find it suspicious that we end up with illegal vector types in optimized IR -- unless you did something with repr(simd) I don't think that's supposed to happen.

cbeuw commented 1 year ago

@nikic I have the unreduced code in custom MIR: https://godbolt.org/z/7q6q8eK96. But I don't have the reduced one around any more... I'm happy to run the minimisation script again though if needed.

This isn't reproducible from surface Rust, which is why I opened a bug report with LLVM directly. The reproduction required a Move operand of an array local in a function call, where the same local was previously used. This MIR cannot be built from surface Rust as MIR building creates temporary copies for all Move operands in Call. The local that gets moved is assigned to and used exactly once. If you change Move(_16) to _16 on line 3200 then the bug goes away.

cbeuw commented 1 year ago

By illegal vector types do you mean the zero-byte [0 x [0 x [0 x i8]]]s? They weren't from rustc, they were from llvm-reduce. The IR from Rust was folded under original IR in the OP.

nikic commented 1 year ago

"Illegal vector type" here refers to the non-power-of-two vectors, which are not natively supported by the target. They are already part of the input IR, and the most likely culprit for that is https://github.com/rust-lang/rust/pull/111999.

I wonder whether it would make sense to prevent argument promotion for such types, as the legalized argument passing for such vectors can be substantially worse than just passing them indirectly (and it makes it more likely to hit legalization bugs like https://github.com/llvm/llvm-project/issues/63608).