rust-lang / rust

Empowering everyone to build reliable and efficient software.
https://www.rust-lang.org
Other
99.01k stars 12.79k forks source link

Box::new(expr) first puts expr on the stack, then copies. #50047

Closed glandium closed 6 years ago

glandium commented 6 years ago

Consider the following code:

/// Returns a heap-allocated, zero-filled `[u8; 4096]`.
///
/// The array literal is a temporary expression handed by value to `Box::new`.
pub fn foo() -> Box<[u8; 4096]> {
    Box::new([0; 4096])
}

(made it big because it's kind of simpler to see the memset and memcpy calls in the resulting asm)

It generates the following assembly:

example::foo:
  push rbx
  mov eax, 4096
  call __rust_probestack
  sub rsp, rax
  mov rdi, rsp
  xor esi, esi
  mov edx, 4096
  call memset@PLT
  mov edi, 4096
  mov esi, 1
  call __rust_alloc@PLT
  mov rbx, rax
  test rbx, rbx
  je .LBB1_1
  mov rsi, rsp
  mov edx, 4096
  mov rdi, rbx
  call memcpy@PLT
  mov rax, rbx
  add rsp, 4096
  pop rbx
  ret
.LBB1_1:
  call <alloc::alloc::Global as core::alloc::GlobalAlloc>::oom
  ud2

which does a memset, alloc, memcpy dance.

I was accepting this as a fact of life, but today, I was looking at a random old version of rustc on godbolt, and it turns out before 1.12, the memset, alloc, memcpy dance wasn't happening:

example::foo:
  push rbx
  mov edi, 4096
  mov esi, 1
  call __rust_allocate@PLT
  mov rbx, rax
  test rbx, rbx
  je .LBB0_2
  xor esi, esi
  mov edx, 4096
  mov rdi, rbx
  call memset@PLT
  mov rax, rbx
  pop rbx
  ret
.LBB0_2:
  call alloc::oom::oom@PLT

https://godbolt.org/g/J3cy5E

The llvm ir back then looks like the following:

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

define noalias dereferenceable(4096) [4096 x i8]* @example::foo() unnamed_addr #0 {
entry-block:
  %0 = tail call i8* @__rust_allocate(i64 4096, i64 1) #1, !noalias !0
  %1 = icmp eq i8* %0, null
  br i1 %1, label %then-block-57-.i.i, label %"_ZN5alloc5boxed30_$LT$impl$u20$Box$LT$T$GT$$GT$3new17ha7ffa7dfb1e725d2E.exit"

then-block-57-.i.i: ; preds = %entry-block
  tail call void @alloc::oom::oom(), !noalias !0
  unreachable

"_ZN5alloc5boxed30_$LT$impl$u20$Box$LT$T$GT$$GT$3new17ha7ffa7dfb1e725d2E.exit": ; preds = %entry-block
  %2 = bitcast i8* %0 to [4096 x i8]*
  call void @llvm.memset.p0i8.i64(i8* nonnull %0, i8 0, i64 4096, i32 1, i1 false)
  ret [4096 x i8]* %2
}

declare noalias i8* @__rust_allocate(i64, i64) unnamed_addr #1

declare void @alloc::oom::oom() unnamed_addr #2

declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #3

attributes #0 = { uwtable }
attributes #1 = { nounwind }
attributes #2 = { cold noinline noreturn }
attributes #3 = { argmemonly nounwind }

!0 = !{!1}
!1 = distinct !{!1, !2, !"alloc::boxed::<impl Box<T>>::new: %x"}
!2 = distinct !{!2, !"alloc::boxed::<impl Box<T>>::new"}

while on nightly, it looks like:

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

define internal fastcc void @"<alloc::alloc::Global as core::alloc::GlobalAlloc>::oom"() unnamed_addr #0 {
  tail call void @__rust_oom()
  unreachable
}

define noalias align 1 dereferenceable(4096) [4096 x i8]* @example::foo() unnamed_addr #1 {
  %_1 = alloca [4096 x i8], align 1
  %_1.0.sroa_idx2 = getelementptr inbounds [4096 x i8], [4096 x i8]* %_1, i64 0, i64 0
  call void @llvm.lifetime.start.p0i8(i64 4096, i8* nonnull %_1.0.sroa_idx2)
  call void @llvm.memset.p0i8.i64(i8* nonnull %_1.0.sroa_idx2, i8 0, i64 4096, i32 1, i1 false)
  %0 = tail call i8* @__rust_alloc(i64 4096, i64 1) #5, !noalias !0
  %1 = icmp eq i8* %0, null
  br i1 %1, label %bb7.i.i, label %"_ZN35_$LT$alloc..boxed..Box$LT$T$GT$$GT$3new17hbb8214c4d412a6d3E.exit"

bb7.i.i: ; preds = %start
  tail call fastcc void @"<alloc::alloc::Global as core::alloc::GlobalAlloc>::oom"() #5, !noalias !0
  unreachable

"_ZN35_$LT$alloc..boxed..Box$LT$T$GT$$GT$3new17hbb8214c4d412a6d3E.exit": ; preds = %start
  %2 = bitcast i8* %0 to [4096 x i8]*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull %0, i8* nonnull %_1.0.sroa_idx2, i64 4096, i32 1, i1 false) #5
  call void @llvm.lifetime.end.p0i8(i64 4096, i8* nonnull %_1.0.sroa_idx2)
  ret [4096 x i8]* %2
}

declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) #2

declare void @__rust_oom() unnamed_addr #3

declare noalias i8* @__rust_alloc(i64, i64) unnamed_addr #4

declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i32, i1) #2

declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #2

declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #2

attributes #0 = { inlinehint noreturn nounwind uwtable "probe-stack"="__rust_probestack" }
attributes #1 = { nounwind uwtable "probe-stack"="__rust_probestack" }
attributes #2 = { argmemonly nounwind }
attributes #3 = { cold noreturn nounwind "probe-stack"="__rust_probestack" }
attributes #4 = { nounwind "probe-stack"="__rust_probestack" }
attributes #5 = { nounwind }

!0 = !{!1}
!1 = distinct !{!1, !2, !"<alloc::boxed::Box<T>>::new: %x"}
!2 = distinct !{!2, !"<alloc::boxed::Box<T>>::new"}
rcoh commented 6 years ago

1.12.0 was the release that added MIR: "rustc translates code to LLVM IR via its own "middle" IR (MIR)", which seems like the most likely cause from that release.

glandium commented 6 years ago

The corresponding MIR:

const foo::{{initializer}}: usize ={
  let mut _0: usize; // return place

  bb0: { 
  _0 = const 4096usize; // bb0[0]: scope 0 at /tmp/compiler-explorer-compiler118318-63-13s7ff5.cpzhg/example.rs:1:26: 1:30
  return; // bb0[1]: scope 0 at /tmp/compiler-explorer-compiler118318-63-13s7ff5.cpzhg/example.rs:1:26: 1:30
  }
}

const foo::{{initializer}}: usize ={
  let mut _0: usize; // return place

  bb0: { 
  _0 = const 4096usize; // bb0[0]: scope 0 at /tmp/compiler-explorer-compiler118318-63-13s7ff5.cpzhg/example.rs:2:18: 2:22
  return; // bb0[1]: scope 0 at /tmp/compiler-explorer-compiler118318-63-13s7ff5.cpzhg/example.rs:2:18: 2:22
  }
}

fn foo() -> std::boxed::Box<[u8; 4096]>{
  let mut _0: std::boxed::Box<[u8; 4096]>; // return place
  let mut _1: [u8; 4096];

  bb0: { 
  StorageLive(_1); // bb0[0]: scope 0 at /tmp/compiler-explorer-compiler118318-63-13s7ff5.cpzhg/example.rs:2:14: 2:23
  _1 = [const 0u8; 4096]; // bb0[1]: scope 0 at /tmp/compiler-explorer-compiler118318-63-13s7ff5.cpzhg/example.rs:2:14: 2:23
  _0 = const <std::boxed::Box<T>>::new(move _1) -> bb1; // bb0[2]: scope 0 at /tmp/compiler-explorer-compiler118318-63-13s7ff5.cpzhg/example.rs:2:5: 2:24
  }

  bb1: { 
  StorageDead(_1); // bb1[0]: scope 0 at /tmp/compiler-explorer-compiler118318-63-13s7ff5.cpzhg/example.rs:2:23: 2:24
  return; // bb1[1]: scope 0 at /tmp/compiler-explorer-compiler118318-63-13s7ff5.cpzhg/example.rs:3:2: 3:2
  }
}
oli-obk commented 6 years ago

Can we add a trick to HAIR that treats Box::new calls just like the box syntax?

glandium commented 6 years ago

Related:

/// Moves the by-value 4096-byte array `buf` into a new `Box`.
pub fn bar(buf: [u8; 4096]) -> Box<[u8; 4096]> {
    Box::new(buf)
}

copies buf to the local stack before copying it into the box:

example::bar:
  push rbx
  mov eax, 4096
  call __rust_probestack
  sub rsp, rax
  mov rax, rdi
  mov rdi, rsp
  mov edx, 4096
  mov rsi, rax
  call memcpy@PLT
  mov edi, 4096
  mov esi, 1
  call __rust_alloc@PLT
  mov rbx, rax
  test rbx, rbx
  je .LBB2_1
  mov rsi, rsp
  mov edx, 4096
  mov rdi, rbx
  call memcpy@PLT
  mov rax, rbx
  add rsp, 4096
  pop rbx
  ret
.LBB2_1:
  call <alloc::alloc::Global as core::alloc::GlobalAlloc>::oom
  ud2
glandium commented 6 years ago

So, one interesting fact, it doesn't happen when the object is small enough:

/// Returns a boxed, zero-filled 8-byte array.
pub fn foo() -> Box<[u8; 8]> {
    Box::new([0; 8])
}

/// Returns a boxed, zero-filled 9-byte array.
pub fn bar() -> Box<[u8; 9]> {
    Box::new([0; 9])
}
example::foo:
  push rax
  mov edi, 8
  mov esi, 1
  call __rust_alloc@PLT
  test rax, rax
  je .LBB1_1
  mov qword ptr [rax], 0
  pop rcx
  ret
.LBB1_1:
  call <alloc::alloc::Global as core::alloc::GlobalAlloc>::oom
  ud2

example::bar:
  sub rsp, 24
  mov byte ptr [rsp + 16], 0
  mov qword ptr [rsp + 8], 0
  mov edi, 9
  mov esi, 1
  call __rust_alloc@PLT
  test rax, rax
  je .LBB2_1
  mov cl, byte ptr [rsp + 16]
  mov byte ptr [rax + 8], cl
  mov rcx, qword ptr [rsp + 8]
  mov qword ptr [rax], rcx
  add rsp, 24
  ret
.LBB2_1:
  call <alloc::alloc::Global as core::alloc::GlobalAlloc>::oom
  ud2

The MIR in both cases looks similar, but the LLVM-IR differs:

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

define internal fastcc void @"<alloc::alloc::Global as core::alloc::GlobalAlloc>::oom"() unnamed_addr #0 {
  tail call void @__rust_oom()
  unreachable
}

define noalias align 1 dereferenceable(8) [8 x i8]* @example::foo() unnamed_addr #1 {
  %0 = tail call i8* @__rust_alloc(i64 8, i64 1) #5
  %1 = icmp eq i8* %0, null
  br i1 %1, label %bb7.i.i, label %"_ZN35_$LT$alloc..boxed..Box$LT$T$GT$$GT$3new17h4e4edfda70298278E.exit"

bb7.i.i: ; preds = %start
  tail call fastcc void @"<alloc::alloc::Global as core::alloc::GlobalAlloc>::oom"() #5
  unreachable

"_ZN35_$LT$alloc..boxed..Box$LT$T$GT$$GT$3new17h4e4edfda70298278E.exit": ; preds = %start
  %2 = bitcast i8* %0 to [8 x i8]*
  %_3.sroa.0.0..sroa_cast.i = bitcast i8* %0 to i64*
  store i64 0, i64* %_3.sroa.0.0..sroa_cast.i, align 1
  ret [8 x i8]* %2
}

define noalias align 1 dereferenceable(9) [9 x i8]* @example::bar() unnamed_addr #1 {
  %_1 = alloca [9 x i8], align 1
  %_1.0.sroa_idx2 = getelementptr inbounds [9 x i8], [9 x i8]* %_1, i64 0, i64 0
  call void @llvm.lifetime.start.p0i8(i64 9, i8* nonnull %_1.0.sroa_idx2)
  call void @llvm.memset.p0i8.i64(i8* nonnull %_1.0.sroa_idx2, i8 0, i64 9, i32 1, i1 false)
  %0 = tail call i8* @__rust_alloc(i64 9, i64 1) #5, !noalias !0
  %1 = icmp eq i8* %0, null
  br i1 %1, label %bb7.i.i, label %"_ZN35_$LT$alloc..boxed..Box$LT$T$GT$$GT$3new17ha687450047947beaE.exit"

bb7.i.i: ; preds = %start
  tail call fastcc void @"<alloc::alloc::Global as core::alloc::GlobalAlloc>::oom"() #5, !noalias !0
  unreachable

"_ZN35_$LT$alloc..boxed..Box$LT$T$GT$$GT$3new17ha687450047947beaE.exit": ; preds = %start
  %2 = bitcast i8* %0 to [9 x i8]*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull %0, i8* nonnull %_1.0.sroa_idx2, i64 9, i32 1, i1 false) #5
  call void @llvm.lifetime.end.p0i8(i64 9, i8* nonnull %_1.0.sroa_idx2)
  ret [9 x i8]* %2
}

declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) #2

declare void @__rust_oom() unnamed_addr #3

declare noalias i8* @__rust_alloc(i64, i64) unnamed_addr #4

declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i32, i1) #2

declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #2

declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #2

attributes #0 = { inlinehint noreturn nounwind uwtable "probe-stack"="__rust_probestack" }
attributes #1 = { nounwind uwtable "probe-stack"="__rust_probestack" }
attributes #2 = { argmemonly nounwind }
attributes #3 = { cold noreturn nounwind "probe-stack"="__rust_probestack" }
attributes #4 = { nounwind "probe-stack"="__rust_probestack" }
attributes #5 = { nounwind }

!0 = !{!1}
!1 = distinct !{!1, !2, !"<alloc::boxed::Box<T>>::new: %x"}
!2 = distinct !{!2, !"<alloc::boxed::Box<T>>::new"}
arthurprs commented 6 years ago

Related to (or a dup of?) https://github.com/rust-lang/rust/issues/41160

Mark-Simulacrum commented 6 years ago

Yeah, this seems like the same issue as https://github.com/rust-lang/rust/issues/41160; closing.