Closed glandium closed 6 years ago
1.12.0 was the release that added MIR ("rustc translates code to LLVM IR via its own 'middle' IR (MIR)"), which seems like the most likely cause from that release.
The corresponding MIR:
const foo::{{initializer}}: usize ={
let mut _0: usize; // return place
bb0: {
_0 = const 4096usize; // bb0[0]: scope 0 at /tmp/compiler-explorer-compiler118318-63-13s7ff5.cpzhg/example.rs:1:26: 1:30
return; // bb0[1]: scope 0 at /tmp/compiler-explorer-compiler118318-63-13s7ff5.cpzhg/example.rs:1:26: 1:30
}
}
const foo::{{initializer}}: usize ={
let mut _0: usize; // return place
bb0: {
_0 = const 4096usize; // bb0[0]: scope 0 at /tmp/compiler-explorer-compiler118318-63-13s7ff5.cpzhg/example.rs:2:18: 2:22
return; // bb0[1]: scope 0 at /tmp/compiler-explorer-compiler118318-63-13s7ff5.cpzhg/example.rs:2:18: 2:22
}
}
fn foo() -> std::boxed::Box<[u8; 4096]>{
let mut _0: std::boxed::Box<[u8; 4096]>; // return place
let mut _1: [u8; 4096];
bb0: {
StorageLive(_1); // bb0[0]: scope 0 at /tmp/compiler-explorer-compiler118318-63-13s7ff5.cpzhg/example.rs:2:14: 2:23
_1 = [const 0u8; 4096]; // bb0[1]: scope 0 at /tmp/compiler-explorer-compiler118318-63-13s7ff5.cpzhg/example.rs:2:14: 2:23
_0 = const <std::boxed::Box<T>>::new(move _1) -> bb1; // bb0[2]: scope 0 at /tmp/compiler-explorer-compiler118318-63-13s7ff5.cpzhg/example.rs:2:5: 2:24
}
bb1: {
StorageDead(_1); // bb1[0]: scope 0 at /tmp/compiler-explorer-compiler118318-63-13s7ff5.cpzhg/example.rs:2:23: 2:24
return; // bb1[1]: scope 0 at /tmp/compiler-explorer-compiler118318-63-13s7ff5.cpzhg/example.rs:3:2: 3:2
}
}
Can we add a trick to HAIR that treats `Box::new` calls just like the `box` syntax?
Related:
pub fn bar(buf: [u8; 4096]) -> Box<[u8; 4096]> {
Box::new(buf)
}
copies `buf` to the local stack before copying it into the box:
example::bar:
push rbx
mov eax, 4096
call __rust_probestack
sub rsp, rax
mov rax, rdi
mov rdi, rsp
mov edx, 4096
mov rsi, rax
call memcpy@PLT
mov edi, 4096
mov esi, 1
call __rust_alloc@PLT
mov rbx, rax
test rbx, rbx
je .LBB2_1
mov rsi, rsp
mov edx, 4096
mov rdi, rbx
call memcpy@PLT
mov rax, rbx
add rsp, 4096
pop rbx
ret
.LBB2_1:
call <alloc::alloc::Global as core::alloc::GlobalAlloc>::oom
ud2
One interesting fact: it doesn't happen when the object is small enough:
pub fn foo() -> Box<[u8; 8]> {
Box::new([0; 8])
}
pub fn bar() -> Box<[u8; 9]> {
Box::new([0; 9])
}
example::foo:
push rax
mov edi, 8
mov esi, 1
call __rust_alloc@PLT
test rax, rax
je .LBB1_1
mov qword ptr [rax], 0
pop rcx
ret
.LBB1_1:
call <alloc::alloc::Global as core::alloc::GlobalAlloc>::oom
ud2
example::bar:
sub rsp, 24
mov byte ptr [rsp + 16], 0
mov qword ptr [rsp + 8], 0
mov edi, 9
mov esi, 1
call __rust_alloc@PLT
test rax, rax
je .LBB2_1
mov cl, byte ptr [rsp + 16]
mov byte ptr [rax + 8], cl
mov rcx, qword ptr [rsp + 8]
mov qword ptr [rax], rcx
add rsp, 24
ret
.LBB2_1:
call <alloc::alloc::Global as core::alloc::GlobalAlloc>::oom
ud2
The MIR in both cases looks similar, but the LLVM-IR differs:
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
define internal fastcc void @"<alloc::alloc::Global as core::alloc::GlobalAlloc>::oom"() unnamed_addr #0 {
tail call void @__rust_oom()
unreachable
}
define noalias align 1 dereferenceable(8) [8 x i8]* @example::foo() unnamed_addr #1 {
%0 = tail call i8* @__rust_alloc(i64 8, i64 1) #5
%1 = icmp eq i8* %0, null
br i1 %1, label %bb7.i.i, label %"_ZN35_$LT$alloc..boxed..Box$LT$T$GT$$GT$3new17h4e4edfda70298278E.exit"
bb7.i.i: ; preds = %start
tail call fastcc void @"<alloc::alloc::Global as core::alloc::GlobalAlloc>::oom"() #5
unreachable
"_ZN35_$LT$alloc..boxed..Box$LT$T$GT$$GT$3new17h4e4edfda70298278E.exit": ; preds = %start
%2 = bitcast i8* %0 to [8 x i8]*
%_3.sroa.0.0..sroa_cast.i = bitcast i8* %0 to i64*
store i64 0, i64* %_3.sroa.0.0..sroa_cast.i, align 1
ret [8 x i8]* %2
}
define noalias align 1 dereferenceable(9) [9 x i8]* @example::bar() unnamed_addr #1 {
%_1 = alloca [9 x i8], align 1
%_1.0.sroa_idx2 = getelementptr inbounds [9 x i8], [9 x i8]* %_1, i64 0, i64 0
call void @llvm.lifetime.start.p0i8(i64 9, i8* nonnull %_1.0.sroa_idx2)
call void @llvm.memset.p0i8.i64(i8* nonnull %_1.0.sroa_idx2, i8 0, i64 9, i32 1, i1 false)
%0 = tail call i8* @__rust_alloc(i64 9, i64 1) #5, !noalias !0
%1 = icmp eq i8* %0, null
br i1 %1, label %bb7.i.i, label %"_ZN35_$LT$alloc..boxed..Box$LT$T$GT$$GT$3new17ha687450047947beaE.exit"
bb7.i.i: ; preds = %start
tail call fastcc void @"<alloc::alloc::Global as core::alloc::GlobalAlloc>::oom"() #5, !noalias !0
unreachable
"_ZN35_$LT$alloc..boxed..Box$LT$T$GT$$GT$3new17ha687450047947beaE.exit": ; preds = %start
%2 = bitcast i8* %0 to [9 x i8]*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull %0, i8* nonnull %_1.0.sroa_idx2, i64 9, i32 1, i1 false) #5
call void @llvm.lifetime.end.p0i8(i64 9, i8* nonnull %_1.0.sroa_idx2)
ret [9 x i8]* %2
}
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) #2
declare void @__rust_oom() unnamed_addr #3
declare noalias i8* @__rust_alloc(i64, i64) unnamed_addr #4
declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i32, i1) #2
declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #2
declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #2
attributes #0 = { inlinehint noreturn nounwind uwtable "probe-stack"="__rust_probestack" }
attributes #1 = { nounwind uwtable "probe-stack"="__rust_probestack" }
attributes #2 = { argmemonly nounwind }
attributes #3 = { cold noreturn nounwind "probe-stack"="__rust_probestack" }
attributes #4 = { nounwind "probe-stack"="__rust_probestack" }
attributes #5 = { nounwind }
!0 = !{!1}
!1 = distinct !{!1, !2, !"<alloc::boxed::Box<T>>::new: %x"}
!2 = distinct !{!2, !"<alloc::boxed::Box<T>>::new"}
Related to (or a dup of?) https://github.com/rust-lang/rust/issues/41160
Yeah, this seems like the same issue as https://github.com/rust-lang/rust/issues/41160; closing.
Consider the following code:
(made it big because it's kind of simpler to see the memset and memcpy calls in the resulting asm)
It generates the following assembly:
which does a memset, alloc, memcpy dance.
I was accepting this as a fact of life, but today, I was looking at a random old version of rustc on godbolt, and it turns out before 1.12, the memset, alloc, memcpy dance wasn't happening:
https://godbolt.org/g/J3cy5E
The llvm ir back then looks like the following:
while on nightly, it looks like: