rust-lang / rust

Empowering everyone to build reliable and efficient software.
https://www.rust-lang.org
Other
95.31k stars 12.28k forks source link

Output code bloat when programming with a functional style #42657

Open DominusCarnufex opened 7 years ago

DominusCarnufex commented 7 years ago

I have been wanting to experiment writing some code in Rust with a more “pure functional / Haskelly” style. To explain things more clearly, let’s say you have this type.

#[derive(Debug)]
struct Type {
    field1 : u64,
    field2 : u32,
    field3 : Vec<u32>,
}

A very simple method to modify field2 would be written so in everyday Rust.

impl Type   {
    #[no_mangle]
    #[inline(never)]
    fn mut_update(&mut self, field2 : u32) -> &mut Self {
        self.field2 = field2;
        self
    }
}

The &mut Self as return value is here so that one can chain such method calls. Now here is a more “pure functional” way of doing it.

impl Type   {
    #[no_mangle]
    #[inline(never)]
    fn func_update(self, field2 : u32) -> Self  {
        Type { field2 : field2, .. self }
    }
}

And once optimized, these two functions should be perfectly identical, because func_update takes ownership of self and never gives it back, so its memory can be safely reused.

So I tested it out with this basic code.

fn main()   {
    let mut var = Type { field1 : 42, field2 : 0, field3 : vec![12] };

    let _ = var.mut_update(79);
    let _other = var.func_update(112);
}

But no… This is the assembly generated for mut_update

mut_update:
    .cfi_startproc
    mov dword ptr [rdi + 32], 79
    ret

And this is the one generated for func_update.

func_update:
    .cfi_startproc
    mov rax, qword ptr [rsi]
    mov qword ptr [rdi], rax
    mov dword ptr [rdi + 32], 112
    mov rax, qword ptr [rsi + 24]
    mov qword ptr [rdi + 24], rax
    movups  xmm0, xmmword ptr [rsi + 8]
    movups  xmmword ptr [rdi + 8], xmm0
    mov rax, rdi
    ret

As you can see, the whole memory occupied by self is copied to another location, a pointer to which is returned by the function. And here is the LLVM IR for these two functions.

; Function Attrs: noinline norecurse nounwind uwtable
define internal fastcc void @mut_update(%Type* nocapture dereferenceable(40)) unnamed_addr #0 {
start:
  %1 = getelementptr inbounds %Type, %Type* %0, i64 0, i32 4
  store i32 79, i32* %1, align 4
  ret void
}

; Function Attrs: noinline nounwind uwtable
define internal fastcc void @func_update(%Type* noalias nocapture sret dereferenceable(40), %Type* noalias nocapture readonly dereferenceable(40)) unnamed_addr #1 {
start:
  %self.sroa.0.0..sroa_idx = getelementptr inbounds %Type, %Type* %1, i64 0, i32 0
  %self.sroa.0.0.copyload = load i64, i64* %self.sroa.0.0..sroa_idx, align 8
  %self.sroa.4.0..sroa_idx = getelementptr inbounds %Type, %Type* %1, i64 0, i32 2
  %self.sroa.4.0..sroa_cast2 = bitcast %"collections::vec::Vec<u32>"* %self.sroa.4.0..sroa_idx to i8*
  %2 = getelementptr inbounds %Type, %Type* %0, i64 0, i32 0
  store i64 %self.sroa.0.0.copyload, i64* %2, align 8
  %3 = getelementptr inbounds %Type, %Type* %0, i64 0, i32 4
  store i32 112, i32* %3, align 8
  %self.sroa.4.8..sroa_idx = getelementptr inbounds %Type, %Type* %0, i64 0, i32 2
  %self.sroa.4.8..sroa_cast = bitcast %"collections::vec::Vec<u32>"* %self.sroa.4.8..sroa_idx to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %self.sroa.4.8..sroa_cast, i8* %self.sroa.4.0..sroa_cast2, i64 24, i32 8, i1 false)
  ret void
}

The “functional” variant is already bloated at this point, so the problem comes from rustc itself.

That’s the first problem. The other is in the main function: look at what happens between the call to the two functions.

    mov dword ptr [rax], 12
    mov qword ptr [rsp + 8], 42
    mov dword ptr [rsp + 40], 0
    mov qword ptr [rsp + 16], rax
    mov qword ptr [rsp + 24], 1
    mov qword ptr [rsp + 32], 1
    lea rdi, [rsp + 8]
    call    mut_update
    mov rax, qword ptr [rsp + 40]
    mov qword ptr [rsp + 80], rax
    movups  xmm0, xmmword ptr [rsp + 8]
    movups  xmm1, xmmword ptr [rsp + 24]
    movaps  xmmword ptr [rsp + 64], xmm1
    movaps  xmmword ptr [rsp + 48], xmm0
    lea rdi, [rsp + 96]
    lea rsi, [rsp + 48]
    call    func_update

Yes, you read it right: the whole memory space containing var is being copied to another place before the next function is even called. So it is copied twice: immediately before the call and immediately after.

Here is the LLVM IR for the main function.

; Function Attrs: uwtable
define internal void @_ZN4opti4main17ha95f343356fbf6f4E() unnamed_addr #2 personality i32 (i32, i32, i64, %"unwind::libunwind::_Unwind_Exception"*, %"unwind::libunwind::_Unwind_Context"*)* @rust_eh_personality {
start:
  %_9 = alloca %Type, align 8
  %_other = alloca %Type, align 8
  %var = alloca %Type, align 8
  %0 = bitcast %Type* %var to i8*
  call void @llvm.lifetime.start(i64 40, i8* nonnull %0)
  %1 = tail call i8* @__rust_allocate(i64 4, i64 4) #4
  %2 = icmp eq i8* %1, null
  br i1 %2, label %bb5.i, label %bb5

bb5.i:                                            ; preds = %start
  tail call void @_ZN5alloc3oom3oom17h5b02814f1abf9784E()
  unreachable

bb5:                                              ; preds = %start
  %3 = bitcast i8* %1 to i32*
  store i32 12, i32* %3, align 4
  %4 = getelementptr inbounds %Type, %Type* %var, i64 0, i32 0
  store i64 42, i64* %4, align 8
  %5 = getelementptr inbounds %Type, %Type* %var, i64 0, i32 4
  store i32 0, i32* %5, align 8
  %_2.sroa.0.0..sroa_idx = getelementptr inbounds %Type, %Type* %var, i64 0, i32 2, i32 0, i32 0, i32 0, i32 0
  %6 = bitcast i32** %_2.sroa.0.0..sroa_idx to i8**
  store i8* %1, i8** %6, align 8
  %_2.sroa.4.0..sroa_idx11 = getelementptr inbounds %Type, %Type* %var, i64 0, i32 2, i32 0, i32 2
  store i64 1, i64* %_2.sroa.4.0..sroa_idx11, align 8
  %_2.sroa.5.0..sroa_idx13 = getelementptr inbounds %Type, %Type* %var, i64 0, i32 2, i32 2
  store i64 1, i64* %_2.sroa.5.0..sroa_idx13, align 8
  call fastcc void @mut_update(%Type* nonnull dereferenceable(40) %var)
  %7 = bitcast %Type* %_other to i8*
  call void @llvm.lifetime.start(i64 40, i8* nonnull %7)
  %8 = bitcast %Type* %_9 to i8*
  call void @llvm.lifetime.start(i64 40, i8* nonnull %8)
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull %8, i8* nonnull %0, i64 40, i32 8, i1 false)
  call fastcc void @func_update(%Type* noalias nocapture nonnull sret dereferenceable(40) %_other, %Type* noalias nocapture nonnull dereferenceable(40) %_9)
  call void @llvm.lifetime.end(i64 40, i8* nonnull %8)
  %9 = getelementptr inbounds %Type, %Type* %_other, i64 0, i32 2, i32 0, i32 2
  %10 = load i64, i64* %9, align 8
  %not..i.i.i.i = icmp eq i64 %10, 0
  br i1 %not..i.i.i.i, label %bb6, label %bb6.i.i.i.i

bb6.i.i.i.i:                                      ; preds = %bb5
  %11 = getelementptr inbounds %Type, %Type* %_other, i64 0, i32 2
  %12 = shl i64 %10, 2
  %13 = bitcast %"collections::vec::Vec<u32>"* %11 to i8**
  %_3.sroa.0.0.copyload3.i1.i.i.i.i = load i8*, i8** %13, align 8, !alias.scope !1
  tail call void @__rust_deallocate(i8* %_3.sroa.0.0.copyload3.i1.i.i.i.i, i64 %12, i64 4) #4
  br label %bb6

bb6:                                              ; preds = %bb6.i.i.i.i, %bb5
  call void @llvm.lifetime.end(i64 40, i8* nonnull %7)
  call void @llvm.lifetime.end(i64 40, i8* nonnull %0)
  ret void
}

I know next to nothing about LLVM IR, so I don’t really understand what is going on in there. But I guess that this line is responsible for the unnecessary copy.

  call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull %8, i8* nonnull %0, i64 40, i32 8, i1 false)

I hope you people find a solution to this, because it is a huge loss in processor time to do almost thrice as much memory access as would be needed. And even if it is not relevant on such small a code, with a bigger code base, it must surely bloat memory usage compared to C for instance.

jonas-schievink commented 7 years ago

Inlining fixes this, but you've explicitly disabled it. Why not rely on that when code like this is on a hot path for you?

DominusCarnufex commented 7 years ago

Actually, no. This is the assembly generated for the relevant section when inlining is allowed to happen. I also added some println!() calls, because otherwise the whole code is optimized into /dev/null.

    mov dword ptr [rax], 12
    mov qword ptr [rsp + 8], 42
    mov qword ptr [rsp + 16], rax
    mov qword ptr [rsp + 24], 1
    mov qword ptr [rsp + 32], 1
    mov dword ptr [rsp + 40], 79 ; This is what `mut_update` becomes.
; Begin of `println!()`.
    lea rax, [rsp + 8]
    mov qword ptr [rsp + 88], rax
    lea r14, [rip + _ZN47_$LT$opti..Type$u20$as$u20$core..fmt..Debug$GT$3fmt17h06d64ac62be9bfa7E]
    mov qword ptr [rsp + 96], r14
    lea rbx, [rip + ref.9]
    mov qword ptr [rsp + 120], rbx
    mov qword ptr [rsp + 128], 2
    mov qword ptr [rsp + 136], 0
    lea rax, [rsp + 88]
    mov qword ptr [rsp + 152], rax
    mov qword ptr [rsp + 160], 1
.Ltmp0:
    lea rdi, [rsp + 120]
    call    _ZN3std2io5stdio6_print17h4d11beeeb2d08206E@PLT
; End of `println!()`.
.Ltmp1:
    mov rax, qword ptr [rsp + 8]
    mov qword ptr [rsp + 48], rax
    mov dword ptr [rsp + 80], 112
    mov rax, qword ptr [rsp + 32]
    mov qword ptr [rsp + 72], rax
    movups  xmm0, xmmword ptr [rsp + 16]
    movups  xmmword ptr [rsp + 56], xmm0
; Yep. Everything is still copied for no reason.

Moreover, a lot of code ends up in a library, where such functions will not be inlined, so we cannot always rely on inlining to solve our problems.

jonas-schievink commented 7 years ago

Interesting, can you share the exact Rust code that causes this?

Moreover, a lot of code ends up in a library, where such functions will not be inlined, so we cannot always rely on inlining to solve our problems.

#[inline] on those functions allows inlining.

Regardless, the proper fix is likely to augment the MIR optimizations to eliminate more moves (in case this isn't purely an LLVM issue).

leonardo-m commented 7 years ago

Also, not producing code (instead of relying on successive removal/optimization of it by LLVM) is what will make Rustc faster.

DominusCarnufex commented 7 years ago

@jonas-schievink : here is the full code of the inlined version.

#[derive(Debug)]
struct Type {
    field1 : u64,
    field2 : u32,
    field3 : Vec<u32>,
}

impl Type   {
    #[no_mangle]
    //#[inline(never)]
    fn mut_update(&mut self, field2 : u32) -> &mut Self {
        self.field2 = field2;
        self
    }

    #[no_mangle]
    //#[inline(never)]
    fn func_update(self, field2 : u32) -> Self  {
        Type { field2 : field2, .. self }
    }
}

fn main()   {
    let mut var = Type { field1 : 42, field2 : 0, field3 : vec![12] };

    let _ = var.mut_update(79);
    println!("{:?}", var);
    let _other = var.func_update(112);
    println!("{:?}", _other);
}

As you can see, almost nothing changed compared to the first version.

DominusCarnufex commented 7 years ago

I have been trying to understand better what is going on, so I reduced the code as much as possible.

#![crate_type = "rlib"]

struct Type {
    field1 : u64,
    field2 : u32,
}

impl Type   {
    fn mut_update(&mut self, field2 : u32) -> &mut Self {
        self.field2 = field2;
        self
    }

    fn func_update(self, field2 : u32) -> Self  {
        Type { field2 : field2, .. self }
    }
}

And when I use rustc --emit mir, here is what I get.

fn <impl at opti.rs:10:1: 23:2>::mut_update(_1: &mut Type, _2: u32) -> &mut Type {
    let mut _0: &mut Type;               // return pointer
    scope 1 {
        let _3: &mut Type;               // "self" in scope 1 at opti.rs:13:19: 13:28
        let _4: u32;                     // "field2" in scope 1 at opti.rs:13:30: 13:36
    }
    let mut _5: &mut Type;
    let mut _6: u32;

    bb0: {
        StorageLive(_3);                 // scope 0 at opti.rs:13:19: 13:28
        _3 = _1;                         // scope 0 at opti.rs:13:19: 13:28
        StorageLive(_4);                 // scope 0 at opti.rs:13:30: 13:36
        _4 = _2;                         // scope 0 at opti.rs:13:30: 13:36
        StorageLive(_5);                 // scope 1 at opti.rs:13:57: 16:6
        StorageLive(_6);                 // scope 1 at opti.rs:14:23: 14:29
        _6 = _4;                         // scope 1 at opti.rs:14:23: 14:29
        ((*_3).1: u32) = _6;             // scope 1 at opti.rs:14:9: 14:29
        StorageDead(_6);                 // scope 1 at opti.rs:14:29: 14:29
        _5 = _3;                         // scope 1 at opti.rs:15:9: 15:13
        _0 = _5;                         // scope 1 at opti.rs:13:57: 16:6
        StorageDead(_5);                 // scope 1 at opti.rs:16:6: 16:6
        StorageDead(_4);                 // scope 0 at opti.rs:16:6: 16:6
        StorageDead(_3);                 // scope 0 at opti.rs:16:6: 16:6
        return;                          // scope 1 at opti.rs:16:6: 16:6
    }
}

fn <impl at opti.rs:10:1: 23:2>::func_update(_1: Type, _2: u32) -> Type {
    let mut _0: Type;                    // return pointer
    scope 1 {
        let _3: Type;                    // "self" in scope 1 at opti.rs:20:20: 20:24
        let _4: u32;                     // "field2" in scope 1 at opti.rs:20:26: 20:32
    }
    let mut _5: u32;

    bb0: {
        StorageLive(_3);                 // scope 0 at opti.rs:20:20: 20:24
        _3 = _1;                         // scope 0 at opti.rs:20:20: 20:24
        StorageLive(_4);                 // scope 0 at opti.rs:20:26: 20:32
        _4 = _2;                         // scope 0 at opti.rs:20:26: 20:32
        StorageLive(_5);                 // scope 1 at opti.rs:21:25: 21:31
        _5 = _4;                         // scope 1 at opti.rs:21:25: 21:31
        _0 = Type { field1: (_3.0: u64), field2: _5 }; // scope 1 at opti.rs:21:9: 21:42
        StorageDead(_5);                 // scope 1 at opti.rs:21:42: 21:42
        StorageDead(_4);                 // scope 0 at opti.rs:22:6: 22:6
        StorageDead(_3);                 // scope 0 at opti.rs:22:6: 22:6
        return;                          // scope 1 at opti.rs:22:6: 22:6
    }
}

The first thing that strikes me is: why is the code so bloated? If I understand well what happens, this code should be exactly equivalent.

fn <impl at opti.rs:10:1: 23:2>::mut_update(_1: &mut Type, _2: u32) -> &mut Type {
    let mut _0: &mut Type;               // return pointer

    bb0: {
        ((*_1).1: u32) = _2;             // scope 1 at opti.rs:14:9: 14:29
        _0 = _1;                         // scope 1 at opti.rs:13:57: 16:6
        return;                          // scope 1 at opti.rs:16:6: 16:6
    }
}

fn <impl at opti.rs:10:1: 23:2>::func_update(_1: Type, _2: u32) -> Type {
    let mut _0: Type;                    // return pointer

    bb0: {
        _0 = Type { field1: (_1.0: u64), field2: _2 }; // scope 1 at opti.rs:21:9: 21:42
        return;                          // scope 1 at opti.rs:22:6: 22:6
    }
}

So if someone who knows Rust internals well comes here: is the full code useful in some way, and if yes, which one? Or should we try to make it a lot simpler?

Second thing, the simplified code makes it possible to imagine a way to solve my original problem, that is the useless data copying.

  1. If the argument arg is owned by the function.
  2. If a new variable var is created with arg as basis.
  3. If no read or write is performed on arg after the creation of var.
  4. Then, replace the creation of var by a reuse of arg, possibly modified.

This should solve the problem without introducing any new one.

Now, I am willing to try and create this optimization, but I need help because I have no idea where in the code I should introduce it.

philipc commented 7 years ago

-Z mir-opt-level=3 gives the following. Seems func_update is what you expected now. I think most of this will be due to the copy propagation pass.

fn <impl at opti.rs:8:1: 17:2>::mut_update(_1: &mut Type, _2: u32) -> &mut Type {
    let mut _0: &mut Type;               // return pointer
    scope 1 {
        let _3: &mut Type;               // "self" in scope 1 at opti.rs:9:19: 9:28
    }
    let mut _4: &mut Type;

    bb0: {
        StorageLive(_3);                 // scope 0 at opti.rs:9:19: 9:28
        _3 = _1;                         // scope 0 at opti.rs:9:19: 9:28
        nop;                             // scope 0 at opti.rs:9:30: 9:36
        nop;                             // scope 0 at opti.rs:9:30: 9:36
        StorageLive(_4);                 // scope 1 at opti.rs:9:57: 12:6
        nop;                             // scope 1 at opti.rs:10:23: 10:29
        nop;                             // scope 1 at opti.rs:10:23: 10:29
        ((*_3).1: u32) = _2;             // scope 1 at opti.rs:10:9: 10:29
        nop;                             // scope 1 at opti.rs:10:29: 10:29
        _4 = _3;                         // scope 1 at opti.rs:11:9: 11:13
        _0 = _4;                         // scope 1 at opti.rs:9:57: 12:6
        StorageDead(_4);                 // scope 1 at opti.rs:12:6: 12:6
        nop;                             // scope 0 at opti.rs:12:6: 12:6
        StorageDead(_3);                 // scope 0 at opti.rs:12:6: 12:6
        return;                          // scope 1 at opti.rs:12:6: 12:6
    }
}

fn <impl at opti.rs:8:1: 17:2>::func_update(_1: Type, _2: u32) -> Type {
    let mut _0: Type;                    // return pointer
    scope 1 {
    }

    bb0: {
        nop;                             // scope 0 at opti.rs:14:20: 14:24
        nop;                             // scope 0 at opti.rs:14:20: 14:24
        nop;                             // scope 0 at opti.rs:14:26: 14:32
        nop;                             // scope 0 at opti.rs:14:26: 14:32
        nop;                             // scope 1 at opti.rs:15:25: 15:31
        nop;                             // scope 1 at opti.rs:15:25: 15:31
        (_0.0: u64) = (_1.0: u64);       // scope 1 at opti.rs:15:9: 15:42
        (_0.1: u32) = _2;                // scope 1 at opti.rs:15:9: 15:42
        nop;                             // scope 1 at opti.rs:15:42: 15:42
        nop;                             // scope 0 at opti.rs:16:6: 16:6
        nop;                             // scope 0 at opti.rs:16:6: 16:6
        return;                          // scope 1 at opti.rs:16:6: 16:6
    }
}
steveklabnik commented 5 years ago

Small update, here's the code on godbolt: https://godbolt.org/z/BR-uGo

Looks like you still get the same asm today.

nikic commented 3 years ago

This looks non-actionable to me. All those copies look necessary from a black-box perspective. The one at the caller could be elided making use of the fact that the argument is actually readonly (and nocapture), but that's a property of the implementation, not of the ABI.