fitzgen / bumpalo

A fast bump allocation arena for Rust
https://docs.rs/bumpalo
Apache License 2.0
1.41k stars 111 forks source link

Mark `alloc_layout_slow` as `#[cold]` #198

Closed Kmeakin closed 1 year ago

Kmeakin commented 1 year ago

Marking `alloc_layout_slow` as `#[cold]` slightly improves the assembly generated for calls to `bump.alloc(42)`:

Before:


# Compiler output for `playground::alloc` as quoted under "Before" in the issue,
# i.e. without `#[cold]` on `alloc_layout_slow`. Per the issue text this is the
# code generated for a call to `bump.alloc(42)`.
#
# In:   rdi is read as a pointer; it is still unchanged when alloc_layout_slow
#       is called, so it is also that call's first register argument.
# Out:  rax = address that the 32-bit value 42 was stored to.
# Uses: rcx, rsi, rdx, flags; 8 bytes of stack.
#
# Block layout: the slow-path call (.LBB3_2) is on the fall-through path, and
# the successful fast path has to jump away to .LBB3_4. The store/pop/ret tail
# therefore appears twice (after the slow call and again in .LBB3_4).
.section .text.playground::alloc,"ax",@progbits
        .globl  playground::alloc
        .p2align        4, 0x90
        .type   playground::alloc,@function
playground::alloc:
        .cfi_startproc
        # Moves rsp down by 8 (CFA offset becomes 16). The pushed value is never
        # read back: both return paths pop it into rcx and discard it.
        # Presumably this only keeps rsp 16-byte aligned for the calls below.
        push rax
        .cfi_def_cfa_offset 16

        # rcx = pointer stored at [rdi + 16]; rax = value stored at [rcx + 32].
        # NOTE(review): looks like "current chunk" and "current bump pointer",
        # but the listing alone does not establish that -- confirm in bumpalo.
        mov rcx, qword ptr [rdi + 16]
        mov rax, qword ptr [rcx + 32]

        # If rax < 4 (unsigned) the subtraction below would wrap: go slow.
        cmp rax, 4
        jb .LBB3_2

        # rax = (rax - 4) rounded down to a multiple of 4.
        add rax, -4
        and rax, -4

        # Unsigned compare against the value at [rcx]: if rax is at or above it
        # the fast path succeeded; otherwise fall through into the slow path.
        cmp rax, qword ptr [rcx]
        jae .LBB3_4

.LBB3_2:
        # Slow path: call alloc_layout_slow with esi = 4 and edx = 4
        # (presumably the size and alignment of the 4-byte value -- confirm).
        mov esi, 4
        mov edx, 4
        call qword ptr [rip + bumpalo::Bump::alloc_layout_slow@GOTPCREL]

        # A zero return value is routed to the oom call at .LBB3_3.
        test rax, rax
        je .LBB3_3

        # Store the value into the address returned by the slow path.
        mov dword ptr [rax], 42

        # Undo the entry push (value discarded) and return rax.
        pop rcx
        .cfi_def_cfa_offset 8
        ret

.LBB3_4:
        .cfi_def_cfa_offset 16
        # Fast-path success: write the new value back to [rcx + 32], then store
        # the value at that address.
        mov qword ptr [rcx + 32], rax
        mov dword ptr [rax], 42

        pop rcx

        .cfi_def_cfa_offset 8
        ret

.LBB3_3:
        .cfi_def_cfa_offset 16
        # Slow path returned zero: call bumpalo::oom. The ud2 after the call
        # raises an invalid-opcode trap if the call ever returns.
        call qword ptr [rip + bumpalo::oom@GOTPCREL]
        ud2

After:

# Compiler output for `playground::alloc` as quoted under "After" in the issue,
# i.e. with `alloc_layout_slow` marked `#[cold]`. Per the issue text this is the
# code generated for a call to `bump.alloc(42)`.
#
# In:   rdi is read as a pointer; it is still unchanged when alloc_layout_slow
#       is called, so it is also that call's first register argument.
# Out:  rax = address that the 32-bit value 42 was stored to.
# Uses: rcx, rsi, rdx, flags; 8 bytes of stack.
#
# Block layout: the successful fast path is straight-line fall-through from the
# entry to `ret`. Both failure checks branch to the out-of-line slow path at
# .LBB3_2, which jumps back to the shared store/pop/ret tail at .LBB3_5.
.section .text.playground::alloc,"ax",@progbits
        .globl  playground::alloc
        .p2align        4, 0x90
        .type   playground::alloc,@function
playground::alloc:

        .cfi_startproc
        # Moves rsp down by 8 (CFA offset becomes 16). The pushed value is never
        # read back: the return path pops it into rcx and discards it.
        # Presumably this only keeps rsp 16-byte aligned for the calls below.
        push rax
        .cfi_def_cfa_offset 16

        # rcx = pointer stored at [rdi + 16]; rax = value stored at [rcx + 32].
        # NOTE(review): looks like "current chunk" and "current bump pointer",
        # but the listing alone does not establish that -- confirm in bumpalo.
        mov rcx, qword ptr [rdi + 16]
        mov rax, qword ptr [rcx + 32]

        # If rax < 4 (unsigned) the subtraction below would wrap: go slow.
        cmp rax, 4
        jb .LBB3_2

        # rax = (rax - 4) rounded down to a multiple of 4.
        add rax, -4
        and rax, -4

        # Unsigned compare against the value at [rcx]: if rax is below it, take
        # the slow path; otherwise keep falling through.
        cmp rax, qword ptr [rcx]
        jb .LBB3_2

        # Fast-path success: write the new value back to [rcx + 32].
        mov qword ptr [rcx + 32], rax

.LBB3_5:
        # Shared tail, reached by fall-through or from the slow path: store the
        # value at the address in rax.
        mov dword ptr [rax], 42

        # Undo the entry push (value discarded) and return rax.
        pop rcx
        .cfi_def_cfa_offset 8
        ret

.LBB3_2:
        .cfi_def_cfa_offset 16
        # Slow path: call alloc_layout_slow with esi = 4 and edx = 4
        # (presumably the size and alignment of the 4-byte value -- confirm).
        mov esi, 4
        mov edx, 4
        call qword ptr [rip + bumpalo::Bump::alloc_layout_slow@GOTPCREL]

        # A non-zero return value rejoins the shared tail at .LBB3_5.
        test rax, rax
        jne .LBB3_5

        # Slow path returned zero: call bumpalo::oom. The ud2 after the call
        # raises an invalid-opcode trap if the call ever returns.
        call qword ptr [rip + bumpalo::oom@GOTPCREL]
        ud2

This shrinks the generated code for the call from 23 instructions to 20. The benchmarks showed a mix of small improvements and regressions, all within ±3%, so I assume any performance impact is lost in the noise.