rust-lang / rust

Empowering everyone to build reliable and efficient software.
https://www.rust-lang.org
Other
94.95k stars 12.24k forks source link

Three-way `match` compare mapping to immediates are lowered to a memory table load, not conditional moves, in size-optimized x86-64 assembly #127384

Open dead-claudia opened 5 days ago

dead-claudia commented 5 days ago

Godbolt link

I tried this code:

pub fn parse_u32_digit_u32_if(acc: u32, byte: u8) -> Option<u32> {
    let first_invalid = if acc < 429496729 {
        10
    } else if acc > 429496729 {
        0
    } else {
        6
    };
    let byte = byte as u32;
    let sub = if acc > 429496729 { byte } else { b'0' as u32 };
    let v = byte.wrapping_sub(sub);
    (v < first_invalid).then_some(acc.wrapping_mul(10).wrapping_add(v))
}

pub fn parse_u32_digit_u32_match(acc: u32, byte: u8) -> Option<u32> {
    let first_invalid = match acc.cmp(&429496729) {
        std::cmp::Ordering::Less => 10,
        std::cmp::Ordering::Equal => 6,
        std::cmp::Ordering::Greater => 0,
    };
    let byte = byte as u32;
    let sub = if acc > 429496729 { byte } else { b'0' as u32 };
    let v = byte.wrapping_sub(sub);
    (v < first_invalid).then_some(acc.wrapping_mul(10).wrapping_add(v))
}

I expected to see this happen: assembly for the second to be like the first and just use immediates:

parse_u32_digit_u32_if:
        xor     eax, eax
        cmp     edi, 429496729
        mov     ecx, 6
        cmovne  ecx, eax
        mov     edx, 10
        cmovae  edx, ecx
        movzx   ecx, sil
        add     ecx, -48
        cmp     edi, 429496730
        cmovae  ecx, eax
        xor     eax, eax
        cmp     ecx, edx
        setb    al
        lea     edx, [rdi + 4*rdi]
        lea     edx, [rcx + 2*rdx]
        ret

Instead, this happened: the following assembly, 20 bytes larger (54 to 74 bytes) and one instruction longer:

parse_u32_digit_u32_match:
        xor     ecx, ecx
        cmp     edi, 429496729
        setne   cl
        inc     ecx
        xor     eax, eax
        cmp     edi, 429496729
        cmovb   ecx, eax
        lea     rdx, [rip + .Lswitch.table.parse_u32_digit_u32_match]
        movzx   esi, sil
        add     esi, -48
        cmp     edi, 429496730
        cmovae  esi, eax
        xor     eax, eax
        cmp     esi, dword ptr [rdx + 4*rcx]
        setb    al
        lea     ecx, [rdi + 4*rdi]
        lea     edx, [rsi + 2*rcx]
        ret

.Lswitch.table.parse_u32_digit_u32_match:
        .long   10
        .long   6
        .long   0

Interestingly, if you move the u8 -> u32 cast down to the last point possible, it results in the same exact assembly emit:

pub fn parse_u32_digit_u8_if(acc: u32, byte: u8) -> Option<u32> {
    let first_invalid = if acc < 429496729 {
        10
    } else if acc > 429496729 {
        0
    } else {
        6
    };
    let sub = if acc > 429496729 { byte } else { b'0' };
    let v = byte.wrapping_sub(sub);
    (v < first_invalid).then_some(acc.wrapping_mul(10).wrapping_add(v as u32))
}

pub fn parse_u32_digit_u8_match(acc: u32, byte: u8) -> Option<u32> {
    let first_invalid = match acc.cmp(&429496729) {
        std::cmp::Ordering::Less => 10,
        std::cmp::Ordering::Equal => 6,
        std::cmp::Ordering::Greater => 0,
    };
    let sub = if acc > 429496729 { byte } else { b'0' };
    let v = byte.wrapping_sub(sub);
    (v < first_invalid).then_some(acc.wrapping_mul(10).wrapping_add(v as u32))
}
parse_u32_digit_u8_if:
        xor     eax, eax
        cmp     edi, 429496729
        mov     ecx, 6
        cmovne  ecx, eax
        mov     edx, 10
        cmovae  edx, ecx
        add     sil, -48
        cmp     edi, 429496730
        movzx   ecx, sil
        cmovae  ecx, eax
        xor     eax, eax
        cmp     cl, dl
        setb    al
        lea     edx, [rdi + 4*rdi]
        movzx   ecx, cl
        lea     edx, [rcx + 2*rdx]
        ret

Meta

rustc --version --verbose:

rustc 1.79.0 (129f3b996 2024-06-10)
binary: rustc
commit-hash: 129f3b9964af4d4a709d1383930ade12dfe7c081
commit-date: 2024-06-10
host: x86_64-unknown-linux-gnu
release: 1.79.0
LLVM version: 18.1.7
Compiler returned: 0
dead-claudia commented 5 days ago

The fascinating part about this is it didn't even attempt to compress the generated lookup table to 8-bit entries. That alone would've saved 9 of the 20 bytes difference.

So there's apparently at least two optimization bugs hiding here.

workingjubilee commented 5 days ago

@rustbot label: +C-optimization -C-bug +I-heavy +A-LLVM +A-codegen +T-compiler

veera-sivarajan commented 3 days ago

@rustbot label -needs-triage