llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.
http://llvm.org
Other
29.07k stars 11.99k forks source link

Missed optimization: `udiv` and `urem` ignores assumptions, that can help to choose either 'DIV r/m32' or 'DIV r/m64' on x86-64. #115158

Open zheland opened 1 week ago

zheland commented 1 week ago

Currently udiv i64 i32 compiles to pseudocode like: if dividend <= u32::MAX { DIV r/m32 } else { DIV r/m64 }. That's great because DIV r/m32 is typically faster than DIV r/m64. However, in some cases LLVM could compile directly to the optimal variant using specified assumptions.

Examples:

- udiv i64 i32 should use DIV r/m64 if the compiler provides the assumption that the dividend won't fit into a 32-bit register.
- udiv i64 i32 should use DIV r/m32 if the compiler provides the assumption that the quotient will fit into a 32-bit register.

Same is applicable for all udiv, sdiv, urem, srem, and div_rem operations. Same is applicable for udiv i32 i16 with choosing between 'DIV r/m16' or 'DIV r/m32' (on x86). Same is applicable for udiv i64 i64 and udiv i128 i64 with choosing between 'DIV r/m64' or (__udivti3 and __umodti3) (see https://github.com/llvm/llvm-project/issues/6769).

As far as I understand, it is problematic to optimize such a thing in the language compiler (Clang or Rust) because LLVM does not provide asymmetric division instructions with different dividend, divisor, and quotient types.

Example on C

https://godbolt.org/z/j3c9KP7cx

#include <stdint.h>
#define uint128_t __uint128_t

// Optimized.
uint32_t div_u64_u32_default(uint64_t dividend, uint32_t divisor) {
    return (uint32_t)(dividend / (uint64_t)divisor);
}

// Optimized.
uint32_t div_u64_u32_if_div_le_u32_max(uint64_t dividend, uint32_t divisor) {
    if (dividend <= (uint64_t)UINT32_MAX) {
        return (uint32_t)(dividend / (uint64_t)divisor);
    } else {
        __builtin_unreachable();
    }
}

// Missed optimization.
uint32_t div_u64_u32_if_div_gt_u32_max(uint64_t dividend, uint32_t divisor) {
    if (dividend > (uint64_t)UINT32_MAX) {
        return (uint32_t)(dividend / (uint64_t)divisor);
    } else {
        __builtin_unreachable();
    }
}

// Missed optimization.
uint32_t div_u64_u32_if_quot_le_u32_max(uint64_t dividend, uint32_t divisor) {
    if ((uint32_t)(dividend >> 32) < divisor) {
        return (uint32_t)(dividend / (uint64_t)divisor);
    } else {
        __builtin_unreachable();
    }
}

// Optimized.
uint64_t div_u128_u64_default(uint128_t dividend, uint64_t divisor) {
    return (uint64_t)(dividend / (uint128_t)divisor);
}

// Optimized.
uint64_t div_u128_u64_if_div_le_u64_max(uint128_t dividend, uint64_t divisor) {
    if (dividend <= (uint128_t)UINT32_MAX) {
        return (uint64_t)(dividend / (uint128_t)divisor);
    } else {
        __builtin_unreachable();
    }
}

// Missed optimization.
uint64_t div_u128_u64_if_quot_le_u64_max(uint128_t dividend, uint64_t divisor) {
    if ((uint64_t)(dividend >> 64) < divisor) {
        return (uint64_t)(dividend / (uint128_t)divisor);
    } else {
        __builtin_unreachable();
    }
}

Similar example on Rust (click to expand)

https://godbolt.org/z/3qd6qMd7s ```rust use std::hint::assert_unchecked; // Optimized. #[no_mangle] pub fn div_u64_u32_default(dividend: u64, divisor: u32) -> u32 { unsafe { assert_unchecked(divisor > 0) }; (dividend / divisor as u64) as u32 } // Optimized. #[no_mangle] pub fn div_u64_u32_if_div_le_u32_max(dividend: u64, divisor: u32) -> u32 { unsafe { assert_unchecked(divisor > 0) }; unsafe { assert_unchecked(dividend <= u32::MAX as u64) }; (dividend / divisor as u64) as u32 } // Missed optimization. #[no_mangle] pub fn div_u64_u32_if_div_gt_u32_max(dividend: u64, divisor: u32) -> u32 { unsafe { assert_unchecked(divisor > 0) }; unsafe { assert_unchecked(dividend > u32::MAX as u64) }; (dividend / divisor as u64) as u32 } // Missed optimization. #[no_mangle] pub fn div_u64_u32_if_quot_le_u32_max(dividend: u64, divisor: u32) -> u32 { unsafe { assert_unchecked(divisor > 0) }; unsafe { assert_unchecked(((dividend >> 32) as u32) < divisor) }; (dividend / divisor as u64) as u32 } // Optimized. #[no_mangle] pub fn div_u128_u64_default(dividend: u128, divisor: u64) -> u64 { unsafe { assert_unchecked(divisor > 0) }; (dividend / divisor as u128) as u64 } // Optimized. #[no_mangle] pub fn div_u128_u64_if_div_le_u64_max(dividend: u128, divisor: u64) -> u64 { unsafe { assert_unchecked(divisor > 0) }; unsafe { assert_unchecked(dividend <= u64::MAX as u128) }; (dividend / divisor as u128) as u64 } // Missed optimization. #[no_mangle] pub fn div_u128_u64_if_quot_le_u64_max(dividend: u128, divisor: u64) -> u64 { unsafe { assert_unchecked(divisor > 0) }; unsafe { assert_unchecked(((dividend >> 64) as u64) < divisor) }; (dividend / divisor as u128) as u64 } ```

Produced assembly

x86-64 clang (trunk) -O3 or rust +nightly -C opt-level=3

; Optimized.
div_u64_u32_default:
        mov     rax, rdi
        mov     ecx, esi
        mov     rdx, rdi
        shr     rdx, 32
        je      .LBB0_1
        xor     edx, edx
        div     rcx
        ret
.LBB0_1:
        xor     edx, edx
        div     ecx
        ret

; Optimized.
div_u64_u32_if_div_le_u32_max:
        mov     rax, rdi
        xor     edx, edx
        div     esi
        ret

; Missed optimization: can use only 'DIV r/m64', should not try 'DIV r/m32'.
div_u64_u32_if_div_gt_u32_max:
        mov     rax, rdi
        mov     ecx, esi
        mov     rdx, rdi
        shr     rdx, 32
        je      .LBB2_1
        xor     edx, edx
        div     rcx
        ret
.LBB2_1:
        xor     edx, edx
        div     ecx
        ret

; Missed optimization: can use only 'DIV r/m32', should not try 'DIV r/m64'.
div_u64_u32_if_quot_le_u32_max:
        mov     rax, rdi
        mov     ecx, esi
        mov     rdx, rdi
        shr     rdx, 32
        je      .LBB3_1
        xor     edx, edx
        div     rcx
        ret
.LBB3_1:
        xor     edx, edx
        div     ecx
        ret

; Optimized.
div_u128_u64_default:
        push    rax
        xor     ecx, ecx
        call    __udivti3@PLT
        pop     rcx
        ret

; Optimized.
div_u128_u64_if_div_le_u64_max:
        mov     rcx, rdx
        mov     rax, rdi
        mov     rdx, rdi
        or      rdx, rcx
        shr     rdx, 32
        je      .LBB5_1
        xor     edx, edx
        div     rcx
        ret
.LBB5_1:
        xor     edx, edx
        div     ecx
        ret

; Missed optimization: can use only 'DIV r/m32' and 'DIV r/m64' without `__udivti3`.
div_u128_u64_if_quot_le_u64_max:
        push    rax
        xor     ecx, ecx
        call    __udivti3@PLT
        pop     rcx
        ret

Expected assembly

(well-optimized functions are skipped)

div_u64_u32_if_div_gt_u32_max:
        mov     rax, rdi
        mov     ecx, esi
        xor     edx, edx
        div     rcx
        ret

div_u64_u32_if_quot_le_u32_max:
        mov     rax, rdi
        xor     edx, edx
        div     esi
        ret

div_u128_u64_if_quot_le_u64_max:
        mov     rcx, rdx
        mov     rax, rdi
        mov     rdx, rdi
        or      rdx, rcx
        shr     rdx, 32
        je      .LBB5_1
        xor     edx, edx
        div     rcx
        ret
.LBB5_1:
        xor     edx, edx
        div     ecx
        ret
llvmbot commented 1 week ago

@llvm/issue-subscribers-backend-x86

Author: Andrey Zheleznov (zheland)

Currently `udiv i64 i32` compiles to a preudocode like: `if dividend <= u32::MAX { DIV r/m32 } else { DIV r/m64 }`. That's great because `DIV r/m32` is typically faster than `DIV r/m64`. However, in some cases LLVM could compile directly to the optimal variant using specified assumptions. Examples: - `udiv i64 i32` should use `DIV r/m64` if the compiler provides the assumption that the dividend won't fit into 32-bit register. - `udiv i64 i32` should use `DIV r/m32` if the compiler provides the assumption that the quotient will fit into 32-bit register. Same is applicable for all `udiv`, `sdiv`, `urem`, `srem`, and `div_rem` operations. Same is applicable for `udiv i32 i16` with choosing between 'DIV r/m16' or 'DIV r/m32' (on x86). Same is applicable for `udiv i64 i64` and `udiv i128 i64` with choosing between 'DIV r/m64' or (`__udivti3` and `__umodti3`) (see https://github.com/llvm/llvm-project/issues/6769). As far as I understand it is problematic to optimize such a thing in the language compiler (Clang or Rust) because the LLVM does not provide asymmetric division instructions with different dividend and divisor and quotient types. ### Example on C https://godbolt.org/z/j3c9KP7cx ```c #include <stdint.h> #define uint128_t __uint128_t // Optimized. uint32_t div_u64_u32_default(uint64_t dividend, uint32_t divisor) { return (uint32_t)(dividend / (uint64_t)divisor); } // Optimized. uint32_t div_u64_u32_if_div_le_u32_max(uint64_t dividend, uint32_t divisor) { if (dividend <= (uint64_t)UINT32_MAX) { return (uint32_t)(dividend / (uint64_t)divisor); } else { __builtin_unreachable(); } } // Missed optimization. uint32_t div_u64_u32_if_div_gt_u32_max(uint64_t dividend, uint32_t divisor) { if (dividend > (uint64_t)UINT32_MAX) { return (uint32_t)(dividend / (uint64_t)divisor); } else { __builtin_unreachable(); } } // Missed optimization. 
uint32_t div_u64_u32_if_quot_le_u32_max(uint64_t dividend, uint32_t divisor) { if ((uint32_t)(dividend >> 32) < divisor) { return (uint32_t)(dividend / (uint64_t)divisor); } else { __builtin_unreachable(); } } // Optimized. uint64_t div_u128_u64_default(uint128_t dividend, uint64_t divisor) { return (uint64_t)(dividend / (uint128_t)divisor); } // Optimized. uint64_t div_u128_u64_if_div_le_u64_max(uint128_t dividend, uint64_t divisor) { if (dividend <= (uint128_t)UINT32_MAX) { return (uint64_t)(dividend / (uint128_t)divisor); } else { __builtin_unreachable(); } } // Missed optimization. uint64_t div_u128_u64_if_quot_le_u64_max(uint128_t dividend, uint64_t divisor) { if ((uint64_t)(dividend >> 64) < divisor) { return (uint64_t)(dividend / (uint128_t)divisor); } else { __builtin_unreachable(); } } ``` <details> <summary><h3>Similar example on Rust (click to expand)</h3></summary> https://godbolt.org/z/3qd6qMd7s ```rust use std::hint::assert_unchecked; // Optimized. #[no_mangle] pub fn div_u64_u32_default(dividend: u64, divisor: u32) -> u32 { unsafe { assert_unchecked(divisor > 0) }; (dividend / divisor as u64) as u32 } // Optimized. #[no_mangle] pub fn div_u64_u32_if_div_le_u32_max(dividend: u64, divisor: u32) -> u32 { unsafe { assert_unchecked(divisor > 0) }; unsafe { assert_unchecked(dividend <= u32::MAX as u64) }; (dividend / divisor as u64) as u32 } // Missed optimization. #[no_mangle] pub fn div_u64_u32_if_div_gt_u32_max(dividend: u64, divisor: u32) -> u32 { unsafe { assert_unchecked(divisor > 0) }; unsafe { assert_unchecked(dividend > u32::MAX as u64) }; (dividend / divisor as u64) as u32 } // Missed optimization. #[no_mangle] pub fn div_u64_u32_if_quot_le_u32_max(dividend: u64, divisor: u32) -> u32 { unsafe { assert_unchecked(divisor > 0) }; unsafe { assert_unchecked(((dividend >> 32) as u32) < divisor) }; (dividend / divisor as u64) as u32 } // Optimized. 
#[no_mangle] pub fn div_u128_u64_default(dividend: u128, divisor: u64) -> u64 { unsafe { assert_unchecked(divisor > 0) }; (dividend / divisor as u128) as u64 } // Optimized. #[no_mangle] pub fn div_u128_u64_if_div_le_u64_max(dividend: u128, divisor: u64) -> u64 { unsafe { assert_unchecked(divisor > 0) }; unsafe { assert_unchecked(dividend <= u64::MAX as u128) }; (dividend / divisor as u128) as u64 } // Missed optimization. #[no_mangle] pub fn div_u128_u64_if_quot_le_u64_max(dividend: u128, divisor: u64) -> u64 { unsafe { assert_unchecked(divisor > 0) }; unsafe { assert_unchecked(((dividend >> 64) as u64) < divisor) }; (dividend / divisor as u128) as u64 } ``` </details> ### Produced assembly `x86-64 clang (trunk)` `-O3` or `rust +nightly` `-C opt-level=3` ```assembly ; Optimized. div_u64_u32_default: mov rax, rdi mov ecx, esi mov rdx, rdi shr rdx, 32 je .LBB0_1 xor edx, edx div rcx ret .LBB0_1: xor edx, edx div ecx ret ; Optimized. div_u64_u32_if_div_le_u32_max: mov rax, rdi xor edx, edx div esi ret ; Missed optimization: can use only 'DIV r/m64', should not try 'DIV r/m32'. div_u64_u32_if_div_gt_u32_max: mov rax, rdi mov ecx, esi mov rdx, rdi shr rdx, 32 je .LBB2_1 xor edx, edx div rcx ret .LBB2_1: xor edx, edx div ecx ret ; Missed optimization: can use only 'DIV r/m32', should not try 'DIV r/m64'. div_u64_u32_if_quot_le_u32_max: mov rax, rdi mov ecx, esi mov rdx, rdi shr rdx, 32 je .LBB3_1 xor edx, edx div rcx ret .LBB3_1: xor edx, edx div ecx ret ; Optimized. div_u128_u64_default: push rax xor ecx, ecx call __udivti3@PLT pop rcx ret ; Optimized. div_u128_u64_if_div_le_u64_max: mov rcx, rdx mov rax, rdi mov rdx, rdi or rdx, rcx shr rdx, 32 je .LBB5_1 xor edx, edx div rcx ret .LBB5_1: xor edx, edx div ecx ret ; Missed optimization: can use only 'DIV r/m32 and 'DIV r/m64' without `__udivti3`. 
div_u128_u64_if_quot_le_u64_max: push rax xor ecx, ecx call __udivti3@PLT pop rcx ret ``` ### Expected assembly (well-optimized functions are skipped) ```assembly div_u64_u32_if_div_gt_u32_max: mov rax, rdi mov ecx, esi xor edx, edx div rcx ret div_u64_u32_if_quot_le_u32_max: mov rax, rdi xor edx, edx div esi ret div_u128_u64_if_quot_le_u64_max: mov rcx, rdx mov rax, rdi mov rdx, rdi or rdx, rcx shr rdx, 32 je .LBB5_1 xor edx, edx div rcx ret .LBB5_1: xor edx, edx div ecx ret ```