Currently `udiv i64 i32` compiles to pseudocode like `if dividend <= u32::MAX { DIV r/m32 } else { DIV r/m64 }`.
That's great, because `DIV r/m32` is typically faster than `DIV r/m64`.
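For illustration, here is that current two-way dispatch written out as plain C. This is a hedged sketch of the shape of the generated code (the function name is invented here), not LLVM's actual implementation:

```c
#include <stdint.h>

// Sketch of the dispatch LLVM currently emits for u64 / u32:
// take the 32-bit division when the dividend's high half is zero,
// otherwise fall back to the full 64-bit division.
uint32_t div_u64_u32_dispatch(uint64_t dividend, uint32_t divisor) {
    if ((dividend >> 32) == 0) {
        return (uint32_t)dividend / divisor;              // lowers to DIV r/m32
    }
    return (uint32_t)(dividend / (uint64_t)divisor);      // lowers to DIV r/m64
}
```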
However, in some cases LLVM could compile directly to the optimal variant if suitable assumptions are provided.
Examples:
- `udiv i64 i32` should use `DIV r/m64` if the compiler is given the assumption that the dividend won't fit into a 32-bit register.
- `udiv i64 i32` should use `DIV r/m32` if the compiler is given the assumption that the quotient will fit into a 32-bit register (for `u64 / u32`, the quotient fits in 32 bits exactly when the high 32 bits of the dividend are less than the divisor, which is also the condition under which `DIV r/m32` does not fault); both desired lowerings are sketched after this list.
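
As a hedged illustration of what those two direct lowerings would look like, here are GCC/Clang inline-asm versions on x86-64. The function names are invented for this sketch, and the assumptions in the comments are unchecked preconditions; this is not LLVM's codegen:

```c
#include <stdint.h>

// Dividend is assumed NOT to fit in 32 bits: use DIV r/m64 directly.
static inline uint32_t div_u64_u32_wide(uint64_t dividend, uint32_t divisor) {
    uint64_t quot, rem;
    __asm__("divq %[d]"                       // RDX:RAX / divisor
            : "=a"(quot), "=d"(rem)
            : [d] "r"((uint64_t)divisor), "a"(dividend), "d"((uint64_t)0));
    (void)rem;
    return (uint32_t)quot;
}

// Quotient is assumed to fit in 32 bits: use DIV r/m32 directly.
// Precondition: (dividend >> 32) < divisor, otherwise DIV faults (#DE).
static inline uint32_t div_u64_u32_narrow(uint64_t dividend, uint32_t divisor) {
    uint32_t quot, rem;
    __asm__("divl %[d]"                       // EDX:EAX / divisor
            : "=a"(quot), "=d"(rem)
            : [d] "r"(divisor),
              "a"((uint32_t)dividend), "d"((uint32_t)(dividend >> 32)));
    (void)rem;
    return quot;
}
```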
The same applies to all `udiv`, `sdiv`, `urem`, `srem`, and `div_rem` operations.
The same applies to `udiv i32 i16`, choosing between `DIV r/m16` and `DIV r/m32` (on x86).
The same applies to `udiv i64 i64` and `udiv i128 i64`, choosing between `DIV r/m64` and the `__udivti3`/`__umodti3` library calls (see https://github.com/llvm/llvm-project/issues/6769); a sketch of the `DIV r/m64` variant for this case follows.
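For the 128-bit case, a single `DIV r/m64` divides the 128-bit value in `RDX:RAX` by a 64-bit operand and faults if the quotient overflows 64 bits, which is exactly what the quotient assumption rules out. A hedged inline-asm sketch (the name is invented here; this is not LLVM's codegen):

```c
#include <stdint.h>

// Assumes the quotient fits in 64 bits, i.e. (dividend >> 64) < divisor;
// if that does not hold, the DIV instruction raises #DE.
static inline uint64_t div_u128_u64_narrow(__uint128_t dividend, uint64_t divisor) {
    uint64_t quot, rem;
    __asm__("divq %[d]"                       // RDX:RAX / divisor
            : "=a"(quot), "=d"(rem)
            : [d] "r"(divisor),
              "a"((uint64_t)dividend), "d"((uint64_t)(dividend >> 64)));
    (void)rem;
    return quot;
}
```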
As far as I understand, it is problematic to optimize this in the language compiler (Clang or Rust), because LLVM does not provide asymmetric division instructions whose dividend, divisor, and quotient types differ.
### Example in C
https://godbolt.org/z/j3c9KP7cx
```c
#include <stdint.h>
typedef __uint128_t uint128_t;

// Optimized.
uint32_t div_u64_u32_default(uint64_t dividend, uint32_t divisor) {
    return (uint32_t)(dividend / (uint64_t)divisor);
}

// Optimized.
uint32_t div_u64_u32_if_div_le_u32_max(uint64_t dividend, uint32_t divisor) {
    if (dividend <= (uint64_t)UINT32_MAX) {
        return (uint32_t)(dividend / (uint64_t)divisor);
    } else {
        __builtin_unreachable();
    }
}

// Missed optimization.
uint32_t div_u64_u32_if_div_gt_u32_max(uint64_t dividend, uint32_t divisor) {
    if (dividend > (uint64_t)UINT32_MAX) {
        return (uint32_t)(dividend / (uint64_t)divisor);
    } else {
        __builtin_unreachable();
    }
}

// Missed optimization.
uint32_t div_u64_u32_if_quot_le_u32_max(uint64_t dividend, uint32_t divisor) {
    if ((uint32_t)(dividend >> 32) < divisor) {
        return (uint32_t)(dividend / (uint64_t)divisor);
    } else {
        __builtin_unreachable();
    }
}

// Optimized.
uint64_t div_u128_u64_default(uint128_t dividend, uint64_t divisor) {
    return (uint64_t)(dividend / (uint128_t)divisor);
}

// Optimized.
uint64_t div_u128_u64_if_div_le_u64_max(uint128_t dividend, uint64_t divisor) {
    if (dividend <= (uint128_t)UINT64_MAX) {
        return (uint64_t)(dividend / (uint128_t)divisor);
    } else {
        __builtin_unreachable();
    }
}

// Missed optimization.
uint64_t div_u128_u64_if_quot_le_u64_max(uint128_t dividend, uint64_t divisor) {
    if ((uint64_t)(dividend >> 64) < divisor) {
        return (uint64_t)(dividend / (uint128_t)divisor);
    } else {
        __builtin_unreachable();
    }
}
```
<details>
<summary><h3>Similar example in Rust (click to expand)</h3></summary>
https://godbolt.org/z/3qd6qMd7s
```rust
use std::hint::assert_unchecked;

// Optimized.
#[no_mangle]
pub fn div_u64_u32_default(dividend: u64, divisor: u32) -> u32 {
    unsafe { assert_unchecked(divisor > 0) };
    (dividend / divisor as u64) as u32
}

// Optimized.
#[no_mangle]
pub fn div_u64_u32_if_div_le_u32_max(dividend: u64, divisor: u32) -> u32 {
    unsafe { assert_unchecked(divisor > 0) };
    unsafe { assert_unchecked(dividend <= u32::MAX as u64) };
    (dividend / divisor as u64) as u32
}

// Missed optimization.
#[no_mangle]
pub fn div_u64_u32_if_div_gt_u32_max(dividend: u64, divisor: u32) -> u32 {
    unsafe { assert_unchecked(divisor > 0) };
    unsafe { assert_unchecked(dividend > u32::MAX as u64) };
    (dividend / divisor as u64) as u32
}

// Missed optimization.
#[no_mangle]
pub fn div_u64_u32_if_quot_le_u32_max(dividend: u64, divisor: u32) -> u32 {
    unsafe { assert_unchecked(divisor > 0) };
    unsafe { assert_unchecked(((dividend >> 32) as u32) < divisor) };
    (dividend / divisor as u64) as u32
}

// Optimized.
#[no_mangle]
pub fn div_u128_u64_default(dividend: u128, divisor: u64) -> u64 {
    unsafe { assert_unchecked(divisor > 0) };
    (dividend / divisor as u128) as u64
}

// Optimized.
#[no_mangle]
pub fn div_u128_u64_if_div_le_u64_max(dividend: u128, divisor: u64) -> u64 {
    unsafe { assert_unchecked(divisor > 0) };
    unsafe { assert_unchecked(dividend <= u64::MAX as u128) };
    (dividend / divisor as u128) as u64
}

// Missed optimization.
#[no_mangle]
pub fn div_u128_u64_if_quot_le_u64_max(dividend: u128, divisor: u64) -> u64 {
    unsafe { assert_unchecked(divisor > 0) };
    unsafe { assert_unchecked(((dividend >> 64) as u64) < divisor) };
    (dividend / divisor as u128) as u64
}
```
</details>
### Produced assembly
`x86-64 clang (trunk)` with `-O3`, or `rust +nightly` with `-C opt-level=3`:
```assembly
; Optimized.
div_u64_u32_default:
        mov rax, rdi
        mov ecx, esi
        mov rdx, rdi
        shr rdx, 32
        je .LBB0_1
        xor edx, edx
        div rcx
        ret
.LBB0_1:
        xor edx, edx
        div ecx
        ret

; Optimized.
div_u64_u32_if_div_le_u32_max:
        mov rax, rdi
        xor edx, edx
        div esi
        ret

; Missed optimization: can use 'DIV r/m64' alone, should not try 'DIV r/m32'.
div_u64_u32_if_div_gt_u32_max:
        mov rax, rdi
        mov ecx, esi
        mov rdx, rdi
        shr rdx, 32
        je .LBB2_1
        xor edx, edx
        div rcx
        ret
.LBB2_1:
        xor edx, edx
        div ecx
        ret

; Missed optimization: can use 'DIV r/m32' alone, should not try 'DIV r/m64'.
div_u64_u32_if_quot_le_u32_max:
        mov rax, rdi
        mov ecx, esi
        mov rdx, rdi
        shr rdx, 32
        je .LBB3_1
        xor edx, edx
        div rcx
        ret
.LBB3_1:
        xor edx, edx
        div ecx
        ret

; Optimized.
div_u128_u64_default:
        push rax
        xor ecx, ecx
        call __udivti3@PLT
        pop rcx
        ret

; Optimized.
div_u128_u64_if_div_le_u64_max:
        mov rcx, rdx
        mov rax, rdi
        mov rdx, rdi
        or rdx, rcx
        shr rdx, 32
        je .LBB5_1
        xor edx, edx
        div rcx
        ret
.LBB5_1:
        xor edx, edx
        div ecx
        ret

; Missed optimization: can use only 'DIV r/m32' and 'DIV r/m64', without the `__udivti3` call.
div_u128_u64_if_quot_le_u64_max:
        push rax
        xor ecx, ecx
        call __udivti3@PLT
        pop rcx
        ret
```
### Expected assembly
(well-optimized functions are skipped)
```assembly
div_u64_u32_if_div_gt_u32_max:
        mov rax, rdi
        mov ecx, esi
        xor edx, edx
        div rcx
        ret

div_u64_u32_if_quot_le_u32_max:
        mov rax, rdi
        xor edx, edx
        div esi
        ret

div_u128_u64_if_quot_le_u64_max:
        mov rcx, rdx
        mov rax, rdi
        mov rdx, rdi
        or rdx, rcx
        shr rdx, 32
        je .LBB5_1
        xor edx, edx
        div rcx
        ret
.LBB5_1:
        xor edx, edx
        div ecx
        ret
```