rust-lang / rust

Fast algorithm for u128 (and i128) divided by small constant #54867

Open leonardo-m opened 6 years ago

leonardo-m commented 6 years ago

This is an enhancement request. While u16, u32, and u64 numbers get divided by a small constant divisor using a fast multiply-and-shift algorithm, the same isn't true for u128 numbers:

fn digits_sum0(mut n: u16) -> u16 {
    let mut total = 0;
    while n != 0 {
        total += n % 10;
        n /= 10;
    }
    total
}

fn digits_sum1(mut n: u32) -> u32 {
    let mut total = 0;
    while n != 0 {
        total += n % 10;
        n /= 10;
    }
    total
}

fn digits_sum2(mut n: u64) -> u64 {
    let mut total = 0;
    while n != 0 {
        total += n % 10;
        n /= 10;
    }
    total
}

fn digits_sum3(mut n: u128) -> u128 {
    let mut total = 0;
    while n != 0 {
        total += n % 10;
        n /= 10;
    }
    total
}

Generated asm (with -O):

digits_sum0:
        xor     eax, eax
        test    di, di
        je      .LBB0_2
.LBB0_1:
        movzx   ecx, di
        imul    edx, ecx, 52429
        shr     edx, 19
        lea     esi, [rdx + rdx]
        lea     esi, [rsi + 4*rsi]
        sub     edi, esi
        add     eax, edi
        mov     edi, edx
        cmp     ecx, 10
        jae     .LBB0_1
.LBB0_2:
        ret

digits_sum1:
        xor     eax, eax
        test    edi, edi
        je      .LBB0_3
        mov     r8d, 3435973837
.LBB0_2:
        mov     edx, edi
        imul    rdx, r8
        shr     rdx, 35
        lea     esi, [rdx + rdx]
        lea     esi, [rsi + 4*rsi]
        mov     ecx, edi
        sub     ecx, esi
        add     eax, ecx
        cmp     edi, 10
        mov     edi, edx
        jae     .LBB0_2
.LBB0_3:
        ret

digits_sum2:
        xor     ecx, ecx
        test    rdi, rdi
        je      .LBB1_3
        movabs  r8, -3689348814741910323
.LBB1_2:
        mov     rax, rdi
        mul     r8
        shr     rdx, 3
        lea     rax, [rdx + rdx]
        lea     rax, [rax + 4*rax]
        mov     rsi, rdi
        sub     rsi, rax
        add     rcx, rsi
        cmp     rdi, 10
        mov     rdi, rdx
        jae     .LBB1_2
.LBB1_3:
        mov     rax, rcx
        ret

digits_sum3:
        push    r15
        push    r14
        push    r13
        push    r12
        push    rbx
        mov     rax, rdi
        or      rax, rsi
        je      .LBB2_1
        mov     rbx, rsi
        mov     r15, rdi
        xor     r14d, r14d
        mov     r13d, 10
        xor     r12d, r12d
.LBB2_4:
        mov     edx, 10
        xor     ecx, ecx
        mov     rdi, r15
        mov     rsi, rbx
        call    __udivti3@PLT
        mov     rcx, rax
        mov     rsi, rdx
        mul     r13
        lea     rdi, [rsi + 4*rsi]
        lea     rdx, [rdx + 2*rdi]
        mov     rdi, r15
        sub     rdi, rax
        mov     rax, rbx
        sbb     rax, rdx
        add     r14, rdi
        adc     r12, rax
        cmp     r15, 10
        sbb     rbx, 0
        mov     r15, rcx
        mov     rbx, rsi
        jae     .LBB2_4
        jmp     .LBB2_2
.LBB2_1:
        xor     r14d, r14d
        xor     r12d, r12d
.LBB2_2:
        mov     rax, r14
        mov     rdx, r12
        pop     rbx
        pop     r12
        pop     r13
        pop     r14
        pop     r15
        ret

The faster algorithm is short enough that it could be added to rustc:

http://ridiculousfish.com/blog/posts/labor-of-division-episode-i.html
http://ridiculousfish.com/blog/posts/labor-of-division-episode-ii.html
http://libdivide.com/
http://ridiculousfish.com/blog/posts/labor-of-division-episode-iii.html
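
As a rough illustration (not code from this issue) of the multiply-by-reciprocal trick those articles describe, a u64 division by 10 can be done with the same magic constant that appears in the digits_sum2 asm above (-3689348814741910323, i.e. 0xCCCC_CCCC_CCCC_CCCD); div10_u64 is a made-up name:

fn div10_u64(x: u64) -> u64 {
    // floor(x / 10) == (x * ceil(2^67 / 10)) >> 67 for every u64 x;
    // the constant is ceil(2^67 / 10) == 0xCCCC_CCCC_CCCC_CCCD.
    const M: u128 = 0xCCCC_CCCC_CCCC_CCCD;
    ((x as u128 * M) >> 67) as u64
}

fn main() {
    // Spot check against the built-in operator.
    for x in [0u64, 9, 10, 99, 1_234_567_890_123_456_789, u64::MAX] {
        assert_eq!(div10_u64(x), x / 10);
    }
}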

nagisa commented 6 years ago

No point in doing it in rustc, if it should be done in LLVM.

leonardo-m commented 6 years ago

See also: https://crates.io/crates/specialized-div-rem

scottmcm commented 6 years ago

Do you have benchmarks that the multiply algorithm is actually faster for u128?

Note that the 32-bit one does

    imul    rcx, rax
    shr     rcx, 35

which is using a 64-bit multiply (two-argument form, with r registers).

So it's possible that the u128 version would need a 256-bit multiply, and it isn't obvious to me whether that, and the corresponding fixups, would be faster overall than __udivti3.
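
For concreteness, here is a rough sketch (not from this thread) of the building block in question, the upper 128 bits of a u128 x u128 product, assembled from four 64x64->128 partial products:

fn mulhi_u128(a: u128, b: u128) -> u128 {
    // Split both operands into 64-bit halves.
    let (a_lo, a_hi) = (a as u64 as u128, a >> 64);
    let (b_lo, b_hi) = (b as u64 as u128, b >> 64);

    let lo_lo = a_lo * b_lo; // contributes to bits 0..128
    let lo_hi = a_lo * b_hi; // contributes to bits 64..192
    let hi_lo = a_hi * b_lo; // contributes to bits 64..192
    let hi_hi = a_hi * b_hi; // contributes to bits 128..256

    // Sum the middle column (kept narrow enough not to overflow u128),
    // then fold its carry into the high half.
    let mid = (lo_lo >> 64) + (lo_hi as u64 as u128) + (hi_lo as u64 as u128);
    hi_hi + (lo_hi >> 64) + (hi_lo >> 64) + (mid >> 64)
}

fn main() {
    // 2^127 * 2 == 2^128, so the high half is exactly 1.
    assert_eq!(mulhi_u128(1u128 << 127, 2), 1);
    // Products of 64-bit values never reach the high half.
    assert_eq!(mulhi_u128(u64::MAX as u128, u64::MAX as u128), 0);
}

Whether this, plus the reciprocal fixups, actually beats a call to __udivti3 is exactly the benchmark question raised here.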

leonardo-m commented 6 years ago

I have no proof, but this is worth exploring. The specialized-div-rem crate shows that we could go much faster than the current div and rem operations.

nagisa commented 6 years ago

@scottmcm I’m fairly sure that it would need at most 3 multiplications and a few additions to calculate the value necessary for shifting.

AaronKutch commented 4 years ago

The fact that LLVM cannot reduce the division to multiplications by a constant might be caused by #44545, or it might be an independent issue. I extracted the small-divisor path from my algorithms, and the compiler was able to reduce the divisions to multiplications by a constant.

pub fn u128_div_rem_10(duo: u128) -> (u128, u128) {
    // Schoolbook long division: divide the top 64 bits directly, then process
    // the two remaining 32-bit chunks, carrying each remainder down. Every
    // intermediate dividend fits in a u64, so the backend can strength-reduce
    // each step to a multiply by a constant.
    let duo_hi = (duo >> 64) as u64;
    let div_0 = 10;
    let (quo_hi, rem_3) = (duo_hi / div_0, duo_hi % div_0);

    let duo_mid =
        ((duo >> 32) as u32 as u64)
        | (rem_3 << 32);
    let (quo_1, rem_2) = (duo_mid / div_0, duo_mid % div_0);

    let duo_lo =
        (duo as u32 as u64)
        | (rem_2 << 32);
    let (quo_0, rem_1) = (duo_lo / div_0, duo_lo % div_0);

    return (
        (quo_0 as u128)
        | ((quo_1 as u128) << 32)
        | ((quo_hi as u128) << 64),
        rem_1 as u128
    )
}

I have checked for correctness with my fuzz tester.
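
As a minimal spot check (separate from that fuzz tester, with arbitrarily chosen values), the function can be compared against the built-in operators:

fn main() {
    for duo in [0u128, 9, 10, u64::MAX as u128, u128::MAX - 1, u128::MAX] {
        // Both the quotient and the remainder must match the built-in division.
        assert_eq!(u128_div_rem_10(duo), (duo / 10, duo % 10));
    }
}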

nagisa commented 4 years ago

The fact that LLVM cannot reduce the division to multiplications by a constant might be caused

I investigated this recently for https://github.com/rust-lang/rust/pull/76017#discussion_r479699174.

It is a bug in LLVM, and it isn't that it is impossible for it to strength-reduce; it's just that doing so:

a) requires the upper half of the multiplication result (i.e. for 128-bit multiplication it requires the upper 128 bits of a 256-bit result); and
b) calculating that cannot easily be made less conservative, because there are a couple of bad backends in LLVM – namely RISCV and 32-bit ARM.

I think I know an alternative way to resolve it, though, just haven’t had the time to get back to it.


u64 numbers get divided by a small constant divisor using a fast algorithm

Turns out it isn’t on 32-bit targets!

nagisa commented 4 years ago

I’ve drafted an LLVM differential to fix this: https://reviews.llvm.org/D87976.

leonardo-m commented 3 years ago

In the meantime, GCC has implemented it:

pub fn reversed(mut n: u128) -> u128 { // In base 10.
    let mut reversed = 0;
    while n != 0 {
        reversed = reversed * 10 + n % 10;
        n /= 10;
    }
    reversed
}

Rustc 1.52.0-nightly (152f66092 2021-02-17):

reversed:
    push    rbp
    push    r15
    push    r14
    push    r13
    push    r12
    push    rbx
    sub     rsp, 8
    mov     rax, rdi
    or      rax, rsi
    je      .LBB0_1
    mov     rbx, rsi
    mov     r14, rdi
    xor     edx, edx
    mov     r15d, 10
    xor     ecx, ecx
.LBB0_4:
    mulx    rax, r13, r15
    lea     rcx, [rcx + 4*rcx]
    lea     r12, [rax + 2*rcx]
    mov     edx, 10
    mov     rdi, r14
    mov     rsi, rbx
    xor     ecx, ecx
    call    qword ptr [rip + __udivti3@GOTPCREL]
    mov     rsi, rdx
    mov     rdx, rax
    mulx    rcx, rdi, r15
    lea     rdx, [rsi + 4*rsi]
    lea     rbp, [rcx + 2*rdx]
    mov     rdx, r14
    sub     rdx, rdi
    mov     rcx, rbx
    sbb     rcx, rbp
    add     rdx, r13
    adc     rcx, r12
    cmp     r14, 10
    sbb     rbx, 0
    mov     r14, rax
    mov     rbx, rsi
    jae     .LBB0_4
    jmp     .LBB0_2
.LBB0_1:
    xor     edx, edx
    xor     ecx, ecx
.LBB0_2:
    mov     rax, rdx
    mov     rdx, rcx
    add     rsp, 8
    pop     rbx
    pop     r12
    pop     r13
    pop     r14
    pop     r15
    pop     rbp
    ret

__uint128_t reversed(__uint128_t n) {
    __uint128_t reversed = 0;
    while (n != 0) {
        reversed = reversed * 10 + n % 10;
        n /= 10;
    }
    return reversed;
}

GCC trunk 11.0.1 20210307 (experimental):

reversed(unsigned __int128):
    mov     r8, rsi
    mov     r9, rdi
    mov     rsi, rdi
    mov     rax, r8
    mov     rdi, r8
    or      rax, r9
    je      .L6
    push    r15
    xor     eax, eax
    xor     edx, edx
    mov     r15d, 10
    push    r14
    movabs  r14, -3689348814741910324
    push    r13
    xor     r13d, r13d
    push    r12
    movabs  r12, -3689348814741910323
    push    rbp
    push    rbx
.L5:
    imul    rcx, rdx, 10
    mul     r15
    mov     r9, rdx
    mov     r8, rax
    add     r9, rcx
    mov     rcx, rsi
    add     rcx, rdi
    adc     rcx, 0
    xor     r11d, r11d
    mov     rax, rcx
    mul     r12
    mov     rax, rdx
    and     rdx, -4
    shr     rax, 2
    add     rdx, rax
    mov     rax, rsi
    sub     rcx, rdx
    mov     rdx, rdi
    sub     rax, rcx
    mov     r10, rcx
    sbb     rdx, r11
    mov     rbp, rdx
    mov     rdx, rax
    imul    rdx, r14
    imul    rbp, r12
    add     rbp, rdx
    mul     r12
    mov     rcx, rax
    mov     rbx, rdx
    and     eax, 1
    mov     edx, 5
    mul     rdx
    add     rbx, rbp
    add     rax, r10
    adc     rdx, r11
    add     rax, r8
    mov     r8, rdi
    mov     rdi, rbx
    adc     rdx, r9
    mov     r9, rsi
    mov     rsi, rcx
    shr     rdi
    shrd    rsi, rbx, 1
    mov     ebx, 9
    cmp     rbx, r9
    mov     rbx, r13
    sbb     rbx, r8
    jc      .L5
    pop     rbx
    pop     rbp
    pop     r12
    pop     r13
    pop     r14
    pop     r15
    ret
.L6:
    mov     rax, r9
    mov     rdx, r8
    ret

nagisa commented 3 years ago

Yes, the primary concerns I've seen blocking the LLVM diff are:

There are 2 issues here that x86 in particular avoids:

  • Some targets don't really have a multiplier, like certain RISC-V variants. Or some targets have a multiply instruction that's hard to use here, like Cortex-M0.
  • The libcall is doing a ton of extra work to produce the result of an NxN->N multiply. We don't have the libcall variant we want here.

and the regression in codegen quality in certain corner cases.

The former can probably be resolved by some sort of a target property. The latter is harder, I suspect.

est31 commented 1 year ago

Btw, clang is also able to optimize this for C (godbolt):

#include "stdint.h"

uint64_t div10(uint64_t num) {
    return num / 10;
}

unsigned __int128 div10(unsigned __int128 num) {
    return num / 10;
}

gives:

```asm
div10(unsigned long): # @div10(unsigned long)
        mov     rax, rdi
        movabs  rcx, -3689348814741910323
        mul     rcx
        mov     rax, rdx
        shr     rax, 3
        ret
div10(unsigned __int128): # @div10(unsigned __int128)
        shrd    rdi, rsi, 1
        shr     rsi
        mov     rcx, rdi
        add     rcx, rsi
        adc     rcx, 0
        movabs  r8, -3689348814741910323
        mov     rax, rcx
        mul     r8
        shr     rdx, 2
        lea     rax, [rdx + 4*rdx]
        sub     rcx, rax
        sub     rdi, rcx
        sbb     rsi, 0
        movabs  rcx, -3689348814741910324
        mov     rax, rdi
        mul     r8
        imul    rcx, rdi
        add     rdx, rcx
        imul    r8, rsi
        add     rdx, r8
        ret
```

Still an issue for Rust though.
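
For reference, the Rust counterpart (a trivial sketch, not code from this comment) still lowered to a __udivti3 libcall at that point:

pub fn div10_u128(num: u128) -> u128 {
    // On stable rustc at the time of this comment, this still compiled to a
    // call to __udivti3 rather than the multiply sequence clang emits above.
    num / 10
}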

cc also #103126: the libcore size reductions from that PR are probably due to this issue (see also this zulip discussion).

nikic commented 1 year ago

Still an issue for Rust though.

[citation needed]

This should be fixed with the LLVM 16 update in nightly, and as far as I can tell it is: https://rust.godbolt.org/z/E85djEnTY

leonardo-m commented 1 year ago

Now Rustc nightly is able to do it for u128 (but not for i128).

workingjubilee commented 1 year ago

Comparing the idiv variant under clang

#include "stdint.h"

__int128 idiv10(__int128 num) {
    return num / 10;
}

and rustc:

pub fn idiv10(a: i128) -> i128 {
    a / 10
}

Both emit almost exactly the same code under opts:

example::idiv10:
        push    rax
        mov     edx, 10
        xor     ecx, ecx
        call    qword ptr [rip + __divti3@GOTPCREL] ; clang: call __divti3@PLT
        pop     rcx
        ret

est31 commented 1 year ago

@nikic my mistake, I compared stable rustc with clang trunk. Didn't check nightly rustc. I'm very happy that it's now optimized :laughing:.