llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.
http://llvm.org

Multi-precision addition using `__int128`/`i128` doesn't produce optimal code #83030

Open Eisenwave opened 7 months ago

Eisenwave commented 7 months ago

https://godbolt.org/z/vovWPxexE

Expected output

First, the expected "canonical" output, which we get both for _BitInt(512) additions and for __builtin_addcll, looks as follows:

using u128 = unsigned __int128;
using u64 = unsigned long long;

void add_adc(u64* __restrict dest, const u64* a, const u64* b)
{
    u64 carry = 0;
    for (int i = 0; i < 8; ++i) {
        dest[i] = __builtin_addcll(a[i], b[i], carry, &carry);
    }
}
add_adc(unsigned long long*, unsigned long long const*, unsigned long long const*): # @add_adc(unsigned long long*, unsigned long long const*, unsigned long long const*)
  mov rax, qword ptr [rsi]
  add rax, qword ptr [rdx]
  mov qword ptr [rdi], rax
  mov rax, qword ptr [rsi + 8]
  adc rax, qword ptr [rdx + 8]
  mov qword ptr [rdi + 8], rax
  mov rax, qword ptr [rsi + 16]
  adc rax, qword ptr [rdx + 16]
  mov qword ptr [rdi + 16], rax
  mov rax, qword ptr [rsi + 24]
  adc rax, qword ptr [rdx + 24]
  mov qword ptr [rdi + 24], rax
  mov rax, qword ptr [rsi + 32]
  adc rax, qword ptr [rdx + 32]
  mov qword ptr [rdi + 32], rax
  mov rax, qword ptr [rsi + 40]
  adc rax, qword ptr [rdx + 40]
  mov qword ptr [rdi + 40], rax
  mov rax, qword ptr [rsi + 48]
  adc rax, qword ptr [rdx + 48]
  mov qword ptr [rdi + 48], rax
  mov rax, qword ptr [rsi + 56]
  adc rax, qword ptr [rdx + 56]
  mov qword ptr [rdi + 56], rax
  ret

We get one adc per loop iteration, which is the theoretical optimum here.
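
For reference, the _BitInt(512) variant mentioned above can be written roughly like this (a sketch; the name and signature are illustrative):

typedef unsigned _BitInt(512) u512;

void add_bitint(u512* __restrict dest, const u512* a, const u512* b)
{
    // A single 512-bit addition; per the note above, _BitInt(512) addition
    // also yields the expected add/adc chain.
    *dest = *a + *b;
}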

Actual output for 128-bit integers

When attempting to get the same codegen with 128-bit addition, we don't get the optimum:

void add_128(u64* __restrict dest, const u64* a, const u64* b)
{
    u64 carry = 0;
    for (int i = 0; i < 8; ++i) {
        auto sum = u128(carry) + a[i] + b[i];
        dest[i] = sum;
        carry = sum >> 64;
    }
}

This compiles to:

add_128(unsigned long long*, unsigned long long const*, unsigned long long const*):
  mov rax, qword ptr [rsi]
  add rax, qword ptr [rdx]
  mov qword ptr [rdi], rax
  mov rax, qword ptr [rsi + 8]
  adc rax, 0
  setb cl
  movzx ecx, cl
  add rax, qword ptr [rdx + 8]
  mov qword ptr [rdi + 8], rax
  adc rcx, qword ptr [rsi + 16]
  setb al
  movzx eax, al
  add rcx, qword ptr [rdx + 16]
  mov qword ptr [rdi + 16], rcx
  adc rax, qword ptr [rsi + 24]
  setb cl
  movzx ecx, cl
  add rax, qword ptr [rdx + 24]
  mov qword ptr [rdi + 24], rax
  adc rcx, qword ptr [rsi + 32]
  setb al
  movzx eax, al
  add rcx, qword ptr [rdx + 32]
  mov qword ptr [rdi + 32], rcx
  adc rax, qword ptr [rsi + 40]
  setb cl
  movzx ecx, cl
  add rax, qword ptr [rdx + 40]
  mov qword ptr [rdi + 40], rax
  adc rcx, qword ptr [rsi + 48]
  setb al
  movzx eax, al
  add rcx, qword ptr [rdx + 48]
  mov qword ptr [rdi + 48], rcx
  adc rax, qword ptr [rsi + 56]
  add rax, qword ptr [rdx + 56]
  mov qword ptr [rdi + 56], rax
  ret

LLVM does emit adc here, but there are two additions per loop iteration (an adc followed by an add).

Basically, u128(carry) + a[i] + b[i] turns into:

  adc rcx, qword ptr [rsi + 16] ; TMP = u128(carry) + a[i]
  setb al                       ; carry = CARRY_FLAG
  movzx eax, al                 ; carry = ZERO_EXTEND(carry)
  add rcx, qword ptr [rdx + 16] ; SUM = TMP + b[i]
  mov qword ptr [rdi + 16], rcx ; dest[i] = SUM
  adc rax, qword ptr [rsi + 24] ; TMP = u128(carry) + a[i]
; ...

The generated code doesn't map onto this line by line and is somewhat interleaved, but the overall picture is the same: we need two additions per iteration, and instead of staying in the CPU's carry flag at all times, the carry is "spilled" into a register.
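
To make that structure explicit, here is a rough source-level rendering of what the generated code computes per iteration (a sketch, not Clang's actual lowering; the function name is illustrative, and u64 is the alias defined above):

void add_128_effective(u64* __restrict dest, const u64* a, const u64* b)
{
    u64 carry = 0;
    for (int i = 0; i < 8; ++i) {
        u64 tmp  = a[i] + carry;  // first addition (the adc in the listing)
        bool c1  = tmp < carry;   // carry out of the first addition (setb/movzx)
        u64 sum  = tmp + b[i];    // second addition (the add in the listing)
        bool c2  = sum < b[i];    // carry out of the second addition
        dest[i]  = sum;
        carry    = c1 | c2;       // the two carry-outs are mutually exclusive
    }
}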

Relevant passes

Prior to x86 instruction selection, the adc variant still uses two additions per iteration:

  %8 = tail call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %6, i64 %7)
  %9 = extractvalue { i64, i1 } %8, 1
  %10 = extractvalue { i64, i1 } %8, 0
  %11 = tail call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %10, i64 %5)
  %12 = extractvalue { i64, i1 } %11, 1
  %13 = extractvalue { i64, i1 } %11, 0

x86-isel combines these into:

  %4:gr64 = ADD64rm %3:gr64(tied-def 0), %2:gr64, 1, $noreg, 0, $noreg, implicit-def $eflags :: (load (s64) from %ir.b); example.cpp:18:19

The 128-bit variant prior to instruction selection looks like this, per iteration:

  %conv.1 = zext i1 %ov to i128
  %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 8
  %3 = load i64, ptr %arrayidx.1, align 8
  %conv1.1 = zext i64 %3 to i128
  %add.1 = add nuw nsw i128 %conv1.1, %conv.1
  %arrayidx3.1 = getelementptr inbounds i8, ptr %b, i64 8
  %4 = load i64, ptr %arrayidx3.1, align 8
  %conv4.1 = zext i64 %4 to i128

Presumably, to get the optimal codegen, the i128 addition would already have to be broken up into separate uadd.with.overflow calls prior to instruction selection.
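
At the source level, that decomposition corresponds roughly to the following (a sketch; __builtin_add_overflow is used here only to mirror the pair of uadd.with.overflow calls shown for the adc variant, and the function name is illustrative):

void add_decomposed(u64* __restrict dest, const u64* a, const u64* b)
{
    u64 carry = 0;
    for (int i = 0; i < 8; ++i) {
        u64 tmp, sum;
        // Two 64-bit additions with explicit carry-outs, matching the shape
        // of the uadd.with.overflow pairs in the adc variant's IR.
        bool c1 = __builtin_add_overflow(a[i], carry, &tmp);
        bool c2 = __builtin_add_overflow(tmp, b[i], &sum);
        dest[i] = sum;
        carry   = c1 | c2; // at most one of the two additions can carry
    }
}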

llvmbot commented 7 months ago

@llvm/issue-subscribers-backend-x86

Author: Jan Schultke (Eisenwave)
