llvm/llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.
http://llvm.org

suboptimal codegen for llvm.x86.addcarryx.u32 and u64 #40891

Open 54aefcd4-c07d-4252-8441-723563c8826f opened 5 years ago

54aefcd4-c07d-4252-8441-723563c8826f commented 5 years ago
Bugzilla Link 41546
Version trunk
OS All
CC @topperc, @RKSimon, @rotateright

Extended Description

Multi-precision (MP) multiplication is documented by Intel as one of the main use cases for the addcarryx intrinsics (https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/ia-large-integer-arithmetic-paper.pdf).

We implement this in Rust as:

use core::arch::x86_64::{_addcarryx_u64, _mulx_u64};
use core::hint::unreachable_unchecked;

#[target_feature(enable = "adx,bmi2")]
pub unsafe fn mp_mul_intrin(a: &[u64], b: &[u64], c: &mut [u64]) {
    let len = a.len();
    if len != b.len() {
        unreachable_unchecked();
    }
    if c.len() != len * 2 {
        unreachable_unchecked();
    }

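    // limbs are stored most-significant first; walk b from its least significant limb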
    for b_i in (0..len).rev() {
        let b_elem = *b.get_unchecked(b_i);
        let mut carry_lo = 0;
        let mut carry_hi = 0;
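        // two independent carry chains: carry_lo accumulates the low words and
        // carry_hi the high words; exactly the pattern adcx/adox (ADX) target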
        for a_i in (0..len).rev() {
            let mut hi = 0;
            let lo = _mulx_u64(*a.get_unchecked(a_i), b_elem, &mut hi);
            carry_lo = _addcarryx_u64(
                carry_lo,
                lo,
                *c.get_unchecked(b_i + a_i + 1),
                c.get_unchecked_mut(b_i + a_i + 1),
            );
            carry_hi = _addcarryx_u64(
                carry_hi,
                hi,
                *c.get_unchecked(b_i + a_i),
                c.get_unchecked_mut(b_i + a_i),
            );
        }
        *c.get_unchecked_mut(b_i) += carry_lo as u64;
    }
}
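
A safe wrapper would call it along these lines (a hypothetical caller, not part of the report): the asserts establish the preconditions that the unreachable_unchecked hints rely on, and runtime feature detection guards the unsafe call.

fn mp_mul(a: &[u64], b: &[u64]) -> Vec<u64> {
    assert_eq!(a.len(), b.len());
    // output must hold exactly 2 * len limbs, as assumed above
    let mut c = vec![0u64; a.len() * 2];
    assert!(is_x86_feature_detected!("adx") && is_x86_feature_detected!("bmi2"));
    unsafe { mp_mul_intrin(a, b, &mut c) };
    c
}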

mp_mul_intrin produces the following LLVM IR after optimizations (https://rust.godbolt.org/z/EJFEHB):

source_filename = "example.3a1fbbbh-cgu.0"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

define void @_ZN7example13mp_mul_intrin17ha949a97694eafb34E([0 x i64]* noalias nocapture nonnull readonly align 8 %a.0, i64 %a.1, [0 x i64]* noalias nocapture nonnull readonly align 8 %b.0, i64 %b.1, [0 x i64]* nocapture nonnull align 8 %c.0, i64 %c.1) unnamed_addr #0 personality i32 (...)* @rust_eh_personality {
start:
  %0 = icmp eq i64 %a.1, 0
  br i1 %0, label %bb15, label %bb14

bb14: ; preds = %start, %bb23
  %iter.sroa.4.040 = phi i64 [ %1, %bb23 ], [ %a.1, %start ]
  %1 = add i64 %iter.sroa.4.040, -1
  %2 = getelementptr inbounds [0 x i64], [0 x i64]* %b.0, i64 0, i64 %1
  %3 = load i64, i64* %2, align 8
  %4 = zext i64 %3 to i128
  br label %bb22

bb15: ; preds = %bb23, %start
  ret void

bb22: ; preds = %bb14, %bb22
  %carry_hi.039 = phi i8 [ 0, %bb14 ], [ %24, %bb22 ]
  %carry_lo.038 = phi i8 [ 0, %bb14 ], [ %19, %bb22 ]
  %iter2.sroa.4.037 = phi i64 [ %a.1, %bb14 ], [ %5, %bb22 ]
  %5 = add i64 %iter2.sroa.4.037, -1
  %6 = getelementptr inbounds [0 x i64], [0 x i64]* %a.0, i64 0, i64 %5
  %7 = load i64, i64* %6, align 8
  %8 = zext i64 %7 to i128
  %9 = mul nuw i128 %8, %4
  %10 = lshr i128 %9, 64
  %11 = trunc i128 %10 to i64
  %12 = trunc i128 %9 to i64
  %13 = add i64 %5, %1
  %14 = add i64 %iter2.sroa.4.037, %1
  %15 = getelementptr inbounds [0 x i64], [0 x i64]* %c.0, i64 0, i64 %14
  %16 = load i64, i64* %15, align 8
  %17 = tail call { i8, i64 } @llvm.x86.addcarry.64(i8 %carry_lo.038, i64 %12, i64 %16) #3
  %18 = extractvalue { i8, i64 } %17, 1
  store i64 %18, i64* %15, align 8
  %19 = extractvalue { i8, i64 } %17, 0
  %20 = getelementptr inbounds [0 x i64], [0 x i64]* %c.0, i64 0, i64 %13
  %21 = load i64, i64* %20, align 8
  %22 = tail call { i8, i64 } @llvm.x86.addcarry.64(i8 %carry_hi.039, i64 %11, i64 %21) #3
  %23 = extractvalue { i8, i64 } %22, 1
  store i64 %23, i64* %20, align 8
  %24 = extractvalue { i8, i64 } %22, 0
  %25 = icmp eq i64 %5, 0
  br i1 %25, label %bb23, label %bb22

bb23: ; preds = %bb22
  %26 = zext i8 %19 to i64
  %27 = getelementptr inbounds [0 x i64], [0 x i64]* %c.0, i64 0, i64 %1
  %28 = load i64, i64* %27, align 8
  %29 = add i64 %28, %26
  store i64 %29, i64* %27, align 8
  %30 = icmp eq i64 %1, 0
  br i1 %30, label %bb15, label %bb14
}

declare i32 @rust_eh_personality(...) unnamed_addr #1

declare { i8, i64 } @llvm.x86.addcarry.64(i8, i64, i64) #2

attributes #0 = { nounwind nonlazybind "probe-stack"="__rust_probestack" "target-cpu"="x86-64" "target-features"="+adx,+bmi2" }
attributes #1 = { nonlazybind "target-cpu"="x86-64" }
attributes #2 = { nounwind readnone }
attributes #3 = { nounwind }

!llvm.module.flags = !{!0}

!0 = !{i32 2, !"RtLibUseGOT", i32 1}

which gets compiled down to the following machine code. Note the addb $-1, %dil / setb %dil and addb $-1, %cl / setb %cl pairs in the inner loop: each carry is moved out of EFLAGS into a byte register and recreated on every iteration, rather than being kept live in CF and OF as adcx/adox would allow:

example::mp_mul_intrin:
  pushq %r15
  pushq %r14
  pushq %r12
  pushq %rbx
  testq %rsi, %rsi
  je .LBB0_5
  movq %rdx, %r9
  movq %rsi, %r10
  negq %r10
  movq %rsi, %rax
  shlq $4, %rax
  leaq (%r8,%rax), %r12
  addq $-8, %r12
  leaq (%rdi,%rsi,8), %r11
  addq $-8, %r11
.LBB0_2:
  leaq -1(%rsi), %r14
  movq -8(%r9,%rsi,8), %r15
  xorl %ebx, %ebx
  xorl %ecx, %ecx
  xorl %edi, %edi
.LBB0_3:
  movq %r15, %rax
  mulq (%r11,%rbx,8)
  addb $-1, %dil
  adcq %rax, (%r12,%rbx,8)
  setb %dil
  addb $-1, %cl
  adcq %rdx, -8(%r12,%rbx,8)
  setb %cl
  addq $-1, %rbx
  cmpq %rbx, %r10
  jne .LBB0_3
  movzbl %dil, %eax
  addq %rax, -8(%r8,%rsi,8)
  addq $-8, %r12
  movq %r14, %rsi
  testq %r14, %r14
  jne .LBB0_2
.LBB0_5:
  popq %rbx
  popq %r12
  popq %r14
  popq %r15
  retq

Implementing this operation with inline assembly yields the expected machine code:

#[inline(never)]
#[no_mangle]
pub fn mp_mul_asm(a: &[u64], b: &[u64]) -> Vec<u64> {
    unsafe {
        let len = a.len();
        assert_eq!(len, b.len());
        let mut c = vec![0; len * 2];

        asm!("
    # start iteration of `b_i`: `len`-1 downto 0
    lea -1($3), %rsi
1:  # every iteration of `b_i`
        # load `b_elem`
        mov ($1, %rsi, 8), %rdx
        # clear `carry_lo`, `carry_hi`
        xor %r9, %r9
        # start iteration of `a_i`+1: `len` downto 1
        mov $3, %rcx
        jmp 3f
2:          # the end of every iteration of `a_i`+1, except the last iteration
            # no displacement, RCX has already been decremented
            mov %r10, (%r11, %rcx, 8)
3:      # every iteration of `a_i`+1
            # compute c + b_i
            lea ($2, %rsi, 8), %r11
            # compute a[a_i]*b_elem
            mulx -8($0, %rcx, 8), %r8, %r9
            # add low word
            mov (%r11, %rcx, 8), %r10
            adcx %r8, %r10
            mov %r10, (%r11, %rcx, 8)
            # add high word
            mov -8(%r11, %rcx, 8), %r10
            adox %r9, %r10
            # this is done later to be able to add in last carry in last iteration of `a_i`+1
            # mov %r10, -8(%r11, %rcx, 8)
            # advance `a_i`+1
            lea -1(%rcx), %rcx
            jrcxz 4f
            jmp 2b
4:          # the end of the last iteration of `a_i`+1
            adc $$0, %r10
            mov %r10, (%r11)
        # advance `b_i`
        dec %rsi
        jns 1b
"
            :
            : "r"(a.as_ptr()), "r"(b.as_ptr()), "r"(c.as_mut_ptr()), "r"(len)
            : "rsi", "rcx", "rdx", "r8", "r9", "r10", "r11", "flags"
        );

        c
    }
}

Benchmarking both (https://github.com/rust-lang-nursery/stdsimd/issues/666#issuecomment-485065551) shows a significant performance difference: 390 ns/iteration for the inline-assembly version vs. 507 ns/iteration for the version using llvm.x86.addcarryx.u64.
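
A harness along the following lines reproduces that comparison; this is a sketch, and the 16-limb operand size is an assumption rather than the size used in the linked benchmark:

fn bench(iters: u32) {
    use std::time::Instant;
    let a = vec![0x1234_5678_9abc_def0u64; 16];
    let b = vec![0x0fed_cba9_8765_4321u64; 16];
    let mut c = vec![0u64; 32];
    let start = Instant::now();
    for _ in 0..iters {
        // lengths satisfy mp_mul_intrin's unchecked preconditions
        // (b.len == a.len, c.len == 2 * len); adx/bmi2 assumed present
        unsafe { mp_mul_intrin(&a, &b, &mut c) };
        std::hint::black_box(&c);
    }
    println!("{:?}/iteration", start.elapsed() / iters);
}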

It appears that LLVM always replaces llvm.x86.addcarryx.u64 with a polyfill based on llvm.x86.addcarry.64 (visible in the IR above) and then fails to emit adcx instructions.
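
Even a single call shows the lowering, as in this minimal sketch; per the behavior just described, it comes back as llvm.x86.addcarry.64 in the optimized IR and a plain adc in the machine code:

#[target_feature(enable = "adx")]
pub unsafe fn single_addcarryx(c_in: u8, a: u64, b: u64, out: &mut u64) -> u8 {
    // expected to lower through llvm.x86.addcarry.64 and emit adc, never adcx/adox
    core::arch::x86_64::_addcarryx_u64(c_in, a, b, out)
}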

topperc commented 5 years ago

Even that's not enough. We'd need to use jrcxz for the loop control and an lea for the index adjustment, since neither instruction writes EFLAGS. And we'd need to keep flags alive across basic block boundaries, which I don't think we usually do.

54aefcd4-c07d-4252-8441-723563c8826f commented 5 years ago

Hmm. Indeed. LLVM should also emit a proper mulx here for adcx to make sense.

topperc commented 5 years ago

The X86 backend isn't currently set up to model the C flag and O flag separately. We model all of the flags as one register. Because of this we can't interleave the flag dependencies. We would need to do something about that before it makes sense to implement _addcarryx_u64 as anything other than plain adc.
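
To illustrate the interleaving in question, a hand-written sketch (using today's asm! syntax, not from this thread): adcx reads and writes only CF, and adox only OF, so the two chains below are independent even though both instructions formally touch EFLAGS; a backend that models EFLAGS as a single register sees a false dependency between them.

use core::arch::asm;

#[target_feature(enable = "adx")]
unsafe fn two_chains(a: u64, b: u64, mut lo: u64, mut hi: u64) -> (u64, u64) {
    asm!(
        "xor {t:e}, {t:e}", // clears CF and OF together
        "adcx {lo}, {a}",   // chain 1: reads and writes CF only
        "adox {hi}, {b}",   // chain 2: reads and writes OF only
        "adcx {lo}, {a}",   // continues chain 1; OF untouched
        "adox {hi}, {b}",   // continues chain 2; CF untouched
        a = in(reg) a,
        b = in(reg) b,
        lo = inout(reg) lo,
        hi = inout(reg) hi,
        t = out(reg) _,
    );
    (lo, hi)
}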

chfast commented 9 months ago

Is it possible to have custom legalization of e.g. mul i256 that uses mulx, adcx and adox?
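
For reference, a portable sketch of the limb decomposition such a legalization would target (an illustration of the expansion, not a proposed implementation): a widening 128x128 -> 256-bit multiply splits into four mulx-style partial products, and an ADX-aware lowering would map the two accumulations below onto the independent CF and OF carry chains.

fn mul_u128_full(a: u128, b: u128) -> (u128, u128) {
    let mask = (1u128 << 64) - 1;
    let (a0, a1) = (a & mask, a >> 64);
    let (b0, b1) = (b & mask, b >> 64);
    // the four 64x64 -> 128 partial products that mulx would compute
    let p00 = a0 * b0;
    let p01 = a0 * b1;
    let p10 = a1 * b0;
    let p11 = a1 * b1;
    // middle column; these additions are the carry chains adcx/adox would keep
    let mid = (p00 >> 64) + (p01 & mask) + (p10 & mask);
    let lo = (p00 & mask) | (mid << 64);
    let hi = p11 + (p01 >> 64) + (p10 >> 64) + (mid >> 64);
    (lo, hi) // (low 128 bits, high 128 bits)
}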