Failure to vectorize mixture of integer multiplies and shift lefts #30920

Open RKSimon opened 7 years ago

RKSimon commented 7 years ago
Bugzilla Link 31572
Version trunk
OS All
CC @alexey-bataev, @dtemirbulatov, @rotateright

Extended Description

We can vectorize pure integer multiplies quite easily:

void mul_4i32(int * __restrict dst, const int * __restrict src) {
  *dst++ = *src++ * 257;
  *dst++ = *src++ * -3;
  *dst++ = *src++ * -2;
  *dst++ = *src++ * -9;
}
mul_4i32:
        vmovdqu (%rsi), %xmm0
        vpmulld .LCPI0_0(%rip), %xmm0, %xmm0
        vmovdqu %xmm0, (%rdi)
        retq

But this fails if some of the scalar multiplies can be folded into left shifts instead:

void mul_as_shift_4i32(int * __restrict dst, const int * __restrict src) {
  *dst++ = *src++ * 257;
  *dst++ = *src++ * -3;
  *dst++ = *src++ * 2;
  *dst++ = *src++ * -9;
}
mul_as_shift_4i32:
        movl    (%rsi), %eax
        movl    8(%rsi), %edx
        movl    %eax, %ecx
        addl    %edx, %edx
        shll    $8, %ecx
        addl    %eax, %ecx
        movl    %ecx, (%rdi)
        imull   $-3, 4(%rsi), %ecx
        movl    %ecx, 4(%rdi)
        imull   $-9, 12(%rsi), %ecx
        movl    %edx, 8(%rdi)
        movl    %ecx, 12(%rdi)
        retq

Similarly, we should be able to vectorize a mixture of integer multiplies and left shifts:

void mul_and_shift_4i32(int * __restrict dst, const int * __restrict src) {
  *dst++ = *src++ * 257;
  *dst++ = *src++ * -3;
  *dst++ = *src++ << 4;
  *dst++ = *src++ * -9;
}
mul_and_shift_4i32:
        movl    (%rsi), %eax
        movl    8(%rsi), %edx
        movl    %eax, %ecx
        shll    $4, %edx
        shll    $8, %ecx
        addl    %eax, %ecx
        movl    %ecx, (%rdi)
        imull   $-3, 4(%rsi), %ecx
        movl    %ecx, 4(%rdi)
        imull   $-9, 12(%rsi), %ecx
        movl    %edx, 8(%rdi)
        movl    %ecx, 12(%rdi)
        retq
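
For reference, the hoped-for end state for mul_and_shift_4i32 is a single v4i32 multiply, with the '<< 4' lane folded back to a multiply by 16. The IR below is a hand-written sketch of that goal (function name and exact form are illustrative, not compiler output):

; Hypothetical target IR: all four lanes share one opcode, with the shl lane
; rewritten as a multiply by 16.
define void @mul_and_shift_4i32_expected(i32* %dst, i32* noalias nocapture readonly %src) {
entry:
  %0 = bitcast i32* %src to <4 x i32>*
  %1 = load <4 x i32>, <4 x i32>* %0, align 4
  %2 = mul nsw <4 x i32> <i32 257, i32 -3, i32 16, i32 -9>, %1
  %3 = bitcast i32* %dst to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret void
}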
davidbolvansky commented 2 years ago

mentioned in issue llvm/llvm-bugzilla-archive#50256

RKSimon commented 6 years ago

Between the recent SLPVectorizer improvements (SK_Select etc.) and Sanjay's work on [Bug #37806], the original test cases now all vectorize to v4i32 multiplies.

The next step will be to get the missing_mul case from [Comment #2] working: SLP only uses the 'alternate' opcode pattern for binary operators, so relaxing that restriction somehow should help.
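
A hand-written sketch of what that relaxed matching could produce for missing_mul (the lane with no math op simply becomes an identity multiply by 1; this is illustrative, not actual SLP output):

; Hypothetical goal for missing_mul: the lane with only a load/store still fits
; the single v4i32 multiply, so only the opcode matching needs relaxing.
define void @missing_mul_expected(i32* %dst, i32* noalias nocapture readonly %src) {
entry:
  %0 = bitcast i32* %src to <4 x i32>*
  %1 = load <4 x i32>, <4 x i32>* %0, align 4
  %2 = mul nsw <4 x i32> <i32 257, i32 -3, i32 1, i32 -9>, %1
  %3 = bitcast i32* %dst to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret void
}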

The missing_load_and_mul case is a lot trickier; it reminds me of [Bug #21780], which Dinar was looking at in D37648.

rotateright commented 7 years ago

Patch for review: https://reviews.llvm.org/D28907

I didn't look closely at the patch, so I'm not sure if it handles this particular example or the 'no-load' example in comment 2.

rotateright commented 7 years ago

The translation back to mul from shift is hopefully not too hard, but if we want/need to solve this in general, the solution would have to account for:

void missing_mul(int * __restrict dst, const int * __restrict src) {
  *dst++ = *src++ * 257;
  *dst++ = *src++ * -3;
  *dst++ = *src++ * 1;   // <--- the load/store are here, but there is no math op
  *dst++ = *src++ * -9;
}

void missing_load_and_mul(int * __restrict dst, const int * __restrict src) {
  *dst++ = *src++ * 257;
  *dst++ = *src++ * -3;
  *dst++ = *src++ * 0;   // <--- there is no load or math op in this case
  *dst++ = *src++ * -9;
}

The 2nd case is similar to bug 21780 because we want to preserve info about the safety of loading extra bytes from a deleted load.
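
To illustrate the problem, here is a hand-written sketch of the vectorized form missing_load_and_mul would need (not output from any patch): the multiply-by-0 lane is easy, but the <4 x i32> load reads src[2], which the scalar code never touches, so that widening is only legal if the extra bytes are known dereferenceable.

; Hypothetical goal for missing_load_and_mul. The dereferenceable(16) attribute
; stands in for whatever proof we would need that reading all 16 bytes is safe.
define void @missing_load_and_mul_expected(i32* %dst, i32* noalias nocapture readonly dereferenceable(16) %src) {
entry:
  %0 = bitcast i32* %src to <4 x i32>*
  %1 = load <4 x i32>, <4 x i32>* %0, align 4
  %2 = mul nsw <4 x i32> <i32 257, i32 -3, i32 0, i32 -9>, %1
  %3 = bitcast i32* %dst to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret void
}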

rotateright commented 7 years ago

The first example is vectorized by the SLP vectorizer, so the solution should be to make the pattern recognition in there more flexible.

I.e., this works:

target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.12.0"

define void @mul_as_shift_4i32(i32* %dst, i32* noalias nocapture readonly %src) {
entry:
  %incdec.ptr = getelementptr inbounds i32, i32* %src, i64 1
  %0 = load i32, i32* %src, align 4
  %mul = mul nsw i32 %0, 257
  %incdec.ptr1 = getelementptr inbounds i32, i32* %dst, i64 1
  store i32 %mul, i32* %dst, align 4
  %incdec.ptr2 = getelementptr inbounds i32, i32* %src, i64 2
  %1 = load i32, i32* %incdec.ptr, align 4
  %mul3 = mul nsw i32 %1, -3
  %incdec.ptr4 = getelementptr inbounds i32, i32* %dst, i64 2
  store i32 %mul3, i32* %incdec.ptr1, align 4
  %incdec.ptr5 = getelementptr inbounds i32, i32* %src, i64 3
  %2 = load i32, i32* %incdec.ptr2, align 4
  %mul6 = mul nsw i32 %2, -2
  %incdec.ptr7 = getelementptr inbounds i32, i32* %dst, i64 3
  store i32 %mul6, i32* %incdec.ptr4, align 4
  %3 = load i32, i32* %incdec.ptr5, align 4
  %mul9 = mul nsw i32 %3, -9
  store i32 %mul9, i32* %incdec.ptr7, align 4
  ret void
}

$ ./opt -slp-vectorizer 31572.ll -S
define void @mul_as_shift_4i32(i32* %dst, i32* noalias nocapture readonly %src) {
entry:
  %incdec.ptr = getelementptr inbounds i32, i32* %src, i64 1
  %incdec.ptr1 = getelementptr inbounds i32, i32* %dst, i64 1
  %incdec.ptr2 = getelementptr inbounds i32, i32* %src, i64 2
  %incdec.ptr4 = getelementptr inbounds i32, i32* %dst, i64 2
  %incdec.ptr5 = getelementptr inbounds i32, i32* %src, i64 3
  %incdec.ptr7 = getelementptr inbounds i32, i32* %dst, i64 3
  %0 = bitcast i32* %src to <4 x i32>*
  %1 = load <4 x i32>, <4 x i32>* %0, align 4
  %2 = mul nsw <4 x i32> <i32 257, i32 -3, i32 -2, i32 -9>, %1
  %3 = bitcast i32* %dst to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret void
}

But this does not work:

define void @mul_as_shift_4i32(i32* %dst, i32* noalias nocapture readonly %src) {
entry:
  %incdec.ptr = getelementptr inbounds i32, i32* %src, i64 1
  %0 = load i32, i32* %src, align 4
  %mul = mul nsw i32 %0, 257
  %incdec.ptr1 = getelementptr inbounds i32, i32* %dst, i64 1
  store i32 %mul, i32* %dst, align 4
  %incdec.ptr2 = getelementptr inbounds i32, i32* %src, i64 2
  %1 = load i32, i32* %incdec.ptr, align 4
  %mul3 = mul nsw i32 %1, -3
  %incdec.ptr4 = getelementptr inbounds i32, i32* %dst, i64 2
  store i32 %mul3, i32* %incdec.ptr1, align 4
  %incdec.ptr5 = getelementptr inbounds i32, i32* %src, i64 3
  %2 = load i32, i32* %incdec.ptr2, align 4
  %mul6 = shl nsw i32 %2, 1    ; MUL REPLACED BY EQUIVALENT SHL
  %incdec.ptr7 = getelementptr inbounds i32, i32* %dst, i64 3
  store i32 %mul6, i32* %incdec.ptr4, align 4
  %3 = load i32, i32* %incdec.ptr5, align 4
  %mul9 = mul nsw i32 %3, -9
  store i32 %mul9, i32* %incdec.ptr7, align 4
  ret void
}
RKSimon commented 7 years ago

assigned to @alexey-bataev

xgupta commented 1 year ago

This seems to be fixed for both cases mentioned in the description:

  1. https://gcc.godbolt.org/z/967cP51MK
  2. https://gcc.godbolt.org/z/5Yzz6Mfed
RKSimon commented 1 year ago

The follow-up cases, however, are not:

void missing_mul(int * __restrict dst, const int * __restrict src) {
  *dst++ = *src++ * 257;
  *dst++ = *src++ * -3;
  *dst++ = *src++ * 1;   //<--- the load/store are here, but there is no math op
  *dst++ = *src++ * -9;
}

void missing_load_and_mul(int * __restrict dst, const int * __restrict src) {
  *dst++ = *src++ * 257;
  *dst++ = *src++ * -3;
  *dst++ = *src++ * 0;  // <--- there is no load or math op in this case
  *dst++ = *src++ * -9;
}