Open RKSimon opened 7 years ago
mentioned in issue llvm/llvm-bugzilla-archive#50256
Between the recent SLPVectorizer improvements (SK_Select etc.) and Sanjay's work on [Bug #37806], the original test cases now all vectorize to v4i32 multiplies.
The next step will be to get the missing_mul case from [Comment #2] working - SLP only uses the 'alternate' opcode pattern for binary operators, relaxing that somehow should help.
The missing_load_and_mul case is a lot trickier, it reminds me of [Bug #21780] which Dinar was looking at in D37648.
Patch for review: https://reviews.llvm.org/D28907
I didn't look closely at the patch, so I'm not sure if it handles this particular example or the 'no-load' example in comment 2.
The translation back to mul from shift is hopefully not too hard, but if we want/need to solve this in general, the solution would have to account for:
void missing_mul(int * __restrict dst, const int * __restrict src) {
*dst++ = *src++ * 257;
*dst++ = *src++ * -3;
*dst++ = *src++ * 1; // <--- the load/store are here, but there is no math op
*dst++ = *src++ * -9;
}
void missing_load_and_mul(int * __restrict dst, const int * __restrict src) {
*dst++ = *src++ * 257;
*dst++ = *src++ * -3;
*dst++ = *src++ * 0; // <--- there is no load or math op in this case
*dst++ = *src++ * -9;
}
The 2nd case is similar to bug 21780 because we want to preserve info about the safety of loading extra bytes from a deleted load.
The first example is vectorized by the SLP vectorizer, so the solution should be to make the pattern recognition in there more flexible.
Ie, this works:
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.12.0"
define void @mul_as_shift_4i32(i32* %dst, i32* noalias nocapture readonly %src) {
entry:
%incdec.ptr = getelementptr inbounds i32, i32* %src, i64 1
%0 = load i32, i32* %src, align 4
%mul = mul nsw i32 %0, 257
%incdec.ptr1 = getelementptr inbounds i32, i32* %dst, i64 1
store i32 %mul, i32* %dst, align 4
%incdec.ptr2 = getelementptr inbounds i32, i32* %src, i64 2
%1 = load i32, i32* %incdec.ptr, align 4
%mul3 = mul nsw i32 %1, -3
%incdec.ptr4 = getelementptr inbounds i32, i32* %dst, i64 2
store i32 %mul3, i32* %incdec.ptr1, align 4
%incdec.ptr5 = getelementptr inbounds i32, i32* %src, i64 3
%2 = load i32, i32* %incdec.ptr2, align 4
%mul6 = mul nsw i32 %2, -2
%incdec.ptr7 = getelementptr inbounds i32, i32* %dst, i64 3
store i32 %mul6, i32* %incdec.ptr4, align 4
%3 = load i32, i32* %incdec.ptr5, align 4
%mul9 = mul nsw i32 %3, -9
store i32 %mul9, i32* %incdec.ptr7, align 4
ret void
}
$ ./opt -slp-vectorizer 31572.ll -S
define void @mul_as_shift_4i32(i32* %dst, i32* noalias nocapture readonly %src) {
entry:
%incdec.ptr = getelementptr inbounds i32, i32* %src, i64 1
%incdec.ptr1 = getelementptr inbounds i32, i32* %dst, i64 1
%incdec.ptr2 = getelementptr inbounds i32, i32* %src, i64 2
%incdec.ptr4 = getelementptr inbounds i32, i32* %dst, i64 2
%incdec.ptr5 = getelementptr inbounds i32, i32* %src, i64 3
%incdec.ptr7 = getelementptr inbounds i32, i32* %dst, i64 3
%0 = bitcast i32* %src to <4 x i32>*
%1 = load <4 x i32>, <4 x i32>* %0, align 4
%2 = mul nsw <4 x i32> <i32 257, i32 -3, i32 -2, i32 -9>, %1
%3 = bitcast i32* %dst to <4 x i32>*
store <4 x i32> %2, <4 x i32>* %3, align 4
ret void
}
But this does not work:
define void @mul_as_shift_4i32(i32* %dst, i32* noalias nocapture readonly %src) {
entry:
%incdec.ptr = getelementptr inbounds i32, i32* %src, i64 1
%0 = load i32, i32* %src, align 4
%mul = mul nsw i32 %0, 257
%incdec.ptr1 = getelementptr inbounds i32, i32* %dst, i64 1
store i32 %mul, i32* %dst, align 4
%incdec.ptr2 = getelementptr inbounds i32, i32* %src, i64 2
%1 = load i32, i32* %incdec.ptr, align 4
%mul3 = mul nsw i32 %1, -3
%incdec.ptr4 = getelementptr inbounds i32, i32* %dst, i64 2
store i32 %mul3, i32* %incdec.ptr1, align 4
%incdec.ptr5 = getelementptr inbounds i32, i32* %src, i64 3
%2 = load i32, i32* %incdec.ptr2, align 4
%mul6 = shl nsw i32 %2, 1 ; MUL REPLACED BY EQUIVALENT SHL
%incdec.ptr7 = getelementptr inbounds i32, i32* %dst, i64 3
store i32 %mul6, i32* %incdec.ptr4, align 4
%3 = load i32, i32* %incdec.ptr5, align 4
%mul9 = mul nsw i32 %3, -9
store i32 %mul9, i32* %incdec.ptr7, align 4
ret void
}
assigned to @alexey-bataev
This seems to fix both cases mentioned in the description -
The follow-up cases, however, are not fixed yet:
void missing_mul(int * __restrict dst, const int * __restrict src) {
*dst++ = *src++ * 257;
*dst++ = *src++ * -3;
*dst++ = *src++ * 1; //<--- the load/store are here, but there is no math op
*dst++ = *src++ * -9;
}
void missing_load_and_mul(int * __restrict dst, const int * __restrict src) {
*dst++ = *src++ * 257;
*dst++ = *src++ * -3;
*dst++ = *src++ * 0; // <--- there is no load or math op in this case
*dst++ = *src++ * -9;
}
Extended Description
We can vectorize pure integer multiplies quite easily — see the first IR listing above, which the SLP vectorizer turns into a single <4 x i32> multiply.
But this fails if some of the scalar multiplies have been combined into left shifts instead, as in the second IR listing where one mul was replaced by an equivalent shl.
Similarly, we should be able to vectorize a mixture of integer multiplies and left shifts.