llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.
http://llvm.org

Failure to simplify SIMD vector conversion. #17113

Open. silvasean opened this issue 11 years ago

silvasean commented 11 years ago
Bugzilla Link 16739
Version trunk
OS All
CC @filcab, @LebedevRI, @RKSimon, @rotateright
Fixed by commit(s) r366441

Extended Description

This code was reduced from a function that converted between the SIMD vector classes used by two different libraries; the source and destination vectors have <4 x float> underlying storage but notionally hold only {x, y, z} (the destination duplicates z into the last lane; the source leaves it undefined, I think).

typedef float __m128 __attribute__((__vector_size__(16)));
union ElementWiseAccess {
  ElementWiseAccess(__m128 v) : ReprM128(v) {}
  __m128 ReprM128;
  float ReprFloatArray[4];
  float getAt(int i) const { return ReprFloatArray[i]; }
};
// Making this return `const ElementWiseAccess` instead of `const ElementWiseAccess &`
// still results in a failure to optimize, but in a different way.
static const ElementWiseAccess &castToElementWiseAccess(const __m128 &t) {
  return reinterpret_cast<const ElementWiseAccess &>(t);
}
__m128 ConvertVectors(const __m128 &V) {
  // Replacing `castToElementWiseAccess` with directly calling
  // `ElementWiseAccess` makes the issue go away.
  return (__m128) { castToElementWiseAccess(V).getAt(0), //
                    castToElementWiseAccess(V).getAt(1), //
                    castToElementWiseAccess(V).getAt(2), //
                    castToElementWiseAccess(V).getAt(2) };
}

clang -O3 produces:

define <4 x float> @_Z14ConvertVectorsRKDv4_f(<4 x float>* nocapture readonly %V) #0 {
  %1 = bitcast <4 x float>* %V to [4 x float]*
  %2 = getelementptr inbounds <4 x float>* %V, i64 0, i64 0
  %3 = load float* %2, align 4, !tbaa !0
  %4 = insertelement <4 x float> undef, float %3, i32 0
  %5 = getelementptr inbounds [4 x float]* %1, i64 0, i64 1
  %6 = load float* %5, align 4, !tbaa !0
  %7 = insertelement <4 x float> %4, float %6, i32 1
  %8 = getelementptr inbounds [4 x float]* %1, i64 0, i64 2
  %9 = load float* %8, align 4, !tbaa !0
  %10 = insertelement <4 x float> %7, float %9, i32 2
  %11 = insertelement <4 x float> %10, float %9, i32 3
  ret <4 x float> %11
}

It appears that something is interfering with folding the load/insertelement sequence into a vector load + shufflevector.

Making the modification indicated in the comments (having castToElementWiseAccess return by value instead of by reference) results in:

define <4 x float> @_Z14ConvertVectorsRKDv4_f(<4 x float>* nocapture readonly %V) #0 {
  %1 = bitcast <4 x float>* %V to i8*
  %2 = bitcast <4 x float>* %V to double*
  %3 = load double* %2, align 16
  %4 = getelementptr inbounds i8* %1, i64 8
  %5 = bitcast i8* %4 to double*
  %6 = bitcast double %3 to i64
  %trunc = trunc i64 %6 to i32
  %bitcast = bitcast i32 %trunc to float
  %7 = insertelement <4 x float> undef, float %bitcast, i32 0
  %8 = lshr i64 %6, 32
  %9 = trunc i64 %8 to i32
  %10 = bitcast i32 %9 to float
  %11 = insertelement <4 x float> %7, float %10, i32 1
  %12 = load double* %5, align 8
  %13 = bitcast double %12 to i64
  %trunc6 = trunc i64 %13 to i32
  %bitcast7 = bitcast i32 %trunc6 to float
  %14 = insertelement <4 x float> %11, float %bitcast7, i32 2
  %15 = insertelement <4 x float> %14, float %bitcast7, i32 3
  ret <4 x float> %15
}

The issue in this case seems to be that clang lowers castToElementWiseAccess as returning {double, double}, which then prevents a <4 x float> load from being generated.
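
For illustration, here is a hand-written sketch (not clang's actual output; names are hypothetical) of what the by-value helper effectively returns after ABI lowering. Once it is inlined, the caller has to reassemble floats from the two doubles, which is where the trunc/lshr/bitcast chains above come from:

define { double, double } @castToElementWiseAccess_ByVal_sketch(<4 x float>* %t) {
  ; Sketch: the 16-byte union is coerced to a pair of doubles at the call
  ; boundary, so the <4 x float> type never survives to the caller.
  %lo.ptr = bitcast <4 x float>* %t to double*
  %lo = load double, double* %lo.ptr, align 16
  %hi.ptr = getelementptr inbounds double, double* %lo.ptr, i64 1
  %hi = load double, double* %hi.ptr, align 8
  %r0 = insertvalue { double, double } undef, double %lo, 0
  %r1 = insertvalue { double, double } %r0, double %hi, 1
  ret { double, double } %r1
}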

Replacing the call to castToElementWiseAccess with a direct invocation of the constructor (e.g. ElementWiseAccess(V).getAt(<<<n>>>)) results in the following code, which is the desired codegen for the initial test case:

define <4 x float> @_Z14ConvertVectorsRKDv4_f(<4 x float>* nocapture readonly %V) #0 {
  %1 = load <4 x float>* %V, align 16, !tbaa !0
  %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  ret <4 x float> %2
}
RKSimon commented 5 years ago

https://godbolt.org/z/NlK7rA

rotateright commented 6 years ago

For the original example, the bitcast from vector to array might be interfering with subsequent transforms, so: https://reviews.llvm.org/D44833
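
A minimal sketch of the pattern in question (hypothetical function names, typed-pointer IR matching the dumps above): both functions compute the address of element 2, but the first goes through the vector-to-array bitcast that the union access introduces, which can hide from later passes that all the scalar loads come from one <4 x float> object:

define float* @element2_via_array(<4 x float>* %V) {
  ; Address computed through the bitcast to [4 x float]*, as in the IR above.
  %arr = bitcast <4 x float>* %V to [4 x float]*
  %p = getelementptr inbounds [4 x float], [4 x float]* %arr, i64 0, i64 2
  ret float* %p
}

define float* @element2_via_vector(<4 x float>* %V) {
  ; The same address computed directly on the vector type.
  %p = getelementptr inbounds <4 x float>, <4 x float>* %V, i64 0, i64 2
  ret float* %p
}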

RKSimon commented 7 years ago

Trunk still has issues with this, and it reminds me of Bug #21780:

typedef float __m128 __attribute__((__vector_size__(16)));
union ElementWiseAccess {
  ElementWiseAccess(__m128 v) : ReprM128(v) {}
  __m128 ReprM128;
  float ReprFloatArray[4];
  float getAt(int i) const { return ReprFloatArray[i]; }
};

static const ElementWiseAccess &castToElementWiseAccess_ByRef(const __m128 &t) {
  return reinterpret_cast<const ElementWiseAccess &>(t);
}
static const ElementWiseAccess castToElementWiseAccess_ByVal(const __m128 &t) {
  return reinterpret_cast<const ElementWiseAccess &>(t);
}

__m128 ConvertVectors_ByRef(const __m128 &V) {
  return (__m128) { castToElementWiseAccess_ByRef(V).getAt(0), //
                    castToElementWiseAccess_ByRef(V).getAt(1), //
                    castToElementWiseAccess_ByRef(V).getAt(2), //
                    castToElementWiseAccess_ByRef(V).getAt(2) };
}
__m128 ConvertVectors_ByVal(const __m128 &V) {
  return (__m128) { castToElementWiseAccess_ByVal(V).getAt(0), //
                    castToElementWiseAccess_ByVal(V).getAt(1), //
                    castToElementWiseAccess_ByVal(V).getAt(2), //
                    castToElementWiseAccess_ByVal(V).getAt(2) };
}
__m128 ConvertVectors_ByCopy(const __m128 &V) {
  return (__m128) { ElementWiseAccess(V).getAt(0), //
                    ElementWiseAccess(V).getAt(1), //
                    ElementWiseAccess(V).getAt(2), //
                    ElementWiseAccess(V).getAt(2) };
}

Looking at the IR, it knows that the entire vector load is dereferenceable, but it still makes a mess of combining the loads that feed the inserts:

define <4 x float> @ConvertVectors_ByRef(<4 x float>* nocapture readonly dereferenceable(16)) {
  %2 = bitcast <4 x float>* %0 to [4 x float]*
  %3 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 0
  %4 = load float, float* %3, align 4, !tbaa !1
  %5 = insertelement <4 x float> undef, float %4, i32 0
  %6 = getelementptr inbounds [4 x float], [4 x float]* %2, i64 0, i64 1
  %7 = load float, float* %6, align 4, !tbaa !1
  %8 = insertelement <4 x float> %5, float %7, i32 1
  %9 = getelementptr inbounds [4 x float], [4 x float]* %2, i64 0, i64 2
  %10 = load float, float* %9, align 4, !tbaa !1
  %11 = insertelement <4 x float> %8, float %10, i32 2
  %12 = insertelement <4 x float> %11, float %10, i32 3
  ret <4 x float> %12
}

define <4 x float> @ConvertVectors_ByVal(<4 x float>* nocapture readonly dereferenceable(16)) {
  %2 = bitcast <4 x float>* %0 to i64*
  %3 = load i64, i64* %2, align 16
  %4 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 2
  %5 = trunc i64 %3 to i32
  %6 = bitcast i32 %5 to float
  %7 = insertelement <4 x float> undef, float %6, i32 0
  %8 = lshr i64 %3, 32
  %9 = trunc i64 %8 to i32
  %10 = bitcast i32 %9 to float
  %11 = insertelement <4 x float> %7, float %10, i32 1
  %12 = bitcast float* %4 to i64*
  %13 = load i64, i64* %12, align 8
  %14 = trunc i64 %13 to i32
  %15 = bitcast i32 %14 to float
  %16 = insertelement <4 x float> %11, float %15, i32 2
  %17 = insertelement <4 x float> %16, float %15, i32 3
  ret <4 x float> %17
}

define <4 x float> @ConvertVectors_ByCopy(<4 x float>* nocapture readonly dereferenceable(16)) {
  %2 = load <4 x float>, <4 x float>* %0, align 16, !tbaa !5
  %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  ret <4 x float> %3
}

Resulting in final assembly:

ConvertVectors_ByRef(float __vector(4) const&):        # @ConvertVectors_ByRef(float __vector(4) const&)
        vmovss  8(%rdi), %xmm0          # xmm0 = mem[0],zero,zero,zero
        vmovsd  (%rdi), %xmm1           # xmm1 = mem[0],zero
        vshufps $4, %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0,1],xmm0[0,0]
        retq

ConvertVectors_ByVal(float __vector(4) const&):        # @ConvertVectors_ByVal(float __vector(4) const&)
        vmovss  (%rdi), %xmm0           # xmm0 = mem[0],zero,zero,zero
        vmovss  8(%rdi), %xmm1          # xmm1 = mem[0],zero,zero,zero
        vinsertps       $16, 4(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0],mem[0],xmm0[2,3]
        vshufps $4, %xmm1, %xmm0, %xmm0 # xmm0 = xmm0[0,1],xmm1[0,0]
        retq

ConvertVectors_ByCopy(float __vector(4) const&):       # @ConvertVectors_ByCopy(float __vector(4) const&)
        vpermilps       $164, (%rdi), %xmm0 # xmm0 = mem[0,1,2,2]
        retq
RKSimon commented 2 years ago

Current IR:

define <4 x float> @ConvertVectors_ByRef(ptr nocapture noundef nonnull readonly align 16 dereferenceable(16) %0) {
  %2 = load <2 x float>, ptr %0, align 16
  %3 = shufflevector <2 x float> %2, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %4 = getelementptr inbounds [4 x float], ptr %0, i64 0, i64 2
  %5 = load float, ptr %4, align 8
  %6 = insertelement <4 x float> %3, float %5, i64 2
  %7 = insertelement <4 x float> %6, float %5, i64 3
  ret <4 x float> %7
}

define <4 x float> @ConvertVectors_ByVal(ptr nocapture noundef nonnull readonly align 16 dereferenceable(16) %0) {
  %2 = load i64, ptr %0, align 16
  %3 = getelementptr i8, ptr %0, i64 8
  %4 = load i64, ptr %3, align 8
  %5 = trunc i64 %2 to i32
  %6 = insertelement <4 x i32> undef, i32 %5, i64 0
  %7 = lshr i64 %2, 32
  %8 = trunc i64 %7 to i32
  %9 = insertelement <4 x i32> %6, i32 %8, i64 1
  %10 = trunc i64 %4 to i32
  %11 = insertelement <4 x i32> %9, i32 %10, i64 2
  %12 = insertelement <4 x i32> %11, i32 %10, i64 3
  %13 = bitcast <4 x i32> %12 to <4 x float>
  ret <4 x float> %13
}

define <4 x float> @ConvertVectors_ByCopy(ptr nocapture noundef nonnull readonly align 16 dereferenceable(16) %0) {
  %2 = load <4 x float>, ptr %0, align 16
  %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  ret <4 x float> %3
}
RKSimon commented 1 year ago

Current IR:

define <4 x float> @ConvertVectors_ByRef(ptr nocapture noundef nonnull readonly align 16 dereferenceable(16) %V) {
entry:
  %0 = load <4 x float>, ptr %V, align 16
  %arrayidx.i14 = getelementptr inbounds [4 x float], ptr %V, i64 0, i64 2
  %1 = load float, ptr %arrayidx.i14, align 8
  %vecinit7 = insertelement <4 x float> %0, float %1, i64 2
  %vecinit10 = insertelement <4 x float> %vecinit7, float %1, i64 3
  ret <4 x float> %vecinit10
}

define <4 x float> @ConvertVectors_ByVal(ptr nocapture noundef nonnull readonly align 16 dereferenceable(16) %V) {
entry:
  %V.val2536 = load i64, ptr %V, align 16
  %0 = getelementptr i8, ptr %V, i64 8
  %V.val2637 = load i64, ptr %0, align 8
  %1 = trunc i64 %V.val2536 to i32
  %2 = insertelement <4 x i32> undef, i32 %1, i64 0
  %3 = lshr i64 %V.val2536, 32
  %4 = trunc i64 %3 to i32
  %5 = insertelement <4 x i32> %2, i32 %4, i64 1
  %6 = trunc i64 %V.val2637 to i32
  %7 = insertelement <4 x i32> %5, i32 %6, i64 2
  %8 = insertelement <4 x i32> %7, i32 %6, i64 3
  %vecinit16 = bitcast <4 x i32> %8 to <4 x float>
  ret <4 x float> %vecinit16
}

define <4 x float> @_Z21ConvertVectors_ByCopy(ptr nocapture noundef nonnull readonly align 16 dereferenceable(16) %V) {
entry:
  %0 = load <4 x float>, ptr %V, align 16
  %vecinit9 = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  ret <4 x float> %vecinit9
}
RKSimon commented 1 year ago

ConvertVectors_ByRef IR has improved, and if we run it through opt -O3 again we get:

define <4 x float> @ConvertVectors_ByRef(ptr nocapture noundef nonnull readonly align 16 dereferenceable(16) %V) {
  %0 = load <4 x float>, ptr %V, align 16
  %vecinit10 = shufflevector <4 x float> %0, <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  ret <4 x float> %vecinit10
}
rotateright commented 1 year ago

ConvertVectors_ByRef IR has improved, and if we run it through opt -O3 again we get:

b57819e got us the wide load, so now GVN + instcombine can give us the ideal IR.

So now there's a tough question: should we run GVN again post-vectorization (IIUC, it's not cheap in compile-time), should there be a lighter-weight pass that specializes in load combining, or can we get away with some kind of load combining enhancement in SDAG?

rotateright commented 1 year ago

I missed another possibility: we can add an early run of vector-combine.

We already do it (!), but it is defaulted off behind a debug flag, and that invocation only tries a subset of folds. It was added last year (https://reviews.llvm.org/D102496) with a similar motivation: it was specifically placed before GVN because that made a difference for matrix ops.

RKSimon commented 1 year ago

SGTM - I suppose always enabling it comes down to whether it will have a detrimental effect on compile time?

rotateright commented 1 year ago

Compile-time improvement for vector-combine: 87debdadaf18 ...hoping that buys enough goodwill to run it a 2nd time :)

rotateright commented 1 year ago

Enable early vector-combine proposal: https://reviews.llvm.org/D138353

RKSimon commented 1 year ago

define <4 x float> @ConvertVectors_ByVal(ptr nocapture noundef nonnull readonly align 16 dereferenceable(16) %V) {
entry:
  %V.val2536 = load i64, ptr %V, align 16
  %0 = getelementptr i8, ptr %V, i64 8
  %V.val2637 = load i64, ptr %0, align 8
  %1 = trunc i64 %V.val2536 to i32
  %2 = insertelement <4 x i32> undef, i32 %1, i64 0
  %3 = lshr i64 %V.val2536, 32
  %4 = trunc i64 %3 to i32
  %5 = insertelement <4 x i32> %2, i32 %4, i64 1
  %6 = trunc i64 %V.val2637 to i32
  %7 = insertelement <4 x i32> %5, i32 %6, i64 2
  %8 = insertelement <4 x i32> %7, i32 %6, i64 3
  %vecinit16 = bitcast <4 x i32> %8 to <4 x float>
  ret <4 x float> %vecinit16
} 

I'm not sure what the best way to tackle this one is - should we consider trying harder to vectorize the truncs to something like:

define <4 x float> @ConvertVectors_ByVal_trunc(ptr nocapture noundef nonnull readonly align 16 dereferenceable(16) %V) {
entry:
  %V.val2536 = load i64, ptr %V, align 16
  %0 = getelementptr i8, ptr %V, i64 8
  %V.val2637 = load i64, ptr %0, align 8
  %1 = insertelement <4 x i64> undef, i64 %V.val2536, i64 0
  %2 = insertelement <4 x i64> %1, i64 %V.val2536, i64 1
  %3 = insertelement <4 x i64> %2, i64 %V.val2637, i64 2
  %4 = insertelement <4 x i64> %3, i64 %V.val2637, i64 3
  %5 = lshr <4 x i64> %4, <i64 0, i64 32, i64 0, i64 0>
  %6 = trunc <4 x i64> %5 to <4 x i32>
  %vecinit16 = bitcast <4 x i32> %6 to <4 x float>
  ret <4 x float> %vecinit16
} 
rotateright commented 1 year ago

The "ByVal" part of this shows a series of potential missed canonicalizations. I think we want to replace the truncs and shifts with bitcasts and shuffles.

That leads to trade-offs like https://alive2.llvm.org/ce/z/dLqHXU: we can replace a trunc + insert with a bitcast alone, but then we've lost the information that the high elements are undef/poison. Is that better or worse than a bitcast + identity shuffle with undef elements in the mask? Note that this will also need adjustments for endianness.
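
A small hand-written sketch of the two forms being weighed (hypothetical function names; little-endian lane order assumed). Both put the low 32 bits of %x into lane 0 of a <4 x i32>:

; The form in the current IR: trunc + insert. Lanes 1-3 are explicitly undef.
define <4 x i32> @lane0_trunc_insert(i64 %x) {
  %t = trunc i64 %x to i32
  %v = insertelement <4 x i32> undef, i32 %t, i64 0
  ret <4 x i32> %v
}

; The proposed form: bitcast + shuffle. The trunc is gone, and the undef mask
; elements still record that only lane 0 carries a used value.
define <4 x i32> @lane0_bitcast_shuffle(i64 %x) {
  %b = bitcast i64 %x to <2 x i32>
  %v = shufflevector <2 x i32> %b, <2 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  ret <4 x i32> %v
}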

If we can do those transforms, then we can hopefully convert the load to a vector load (easier now that we have opaque ptr?). Once that happens, we're almost back to the "ByRef" version of the IR, so then it should fold similarly...

RKSimon commented 1 year ago

We can replace a trunc + insert with a bitcast alone, but then we've lost the information that the high elements are undef/poison. Is that better or worse than a bitcast + identity shuffle with undef elements in the mask? Note that this will also need adjustments for endianness.

I suppose if we can guarantee that we are populating the entire vector then the undef/poison issue is benign.

rotateright commented 1 year ago

I suppose if we can guarantee that we are populating the entire vector then the undef/poison issue is benign.

Right - although thinking about this example a bit more, we're going to need an identity shuffle either way because the vector (128-bit) is larger than the load (64-bit).

Maybe we can enhance VectorCombine to widen the load first by peeking through the trunc.
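
Something like this hand-written before/after sketch (hypothetical names, little-endian assumed; the align 16 dereferenceable(16) pointer is what would make the wider load safe):

; Before: the ConvertVectors_ByVal pattern, reduced to a single lane.
define <4 x i32> @widen_before(ptr align 16 dereferenceable(16) %p) {
  %x = load i64, ptr %p, align 16
  %t = trunc i64 %x to i32
  %v = insertelement <4 x i32> undef, i32 %t, i64 0
  ret <4 x i32> %v
}

; After: peeking through the trunc lets the scalar i64 load become a
; full-width vector load plus a shuffle.
define <4 x i32> @widen_after(ptr align 16 dereferenceable(16) %p) {
  %w = load <4 x i32>, ptr %p, align 16
  %v = shufflevector <4 x i32> %w, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  ret <4 x i32> %v
}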

I suspect there would be cost problems with trying to increase the vector width to 256-bit as shown in your code example.

RKSimon commented 1 year ago

I suspect there would be cost problems with trying to increase the vector width to 256-bit as shown in your code example.

Yes, I'd expect the SLP vectorizer to get that, so if it doesn't then it might not be profitable.

rotateright commented 1 year ago

Another micro-optimization for VectorCombine: ede6d608f4 ...that should definitely make it faster to run twice now than it took to run once before.

rotateright commented 1 year ago

Proposal to canonicalize towards bitcast + shuffle (away from trunc + shift) - it's split into 3 pieces to reduce risk and ease review, but I consider it one fold: https://reviews.llvm.org/D138872 https://reviews.llvm.org/D138873 https://reviews.llvm.org/D138874

There's an annoying "lack-of-CSE" artifact in the test tracking this bug. We might be able to bend instcombine to deal with that rather than go down another pipeline-altering rabbit hole. That would allow forming a vector load. Then, we're still left with a final scalar load that GVN is not recognizing as overlapping. Either the load needs to be narrowed or GVN needs to be enhanced to see that.

rotateright commented 1 year ago

We're down to this form with 2 loads after combining the first two inserts:

define noundef <4 x float> @_Z20ConvertVectors_ByValRKDv4_f(ptr align 16 dereferenceable(16) %V) {
entry:
  %0 = load <4 x float>, ptr %V, align 16
  %1 = getelementptr i8, ptr %V, i64 8
  %V.val2637 = load i64, ptr %1, align 8, !tbaa.struct !6
  %2 = trunc i64 %V.val2637 to i32
  %3 = bitcast i32 %2 to float
  %vecinit11 = insertelement <4 x float> %0, float %3, i64 2
  %vecinit16 = insertelement <4 x float> %vecinit11, float %3, i64 3
  ret <4 x float> %vecinit16
}

And GVN + instcombine is able to reduce that to a single load + shuffle, so we have another phase ordering problem.
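
For reference, a hand-written sketch (assumed shapes, little-endian) of roughly what that later GVN + instcombine pair achieves on the IR above: the overlapping 8-byte load is forwarded from the covering <4 x float> load, and the resulting extract/insert chain then folds to the same load + shuffle that the ByRef and ByCopy versions reach:

; After GVN-style forwarding: the i64 load at %V+8 is fully covered by the
; 16-byte vector load, and only its low 32 bits (lane 2) are used.
define <4 x float> @byval_after_forwarding(ptr align 16 dereferenceable(16) %V) {
  %w = load <4 x float>, ptr %V, align 16
  %z = extractelement <4 x float> %w, i64 2
  %v2 = insertelement <4 x float> %w, float %z, i64 2
  %v3 = insertelement <4 x float> %v2, float %z, i64 3
  ret <4 x float> %v3
}

; After instcombine: a single wide load plus a shuffle.
define <4 x float> @byval_after_instcombine(ptr align 16 dereferenceable(16) %V) {
  %w = load <4 x float>, ptr %V, align 16
  %r = shufflevector <4 x float> %w, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  ret <4 x float> %r
}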

rotateright commented 1 year ago

I think we've really hit the hard problem of phase ordering -- https://github.com/llvm/llvm-project/issues/17113#issuecomment-1317734716 -- that we managed to sidestep on the previous example.

I did confirm that just adding a GVN run later in the pipeline is very costly in compile-time: https://llvm-compile-time-tracker.com/compare.php?from=a274d62fecfc3f49065f3fcdcb9577637778e0bc&to=d211b22ba1f4ee87faac2fedd5627bbf9f945c01&stat=instructions:u

NewPM-O3:

Benchmark | Old | New
-- | -- | --
kimwitu++ | 49008M | 49638M (+1.29%)
sqlite3 | 49587M | 51559M (+3.98%)
consumer-typeset | 43061M | 44187M (+2.61%)
Bullet | 111829M | 113963M (+1.91%)
tramp3d-v4 | 108415M | 112719M (+3.97%)
mafft | 45181M | 46892M (+3.79%)
ClamAV | 69184M | 71392M (+3.19%)
lencod | 84530M | 88648M (+4.87%)
SPASS | 55491M | 56501M (+1.82%)
7zip | 224276M | 227877M (+1.61%)
geomean | 72783M | 74892M (+2.90%)
rotateright commented 1 year ago

Adding GVN does show some small-ish improvements on other regression tests, so it really is just a question of cost:

diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index f545fb4f11be..2e6607b07d46 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -1178,7 +1178,8 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
   }
   // Enhance/cleanup vector code.
   FPM.addPass(VectorCombinePass());
-  FPM.addPass(GVNPass());
+  if (Level != OptimizationLevel::O1)
+    FPM.addPass(GVNPass());

   if (!IsFullLTO) {
     FPM.addPass(InstCombinePass());
diff --git a/llvm/test/Transforms/Coroutines/coro-retcon-resume-values.ll b/llvm/test/Transforms/Coroutines/coro-retcon-resume-values.ll
index 171fe16acb10..a6bec494d9b6 100644
--- a/llvm/test/Transforms/Coroutines/coro-retcon-resume-values.ll
+++ b/llvm/test/Transforms/Coroutines/coro-retcon-resume-values.ll
@@ -51,12 +51,9 @@ define i32 @main() {
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[INPUT_RELOAD_ADDR13_I]] to i32*
 ; CHECK-NEXT:    [[N_VAL3_RELOAD_ADDR11_I:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[N_VAL3_RELOAD_ADDR11_I]] to i32*
-; CHECK-NEXT:    [[N_VAL3_RELOAD12_I:%.*]] = load i32, i32* [[TMP4]], align 4, !noalias !3
-; CHECK-NEXT:    [[SUM7_I:%.*]] = add i32 [[N_VAL3_RELOAD12_I]], 2
-; CHECK-NEXT:    store i32 [[SUM7_I]], i32* [[TMP4]], align 4, !noalias !3
+; CHECK-NEXT:    store i32 3, i32* [[TMP4]], align 4, !noalias !3
 ; CHECK-NEXT:    store i32 4, i32* [[TMP3]], align 4, !noalias !3
-; CHECK-NEXT:    [[SUM7_I7:%.*]] = add i32 [[N_VAL3_RELOAD12_I]], 6
-; CHECK-NEXT:    tail call void @print(i32 [[SUM7_I7]]), !noalias !6
+; CHECK-NEXT:    tail call void @print(i32 7), !noalias !6
 ; CHECK-NEXT:    tail call void @deallocate(i8* [[TMP0]]), !noalias !6
 ; CHECK-NEXT:    ret i32 0
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll b/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll
index 6a4ed2cd056a..dfa95aafde1a 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll
@@ -1406,7 +1406,7 @@ define i32 @disabled(ptr noalias nocapture %a, ptr noalias nocapture readonly %b
 ; O3DEFAULT-NEXT:    [[TMP24:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_44]], align 4
 ; O3DEFAULT-NEXT:    [[TMP25:%.*]] = add nsw <4 x i32> [[TMP24]], [[TMP2]]
 ; O3DEFAULT-NEXT:    store <4 x i32> [[TMP25]], ptr [[ARRAYIDX2_44]], align 4
-; O3DEFAULT-NEXT:    [[TMP26:%.*]] = load i32, ptr [[A]], align 4
+; O3DEFAULT-NEXT:    [[TMP26:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
 ; O3DEFAULT-NEXT:    ret i32 [[TMP26]]
 ;
 ; Os-LABEL: @disabled(
diff --git a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll
index 7cbce461a492..d8fb2e4ca8c5 100644
--- a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll
+++ b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll
@@ -21,31 +21,27 @@ define void @arm_mult_q15(ptr %pSrcA, ptr %pSrcB, ptr noalias %pDst, i32 %blockS
 ; CHECK-NEXT:    [[IND_END:%.*]] = and i32 [[BLOCKSIZE]], 7
 ; CHECK-NEXT:    [[TMP0:%.*]] = shl i32 [[N_VEC]], 1
 ; CHECK-NEXT:    [[IND_END7:%.*]] = getelementptr i8, ptr [[PSRCA:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i32 [[N_VEC]], 1
-; CHECK-NEXT:    [[IND_END9:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i32 [[TMP1]]
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i32 [[N_VEC]], 1
-; CHECK-NEXT:    [[IND_END11:%.*]] = getelementptr i8, ptr [[PSRCB:%.*]], i32 [[TMP2]]
+; CHECK-NEXT:    [[IND_END9:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[IND_END11:%.*]] = getelementptr i8, ptr [[PSRCB:%.*]], i32 [[TMP0]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i32 [[INDEX]], 1
-; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRCA]], i32 [[TMP3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = shl i32 [[INDEX]], 1
-; CHECK-NEXT:    [[NEXT_GEP13:%.*]] = getelementptr i8, ptr [[PDST]], i32 [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = shl i32 [[INDEX]], 1
-; CHECK-NEXT:    [[NEXT_GEP14:%.*]] = getelementptr i8, ptr [[PSRCB]], i32 [[TMP5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i32 [[INDEX]], 1
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRCA]], i32 [[TMP1]]
+; CHECK-NEXT:    [[NEXT_GEP13:%.*]] = getelementptr i8, ptr [[PDST]], i32 [[TMP1]]
+; CHECK-NEXT:    [[NEXT_GEP14:%.*]] = getelementptr i8, ptr [[PSRCB]], i32 [[TMP1]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[NEXT_GEP]], align 2
-; CHECK-NEXT:    [[TMP6:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32>
 ; CHECK-NEXT:    [[WIDE_LOAD15:%.*]] = load <8 x i16>, ptr [[NEXT_GEP14]], align 2
-; CHECK-NEXT:    [[TMP7:%.*]] = sext <8 x i16> [[WIDE_LOAD15]] to <8 x i32>
-; CHECK-NEXT:    [[TMP8:%.*]] = mul nsw <8 x i32> [[TMP7]], [[TMP6]]
-; CHECK-NEXT:    [[TMP9:%.*]] = ashr <8 x i32> [[TMP8]], <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; CHECK-NEXT:    [[TMP10:%.*]] = tail call <8 x i32> @llvm.smin.v8i32(<8 x i32> [[TMP9]], <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>)
-; CHECK-NEXT:    [[TMP11:%.*]] = trunc <8 x i32> [[TMP10]] to <8 x i16>
-; CHECK-NEXT:    store <8 x i16> [[TMP11]], ptr [[NEXT_GEP13]], align 2
+; CHECK-NEXT:    [[TMP3:%.*]] = sext <8 x i16> [[WIDE_LOAD15]] to <8 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw <8 x i32> [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = ashr <8 x i32> [[TMP4]], <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call <8 x i32> @llvm.smin.v8i32(<8 x i32> [[TMP5]], <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>)
+; CHECK-NEXT:    [[TMP7:%.*]] = trunc <8 x i32> [[TMP6]] to <8 x i16>
+; CHECK-NEXT:    store <8 x i16> [[TMP7]], ptr [[NEXT_GEP13]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[BLOCKSIZE]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[WHILE_END]], label [[WHILE_BODY_PREHEADER16]]
@@ -61,15 +57,15 @@ define void @arm_mult_q15(ptr %pSrcA, ptr %pSrcB, ptr noalias %pDst, i32 %blockS
 ; CHECK-NEXT:    [[PDST_ADDR_04:%.*]] = phi ptr [ [[INCDEC_PTR4:%.*]], [[WHILE_BODY]] ], [ [[PDST_ADDR_04_PH]], [[WHILE_BODY_PREHEADER16]] ]
 ; CHECK-NEXT:    [[PSRCB_ADDR_03:%.*]] = phi ptr [ [[INCDEC_PTR1:%.*]], [[WHILE_BODY]] ], [ [[PSRCB_ADDR_03_PH]], [[WHILE_BODY_PREHEADER16]] ]
 ; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i16, ptr [[PSRCA_ADDR_05]], i32 1
-; CHECK-NEXT:    [[TMP13:%.*]] = load i16, ptr [[PSRCA_ADDR_05]], align 2
-; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP13]] to i32
+; CHECK-NEXT:    [[TMP9:%.*]] = load i16, ptr [[PSRCA_ADDR_05]], align 2
+; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP9]] to i32
 ; CHECK-NEXT:    [[INCDEC_PTR1]] = getelementptr inbounds i16, ptr [[PSRCB_ADDR_03]], i32 1
-; CHECK-NEXT:    [[TMP14:%.*]] = load i16, ptr [[PSRCB_ADDR_03]], align 2
-; CHECK-NEXT:    [[CONV2:%.*]] = sext i16 [[TMP14]] to i32
+; CHECK-NEXT:    [[TMP10:%.*]] = load i16, ptr [[PSRCB_ADDR_03]], align 2
+; CHECK-NEXT:    [[CONV2:%.*]] = sext i16 [[TMP10]] to i32
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[CONV2]], [[CONV]]
 ; CHECK-NEXT:    [[SHR:%.*]] = ashr i32 [[MUL]], 15
-; CHECK-NEXT:    [[TMP15:%.*]] = tail call i32 @llvm.smin.i32(i32 [[SHR]], i32 32767)
-; CHECK-NEXT:    [[CONV3:%.*]] = trunc i32 [[TMP15]] to i16
+; CHECK-NEXT:    [[TMP11:%.*]] = tail call i32 @llvm.smin.i32(i32 [[SHR]], i32 32767)
+; CHECK-NEXT:    [[CONV3:%.*]] = trunc i32 [[TMP11]] to i16
 ; CHECK-NEXT:    [[INCDEC_PTR4]] = getelementptr inbounds i16, ptr [[PDST_ADDR_04]], i32 1
 ; CHECK-NEXT:    store i16 [[CONV3]], ptr [[PDST_ADDR_04]], align 2
 ; CHECK-NEXT:    [[DEC]] = add i32 [[BLKCNT_06]], -1
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll b/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll
index 11b6f72793db..d934c080965e 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll
@@ -29,12 +29,6 @@ define void @vdiv(ptr %x, ptr %y, double %a, i32 %N) #0 {
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967280
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT]], <4 x double> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <4 x double> poison, double [[A]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT10:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT9]], <4 x double> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <4 x double> poison, double [[A]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT12:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT11]], <4 x double> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT13:%.*]] = insertelement <4 x double> poison, double [[A]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT14:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT13]], <4 x double> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP1:%.*]] = add nsw i64 [[WIDE_TRIP_COUNT]], -16
 ; CHECK-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP1]], 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
@@ -44,58 +38,58 @@ define void @vdiv(ptr %x, ptr %y, double %a, i32 %N) #0 {
 ; CHECK:       vector.ph.new:
 ; CHECK-NEXT:    [[UNROLL_ITER:%.*]] = and i64 [[TMP3]], -2
 ; CHECK-NEXT:    [[TMP5:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP6:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT10]]
-; CHECK-NEXT:    [[TMP7:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT12]]
-; CHECK-NEXT:    [[TMP8:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT14]]
+; CHECK-NEXT:    [[TMP6:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP7:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP8:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP10:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT10]]
-; CHECK-NEXT:    [[TMP11:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT12]]
-; CHECK-NEXT:    [[TMP12:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT14]]
+; CHECK-NEXT:    [[TMP10:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP11:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP12:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH_NEW]] ], [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[NITER:%.*]] = phi i64 [ 0, [[VECTOR_PH_NEW]] ], [ [[NITER_NEXT_1:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP13]], align 8, !tbaa [[TBAA3:![0-9]+]]
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds double, ptr [[TMP13]], i64 4
-; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x double>, ptr [[TMP15]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds double, ptr [[TMP13]], i64 8
-; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x double>, ptr [[TMP17]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds double, ptr [[TMP13]], i64 12
-; CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <4 x double>, ptr [[TMP19]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP21:%.*]] = fmul fast <4 x double> [[WIDE_LOAD]], [[TMP5]]
-; CHECK-NEXT:    [[TMP22:%.*]] = fmul fast <4 x double> [[WIDE_LOAD6]], [[TMP6]]
-; CHECK-NEXT:    [[TMP23:%.*]] = fmul fast <4 x double> [[WIDE_LOAD7]], [[TMP7]]
-; CHECK-NEXT:    [[TMP24:%.*]] = fmul fast <4 x double> [[WIDE_LOAD8]], [[TMP8]]
-; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDEX]]
-; CHECK-NEXT:    store <4 x double> [[TMP21]], ptr [[TMP25]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds double, ptr [[TMP25]], i64 4
-; CHECK-NEXT:    store <4 x double> [[TMP22]], ptr [[TMP27]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds double, ptr [[TMP25]], i64 8
-; CHECK-NEXT:    store <4 x double> [[TMP23]], ptr [[TMP29]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds double, ptr [[TMP25]], i64 12
-; CHECK-NEXT:    store <4 x double> [[TMP24]], ptr [[TMP31]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds double, ptr [[TMP13]], i64 4
+; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x double>, ptr [[TMP14]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds double, ptr [[TMP13]], i64 8
+; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x double>, ptr [[TMP15]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds double, ptr [[TMP13]], i64 12
+; CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <4 x double>, ptr [[TMP16]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP17:%.*]] = fmul fast <4 x double> [[WIDE_LOAD]], [[TMP5]]
+; CHECK-NEXT:    [[TMP18:%.*]] = fmul fast <4 x double> [[WIDE_LOAD6]], [[TMP6]]
+; CHECK-NEXT:    [[TMP19:%.*]] = fmul fast <4 x double> [[WIDE_LOAD7]], [[TMP7]]
+; CHECK-NEXT:    [[TMP20:%.*]] = fmul fast <4 x double> [[WIDE_LOAD8]], [[TMP8]]
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDEX]]
+; CHECK-NEXT:    store <4 x double> [[TMP17]], ptr [[TMP21]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds double, ptr [[TMP21]], i64 4
+; CHECK-NEXT:    store <4 x double> [[TMP18]], ptr [[TMP22]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds double, ptr [[TMP21]], i64 8
+; CHECK-NEXT:    store <4 x double> [[TMP19]], ptr [[TMP23]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds double, ptr [[TMP21]], i64 12
+; CHECK-NEXT:    store <4 x double> [[TMP20]], ptr [[TMP24]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDEX_NEXT]]
-; CHECK-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x double>, ptr [[TMP33]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr inbounds double, ptr [[TMP33]], i64 4
-; CHECK-NEXT:    [[WIDE_LOAD6_1:%.*]] = load <4 x double>, ptr [[TMP35]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP37:%.*]] = getelementptr inbounds double, ptr [[TMP33]], i64 8
-; CHECK-NEXT:    [[WIDE_LOAD7_1:%.*]] = load <4 x double>, ptr [[TMP37]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP39:%.*]] = getelementptr inbounds double, ptr [[TMP33]], i64 12
-; CHECK-NEXT:    [[WIDE_LOAD8_1:%.*]] = load <4 x double>, ptr [[TMP39]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP41:%.*]] = fmul fast <4 x double> [[WIDE_LOAD_1]], [[TMP9]]
-; CHECK-NEXT:    [[TMP42:%.*]] = fmul fast <4 x double> [[WIDE_LOAD6_1]], [[TMP10]]
-; CHECK-NEXT:    [[TMP43:%.*]] = fmul fast <4 x double> [[WIDE_LOAD7_1]], [[TMP11]]
-; CHECK-NEXT:    [[TMP44:%.*]] = fmul fast <4 x double> [[WIDE_LOAD8_1]], [[TMP12]]
-; CHECK-NEXT:    [[TMP45:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDEX_NEXT]]
-; CHECK-NEXT:    store <4 x double> [[TMP41]], ptr [[TMP45]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP47:%.*]] = getelementptr inbounds double, ptr [[TMP45]], i64 4
-; CHECK-NEXT:    store <4 x double> [[TMP42]], ptr [[TMP47]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP49:%.*]] = getelementptr inbounds double, ptr [[TMP45]], i64 8
-; CHECK-NEXT:    store <4 x double> [[TMP43]], ptr [[TMP49]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP51:%.*]] = getelementptr inbounds double, ptr [[TMP45]], i64 12
-; CHECK-NEXT:    store <4 x double> [[TMP44]], ptr [[TMP51]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDEX_NEXT]]
+; CHECK-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x double>, ptr [[TMP25]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds double, ptr [[TMP25]], i64 4
+; CHECK-NEXT:    [[WIDE_LOAD6_1:%.*]] = load <4 x double>, ptr [[TMP26]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds double, ptr [[TMP25]], i64 8
+; CHECK-NEXT:    [[WIDE_LOAD7_1:%.*]] = load <4 x double>, ptr [[TMP27]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds double, ptr [[TMP25]], i64 12
+; CHECK-NEXT:    [[WIDE_LOAD8_1:%.*]] = load <4 x double>, ptr [[TMP28]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP29:%.*]] = fmul fast <4 x double> [[WIDE_LOAD_1]], [[TMP9]]
+; CHECK-NEXT:    [[TMP30:%.*]] = fmul fast <4 x double> [[WIDE_LOAD6_1]], [[TMP10]]
+; CHECK-NEXT:    [[TMP31:%.*]] = fmul fast <4 x double> [[WIDE_LOAD7_1]], [[TMP11]]
+; CHECK-NEXT:    [[TMP32:%.*]] = fmul fast <4 x double> [[WIDE_LOAD8_1]], [[TMP12]]
+; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDEX_NEXT]]
+; CHECK-NEXT:    store <4 x double> [[TMP29]], ptr [[TMP33]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr inbounds double, ptr [[TMP33]], i64 4
+; CHECK-NEXT:    store <4 x double> [[TMP30]], ptr [[TMP34]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr inbounds double, ptr [[TMP33]], i64 8
+; CHECK-NEXT:    store <4 x double> [[TMP31]], ptr [[TMP35]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr inbounds double, ptr [[TMP33]], i64 12
+; CHECK-NEXT:    store <4 x double> [[TMP32]], ptr [[TMP36]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[INDEX_NEXT_1]] = add nuw i64 [[INDEX]], 32
 ; CHECK-NEXT:    [[NITER_NEXT_1]] = add i64 [[NITER]], 2
 ; CHECK-NEXT:    [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]]
@@ -105,115 +99,115 @@ define void @vdiv(ptr %x, ptr %y, double %a, i32 %N) #0 {
 ; CHECK-NEXT:    [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0
 ; CHECK-NEXT:    br i1 [[LCMP_MOD_NOT]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY_EPIL:%.*]]
 ; CHECK:       vector.body.epil:
-; CHECK-NEXT:    [[TMP53:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDEX_UNR]]
-; CHECK-NEXT:    [[WIDE_LOAD_EPIL:%.*]] = load <4 x double>, ptr [[TMP53]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP55:%.*]] = getelementptr inbounds double, ptr [[TMP53]], i64 4
-; CHECK-NEXT:    [[WIDE_LOAD6_EPIL:%.*]] = load <4 x double>, ptr [[TMP55]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP57:%.*]] = getelementptr inbounds double, ptr [[TMP53]], i64 8
-; CHECK-NEXT:    [[WIDE_LOAD7_EPIL:%.*]] = load <4 x double>, ptr [[TMP57]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP59:%.*]] = getelementptr inbounds double, ptr [[TMP53]], i64 12
-; CHECK-NEXT:    [[WIDE_LOAD8_EPIL:%.*]] = load <4 x double>, ptr [[TMP59]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP61:%.*]] = fdiv fast <4 x double> [[WIDE_LOAD_EPIL]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP62:%.*]] = fdiv fast <4 x double> [[WIDE_LOAD6_EPIL]], [[BROADCAST_SPLAT10]]
-; CHECK-NEXT:    [[TMP63:%.*]] = fdiv fast <4 x double> [[WIDE_LOAD7_EPIL]], [[BROADCAST_SPLAT12]]
-; CHECK-NEXT:    [[TMP64:%.*]] = fdiv fast <4 x double> [[WIDE_LOAD8_EPIL]], [[BROADCAST_SPLAT14]]
-; CHECK-NEXT:    [[TMP65:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDEX_UNR]]
-; CHECK-NEXT:    store <4 x double> [[TMP61]], ptr [[TMP65]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP67:%.*]] = getelementptr inbounds double, ptr [[TMP65]], i64 4
-; CHECK-NEXT:    store <4 x double> [[TMP62]], ptr [[TMP67]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP69:%.*]] = getelementptr inbounds double, ptr [[TMP65]], i64 8
-; CHECK-NEXT:    store <4 x double> [[TMP63]], ptr [[TMP69]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP71:%.*]] = getelementptr inbounds double, ptr [[TMP65]], i64 12
-; CHECK-NEXT:    store <4 x double> [[TMP64]], ptr [[TMP71]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP37:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDEX_UNR]]
+; CHECK-NEXT:    [[WIDE_LOAD_EPIL:%.*]] = load <4 x double>, ptr [[TMP37]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP38:%.*]] = getelementptr inbounds double, ptr [[TMP37]], i64 4
+; CHECK-NEXT:    [[WIDE_LOAD6_EPIL:%.*]] = load <4 x double>, ptr [[TMP38]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP39:%.*]] = getelementptr inbounds double, ptr [[TMP37]], i64 8
+; CHECK-NEXT:    [[WIDE_LOAD7_EPIL:%.*]] = load <4 x double>, ptr [[TMP39]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP40:%.*]] = getelementptr inbounds double, ptr [[TMP37]], i64 12
+; CHECK-NEXT:    [[WIDE_LOAD8_EPIL:%.*]] = load <4 x double>, ptr [[TMP40]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP41:%.*]] = fdiv fast <4 x double> [[WIDE_LOAD_EPIL]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP42:%.*]] = fdiv fast <4 x double> [[WIDE_LOAD6_EPIL]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP43:%.*]] = fdiv fast <4 x double> [[WIDE_LOAD7_EPIL]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP44:%.*]] = fdiv fast <4 x double> [[WIDE_LOAD8_EPIL]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP45:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDEX_UNR]]
+; CHECK-NEXT:    store <4 x double> [[TMP41]], ptr [[TMP45]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP46:%.*]] = getelementptr inbounds double, ptr [[TMP45]], i64 4
+; CHECK-NEXT:    store <4 x double> [[TMP42]], ptr [[TMP46]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP47:%.*]] = getelementptr inbounds double, ptr [[TMP45]], i64 8
+; CHECK-NEXT:    store <4 x double> [[TMP43]], ptr [[TMP47]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP48:%.*]] = getelementptr inbounds double, ptr [[TMP45]], i64 12
+; CHECK-NEXT:    store <4 x double> [[TMP44]], ptr [[TMP48]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    br label [[MIDDLE_BLOCK]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER15]]
 ; CHECK:       for.body.preheader15:
 ; CHECK-NEXT:    [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    [[TMP73:%.*]] = xor i64 [[INDVARS_IV_PH]], -1
-; CHECK-NEXT:    [[TMP74:%.*]] = add nsw i64 [[TMP73]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    [[TMP49:%.*]] = xor i64 [[INDVARS_IV_PH]], -1
+; CHECK-NEXT:    [[TMP50:%.*]] = add nsw i64 [[TMP49]], [[WIDE_TRIP_COUNT]]
 ; CHECK-NEXT:    [[XTRAITER16:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 7
 ; CHECK-NEXT:    [[LCMP_MOD17_NOT:%.*]] = icmp eq i64 [[XTRAITER16]], 0
 ; CHECK-NEXT:    br i1 [[LCMP_MOD17_NOT]], label [[FOR_BODY_PROL_LOOPEXIT:%.*]], label [[FOR_BODY_PROL_PREHEADER:%.*]]
 ; CHECK:       for.body.prol.preheader:
-; CHECK-NEXT:    [[TMP75:%.*]] = fdiv fast double 1.000000e+00, [[A]]
+; CHECK-NEXT:    [[TMP51:%.*]] = fdiv fast double 1.000000e+00, [[A]]
 ; CHECK-NEXT:    br label [[FOR_BODY_PROL:%.*]]
 ; CHECK:       for.body.prol:
 ; CHECK-NEXT:    [[INDVARS_IV_PROL:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_PROL:%.*]], [[FOR_BODY_PROL]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PROL_PREHEADER]] ]
 ; CHECK-NEXT:    [[PROL_ITER:%.*]] = phi i64 [ [[PROL_ITER_NEXT:%.*]], [[FOR_BODY_PROL]] ], [ 0, [[FOR_BODY_PROL_PREHEADER]] ]
 ; CHECK-NEXT:    [[ARRAYIDX_PROL:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV_PROL]]
 ; CHECK-NEXT:    [[T0_PROL:%.*]] = load double, ptr [[ARRAYIDX_PROL]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP76:%.*]] = fmul fast double [[T0_PROL]], [[TMP75]]
+; CHECK-NEXT:    [[TMP52:%.*]] = fmul fast double [[T0_PROL]], [[TMP51]]
 ; CHECK-NEXT:    [[ARRAYIDX2_PROL:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV_PROL]]
-; CHECK-NEXT:    store double [[TMP76]], ptr [[ARRAYIDX2_PROL]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    store double [[TMP52]], ptr [[ARRAYIDX2_PROL]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_PROL]] = add nuw nsw i64 [[INDVARS_IV_PROL]], 1
 ; CHECK-NEXT:    [[PROL_ITER_NEXT]] = add i64 [[PROL_ITER]], 1
 ; CHECK-NEXT:    [[PROL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[PROL_ITER_NEXT]], [[XTRAITER16]]
 ; CHECK-NEXT:    br i1 [[PROL_ITER_CMP_NOT]], label [[FOR_BODY_PROL_LOOPEXIT]], label [[FOR_BODY_PROL]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK:       for.body.prol.loopexit:
 ; CHECK-NEXT:    [[INDVARS_IV_UNR:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER15]] ], [ [[INDVARS_IV_NEXT_PROL]], [[FOR_BODY_PROL]] ]
-; CHECK-NEXT:    [[TMP77:%.*]] = icmp ult i64 [[TMP74]], 7
-; CHECK-NEXT:    br i1 [[TMP77]], label [[FOR_END]], label [[FOR_BODY_PREHEADER15_NEW:%.*]]
+; CHECK-NEXT:    [[TMP53:%.*]] = icmp ult i64 [[TMP50]], 7
+; CHECK-NEXT:    br i1 [[TMP53]], label [[FOR_END]], label [[FOR_BODY_PREHEADER15_NEW:%.*]]
 ; CHECK:       for.body.preheader15.new:
-; CHECK-NEXT:    [[TMP78:%.*]] = fdiv fast double 1.000000e+00, [[A]]
-; CHECK-NEXT:    [[TMP79:%.*]] = fdiv fast double 1.000000e+00, [[A]]
-; CHECK-NEXT:    [[TMP80:%.*]] = fdiv fast double 1.000000e+00, [[A]]
-; CHECK-NEXT:    [[TMP81:%.*]] = fdiv fast double 1.000000e+00, [[A]]
-; CHECK-NEXT:    [[TMP82:%.*]] = fdiv fast double 1.000000e+00, [[A]]
-; CHECK-NEXT:    [[TMP83:%.*]] = fdiv fast double 1.000000e+00, [[A]]
-; CHECK-NEXT:    [[TMP84:%.*]] = fdiv fast double 1.000000e+00, [[A]]
-; CHECK-NEXT:    [[TMP85:%.*]] = fdiv fast double 1.000000e+00, [[A]]
+; CHECK-NEXT:    [[TMP54:%.*]] = fdiv fast double 1.000000e+00, [[A]]
+; CHECK-NEXT:    [[TMP55:%.*]] = fdiv fast double 1.000000e+00, [[A]]
+; CHECK-NEXT:    [[TMP56:%.*]] = fdiv fast double 1.000000e+00, [[A]]
+; CHECK-NEXT:    [[TMP57:%.*]] = fdiv fast double 1.000000e+00, [[A]]
+; CHECK-NEXT:    [[TMP58:%.*]] = fdiv fast double 1.000000e+00, [[A]]
+; CHECK-NEXT:    [[TMP59:%.*]] = fdiv fast double 1.000000e+00, [[A]]
+; CHECK-NEXT:    [[TMP60:%.*]] = fdiv fast double 1.000000e+00, [[A]]
+; CHECK-NEXT:    [[TMP61:%.*]] = fdiv fast double 1.000000e+00, [[A]]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_UNR]], [[FOR_BODY_PREHEADER15_NEW]] ], [ [[INDVARS_IV_NEXT_7:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    [[T0:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP86:%.*]] = fmul fast double [[T0]], [[TMP78]]
+; CHECK-NEXT:    [[TMP62:%.*]] = fmul fast double [[T0]], [[TMP54]]
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store double [[TMP86]], ptr [[ARRAYIDX2]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    store double [[TMP62]], ptr [[ARRAYIDX2]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV_NEXT]]
 ; CHECK-NEXT:    [[T0_1:%.*]] = load double, ptr [[ARRAYIDX_1]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP87:%.*]] = fmul fast double [[T0_1]], [[TMP79]]
+; CHECK-NEXT:    [[TMP63:%.*]] = fmul fast double [[T0_1]], [[TMP55]]
 ; CHECK-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV_NEXT]]
-; CHECK-NEXT:    store double [[TMP87]], ptr [[ARRAYIDX2_1]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    store double [[TMP63]], ptr [[ARRAYIDX2_1]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_1:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
 ; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV_NEXT_1]]
 ; CHECK-NEXT:    [[T0_2:%.*]] = load double, ptr [[ARRAYIDX_2]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP88:%.*]] = fmul fast double [[T0_2]], [[TMP80]]
+; CHECK-NEXT:    [[TMP64:%.*]] = fmul fast double [[T0_2]], [[TMP56]]
 ; CHECK-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV_NEXT_1]]
-; CHECK-NEXT:    store double [[TMP88]], ptr [[ARRAYIDX2_2]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    store double [[TMP64]], ptr [[ARRAYIDX2_2]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_2:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
 ; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV_NEXT_2]]
 ; CHECK-NEXT:    [[T0_3:%.*]] = load double, ptr [[ARRAYIDX_3]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP89:%.*]] = fmul fast double [[T0_3]], [[TMP81]]
+; CHECK-NEXT:    [[TMP65:%.*]] = fmul fast double [[T0_3]], [[TMP57]]
 ; CHECK-NEXT:    [[ARRAYIDX2_3:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV_NEXT_2]]
-; CHECK-NEXT:    store double [[TMP89]], ptr [[ARRAYIDX2_3]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    store double [[TMP65]], ptr [[ARRAYIDX2_3]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_3:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 4
 ; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV_NEXT_3]]
 ; CHECK-NEXT:    [[T0_4:%.*]] = load double, ptr [[ARRAYIDX_4]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP90:%.*]] = fmul fast double [[T0_4]], [[TMP82]]
+; CHECK-NEXT:    [[TMP66:%.*]] = fmul fast double [[T0_4]], [[TMP58]]
 ; CHECK-NEXT:    [[ARRAYIDX2_4:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV_NEXT_3]]
-; CHECK-NEXT:    store double [[TMP90]], ptr [[ARRAYIDX2_4]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    store double [[TMP66]], ptr [[ARRAYIDX2_4]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_4:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 5
 ; CHECK-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV_NEXT_4]]
 ; CHECK-NEXT:    [[T0_5:%.*]] = load double, ptr [[ARRAYIDX_5]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP91:%.*]] = fmul fast double [[T0_5]], [[TMP83]]
+; CHECK-NEXT:    [[TMP67:%.*]] = fmul fast double [[T0_5]], [[TMP59]]
 ; CHECK-NEXT:    [[ARRAYIDX2_5:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV_NEXT_4]]
-; CHECK-NEXT:    store double [[TMP91]], ptr [[ARRAYIDX2_5]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    store double [[TMP67]], ptr [[ARRAYIDX2_5]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_5:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 6
 ; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV_NEXT_5]]
 ; CHECK-NEXT:    [[T0_6:%.*]] = load double, ptr [[ARRAYIDX_6]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP92:%.*]] = fmul fast double [[T0_6]], [[TMP84]]
+; CHECK-NEXT:    [[TMP68:%.*]] = fmul fast double [[T0_6]], [[TMP60]]
 ; CHECK-NEXT:    [[ARRAYIDX2_6:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV_NEXT_5]]
-; CHECK-NEXT:    store double [[TMP92]], ptr [[ARRAYIDX2_6]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    store double [[TMP68]], ptr [[ARRAYIDX2_6]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_6:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 7
 ; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV_NEXT_6]]
 ; CHECK-NEXT:    [[T0_7:%.*]] = load double, ptr [[ARRAYIDX_7]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT:    [[TMP93:%.*]] = fmul fast double [[T0_7]], [[TMP85]]
+; CHECK-NEXT:    [[TMP69:%.*]] = fmul fast double [[T0_7]], [[TMP61]]
 ; CHECK-NEXT:    [[ARRAYIDX2_7:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV_NEXT_6]]
-; CHECK-NEXT:    store double [[TMP93]], ptr [[ARRAYIDX2_7]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT:    store double [[TMP69]], ptr [[ARRAYIDX2_7]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_7]] = add nuw nsw i64 [[INDVARS_IV]], 8
 ; CHECK-NEXT:    [[EXITCOND_NOT_7:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_7]], [[WIDE_TRIP_COUNT]]
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT_7]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
index 77cbc70ff369..334405b12e65 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
@@ -47,24 +47,18 @@ define dso_local noundef <4 x float> @ConvertVectors_ByRef(ptr noundef nonnull a
 define noundef <4 x float> @ConvertVectors_ByVal(ptr noundef nonnull align 16 dereferenceable(16) %V) #0 {
 ; SSE-LABEL: @ConvertVectors_ByVal(
 ; SSE-NEXT:  entry:
-; SSE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[V:%.*]], align 16
-; SSE-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[V]], i64 8
-; SSE-NEXT:    [[V_VAL421:%.*]] = load i64, ptr [[TMP1]], align 8
-; SSE-NEXT:    [[TMP2:%.*]] = trunc i64 [[V_VAL421]] to i32
-; SSE-NEXT:    [[TMP3:%.*]] = bitcast i32 [[TMP2]] to float
-; SSE-NEXT:    [[VECINIT11:%.*]] = insertelement <4 x float> [[TMP0]], float [[TMP3]], i64 2
-; SSE-NEXT:    [[VECINIT16:%.*]] = insertelement <4 x float> [[VECINIT11]], float [[TMP3]], i64 3
+; SSE-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[V:%.*]], align 16
+; SSE-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <4 x float>
+; SSE-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[TMP0]] to <4 x float>
+; SSE-NEXT:    [[VECINIT16:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[BC]], <4 x i32> <i32 0, i32 1, i32 6, i32 6>
 ; SSE-NEXT:    ret <4 x float> [[VECINIT16]]
 ;
 ; AVX-LABEL: @ConvertVectors_ByVal(
 ; AVX-NEXT:  entry:
-; AVX-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[V:%.*]], align 16
-; AVX-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[V]], i64 8
-; AVX-NEXT:    [[V_VAL421:%.*]] = load i64, ptr [[TMP1]], align 8
-; AVX-NEXT:    [[TMP2:%.*]] = trunc i64 [[V_VAL421]] to i32
-; AVX-NEXT:    [[TMP3:%.*]] = bitcast i32 [[TMP2]] to float
-; AVX-NEXT:    [[VECINIT11:%.*]] = insertelement <4 x float> [[TMP0]], float [[TMP3]], i64 2
-; AVX-NEXT:    [[VECINIT16:%.*]] = insertelement <4 x float> [[VECINIT11]], float [[TMP3]], i64 3
+; AVX-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[V:%.*]], align 16
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <4 x float>
+; AVX-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[TMP0]] to <4 x float>
+; AVX-NEXT:    [[VECINIT16:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[BC]], <4 x i32> <i32 0, i32 1, i32 6, i32 6>
 ; AVX-NEXT:    ret <4 x float> [[VECINIT16]]
 ;
 entry:
RKSimon commented 1 year ago

Alternatively - is there anything we can add/move before the GVN pass to improve its chances?

rotateright commented 1 year ago

I've been searching for something that could allow us to get the necessary transforms without adding a pass, and I can't see anything.

The existing GVN + InstCombine run creates the pattern that allows VectorCombine to form a wide load, and only after that load exists can another round of GVN + InstCombine combine the loads.

Also note that the GVN transform has been flagged as not poison-safe, so there's a chance that we'll lose the optimization on the first example. :(

RKSimon commented 1 year ago

CC @nikic - this was one of the tickets I mentioned at EuroLLVM regarding possible improvements to FunctionAttr/Attributor/GVN, or adding an additional GVN run.