llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.
http://llvm.org

Zero/AllOnes XMM / YMM registers are treated separately #26392

Open RKSimon opened 8 years ago

RKSimon commented 8 years ago
Bugzilla Link 26018
Version trunk
OS All
CC @adibiagio,@topperc,@dtemirbulatov,@LebedevRI,@rotateright

Extended Description

It should be possible to share 128-bit and 256-bit zero vector registers instead of generating them separately, which increases the instruction count and wastes registers.

Zero ZMM registers probably have the same issue.

As a stretch goal, it might be possible to recognise that a VEX-encoded 128-bit instruction implicitly zeroes the upper bits of its destination register and take advantage of that.

Example: llvm/test/CodeGen/X86/2012-01-12-extract-sv.ll

define void @endless_loop() {
; CHECK-LABEL: endless_loop:
; CHECK-NEXT:  # BB#0:
; CHECK-NEXT:    vmovaps (%eax), %ymm0
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; CHECK-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2 <-- XMM ZERO
; CHECK-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
; CHECK-NEXT:    vxorps %ymm2, %ymm2, %ymm2 <-- YMM ZERO
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7]
; CHECK-NEXT:    vmovaps %ymm0, (%eax)
; CHECK-NEXT:    vmovaps %ymm1, (%eax)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retl
entry:
  %0 = load <8 x i32>, <8 x i32> addrspace(1)* undef, align 32
  %1 = shufflevector <8 x i32> %0, <8 x i32> undef, <16 x i32> <i32 4, i32 4, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = shufflevector <16 x i32> <i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 undef>, <16 x i32> %1, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i$
  store <16 x i32> %2, <16 x i32> addrspace(1)* undef, align 64
  ret void
}
rotateright commented 4 years ago

> Canonicalizing to remove undefs is more interesting. Will DAG combine try to remove any zeroes we put back in if they aren't demanded? Thus triggering an infinite loop when we run lowering and DAG combine together in the last combine stage?

Good question. I didn't check whether something would prevent that, but I saw no regression test failures when I tried that change. The patch experiment I tried is pasted below. I'm not sure we'd call the "insertps" test diff a win, but avoiding the GPR->XMM transfer in the other test is probably better for most targets?

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 8e8a7cce9fb..bfa0c73466a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -9615,7 +9615,7 @@ static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,

   // Vectors containing all zeros can be matched by pxor and xorps.
   if (ISD::isBuildVectorAllZeros(Op.getNode()))
-    return Op;
+    return getZeroVector(VT, Subtarget, DAG, DL);

   // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
   // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
diff --git a/llvm/test/CodeGen/X86/fold-load-vec.ll b/llvm/test/CodeGen/X86/fold-load-vec.ll
index e8dc8f26ffa..115f2bf7a5b 100644
--- a/llvm/test/CodeGen/X86/fold-load-vec.ll
+++ b/llvm/test/CodeGen/X86/fold-load-vec.ll
@@ -12,7 +12,7 @@ define void @sample_test(<4 x float>* %source, <2 x float>* %dest) nounwind {
 ; CHECK-NEXT:    movq %rsi, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    xorps %xmm0, %xmm0
 ; CHECK-NEXT:    movlps %xmm0, (%rsp)
-; CHECK-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
 ; CHECK-NEXT:    movlps %xmm0, (%rsp)
 ; CHECK-NEXT:    movlps %xmm0, (%rsi)
 ; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rax
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
index 32709b4fd5d..72cbad7cd68 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
@@ -759,14 +759,12 @@ define <16 x i8> @constant_fold_pshufb() {
 define <16 x i8> @constant_fold_pshufb_2() {
 ; SSE-LABEL: constant_fold_pshufb_2:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movl $2, %eax
-; SSE-NEXT:    movd %eax, %xmm0
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: constant_fold_pshufb_2:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    movl $2, %eax
-; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
 ; AVX-NEXT:    retq
   %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> <i8 2, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
   ret <16 x i8> %1
RKSimon commented 4 years ago

> Canonicalizing to remove undefs is more interesting. Will DAG combine try to remove any zeroes we put back in if they aren't demanded? Thus triggering an infinite loop when we run lowering and DAG combine together in the last combine stage?

SimplifyDemanded* doesn't touch splat/uniform build_vectors like that. Technically we could for non-constant cases, but there are a lot of weird regressions, so I gave up the last time I tried :-(

topperc commented 4 years ago

getZeroVector used to canonicalize to vXi32, and we used to match bitcasts in isel patterns. But isel now has special support for all zeroes/ones and can peek through bitcasts without them being explicitly listed. Once that was in place there wasn't much reason to canonicalize to the same type, and the extra bitcasts are probably worse for our analysis in combine/lowering.
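
As a rough illustration of that canonicalization idea (a sketch only, with a hypothetical helper name, not the in-tree getZeroVector code), the zero could always be built as a vXi32 splat with no undef lanes and then bitcast back, so equal-width zeros CSE to a single node:

  // Sketch: canonicalize an all-zeros vector to vXi32 with no undef lanes,
  // then bitcast to the requested type so equal-width zeros share one SDNode.
  static SDValue getCanonicalZeroVector(MVT VT, SelectionDAG &DAG,
                                        const SDLoc &DL) {
    unsigned NumElts = VT.getFixedSizeInBits() / 32; // 128 -> 4, 256 -> 8, 512 -> 16
    SDValue Zero = DAG.getConstant(0, DL, MVT::getVectorVT(MVT::i32, NumElts));
    return DAG.getBitcast(VT, Zero); // the extra bitcast is the downside noted above
  }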

Canonicalizing to remove undefs is more interesting. Will DAG combine try to remove any zeroes we put back in if they aren't demanded? Thus triggering an infinite loop when we run lowering and DAG combine together in the last combine stage?

rotateright commented 4 years ago

> The zero case is a pain as we just need the xmm zero (we can rely on the implicit zeroing from vxorps). The allones case could be trickier so maybe ensure that whatever solution you go for works for that?

> Is the MachineCSE issue just dealing with the zero/allones cases? I imagine there's something that allows re-loads of constant pool values in a similar way - does that handle rematerializable values as well, including creating a constant pool entry instead?

Disclaimer: I've never touched MachineCSE. :)

At first glance, MachineCSE works only by matching exactly equivalent instructions. So if we had:

  vxorps %xmm1, %xmm1, %xmm1
  vxorps %xmm2, %xmm2, %xmm2

...then it wouldn't recognize those as the same zero value.

Also, I'd guess that because we list these constants as easily rematerializable, they're not normal candidates for reloading or CSE.
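
To make "exactly equivalent" concrete, identity-based matching is roughly the following (an illustrative helper, not code from the pass); with physical registers the two vxorps above differ in every operand, so they never compare equal:

  #include "llvm/CodeGen/MachineInstr.h"

  // Illustrative only: identity-based matching on machine instructions.
  // For "vxorps %xmm1, %xmm1, %xmm1" vs "vxorps %xmm2, %xmm2, %xmm2" the
  // opcodes match but every register operand differs, so this returns false
  // even though both materialize the same zero value.
  static bool looksLikeSameValue(const llvm::MachineInstr &A,
                                 const llvm::MachineInstr &B) {
    return A.isIdenticalTo(B); // default CheckDefs: defs and uses must match
  }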

RKSimon commented 4 years ago

The zero case is a pain as we just need the xmm zero (we can rely on the implicit zeroing from vxorps). The allones case could be trickier so maybe ensure that whatever solution you go for works for that?

Is the MachineCSE issue just dealing with the zero/allones cases? I imagine there's something that allows re-loads of constant pool values in a similar way - does that handle rematerializable values as well, including creating a constant pool entry instead?

rotateright commented 4 years ago
> 1. The zero "instructions" are selected as post-RA pseudo instructions, so they bypass most machine IR optimizations.

Forgot to add: the reason we do this is to allow the zero/ones to be load-folded from memory if we've run out of registers to hold the value via xorps/pcmpeq. If that problem were handled differently (do we want to revisit whether it's really a problem?), then the MachineCSE pass would presumably be able to deal with the duplicate instructions that we are seeing in this report.

rotateright commented 4 years ago

I spent some time looking at how to wrestle SDAG or later into doing what we want for the zero side of this, but I don't have a fix yet.

Let me list some comments/experiments that I tried here so I don't forget (or if someone else wants to try fixing):

  1. The zero "instructions" are selected as post-RA pseudo instructions, so they bypass most machine IR optimizations.

  2. Change the tablegen patterns to make 128-bit zero an extract of 256-bit zero:

let Predicates = [UseAVX,NoAVX512] in {
  def : Pat<(v16i8 immAllZerosV), (EXTRACT_SUBREG (AVX_SET0), sub_xmm)>;
  def : Pat<(v8i16 immAllZerosV), (EXTRACT_SUBREG (AVX_SET0), sub_xmm)>;
  def : Pat<(v4i32 immAllZerosV), (EXTRACT_SUBREG (AVX_SET0), sub_xmm)>;
  def : Pat<(v2i64 immAllZerosV), (EXTRACT_SUBREG (AVX_SET0), sub_xmm)>;
  def : Pat<(v4f32 immAllZerosV), (EXTRACT_SUBREG (AVX_SET0), sub_xmm)>;
  def : Pat<(v2f64 immAllZerosV), (EXTRACT_SUBREG (AVX_SET0), sub_xmm)>;
}

This allows the expected CSE, but it leads to over 100 regression test failures. Most are benign RA/scheduling diffs, but there's at least one real problem: we incur many false issuances of "vzeroupper" (which has its own late machine pass) because this change creates an implicit YMM use. We may want to improve VZeroUpperInserter() independently to account for that.

  3. Use X86DAGToDAGISel::PreprocessISelDAG() or X86DAGToDAGISel::PostprocessISelDAG() to extract the 128-bit zero from the 256-bit zero only when a 256-bit zero exists (using CurDAG->getNodeIfExists()). I couldn't get this to work as expected, but that's probably just me not understanding the nuances of DAG nodes in this in-between state.

  4. Lower the 128-bit zero as an extract of the 256-bit zero in X86TargetLowering::LowerBUILD_VECTOR(). This infinite-loops because we constant-fold the extract (see the sketch after this list).

  5. There's a side issue to item 4 that we might want to change independently: we don't canonicalize the build_vector zero to a specific type and without undefs. Doing that would reduce the tablegen patterns, and it also leads to a potential improvement on at least one regression test via better shuffle combining. See getZeroVector().
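
For reference, the DAG-level shape that experiments 2 and 4 were aiming for looks roughly like this (a sketch only, not a working patch):

  // Sketch: re-express a 128-bit zero as a subvector of the 256-bit zero so
  // both widths end up sharing one ymm register after selection.
  SDValue WideZero = DAG.getConstant(0, DL, MVT::v8i32);
  SDValue NarrowZero =
      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, WideZero,
                  DAG.getIntPtrConstant(0, DL));
  // Constant folding turns this straight back into a v4i32 all-zeros
  // build_vector, which is why doing it in LowerBUILD_VECTOR (item 4)
  // infinite-loops; item 3 tried to sidestep that in Pre/PostprocessISelDAG.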

RKSimon commented 4 years ago

The allones case is just as bad: https://gcc.godbolt.org/z/-XS8f8

RKSimon commented 4 years ago

Current Codegen: https://gcc.godbolt.org/z/8Jny_X

RKSimon commented 7 years ago

Although D35839/rL309298 fixed the original test example, it's still not difficult to break this (although it now generates two xmm zero registers rather than an xmm and a ymm):

#include <x86intrin.h>

void foo(__m128 a, __m256 b, __m128 *f128, __m256 *f256) {
  a = _mm_blend_ps(a, _mm_setzero_ps(), 0x3);
  b = _mm256_blend_ps(b, _mm256_setzero_ps(), 0x3);
  *f128++ = a;
  *f256++ = b;
}

llc -mtriple=x86_64-unknown-unknown -mcpu=btver2

define void @foo(<4 x float>, <8 x float>, <4 x float>* nocapture, <8 x float>* nocapture) {
  %5 = shufflevector <4 x float> %0, <4 x float> <float 0.000000e+00, float 0.000000e+00, float undef, float undef>, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  %6 = shufflevector <8 x float> %1, <8 x float> <float 0.000000e+00, float 0.000000e+00, float undef, float undef, float undef, float undef, float undef, float undef>, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <4 x float> %5, <4 x float>* %2, align 16, !tbaa !2
  store <8 x float> %6, <8 x float>* %3, align 32, !tbaa !2
  ret void
}
foo:
  vxorpd %xmm2, %xmm2, %xmm2
  vblendpd $1, %xmm2, %xmm0, %xmm0 # xmm0 = xmm2[0],xmm0[1]
  vxorpd %xmm2, %xmm2, %xmm2
  vblendpd $1, %ymm2, %ymm1, %ymm1 # ymm1 = ymm2[0],ymm1[1,2,3]
  vmovapd %xmm0, (%rdi)
  vmovapd %ymm1, (%rsi)
  retq