Zero/AllOnes XMM / YMM registers are treated separately #26017

Open Quuxplusone opened 8 years ago

Quuxplusone commented 8 years ago
Bugzilla Link PR26018
Status NEW
Importance P normal
Reported by Simon Pilgrim (llvm-dev@redking.me.uk)
Reported on 2016-01-04 16:00:48 -0800
Last modified on 2020-03-23 07:10:20 -0700
Version trunk
Hardware PC All
CC andrea.dibiagio@gmail.com, craig.topper@gmail.com, dtemirbulatov@gmail.com, lebedev.ri@gmail.com, llvm-bugs@lists.llvm.org, spatel+llvm@rotateright.com
Fixed by commit(s)
Attachments
Blocks
Blocked by
See also PR32862, PR9588, PR42653, PR43691, PR39381
It should be possible to share 128-bit and 256-bit zero vector registers
instead of generating them separately, which increases the instruction count
and wastes registers.

Zero ZMM registers probably have the same issue.

As a stretch goal, it might be possible to recognise that a VEX-encoded 128-bit
instruction implicitly zeroes the upper bits of the full register and make use
of that.
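
At the source level, the sharing opportunity looks roughly like the sketch
below. This is a hand-written illustration, not output from the bug: it assumes
a compiler that provides _mm256_zextps128_ps256 (recent clang/gcc do), and
simply shows that one VEX-encoded 128-bit vxorps already produces a valid
256-bit zero because the upper lanes are implicitly cleared.

#include <immintrin.h>

// Illustration only: both stores below could be fed from ONE vxorps, since the
// VEX-encoded 128-bit xor also zeroes bits 255:128 of the full ymm register.
void store_zeros(__m128 *p128, __m256 *p256) {
  __m128 z128 = _mm_setzero_ps();              // vxorps %xmm0, %xmm0, %xmm0
  __m256 z256 = _mm256_zextps128_ps256(z128);  // upper half is already zero
  *p128 = z128;
  *p256 = z256;
}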

Example: llvm/test/CodeGen/X86/2012-01-12-extract-sv.ll

define void @endless_loop() {
; CHECK-LABEL: endless_loop:
; CHECK-NEXT:  # BB#0:
; CHECK-NEXT:    vmovaps (%eax), %ymm0
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; CHECK-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2 <-- XMM ZERO
; CHECK-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
; CHECK-NEXT:    vxorps %ymm2, %ymm2, %ymm2 <-- YMM ZERO
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7]
; CHECK-NEXT:    vmovaps %ymm0, (%eax)
; CHECK-NEXT:    vmovaps %ymm1, (%eax)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retl
entry:
  %0 = load <8 x i32>, <8 x i32> addrspace(1)* undef, align 32
  %1 = shufflevector <8 x i32> %0, <8 x i32> undef, <16 x i32> <i32 4, i32 4, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = shufflevector <16 x i32> <i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 undef>, <16 x i32> %1, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 17>
  store <16 x i32> %2, <16 x i32> addrspace(1)* undef, align 64
  ret void
}
Quuxplusone commented 7 years ago
Although D35839/rL309298 fixed the original test example, it's still not
difficult to break this (although it now generates two xmm zero registers
rather than an xmm and a ymm):

#include <x86intrin.h>

void foo(__m128 a, __m256 b, __m128 *f128, __m256 *f256) {
  a = _mm_blend_ps(a, _mm_setzero_ps(), 0x3);
  b = _mm256_blend_ps(b, _mm256_setzero_ps(), 0x3);
  *f128++ = a;
  *f256++ = b;
}

llc -mtriple=x86_64-unknown-unknown -mcpu=btver2

define void @foo(<4 x float>, <8 x float>, <4 x float>* nocapture, <8 x float>* nocapture) {
  %5 = shufflevector <4 x float> %0, <4 x float> <float 0.000000e+00, float 0.000000e+00, float undef, float undef>, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  %6 = shufflevector <8 x float> %1, <8 x float> <float 0.000000e+00, float 0.000000e+00, float undef, float undef, float undef, float undef, float undef, float undef>, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <4 x float> %5, <4 x float>* %2, align 16, !tbaa !2
  store <8 x float> %6, <8 x float>* %3, align 32, !tbaa !2
  ret void
}

foo:
  vxorpd %xmm2, %xmm2, %xmm2 # <-- XMM ZERO
  vblendpd $1, %xmm2, %xmm0, %xmm0 # xmm0 = xmm2[0],xmm0[1]
  vxorpd %xmm2, %xmm2, %xmm2 # <-- duplicate XMM ZERO (feeds the YMM blend below)
  vblendpd $1, %ymm2, %ymm1, %ymm1 # ymm1 = ymm2[0],ymm1[1,2,3]
  vmovapd %xmm0, (%rdi)
  vmovapd %ymm1, (%rsi)
  retq
Quuxplusone commented 4 years ago

Current Codegen: https://gcc.godbolt.org/z/8Jny_X

Quuxplusone commented 4 years ago

The allones case is just as bad: https://gcc.godbolt.org/z/-XS8f8
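
For a self-contained reproducer (hypothetical, not the exact code behind the
godbolt link): any function that needs both a 128-bit and a 256-bit all-ones
constant shows the same duplication, with the two constants materialized
independently (e.g. two vpcmpeqd idioms or constant-pool loads) instead of
being shared.

#include <immintrin.h>

// Compile with e.g. -O2 -mavx2: both NOTs need an all-ones constant, one
// 128-bit and one 256-bit, and current codegen materializes them separately.
void not_both(__m128i a, __m256i b, __m128i *p128, __m256i *p256) {
  *p128 = _mm_xor_si128(a, _mm_set1_epi32(-1));
  *p256 = _mm256_xor_si256(b, _mm256_set1_epi32(-1));
}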

Quuxplusone commented 4 years ago
I spent some time looking at how to wrestle SDAG or the later passes into doing
what we want for the zero side of this, but I don't have a fix yet.

Let me list some comments/experiments that I tried here so I don't forget (or in
case someone else wants to try fixing it):

1. The zero "instructions" are selected as post-RA pseudo instructions, so they
bypass most machine IR optimizations.

2. Change the tablegen patterns to make 128-bit zero an extract of 256-bit zero:

let Predicates = [UseAVX,NoAVX512] in {
def : Pat<(v16i8 immAllZerosV), (EXTRACT_SUBREG (AVX_SET0), sub_xmm)>;
def : Pat<(v8i16 immAllZerosV), (EXTRACT_SUBREG (AVX_SET0), sub_xmm)>;
def : Pat<(v4i32 immAllZerosV), (EXTRACT_SUBREG (AVX_SET0), sub_xmm)>;
def : Pat<(v2i64 immAllZerosV), (EXTRACT_SUBREG (AVX_SET0), sub_xmm)>;
def : Pat<(v4f32 immAllZerosV), (EXTRACT_SUBREG (AVX_SET0), sub_xmm)>;
def : Pat<(v2f64 immAllZerosV), (EXTRACT_SUBREG (AVX_SET0), sub_xmm)>;
}

This allows the expected CSE, but it leads to >100 regression test failures.
Most are benign RA/scheduling diffs, but there's at least one real problem: we
incur many false issuances of "vzeroupper" (which has its own late machine
pass) because this change creates an implicit YMM usage. We may want to improve
VZeroUpperInserter() independently to account for that.

3. Use X86DAGToDAGISel::PreprocessISelDAG() or
X86DAGToDAGISel::PostprocessISelDAG() to extract the 128-bit zero from a
256-bit zero only when a 256-bit zero already exists (using
CurDAG->getNodeIfExists()). I couldn't get this to work as expected, but that's
probably just me not understanding the nuances of DAG nodes in this in-between
state.

4. Lower the 128-bit zero as an extract of a 256-bit zero in
X86TargetLowering::LowerBUILD_VECTOR(). This loops infinitely because we
constant-fold the extract (a rough sketch of this attempt is at the end of this
comment).

5. There's a side issue to #4 that we might want to change independently: we
don't canonicalize the build_vector zero to a specific type and without undefs.
Doing that would reduce the number of tablegen patterns, and it also leads to a
potential improvement on at least one regression test via better shuffle
combining. See getZeroVector().
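
For reference, the attempt in #4 amounted to something like the following
inside LowerBUILD_VECTOR. This is an illustrative, untested sketch rather than
the actual patch; the guard conditions and local variable names are
assumptions.

  // Inside X86TargetLowering::LowerBUILD_VECTOR (illustrative only).
  MVT VT = Op.getSimpleValueType();
  SDLoc DL(Op);
  if (ISD::isBuildVectorAllZeros(Op.getNode()) && VT.is128BitVector() &&
      Subtarget.hasAVX()) {
    MVT WideVT = MVT::getVectorVT(VT.getVectorElementType(),
                                  VT.getVectorNumElements() * 2);
    SDValue WideZero = getZeroVector(WideVT, Subtarget, DAG, DL);
    // DAGCombiner constant-folds this EXTRACT_SUBVECTOR straight back into a
    // 128-bit all-zeros build_vector, which re-enters this lowering: the
    // infinite loop described above.
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WideZero,
                       DAG.getIntPtrConstant(0, DL));
  }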
Quuxplusone commented 4 years ago
(In reply to Sanjay Patel from comment #4)
> 1. The zero "instructions" are selected as post-RA pseudo instructions, so
> they bypass most machine IR optimizations.

Forgot to add: the reason we do this is to allow the zeros/ones value to be
load-folded from memory if we've run out of registers to hold the value created
via xorps/pcmpeq. If that problem were handled differently (do we want to
revisit whether it's really a problem?), then the MachineCSE pass would
presumably be able to deal with the duplicate instructions that we are seeing
in this report.
Quuxplusone commented 4 years ago

The zero case is a pain as we just need the xmm zero (we can rely on the implicit zeroing from vxorps). The allones case could be trickier, so maybe ensure that whatever solution you go for works for that too?

Is the MachineCSE issue just dealing with the zero/allones cases? I imagine there's something that allows reloads of constant pool values in a similar way; does that handle rematerializable values as well, including creating a constant pool entry instead?

Quuxplusone commented 4 years ago
(In reply to Simon Pilgrim from comment #6)
> The zero case is a pain as we just need the xmm zero (we can rely on the
> implicit zeroing from vxorps). The allones case could be trickier so maybe
> ensure that whatever solution you go for works for that?
>
> Is the MachineCSE issue just dealing with the zero/allones cases? I imagine
> there's something that allows re-loads of constant pool values in a similar
> way - does that handle rematerializable values as well, including creating a
> constant pool entry instead?

Disclaimer: I've never touched MachineCSE. :)

At first glance, MachineCSE works only by matching exactly equivalent
instructions. So if we had:
vxorps %xmm1, %xmm1, %xmm1
vxorps %xmm2, %xmm2, %xmm2
...then it wouldn't recognize those as the same zero value.

Also, I'd guess that because we list these constants as easily rematerializable,
they're not normal candidates for reloading or CSE.
Quuxplusone commented 4 years ago

getZeroVector used to canonicalize to vXi32, and we used to match bitcasts in isel patterns. But isel now has special support for all-zeroes/all-ones and can peek through bitcasts without them being explicitly listed. Once that was in place, there wasn't much reason to canonicalize to the same type. The extra bitcasts are probably worse for our analysis in combine/lowering.

Canonicalizing to remove undefs is more interesting. Will DAG combine try to remove any zeroes we put back in if they aren't demanded? Thus triggering an infinite loop when we run lowering and DAG combine together in the last combine stage?

Quuxplusone commented 4 years ago
(In reply to Craig Topper from comment #8)
> Canonicalizing to remove undefs is more interesting. Will DAG combine try to
> remove any zeroes we put back in if they aren't demanded? Thus triggering an
> infinite loop when we run lowering and DAG combine together in the last
> combine stage?

SimplifyDemanded* doesn't touch splat/uniform build_vectors like that.
Technically we could for non-constant cases, but there were a lot of weird
regressions, so I gave up the last time I tried :-(
Quuxplusone commented 4 years ago
(In reply to Craig Topper from comment #8)
> Canonicalizing to remove undefs is more interesting. Will DAG combine try to
> remove any zeroes we put back in if they aren't demanded? Thus triggering an
> infinite loop when we run lowering and DAG combine together in the last
> combine stage?

Good question. I didn't check to see if something would prevent that, but I
don't see any regression test failures with that change. The patch experiment
that I tried is pasted below. I'm not sure if we'd call the "insertps" test diff
a win, but avoiding the GPR->XMM transfer on the other test is probably better
for most targets?

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 8e8a7cce9fb..bfa0c73466a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -9615,7 +9615,7 @@ static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,

   // Vectors containing all zeros can be matched by pxor and xorps.
   if (ISD::isBuildVectorAllZeros(Op.getNode()))
-    return Op;
+    return getZeroVector(VT, Subtarget, DAG, DL);

   // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
   // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
diff --git a/llvm/test/CodeGen/X86/fold-load-vec.ll b/llvm/test/CodeGen/X86/fold-load-vec.ll
index e8dc8f26ffa..115f2bf7a5b 100644
--- a/llvm/test/CodeGen/X86/fold-load-vec.ll
+++ b/llvm/test/CodeGen/X86/fold-load-vec.ll
@@ -12,7 +12,7 @@ define void @sample_test(<4 x float>* %source, <2 x float>* %dest) nounwind {
 ; CHECK-NEXT:    movq %rsi, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    xorps %xmm0, %xmm0
 ; CHECK-NEXT:    movlps %xmm0, (%rsp)
-; CHECK-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
 ; CHECK-NEXT:    movlps %xmm0, (%rsp)
 ; CHECK-NEXT:    movlps %xmm0, (%rsi)
 ; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rax
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
index 32709b4fd5d..72cbad7cd68 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
@@ -759,14 +759,12 @@ define <16 x i8> @constant_fold_pshufb() {
 define <16 x i8> @constant_fold_pshufb_2() {
 ; SSE-LABEL: constant_fold_pshufb_2:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movl $2, %eax
-; SSE-NEXT:    movd %eax, %xmm0
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: constant_fold_pshufb_2:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    movl $2, %eax
-; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
 ; AVX-NEXT:    retq
   %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> <i8 2, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
   ret <16 x i8> %1