Zero/AllOnes XMM / YMM registers are treated separately #26017

Open Quuxplusone opened 8 years ago

Quuxplusone commented 8 years ago
Bugzilla Link PR26018
Status NEW
Importance P normal
Reported by Simon Pilgrim (llvm-dev@redking.me.uk)
Reported on 2016-01-04 16:00:48 -0800
Last modified on 2020-03-23 07:10:20 -0700
Version trunk
Hardware PC All
CC andrea.dibiagio@gmail.com, craig.topper@gmail.com, dtemirbulatov@gmail.com, lebedev.ri@gmail.com, llvm-bugs@lists.llvm.org, spatel+llvm@rotateright.com
Fixed by commit(s)
Attachments
Blocks
Blocked by
See also PR32862, PR9588, PR42653, PR43691, PR39381
It should be possible to share 128-bit and 256-bit zero vector registers
instead of generating them separately, which increases the instruction count
and wastes registers.

Zero ZMM registers probably have the same issue.

As a stretch goal, it might be possible to recognise that a VEX-encoded 128-bit
instruction implicitly zeroes the upper bits of the full register and make use
of that.
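
At the source level, the sharing opportunity looks roughly like the sketch
below. This is a hand-written illustration, not output from the bug: it assumes
a compiler that provides _mm256_zextps128_ps256 (recent clang/gcc do), and
simply shows that one VEX-encoded 128-bit vxorps already produces a valid
256-bit zero because the upper lanes are implicitly cleared.

#include <immintrin.h>

// Illustration only: both stores below could be fed from ONE vxorps, since the
// VEX-encoded 128-bit xor also zeroes bits 255:128 of the full ymm register.
void store_zeros(__m128 *p128, __m256 *p256) {
  __m128 z128 = _mm_setzero_ps();              // vxorps %xmm0, %xmm0, %xmm0
  __m256 z256 = _mm256_zextps128_ps256(z128);  // upper half is already zero
  *p128 = z128;
  *p256 = z256;
}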

Example: llvm/test/CodeGen/X86/2012-01-12-extract-sv.ll

define void @endless_loop() {
; CHECK-LABEL: endless_loop:
; CHECK-NEXT:  # BB#0:
; CHECK-NEXT:    vmovaps (%eax), %ymm0
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; CHECK-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2 <-- XMM ZERO
; CHECK-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
; CHECK-NEXT:    vxorps %ymm2, %ymm2, %ymm2 <-- YMM ZERO
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7]
; CHECK-NEXT:    vmovaps %ymm0, (%eax)
; CHECK-NEXT:    vmovaps %ymm1, (%eax)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retl
entry:
  %0 = load <8 x i32>, <8 x i32> addrspace(1)* undef, align 32
  %1 = shufflevector <8 x i32> %0, <8 x i32> undef, <16 x i32> <i32 4, i32 4, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = shufflevector <16 x i32> <i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 undef>, <16 x i32> %1, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 17>
  store <16 x i32> %2, <16 x i32> addrspace(1)* undef, align 64
  ret void
}
Quuxplusone commented 7 years ago
Although D35839/rL309298 fixed the original test example, it's still not
difficult to break this (although it now generates two xmm zero registers
rather than an xmm and a ymm):

#include <x86intrin.h>

void foo(__m128 a, __m256 b, __m128 *f128, __m256 *f256) {
  a = _mm_blend_ps(a, _mm_setzero_ps(), 0x3);
  b = _mm256_blend_ps(b, _mm256_setzero_ps(), 0x3);
  *f128++ = a;
  *f256++ = b;
}

llc -mtriple=x86_64-unknown-unknown -mcpu=btver2

define void @foo(<4 x float>, <8 x float>, <4 x float>* nocapture, <8 x float>* nocapture) {
  %5 = shufflevector <4 x float> %0, <4 x float> <float 0.000000e+00, float 0.000000e+00, float undef, float undef>, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  %6 = shufflevector <8 x float> %1, <8 x float> <float 0.000000e+00, float 0.000000e+00, float undef, float undef, float undef, float undef, float undef, float undef>, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <4 x float> %5, <4 x float>* %2, align 16, !tbaa !2
  store <8 x float> %6, <8 x float>* %3, align 32, !tbaa !2
  ret void
}

foo:
  vxorpd %xmm2, %xmm2, %xmm2 # <-- XMM ZERO
  vblendpd $1, %xmm2, %xmm0, %xmm0 # xmm0 = xmm2[0],xmm0[1]
  vxorpd %xmm2, %xmm2, %xmm2 # <-- duplicate XMM ZERO (feeds the YMM blend below)
  vblendpd $1, %ymm2, %ymm1, %ymm1 # ymm1 = ymm2[0],ymm1[1,2,3]
  vmovapd %xmm0, (%rdi)
  vmovapd %ymm1, (%rsi)
  retq
Quuxplusone commented 4 years ago

Current Codegen: https://gcc.godbolt.org/z/8Jny_X

Quuxplusone commented 4 years ago

The allones case is just as bad: https://gcc.godbolt.org/z/-XS8f8
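
For a self-contained reproducer (hypothetical, not the exact code behind the
godbolt link): any function that needs both a 128-bit and a 256-bit all-ones
constant shows the same duplication, with the two constants materialized
independently (e.g. two vpcmpeqd idioms or constant-pool loads) instead of
being shared.

#include <immintrin.h>

// Compile with e.g. -O2 -mavx2: both NOTs need an all-ones constant, one
// 128-bit and one 256-bit, and current codegen materializes them separately.
void not_both(__m128i a, __m256i b, __m128i *p128, __m256i *p256) {
  *p128 = _mm_xor_si128(a, _mm_set1_epi32(-1));
  *p256 = _mm256_xor_si256(b, _mm256_set1_epi32(-1));
}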

Quuxplusone commented 4 years ago
I spent some time looking at how to wrestle SDAG or the later passes into doing
what we want for the zero side of this, but I don't have a fix yet.

Let me list some comments/experiments that I tried here so I don't forget (or in
case someone else wants to try fixing it):

1. The zero "instructions" are selected as post-RA pseudo instructions, so they
bypass most machine IR optimizations.

2. Change the tablegen patterns to make 128-bit zero an extract of 256-bit zero:

let Predicates = [UseAVX,NoAVX512] in {
def : Pat<(v16i8 immAllZerosV), (EXTRACT_SUBREG (AVX_SET0), sub_xmm)>;
def : Pat<(v8i16 immAllZerosV), (EXTRACT_SUBREG (AVX_SET0), sub_xmm)>;
def : Pat<(v4i32 immAllZerosV), (EXTRACT_SUBREG (AVX_SET0), sub_xmm)>;
def : Pat<(v2i64 immAllZerosV), (EXTRACT_SUBREG (AVX_SET0), sub_xmm)>;
def : Pat<(v4f32 immAllZerosV), (EXTRACT_SUBREG (AVX_SET0), sub_xmm)>;
def : Pat<(v2f64 immAllZerosV), (EXTRACT_SUBREG (AVX_SET0), sub_xmm)>;
}

This allows the expected CSE, but it leads to >100 regression test failures.
Most are benign RA/scheduling diffs, but there's at least one real problem: we
incur many false issuances of "vzeroupper" (which has its own late machine
pass) because this change creates an implicit YMM usage. We may want to improve
VZeroUpperInserter() independently to account for that.

3. Use X86DAGToDAGISel::PreprocessISelDAG() or
X86DAGToDAGISel::PostprocessISelDAG() to extract the 128-bit zero from a
256-bit zero only when a 256-bit zero already exists (using
CurDAG->getNodeIfExists()). I couldn't get this to work as expected, but that's
probably just me not understanding the nuances of DAG nodes in this in-between
state.

4. Lower the 128-bit zero as an extract of a 256-bit zero in
X86TargetLowering::LowerBUILD_VECTOR(). This loops infinitely because we
constant-fold the extract (a rough sketch of this attempt is at the end of this
comment).

5. There's a side issue to #4 that we might want to change independently: we
don't canonicalize the build_vector zero to a specific type and without undefs.
Doing that would reduce the number of tablegen patterns, and it also leads to a
potential improvement on at least one regression test via better shuffle
combining. See getZeroVector().
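
For reference, the attempt in #4 amounted to something like the following
inside LowerBUILD_VECTOR. This is an illustrative, untested sketch rather than
the actual patch; the guard conditions and local variable names are
assumptions.

  // Inside X86TargetLowering::LowerBUILD_VECTOR (illustrative only).
  MVT VT = Op.getSimpleValueType();
  SDLoc DL(Op);
  if (ISD::isBuildVectorAllZeros(Op.getNode()) && VT.is128BitVector() &&
      Subtarget.hasAVX()) {
    MVT WideVT = MVT::getVectorVT(VT.getVectorElementType(),
                                  VT.getVectorNumElements() * 2);
    SDValue WideZero = getZeroVector(WideVT, Subtarget, DAG, DL);
    // DAGCombiner constant-folds this EXTRACT_SUBVECTOR straight back into a
    // 128-bit all-zeros build_vector, which re-enters this lowering: the
    // infinite loop described above.
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WideZero,
                       DAG.getIntPtrConstant(0, DL));
  }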
Quuxplusone commented 4 years ago
(In reply to Sanjay Patel from comment #4)
> 1. The zero "instructions" are selected as post-RA pseudo instructions, so
> they bypass most machine IR optimizations.

Forgot to add: the reason we do this is to allow the zeros/ones value to be
load-folded from memory if we've run out of registers to hold the value created
via xorps/pcmpeq. If that problem were handled differently (do we want to
revisit whether it's really a problem?), then the MachineCSE pass would
presumably be able to deal with the duplicate instructions that we are seeing
in this report.
Quuxplusone commented 4 years ago

The zero case is a pain as we just need the xmm zero (we can rely on the implicit zeroing from vxorps). The allones case could be trickier, so maybe ensure that whatever solution you go for works for that too?

Is the MachineCSE issue just dealing with the zero/allones cases? I imagine there's something that allows reloads of constant pool values in a similar way; does that handle rematerializable values as well, including creating a constant pool entry instead?

Quuxplusone commented 4 years ago
(In reply to Simon Pilgrim from comment #6)
> The zero case is a pain as we just need the xmm zero (we can rely on the
> implicit zeroing from vxorps). The allones case could be trickier so maybe
> ensure that whatever solution you go for works for that?
>
> Is the MachineCSE issue just dealing with the zero/allones cases? I imagine
> there's something that allows re-loads of constant pool values in a similar
> way - does that handle rematerializable values as well, including creating a
> constant pool entry instead?

Disclaimer: I've never touched MachineCSE. :)

At first glance, MachineCSE works only by matching exactly equivalent
instructions. So if we had:
vxorps %xmm1, %xmm1, %xmm1
vxorps %xmm2, %xmm2, %xmm2
...then it wouldn't recognize those as the same zero value.

Also, I'd guess that because we list these constants as easily rematerializable,
they're not normal candidates for reloading or CSE.
Quuxplusone commented 4 years ago

getZeroVector used to canonicalize to vXi32, and we used to match bitcasts in isel patterns. But isel now has special support for all-zeroes/all-ones and can peek through bitcasts without them being explicitly listed. Once that was in place, there wasn't much reason to canonicalize to the same type. The extra bitcasts are probably worse for our analysis in combine/lowering.

Canonicalizing to remove undefs is more interesting. Will DAG combine try to remove any zeroes we put back in if they aren't demanded? Thus triggering an infinite loop when we run lowering and DAG combine together in the last combine stage?

Quuxplusone commented 4 years ago
(In reply to Craig Topper from comment #8)
> Canonicalizing to remove undefs is more interesting. Will DAG combine try to
> remove any zeroes we put back in if they aren't demanded? Thus triggering an
> infinite loop when we run lowering and DAG combine together in the last
> combine stage?

SimplifyDemanded* doesn't touch splat/uniform build_vectors like that.
Technically we could for non-constant cases, but there were a lot of weird
regressions, so I gave up the last time I tried :-(
Quuxplusone commented 4 years ago
(In reply to Craig Topper from comment #8)
> Canonicalizing to remove undefs is more interesting. Will DAG combine try to
> remove any zeroes we put back in if they aren't demanded? Thus triggering an
> infinite loop when we run lowering and DAG combine together in the last
> combine stage?

Good question. I didn't check to see if something would prevent that, but I
don't see any regression test failures with that change. The patch experiment
that I tried is pasted below. I'm not sure if we'd call the "insertps" test diff
a win, but avoiding the GPR->XMM transfer on the other test is probably better
for most targets?

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 8e8a7cce9fb..bfa0c73466a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -9615,7 +9615,7 @@ static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,

   // Vectors containing all zeros can be matched by pxor and xorps.
   if (ISD::isBuildVectorAllZeros(Op.getNode()))
-    return Op;
+    return getZeroVector(VT, Subtarget, DAG, DL);

   // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
   // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
diff --git a/llvm/test/CodeGen/X86/fold-load-vec.ll b/llvm/test/CodeGen/X86/fold-load-vec.ll
index e8dc8f26ffa..115f2bf7a5b 100644
--- a/llvm/test/CodeGen/X86/fold-load-vec.ll
+++ b/llvm/test/CodeGen/X86/fold-load-vec.ll
@@ -12,7 +12,7 @@ define void @sample_test(<4 x float>* %source, <2 x float>* %dest) nounwind {
 ; CHECK-NEXT:    movq %rsi, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    xorps %xmm0, %xmm0
 ; CHECK-NEXT:    movlps %xmm0, (%rsp)
-; CHECK-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
 ; CHECK-NEXT:    movlps %xmm0, (%rsp)
 ; CHECK-NEXT:    movlps %xmm0, (%rsi)
 ; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rax
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
index 32709b4fd5d..72cbad7cd68 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
@@ -759,14 +759,12 @@ define <16 x i8> @constant_fold_pshufb() {
 define <16 x i8> @constant_fold_pshufb_2() {
 ; SSE-LABEL: constant_fold_pshufb_2:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movl $2, %eax
-; SSE-NEXT:    movd %eax, %xmm0
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: constant_fold_pshufb_2:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    movl $2, %eax
-; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
 ; AVX-NEXT:    retq
   %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> <i8 2, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
   ret <16 x i8> %1