llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.
http://llvm.org
Other
29.24k stars 12.07k forks source link

[X86] Failure to merge X86ISD::CVTPH2PS nodes #83414

Open RKSimon opened 8 months ago

RKSimon commented 8 months ago
; Reproducer: convert a <2 x half> vector to signed 32-bit integers and widen
; the result to <4 x i32>, filling the two upper lanes with zero.
define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) {
  ; Element-wise floating-point to signed-integer conversion of both half lanes.
  %cvt = fptosi <2 x half> %a to <2 x i32>
  ; Mask indices 0-1 select the two lanes of %cvt; indices 2-3 select the lanes
  ; of the zeroinitializer operand, so the upper half of the result is zero.
  %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %ext
}

llc -mcpu=x86-64-v3

# Current codegen: each of the two half elements is shuffled into the lowest
# lane of its own register and converted with a separate vcvtph2ps, then the
# two float results are interleaved back together before the integer convert.
fptosi_2f16_to_4i32:                    # @fptosi_2f16_to_4i32
    # Move the second half element (bytes 2-3) down to the lowest word of xmm1.
    vpshufb .LCPI0_0(%rip), %xmm0, %xmm1    # xmm1 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
    # First half -> float conversion (for the second element).
    vcvtph2ps   %xmm1, %xmm1
    # Zero-extend words to qwords, leaving the first half element alone in the low qword.
    vpmovzxwq   %xmm0, %xmm0            # xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
    # Second half -> float conversion (for the first element); this is the redundant node.
    vcvtph2ps   %xmm0, %xmm0
    # Interleave the low floats so both converted elements sit in lanes 0 and 1.
    vunpcklps   %xmm1, %xmm0, %xmm0     # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    # Truncating float -> int32 conversion (implements the fptosi).
    vcvttps2dq  %xmm0, %xmm0
    # Keep the low 64 bits and zero the upper 64 bits (the zero-filled upper lanes).
    vmovq   %xmm0, %xmm0                    # xmm0 = xmm0[0],zero
    retq

Latest trunk now gives the above assembly. Ideally we would have only a single vcvtph2ps node and avoid all the shuffles, which are just trying to move elements into the lowest element:

# Desired codegen: a single vcvtph2ps converts both half elements directly from
# the low lanes of xmm0, so no shuffles are required.
fptosi_2f16_to_4i32:                    # @fptosi_2f16_to_4i32
    vcvtph2ps   %xmm0, %xmm0
    # Truncating float -> int32 conversion (implements the fptosi).
    vcvttps2dq  %xmm0, %xmm0
    # Keep the low 64 bits (the two converted elements) and zero the upper 64 bits.
    vmovq   %xmm0, %xmm0                    # xmm0 = xmm0[0],zero
    retq
llvmbot commented 8 months ago

@llvm/issue-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

```ll
define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) {
  %cvt = fptosi <2 x half> %a to <2 x i32>
  %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %ext
}
```

llc -mcpu=x86-64-v3

```asm
fptosi_2f16_to_4i32:                    # @fptosi_2f16_to_4i32
    vpshufb .LCPI0_0(%rip), %xmm0, %xmm1    # xmm1 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
    vcvtph2ps   %xmm1, %xmm1
    vpmovzxwq   %xmm0, %xmm0            # xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
    vcvtph2ps   %xmm0, %xmm0
    vunpcklps   %xmm1, %xmm0, %xmm0     # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    vcvttps2dq  %xmm0, %xmm0
    vmovq   %xmm0, %xmm0                    # xmm0 = xmm0[0],zero
    retq
```

Latest trunk now gives the above assembly. Ideally we would have only a single vcvtph2ps node and avoid all the shuffles, which are just trying to move elements into the lowest element:

```asm
fptosi_2f16_to_4i32:                    # @fptosi_2f16_to_4i32
    vcvtph2ps   %xmm0, %xmm0
    vcvttps2dq  %xmm0, %xmm0
    vmovq   %xmm0, %xmm0                    # xmm0 = xmm0[0],zero
    retq
```