llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.
http://llvm.org
Other
28.41k stars 11.74k forks source link

[PowerPC] missing VSX FMA Mutation optimize in some case for option -schedule-ppc-vsx-fma-mutation-early #111906

Open diggerlin opened 3 hours ago

diggerlin commented 3 hours ago

Description:

bash> cat test.c

   extern "C" {
#include "altivec.h"

void vsexp (float* __restrict __output_a,float* __restrict var1321In_a,int* __restrict n)
{
  int j;
# pragma nounroll
  for(j=j*4;j<*n;j++) {
    vector float var1321;
    vector unsigned char var1326;
    vector unsigned char var1323=(vector unsigned char) {63,184,170,59,63,184,170,59,63,184,170,59,63,184,170,59};
    vector unsigned char var1325=(vector unsigned char) {69,195,244,0,69,195,244,0,69,195,244,0,69,195,244,0};
    var1321=vec_xl(0,var1321In_a+4*(j));
    var1326=(vector unsigned char) vec_madd((vector float)(var1321), (vector float)(var1323), (vector float)(var1325));
     vec_xst((vector float)var1326,0,__output_a+4*(j));
}
  return;
}
} //extern "C"

When compiled with -mllvm -disable-ppc-vsx-fma-mutation=false -mllvm -schedule-ppc-vsx-fma-mutation-early

it generates the following asm (the loop has 6 instructions):

 .vsexp:
# %bb.0:                                # %entry
        lwz r5, 0(r5)
        cmpwi   r5, 1
        bltlr   cr0
# %bb.1:                                # %for.body.preheader
        xxspltiw vs0, 1069066811
        mtctr r5
        li r5, 0
        .align  5
L..BB0_2:                               # %for.body
                                        # =>This Inner Loop Header: Depth=1
        lxvx vs1, r4, r5
        xxspltiw vs2, 1170469888
        xvmaddasp vs2, vs1, vs0
        stxvx vs2, r3, r5
        addi r5, r5, 16
        bdnz L..BB0_2
# %bb.3:                                # %for.end
        blr

Obviously, more efficient code is possible: move `xxspltiw vs2, 1170469888` out of the loop and change the `xvmaddasp` to `xvmaddmsp`. The resulting asm follows (the loop has only 5 instructions):

.vsexp:
# %bb.0:                                # %entry
        lwz r5, 0(r5)
        cmpwi   r5, 1
        bltlr   cr0
# %bb.1:                                # %for.body.preheader
        xxspltiw vs0, 1069066811
        xxspltiw vs1, 1170469888
        mtctr r5
        li r5, 0
        .align  5
L..BB0_2:                               # %for.body
                                        # =>This Inner Loop Header: Depth=1
        lxvx vs2, r4, r5
        xvmaddmsp vs2, vs0, vs1
        stxvx vs2, r3, r5
        addi r5, r5, 16
        bdnz L..BB0_2
# %bb.3:                                # %for.end
        blr
llvmbot commented 3 hours ago

@llvm/issue-subscribers-backend-powerpc

Author: zhijian lin (diggerlin)

Description: bash> cat test.c ``` extern "C" { #include "altivec.h" void vsexp (float* __restrict __output_a,float* __restrict var1321In_a,int* __restrict n) { int j; # pragma nounroll for(j=j*4;j<*n;j++) { vector float var1321; vector unsigned char var1326; vector unsigned char var1323=(vector unsigned char) {63,184,170,59,63,184,170,59,63,184,170,59,63,184,170,59}; vector unsigned char var1325=(vector unsigned char) {69,195,244,0,69,195,244,0,69,195,244,0,69,195,244,0}; var1321=vec_xl(0,var1321In_a+4*(j)); var1326=(vector unsigned char) vec_madd((vector float)(var1321), (vector float)(var1323), (vector float)(var1325)); vec_xst((vector float)var1326,0,__output_a+4*(j)); } return; } } //extern "C" ``` when compile with `-mllvm -disable-ppc-vsx-fma-mutation=false -mllvm -schedule-ppc-vsx-fma-mutation-early` it generate the asm as (the loop has 6 instructions) ``` .vsexp: # %bb.0: # %entry lwz r5, 0(r5) cmpwi r5, 1 bltlr cr0 # %bb.1: # %for.body.preheader xxspltiw vs0, 1069066811 mtctr r5 li r5, 0 .align 5 L..BB0_2: # %for.body # =>This Inner Loop Header: Depth=1 lxvx vs1, r4, r5 xxspltiw vs2, 1170469888 xvmaddasp vs2, vs1, vs0 stxvx vs2, r3, r5 addi r5, r5, 16 bdnz L..BB0_2 # %bb.3: # %for.end blr ``` obviously , there is more efficient code as which pull ` xxspltiw vs2, 1170469888` put the loop and change the `xvmaddasp ` to `xvmaddmsp' , the asm code as (the loop only has 5 instructions) ``` .vsexp: # %bb.0: # %entry lwz r5, 0(r5) cmpwi r5, 1 bltlr cr0 # %bb.1: # %for.body.preheader xxspltiw vs0, 1069066811 xxspltiw vs1, 1170469888 mtctr r5 li r5, 0 .align 5 L..BB0_2: # %for.body # =>This Inner Loop Header: Depth=1 lxvx vs2, r4, r5 xvmaddmsp vs2, vs0, vs1 stxvx vs2, r3, r5 addi r5, r5, 16 bdnz L..BB0_2 # %bb.3: # %for.end blr ```
diggerlin commented 3 hours ago

The PowerPC VSX FMA Mutation pass converts a COPY adjacent to an XSMADDADP instruction into a single XSMADDMDP instruction, as described in the comment at https://github.com/llvm/llvm-project/blob/main/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp#L90

        //   %5 = COPY %9; VSLRC:%5,%9
        //   %5<def,tied1> = XSMADDADP %5<tied0>, %17, %16,
        //                         implicit %rm; VSLRC:%5,%17,%16
        //   ...
        //   %9<def,tied1> = XSMADDADP %9<tied0>, %17, %19,
        //                         implicit %rm; VSLRC:%9,%17,%19
        //   ...
        // Where we can eliminate the copy by changing from the A-type to the
        // M-type instruction. Specifically, for this example, this means:
        //   %5<def,tied1> = XSMADDADP %5<tied0>, %17, %16,
        //                         implicit %rm; VSLRC:%5,%17,%16
        // is replaced by:
        //   %16<def,tied1> = XSMADDMDP %16<tied0>, %18, %9,
        //                         implicit %rm; VSLRC:%16,%18,%9
        // and we remove: %5 = COPY %9; VSLRC:%5,%9

However, in this case the Register Coalescer pass, which eliminates COPY instructions, prevents the PowerPC VSX FMA Mutation pass from converting the COPY adjacent to the XSMADDADP into a single XSMADDMDP instruction.

diggerlin commented 3 hours ago

The Register Coalescer pass will convert following IR

bb.1.for.body.preheader:
; predecessors: %bb.0
  successors: %bb.2(0x80000000); %bb.2(100.00%)

  MTCTR8loop killed %0:g8rc, implicit-def dead $ctr8
  %8:g8rc = LI8 0
  %10:vsrc = XXSPLTIW 1069066811
  %11:vsrc = XXSPLTIW 1170469888       (The line will move into loop body bb.2.for.body: )
  %14:g8rc_and_g8rc_nox0 = COPY killed %8:g8rc

bb.2.for.body:
; predecessors: %bb.1, %bb.2
  successors: %bb.2(0x7c000000), %bb.3(0x04000000); %bb.2(96.88%), %bb.3(3.12%)

  %1:g8rc_and_g8rc_nox0 = COPY killed %14:g8rc_and_g8rc_nox0
  %9:vsrc = LXVX %4:g8rc_and_g8rc_nox0, %1:g8rc_and_g8rc_nox0 :: (load (s128) from %ir.scevgep1, align 1)
  %12:vsrc = COPY %11:vsrc      (The line will be replace with   %12:vsrc = XXSPLTIW 1170469888)
  %12:vsrc = contract nofpexcept XVMADDASP %12:vsrc(tied-def 0), killed %9:vsrc, %10:vsrc, implicit $rm
  STXVX killed %12:vsrc, %3:g8rc_and_g8rc_nox0, %1:g8rc_and_g8rc_nox0 :: (store (s128) into %ir.scevgep, align 1)
  %2:g8rc = nuw nsw ADDI8 killed %1:g8rc_and_g8rc_nox0, 16
  %14:g8rc_and_g8rc_nox0 = COPY killed %2:g8rc
  BDNZ8 %bb.2, implicit-def $ctr8, implicit $ctr8
  B %bb.3

to

 bb.1.for.body.preheader:
; predecessors: %bb.0
  successors: %bb.2(0x80000000); %bb.2(100.00%)

  MTCTR8loop %0:g8rc, implicit-def dead $ctr8
  %14:g8rc_and_g8rc_nox0 = LI8 0
  %10:vsrc = XXSPLTIW 1069066811

bb.2.for.body:
; predecessors: %bb.1, %bb.2
  successors: %bb.2(0x7c000000), %bb.3(0x04000000); %bb.2(96.88%), %bb.3(3.12%)

  %9:vsrc = LXVX %4:g8rc_and_g8rc_nox0, %14:g8rc_and_g8rc_nox0 :: (load (s128) from %ir.scevgep1, align 1)
  %12:vsrc = XXSPLTIW 1170469888      (The line is moved from  bb.1.for.body.preheader into the loop and COPY eliminated )
  %12:vsrc = contract nofpexcept XVMADDASP %12:vsrc(tied-def 0), %9:vsrc, %10:vsrc, implicit $rm
  STXVX %12:vsrc, %3:g8rc_and_g8rc_nox0, %14:g8rc_and_g8rc_nox0 :: (store (s128) into %ir.scevgep, align 1)
  %14:g8rc_and_g8rc_nox0 = nuw nsw ADDI8 %14:g8rc_and_g8rc_nox0, 16
  BDNZ8 %bb.2, implicit-def $ctr8, implicit $ctr8
  B %bb.3

This prevents the PowerPC VSX FMA Mutation pass (ppc-vsx-fma-mutate) on vsexp from

converting

  %12:vsrc = COPY %11:vsrc
  %12:vsrc = contract nofpexcept XVMADDASP %12:vsrc(tied-def 0), killed %9:vsrc, %10:vsrc, implicit $rm 

to

%9:vsrc = contract nofpexcept XVMADDMSP %9:vsrc(tied-def 0), %10:vsrc, %11:vsrc, implicit $rm

diggerlin commented 3 hours ago

Created a patch: [PowerPC] Update to run the VSX FMA Mutation pass before the Register Coalescer for -schedule-ppc-vsx-fma-mutation-early