[PPC PWR10] Bit extraction opts to use expensive vector code

llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.

Other

28.82k stars 11.91k forks source link

This Zig code:

export fn extract_bits1(a: u64) u64 {
    return ((a >> 2) & 0b1) | ((a >> 5) & 0b110);
}

compiles to this LLVM IR:

define dso_local i64 @extract_bits1(i64 %0) #0 {
1:
  %2 = zext i6 2 to i64
  %3 = lshr i64 %0, %2
  %4 = and i64 %3, 1
  %5 = zext i6 5 to i64
  %6 = lshr i64 %0, %5
  %7 = and i64 %6, 6
  %8 = or i64 %4, %7
  ret i64 %8
}

With -mcpu=pwr10, that IR gets "optimized" into this vectorized LLVM IR:

define dso_local i64 @extract_bits1(i64 %0) local_unnamed_addr {
Entry:
  %1 = insertelement <2 x i64> poison, i64 %0, i64 0
  %2 = shufflevector <2 x i64> %1, <2 x i64> poison, <2 x i32> zeroinitializer
  %3 = lshr <2 x i64> %2, <i64 2, i64 5>
  %4 = and <2 x i64> %3, <i64 1, i64 6>
  %shift = shufflevector <2 x i64> %4, <2 x i64> poison, <2 x i32> <i32 1, i32 poison>
  %5 = or <2 x i64> %4, %shift
  %6 = extractelement <2 x i64> %5, i64 0
  ret i64 %6
}

For pwr9, and for other architecture/CPU combinations, this vectorization does not happen.

The vectorized pwr10 IR results in this assembly:

.LCPI0_0:
        .long   0
        .long   2
        .long   0
        .long   5
.LCPI0_1:
        .long   0
        .long   1
        .long   0
        .long   6
extract_bits1:
        stwu 1, -64(1)
        stw 4, 32(1)
        stw 3, 16(1)
        xxsplti32dx 34, 0, 66051
        li 3, .LCPI0_0@l
        lis 4, .LCPI0_0@ha
        lxv 35, 32(1)
        lxv 36, 16(1)
        xxsplti32dx 34, 1, 269554195
        vperm 2, 4, 3, 2
        lxvx 35, 4, 3
        li 3, .LCPI0_1@l
        lis 4, .LCPI0_1@ha
        lxvx 0, 4, 3
        vsrd 2, 2, 3
        xxland 1, 34, 0
        xxswapd 35, 1
        xxeval 0, 35, 34, 0, 31
        stxv 0, 48(1)
        lwz 3, 48(1)
        lwz 4, 52(1)
        addi 1, 1, 64
        blr

Compare that to the pwr9 assembly, which at first glance seems a lot better:

extract_bits1:
        stwu 1, -16(1)
        rlwinm 5, 4, 30, 31, 31
        li 3, 0
        rlwimi 5, 4, 27, 29, 30
        mr      4, 5
        addi 1, 1, 16
        blr

Expected Behavior

I expect the optimized LLVM IR to be the same as it is for the pwr9 platform.

define dso_local i64 @extract_bits1(i64 %0) local_unnamed_addr {
Entry:
  %1 = lshr i64 %0, 2
  %2 = and i64 %1, 1
  %3 = lshr i64 %0, 5
  %4 = and i64 %3, 6
  %5 = or i64 %2, %4
  ret i64 %5
}

Originally https://github.com/ziglang/zig/issues/18381

@llvm/issue-subscribers-backend-powerpc

Author: Niles Salter (Validark)

[Godbolt](https://zig.godbolt.org/z/M4decT6sc)

This code:

```zig
export fn extract_bits1(a: u64) u64 {
    return ((a >> 2) & 0b1) | ((a >> 5) & 0b110);
}
```

This LLVM IR:

```llvm
define dso_local i64 @extract_bits1(i64 %0) #0 {
1:
  %2 = zext i6 2 to i64
  %3 = lshr i64 %0, %2
  %4 = and i64 %3, 1
  %5 = zext i6 5 to i64
  %6 = lshr i64 %0, %5
  %7 = and i64 %6, 6
  %8 = or i64 %4, %7
  ret i64 %8
}
```

Gets "optimized" to this LLVM IR when `-mcpu=pwr10`:

```llvm
define dso_local i64 @extract_bits1(i64 %0) local_unnamed_addr {
Entry:
  %1 = insertelement <2 x i64> poison, i64 %0, i64 0
  %2 = shufflevector <2 x i64> %1, <2 x i64> poison, <2 x i32> zeroinitializer
  %3 = lshr <2 x i64> %2, <i64 2, i64 5>
  %4 = and <2 x i64> %3, <i64 1, i64 6>
  %shift = shufflevector <2 x i64> %4, <2 x i64> poison, <2 x i32> <i32 1, i32 poison>
  %5 = or <2 x i64> %4, %shift
  %6 = extractelement <2 x i64> %5, i64 0
  ret i64 %6
}
```

For `pwr9`, and other architecture/cpu combinations, this does not happen.

This results in this assembly:

```asm
.LCPI0_0:
        .long   0
        .long   2
        .long   0
        .long   5
.LCPI0_1:
        .long   0
        .long   1
        .long   0
        .long   6
extract_bits1:
        stwu 1, -64(1)
        stw 4, 32(1)
        stw 3, 16(1)
        xxsplti32dx 34, 0, 66051
        li 3, .LCPI0_0@l
        lis 4, .LCPI0_0@ha
        lxv 35, 32(1)
        lxv 36, 16(1)
        xxsplti32dx 34, 1, 269554195
        vperm 2, 4, 3, 2
        lxvx 35, 4, 3
        li 3, .LCPI0_1@l
        lis 4, .LCPI0_1@ha
        lxvx 0, 4, 3
        vsrd 2, 2, 3
        xxland 1, 34, 0
        xxswapd 35, 1
        xxeval 0, 35, 34, 0, 31
        stxv 0, 48(1)
        lwz 3, 48(1)
        lwz 4, 52(1)
        addi 1, 1, 64
        blr
```

Compare that to the `pwr9` assembly, which at first glance seems a lot better:

```asm
extract_bits1:
        stwu 1, -16(1)
        rlwinm 5, 4, 30, 31, 31
        li 3, 0
        rlwimi 5, 4, 27, 29, 30
        mr 4, 5
        addi 1, 1, 16
        blr
```

### Expected Behavior

I expect the optimized LLVM IR to be the same as it is for the `pwr9` platform.

```llvm
define dso_local i64 @extract_bits1(i64 %0) local_unnamed_addr {
Entry:
  %1 = lshr i64 %0, 2
  %2 = and i64 %1, 1
  %3 = lshr i64 %0, 5
  %4 = and i64 %3, 6
  %5 = or i64 %2, %4
  ret i64 %5
}
```

Originally https://github.com/ziglang/zig/issues/18381

llvm / llvm-project

[PPC PWR10] Bit extraction opts to use expensive vector code #113352

Expected Behavior