llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.
http://llvm.org

[x86-64 BMI1] AND-NOTs should be preferred to OR-NOTs because we have `andn`/`vpandn` instructions #108731

Open Validark opened 3 weeks ago

Validark commented 3 weeks ago

Zig code:

export fn foo(w: u64, x: u64, y: u64, z: u64) u64 {
    var s = w;
    s &= ~(x & y);
    s &= ~(z & ~y);
    return s;
}

export fn fooVec(w: @Vector(16, u8), x: @Vector(16, u8), y: @Vector(16, u8), z: @Vector(16, u8)) @Vector(16, u8) {
    var s = w;
    s &= ~(x & y);
    s &= ~(z & ~y);
    return s;
}

Compiled for Znver3:

foo:
        not     rcx
        and     rsi, rdx
        andn    rax, rsi, rdi
        or      rcx, rdx
        and     rax, rcx
        ret

fooVec:
        vpand   xmm1, xmm2, xmm1
        vpcmpeqd        xmm4, xmm4, xmm4
        vpandn  xmm0, xmm1, xmm0
        vpxor   xmm1, xmm3, xmm4
        vpor    xmm1, xmm1, xmm2
        vpand   xmm0, xmm0, xmm1
        ret

I expected `and`+`andn`+`andn`+`andn` for the scalar version and `vpand`+`vpandn`+`vpandn`+`vpandn` for the vector version. Instead, LLVM rewrites `s &= ~(z & ~y);` as `s &= ~z | y;`. That canonicalization makes sense on machines without `andn` (and is harmless on machines that have both `andn` and `orn`, or just `vpternlogq`), but x86-64 with BMI1 only gives us `andn`; there is no `orn`, so here the rewrite costs us extra instructions.
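For reference, here is a sketch of the `andn`-only lowering described above; the exact register allocation is illustrative, not actual compiler output. Recall that `andn dst, src1, src2` computes `dst = ~src1 & src2`, and `vpandn` likewise negates its first source.

foo:                            ; SysV ABI: w=rdi, x=rsi, y=rdx, z=rcx
        and     rsi, rdx        ; rsi = x & y
        andn    rax, rsi, rdi   ; rax = ~(x & y) & w
        andn    rcx, rdx, rcx   ; rcx = ~y & z
        andn    rax, rcx, rax   ; rax = ~(~y & z) & rax
        ret

fooVec:                         ; w=xmm0, x=xmm1, y=xmm2, z=xmm3
        vpand   xmm1, xmm1, xmm2        ; xmm1 = x & y
        vpandn  xmm0, xmm1, xmm0        ; xmm0 = ~(x & y) & w
        vpandn  xmm1, xmm2, xmm3        ; xmm1 = ~y & z
        vpandn  xmm0, xmm1, xmm0        ; xmm0 = ~(~y & z) & xmm0
        ret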

LLVM (optimized):

define dso_local i64 @foo(i64 %0, i64 %1, i64 %2, i64 %3) local_unnamed_addr {
Entry:
  %4 = and i64 %2, %1
  %5 = xor i64 %4, -1
  %6 = and i64 %5, %0
  %.not = xor i64 %3, -1
  %7 = or i64 %.not, %2
  %8 = and i64 %6, %7
  ret i64 %8
}

declare void @llvm.dbg.value(metadata, metadata, metadata) #1

define dso_local <16 x i8> @fooVec(<16 x i8> %0, <16 x i8> %1, <16 x i8> %2, <16 x i8> %3) local_unnamed_addr {
Entry:
  %4 = and <16 x i8> %2, %1
  %5 = xor <16 x i8> %4, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %6 = and <16 x i8> %5, %0
  %.not = xor <16 x i8> %3, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %7 = or <16 x i8> %.not, %2
  %8 = and <16 x i8> %6, %7
  ret <16 x i8> %8
}

LLVM Godbolt link: https://llvm.godbolt.org/z/ETEdK3fGM

llvmbot commented 3 weeks ago

@llvm/issue-subscribers-backend-x86

Author: Niles Salter (Validark)

RKSimon commented 3 weeks ago

I've tagged this as x86, but hopefully we can handle this generically.

Saldivarcher commented 2 weeks ago

@RKSimon just noticed you self-assigned this, but I took a stab at it last night. The PR is #109215.