llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.
http://llvm.org

[AVX] Adding to the higher element in a to-be-interleaved vector can be deferred #89858

Open Validark opened 5 months ago

Validark commented 5 months ago

These two functions are equivalent (on little-endian): (Godbolt link: https://zig.godbolt.org/z/EKPe1bY35)

const std = @import("std");

const VEC_SIZE = 8;

export fn foo(byte_idx: @Vector(VEC_SIZE, u8)) @Vector(VEC_SIZE * 2, u8) {
    const pairs: @Vector(VEC_SIZE, u16) = @bitCast(std.simd.interlace([_]@Vector(VEC_SIZE, u8){ byte_idx, byte_idx })); 
    return @bitCast(pairs + @as(@Vector(VEC_SIZE, u16), @splat(0x100)));
}

export fn bar(byte_idx: @Vector(VEC_SIZE, u8)) @Vector(VEC_SIZE * 2, u8) {
    return std.simd.interlace(.{ byte_idx, byte_idx + @as(@Vector(VEC_SIZE, u8), @splat(1)) });
}
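
A quick way to sanity-check the equivalence is a test like the following (a minimal sketch; it uses std.simd.iota so the +1 in bar cannot wrap, matching the add nuw in the IR):

test "foo and bar agree (little-endian)" {
    // iota yields {0, 1, ..., VEC_SIZE - 1}, so byte_idx + 1 never overflows.
    const v = std.simd.iota(u8, VEC_SIZE);
    try std.testing.expect(@reduce(.And, foo(v) == bar(v)));
}

The LLVM IR for the two functions:
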
define dso_local <32 x i8> @foo(<16 x i8> %0) local_unnamed_addr {
Entry:
  %1 = shufflevector <16 x i8> %0, <16 x i8> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
  %2 = bitcast <32 x i8> %1 to <16 x i16>
  %3 = add nuw <16 x i16> %2, <i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256>
  %4 = bitcast <16 x i16> %3 to <32 x i8>
  ret <32 x i8> %4
}

define dso_local <32 x i8> @bar(<16 x i8> %0) local_unnamed_addr {
Entry:
  %1 = add nuw <16 x i8> %0, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %2 = shufflevector <16 x i8> %0, <16 x i8> %1, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  ret <32 x i8> %2
}
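
The rewrite is valid because 0x0100 has a zero low byte and the add is nuw: incrementing each u16 pair can only touch its high byte (the odd-indexed byte on little-endian), so the add can be hoisted ahead of the interleave, which is exactly the shape bar already has. A sketch of the fold in Zig terms, generalized to an arbitrary high-byte increment k (hypothetical helper, for illustration only):

fn interlacePlusHigh(comptime n: usize, a: @Vector(n, u8), k: u8) @Vector(2 * n, u8) {
    // Same result as bitcasting interlace(.{ a, a }) to u16s and adding
    // @as(u16, k) << 8 to each lane, provided a[i] + k cannot wrap (the IR's nuw).
    return std.simd.interlace(.{ a, a + @as(@Vector(n, u8), @splat(k)) });
}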

However, they compile differently:

.LCPI0_0:
        .short  256
        .short  256
        .short  256
        .short  256
        .short  256
        .short  256
        .short  256
        .short  256
foo:
        vpunpcklbw      xmm0, xmm0, xmm0
        vpaddw  xmm0, xmm0, xmmword ptr [rip + .LCPI0_0]
        ret

bar:
        vpcmpeqd        xmm1, xmm1, xmm1
        vpsubb  xmm1, xmm0, xmm1
        vpunpcklbw      xmm0, xmm0, xmm1
        ret

This becomes especially problematic if we increase VEC_SIZE to 16:

.LCPI0_0:
        .short  256
        .short  256
        .short  256
        .short  256
        .short  256
        .short  256
        .short  256
        .short  256
        .short  256
        .short  256
        .short  256
        .short  256
        .short  256
        .short  256
        .short  256
        .short  256
foo:
        vpermq  ymm0, ymm0, 216
        vpunpcklbw      ymm0, ymm0, ymm0
        vpaddw  ymm0, ymm0, ymmword ptr [rip + .LCPI0_0]
        ret

bar:
        vpcmpeqd        xmm1, xmm1, xmm1
        vpsubb  xmm1, xmm0, xmm1
        vpunpckhbw      xmm2, xmm0, xmm1
        vpunpcklbw      xmm0, xmm0, xmm1
        vinserti128     ymm0, ymm0, xmm2, 1
        ret

These can have different performance characteristics depending on the machine. On Zen 2, vpermq has a latency of 6 cycles, whereas the other instructions here all have a latency of 1. There is also the vpaddw with a memory operand, which I presume is slower than staying in registers, assuming the compiler is right to prefer materializing all-ones via a self-compare vpcmpeqd (and then subtracting) over loading that constant with a memory operand for the same purpose.
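
For what it's worth, the per-CPU tradeoff can be compared with llvm-mca (sketch invocation; znver2 matches the Zen 2 numbers above, and foo.s/bar.s stand for the two listings saved locally):

llvm-mca -mtriple=x86_64 -mcpu=znver2 foo.s
llvm-mca -mtriple=x86_64 -mcpu=znver2 bar.s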

Validark commented 1 week ago

@EugeneZelenko Could you please tag this as an x86 issue?

llvmbot commented 1 week ago

@llvm/issue-subscribers-backend-x86

Author: Niles Salter (Validark)
