dotnet / runtime

.NET is a cross-platform runtime for cloud, mobile, desktop, and IoT apps.
https://docs.microsoft.com/dotnet/core/
MIT License
14.67k stars 4.58k forks source link

[API Proposal]: AVX-512 `VPOPCNTDQ` and `BITALG` Intrinsics #96162

Open MineCake147E opened 7 months ago

MineCake147E commented 7 months ago

Background and motivation

Both VPOPCNTDQ and BITALG are supported by Intel in the Ice Lake and newer architectures, and by AMD in Zen 4. VPOPCNTDQ allows for parallel popcnt in either Vector128, Vector256, or Vector512 for ulong and uint. BITALG expands parallel popcnt to ushort and byte, and it also adds the VPSHUFBITQMB instruction, which performs a bit gather select. VPOPCNTW is highly beneficial for my current project, allowing me to count filled blocks row-by-row for a bit-board of block games.

API Proposal

namespace System.Runtime.Intrinsics.X86;

/// <summary>
/// Proposed API surface for the AVX-512 VPOPCNTDQ instructions: parallel <c>popcnt</c>
/// over 32-bit and 64-bit elements (int, uint, long and ulong overloads).
/// </summary>
/// <remarks>
/// This is a declaration-only API sketch; members intentionally have no bodies.
/// </remarks>
[Intrinsic]
[CLSCompliant(false)]
public abstract class Avx512VPopcntDQ : Avx512DQ
{
    // Hides the IsSupported inherited from Avx512DQ (hence `new`).
    // Presumably reports availability of this instruction set, as on other intrinsic classes.
    public static new bool IsSupported { get; }

    /// <summary>
    /// Nested X64 class; this proposal declares only <c>IsSupported</c> on it.
    /// </summary>
    [Intrinsic]
    public new abstract class X64 : Avx512DQ.X64
    {
        // Non-public constructor on an abstract class: not constructible by external callers.
        internal X64() { }
        public static new bool IsSupported { get; }
    }

    // Vector512 overloads, one per 32-bit / 64-bit integer element type.
    // Each returns a vector of the same element type as its input.

    public static Vector512<int> PopCount(Vector512<int> value);

    public static Vector512<uint> PopCount(Vector512<uint> value);

    public static Vector512<long> PopCount(Vector512<long> value);

    public static Vector512<ulong> PopCount(Vector512<ulong> value);

    /// <summary>
    /// Nested class carrying the Vector256 and Vector128 overloads of <c>PopCount</c>.
    /// </summary>
    // NOTE(review): unlike X64, this nested class is declared without `new` or [Intrinsic]
    // even though it derives from Avx512DQ.VL — confirm whether that is intentional.
    public abstract class VL : Avx512DQ.VL
    {
        public static new bool IsSupported { get; }

        // Vector256 overloads.

        public static Vector256<int> PopCount(Vector256<int> value);

        public static Vector256<uint> PopCount(Vector256<uint> value);

        public static Vector256<long> PopCount(Vector256<long> value);

        public static Vector256<ulong> PopCount(Vector256<ulong> value);

        // Vector128 overloads.

        public static Vector128<int> PopCount(Vector128<int> value);

        public static Vector128<uint> PopCount(Vector128<uint> value);

        public static Vector128<long> PopCount(Vector128<long> value);

        public static Vector128<ulong> PopCount(Vector128<ulong> value);
    }
}

/// <summary>
/// Proposed API surface for the AVX-512 BITALG instructions: parallel <c>popcnt</c> over
/// 16-bit and 8-bit elements, plus the VPSHUFBITQMB bit gather select exposed as
/// <c>ShuffleBits</c> / <c>MaskShuffleBits</c>.
/// </summary>
/// <remarks>
/// This is a declaration-only API sketch; members intentionally have no bodies.
/// </remarks>
[Intrinsic]
[CLSCompliant(false)]
public abstract class Avx512BitAlg : Avx512BW
{
    // Hides the IsSupported inherited from Avx512BW (hence `new`).
    // Presumably reports availability of this instruction set, as on other intrinsic classes.
    public static new bool IsSupported { get; }

    /// <summary>
    /// Nested X64 class; this proposal declares only <c>IsSupported</c> on it.
    /// </summary>
    [Intrinsic]
    public new abstract class X64 : Avx512BW.X64
    {
        // Non-public constructor on an abstract class: not constructible by external callers.
        internal X64() { }
        public static new bool IsSupported { get; }
    }

    // Vector512 PopCount overloads, one per 16-bit / 8-bit integer element type.
    // Each returns a vector of the same element type as its input.

    public static Vector512<short> PopCount(Vector512<short> value);

    public static Vector512<ushort> PopCount(Vector512<ushort> value);

    public static Vector512<byte> PopCount(Vector512<byte> value);

    public static Vector512<sbyte> PopCount(Vector512<sbyte> value);

    // Bit gather select (VPSHUFBITQMB): `value` has 64-bit elements, `control` has 8-bit
    // elements, and the result is declared as a byte/sbyte vector rather than a scalar mask.

    public static Vector512<byte> ShuffleBits(Vector512<ulong> value, Vector512<byte> control);

    public static Vector512<sbyte> ShuffleBits(Vector512<long> value, Vector512<sbyte> control);

    // Same operation with an additional `mask` operand of the result's vector type.
    // Presumably the masked form of the instruction — TODO confirm exact semantics.

    public static Vector512<byte> MaskShuffleBits(Vector512<byte> mask, Vector512<ulong> value, Vector512<byte> control);

    public static Vector512<sbyte> MaskShuffleBits(Vector512<sbyte> mask, Vector512<long> value, Vector512<sbyte> control);

    /// <summary>
    /// Nested class carrying the Vector256 and Vector128 overloads of the members above.
    /// </summary>
    // NOTE(review): unlike X64, this nested class is declared without `new` or [Intrinsic]
    // even though it derives from Avx512BW.VL — confirm whether that is intentional.
    public abstract class VL : Avx512BW.VL
    {
        public static new bool IsSupported { get; }

        // Vector256 PopCount overloads.

        public static Vector256<short> PopCount(Vector256<short> value);

        public static Vector256<ushort> PopCount(Vector256<ushort> value);

        public static Vector256<byte> PopCount(Vector256<byte> value);

        public static Vector256<sbyte> PopCount(Vector256<sbyte> value);

        // Vector128 PopCount overloads.

        public static Vector128<short> PopCount(Vector128<short> value);

        public static Vector128<ushort> PopCount(Vector128<ushort> value);

        public static Vector128<byte> PopCount(Vector128<byte> value);

        public static Vector128<sbyte> PopCount(Vector128<sbyte> value);

        // Vector256 bit-shuffle overloads (plain and masked).

        public static Vector256<byte> ShuffleBits(Vector256<ulong> value, Vector256<byte> control);

        public static Vector256<sbyte> ShuffleBits(Vector256<long> value, Vector256<sbyte> control);

        public static Vector256<byte> MaskShuffleBits(Vector256<byte> mask, Vector256<ulong> value, Vector256<byte> control);

        public static Vector256<sbyte> MaskShuffleBits(Vector256<sbyte> mask, Vector256<long> value, Vector256<sbyte> control);

        // Vector128 bit-shuffle overloads (plain and masked).

        public static Vector128<byte> ShuffleBits(Vector128<ulong> value, Vector128<byte> control);

        public static Vector128<sbyte> ShuffleBits(Vector128<long> value, Vector128<sbyte> control);

        public static Vector128<byte> MaskShuffleBits(Vector128<byte> mask, Vector128<ulong> value, Vector128<byte> control);

        public static Vector128<sbyte> MaskShuffleBits(Vector128<sbyte> mask, Vector128<long> value, Vector128<sbyte> control);

    }
}

API Usage

// Per-element population count over the whole vector in one call.
// `board` is not declared in this snippet — presumably a Vector512 whose 16-bit (or 8-bit)
// elements are the rows of the bit-board, so each result element is that row's filled-block
// count (TODO confirm element type).
var blocksPerRows = Avx512BitAlg.PopCount(board);

Alternative Designs

Risks

None

ghost commented 7 months ago

Tagging subscribers to this area: @dotnet/area-system-runtime-intrinsics See info in area-owners.md if you want to be subscribed.

Issue Details
### Background and motivation Both `VPOPCNTDQ` and `BITALG` are supported by Intel in the Ice Lake and newer architectures, and by AMD in Zen 4. `VPOPCNTDQ` allows for parallel `popcnt` in either Vector128, Vector256, or Vector512 for `ulong` and `uint`. `BITALG` expands parallel `popcnt` for `ushort` and `byte`, and it also adds [`VPSHUFBITQMB`](https://www.felixcloutier.com/x86/vpshufbitqmb) instruction, which performs a bit gather select. `VPOPCNTW` is highly beneficial for my current project, allowing me for counting filled blocks row-by-row for a bit-board of block games. ### API Proposal ```csharp namespace System.Runtime.Intrinsics.X86; [Intrinsic] [CLSCompliant(false)] public abstract class Avx512Vpopcntdq : Avx512DQ { public static new bool IsSupported { get; } [Intrinsic] public new abstract class X64 : Avx512DQ.X64 { internal X64() { } public static new bool IsSupported { get; } } public static Vector512 PopCount(Vector512 value); public static Vector512 PopCount(Vector512 value); public static Vector512 PopCount(Vector512 value); public static Vector512 PopCount(Vector512 value); public abstract class VL : Avx512DQ.VL { public static new bool IsSupported { get; } public static Vector256 PopCount(Vector256 value); public static Vector256 PopCount(Vector256 value); public static Vector256 PopCount(Vector256 value); public static Vector256 PopCount(Vector256 value); public static Vector128 PopCount(Vector128 value); public static Vector128 PopCount(Vector128 value); public static Vector128 PopCount(Vector128 value); public static Vector128 PopCount(Vector128 value); } } [Intrinsic] [CLSCompliant(false)] public abstract class Avx512BitAlg : Avx512BW { public static new bool IsSupported { get; } [Intrinsic] public new abstract class X64 : Avx512BW.X64 { internal X64() { } public static new bool IsSupported { get; } } public static Vector512 PopCount(Vector512 value); public static Vector512 PopCount(Vector512 value); public static Vector512 
PopCount(Vector512 value); public static Vector512 PopCount(Vector512 value); public static Vector512 BitShuffle(Vector512 value, Vector512 control); public static Vector512 BitShuffle(Vector512 value, Vector512 control); public static Vector512 MaskBitShuffle(Vector512 mask, Vector512 value, Vector512 control); public static Vector512 MaskBitShuffle(Vector512 mask, Vector512 value, Vector512 control); public abstract class VL : Avx512BW.VL { public static new bool IsSupported { get; } public static Vector256 PopCount(Vector256 value); public static Vector256 PopCount(Vector256 value); public static Vector256 PopCount(Vector256 value); public static Vector256 PopCount(Vector256 value); public static Vector128 PopCount(Vector128 value); public static Vector128 PopCount(Vector128 value); public static Vector128 PopCount(Vector128 value); public static Vector128 PopCount(Vector128 value); public static Vector256 BitShuffle(Vector256 value, Vector256 control); public static Vector256 BitShuffle(Vector256 value, Vector256 control); public static Vector256 MaskBitShuffle(Vector256 mask, Vector256 value, Vector256 control); public static Vector256 MaskBitShuffle(Vector256 mask, Vector256 value, Vector256 control); public static Vector128 BitShuffle(Vector128 value, Vector128 control); public static Vector128 BitShuffle(Vector128 value, Vector128 control); public static Vector128 MaskBitShuffle(Vector128 mask, Vector128 value, Vector128 control); public static Vector128 MaskBitShuffle(Vector128 mask, Vector128 value, Vector128 control); } } ``` ### API Usage ```csharp var blocksPerRows = Avx512BitAlg.PopCount(board); ``` ### Alternative Designs - `Avx512Vpopcntdq` could have a different name. - `MaskBitShuffle` could have a different name and/or parameter/return types (e.g. `mask` and return type could be `ulong` instead of `Vector512`). ### Risks None
Author: MineCake147E
Assignees: -
Labels: `api-suggestion`, `area-System.Runtime.Intrinsics`
Milestone: -
tannergooding commented 7 months ago

ShuffleBits might be a better name than BitShuffle, as it better matches the instruction name VPSHUFBITQMB. Notably, _mm_bitshuffle_epi64_mask is the name of the C API, however.

Returning a vector is consistent with how the other APIs that "return masks" work and should mesh with the existing pattern recognition allowing it to be consumed directly as a mask.

terrajobst commented 5 months ago

Video

namespace System.Runtime.Intrinsics.X86;

/// <summary>
/// API shape posted after review for the AVX-512 VPOPCNTDQ instructions: parallel
/// <c>popcnt</c> over 32-bit and 64-bit elements (int, uint, long and ulong overloads).
/// </summary>
/// <remarks>
/// This is a declaration-only API listing; members intentionally have no bodies.
/// </remarks>
[Intrinsic]
[CLSCompliant(false)]
public abstract class Avx512VPopcntDQ : Avx512DQ
{
    // Hides the IsSupported inherited from Avx512DQ (hence `new`).
    public static new bool IsSupported { get; }

    /// <summary>
    /// Nested X64 class; only <c>IsSupported</c> is declared on it in this listing.
    /// </summary>
    [Intrinsic]
    public new abstract class X64 : Avx512DQ.X64
    {
        public static new bool IsSupported { get; }
    }

    // Vector512 overloads; each returns a vector of the same element type as its input.
    public static Vector512<int> PopCount(Vector512<int> value);
    public static Vector512<uint> PopCount(Vector512<uint> value);
    public static Vector512<long> PopCount(Vector512<long> value);
    public static Vector512<ulong> PopCount(Vector512<ulong> value);

    /// <summary>
    /// Nested class carrying the Vector256 and Vector128 overloads of <c>PopCount</c>.
    /// </summary>
    // NOTE(review): declared without `new` although it derives from Avx512DQ.VL, whereas X64
    // uses `new` — confirm whether that is intentional.
    public abstract class VL : Avx512DQ.VL
    {
        public static new bool IsSupported { get; }

        // Vector256 overloads first, then Vector128.
        public static Vector256<int> PopCount(Vector256<int> value);
        public static Vector256<uint> PopCount(Vector256<uint> value);
        public static Vector256<long> PopCount(Vector256<long> value);
        public static Vector256<ulong> PopCount(Vector256<ulong> value);
        public static Vector128<int> PopCount(Vector128<int> value);
        public static Vector128<uint> PopCount(Vector128<uint> value);
        public static Vector128<long> PopCount(Vector128<long> value);
        public static Vector128<ulong> PopCount(Vector128<ulong> value);
    }
}

/// <summary>
/// API shape posted after review for the AVX-512 BITALG instructions: parallel <c>popcnt</c>
/// over 16-bit and 8-bit elements, plus the VPSHUFBITQMB bit gather select exposed as
/// <c>ShuffleBits</c> / <c>MaskShuffleBits</c>.
/// </summary>
/// <remarks>
/// This is a declaration-only API listing; members intentionally have no bodies.
/// </remarks>
[Intrinsic]
[CLSCompliant(false)]
public abstract class Avx512BitAlg : Avx512BW
{
    // Hides the IsSupported inherited from Avx512BW (hence `new`).
    public static new bool IsSupported { get; }

    /// <summary>
    /// Nested X64 class; only <c>IsSupported</c> is declared on it in this listing.
    /// </summary>
    [Intrinsic]
    public new abstract class X64 : Avx512BW.X64
    {
        public static new bool IsSupported { get; }
    }

    // Vector512 PopCount overloads; each returns a vector of the same element type as its input.
    public static Vector512<short> PopCount(Vector512<short> value);
    public static Vector512<ushort> PopCount(Vector512<ushort> value);
    public static Vector512<byte> PopCount(Vector512<byte> value);
    public static Vector512<sbyte> PopCount(Vector512<sbyte> value);
    // Bit gather select (VPSHUFBITQMB): 64-bit-element `value`, 8-bit-element `control`;
    // the result is declared as a byte/sbyte vector rather than a scalar mask.
    public static Vector512<byte> ShuffleBits(Vector512<ulong> value, Vector512<byte> control);
    public static Vector512<sbyte> ShuffleBits(Vector512<long> value, Vector512<sbyte> control);
    // Variant taking an extra `mask` operand of the result's vector type.
    // Presumably the masked form of the instruction — TODO confirm exact semantics.
    public static Vector512<byte> MaskShuffleBits(Vector512<byte> mask, Vector512<ulong> value, Vector512<byte> control);
    public static Vector512<sbyte> MaskShuffleBits(Vector512<sbyte> mask, Vector512<long> value, Vector512<sbyte> control);

    /// <summary>
    /// Nested class carrying the Vector256 and Vector128 overloads of the members above.
    /// </summary>
    // NOTE(review): declared without `new` although it derives from Avx512BW.VL, whereas X64
    // uses `new` — confirm whether that is intentional.
    public abstract class VL : Avx512BW.VL
    {
        public static new bool IsSupported { get; }

        // PopCount: Vector256 overloads first, then Vector128.
        public static Vector256<short> PopCount(Vector256<short> value);
        public static Vector256<ushort> PopCount(Vector256<ushort> value);
        public static Vector256<byte> PopCount(Vector256<byte> value);
        public static Vector256<sbyte> PopCount(Vector256<sbyte> value);
        public static Vector128<short> PopCount(Vector128<short> value);
        public static Vector128<ushort> PopCount(Vector128<ushort> value);
        public static Vector128<byte> PopCount(Vector128<byte> value);
        public static Vector128<sbyte> PopCount(Vector128<sbyte> value);       
        // Bit shuffle (plain and masked): Vector256 overloads first, then Vector128.
        public static Vector256<byte> ShuffleBits(Vector256<ulong> value, Vector256<byte> control);
        public static Vector256<sbyte> ShuffleBits(Vector256<long> value, Vector256<sbyte> control);
        public static Vector256<byte> MaskShuffleBits(Vector256<byte> mask, Vector256<ulong> value, Vector256<byte> control);
        public static Vector256<sbyte> MaskShuffleBits(Vector256<sbyte> mask, Vector256<long> value, Vector256<sbyte> control);
        public static Vector128<byte> ShuffleBits(Vector128<ulong> value, Vector128<byte> control);
        public static Vector128<sbyte> ShuffleBits(Vector128<long> value, Vector128<sbyte> control);
        public static Vector128<byte> MaskShuffleBits(Vector128<byte> mask, Vector128<ulong> value, Vector128<byte> control);
        public static Vector128<sbyte> MaskShuffleBits(Vector128<sbyte> mask, Vector128<long> value, Vector128<sbyte> control);
    }
}
Mrnikbobjeff commented 3 months ago

This would be wonderful for our internal algorithm library as well. We need to compute hamming distances on larger number sets, and using the popcount x64 intrinsic is a limiting factor in our implementation.