dotnet / runtime

.NET is a cross-platform runtime for cloud, mobile, desktop, and IoT apps.
https://docs.microsoft.com/dotnet/core/
MIT License
14.95k stars 4.65k forks source link

[API Proposal]: VPCLMULQDQ Intrinsics #95772

Open saucecontrol opened 9 months ago

saucecontrol commented 9 months ago

Background and motivation

VPCLMULQDQ is supported by Intel in the Ice Lake and newer architectures, and by AMD in Zen 4. It allows for parallel pclmulqdq in Vector256 and Vector512 and is important for implementing vectorized CRC32 among other things.

API Proposal

namespace System.Runtime.Intrinsics.X86;

/// <summary>
/// This class provides access to Intel VPCLMULQDQ hardware instructions via intrinsics
/// </summary>
[Intrinsic]
[CLSCompliant(false)]
public abstract class Pclmulqdq256 : Pclmulqdq
{
    internal Pclmulqdq256() { }

    // This would depend on the VPCLMULQDQ CPUID bit for VEX encoding and VPCLMULQDQ + AVX512VL for EVEX
    // NOTE(review): the self-referential getter looks like the usual [Intrinsic] placeholder that the
    // JIT is expected to replace; it is not meant to execute as written -- confirm against the implementation.
    public static new bool IsSupported { get => IsSupported; }

    [Intrinsic]
    public new abstract class X64 : Pclmulqdq.X64
    {
        internal X64() { }

        public static new bool IsSupported { get => IsSupported; }
    }

    /// <summary>
    /// __m256i _mm256_clmulepi64_si128 (__m256i a, __m256i b, const int imm8)
    ///   VPCLMULQDQ ymm1, ymm2, ymm3/m256, imm8
    /// </summary>
    public static Vector256<long> CarrylessMultiply(Vector256<long> left, Vector256<long> right, [ConstantExpected] byte control) => CarrylessMultiply(left, right, control);

    /// <summary>
    /// __m256i _mm256_clmulepi64_si128 (__m256i a, __m256i b, const int imm8)
    ///   VPCLMULQDQ ymm1, ymm2, ymm3/m256, imm8
    /// </summary>
    public static Vector256<ulong> CarrylessMultiply(Vector256<ulong> left, Vector256<ulong> right, [ConstantExpected] byte control) => CarrylessMultiply(left, right, control);
}

/// <summary>
/// This class provides access to Intel AVX-512 VPCLMULQDQ hardware instructions via intrinsics
/// </summary>
[Intrinsic]
[CLSCompliant(false)]
public abstract class Pclmulqdq512 : Pclmulqdq
{
    internal Pclmulqdq512() { }

    // This would depend on the VPCLMULQDQ + AVX512F CPUID bits
    // NOTE(review): the self-referential getter looks like the usual [Intrinsic] placeholder that the
    // JIT is expected to replace; it is not meant to execute as written -- confirm against the implementation.
    public static new bool IsSupported { get => IsSupported; }

    [Intrinsic]
    public new abstract class X64 : Pclmulqdq.X64
    {
        internal X64() { }

        public static new bool IsSupported { get => IsSupported; }
    }

    /// <summary>
    /// __m512i _mm512_clmulepi64_si128 (__m512i a, __m512i b, const int imm8)
    ///   VPCLMULQDQ zmm1, zmm2, zmm3/m512, imm8
    /// </summary>
    public static Vector512<long> CarrylessMultiply(Vector512<long> left, Vector512<long> right, [ConstantExpected] byte control) => CarrylessMultiply(left, right, control);

    /// <summary>
    /// __m512i _mm512_clmulepi64_si128 (__m512i a, __m512i b, const int imm8)
    ///   VPCLMULQDQ zmm1, zmm2, zmm3/m512, imm8
    /// </summary>
    public static Vector512<ulong> CarrylessMultiply(Vector512<ulong> left, Vector512<ulong> right, [ConstantExpected] byte control) => CarrylessMultiply(left, right, control);
}

API Usage

Examples of vectorized CRC32 implementations using the equivalent C intrinsics abound. One such example: https://github.com/corsix/fast-crc32/blob/main/sample_avx512_vpclmulqdq_crc32c_v4s5x3.c

Alternative Designs

The Pclmulqdq256 and Pclmulqdq512 classes could be nested under Pclmulqdq rather than being top-level classes inheriting from it. Since this ISA includes only a single instruction, that may be preferable.

A case could be made for making Avx the base of Pclmulqdq256, as VEX encoding is required for vpclmulqdq. Likewise, Pclmulqdq512 could have Avx512F as a base given its requirement of EVEX encoding. However, the relationship will change with AVX10, where EVEX support will not imply 512-bit vector support.

Risks

N/A

ghost commented 9 months ago

Tagging subscribers to this area: @dotnet/area-system-runtime-intrinsics See info in area-owners.md if you want to be subscribed.

Issue Details
### Background and motivation `VPCLMULQDQ` is supported by Intel in the Ice Lake and newer architectures, and by AMD in Zen 4. It allows for parallel `pclmulqdq` in `Vector256` and `Vector512` and is important for implementing vectorized CRC32 [among other things](https://www.intel.com/content/dam/develop/external/us/en/documents/clmul-wp-rev-2-02-2014-04-20.pdf). ### API Proposal ```csharp namespace System.Runtime.Intrinsics.X86; /// /// This class provides access to Intel VPCLMULQDQ hardware instructions via intrinsics /// [Intrinsic] [CLSCompliant(false)] public abstract class Pclmulqdq256 : Pclmulqdq { internal Pclmulqdq256() { } // This would depend on the VPCLMULQDQ CPUID bit for VEX encoding and VPCLMULQDQ + AVX512VL for EVEX public static new bool IsSupported { get => IsSupported; } [Intrinsic] public new abstract class X64 : Pclmulqdq.X64 { internal X64() { } public static new bool IsSupported { get => IsSupported; } } /// /// __m256i _mm256_clmulepi64_si128 (__m256i a, __m256i b, const int imm8) /// VPCLMULQDQ ymm1, ymm2, ymm3/m256, imm8 /// public static Vector256 CarrylessMultiply(Vector256 left, Vector256 right, [ConstantExpected] byte control) => CarrylessMultiply(left, right, control); /// __m256i _mm256_clmulepi64_si128 (__m256i a, __m256i b, const int imm8) /// VPCLMULQDQ ymm1, ymm2, ymm3/m256, imm8 /// public static Vector256 CarrylessMultiply(Vector256 left, Vector256 right, [ConstantExpected] byte control) => CarrylessMultiply(left, right, control); } /// /// This class provides access to Intel AVX-512 VPCLMULQDQ hardware instructions via intrinsics /// [Intrinsic] [CLSCompliant(false)] public abstract class Pclmulqdq512 : Pclmulqdq { internal Pclmulqdq256() { } // This would depend on the VPCLMULQDQ + AVX512F CPUID bits public static new bool IsSupported { get => IsSupported; } [Intrinsic] public new abstract class X64 : Pclmulqdq.X64 { internal X64() { } public static new bool IsSupported { get => IsSupported; } } /// /// __m512i 
_mm512_clmulepi64_si128 (__m512i a, __m512i b, const int imm8) /// VPCLMULQDQ ymm1, ymm2, ymm3/m512, imm8 /// public static Vector512 CarrylessMultiply(Vector512 left, Vector512 right, [ConstantExpected] byte control) => CarrylessMultiply(left, right, control); /// __m512i _mm512_clmulepi64_si128 (__m512i a, __m512i b, const int imm8) /// VPCLMULQDQ ymm1, ymm2, ymm3/m512, imm8 /// public static Vector512 CarrylessMultiply(Vector512 left, Vector512 right, [ConstantExpected] byte control) => CarrylessMultiply(left, right, control); } ``` ### API Usage Examples of vectorized CRC32 implementations using the equivalent C intrinsics abound. One such example: https://github.com/corsix/fast-crc32/blob/main/sample_avx512_vpclmulqdq_crc32c_v4s5x3.c ### Alternative Designs The `Pclmulqdq256` and `Pclmulqdq512` classes could be nested under `Pclmulqdq` rather than being top-level classes inheriting from it. Since this ISA includes only a single instruction, that may be preferable. A case could be made for making `Avx` the base of `Pclmulqdq256`, as VEX encoding is required for `vpclmulqdq`. Likewise, `Pclmulqdq512` could have `Avx512F` as a base given its requirement of `EVEX` encoding. However, the relationship will change with AVX10, where EVEX support will not imply 512-bit vector support. ### Risks N/A
Author: saucecontrol
Assignees: -
Labels: `api-suggestion`, `area-System.Runtime.Intrinsics`
Milestone: -
MichalPetryka commented 9 months ago

Pclmulqdq512 could maybe be changed to inherit from Pclmulqdq256 if it implies that that's supported too.

bartonjs commented 8 months ago

Video

Looks good as proposed.

namespace System.Runtime.Intrinsics.X86;

/// <summary>
/// This class provides access to Intel VPCLMULQDQ hardware instructions via intrinsics
/// </summary>
[Intrinsic]
[CLSCompliant(false)]
public abstract class Pclmulqdq256 : Pclmulqdq
{
    internal Pclmulqdq256() { }

    // This would depend on the VPCLMULQDQ CPUID bit for VEX encoding and VPCLMULQDQ + AVX512VL for EVEX
    // NOTE(review): the self-referential getter looks like the usual [Intrinsic] placeholder that the
    // JIT is expected to replace; it is not meant to execute as written -- confirm against the implementation.
    public static new bool IsSupported { get => IsSupported; }

    [Intrinsic]
    public new abstract class X64 : Pclmulqdq.X64
    {
        internal X64() { }

        public static new bool IsSupported { get => IsSupported; }
    }

    /// <summary>
    /// __m256i _mm256_clmulepi64_si128 (__m256i a, __m256i b, const int imm8)
    ///   VPCLMULQDQ ymm1, ymm2, ymm3/m256, imm8
    /// </summary>
    public static Vector256<long> CarrylessMultiply(Vector256<long> left, Vector256<long> right, [ConstantExpected] byte control) => CarrylessMultiply(left, right, control);

    /// <summary>
    /// __m256i _mm256_clmulepi64_si128 (__m256i a, __m256i b, const int imm8)
    ///   VPCLMULQDQ ymm1, ymm2, ymm3/m256, imm8
    /// </summary>
    public static Vector256<ulong> CarrylessMultiply(Vector256<ulong> left, Vector256<ulong> right, [ConstantExpected] byte control) => CarrylessMultiply(left, right, control);
}

/// <summary>
/// This class provides access to Intel AVX-512 VPCLMULQDQ hardware instructions via intrinsics
/// </summary>
[Intrinsic]
[CLSCompliant(false)]
public abstract class Pclmulqdq512 : Pclmulqdq
{
    internal Pclmulqdq512() { }

    // This would depend on the VPCLMULQDQ + AVX512F CPUID bits
    // NOTE(review): the self-referential getter looks like the usual [Intrinsic] placeholder that the
    // JIT is expected to replace; it is not meant to execute as written -- confirm against the implementation.
    public static new bool IsSupported { get => IsSupported; }

    [Intrinsic]
    public new abstract class X64 : Pclmulqdq.X64
    {
        internal X64() { }

        public static new bool IsSupported { get => IsSupported; }
    }

    /// <summary>
    /// __m512i _mm512_clmulepi64_si128 (__m512i a, __m512i b, const int imm8)
    ///   VPCLMULQDQ zmm1, zmm2, zmm3/m512, imm8
    /// </summary>
    public static Vector512<long> CarrylessMultiply(Vector512<long> left, Vector512<long> right, [ConstantExpected] byte control) => CarrylessMultiply(left, right, control);

    /// <summary>
    /// __m512i _mm512_clmulepi64_si128 (__m512i a, __m512i b, const int imm8)
    ///   VPCLMULQDQ zmm1, zmm2, zmm3/m512, imm8
    /// </summary>
    public static Vector512<ulong> CarrylessMultiply(Vector512<ulong> left, Vector512<ulong> right, [ConstantExpected] byte control) => CarrylessMultiply(left, right, control);
}