EgorBot / runtime-utils

MIT License
0 stars 1 forks source link

tests #149

Open EgorBo opened 6 days ago

EgorBo commented 6 days ago

@EgorBot -linux_aws_genoa -windows_aws_sapphirelake


using BenchmarkDotNet.Attributes;
using System.Buffers.Binary;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics.X86;
using System.Runtime.Intrinsics;
using System.Security.Cryptography;

/// <summary>
/// Benchmarks four ways of producing a byte-reversed copy of the 16-byte <see cref="Data"/> buffer.
/// Every benchmark returns the reversed bytes in a newly allocated array.
/// </summary>
[MemoryDiagnoser]
public class ReverseTests
{
    // Input buffer: 16 random bytes generated once in the constructor.
    public byte[] Data;
    // Shuffle indices 15..0, so result byte i is taken from source byte 15 - i.
    private readonly Vector128<byte> _pos;

    /// <summary>Fills <see cref="Data"/> with random bytes and builds the reversing index vector.</summary>
    public ReverseTests()
    {
        Data = RandomNumberGenerator.GetBytes(16);
        _pos = Vector128.Create((byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    }

    /// <summary>
    /// Reverses the buffer with the cross-platform <c>Vector128.Shuffle</c>.
    /// The index vector is read from the <c>_pos</c> instance field rather than written as an inline constant.
    /// </summary>
    /// <returns>A new 16-byte array holding the bytes of <see cref="Data"/> in reverse order.</returns>
    [Benchmark]
    public ReadOnlySpan<byte> VectorTest()
    {
        var output = new byte[16];

        // Read 16 bytes starting at Data[0] into a single vector.
        var tempFirst = Vector128.LoadUnsafe(ref Data[0]);
        tempFirst = Vector128.Shuffle(tempFirst, _pos);

        // Write the shuffled vector to the start of the result array.
        tempFirst.StoreUnsafe(ref output[0]);

        return output;
    }

    /// <summary>
    /// Reverses the buffer with the x86-specific <c>Ssse3.Shuffle</c> intrinsic, using the same
    /// <c>_pos</c> index vector as <see cref="VectorTest"/>.
    /// </summary>
    /// <returns>A new 16-byte array holding the bytes of <see cref="Data"/> in reverse order.</returns>
    [Benchmark]
    public ReadOnlySpan<byte> Ssse3Test()
    {
        var output = new byte[16];

        var tempFirst = Vector128.LoadUnsafe(ref Data[0]);
        // No Ssse3.IsSupported check is made here; the call is issued unconditionally.
        tempFirst = Ssse3.Shuffle(tempFirst, _pos);

        tempFirst.StoreUnsafe(ref output[0]);

        return output;
    }

    /// <summary>
    /// Reverses the buffer without vector instructions: reads it as two 8-byte integers,
    /// byte-swaps each one, and writes them back in swapped positions.
    /// </summary>
    /// <returns>A new 16-byte array holding the bytes of <see cref="Data"/> in reverse order.</returns>
    [Benchmark]
    public ReadOnlySpan<byte> BinaryPrimitivesTest()
    {
        var output = new byte[16];

        // Bytes 0..7 and bytes 8..15 of the input.
        var tempFirst = Unsafe.ReadUnaligned<long>(ref Data[0]);
        var tempLast = Unsafe.ReadUnaligned<long>(ref Data[8]);

        // The byte-swapped upper half becomes the lower half of the result, and vice versa.
        Unsafe.WriteUnaligned(ref output[0], BinaryPrimitives.ReverseEndianness(tempLast));
        Unsafe.WriteUnaligned(ref output[8], BinaryPrimitives.ReverseEndianness(tempFirst));

        return output;
    }

    /// <summary>
    /// Copies <see cref="Data"/> into a new array and reverses that copy in place with <c>Span&lt;T&gt;.Reverse</c>.
    /// </summary>
    /// <returns>A span over the reversed copy.</returns>
    [Benchmark]
    public ReadOnlySpan<byte> SpanTest()
    {
        // NOTE(review): no `using System.Linq;` is visible in this snippet; ToArray() on a byte[]
        // presumably binds to Enumerable.ToArray through implicit usings — confirm.
        var span = new Span<byte>(Data.ToArray());
        span.Reverse();

        return span;
    }
}
``
EgorBot commented 6 days ago

❌ Failed on AwsGenoa: Job failed, see logs.

cc @EgorBo (logs)

EgorBo commented 6 days ago

@EgorBot -linux_aws_genoa -windows_aws_sapphirelake

using BenchmarkDotNet.Attributes;
using System.Buffers.Binary;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics.X86;
using System.Runtime.Intrinsics;
using System.Security.Cryptography;

/// <summary>
/// Benchmarks four ways of producing a byte-reversed copy of the 16-byte <see cref="Data"/> buffer.
/// Every benchmark returns the reversed bytes in a newly allocated array.
/// </summary>
[MemoryDiagnoser]
public class ReverseTests
{
    // Input buffer: 16 random bytes generated once in the constructor.
    public byte[] Data;
    // Shuffle indices 15..0, so result byte i is taken from source byte 15 - i.
    private readonly Vector128<byte> _pos;

    /// <summary>Fills <see cref="Data"/> with random bytes and builds the reversing index vector.</summary>
    public ReverseTests()
    {
        Data = RandomNumberGenerator.GetBytes(16);
        _pos = Vector128.Create((byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    }

    /// <summary>
    /// Reverses the buffer with the cross-platform <c>Vector128.Shuffle</c>.
    /// The index vector is read from the <c>_pos</c> instance field rather than written as an inline constant.
    /// </summary>
    /// <returns>A new 16-byte array holding the bytes of <see cref="Data"/> in reverse order.</returns>
    [Benchmark]
    public ReadOnlySpan<byte> VectorTest()
    {
        var output = new byte[16];

        // Read 16 bytes starting at Data[0] into a single vector.
        var tempFirst = Vector128.LoadUnsafe(ref Data[0]);
        tempFirst = Vector128.Shuffle(tempFirst, _pos);

        // Write the shuffled vector to the start of the result array.
        tempFirst.StoreUnsafe(ref output[0]);

        return output;
    }

    /// <summary>
    /// Reverses the buffer with the x86-specific <c>Ssse3.Shuffle</c> intrinsic, using the same
    /// <c>_pos</c> index vector as <see cref="VectorTest"/>.
    /// </summary>
    /// <returns>A new 16-byte array holding the bytes of <see cref="Data"/> in reverse order.</returns>
    [Benchmark]
    public ReadOnlySpan<byte> Ssse3Test()
    {
        var output = new byte[16];

        var tempFirst = Vector128.LoadUnsafe(ref Data[0]);
        // No Ssse3.IsSupported check is made here; the call is issued unconditionally.
        tempFirst = Ssse3.Shuffle(tempFirst, _pos);

        tempFirst.StoreUnsafe(ref output[0]);

        return output;
    }

    /// <summary>
    /// Reverses the buffer without vector instructions: reads it as two 8-byte integers,
    /// byte-swaps each one, and writes them back in swapped positions.
    /// </summary>
    /// <returns>A new 16-byte array holding the bytes of <see cref="Data"/> in reverse order.</returns>
    [Benchmark]
    public ReadOnlySpan<byte> BinaryPrimitivesTest()
    {
        var output = new byte[16];

        // Bytes 0..7 and bytes 8..15 of the input.
        var tempFirst = Unsafe.ReadUnaligned<long>(ref Data[0]);
        var tempLast = Unsafe.ReadUnaligned<long>(ref Data[8]);

        // The byte-swapped upper half becomes the lower half of the result, and vice versa.
        Unsafe.WriteUnaligned(ref output[0], BinaryPrimitives.ReverseEndianness(tempLast));
        Unsafe.WriteUnaligned(ref output[8], BinaryPrimitives.ReverseEndianness(tempFirst));

        return output;
    }

    /// <summary>
    /// Copies <see cref="Data"/> into a new array and reverses that copy in place with <c>Span&lt;T&gt;.Reverse</c>.
    /// </summary>
    /// <returns>A span over the reversed copy.</returns>
    [Benchmark]
    public ReadOnlySpan<byte> SpanTest()
    {
        // NOTE(review): no `using System.Linq;` is visible in this snippet; ToArray() on a byte[]
        // presumably binds to Enumerable.ToArray through implicit usings — confirm.
        var span = new Span<byte>(Data.ToArray());
        span.Reverse();

        return span;
    }
}
EgorBot commented 6 days ago

Benchmark results on linux-genoa

BenchmarkDotNet v0.14.0, Ubuntu 24.04 LTS (Noble Numbat)
AMD EPYC 9R14, 1 CPU, 4 logical and 4 physical cores
DefaultJob : .NET 9.0.0 (9.0.24.47305), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
| Method | Mean | Error | Gen0 | Allocated |
|---|---:|---:|---:|---:|
| VectorTest | 16.894 ns | 0.0082 ns | 0.0048 | 40 B |
| Ssse3Test | 6.158 ns | 0.1316 ns | 0.0048 | 40 B |
| BinaryPrimitivesTest | 5.354 ns | 0.0451 ns | 0.0048 | 40 B |
| SpanTest | 13.596 ns | 0.0547 ns | 0.0048 | 40 B |

BDN_Artifacts.zip

EgorBot commented 6 days ago

cc @EgorBo (logs)

kzorin52 commented 6 days ago

@EgorBot -linux_aws_genoa -windows_aws_sapphirelake

using System;
using System.Buffers.Binary;
using System.Collections.Generic;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using System.Security.Cryptography;
using System.Text;
using BenchmarkDotNet.Attributes;

namespace GuidWars;

/// <summary>
/// Benchmarks four ways of producing a byte-reversed copy of the 16-byte <see cref="Data"/> buffer.
/// Every benchmark allocates and returns a new 16-byte array.
/// </summary>
[MemoryDiagnoser]
public class ReverseTests
{
    // Input buffer: 16 random bytes generated once per instance.
    public byte[] Data = RandomNumberGenerator.GetBytes(16);
    // Shuffle indices 15..0 (result byte i comes from source byte 15 - i); used only by Ssse3Test.
    private readonly Vector128<byte> _pos = Vector128.Create((byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);

    /// <summary>
    /// Copies <see cref="Data"/> into the result array, then reverses the result in place
    /// with <c>Span&lt;T&gt;.Reverse</c>.
    /// </summary>
    /// <returns>A new 16-byte array holding the bytes of <see cref="Data"/> in reverse order.</returns>
    [Benchmark]
    public byte[] SpanTest()
    {
        var output = new byte[16];

        Data.AsSpan().CopyTo(output);
        output.AsSpan().Reverse();

        return output;
    }

    /// <summary>
    /// Reverses the buffer with the cross-platform <c>Vector128.Shuffle</c>.
    /// The index vector is built inline at the call site instead of being read from the <c>_pos</c> field.
    /// </summary>
    /// <returns>A new 16-byte array holding the bytes of <see cref="Data"/> in reverse order.</returns>
    [Benchmark]
    public byte[] VectorTest()
    {
        var output = new byte[16];

        // Read 16 bytes starting at Data[0] into a single vector.
        var tempFirst = Vector128.LoadUnsafe(ref Data[0]);
        tempFirst = Vector128.Shuffle(tempFirst, Vector128.Create((byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));

        tempFirst.StoreUnsafe(ref output[0]);

        return output;
    }

    /// <summary>
    /// Reverses the buffer with the x86-specific <c>Ssse3.Shuffle</c> intrinsic, taking the
    /// index vector from the <c>_pos</c> instance field.
    /// </summary>
    /// <returns>A new 16-byte array holding the bytes of <see cref="Data"/> in reverse order.</returns>
    [Benchmark]
    public byte[] Ssse3Test()
    {
        var output = new byte[16];

        var tempFirst = Vector128.LoadUnsafe(ref Data[0]);
        // No Ssse3.IsSupported check is made here; the call is issued unconditionally.
        tempFirst = Ssse3.Shuffle(tempFirst, _pos);

        tempFirst.StoreUnsafe(ref output[0]);

        return output;
    }

    /// <summary>
    /// Reverses the buffer without vector instructions: reads it as two 8-byte integers,
    /// byte-swaps each one, and writes them back in swapped positions.
    /// </summary>
    /// <returns>A new 16-byte array holding the bytes of <see cref="Data"/> in reverse order.</returns>
    [Benchmark]
    public byte[] BinaryPrimitivesTest()
    {
        var output = new byte[16];

        // Bytes 0..7 and bytes 8..15 of the input.
        var tempFirst = Unsafe.ReadUnaligned<long>(ref Data[0]);
        var tempLast = Unsafe.ReadUnaligned<long>(ref Data[8]);

        // The byte-swapped upper half becomes the lower half of the result, and vice versa.
        Unsafe.WriteUnaligned(ref output[0], BinaryPrimitives.ReverseEndianness(tempLast));
        Unsafe.WriteUnaligned(ref output[8], BinaryPrimitives.ReverseEndianness(tempFirst));

        return output;
    }
}
EgorBot commented 6 days ago

Benchmark results on windows-sapphirelake

BenchmarkDotNet v0.14.0, Windows 10 (10.0.20348.2762)
Intel Xeon Platinum 8488C, 1 CPU, 16 logical and 8 physical cores
DefaultJob : .NET 9.0.0 (9.0.24.47305), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
| Method | Mean | Error | Gen0 | Allocated |
|---|---:|---:|---:|---:|
| VectorTest | 17.654 ns | 0.0465 ns | 0.0007 | 40 B |
| Ssse3Test | 4.919 ns | 0.1256 ns | 0.0007 | 40 B |
| BinaryPrimitivesTest | 5.017 ns | 0.1172 ns | 0.0007 | 40 B |
| SpanTest | 15.365 ns | 0.0229 ns | 0.0007 | 40 B |

BDN_Artifacts.zip

EgorBot commented 6 days ago

cc @EgorBo (logs)

EgorBot commented 6 days ago

Benchmark results on linux-genoa

BenchmarkDotNet v0.14.0, Ubuntu 24.04 LTS (Noble Numbat)
AMD EPYC 9R14, 1 CPU, 4 logical and 4 physical cores
DefaultJob : .NET 9.0.0 (9.0.24.47305), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
| Method | Mean | Error | Gen0 | Allocated |
|---|---:|---:|---:|---:|
| SpanTest | 9.049 ns | 0.1481 ns | 0.0048 | 40 B |
| VectorTest | 5.948 ns | 0.0357 ns | 0.0048 | 40 B |
| Ssse3Test | 6.385 ns | 0.0658 ns | 0.0048 | 40 B |
| BinaryPrimitivesTest | 6.069 ns | 0.1397 ns | 0.0048 | 40 B |

BDN_Artifacts.zip

EgorBot commented 6 days ago

cc @kzorin52 (logs)

EgorBot commented 6 days ago

Benchmark results on windows-sapphirelake

BenchmarkDotNet v0.14.0, Windows 10 (10.0.20348.2762)
Intel Xeon Platinum 8488C, 1 CPU, 16 logical and 8 physical cores
DefaultJob : .NET 9.0.0 (9.0.24.47305), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
| Method | Mean | Error | Gen0 | Allocated |
|---|---:|---:|---:|---:|
| SpanTest | 8.378 ns | 0.2397 ns | 0.0007 | 40 B |
| VectorTest | 5.531 ns | 0.1856 ns | 0.0007 | 40 B |
| Ssse3Test | 5.452 ns | 0.1884 ns | 0.0007 | 40 B |
| BinaryPrimitivesTest | 5.602 ns | 0.1867 ns | 0.0007 | 40 B |

BDN_Artifacts.zip

EgorBot commented 6 days ago

cc @kzorin52 (logs)

kzorin52 commented 6 days ago

@EgorBot -linux_aws_genoa -windows_aws_sapphirelake

using System;
using System.Buffers.Binary;
using System.Collections.Generic;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using System.Security.Cryptography;
using System.Text;
using BenchmarkDotNet.Attributes;

namespace GuidWars;

/// <summary>
/// Benchmarks three ways of producing a byte-reversed copy of the 32-byte <see cref="Data"/> buffer.
/// Every benchmark allocates and returns a new 32-byte array.
/// </summary>
public class Reverse256Tests
{
    // Input buffer: 32 random bytes generated once per instance.
    public byte[] Data = RandomNumberGenerator.GetBytes(32);

    /// <summary>
    /// Reverses the buffer in two AVX2 steps: a per-lane byte shuffle followed by a swap of the
    /// two 128-bit lanes. The shuffle mask is created inside the method.
    /// </summary>
    /// <returns>A new 32-byte array holding the bytes of <see cref="Data"/> in reverse order.</returns>
    [Benchmark]
    public byte[] Avx2Reverse()
    {
        var output = new byte[32];

        // Indices are relative to each 128-bit lane, so the same 15..0 pattern is repeated for both lanes.
        var reverseMask = Vector256.Create(
            (byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, // first 128-bit lane
            15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); // second 128-bit lane

        var tempFirst = Vector256.LoadUnsafe(ref Data[0]);

        // Step 1: reverse the bytes inside each lane.
        tempFirst = Avx2.Shuffle(tempFirst, reverseMask);
        // Step 2: control 0b00_01 with the same vector as both operands exchanges the low and high lanes.
        tempFirst = Avx2.Permute2x128(tempFirst, tempFirst, 0b00_01);

        tempFirst.StoreUnsafe(ref output[0]);

        return output;
    }

    // Same per-lane reversing mask as in Avx2Reverse, created once and kept in a static readonly field.
    private static readonly Vector256<byte> cachedMask = Vector256.Create(
        (byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, // first 128-bit lane
        15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); // second 128-bit lane

    /// <summary>
    /// Same shuffle-then-swap sequence as <see cref="Avx2Reverse"/>, but the mask is read from
    /// the static <c>cachedMask</c> field.
    /// </summary>
    /// <returns>A new 32-byte array holding the bytes of <see cref="Data"/> in reverse order.</returns>
    [Benchmark]
    public byte[] Avx2ReverseCachedMask()
    {
        var output = new byte[32];

        var tempFirst = Vector256.LoadUnsafe(ref Data[0]);

        // Reverse within each lane, then exchange the two lanes.
        tempFirst = Avx2.Shuffle(tempFirst, cachedMask);
        tempFirst = Avx2.Permute2x128(tempFirst, tempFirst, 0b00_01);

        tempFirst.StoreUnsafe(ref output[0]);

        return output;
    }

    /// <summary>
    /// Reverses the buffer as two independent 16-byte halves and stores them in swapped positions.
    /// Despite the method name, the shuffles are issued through the cross-platform
    /// <c>Vector128.Shuffle</c>, not <c>Ssse3.Shuffle</c>.
    /// </summary>
    /// <returns>A new 32-byte array holding the bytes of <see cref="Data"/> in reverse order.</returns>
    [Benchmark]
    public byte[] Ssse3x2Reverse()
    {
        var output = new byte[32];

        // Bytes 0..15 and bytes 16..31 of the input (the second argument is an element offset).
        var tempFirst = Vector128.LoadUnsafe(ref Data[0]);
        var tempLast = Vector128.LoadUnsafe(ref Data[0], 16);

        tempFirst = Vector128.Shuffle(tempFirst, Vector128.Create((byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
        tempLast = Vector128.Shuffle(tempLast, Vector128.Create((byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));

        // The reversed first half goes to the back of the result and the reversed second half to the front.
        tempFirst.StoreUnsafe(ref output[0], 16);
        tempLast.StoreUnsafe(ref output[0]);

        return output;
    }
}
EgorBot commented 6 days ago

Benchmark results on linux-genoa

BenchmarkDotNet v0.14.0, Ubuntu 24.04 LTS (Noble Numbat)
AMD EPYC 9R14, 1 CPU, 4 logical and 4 physical cores
DefaultJob : .NET 9.0.0 (9.0.24.47305), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
| Method | Mean | Error |
|---|---:|---:|
| Avx2Reverse | 7.259 ns | 0.0375 ns |
| Avx2ReverseCachedMask | 6.482 ns | 0.0338 ns |
| Ssse3x2Reverse | 6.738 ns | 0.0280 ns |

BDN_Artifacts.zip

EgorBot commented 6 days ago

cc @kzorin52 (logs)

EgorBot commented 6 days ago

Benchmark results on windows-sapphirelake

BenchmarkDotNet v0.14.0, Windows 10 (10.0.20348.2762)
Intel Xeon Platinum 8488C, 1 CPU, 16 logical and 8 physical cores
DefaultJob : .NET 9.0.0 (9.0.24.47305), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
| Method | Mean | Error |
|---|---:|---:|
| Avx2Reverse | 7.619 ns | 0.0484 ns |
| Avx2ReverseCachedMask | 7.614 ns | 0.0630 ns |
| Ssse3x2Reverse | 7.663 ns | 0.0740 ns |

BDN_Artifacts.zip

EgorBot commented 6 days ago

cc @kzorin52 (logs)

kzorin52 commented 6 days ago

@EgorBot -linux_aws_genoa

using System;
using System.Buffers.Binary;
using System.Collections.Generic;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using System.Security.Cryptography;
using System.Text;
using BenchmarkDotNet.Attributes;

namespace GuidWars;

/// <summary>
/// Benchmarks five ways of producing a byte-reversed copy of the 32-byte <see cref="Data"/> buffer.
/// The variants differ in which shuffle API is used and in whether the mask is created inline
/// or read from a static field. Every benchmark allocates and returns a new 32-byte array.
/// </summary>
public class Reverse256Tests
{
    // Input buffer: 32 random bytes generated once per instance.
    public byte[] Data = RandomNumberGenerator.GetBytes(32);

    /// <summary>
    /// Reverses the buffer in two AVX2 steps: a per-lane byte shuffle followed by a swap of the
    /// two 128-bit lanes. The shuffle mask is created inside the method.
    /// </summary>
    /// <returns>A new 32-byte array holding the bytes of <see cref="Data"/> in reverse order.</returns>
    [Benchmark]
    public byte[] Avx2ReversePermute()
    {
        var output = new byte[32];

        // Indices are relative to each 128-bit lane, so the same 15..0 pattern is repeated for both lanes.
        var reverseMask = Vector256.Create(
            (byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, // first 128-bit lane
            15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); // second 128-bit lane

        var tempFirst = Vector256.LoadUnsafe(ref Data[0]);

        // Step 1: reverse the bytes inside each lane.
        tempFirst = Avx2.Shuffle(tempFirst, reverseMask);
        // Step 2: control 0b00_01 with the same vector as both operands exchanges the low and high lanes.
        tempFirst = Avx2.Permute2x128(tempFirst, tempFirst, 0b00_01);

        tempFirst.StoreUnsafe(ref output[0]);

        return output;
    }

    /// <summary>
    /// Reverses the buffer with a single cross-platform <c>Vector256.Shuffle</c> whose indices
    /// 31..0 address the whole 32-byte vector; the index vector is created inline.
    /// Despite the "Avx2" in the name, no <c>Avx2</c> method is called directly.
    /// </summary>
    /// <returns>A new 32-byte array holding the bytes of <see cref="Data"/> in reverse order.</returns>
    [Benchmark]
    public byte[] Avx2ReverseInlinedMask()
    {
        var output = new byte[32];
        var tempFirst = Vector256.LoadUnsafe(ref Data[0]);

        tempFirst = Vector256.Shuffle(tempFirst, Vector256.Create(
            (byte)31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
            15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
        // The lane swap below is left disabled: the 31..0 indices above already cross the lanes.
        //tempFirst = Avx2.Permute2x128(tempFirst, tempFirst, 0b00_01);

        tempFirst.StoreUnsafe(ref output[0]);

        return output;
    }

    // Full-width reversing indices 31..0 for Vector256.Shuffle, kept in a static readonly field.
    private static readonly Vector256<byte> cachedMask2 = Vector256.Create(
        (byte)31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
        15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);

    /// <summary>
    /// Same single <c>Vector256.Shuffle</c> as <see cref="Avx2ReverseInlinedMask"/>, but the index
    /// vector is read from the static <c>cachedMask2</c> field.
    /// </summary>
    /// <returns>A new 32-byte array holding the bytes of <see cref="Data"/> in reverse order.</returns>
    [Benchmark]
    public byte[] Avx2ReverseCachedMask()
    {
        var output = new byte[32];
        var tempFirst = Vector256.LoadUnsafe(ref Data[0]);

        tempFirst = Vector256.Shuffle(tempFirst, cachedMask2);
        tempFirst.StoreUnsafe(ref output[0]);

        return output;
    }

    // Per-lane reversing mask (15..0 repeated for both lanes) for Avx2.Shuffle, kept in a static readonly field.
    private static readonly Vector256<byte> cachedMask = Vector256.Create(
        (byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, // first 128-bit lane
        15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); // second 128-bit lane

    /// <summary>
    /// Same shuffle-then-swap sequence as <see cref="Avx2ReversePermute"/>, but the mask is read
    /// from the static <c>cachedMask</c> field.
    /// </summary>
    /// <returns>A new 32-byte array holding the bytes of <see cref="Data"/> in reverse order.</returns>
    [Benchmark]
    public byte[] Avx2ReversePermuteCachedMask()
    {
        var output = new byte[32];

        var tempFirst = Vector256.LoadUnsafe(ref Data[0]);

        // Reverse within each lane, then exchange the two lanes.
        tempFirst = Avx2.Shuffle(tempFirst, cachedMask);
        tempFirst = Avx2.Permute2x128(tempFirst, tempFirst, 0b00_01);

        tempFirst.StoreUnsafe(ref output[0]);

        return output;
    }

    /// <summary>
    /// Reverses the buffer as two independent 16-byte halves and stores them in swapped positions.
    /// Despite the method name, the shuffles are issued through the cross-platform
    /// <c>Vector128.Shuffle</c>, not <c>Ssse3.Shuffle</c>.
    /// </summary>
    /// <returns>A new 32-byte array holding the bytes of <see cref="Data"/> in reverse order.</returns>
    [Benchmark]
    public byte[] Ssse3x2Reverse()
    {
        var output = new byte[32];

        // Bytes 0..15 and bytes 16..31 of the input (the second argument is an element offset).
        var tempFirst = Vector128.LoadUnsafe(ref Data[0]);
        var tempLast = Vector128.LoadUnsafe(ref Data[0], 16);

        tempFirst = Vector128.Shuffle(tempFirst, Vector128.Create((byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
        tempLast = Vector128.Shuffle(tempLast, Vector128.Create((byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));

        // The reversed first half goes to the back of the result and the reversed second half to the front.
        tempFirst.StoreUnsafe(ref output[0], 16);
        tempLast.StoreUnsafe(ref output[0]);

        return output;
    }
}
EgorBot commented 6 days ago

Benchmark results on linux-genoa

BenchmarkDotNet v0.14.0, Ubuntu 24.04 LTS (Noble Numbat)
AMD EPYC 9R14, 1 CPU, 4 logical and 4 physical cores
DefaultJob : .NET 9.0.0 (9.0.24.47305), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
| Method | Mean | Error |
|---|---:|---:|
| Avx2ReversePermute | 6.263 ns | 0.0505 ns |
| Avx2ReverseInlinedMask | 6.425 ns | 0.0061 ns |
| Avx2ReverseCachedMask | 6.682 ns | 0.0648 ns |
| Avx2ReversePermuteCachedMask | 6.628 ns | 0.0917 ns |
| Ssse3x2Reverse | 6.410 ns | 0.0590 ns |

BDN_Artifacts.zip

EgorBot commented 6 days ago

cc @kzorin52 (logs)

kzorin52 commented 6 days ago

@EgorBot -linux_aws_genoa

using System;
using System.Buffers.Binary;
using System.Collections.Generic;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using System.Security.Cryptography;
using System.Text;
using BenchmarkDotNet.Attributes;

namespace GuidWars;

/// <summary>
/// Benchmarks two ways of producing a byte-reversed copy of the 64-byte <see cref="Data"/> buffer:
/// one 512-bit shuffle versus two 256-bit shuffle-and-swap sequences.
/// Both benchmarks allocate and return a new 64-byte array.
/// </summary>
public class Reverse512Tests
{
    // Input buffer: 64 random bytes generated once per instance.
    public byte[] Data = RandomNumberGenerator.GetBytes(64);

    /// <summary>
    /// Reverses the buffer with a single cross-platform <c>Vector512.Shuffle</c> whose indices
    /// 63..0 address the whole 64-byte vector; the index vector is created inline.
    /// </summary>
    /// <returns>A new 64-byte array holding the bytes of <see cref="Data"/> in reverse order.</returns>
    [Benchmark]
    public byte[] SingleAvx512()
    {
        var output = new byte[64];

        // Read all 64 bytes starting at Data[0] into one vector.
        var tempFirst = Vector512.LoadUnsafe(ref Data[0]);
        tempFirst = Vector512.Shuffle(tempFirst, Vector512.Create(
            (byte)63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48,
            47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32,
            31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
            15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));

        tempFirst.StoreUnsafe(ref output[0]);
        return output;
    }

    /// <summary>
    /// Reverses the buffer as two 32-byte halves. Each half is reversed with an AVX2 per-lane
    /// shuffle plus a 128-bit lane swap, and the halves are then stored in swapped positions.
    /// </summary>
    /// <returns>A new 64-byte array holding the bytes of <see cref="Data"/> in reverse order.</returns>
    [Benchmark]
    public byte[] TwoAvx2() // dotnet solution
    {
        var output = new byte[64];

        // Indices are relative to each 128-bit lane, so the same 15..0 pattern is repeated for both lanes.
        var reverseMask = Vector256.Create(
            (byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, // first 128-bit lane
            15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); // second 128-bit lane

        // Bytes 0..31 and bytes 32..63 of the input.
        var tempFirst = Vector256.LoadUnsafe(ref Data[0]);
        var tempLast = Vector256.LoadUnsafe(ref Data[32]);

        // For each half: reverse within each 128-bit lane, then exchange the two lanes (control 0b00_01).
        tempFirst = Avx2.Shuffle(tempFirst, reverseMask);
        tempFirst = Avx2.Permute2x128(tempFirst, tempFirst, 0b00_01);
        tempLast = Avx2.Shuffle(tempLast, reverseMask);
        tempLast = Avx2.Permute2x128(tempLast, tempLast, 0b00_01);

        // The reversed second half goes to the front of the result and the reversed first half to the back.
        tempLast.StoreUnsafe(ref output[0]);
        tempFirst.StoreUnsafe(ref output[32]);

        return output;
    }
}
EgorBot commented 6 days ago

Benchmark results on linux-genoa

BenchmarkDotNet v0.14.0, Ubuntu 24.04 LTS (Noble Numbat)
AMD EPYC 9R14, 1 CPU, 4 logical and 4 physical cores
DefaultJob : .NET 9.0.0 (9.0.24.47305), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
| Method | Mean | Error |
|---|---:|---:|
| SingleAvx512 | 7.468 ns | 0.0563 ns |
| TwoAvx2 | 7.165 ns | 0.0832 ns |

BDN_Artifacts.zip

EgorBot commented 6 days ago

cc @kzorin52 (logs)