Open EgorBo opened 1 week ago
@EgorBot -linux_aws_genoa -windows_aws_sapphirelake
using BenchmarkDotNet.Attributes;
using System.Buffers.Binary;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics.X86;
using System.Runtime.Intrinsics;
using System.Security.Cryptography;
/// <summary>
/// Benchmarks four ways of building a byte-reversed copy of the 16-byte <see cref="Data"/> buffer.
/// Every benchmark produces its result in a freshly allocated 16-byte array.
/// </summary>
[MemoryDiagnoser]
public class ReverseTests
{
    /// <summary>Input buffer: 16 random bytes generated once per instance.</summary>
    public byte[] Data = RandomNumberGenerator.GetBytes(16);

    // Shuffle control vector holding the byte indices 15..0 (i.e. "reverse all 16 bytes").
    // It is an instance field, so both shuffle benchmarks read it from the object at run time.
    private readonly Vector128<byte> _reverseIndices =
        Vector128.Create((byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);

    /// <summary>
    /// Reverses the buffer with the cross-platform <c>Vector128.Shuffle</c> helper,
    /// using the index vector stored in the instance field.
    /// </summary>
    /// <returns>A new 16-byte array containing <see cref="Data"/> in reverse order.</returns>
    [Benchmark]
    public ReadOnlySpan<byte> VectorTest()
    {
        byte[] reversed = new byte[16];
        Vector128<byte> source = Vector128.LoadUnsafe(ref Data[0]);
        Vector128<byte> shuffled = Vector128.Shuffle(source, _reverseIndices);
        shuffled.StoreUnsafe(ref reversed[0]);
        return reversed;
    }

    /// <summary>
    /// Reverses the buffer with the x86-specific <c>Ssse3.Shuffle</c> intrinsic,
    /// using the same index vector as <see cref="VectorTest"/>.
    /// </summary>
    /// <returns>A new 16-byte array containing <see cref="Data"/> in reverse order.</returns>
    [Benchmark]
    public ReadOnlySpan<byte> Ssse3Test()
    {
        byte[] reversed = new byte[16];
        Vector128<byte> source = Vector128.LoadUnsafe(ref Data[0]);
        Vector128<byte> shuffled = Ssse3.Shuffle(source, _reverseIndices);
        shuffled.StoreUnsafe(ref reversed[0]);
        return reversed;
    }

    /// <summary>
    /// Reverses the buffer as two 64-bit halves: each half is byte-swapped with
    /// <c>BinaryPrimitives.ReverseEndianness</c> and written to the opposite half of the output.
    /// </summary>
    /// <returns>A new 16-byte array containing <see cref="Data"/> in reverse order.</returns>
    [Benchmark]
    public ReadOnlySpan<byte> BinaryPrimitivesTest()
    {
        byte[] reversed = new byte[16];
        long firstEight = Unsafe.ReadUnaligned<long>(ref Data[0]);
        long lastEight = Unsafe.ReadUnaligned<long>(ref Data[8]);

        // The halves trade places: the byte-swapped tail becomes the new head and vice versa.
        Unsafe.WriteUnaligned(ref reversed[0], BinaryPrimitives.ReverseEndianness(lastEight));
        Unsafe.WriteUnaligned(ref reversed[8], BinaryPrimitives.ReverseEndianness(firstEight));
        return reversed;
    }

    /// <summary>
    /// Copies the buffer with <c>ToArray()</c> and reverses the copy in place through a span.
    /// </summary>
    /// <returns>A span over the reversed copy of <see cref="Data"/>.</returns>
    [Benchmark]
    public ReadOnlySpan<byte> SpanTest()
    {
        // NOTE(review): ToArray() on a byte[] presumably binds to the LINQ extension; confirm which usings are in effect.
        byte[] copy = Data.ToArray();
        Span<byte> reversed = new Span<byte>(copy);
        reversed.Reverse();
        return reversed;
    }
}
linux-genoa
BenchmarkDotNet v0.14.0, Ubuntu 24.04 LTS (Noble Numbat)
AMD EPYC 9R14, 1 CPU, 4 logical and 4 physical cores
DefaultJob : .NET 9.0.0 (9.0.24.47305), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
Method | Mean | Error | Gen0 | Allocated |
---|---|---|---|---|
VectorTest | 16.894 ns | 0.0082 ns | 0.0048 | 40 B |
Ssse3Test | 6.158 ns | 0.1316 ns | 0.0048 | 40 B |
BinaryPrimitivesTest | 5.354 ns | 0.0451 ns | 0.0048 | 40 B |
SpanTest | 13.596 ns | 0.0547 ns | 0.0048 | 40 B |
@EgorBot -linux_aws_genoa -windows_aws_sapphirelake
using System;
using System.Buffers.Binary;
using System.Collections.Generic;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using System.Security.Cryptography;
using System.Text;
using BenchmarkDotNet.Attributes;
namespace GuidWars;
/// <summary>
/// Benchmarks four ways of building a byte-reversed copy of the 16-byte <see cref="Data"/> buffer.
/// Every benchmark returns a freshly allocated 16-byte array.
/// </summary>
[MemoryDiagnoser]
public class ReverseTests
{
    /// <summary>Input buffer: 16 random bytes generated once per instance.</summary>
    public byte[] Data;

    // Shuffle control vector holding the byte indices 15..0; read only by Ssse3Test.
    private readonly Vector128<byte> _reverseIndices;

    /// <summary>Creates the random input buffer and the shuffle control vector.</summary>
    public ReverseTests()
    {
        Data = RandomNumberGenerator.GetBytes(16);
        _reverseIndices = Vector128.Create((byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    }

    /// <summary>
    /// Copies the buffer into a new array and reverses that array in place through a span.
    /// </summary>
    /// <returns>A new 16-byte array containing <see cref="Data"/> in reverse order.</returns>
    [Benchmark]
    public byte[] SpanTest()
    {
        byte[] reversed = new byte[16];
        Data.AsSpan().CopyTo(reversed);
        Span<byte> view = reversed.AsSpan();
        view.Reverse();
        return reversed;
    }

    /// <summary>
    /// Reverses the buffer with <c>Vector128.Shuffle</c>, passing the index vector as a
    /// constant built inside the method rather than reading it from a field.
    /// </summary>
    /// <returns>A new 16-byte array containing <see cref="Data"/> in reverse order.</returns>
    [Benchmark]
    public byte[] VectorTest()
    {
        byte[] reversed = new byte[16];
        Vector128<byte> source = Vector128.LoadUnsafe(ref Data[0]);
        Vector128<byte> shuffled = Vector128.Shuffle(
            source,
            Vector128.Create((byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
        shuffled.StoreUnsafe(ref reversed[0]);
        return reversed;
    }

    /// <summary>
    /// Reverses the buffer with the x86-specific <c>Ssse3.Shuffle</c> intrinsic,
    /// using the index vector stored in the instance field.
    /// </summary>
    /// <returns>A new 16-byte array containing <see cref="Data"/> in reverse order.</returns>
    [Benchmark]
    public byte[] Ssse3Test()
    {
        byte[] reversed = new byte[16];
        Vector128<byte> source = Vector128.LoadUnsafe(ref Data[0]);
        Vector128<byte> shuffled = Ssse3.Shuffle(source, _reverseIndices);
        shuffled.StoreUnsafe(ref reversed[0]);
        return reversed;
    }

    /// <summary>
    /// Reverses the buffer as two 64-bit halves: each half is byte-swapped with
    /// <c>BinaryPrimitives.ReverseEndianness</c> and written to the opposite half of the output.
    /// </summary>
    /// <returns>A new 16-byte array containing <see cref="Data"/> in reverse order.</returns>
    [Benchmark]
    public byte[] BinaryPrimitivesTest()
    {
        byte[] reversed = new byte[16];
        long firstEight = Unsafe.ReadUnaligned<long>(ref Data[0]);
        long lastEight = Unsafe.ReadUnaligned<long>(ref Data[8]);

        // The halves trade places: the byte-swapped tail becomes the new head and vice versa.
        Unsafe.WriteUnaligned(ref reversed[0], BinaryPrimitives.ReverseEndianness(lastEight));
        Unsafe.WriteUnaligned(ref reversed[8], BinaryPrimitives.ReverseEndianness(firstEight));
        return reversed;
    }
}
windows-sapphirelake
BenchmarkDotNet v0.14.0, Windows 10 (10.0.20348.2762)
Intel Xeon Platinum 8488C, 1 CPU, 16 logical and 8 physical cores
DefaultJob : .NET 9.0.0 (9.0.24.47305), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
Method | Mean | Error | Gen0 | Allocated |
---|---|---|---|---|
VectorTest | 17.654 ns | 0.0465 ns | 0.0007 | 40 B |
Ssse3Test | 4.919 ns | 0.1256 ns | 0.0007 | 40 B |
BinaryPrimitivesTest | 5.017 ns | 0.1172 ns | 0.0007 | 40 B |
SpanTest | 15.365 ns | 0.0229 ns | 0.0007 | 40 B |
linux-genoa
BenchmarkDotNet v0.14.0, Ubuntu 24.04 LTS (Noble Numbat)
AMD EPYC 9R14, 1 CPU, 4 logical and 4 physical cores
DefaultJob : .NET 9.0.0 (9.0.24.47305), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
Method | Mean | Error | Gen0 | Allocated |
---|---|---|---|---|
SpanTest | 9.049 ns | 0.1481 ns | 0.0048 | 40 B |
VectorTest | 5.948 ns | 0.0357 ns | 0.0048 | 40 B |
Ssse3Test | 6.385 ns | 0.0658 ns | 0.0048 | 40 B |
BinaryPrimitivesTest | 6.069 ns | 0.1397 ns | 0.0048 | 40 B |
windows-sapphirelake
BenchmarkDotNet v0.14.0, Windows 10 (10.0.20348.2762)
Intel Xeon Platinum 8488C, 1 CPU, 16 logical and 8 physical cores
DefaultJob : .NET 9.0.0 (9.0.24.47305), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
Method | Mean | Error | Gen0 | Allocated |
---|---|---|---|---|
SpanTest | 8.378 ns | 0.2397 ns | 0.0007 | 40 B |
VectorTest | 5.531 ns | 0.1856 ns | 0.0007 | 40 B |
Ssse3Test | 5.452 ns | 0.1884 ns | 0.0007 | 40 B |
BinaryPrimitivesTest | 5.602 ns | 0.1867 ns | 0.0007 | 40 B |
@EgorBot -linux_aws_genoa -windows_aws_sapphirelake
using System;
using System.Buffers.Binary;
using System.Collections.Generic;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using System.Security.Cryptography;
using System.Text;
using BenchmarkDotNet.Attributes;
namespace GuidWars;
/// <summary>
/// Benchmarks three ways of building a byte-reversed copy of the 32-byte <see cref="Data"/> buffer.
/// Every benchmark returns a freshly allocated 32-byte array.
/// </summary>
public class Reverse256Tests
{
    /// <summary>Input buffer: 32 random bytes generated once per instance.</summary>
    public byte[] Data = RandomNumberGenerator.GetBytes(32);

    // Per-lane reversal indices (15..0 repeated for each 128-bit lane), held in a static
    // readonly field for the "cached mask" benchmark.
    private static readonly Vector256<byte> s_perLaneReverseIndices = Vector256.Create(
        (byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,  // first 128-bit lane
        15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);       // second 128-bit lane

    /// <summary>
    /// Reverses the bytes inside each 128-bit lane with <c>Avx2.Shuffle</c>, then swaps the
    /// two lanes with <c>Avx2.Permute2x128</c>. The index vector is built inside the method.
    /// </summary>
    /// <returns>A new 32-byte array containing <see cref="Data"/> in reverse order.</returns>
    [Benchmark]
    public byte[] Avx2Reverse()
    {
        byte[] reversed = new byte[32];
        Vector256<byte> perLaneIndices = Vector256.Create(
            (byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,  // first 128-bit lane
            15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);       // second 128-bit lane

        Vector256<byte> source = Vector256.LoadUnsafe(ref Data[0]);
        Vector256<byte> lanesReversed = Avx2.Shuffle(source, perLaneIndices);

        // Control 0b00_01 exchanges the low and high 128-bit lanes of the same vector.
        Vector256<byte> swapped = Avx2.Permute2x128(lanesReversed, lanesReversed, 0b00_01);
        swapped.StoreUnsafe(ref reversed[0]);
        return reversed;
    }

    /// <summary>
    /// Same shuffle-then-swap sequence as <see cref="Avx2Reverse"/>, but the index vector is
    /// read from a static readonly field instead of being built in the method.
    /// </summary>
    /// <returns>A new 32-byte array containing <see cref="Data"/> in reverse order.</returns>
    [Benchmark]
    public byte[] Avx2ReverseCachedMask()
    {
        byte[] reversed = new byte[32];
        Vector256<byte> source = Vector256.LoadUnsafe(ref Data[0]);
        Vector256<byte> lanesReversed = Avx2.Shuffle(source, s_perLaneReverseIndices);

        // Control 0b00_01 exchanges the low and high 128-bit lanes of the same vector.
        Vector256<byte> swapped = Avx2.Permute2x128(lanesReversed, lanesReversed, 0b00_01);
        swapped.StoreUnsafe(ref reversed[0]);
        return reversed;
    }

    /// <summary>
    /// Reverses the buffer as two independent 128-bit halves: each half is reversed with
    /// <c>Vector128.Shuffle</c> and stored into the opposite half of the output.
    /// </summary>
    /// <remarks>
    /// Despite the method name, the code calls the cross-platform <c>Vector128.Shuffle</c>
    /// helper rather than <c>Ssse3.Shuffle</c> directly.
    /// </remarks>
    /// <returns>A new 32-byte array containing <see cref="Data"/> in reverse order.</returns>
    [Benchmark]
    public byte[] Ssse3x2Reverse()
    {
        byte[] reversed = new byte[32];
        Vector128<byte> lowerHalf = Vector128.LoadUnsafe(ref Data[0]);
        Vector128<byte> upperHalf = Vector128.LoadUnsafe(ref Data[0], 16);

        lowerHalf = Vector128.Shuffle(
            lowerHalf,
            Vector128.Create((byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
        upperHalf = Vector128.Shuffle(
            upperHalf,
            Vector128.Create((byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));

        // The halves trade places: the reversed lower half lands at byte offset 16.
        lowerHalf.StoreUnsafe(ref reversed[0], 16);
        upperHalf.StoreUnsafe(ref reversed[0]);
        return reversed;
    }
}
linux-genoa
BenchmarkDotNet v0.14.0, Ubuntu 24.04 LTS (Noble Numbat)
AMD EPYC 9R14, 1 CPU, 4 logical and 4 physical cores
DefaultJob : .NET 9.0.0 (9.0.24.47305), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
Method | Mean | Error |
---|---|---|
Avx2Reverse | 7.259 ns | 0.0375 ns |
Avx2ReverseCachedMask | 6.482 ns | 0.0338 ns |
Ssse3x2Reverse | 6.738 ns | 0.0280 ns |
windows-sapphirelake
BenchmarkDotNet v0.14.0, Windows 10 (10.0.20348.2762)
Intel Xeon Platinum 8488C, 1 CPU, 16 logical and 8 physical cores
DefaultJob : .NET 9.0.0 (9.0.24.47305), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
Method | Mean | Error |
---|---|---|
Avx2Reverse | 7.619 ns | 0.0484 ns |
Avx2ReverseCachedMask | 7.614 ns | 0.0630 ns |
Ssse3x2Reverse | 7.663 ns | 0.0740 ns |
@EgorBot -linux_aws_genoa
using System;
using System.Buffers.Binary;
using System.Collections.Generic;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using System.Security.Cryptography;
using System.Text;
using BenchmarkDotNet.Attributes;
namespace GuidWars;
/// <summary>
/// Benchmarks five ways of building a byte-reversed copy of the 32-byte <see cref="Data"/> buffer,
/// varying both the shuffle API used and whether the index vector is inline or read from a static field.
/// Every benchmark returns a freshly allocated 32-byte array.
/// </summary>
public class Reverse256Tests
{
    /// <summary>Input buffer: 32 random bytes generated once per instance.</summary>
    public byte[] Data = RandomNumberGenerator.GetBytes(32);

    // Full-width reversal indices 31..0, used with Vector256.Shuffle.
    private static readonly Vector256<byte> s_fullReverseIndices = Vector256.Create(
        (byte)31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
        15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);

    // Per-lane reversal indices (15..0 repeated for each 128-bit lane), used with Avx2.Shuffle.
    private static readonly Vector256<byte> s_perLaneReverseIndices = Vector256.Create(
        (byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,  // first 128-bit lane
        15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);       // second 128-bit lane

    /// <summary>
    /// Reverses the bytes inside each 128-bit lane with <c>Avx2.Shuffle</c>, then swaps the
    /// two lanes with <c>Avx2.Permute2x128</c>. The index vector is built inside the method.
    /// </summary>
    /// <returns>A new 32-byte array containing <see cref="Data"/> in reverse order.</returns>
    [Benchmark]
    public byte[] Avx2ReversePermute()
    {
        byte[] reversed = new byte[32];
        Vector256<byte> perLaneIndices = Vector256.Create(
            (byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,  // first 128-bit lane
            15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);       // second 128-bit lane

        Vector256<byte> source = Vector256.LoadUnsafe(ref Data[0]);
        Vector256<byte> lanesReversed = Avx2.Shuffle(source, perLaneIndices);

        // Control 0b00_01 exchanges the low and high 128-bit lanes of the same vector.
        Vector256<byte> swapped = Avx2.Permute2x128(lanesReversed, lanesReversed, 0b00_01);
        swapped.StoreUnsafe(ref reversed[0]);
        return reversed;
    }

    /// <summary>
    /// Reverses all 32 bytes with a single <c>Vector256.Shuffle</c> call whose index vector
    /// (31..0) is built inside the method.
    /// </summary>
    /// <returns>A new 32-byte array containing <see cref="Data"/> in reverse order.</returns>
    [Benchmark]
    public byte[] Avx2ReverseInlinedMask()
    {
        byte[] reversed = new byte[32];
        Vector256<byte> source = Vector256.LoadUnsafe(ref Data[0]);

        // No separate lane swap follows: the indices address the whole 32-byte vector.
        Vector256<byte> shuffled = Vector256.Shuffle(source, Vector256.Create(
            (byte)31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
            15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
        shuffled.StoreUnsafe(ref reversed[0]);
        return reversed;
    }

    /// <summary>
    /// Reverses all 32 bytes with a single <c>Vector256.Shuffle</c> call whose index vector
    /// (31..0) is read from a static readonly field.
    /// </summary>
    /// <returns>A new 32-byte array containing <see cref="Data"/> in reverse order.</returns>
    [Benchmark]
    public byte[] Avx2ReverseCachedMask()
    {
        byte[] reversed = new byte[32];
        Vector256<byte> source = Vector256.LoadUnsafe(ref Data[0]);
        Vector256<byte> shuffled = Vector256.Shuffle(source, s_fullReverseIndices);
        shuffled.StoreUnsafe(ref reversed[0]);
        return reversed;
    }

    /// <summary>
    /// Same shuffle-then-swap sequence as <see cref="Avx2ReversePermute"/>, but the per-lane
    /// index vector is read from a static readonly field.
    /// </summary>
    /// <returns>A new 32-byte array containing <see cref="Data"/> in reverse order.</returns>
    [Benchmark]
    public byte[] Avx2ReversePermuteCachedMask()
    {
        byte[] reversed = new byte[32];
        Vector256<byte> source = Vector256.LoadUnsafe(ref Data[0]);
        Vector256<byte> lanesReversed = Avx2.Shuffle(source, s_perLaneReverseIndices);

        // Control 0b00_01 exchanges the low and high 128-bit lanes of the same vector.
        Vector256<byte> swapped = Avx2.Permute2x128(lanesReversed, lanesReversed, 0b00_01);
        swapped.StoreUnsafe(ref reversed[0]);
        return reversed;
    }

    /// <summary>
    /// Reverses the buffer as two independent 128-bit halves: each half is reversed with
    /// <c>Vector128.Shuffle</c> and stored into the opposite half of the output.
    /// </summary>
    /// <remarks>
    /// Despite the method name, the code calls the cross-platform <c>Vector128.Shuffle</c>
    /// helper rather than <c>Ssse3.Shuffle</c> directly.
    /// </remarks>
    /// <returns>A new 32-byte array containing <see cref="Data"/> in reverse order.</returns>
    [Benchmark]
    public byte[] Ssse3x2Reverse()
    {
        byte[] reversed = new byte[32];
        Vector128<byte> lowerHalf = Vector128.LoadUnsafe(ref Data[0]);
        Vector128<byte> upperHalf = Vector128.LoadUnsafe(ref Data[0], 16);

        lowerHalf = Vector128.Shuffle(
            lowerHalf,
            Vector128.Create((byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
        upperHalf = Vector128.Shuffle(
            upperHalf,
            Vector128.Create((byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));

        // The halves trade places: the reversed lower half lands at byte offset 16.
        lowerHalf.StoreUnsafe(ref reversed[0], 16);
        upperHalf.StoreUnsafe(ref reversed[0]);
        return reversed;
    }
}
linux-genoa
BenchmarkDotNet v0.14.0, Ubuntu 24.04 LTS (Noble Numbat)
AMD EPYC 9R14, 1 CPU, 4 logical and 4 physical cores
DefaultJob : .NET 9.0.0 (9.0.24.47305), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
Method | Mean | Error |
---|---|---|
Avx2ReversePermute | 6.263 ns | 0.0505 ns |
Avx2ReverseInlinedMask | 6.425 ns | 0.0061 ns |
Avx2ReverseCachedMask | 6.682 ns | 0.0648 ns |
Avx2ReversePermuteCachedMask | 6.628 ns | 0.0917 ns |
Ssse3x2Reverse | 6.410 ns | 0.0590 ns |
@EgorBot -linux_aws_genoa
using System;
using System.Buffers.Binary;
using System.Collections.Generic;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using System.Security.Cryptography;
using System.Text;
using BenchmarkDotNet.Attributes;
namespace GuidWars;
/// <summary>
/// Benchmarks two ways of building a byte-reversed copy of the 64-byte <see cref="Data"/> buffer:
/// one 512-bit shuffle versus two 256-bit shuffle-and-swap passes.
/// Both benchmarks return a freshly allocated 64-byte array.
/// </summary>
public class Reverse512Tests
{
    /// <summary>Input buffer: 64 random bytes generated once per instance.</summary>
    public byte[] Data = RandomNumberGenerator.GetBytes(64);

    /// <summary>
    /// Reverses all 64 bytes with a single <c>Vector512.Shuffle</c> call whose index vector
    /// (63..0) is built inside the method.
    /// </summary>
    /// <returns>A new 64-byte array containing <see cref="Data"/> in reverse order.</returns>
    [Benchmark]
    public byte[] SingleAvx512()
    {
        byte[] reversed = new byte[64];
        Vector512<byte> source = Vector512.LoadUnsafe(ref Data[0]);
        Vector512<byte> shuffled = Vector512.Shuffle(source, Vector512.Create(
            (byte)63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48,
            47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32,
            31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
            15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
        shuffled.StoreUnsafe(ref reversed[0]);
        return reversed;
    }

    /// <summary>
    /// Reverses the buffer as two 256-bit halves. Each half is reversed per 128-bit lane with
    /// <c>Avx2.Shuffle</c>, has its lanes swapped with <c>Avx2.Permute2x128</c>, and is then
    /// stored into the opposite half of the output.
    /// </summary>
    /// <returns>A new 64-byte array containing <see cref="Data"/> in reverse order.</returns>
    [Benchmark]
    public byte[] TwoAvx2() // dotnet solution
    {
        byte[] reversed = new byte[64];
        Vector256<byte> perLaneIndices = Vector256.Create(
            (byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,  // first 128-bit lane
            15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);       // second 128-bit lane

        Vector256<byte> lowerHalf = Vector256.LoadUnsafe(ref Data[0]);
        Vector256<byte> upperHalf = Vector256.LoadUnsafe(ref Data[32]);

        // Reverse within each 128-bit lane, then exchange the lanes (control 0b00_01).
        lowerHalf = Avx2.Shuffle(lowerHalf, perLaneIndices);
        lowerHalf = Avx2.Permute2x128(lowerHalf, lowerHalf, 0b00_01);
        upperHalf = Avx2.Shuffle(upperHalf, perLaneIndices);
        upperHalf = Avx2.Permute2x128(upperHalf, upperHalf, 0b00_01);

        // The halves trade places in the output.
        upperHalf.StoreUnsafe(ref reversed[0]);
        lowerHalf.StoreUnsafe(ref reversed[32]);
        return reversed;
    }
}
linux-genoa
BenchmarkDotNet v0.14.0, Ubuntu 24.04 LTS (Noble Numbat)
AMD EPYC 9R14, 1 CPU, 4 logical and 4 physical cores
DefaultJob : .NET 9.0.0 (9.0.24.47305), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
Method | Mean | Error |
---|---|---|
SingleAvx512 | 7.468 ns | 0.0563 ns |
TwoAvx2 | 7.165 ns | 0.0832 ns |
@EgorBot -linux_aws_genoa
using System;
using BenchmarkDotNet.Attributes;
/// <summary>
/// Benchmarks "add to a running total, then scale by 2^20" for 32-bit and 64-bit integers,
/// comparing two successive divisions by 1024 against a right shift by 20.
/// </summary>
public class Bench
{
    // Running totals mutated by the benchmarks; the 32-bit one may wrap around on overflow.
    private int _total32;
    private long _total64;

    // Per-call increment: the same random value, stored once as int and once as long.
    private int _addition32;
    private long _addition64;

    /// <summary>
    /// Picks one random increment from <c>Random.Shared.Next()</c> and stores it in both widths.
    /// </summary>
    public Bench()
    {
        int increment = Random.Shared.Next();
        _addition32 = increment;
        _addition64 = increment;
    }

    /// <summary>Adds the 32-bit increment to the 32-bit total and divides the increment by 1024 twice.</summary>
    /// <returns>The 32-bit increment divided by 1024 and again by 1024.</returns>
    [Benchmark]
    public int AddProgress32()
    {
        _total32 += _addition32;
        return _addition32 / 1024 / 1024;
    }

    /// <summary>Adds the 64-bit increment to the 64-bit total and divides the increment by 1024 twice.</summary>
    /// <returns>The 64-bit increment divided by 1024 and again by 1024.</returns>
    [Benchmark]
    public long AddProgress64()
    {
        _total64 += _addition64;
        return _addition64 / 1024 / 1024;
    }

    /// <summary>Adds the 32-bit increment to the 32-bit total and shifts the increment right by 20 bits.</summary>
    /// <returns>The 32-bit increment shifted right by 20 bits.</returns>
    [Benchmark]
    public int AddProgressShift32()
    {
        _total32 += _addition32;
        return _addition32 >> 20;
    }

    /// <summary>Adds the 64-bit increment to the 64-bit total and shifts the increment right by 20 bits.</summary>
    /// <returns>The 64-bit increment shifted right by 20 bits.</returns>
    [Benchmark]
    public long AddProgressShift64()
    {
        _total64 += _addition64;
        return _addition64 >> 20;
    }
}
linux-genoa
BenchmarkDotNet v0.14.0, Ubuntu 24.04 LTS (Noble Numbat)
AMD EPYC 9R14, 1 CPU, 4 logical and 4 physical cores
DefaultJob : .NET 9.0.0 (9.0.24.52809), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
Method | Mean | Error |
---|---|---|
AddProgress32 | 0.0000 ns | 0.0000 ns |
AddProgress64 | 0.2727 ns | 0.0001 ns |
AddProgressShift32 | 0.0000 ns | 0.0000 ns |
AddProgressShift64 | 0.0102 ns | 0.0092 ns |
cc @kzorin52 (agent_logs.txt)
@EgorBot -linux_aws_genoa -windows_aws_sapphirelake