I accidentaly found performance reggresion using"mandelbrot algorithm"

milen-denev commented 1 year ago

The place from where I copied the code: https://benchmarksgame-team.pages.debian.net/benchmarksgame/program/mandelbrot-csharpcore-1.html

Actual Code:

using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Jobs;
using BenchmarkDotNet.Running;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics.X86;
using System.Runtime.Intrinsics;

namespace Test;

public static class Program
{
    public static void Main() 
    {
        BenchmarkRunner.Run<Benchy>();
    }

    [MemoryDiagnoser(true)]
    [SimpleJob(RuntimeMoniker.Net70)]
    [SimpleJob(RuntimeMoniker.Net60, baseline: true)]
    public class Benchy
    {
        [Benchmark]
        public void Benchmark1()
        {
            MandelBrot.Default();
        }
    }

    public class MandelBrot
    {
        // x86 version, AVX2
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        static byte Process8(double x, double y, double dx)
        {
            // initial x coords
            var x01 = Vector256.Create(x + 0 * dx, x + 1 * dx, x + 2 * dx, x + 3 * dx);
            var x02 = Vector256.Create(x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx);

            // initial y coords
            var y0 = Vector256.Create(y);

            Vector256<double> x1 = x01, y1 = y0; // current iteration 1
            Vector256<double> x2 = x02, y2 = y0; // current iteration 2

            Vector256<double> four = Vector256.Create(4.0); // 4 in each slot        

            var pass = 0;

            // temp space, C# requires init.
            Vector256<double>
                x12 = Vector256<double>.Zero,
                y12 = Vector256<double>.Zero,
                x22 = Vector256<double>.Zero,
                y22 = Vector256<double>.Zero;

            // bit masks for results
            uint res1 = 1, res2 = 1;

            while (pass < 49 && (res1 != 0 || res2 != 0))
            {

                // do several between checks a time like other code
                for (var p = 0; p < 7; ++p)
                {
                    // unroll loop 2x to decrease register stalls

                    // squares x*x and y*y
                    x12 = Avx2.Multiply(x1, x1);
                    y12 = Avx2.Multiply(y1, y1);
                    x22 = Avx2.Multiply(x2, x2);
                    y22 = Avx2.Multiply(y2, y2);

                    // mixed products x*y
                    var xy1 = Avx2.Multiply(x1, y1);
                    var xy2 = Avx2.Multiply(x2, y2);

                    // diff of squares x*x - y*y
                    var ds1 = Avx2.Subtract(x12, y12);
                    var ds2 = Avx2.Subtract(x22, y22);

                    // 2*x*y
                    xy1 = Avx2.Add(xy1, xy1);
                    xy2 = Avx2.Add(xy2, xy2);

                    // next iters
                    y1 = Avx2.Add(xy1, y0);
                    y2 = Avx2.Add(xy2, y0);
                    x1 = Avx2.Add(ds1, x01);
                    x2 = Avx2.Add(ds2, x02);
                }
                pass += 7;

                // numbers overflow, which gives an Infinity or NaN, which, 
                // when compared N < 4, results in false, which is what we want

                // sum of squares x*x + y*y, compare to 4 (escape mandelbrot)
                var ss1 = Avx2.Add(x12, y12);
                var ss2 = Avx2.Add(x22, y22);

                // compare - puts all 0 in reg if false, else all 1 (=NaN bitwise)
                // when each register is 0, then all points escaped, so exit
                var cmp1 = Avx.Compare(ss1, four,
                        FloatComparisonMode.OrderedLessThanOrEqualNonSignaling);
                var cmp2 = Avx.Compare(ss2, four,
                        FloatComparisonMode.OrderedLessThanOrEqualNonSignaling);

                // take top bit from each byte
                res1 = (uint)Avx2.MoveMask(Vector256.AsByte(cmp1));
                res2 = (uint)Avx2.MoveMask(Vector256.AsByte(cmp2));
            }

            // can make a mask of bits in any order, which is the +7, +6, .., +1, +0
            res1 &=
                (1 << (0 + 7)) |
                (1 << (8 + 6)) |
                (1 << (16 + 5)) |
                (1 << (24 + 4));
            res2 &=
                (1 << (0 + 3)) |
                (1 << (8 + 2)) |
                (1 << (16 + 1)) |
                (1 << (24 + 0));

            var res = res1 | res2;
            res |= res >> 16;
            res |= res >> 8;
            return (byte)(res);
        }

        public static void MainNew()
        {

            var size = 200;
            var lineLength = size >> 3;
            var data = new byte[size * lineLength];

            // step size
            var delta = 2.0 / size; // (0.5 - (-1.5))/size;

            Parallel.For(0, size, y =>
            {
                var yd = y * delta - 1;
                for (var x = 0; x < lineLength; x++)
                {
                    var xd = (x * 8) * delta - 1.5;
                    data[y * lineLength + x] = Process8(xd, yd, delta);
                }
            });
        }

        public static void Default()
        {
            MainNew();
        }
    }
}

Configuration

BenchmarkDotNet=v0.13.4, OS=Windows 11 (10.0.22621.1105) Intel Core i5-10400 CPU 2.90GHz, 1 CPU, 12 logical and 6 physical cores .NET SDK=7.0.102 [Host] : .NET 7.0.2 (7.0.222.60605), X64 RyuJIT AVX2 .NET 6.0 : .NET 6.0.13 (6.0.1322.58009), X64 RyuJIT AVX2 .NET 7.0 : .NET 7.0.2 (7.0.222.60605), X64 RyuJIT AVX2

   <TargetFrameworks>net6.0;net7.0;</TargetFrameworks>
   <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
   <PublishAot>false</PublishAot>
   <ServerGarbageCollection>true</ServerGarbageCollection>
   <TieredPGO>true</TieredPGO>
   <TieredCompilationQuickJitForLoops>true</TieredCompilationQuickJitForLoops>
   <Platforms>AnyCPU;x64</Platforms>

Regression?

The Regression is tested from .NET 6 -> .NET 7

Data

Method	Job	Runtime	Mean	Error	StdDev	Ratio	RatioSD	Gen0	Allocated	Alloc Ratio
Benchmark1	.NET 6.0	.NET 6.0	81.72 us	1.496 us	1.400 us	1.00	0.00	0.1221	8.69 KB	1.00
Benchmark1	.NET 7.0	.NET 7.0	101.81 us	0.699 us	0.654 us	1.25	0.02	-	8.53 KB	0.98

dotnet-issue-labeler[bot] commented 1 year ago

I couldn't figure out the best area label to add to this issue. If you have write-permissions please help me learn by adding exactly one area label.

ghost commented 1 year ago

Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch, @kunalspathak See info in area-owners.md if you want to be subscribed.

Issue Details

The place from where I copied the code: `https://benchmarksgame-team.pages.debian.net/benchmarksgame/program/mandelbrot-csharpcore-1.html` Actual Code: ``` using BenchmarkDotNet.Attributes; using BenchmarkDotNet.Jobs; using BenchmarkDotNet.Running; using System.Runtime.CompilerServices; using System.Runtime.Intrinsics.X86; using System.Runtime.Intrinsics; namespace Test; public static class Program { public static void Main() { BenchmarkRunner.Run(); } [MemoryDiagnoser(true)] [SimpleJob(RuntimeMoniker.Net70)] [SimpleJob(RuntimeMoniker.Net60, baseline: true)] public class Benchy { [Benchmark] public void Benchmark1() { MandelBrot.Default(); } } public class MandelBrot { // x86 version, AVX2 [MethodImpl(MethodImplOptions.AggressiveInlining)] static byte Process8(double x, double y, double dx) { // initial x coords var x01 = Vector256.Create(x + 0 * dx, x + 1 * dx, x + 2 * dx, x + 3 * dx); var x02 = Vector256.Create(x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx); // initial y coords var y0 = Vector256.Create(y); Vector256 x1 = x01, y1 = y0; // current iteration 1 Vector256 x2 = x02, y2 = y0; // current iteration 2 Vector256 four = Vector256.Create(4.0); // 4 in each slot var pass = 0; // temp space, C# requires init. Vector256 x12 = Vector256.Zero, y12 = Vector256.Zero, x22 = Vector256.Zero, y22 = Vector256.Zero; // bit masks for results uint res1 = 1, res2 = 1; while (pass < 49 && (res1 != 0 || res2 != 0)) { // do several between checks a time like other code for (var p = 0; p < 7; ++p) { // unroll loop 2x to decrease register stalls // squares x*x and y*y x12 = Avx2.Multiply(x1, x1); y12 = Avx2.Multiply(y1, y1); x22 = Avx2.Multiply(x2, x2); y22 = Avx2.Multiply(y2, y2); // mixed products x*y var xy1 = Avx2.Multiply(x1, y1); var xy2 = Avx2.Multiply(x2, y2); // diff of squares x*x - y*y var ds1 = Avx2.Subtract(x12, y12); var ds2 = Avx2.Subtract(x22, y22); // 2*x*y xy1 = Avx2.Add(xy1, xy1); xy2 = Avx2.Add(xy2, xy2); // next iters y1 = Avx2.Add(xy1, y0); y2 = Avx2.Add(xy2, y0); x1 = Avx2.Add(ds1, x01); x2 = Avx2.Add(ds2, x02); } pass += 7; // numbers overflow, which gives an Infinity or NaN, which, // when compared N < 4, results in false, which is what we want // sum of squares x*x + y*y, compare to 4 (escape mandelbrot) var ss1 = Avx2.Add(x12, y12); var ss2 = Avx2.Add(x22, y22); // compare - puts all 0 in reg if false, else all 1 (=NaN bitwise) // when each register is 0, then all points escaped, so exit var cmp1 = Avx.Compare(ss1, four, FloatComparisonMode.OrderedLessThanOrEqualNonSignaling); var cmp2 = Avx.Compare(ss2, four, FloatComparisonMode.OrderedLessThanOrEqualNonSignaling); // take top bit from each byte res1 = (uint)Avx2.MoveMask(Vector256.AsByte(cmp1)); res2 = (uint)Avx2.MoveMask(Vector256.AsByte(cmp2)); } // can make a mask of bits in any order, which is the +7, +6, .., +1, +0 res1 &= (1 << (0 + 7)) | (1 << (8 + 6)) | (1 << (16 + 5)) | (1 << (24 + 4)); res2 &= (1 << (0 + 3)) | (1 << (8 + 2)) | (1 << (16 + 1)) | (1 << (24 + 0)); var res = res1 | res2; res |= res >> 16; res |= res >> 8; return (byte)(res); } public static void MainNew() { var size = 200; var lineLength = size >> 3; var data = new byte[size * lineLength]; // step size var delta = 2.0 / size; // (0.5 - (-1.5))/size; Parallel.For(0, size, y => { var yd = y * delta - 1; for (var x = 0; x < lineLength; x++) { var xd = (x * 8) * delta - 1.5; data[y * lineLength + x] = Process8(xd, yd, delta); } }); } public static void Default() { MainNew(); } } } ``` ### Configuration BenchmarkDotNet=v0.13.4, OS=Windows 11 (10.0.22621.1105) Intel Core i5-10400 CPU 2.90GHz, 1 CPU, 12 logical and 6 physical cores .NET SDK=7.0.102 [Host] : .NET 7.0.2 (7.0.222.60605), X64 RyuJIT AVX2 .NET 6.0 : .NET 6.0.13 (6.0.1322.58009), X64 RyuJIT AVX2 .NET 7.0 : .NET 7.0.2 (7.0.222.60605), X64 RyuJIT AVX2 ``` net6.0;net7.0; true false true true true AnyCPU;x64 ``` ### Regression? The Regression is tested from .NET 6 -> .NET 7 ### Data | Method | Job | Runtime | Mean | Error | StdDev | Ratio | RatioSD | Gen0 | Allocated | Alloc Ratio | |----------- |--------- |--------- |----------:|---------:|---------:|------:|--------:|-------:|----------:|------------:| | Benchmark1 | .NET 6.0 | .NET 6.0 | 81.72 us | 1.496 us | 1.400 us | 1.00 | 0.00 | 0.1221 | 8.69 KB | 1.00 | | Benchmark1 | .NET 7.0 | .NET 7.0 | 101.81 us | 0.699 us | 0.654 us | 1.25 | 0.02 | - | 8.53 KB | 0.98 |

Author:	milen-denev
Assignees:	-
Labels:	`tenet-performance`, `area-CodeGen-coreclr`, `untriaged`
Milestone:	-

AndyAyersMS commented 1 year ago

Probably measuring OSR codegen (note this is with PGO enabled, so perhaps instrumented OSR code?).

MichalPetryka commented 1 year ago

Could you rerun the benchmark with [DisassemblyDiagnoser(maxDepth = 6)] and post the generated files here?

JulieLeeMSFT commented 1 year ago

Assigning back to @milen-denev for follow up test request.

Could you rerun the benchmark with [DisassemblyDiagnoser(maxDepth = 6)] and post the generated files here?

milen-denev commented 1 year ago

So, I am sorry for my late reply. I tested again with [DisassemblyDiagnoser(maxDepth: 6)] and actually only the second time I runned the benchmark and got a result where .net7.0 was slower:

Results (Same configuration, nothing changed):

Method	Job	Runtime	Mean	Error	StdDev	Ratio	Code Size	Allocated	Alloc Ratio
Benchmark1	.NET 6.0	.NET 6.0	125.2 us	0.73 us	0.69 us	1.00	9,680 B	9.21 KB	1.00
Benchmark1	.NET 7.0	.NET 7.0	130.5 us	1.12 us	1.05 us	1.04	10,393 B	8.97 KB	0.97

Files: https://cdn.denevcloud.net/milen-denev/bin.zip

milen-denev commented 1 year ago

I also tried:

    <TieredPGO>false</TieredPGO>
    <TieredCompilationQuickJitForLoops>false</TieredCompilationQuickJitForLoops>

Method	Job	Runtime	Mean	Error	StdDev	Ratio	Code Size	Allocated	Alloc Ratio
Benchmark1	.NET 6.0	.NET 6.0	125.1 us	0.73 us	0.68 us	1.00	9,680 B	9.22 KB	1.00
Benchmark1	.NET 7.0	.NET 7.0	127.4 us	0.28 us	0.27 us	1.02	10,393 B	8.99 KB	0.97

Files: https://cdn.denevcloud.net/milen-denev/bin2.zip

AndyAyersMS commented 1 year ago

Tried this on my box with .NET 6, .NET 7, .NET 8 preview2, and a current .NET 8 main build. There seems to be as fair bit of run to run variation here but in the handful of runs I did, .NET 8 main always was fastest.

Possibly a result of https://github.com/dotnet/runtime/pull/83910, though I have not done proper diligence to prove that's the case.

I'm going to close this. @milen-denev if you get a chance to try a .NET 8 preview and still see a regression then please re-open. Please try preview 4 later so you pick up the change above.

Default

BenchmarkDotNet=v0.13.5, OS=Windows 11 (10.0.22621.1413/22H2/2022Update/SunValley2) Intel Core i7-8700 CPU 3.20GHz (Coffee Lake), 1 CPU, 12 logical and 6 physical cores .NET SDK=8.0.100-preview.2.23157.25 [Host] : .NET 8.0.0 (8.0.23.12803), X64 RyuJIT AVX2 Job-XFQKPO : .NET 8.0.0 (42.42.42.42424), X64 RyuJIT AVX2 Job-OPADBB : .NET 6.0.15 (6.0.1523.11507), X64 RyuJIT AVX2 Job-RVDABM : .NET 7.0.4 (7.0.423.11508), X64 RyuJIT AVX2 Job-YLZPTG : .NET 8.0.0 (8.0.23.12803), X64 RyuJIT AVX2

Method	Runtime	Mean	Error	StdDev	Ratio	RatioSD
Benchmark1	.NET 6.0	100.94 us	1.918 us	1.884 us	1.00	0.00
Benchmark1	.NET 7.0	91.47 us	0.445 us	0.417 us	0.91	0.02
Benchmark1	.NET 8.0 p2	93.01 us	0.403 us	0.377 us	0.92	0.02
Benchmark1	.NET 8.0 main	82.95 us	0.646 us	0.604 us	0.82	0.02

TieredPGO

Via <TieredPGO>true</TieredPGO> in the csproj. (Note this has no effect on .NET 6, and only partial effect on .NET 7)

BenchmarkDotNet=v0.13.5, OS=Windows 11 (10.0.22621.1413/22H2/2022Update/SunValley2) Intel Core i7-8700 CPU 3.20GHz (Coffee Lake), 1 CPU, 12 logical and 6 physical cores .NET SDK=8.0.100-preview.2.23157.25 [Host] : .NET 8.0.0 (8.0.23.12803), X64 RyuJIT AVX2 Job-XKOTDN : .NET 8.0.0 (42.42.42.42424), X64 RyuJIT AVX2 Job-HITSWH : .NET 6.0.15 (6.0.1523.11507), X64 RyuJIT AVX2 Job-TYYPKW : .NET 7.0.4 (7.0.423.11508), X64 RyuJIT AVX2 Job-ENCIEP : .NET 8.0.0 (8.0.23.12803), X64 RyuJIT AVX2

Method	Runtime	Mean	Error	StdDev	Ratio	RatioSD
Benchmark1	.NET 6.0	91.21 us	1.824 us	4.740 us	1.00	0.00
Benchmark1	.NET 7.0	96.41 us	0.280 us	0.248 us	1.16	0.08
Benchmark1	.NET 8.0 p2	92.35 us	1.478 us	1.383 us	1.11	0.07
Benchmark1	.NET 8.0 main	82.13 us	0.292 us	0.259 us	0.99	0.07

dotnet / runtime