dotnet / runtime

.NET is a cross-platform runtime for cloud, mobile, desktop, and IoT apps.
https://docs.microsoft.com/dotnet/core/
MIT License
15.46k stars 4.76k forks source link

Using null conditional in if statement comparisons has bad performance #92620

Open kamronbatman opened 1 year ago

kamronbatman commented 1 year ago

Description

I was cleaning up some code, and made a simple change in my repo, something like this:

Before:

            if (!value && _state != null && curTime - _state._nextTime > 0)
            {
                _state._nextTime = curTime;
            }

After:

            if (!value && curTime - _state?._nextTime > 0)
            {
                _state._nextTime = curTime;
            }

I happen to have the lowered C# viewer open, and noticed the new code was not that great. So I benchmarked it to see if it mattered for performance, and it looks like the optional chaining has a significant different in performance. I guess I am wondering if there is a way to add in logic to identify a null check and pull it out, as the developer would in older C# code.

Configuration

Here is the Benchmark.NET example code I came up with. I would love to refine this for the sake of tracking down if this is worth fixing:

using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Jobs;

namespace Benchmarks;

[DisassemblyDiagnoser]
[SimpleJob(RuntimeMoniker.Net70)]
[SimpleJob(RuntimeMoniker.Net80)]
public class BenchmarkNullable
{
    private static SomeObject _someObject = new();
    private static long _someConstant = 500;
    private static long _anotherVariable;
    private static bool _someOtherValue = false;

    [Benchmark]
    public long BenchmarkNullableCode()
    {
        return !_someOtherValue && _someConstant - _someObject?._someValue > 0 ? 10 : 100;
    }

    [Benchmark]
    public long BenchmarkOldCode()
    {
        return !_someOtherValue && _someObject != null && _someConstant - _someObject._someValue > 0 ? 10 : 100;
    }

    internal class SomeObject
    {
        internal long _someValue = 100;
    }
}

Regression?

I am not sure if this is a regression per-se, or a naive unfurling of the syntax sugar and warrants investigation? At the least, the community should be made aware to not "clean" their code in this way I suppose.

Data

Benchmarks for .NET 7

| Method                | Mean      | Error     | StdDev    | Code Size |
|---------------------- |----------:|----------:|----------:|----------:|
| BenchmarkNullableCode | 0.7128 ns | 0.0264 ns | 0.0247 ns |     101 B |
| BenchmarkOldCode      | 0.1106 ns | 0.0100 ns | 0.0094 ns |      58 B |

Disassembly Diagnoser for .NET 7

.NET 7.0.12 (7.0.1223.47720), X64 RyuJIT AVX2

; Benchmarks.BenchmarkNullable.BenchmarkNullableCode()
       cmp       byte ptr [7FFCAED0CEA8],0
       jne       short M00_L04
       mov       rax,[7FFCAED0CE98]
       mov       rdx,28FB9401F38
       mov       rdx,[rdx]
       test      rdx,rdx
       jne       short M00_L00
       xor       ecx,ecx
       xor       r8d,r8d
       jmp       short M00_L01
M00_L00:
       mov       r8,[rdx+8]
       mov       ecx,1
M00_L01:
       test      cl,cl
       jne       short M00_L02
       xor       edx,edx
       xor       ecx,ecx
       jmp       short M00_L03
M00_L02:
       sub       rax,r8
       mov       rcx,rax
       mov       edx,1
M00_L03:
       xor       eax,eax
       test      rcx,rcx
       setg      al
       movzx     edx,dl
       test      edx,eax
       jne       short M00_L05
M00_L04:
       mov       eax,64
       jmp       short M00_L06
M00_L05:
       mov       eax,0A
M00_L06:
       cdqe
       ret
; Total bytes of code 101

.NET 7.0.12 (7.0.1223.47720), X64 RyuJIT AVX2

; Benchmarks.BenchmarkNullable.BenchmarkOldCode()
       cmp       byte ptr [7FFCAECECEA8],0
       jne       short M00_L00
       mov       rax,1E8B3801F38
       mov       rax,[rax]
       test      rax,rax
       je        short M00_L00
       mov       rdx,[7FFCAECECE98]
       sub       rdx,[rax+8]
       test      rdx,rdx
       jg        short M00_L01
M00_L00:
       mov       eax,64
       jmp       short M00_L02
M00_L01:
       mov       eax,0A
M00_L02:
       cdqe
       ret
; Total bytes of code 58

Benchmarks for .NET 8

I ran the same benchmark with .NET 8, and the results look good. So maybe this is a moot point?

| Method                | Mean      | Error     | StdDev    |
|---------------------- |----------:|----------:|----------:|
| BenchmarkNullableCode | 0.1011 ns | 0.0122 ns | 0.0102 ns |
| BenchmarkOldCode      | 0.1090 ns | 0.0143 ns | 0.0127 ns |

Disassembly Diagnoser for .NET 8

.NET 8.0.0 (8.0.23.47906), X64 RyuJIT AVX2

; Benchmarks.BenchmarkNullable.BenchmarkNullableCode()
       cmp       byte ptr [7FFCBA01DCF0],0
       jne       short M00_L04
       mov       rcx,[7FFCBA01DCE0]
       mov       rax,29688001DC8
       mov       rax,[rax]
       test      rax,rax
       je        short M00_L03
       mov       r8,[rax+8]
       mov       edx,1
M00_L00:
       test      dl,dl
       je        short M00_L05
       sub       rcx,r8
       mov       edx,1
M00_L01:
       xor       eax,eax
       test      rcx,rcx
       setg      al
       movzx     edx,dl
       test      edx,eax
       je        short M00_L04
       mov       edx,0A
M00_L02:
       movsxd    rax,edx
       ret
M00_L03:
       xor       edx,edx
       xor       r8d,r8d
       jmp       short M00_L00
M00_L04:
       mov       edx,64
       jmp       short M00_L02
M00_L05:
       xor       edx,edx
       xor       ecx,ecx
       jmp       short M00_L01
; Total bytes of code 99

.NET 8.0.0 (8.0.23.47906), X64 RyuJIT AVX2

; Benchmarks.BenchmarkNullable.BenchmarkOldCode()
       cmp       byte ptr [7FFCBA00DCF0],0
       jne       short M00_L01
       mov       rax,1FE87001DC8
       mov       rax,[rax]
       test      rax,rax
       je        short M00_L01
       mov       rcx,[7FFCBA00DCE0]
       sub       rcx,[rax+8]
       test      rcx,rcx
       jle       short M00_L01
       mov       eax,0A
M00_L00:
       cdqe
       ret
M00_L01:
       mov       eax,64
       jmp       short M00_L00
; Total bytes of code 58

Analysis

The lowered code for the null conditional looks like this:

    public long BenchmarkNullableCode()
    {
      if (!BenchmarkNullable._someOtherValue)
      {
        BenchmarkNullable.SomeObject someObject = BenchmarkNullable._someObject;
        long? nullable1;
        if (someObject == null)
          nullable1 = new long?();
        else
          nullable1 = new long?(someObject._someValue);
        long? nullable2 = nullable1;
        long someConstant = BenchmarkNullable._someConstant;
        long? nullable3;
        if (!nullable2.HasValue)
          nullable3 = new long?();
        else
          nullable3 = new long?(nullable2.GetValueOrDefault() - someConstant);
        long? nullable4 = nullable3;
        long num = 0;
        if (nullable4.GetValueOrDefault() > num & nullable4.HasValue)
          BenchmarkNullable._anotherVariable = BenchmarkNullable._someObject._someValue;
      }
      return BenchmarkNullable._anotherVariable;
    }

This is actually better than the lowered code I saw from my repo:

        int num1;
        if (!value)
        {
          long num2 = curTime;
          NetState netState = this._state;
          long? nullable1;
          if (netState == null)
            nullable1 = new long?();
          else
            nullable1 = new long?(netState._nextTime);
          long? nullable2 = nullable1;
          long? nullable3;
          if (!nullable2.HasValue)
            nullable3 = new long?();
          else
            nullable3 = new long?(num2 - nullable2.GetValueOrDefault());
          long? nullable4 = nullable3;
          long num3 = 0;
          num1 = nullable4.GetValueOrDefault() > num3 & nullable4.HasValue ? 1 : 0;
        }
        else
          num1 = 0;
        if (num1 == 0)
          return;
        this._state._nextTime = curTime;

Note the ternary: num1 = nullable4.GetValueOrDefault() > num3 & nullable4.HasValue ? 1 : 0; - Not sure why that was necessary.

From what I can tell, it is simply really verbose and not efficient.

ghost commented 1 year ago

Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch See info in area-owners.md if you want to be subscribed.

Issue Details
### Description I was cleaning up some code, and made a simple change in my repo, something like this: Before: ```cs if (!value && _state != null && curTime - _state._nextTime > 0) { _state._nextTime = curTime; } ``` After: ```cs if (!value && curTime - _state?._nextTime > 0) { _state._nextTime = curTime; } ``` I happen to have the lowered C# viewer open, and noticed the new code was not that great. So I benchmarked it to see if it mattered for performance, and it looks like the optional chaining has a significant different in performance. I guess I am wondering if there is a way to add in logic to identify a null check and pull it out, as the developer would in older C# code. ### Configuration * .NET 7 * Windows 11 (x64) - I can also run this on arm64 macos ventura if necessary Here is the Benchmark.NET example code I came up with. I would love to refine this for the sake of tracking down if this is worth fixing: ```cs using BenchmarkDotNet.Attributes; using BenchmarkDotNet.Jobs; namespace Benchmarks; [SimpleJob(RuntimeMoniker.Net70)] public class BenchmarkNullable { private static SomeObject _someObject = new(); private static long _someConstant = 500; private static long _anotherVariable; private static bool _someOtherValue = false; [Benchmark] public long BenchmarkNullableCode() { if (!_someOtherValue && _someConstant - _someObject?._someValue > 0) { _anotherVariable = _someObject._someValue; } return _anotherVariable; } [Benchmark] public long BenchmarkOldCode() { if (!_someOtherValue && _someObject != null && _someConstant - _someObject._someValue > 0) { _anotherVariable = _someObject._someValue; } return _anotherVariable; } internal class SomeObject { internal long _someValue = 100; } } ``` ### Regression? I am not sure if this is a regression per-se, or a naive unfurling of the syntax sugar and warrants investigation? At the least, the community should be made aware to not "clean" their code in this way I suppose. ### Data #### .NET 7 ```cs | Method | Mean | Error | StdDev | Median | |---------------------- |----------:|----------:|----------:|----------:| | BenchmarkNullableCode | 0.4463 ns | 0.0307 ns | 0.0287 ns | 0.4429 ns | | BenchmarkOldCode | 0.0025 ns | 0.0041 ns | 0.0038 ns | 0.0000 ns | ``` ### .NET 8 I ran the same benchmark with .NET 8, and the results look good. So maybe this is a moot point? Not using a nullable is 10x slower, but using a nullable has the same performance. Is this a potential regression in the optimization of the old code? ```cs | Method | Mean | Error | StdDev | |---------------------- |----------:|----------:|----------:| | BenchmarkNullableCode | 0.1011 ns | 0.0122 ns | 0.0102 ns | | BenchmarkOldCode | 0.1090 ns | 0.0143 ns | 0.0127 ns | ``` ### Analysis The lowered code for the null conditional looks like this: ```cs public long BenchmarkNullableCode() { if (!BenchmarkNullable._someOtherValue) { BenchmarkNullable.SomeObject someObject = BenchmarkNullable._someObject; long? nullable1; if (someObject == null) nullable1 = new long?(); else nullable1 = new long?(someObject._someValue); long? nullable2 = nullable1; long someConstant = BenchmarkNullable._someConstant; long? nullable3; if (!nullable2.HasValue) nullable3 = new long?(); else nullable3 = new long?(nullable2.GetValueOrDefault() - someConstant); long? nullable4 = nullable3; long num = 0; if (nullable4.GetValueOrDefault() > num & nullable4.HasValue) BenchmarkNullable._anotherVariable = BenchmarkNullable._someObject._someValue; } return BenchmarkNullable._anotherVariable; } ``` This is actually better than the lowered code I saw from my repo: ```cs int num1; if (!value) { long num2 = curTime; NetState netState = this._state; long? nullable1; if (netState == null) nullable1 = new long?(); else nullable1 = new long?(netState._nextTime); long? nullable2 = nullable1; long? nullable3; if (!nullable2.HasValue) nullable3 = new long?(); else nullable3 = new long?(num2 - nullable2.GetValueOrDefault()); long? nullable4 = nullable3; long num3 = 0; num1 = nullable4.GetValueOrDefault() > num3 & nullable4.HasValue ? 1 : 0; } else num1 = 0; if (num1 == 0) return; this._state._nextTime = curTime; ``` Note the ternary: `num1 = nullable4.GetValueOrDefault() > num3 & nullable4.HasValue ? 1 : 0;` - Not sure why that is was necessary. From what I can tell, it is simply really verbose and not efficient.
Author: kamronbatman
Assignees: -
Labels: `tenet-performance`, `area-CodeGen-coreclr`, `untriaged`, `needs-area-label`
Milestone: -
huoyaoyuan commented 1 year ago

The result on .NET 7 looks like constant elimination. You can use [DisassemblyDiagnoser] to check the codegen difference to see if 8.0 solves this issue.

This is actually better than the lowered code I saw from my repo:

It's also worth checking the C# compiler versions. The compiler can also optimize the lowered result in new version.

kamronbatman commented 1 year ago

I tried to avoid constant time elimination. So I would love to get some ideas on a better way to benchmark this. The .NET 8 lowered code didn't look any different from what I saw. Meanwhile, I'll run the disassembly diagnoser and get back to you.

EDIT: I added the disassembly. Note though that I modified the test to avoid the constant time elimination (I hope). I'll update the post with the benchmark code as soon as I get back to the computer.

jakobbotsch commented 1 year ago

cc @AndyAyersMS, looks like a jump threading candidate

jnyrup commented 1 year ago

For the less efficient codegen, see https://github.com/dotnet/roslyn/issues/69411

AndyAyersMS commented 1 year ago

I'll update the post with the benchmark code as soon as I get back to the computer.

Is the version up top the updated one?

kamronbatman commented 1 year ago

@AndyAyersMS - I updated the benchmark code to match. The post is pretty long. so here are the results after running it again at a glance.

| Method                | Job      | Runtime  | Mean      | Error     | StdDev    | Code Size |
|---------------------- |--------- |--------- |----------:|----------:|----------:|----------:|
| BenchmarkNullableCode | .NET 7.0 | .NET 7.0 | 0.7544 ns | 0.0124 ns | 0.0116 ns |     101 B |
| BenchmarkOldCode      | .NET 7.0 | .NET 7.0 | 0.1169 ns | 0.0056 ns | 0.0050 ns |      58 B |
| BenchmarkNullableCode | .NET 8.0 | .NET 8.0 | 0.1297 ns | 0.0059 ns | 0.0055 ns |      99 B |
| BenchmarkOldCode      | .NET 8.0 | .NET 8.0 | 0.1181 ns | 0.0054 ns | 0.0045 ns |      58 B |
kamronbatman commented 11 months ago

@AndyAyersMS - any planned movement on this?

AndyAyersMS commented 11 months ago

@AndyAyersMS - any planned movement on this?

The .NET 8 numbers look reasonably good, so are you asking if (a) we'd improve that further, or (b) retroactively fix .NET 7?

kamronbatman commented 11 months ago

The disassembly in .net 8 doesn't look much better, so are we relying on the JIT optimizations to further improve the assembly? I am simply curious if it's possible to lower this specifically widely used code pattern for nullables so it isn't so gnarly and therefore rely on JIT optimizations techniques. Barring the extra optimization techniques in .net 8 (probably PGO?), the code runs considerably slower than a developer would expect? 🤔

AndyAyersMS commented 5 months ago

Hmm... 9.0 (p5 at least) seems a bit slower than 8.0.

BenchmarkDotNet v0.13.12, Windows 11 (10.0.22631.3737/23H2/2023Update/SunValley3) Intel Core i7-8700 CPU 3.20GHz (Coffee Lake), 1 CPU, 12 logical and 6 physical cores .NET SDK 9.0.100-preview.5.24307.3 [Host] : .NET 8.0.6 (8.0.624.26715), X64 RyuJIT AVX2 .NET 7.0 : .NET 7.0.20 (7.0.2024.26716), X64 RyuJIT AVX2 .NET 8.0 : .NET 8.0.6 (8.0.624.26715), X64 RyuJIT AVX2 .NET 9.0 : .NET 9.0.0 (9.0.24.30607), X64 RyuJIT AVX2

Method Job Runtime Mean Error StdDev Code Size
BenchmarkNullableCode .NET 7.0 .NET 7.0 1.2402 ns 0.0511 ns 0.0453 ns 101 B
BenchmarkOldCode .NET 7.0 .NET 7.0 0.3828 ns 0.0284 ns 0.0266 ns 58 B
BenchmarkNullableCode .NET 8.0 .NET 8.0 0.3623 ns 0.0082 ns 0.0069 ns 102 B
BenchmarkOldCode .NET 8.0 .NET 8.0 0.0984 ns 0.0042 ns 0.0037 ns 58 B
BenchmarkNullableCode .NET 9.0 .NET 9.0 0.8771 ns 0.0172 ns 0.0144 ns 99 B
BenchmarkOldCode .NET 9.0 .NET 9.0 0.1029 ns 0.0104 ns 0.0092 ns 59 B

Latest main is no better

Method Mean Error StdDev
BenchmarkNullableCode 0.9329 ns 0.0401 ns 0.0376 ns
BenchmarkOldCode 0.2190 ns 0.0178 ns 0.0166 ns
AndyAyersMS commented 4 months ago

Running locally I get wildly different results from run to run, eg

Method Job Toolchain Mean Error StdDev Ratio RatioSD
BenchmarkNullableCode Job-TJQMUP net8 0.8824 ns 0.0446 ns 0.0458 ns 1.00 0.00
BenchmarkNullableCode Job-YHTPVL net9 0.8284 ns 0.0434 ns 0.0406 ns 0.94 0.07
Method Job Toolchain Mean Error StdDev Ratio RatioSD
BenchmarkNullableCode Job-NZWAEZ net8 0.9280 ns 0.0471 ns 0.0644 ns 1.00 0.00
BenchmarkNullableCode Job-OWHYHW net9 1.1097 ns 0.0452 ns 0.0423 ns 1.21 0.09
Method Job Toolchain Mean Error StdDev Ratio RatioSD
BenchmarkNullableCode Job-WNNBGS net8 0.9035 ns 0.0197 ns 0.0184 ns 1.00 0.00
BenchmarkNullableCode Job-QGZPGM net9 0.8542 ns 0.0315 ns 0.0294 ns 0.95 0.04
Method Job Toolchain Mean Error StdDev Ratio RatioSD
BenchmarkNullableCode Job-XSHHGG net8 0.8623 ns 0.0309 ns 0.0317 ns 1.00 0.00
BenchmarkNullableCode Job-YHVYFK net9 0.9135 ns 0.0209 ns 0.0232 ns 1.06 0.05
Method Job Toolchain Mean Error StdDev Ratio RatioSD
BenchmarkNullableCode Job-RFNRHQ net8 0.3588 ns 0.0147 ns 0.0169 ns 1.00 0.00
BenchmarkNullableCode Job-ARRJWF net9 0.8860 ns 0.0138 ns 0.0153 ns 2.47 0.12

It looks like when this tiers up it is right around the threshold for switching to probabilistic counting, and the inconsistent counts somehow can trigger better block layouts. So there is some subtle microarchitectural issue here.

EG this is from a a "slow" net8 run:

-----------------------------------------------------------------------------------------------------------------------------------------
BBnum BBid ref try hnd preds           weight    IBC  lp [IL range]     [jump]      [EH region]         [flags]
-----------------------------------------------------------------------------------------------------------------------------------------
BB01 [0000]  1                            7391. 7391    [000..007)-> BB09 ( cond )                     IBC 
BB02 [0001]  1       BB01                 7391. 7391    [007..015)-> BB04 ( cond )                     IBC 
BB03 [0002]  1       BB02                  0       0    [015..022)-> BB05 (always)                     rare IBC 
BB04 [0003]  1       BB02                 7391. 7391    [022..02C)                                     IBC 
BB05 [0004]  2       BB03,BB04            7391. 7391    [02C..036)-> BB07 ( cond )                     IBC 
BB06 [0005]  1       BB05                  0       0    [036..042)-> BB08 (always)                     rare IBC 
BB07 [0006]  1       BB05                 7391. 7391    [042..050)                                     IBC 
BB08 [0007]  2       BB06,BB07            7391. 7391    [050..068)-> BB10 ( cond )                     IBC 
BB09 [0008]  2       BB01,BB08             0       0    [068..06C)-> BB11 (always)                     rare IBC 
BB10 [0009]  1       BB08                 7391. 7391    [06C..06E)                                     IBC 
BB11 [0010]  2       BB09,BB10            7391. 7391    [06E..070)        (return)                     IBC 
-----------------------------------------------------------------------------------------------------------------------------------------

and this from a fast net9 run (sample based counting threshold is 8192, so we've gone past it)

---------------------------------------------------------------------------------------------------------------------------------------------------------------------
BBnum BBid ref try hnd preds           weight    IBC [IL range]   [jump]                            [EH region]        [flags]
---------------------------------------------------------------------------------------------------------------------------------------------------------------------
BB01 [0000]  1                            8356. 8356 [000..007)-> BB09(0.000992),BB02(0.999)    ( cond )                     IBC
BB02 [0001]  1       BB01                 8348. 8348 [007..015)-> BB04(0.997),BB03(0.00261)   ( cond )                     IBC
BB03 [0002]  1       BB02                 21.83   22 [015..022)-> BB05(1)                 (always)                     IBC
BB04 [0003]  1       BB02                 8326. 8326 [022..02C)-> BB05(1)                 (always)                     IBC
BB05 [0004]  2       BB03,BB04            8348. 8348 [02C..036)-> BB07(0.999),BB06(0.000713)    ( cond )                     IBC
BB06 [0005]  1       BB05                  5.95    6 [036..042)-> BB08(1)                 (always)                     IBC
BB07 [0006]  1       BB05                 8342. 8342 [042..050)-> BB08(1)                 (always)                     IBC
BB08 [0007]  2       BB06,BB07            8348. 8348 [050..068)-> BB10(1),BB09(0)         ( cond )                     IBC
BB09 [0008]  2       BB01,BB08             8.29    8 [068..06C)-> BB11(1)                 (always)                     IBC
BB10 [0009]  1       BB08                 8348. 8348 [06C..06E)-> BB11(1)                 (always)                     IBC
BB11 [0010]  2       BB09,BB10            8356. 8356 [06E..070)                           (return)                     IBC
---------------------------------------------------------------------------------------------------------------------------------------------------------------------

As a hypothesis, if I boost the threshold to say 2^16 (via DOTNET_TieredPGO_ScalableCountThreshold=10), I get numbers that seem more stable. Thought perhaps I'm fooling myself and/or it's not legitimate to run BDN multiple times and infer something from the distribution of results.

Method Job Toolchain Mean Error StdDev Ratio
BenchmarkNullableCode Job-XWZQSJ net8 1.0229 ns 0.0067 ns 0.0059 ns 1.00
BenchmarkNullableCode Job-JOITVI net9 0.8731 ns 0.0111 ns 0.0099 ns 0.85
Method Job Toolchain Mean Error StdDev Ratio RatioSD
BenchmarkNullableCode Job-LMKDNX net8 1.0392 ns 0.0248 ns 0.0232 ns 1.00 0.00
BenchmarkNullableCode Job-RHJLUK net9 0.8662 ns 0.0067 ns 0.0060 ns 0.83 0.02
Method Job Toolchain Mean Error StdDev Ratio
BenchmarkNullableCode Job-GXNZLX net8 0.9298 ns 0.0073 ns 0.0064 ns 1.00
BenchmarkNullableCode Job-TRRRNZ net9 0.8706 ns 0.0117 ns 0.0110 ns 0.94
Method Job Toolchain Mean Error StdDev Ratio
BenchmarkNullableCode Job-DSCIBP net8 1.0343 ns 0.0212 ns 0.0188 ns 1.00
BenchmarkNullableCode Job-ZSYRRW net9 0.8689 ns 0.0104 ns 0.0097 ns 0.84
Method Job Toolchain Mean Error StdDev Ratio RatioSD
BenchmarkNullableCode Job-DALTXU net8 0.9321 ns 0.0217 ns 0.0193 ns 1.00 0.00
BenchmarkNullableCode Job-MNQFHJ net9 0.8569 ns 0.0050 ns 0.0041 ns 0.92 0.02

At any rate I don't see us resolving this during .Net 9, and overall .Net 9 seems to do pretty well, so moving this to future.