Closed performanceautofiler[bot] closed 1 year ago
Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch, @kunalspathak See info in area-owners.md if you want to be subscribed.
Author: | performanceautofiler[bot] |
---|---|
Assignees: | - |
Labels: | `os-linux`, `tenet-performance`, `tenet-performance-benchmarks`, `arch-x64`, `area-CodeGen-coreclr`, `untriaged`, `runtime-coreclr`, `needs-area-label` |
Milestone: | - |
Does not repro on my AMD CPU:
BenchmarkDotNet=v0.13.2.2052-nightly, OS=ubuntu 20.04
AMD Ryzen 9 5950X, 1 CPU, 32 logical and 16 physical cores
.NET SDK=8.0.100-preview.4.23211.45
[Host] : .NET 8.0.0 (8.0.23.21001), X64 RyuJIT AVX2
Job-GWXXCM : .NET 8.0.0 (42.42.42.42424), X64 RyuJIT AVX2
Job-USMAZH : .NET 8.0.0 (42.42.42.42424), X64 RyuJIT AVX2
PowerPlanMode=00000000-0000-0000-0000-000000000000 IterationTime=250.0000 ms MaxIterationCount=20
MinIterationCount=15 WarmupCount=1
Method | Job | Toolchain | Mean | Error | StdDev | Median | Min | Max | Ratio | Allocated | Alloc Ratio |
---|---|---|---|---|---|---|---|---|---|---|---|
GetFullPathForTypicalLongPath | Job-GWXXCM | /baseline/Core_Root/corerun | 619.8 ns | 2.38 ns | 1.99 ns | 619.3 ns | 616.9 ns | 624.5 ns | 1.00 | - | NA |
GetFullPathForTypicalLongPath | Job-USMAZH | /diff/Core_Root/corerun | 621.9 ns | 3.13 ns | 2.61 ns | 622.7 ns | 617.4 ns | 627.0 ns | 1.00 | - | NA |
I couldn't figure out how to profile ubuntu-x64, so I used PGO to determine that the hot function is
System.IO.PathInternal.RemoveRelativeSegments: 4000610896
The diff is:
@@ -1,275 +1,270 @@
; Assembly listing for method System.IO.PathInternal:RemoveRelativeSegments(System.ReadOnlySpan`1[ushort],int,byref):bool
; Emitting BLENDED_CODE for X64 CPU with AVX - Unix
; Tier-1 compilation
; optimized code
; optimized using Static PGO
; rbp based frame
; fully interruptible
; with Static PGO: edge weights are invalid, and fgCalledCount is 32459
; 1 inlinees with PGO data; 13 single block inlinees; 2 inlinees without PGO data
G_M000_IG01:
push rbp
push r15
push r14
push r13
push r12
push rbx
sub rsp, 24
lea rbp, [rsp+40H]
mov r15, rdi
mov r14d, esi
mov r12d, edx
mov rbx, rcx
G_M000_IG02:
xor r13d, r13d
mov edi, r12d
lea eax, [rdi-01H]
mov dword ptr [rbp-34H], eax
cmp eax, r14d
jae G_M000_IG40
mov esi, eax
- movzx rsi, word ptr [r15+2*rsi]
- cmp esi, 47
+ cmp word ptr [r15+2*rsi], 47
cmove edi, eax
test edi, edi
jg G_M000_IG36
G_M000_IG03:
mov edx, edi
cmp edx, r14d
jge SHORT G_M000_IG09
G_M000_IG04:
cmp edx, r14d
jae G_M000_IG40
mov esi, edx
movzx rsi, word ptr [r15+2*rsi]
cmp esi, 47
je SHORT G_M000_IG12
G_M000_IG05:
mov ecx, dword ptr [rbx+08H]
lea r8, bword ptr [rbx+10H]
mov r9, bword ptr [r8]
mov r8d, dword ptr [r8+08H]
cmp ecx, r8d
jae SHORT G_M000_IG07
G_M000_IG06:
mov r8d, ecx
mov word ptr [r9+2*r8], si
inc ecx
mov dword ptr [rbx+08H], ecx
jmp SHORT G_M000_IG08
G_M000_IG07:
mov dword ptr [rbp-30H], edx
mov dword ptr [rbp-2CH], edi
mov rdi, rbx
call [System.Text.ValueStringBuilder:GrowAndAppend(ushort):this]
mov edx, dword ptr [rbp-30H]
mov edi, dword ptr [rbp-2CH]
G_M000_IG08:
inc edx
cmp edx, r14d
jl SHORT G_M000_IG04
G_M000_IG09:
test r13d, r13d
jne G_M000_IG25
cmp dword ptr [rbx+08H], r14d
jne G_M000_IG25
G_M000_IG10:
xor eax, eax
G_M000_IG11:
add rsp, 24
pop rbx
pop r12
pop r13
pop r14
pop r15
pop rbp
ret
G_M000_IG12:
lea ecx, [rdx+01H]
cmp ecx, r14d
jge SHORT G_M000_IG05
G_M000_IG13:
cmp ecx, r14d
jae G_M000_IG40
mov r8d, ecx
movzx r8, word ptr [r15+2*r8]
- mov r9d, r8d
- cmp r9d, 47
+ cmp r8d, 47
je SHORT G_M000_IG08
G_M000_IG14:
lea r9d, [rdx+02H]
cmp r9d, r14d
je G_M000_IG23
G_M000_IG15:
cmp r9d, r14d
jae G_M000_IG40
mov r10d, r9d
- movzx r10, word ptr [r15+2*r10]
- cmp r10d, 47
- je G_M000_IG23
+ cmp word ptr [r15+2*r10], 47
+ je SHORT G_M000_IG23
G_M000_IG16:
cmp r9d, r14d
jge G_M000_IG05
G_M000_IG17:
lea ecx, [rdx+03H]
cmp ecx, r14d
je SHORT G_M000_IG19
G_M000_IG18:
cmp ecx, r14d
jae G_M000_IG40
mov r10d, ecx
- movzx r10, word ptr [r15+2*r10]
- cmp r10d, 47
+ cmp word ptr [r15+2*r10], 47
jne G_M000_IG05
G_M000_IG19:
cmp r8d, 46
jne G_M000_IG05
G_M000_IG20:
cmp r9d, r14d
jae G_M000_IG40
mov r8d, r9d
cmp word ptr [r15+2*r8], 46
jne G_M000_IG05
mov edx, dword ptr [rbx+08H]
dec edx
cmp edx, edi
jl SHORT G_M000_IG32
G_M000_IG21:
lea rsi, bword ptr [rbx+10H]
cmp edx, dword ptr [rsi+08H]
jae G_M000_IG40
mov rsi, bword ptr [rsi]
mov r8d, edx
- movzx rsi, word ptr [rsi+2*r8]
- cmp esi, 47
+ cmp word ptr [rsi+2*r8], 47
je SHORT G_M000_IG28
dec edx
cmp edx, edi
jge SHORT G_M000_IG21
G_M000_IG22:
jmp SHORT G_M000_IG32
G_M000_IG23:
cmp r8d, 46
- jne G_M000_IG16
+ jne SHORT G_M000_IG16
G_M000_IG24:
mov edx, ecx
jmp G_M000_IG08
G_M000_IG25:
cmp edi, r12d
je SHORT G_M000_IG26
cmp dword ptr [rbx+08H], r12d
jl SHORT G_M000_IG34
G_M000_IG26:
mov eax, 1
G_M000_IG27:
add rsp, 24
pop rbx
pop r12
pop r13
pop r14
pop r15
pop rbp
ret
G_M000_IG28:
cmp ecx, r14d
jl SHORT G_M000_IG30
G_M000_IG29:
cmp edx, edi
je SHORT G_M000_IG38
G_M000_IG30:
mov ecx, edx
G_M000_IG31:
mov dword ptr [rbx+08H], ecx
G_M000_IG32:
mov dword ptr [rbp-2CH], edi
cmp edx, edi
jl SHORT G_M000_IG39
G_M000_IG33:
mov dword ptr [rbp-30H], r9d
mov edx, dword ptr [rbp-30H]
mov edi, dword ptr [rbp-2CH]
jmp G_M000_IG08
G_M000_IG34:
mov edi, dword ptr [rbp-34H]
movzx rsi, word ptr [r15+2*rdi]
mov edi, dword ptr [rbx+08H]
lea rax, bword ptr [rbx+10H]
mov rdx, bword ptr [rax]
mov eax, dword ptr [rax+08H]
cmp edi, eax
jae SHORT G_M000_IG35
mov eax, edi
mov word ptr [rdx+2*rax], si
inc edi
mov dword ptr [rbx+08H], edi
jmp SHORT G_M000_IG26
G_M000_IG35:
mov rdi, rbx
call [System.Text.ValueStringBuilder:GrowAndAppend(ushort):this]
jmp SHORT G_M000_IG26
G_M000_IG36:
cmp edi, r14d
jbe SHORT G_M000_IG37
call [System.ThrowHelper:ThrowArgumentOutOfRangeException()]
int3
G_M000_IG37:
mov dword ptr [rbp-2CH], edi
mov edx, edi
mov rsi, r15
mov rdi, rbx
call [System.Text.ValueStringBuilder:Append(System.ReadOnlySpan`1[ushort]):this]
mov edi, dword ptr [rbp-2CH]
jmp G_M000_IG03
G_M000_IG38:
lea ecx, [rdx+01H]
jmp SHORT G_M000_IG31
G_M000_IG39:
mov edi, dword ptr [rbp-2CH]
mov dword ptr [rbx+08H], edi
mov dword ptr [rbp-2CH], edi
jmp SHORT G_M000_IG33
G_M000_IG40:
call CORINFO_HELP_RNGCHKFAIL
int3
-; Total bytes of code 589
+; Total bytes of code 568
Seems like this is the JCC erratum, given that it didn't repro on my CPU. Unfortunately DOTNET_JitDisasmWithAlignmentBoundaries
is not working right now, so checking that is a bit annoying.
Just reiterating what I said on Discord... My expectation is this is due to how cmp
on a small type is implemented in microcode.
With the previous code we had the following and were actually doing a 32-bit comparison.
movzx esi, word ptr [r15+2*rsi]
cmp esi, 47
While the new code is instead:
cmp word ptr [r15+2*rsi], 47
and a CPU might implement it as:
mov si, word ptr [r15+2*rsi]
cmp si, 47
The code is ultimately doing the "same thing", but a given CPU might not do the implicit zero-extension when operating on a small type for cmp
and this can make it have the general issue around partial dependencies that can be introduced for small types.
My best/educated guess here is that AMD is doing an implicit zero-extension behind the scenes and the Intel CPUs are currently not.
Might also be that Intel CPUs cannot do macro-op fusion in cases like
- movzx rsi, word ptr [rsi+2*r8]
- cmp esi, 47
+ cmp word ptr [rsi+2*r8], 47
je SHORT G_M000_IG28
or, as @tannergooding suggested, a partial stall/false dependency due to worse handling for these kinds of instruction patterns. If that's the case then the only fix is likely to stop containing 16-bit indirections in some of these cases.
Some benchmarks from my Intel CPU:
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Running;
using System.Diagnostics;
namespace MyBenchmarks
{
/// <summary>
/// Micro-benchmarks comparing 8-bit and 16-bit element comparisons in a tight loop,
/// plus a benchmark that drives <c>System.IO.PathInternal.RemoveRelativeSegments</c>
/// through reflection.
/// </summary>
public class Benchmark
{
    // Number of elements in each comparison array.
    private const int ElementCount = 100000;

    // Fixed seed so that every benchmark process (baseline and diff toolchains alike)
    // measures exactly the same input data.
    private const int RandomSeed = 12345;

    // Values the comparison loops search for; array elements are either 0 or this value.
    private const byte ByteNeedle = 46;
    private const ushort UShortNeedle = 4600;

    // Shape of the synthetic path handed to RemoveRelativeSegments.
    private const int PathLength = 500;
    private const int SeparatorInterval = 20;

    // Number of RemoveRelativeSegments calls per benchmark invocation.
    private const int RemoveRelativeSegmentsCalls = 100;

    private static byte[] s_bytes = Array.Empty<byte>();
    private static ushort[] s_ushorts = Array.Empty<ushort>();
    private static string s_longPath = string.Empty;

    // Delegate bound to the non-public static PathInternal.RemoveRelativeSegments(string, int).
    private static readonly Func<string, int, string> _removeRelativeSegments = BindRemoveRelativeSegments();

    /// <summary>
    /// Fills the static input arrays with a reproducible pseudo-random mix of 0 and the
    /// needle value, and builds the long path used by <see cref="RemoveRelativeSegments"/>.
    /// </summary>
    [GlobalSetup]
    public static void Setup()
    {
        var random = new Random(RandomSeed);

        var bytes = new byte[ElementCount];
        for (int i = 0; i < bytes.Length; i++)
        {
            bytes[i] = (byte)(random.Next(2) * ByteNeedle);
        }

        var ushorts = new ushort[ElementCount];
        for (int i = 0; i < ushorts.Length; i++)
        {
            ushorts[i] = (ushort)(random.Next(2) * UShortNeedle);
        }

        s_bytes = bytes;
        s_ushorts = ushorts;
        s_longPath = CreatePath(PathLength);
    }

    /// <summary>Counts the elements of the byte array that equal <see cref="ByteNeedle"/>.</summary>
    /// <returns>The number of matching elements.</returns>
    [Benchmark]
    public int Compare8()
    {
        int result = 0;
        byte[] bytes = s_bytes;
        for (int i = 0; i < bytes.Length; i++)
        {
            byte b = bytes[i];
            if (b == ByteNeedle)
            {
                result++;
            }
        }
        return result;
    }

    /// <summary>Counts the elements of the ushort array that equal <see cref="UShortNeedle"/>.</summary>
    /// <returns>The number of matching elements.</returns>
    [Benchmark]
    public int Compare16()
    {
        int result = 0;
        ushort[] ushorts = s_ushorts;
        for (int i = 0; i < ushorts.Length; i++)
        {
            ushort u = ushorts[i];
            if (u == UShortNeedle)
            {
                result++;
            }
        }
        return result;
    }

    /// <summary>
    /// Builds a string of <paramref name="len"/> characters consisting of 'a', with
    /// <see cref="Path.DirectorySeparatorChar"/> at every index divisible by
    /// <see cref="SeparatorInterval"/> (including index 0).
    /// </summary>
    /// <param name="len">Total length of the generated string.</param>
    /// <returns>The generated path string.</returns>
    private static string CreatePath(int len)
    {
        char[] str = new char[len];
        for (int i = 0; i < str.Length; i++)
        {
            // Add path separator so folders aren't too long.
            if (i % SeparatorInterval == 0)
            {
                str[i] = Path.DirectorySeparatorChar;
            }
            else
            {
                str[i] = 'a';
            }
        }
        return new string(str);
    }

    /// <summary>
    /// Looks up the non-public static <c>RemoveRelativeSegments(string, int)</c> method on
    /// <c>System.IO.PathInternal</c> (in the assembly that defines <see cref="Path"/>) and
    /// wraps it in a delegate.
    /// </summary>
    /// <returns>A delegate that invokes the method directly.</returns>
    /// <exception cref="MissingMethodException">The method could not be found.</exception>
    private static Func<string, int, string> BindRemoveRelativeSegments()
    {
        const string TypeName = "System.IO.PathInternal";
        const string MethodName = "RemoveRelativeSegments";

        var pathInternal = typeof(Path).Assembly.GetType(TypeName, true);
        if (pathInternal == null)
        {
            // GetType(name, throwOnError: true) is expected to throw rather than return
            // null; this guard only keeps the failure message explicit.
            throw new TypeLoadException("Could not load type '" + TypeName + "'.");
        }

        var method = pathInternal.GetMethod(
            MethodName,
            System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Static,
            new[] { typeof(string), typeof(int) });
        if (method == null)
        {
            // Fail with a descriptive error instead of a NullReferenceException inside
            // the type initializer.
            throw new MissingMethodException(TypeName, MethodName);
        }

        return method.CreateDelegate<Func<string, int, string>>();
    }

    /// <summary>
    /// Calls <c>RemoveRelativeSegments</c> on the long path (with a second argument of 1)
    /// <see cref="RemoveRelativeSegmentsCalls"/> times and sums the lengths of the results.
    /// </summary>
    /// <returns>The accumulated length of all returned strings.</returns>
    [Benchmark]
    public int RemoveRelativeSegments()
    {
        int result = 0;
        for (int i = 0; i < RemoveRelativeSegmentsCalls; i++)
        {
            result += _removeRelativeSegments(s_longPath, 1).Length;
        }
        return result;
    }
}
/// <summary>
/// Console entry point: passes the command-line arguments to BenchmarkDotNet's
/// <c>BenchmarkSwitcher</c> for the assembly that contains this type.
/// </summary>
public class Program
{
    /// <summary>Runs the benchmark switcher over this assembly.</summary>
    /// <param name="args">Command-line arguments forwarded unchanged to the switcher.</param>
    public static void Main(string[] args)
    {
        var benchmarkAssembly = typeof(Program).Assembly;
        var switcher = BenchmarkSwitcher.FromAssembly(benchmarkAssembly);
        switcher.Run(args);
    }
}
}
BenchmarkDotNet=v0.13.5, OS=Windows 11 (10.0.22624.1546)
Intel Core i9-10885H CPU 2.40GHz, 1 CPU, 16 logical and 8 physical cores
.NET SDK=8.0.100-preview.2.23117.18
[Host] : .NET 7.0.5 (7.0.523.17405), X64 RyuJIT AVX2
Job-SYFUUK : .NET 8.0.0 (42.42.42.42424), X64 RyuJIT AVX2
Job-INHGGA : .NET 8.0.0 (42.42.42.42424), X64 RyuJIT AVX2
MemoryRandomization=True
Method | Job | Toolchain | Mean | Error | StdDev | Median | Ratio | RatioSD |
---|---|---|---|---|---|---|---|---|
Compare8 | Job-SYFUUK | \93e91df5a7c54dd4bf133f9535acd705f91b789a\corerun.exe | 303.9 us | 2.87 us | 2.68 us | 304.2 us | 1.05 | 0.01 |
Compare8 | Job-INHGGA | \eae7caad29b07c0158b97bff752a78dc4207ca44\corerun.exe | 290.5 us | 3.63 us | 3.39 us | 289.6 us | 1.00 | 0.00 |
Compare16 | Job-SYFUUK | \93e91df5a7c54dd4bf133f9535acd705f91b789a\corerun.exe | 290.9 us | 3.91 us | 3.66 us | 289.6 us | 1.00 | 0.02 |
Compare16 | Job-INHGGA | \eae7caad29b07c0158b97bff752a78dc4207ca44\corerun.exe | 291.7 us | 5.15 us | 4.82 us | 290.6 us | 1.00 | 0.00 |
RemoveRelativeSegments | Job-SYFUUK | \93e91df5a7c54dd4bf133f9535acd705f91b789a\corerun.exe | 180.5 us | 3.55 us | 5.42 us | 176.4 us | 1.06 | 0.02 |
RemoveRelativeSegments | Job-INHGGA | \eae7caad29b07c0158b97bff752a78dc4207ca44\corerun.exe | 170.8 us | 3.40 us | 5.59 us | 166.6 us | 1.00 | 0.00 |
So maybe the impact on newer Intel CPUs is lower; not sure what CPU is in the lab.
Diffs for disabling containment of 16-bit loads in compares do not look too encouraging, so I'm not sure what we can really do here. May just be something we'll have to live with (and be content with the fact that it seems to be less of a problem on more modern CPUs). We could also try disabling the containment for hot blocks by utilizing PGO, though using PGO in lowering like this is probably breaking new ground (and it is unclear how reliable the PGO data is).
Diffs are based on 1,662,932 contexts (510,185 MinOpts, 1,152,747 FullOpts).
MISSED contexts: 1 (0.00%)
The Alpine version of the benchmark does not show the same regression: https://pvscmdupload.blob.core.windows.net/reports/allTestHistory%2frefs%2fheads%2fmain_x64_alpine%203.15%2fSystem.IO.Tests.Perf_Path.GetFullPathForReallyLongPath.html
That's odd since both of these measurements are on i7-8700 CPUs and should be measuring the same JIT codegen. So this may be some weird microarchitectural effect instead.
These benchmarks look to be better now, not sure if it's due to other changes or other factors:
One thing to note is that the Alpine version of the benchmark (linked above) actually shows a perf improvement in the commit range that @EgorBo mentioned above, and that is supposedly the same CPU, just a different OS. So that makes me suspect that this is actually some kind of code alignment or data alignment related effect. I don't think there is anything actionable here.
Run Information
Regressions in System.Text.RegularExpressions.Tests.Perf_Regex_Industry_Leipzig
Test Report
Repro
General Docs link: https://github.com/dotnet/performance/blob/main/docs/benchmarking-workflow-dotnet-runtime.md
Payloads
Baseline Compare
Run Information
Regressions in System.IO.Tests.Perf_Path
Test Report
Repro
General Docs link: https://github.com/dotnet/performance/blob/main/docs/benchmarking-workflow-dotnet-runtime.md
Payloads
Baseline Compare
Run Information
Regressions in System.IO.Tests.Perf_FileInfo
Test Report
Repro
General Docs link: https://github.com/dotnet/performance/blob/main/docs/benchmarking-workflow-dotnet-runtime.md
Payloads
Baseline Compare