Closed aevitas closed 9 months ago
Hi. I can do this after closing issue #10 👌
Thank you! I will look into your pull request shortly. :)
On Wed, 21 Feb 2024, 16:24 kirill, @.***> wrote:
Hi. I can do this after closing issue #10 https://github.com/aevitas/flakeid/issues/10 👌
— Reply to this email directly, view it on GitHub https://github.com/aevitas/flakeid/issues/11#issuecomment-1956118583, or unsubscribe https://github.com/notifications/unsubscribe-auth/AABE7AQIKMRI2BWKUT5A343YUWVLPAVCNFSM6AAAAABDQSVOXWVHI2DSMVQWIX3LMV43OSLTON2WKQ3PNVWWK3TUHMYTSNJWGEYTQNJYGM . You are receiving this because you authored the thread.Message ID: @.***>
@aevitas Hello. Here is the result I got:
BenchmarkDotNet v0.13.12, Windows 11 (10.0.22631.3155/23H2/2023Update/SunValley3)
AMD Ryzen 5 5600X, 1 CPU, 12 logical and 6 physical cores
.NET SDK 8.0.201
[Host] : .NET 8.0.2 (8.0.224.6711), X64 RyuJIT AVX2
.NET 5.0 : .NET 5.0.17 (5.0.1722.21314), X64 RyuJIT AVX2
.NET 6.0 : .NET 6.0.27 (6.0.2724.6912), X64 RyuJIT AVX2
.NET 7.0 : .NET 7.0.16 (7.0.1624.6629), X64 RyuJIT AVX2
.NET 8.0 : .NET 8.0.2 (8.0.224.6711), X64 RyuJIT AVX2
| Method | Job | Runtime | Mean | Error | StdDev | Code Size |
|--------------- |--------- |--------- |------------:|----------:|-----------:|----------:|
| Single_FlakeId | .NET 5.0 | .NET 5.0 | 27.78 ns | 0.054 ns | 0.048 ns | 254 B |
| Single_Guid | .NET 5.0 | .NET 5.0 | 40.89 ns | 0.029 ns | 0.023 ns | 111 B |
| Single_NewId | .NET 5.0 | .NET 5.0 | 67.26 ns | 0.116 ns | 0.103 ns | 438 B |
| Single_IdGen | .NET 5.0 | .NET 5.0 | 3,489.40 ns | 67.093 ns | 132.436 ns | 712 B |
| Single_FlakeId | .NET 6.0 | .NET 6.0 | 26.65 ns | 0.027 ns | 0.025 ns | 215 B |
| Single_Guid | .NET 6.0 | .NET 6.0 | 40.73 ns | 0.051 ns | 0.047 ns | 111 B |
| Single_NewId | .NET 6.0 | .NET 6.0 | 32.35 ns | 0.123 ns | 0.109 ns | 436 B |
| Single_IdGen | .NET 6.0 | .NET 6.0 | 3,491.03 ns | 69.723 ns | 171.031 ns | 721 B |
| Single_FlakeId | .NET 7.0 | .NET 7.0 | 26.60 ns | 0.010 ns | 0.009 ns | 209 B |
| Single_Guid | .NET 7.0 | .NET 7.0 | 40.86 ns | 0.063 ns | 0.056 ns | 239 B |
| Single_NewId | .NET 7.0 | .NET 7.0 | 32.95 ns | 0.115 ns | 0.090 ns | 426 B |
| Single_IdGen | .NET 7.0 | .NET 7.0 | 3,448.09 ns | 68.673 ns | 174.795 ns | 666 B |
| Single_FlakeId | .NET 8.0 | .NET 8.0 | 26.47 ns | 0.006 ns | 0.005 ns | 358 B |
| Single_Guid | .NET 8.0 | .NET 8.0 | 42.19 ns | 0.817 ns | 0.764 ns | 245 B |
| Single_NewId | .NET 8.0 | .NET 8.0 | 32.21 ns | 0.115 ns | 0.096 ns | 303 B |
| Single_IdGen | .NET 8.0 | .NET 8.0 | 3,500.12 ns | 69.738 ns | 173.671 ns | 671 B |
In fact, there is not much difference in performance between different .NET versions. Should I keep this summary table with benchmarks for multiple runtimes? Or I can make 2 separate tables: 1 with comparing FlakeId with NewId and IdGen (running on .NET 8.0) and 2 with running FlakeId on multiple .NET versions.
> In fact, there is not much difference in performance between different .NET versions. Should I keep this summary table with benchmarks for multiple runtimes? Or I can make 2 separate tables: 1 with comparing FlakeId with NewId and IdGen (running on .NET 8.0) and 2 with running FlakeId on multiple .NET versions.
Surprised to see the difference in performance is so small! I think your second suggestion would be preferable, one table comparing it against alternatives (feel free to add more alternatives if you know of any, by the way) and a table with benchmarks across runtimes.
Curious why the code generated by the JIT has almost doubled in size in .NET 8, @jakobbotsch is this just a fluke?
Hard to say. We inline more aggressively in .NET 8 which could explain it.
You can try looking at the codegen by using BenchmarkDotNet's `DisassemblyDiagnoser`, or by passing `--envvars DOTNET_JitDisasm:Single_FlakeId` to the BDN command line (requires propagating the CLI args into BDN with `BenchmarkSwitcher.FromX().Run(args)`).
Running the `DisassemblyDiagnoser` against both frameworks indeed confirms that .NET 8 inlines more aggressively, especially the call to `FlakeId.MonotonicTimer.get_ElapsedMilliseconds()`.
There is a discrepancy between the assembly generated in .NET 8 versus prior versions in the code leading up to the call to `CORINFO_HELP_GETCURRENTMANAGEDTHREADID`. Prior to .NET 8, the instructions immediately before that call look like:
; Excerpt (pre-.NET 8): rax still holds the return value of the preceding call,
; which is kept in rdi after masking it to its low 42 bits (3FFFFFFFFFF hex).
mov rdi,3FFFFFFFFFF
and rdi,rax
call CORINFO_HELP_GETCURRENTMANAGEDTHREADID
Whereas the .NET 8 version includes additional SSE/AVX scalar conversion instructions and integer arithmetic before the same call:
; Excerpt (.NET 8): work that appears inline before the thread-id helper call.
; Zero xmm0 first so the conversion below has no dependency on its old value.
vxorps xmm0,xmm0,xmm0
; Round-trip rdi through a double: int64 -> double -> int64 (truncating).
vcvtsi2sd xmm0,xmm0,rdi
vcvttsd2si rdx,xmm0
; Signed multiply-high by a constant, arithmetic shift right by 0B (11), then
; add the sign bit: the usual shape of a compiler-generated signed division by
; a constant. The constant is consistent with a divide by 10000 -- presumably
; ticks -> milliseconds; confirm against the C# source.
mov rcx,346DC5D63886594B
mov rax,rcx
imul rdx
mov rdi,rdx
shr rdi,3F
sar rdx,0B
add rdi,rdx
; Add a fixed offset to the quotient (presumably a start/epoch value the JIT
; folded to a constant -- TODO confirm).
mov rsi,434CCA528F
add rsi,rdi
; Keep only the low 42 bits of the result in rsi.
mov rdi,3FFFFFFFFFF
and rsi,rdi
call CORINFO_HELP_GETCURRENTMANAGEDTHREADID
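Full results below (first the .NET 8 listing, 330 bytes of code; then the listing from an earlier runtime, 181 bytes of code):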
; FlakeId.Id.CreateInternal()
; JIT disassembly, Intel operand order; numeric literals are hex without a prefix.
; This listing matches the excerpt the thread attributes to .NET 8: there is no
; call to FlakeId.MonotonicTimer.get_ElapsedMilliseconds(); equivalent work
; appears inline instead.
;
; Register roles as used below:
;   rcx on entry = pointer the 64-bit result is finally stored through
;   rsi          = time-derived value, masked to 42 bits (after M01_L00)
;   edi          = low 5 bits of the managed thread id
;   ebp          = cached 5-bit value read from a Process object field
;                  (presumably the process id -- confirm against the C# source)
;
; Save the registers this method uses and reserve 38h bytes of locals.
push rdi
push rsi
push rbp
push rbx
sub rsp,38
vzeroupper
; Zero the 8-byte local at [rsp+30]; it is used as an out-slot further down.
xor eax,eax
mov [rsp+30],rax
; Spill the incoming rcx. With 20h bytes of pushes, 38h of locals and the 8-byte
; return address, [rsp+60] lies in the caller's stack area (the home slot for
; rcx if this is the Windows x64 ABI -- the benchmark ran on Windows).
mov [rsp+60],rcx
; Load an object reference from a fixed address and keep a copy in a local.
; Presumably the timer/stopwatch instance behind the inlined
; get_ElapsedMilliseconds() -- TODO confirm.
mov rcx,28FB5804A00
mov rsi,[rcx]
mov [rsp+28],rsi
; rdi = 8-byte field at +8 of that object.
mov rdi,[rsi+8]
; If the byte flag at +18 is zero, skip straight to M01_L00.
cmp byte ptr [rsi+18],0
je short M01_L00
; Otherwise call a function at a fixed address with rcx = &local; it fills
; [rsp+30] (presumably a native timestamp query -- TODO confirm).
lea rcx,[rsp+30]
mov rax,7FF9DDD25F20
call rax
; rdi += (value just written) - (8-byte field at +10 of the object).
mov rax,[rsp+30]
mov rsi,[rsp+28]
sub rax,[rsi+10]
add rdi,rax
; If the dword at this fixed address is non-zero, detour through M01_L03,
; which calls CORINFO_HELP_POLL_GC and then resumes at M01_L00.
cmp dword ptr [7FF895DE505C],0
jne near ptr M01_L03
M01_L00:
; Round-trip rdi through a double (int64 -> double -> int64, truncating);
; xmm0 is zeroed first to break the dependency on its previous contents.
vxorps xmm0,xmm0,xmm0
vcvtsi2sd xmm0,xmm0,rdi
vcvttsd2si rdx,xmm0
; Signed multiply-high by a constant, arithmetic shift right by 0B (11), plus
; the sign bit: the usual shape of a compiler-generated signed division by a
; constant. The constant is consistent with a divide by 10000 -- presumably
; ticks -> milliseconds; confirm against the C# source.
mov rcx,346DC5D63886594B
mov rax,rcx
imul rdx
mov rdi,rdx
shr rdi,3F
sar rdx,0B
add rdi,rdx
; rsi = quotient + fixed offset (presumably a start/epoch value folded to a
; constant -- TODO confirm), then masked to its low 42 bits.
mov rsi,434CCA528F
add rsi,rdi
mov rdi,3FFFFFFFFFF
and rsi,rdi
; edi = managed thread id & 1F (low 5 bits).
call CORINFO_HELP_GETCURRENTMANAGEDTHREADID
mov edi,eax
and edi,1F
; Read a cached dword ([rax+4]) and its "initialised" byte flag ([rax]) from a
; fixed address; when the flag is zero take the slow path at M01_L02.
mov rax,2D04A20ADC0
mov ebp,[rax+4]
cmp byte ptr [rax],0
je short M01_L02
M01_L01:
; Atomically increment the dword counter at this fixed address.
mov rax,7FF836393068
lock inc dword ptr [rax]
; Assemble the 64-bit result in rax:
;   rsi << 16h (22)      -> bits 22 and up (time value)
;   edi << 11h (17)      -> bits 17..21    (thread bits)
;   ebp << 0Ch (12)      -> bits 12..16    (cached 5-bit value)
;   counter & 0FFF       -> bits 0..11
shl rsi,16
shl edi,11
movsxd rax,edi
add rax,rsi
shl ebp,0C
movsxd rcx,ebp
add rax,rcx
; NOTE(review): the counter is re-read with a separate plain load here rather
; than taken from the locked increment above.
mov ecx,[7FF836393068]
and ecx,0FFF
movsxd rcx,ecx
add rax,rcx
; Store the result through the pointer that arrived in rcx, then unwind.
mov rbx,[rsp+60]
mov [rbx],rax
add rsp,38
pop rbx
pop rbp
pop rsi
pop rdi
ret
M01_L02:
; Slow path: the cached value has not been initialised yet.
call qword ptr [7FF8363A5A70]; System.Diagnostics.Process.GetCurrentProcess()
mov rbp,rax
; Touches [rbp] without using the result (presumably an implicit null check).
cmp [rbp],bpl
; EnsureState(1) on the returned Process object.
mov rcx,rbp
mov edx,1
call qword ptr [7FF8363A59B0]; System.Diagnostics.Process.EnsureState(State)
; ebp = (dword field at +0D0 of the Process object) & 1F.
mov ebp,[rbp+0D0]
and ebp,1F
; Publish: set the flag byte to 1, store the value at +4, rejoin the fast path.
mov rax,2D04A20ADC0
mov byte ptr [rax],1
mov [rax+4],ebp
jmp short M01_L01
M01_L03:
; Out-of-line GC poll, then resume at M01_L00.
call CORINFO_HELP_POLL_GC
jmp near ptr M01_L00
; Total bytes of code 330
; FlakeId.Id.CreateInternal()
; JIT disassembly, Intel operand order; numeric literals are hex without a prefix.
; This listing matches the excerpt the thread attributes to runtimes before
; .NET 8: FlakeId.MonotonicTimer.get_ElapsedMilliseconds() is a real call here.
;
; Register roles as used below:
;   rsi  = copy of the incoming rcx; the 64-bit result is stored through it
;   rdi  = return value of get_ElapsedMilliseconds(), masked to 42 bits
;   ebx  = low 5 bits of the managed thread id
;   r14d = cached 5-bit value read from a Process object field
;          (presumably the process id -- confirm against the C# source)
;   rbp  = reference to the object holding the cache flag and value
;
; Save the registers this method uses and reserve 20h bytes of stack.
push r14
push rdi
push rsi
push rbp
push rbx
sub rsp,20
; Keep the incoming rcx across the calls below.
mov rsi,rcx
call qword ptr [7FF8B561D1B0]; FlakeId.MonotonicTimer.get_ElapsedMilliseconds()
; rdi = returned value & 3FFFFFFFFFF (low 42 bits).
mov rdi,3FFFFFFFFFF
and rdi,rax
; ebx = managed thread id & 1F (low 5 bits).
call CORINFO_HELP_GETCURRENTMANAGEDTHREADID
mov ebx,eax
and ebx,1F
; Load an object reference from a fixed address; read the cached dword at +0C
; and test the "initialised" byte flag at +8. Non-zero flag -> skip to M01_L00.
mov rax,22D6E406D40
mov rbp,[rax]
mov r14d,[rbp+0C]
cmp byte ptr [rbp+8],0
jne short M01_L00
; Slow path: the cached value has not been initialised yet.
call qword ptr [7FF8B561DBA0]; System.Diagnostics.Process.GetCurrentProcess()
mov r14,rax
; Touches [r14] without using the result (presumably an implicit null check).
cmp [r14],r14b
; EnsureState(1) on the returned Process object.
mov rcx,r14
mov edx,1
call qword ptr [7FF8B561DAE0]; System.Diagnostics.Process.EnsureState(State)
; r14d = (dword field at +0D0 of the Process object) & 1F.
mov r14d,[r14+0D0]
and r14d,1F
; Publish: rbp += 8 so [rbp] is the flag byte and [rbp+4] the cached value.
add rbp,8
mov byte ptr [rbp],1
mov [rbp+4],r14d
M01_L00:
; Atomically add 1 to the dword counter at this fixed address.
mov rax,7FF8B55FD3E0
lock add dword ptr [rax],1
; NOTE(review): the counter is re-read with a separate plain load here rather
; than taken from the locked add above. Keep its low 12 bits.
mov eax,[7FF8B55FD3E0]
and eax,0FFF
; Assemble the 64-bit result:
;   rdi  << 16h (22)  -> bits 22 and up (time value)
;   ebx  << 11h (17)  -> bits 17..21    (thread bits)
;   r14d << 0Ch (12)  -> bits 12..16    (cached 5-bit value)
;   counter & 0FFF    -> bits 0..11
shl rdi,16
shl ebx,11
movsxd rdx,ebx
add rdx,rdi
shl r14d,0C
movsxd rcx,r14d
add rdx,rcx
; Sign-extend eax (the masked counter) to rax before the final add.
cdqe
add rax,rdx
; Store the result through the pointer that arrived in rcx, then unwind.
mov [rsi],rax
add rsp,20
pop rbx
pop rbp
pop rsi
pop rdi
pop r14
ret
; Total bytes of code 181
Could the JIT be inlining something that was previously passed via a register (the .NET 8 version saves one fewer register), or is it just performing some magic on the inlined Stopwatch call?
In the FlakeId readme, there are performance benchmarks on .NET 5.0:
Because FlakeId hugely benefits from improvements in the .NET runtime, it's likely that running benchmarks on .NET 8.0 will yield superior performance.
It would be interesting to run these benchmarks on multiple .NET versions on the same machine to update the README, and compare results.