Unity-Technologies / Unity.Mathematics

The C# math library used in Unity providing vector types and math functions with a shader like syntax
Other
1.38k stars 156 forks source link

Avoid using union structs for asfloat(int) and friends #206

Closed unpacklo closed 2 years ago

unpacklo commented 2 years ago

DOTS-5396

While working on #201 I noticed that chgsign was pretty slow without Burst, so I investigated more carefully. With the help of https://github.com/sschoener/unity-asm-explorer-package from @sschoener, I was finally able to see what was going on. chgsign uses asfloat and asuint to manipulate the sign bit directly, but Mono (without Burst) struggles with these methods because they are implemented with IntFloatUnion, which forces Mono to initialize all fields to zero prior to doing any work. It also causes excessive stack traffic and conversions from single -> double -> single. Here is an example of asfloat(uint4) before this change:

public static Unity.Mathematics.float4 asfloat(Unity.Mathematics.uint4 x)
Address: 00000001782306F0
Code Size in Bytes: 1046
Debug mode: enabled

00000001782306f0 48 81 ec c8 00 00 00           sub rsp, 0xc8
00000001782306f7 48 89 3c 24                    mov [rsp], rdi
00000001782306fb 48 89 b4 24 b8 00 00 00        mov [rsp+0xb8], rsi
0000000178230703 48 89 94 24 c0 00 00 00        mov [rsp+0xc0], rdx
000000017823070b 48 63 84 24 b8 00 00 00        movsxd rax, dword [rsp+0xb8]
0000000178230713 89 84 24 98 00 00 00           mov [rsp+0x98], eax
000000017823071a 48 63 84 24 bc 00 00 00        movsxd rax, dword [rsp+0xbc]
0000000178230722 89 84 24 9c 00 00 00           mov [rsp+0x9c], eax
0000000178230729 48 63 84 24 c0 00 00 00        movsxd rax, dword [rsp+0xc0]
0000000178230731 89 84 24 a0 00 00 00           mov [rsp+0xa0], eax
0000000178230738 48 63 84 24 c4 00 00 00        movsxd rax, dword [rsp+0xc4]
0000000178230740 89 84 24 a4 00 00 00           mov [rsp+0xa4], eax
0000000178230747 8b 84 24 98 00 00 00           mov eax, [rsp+0x98]
000000017823074e c7 84 24 90 00 00 00 00 00 00 00  mov dword [rsp+0x90], 0x0
0000000178230759 66 0f 57 c0                    xorpd xmm0, xmm0
000000017823075d f2 44 0f 5a f8                 cvtsd2ss xmm15, xmm0
0000000178230762 f3 44 0f 11 bc 24 90 00 00 00  movss [rsp+0x90], xmm15
000000017823076c 89 84 24 90 00 00 00           mov [rsp+0x90], eax
0000000178230773 89 84 24 88 00 00 00           mov [rsp+0x88], eax
000000017823077a f3 0f 10 84 24 88 00 00 00     movss xmm0, dword [rsp+0x88]
0000000178230783 f3 0f 5a c0                    cvtss2sd xmm0, xmm0
0000000178230787 f2 44 0f 5a f8                 cvtsd2ss xmm15, xmm0
000000017823078c f3 44 0f 11 bc 24 a8 00 00 00  movss [rsp+0xa8], xmm15
0000000178230796 f3 0f 10 84 24 a8 00 00 00     movss xmm0, dword [rsp+0xa8]
000000017823079f f3 0f 5a c0                    cvtss2sd xmm0, xmm0
00000001782307a3 f2 44 0f 5a f8                 cvtsd2ss xmm15, xmm0
00000001782307a8 f3 44 0f 11 bc 24 a8 00 00 00  movss [rsp+0xa8], xmm15
00000001782307b2 f3 0f 10 9c 24 a8 00 00 00     movss xmm3, dword [rsp+0xa8]
00000001782307bb f3 0f 5a db                    cvtss2sd xmm3, xmm3
00000001782307bf 48 63 84 24 b8 00 00 00        movsxd rax, dword [rsp+0xb8]
00000001782307c7 89 44 24 78                    mov [rsp+0x78], eax
00000001782307cb 48 63 84 24 bc 00 00 00        movsxd rax, dword [rsp+0xbc]
00000001782307d3 89 44 24 7c                    mov [rsp+0x7c], eax
00000001782307d7 48 63 84 24 c0 00 00 00        movsxd rax, dword [rsp+0xc0]
00000001782307df 89 84 24 80 00 00 00           mov [rsp+0x80], eax
00000001782307e6 48 63 84 24 c4 00 00 00        movsxd rax, dword [rsp+0xc4]
00000001782307ee 89 84 24 84 00 00 00           mov [rsp+0x84], eax
00000001782307f5 8b 44 24 7c                    mov eax, [rsp+0x7c]
00000001782307f9 c7 44 24 70 00 00 00 00        mov dword [rsp+0x70], 0x0
0000000178230801 66 0f 57 c0                    xorpd xmm0, xmm0
0000000178230805 f2 44 0f 5a f8                 cvtsd2ss xmm15, xmm0
000000017823080a f3 44 0f 11 7c 24 70           movss [rsp+0x70], xmm15
0000000178230811 89 44 24 70                    mov [rsp+0x70], eax
0000000178230815 89 44 24 68                    mov [rsp+0x68], eax
0000000178230819 f3 0f 10 44 24 68              movss xmm0, dword [rsp+0x68]
000000017823081f f3 0f 5a c0                    cvtss2sd xmm0, xmm0
0000000178230823 f2 44 0f 5a f8                 cvtsd2ss xmm15, xmm0
0000000178230828 f3 44 0f 11 bc 24 a8 00 00 00  movss [rsp+0xa8], xmm15
0000000178230832 f3 0f 10 84 24 a8 00 00 00     movss xmm0, dword [rsp+0xa8]
000000017823083b f3 0f 5a c0                    cvtss2sd xmm0, xmm0
000000017823083f f2 44 0f 5a f8                 cvtsd2ss xmm15, xmm0
0000000178230844 f3 44 0f 11 bc 24 a8 00 00 00  movss [rsp+0xa8], xmm15
000000017823084e f3 0f 10 94 24 a8 00 00 00     movss xmm2, dword [rsp+0xa8]
0000000178230857 f3 0f 5a d2                    cvtss2sd xmm2, xmm2
000000017823085b 48 63 84 24 b8 00 00 00        movsxd rax, dword [rsp+0xb8]
0000000178230863 89 44 24 58                    mov [rsp+0x58], eax
0000000178230867 48 63 84 24 bc 00 00 00        movsxd rax, dword [rsp+0xbc]
000000017823086f 89 44 24 5c                    mov [rsp+0x5c], eax
0000000178230873 48 63 84 24 c0 00 00 00        movsxd rax, dword [rsp+0xc0]
000000017823087b 89 44 24 60                    mov [rsp+0x60], eax
000000017823087f 48 63 84 24 c4 00 00 00        movsxd rax, dword [rsp+0xc4]
0000000178230887 89 44 24 64                    mov [rsp+0x64], eax
000000017823088b 8b 44 24 60                    mov eax, [rsp+0x60]
000000017823088f c7 44 24 50 00 00 00 00        mov dword [rsp+0x50], 0x0
0000000178230897 66 0f 57 c0                    xorpd xmm0, xmm0
000000017823089b f2 44 0f 5a f8                 cvtsd2ss xmm15, xmm0
00000001782308a0 f3 44 0f 11 7c 24 50           movss [rsp+0x50], xmm15
00000001782308a7 89 44 24 50                    mov [rsp+0x50], eax
00000001782308ab 89 44 24 48                    mov [rsp+0x48], eax
00000001782308af f3 0f 10 44 24 48              movss xmm0, dword [rsp+0x48]
00000001782308b5 f3 0f 5a c0                    cvtss2sd xmm0, xmm0
00000001782308b9 f2 44 0f 5a f8                 cvtsd2ss xmm15, xmm0
00000001782308be f3 44 0f 11 bc 24 a8 00 00 00  movss [rsp+0xa8], xmm15
00000001782308c8 f3 0f 10 84 24 a8 00 00 00     movss xmm0, dword [rsp+0xa8]
00000001782308d1 f3 0f 5a c0                    cvtss2sd xmm0, xmm0
00000001782308d5 f2 44 0f 5a f8                 cvtsd2ss xmm15, xmm0
00000001782308da f3 44 0f 11 bc 24 a8 00 00 00  movss [rsp+0xa8], xmm15
00000001782308e4 f3 0f 10 8c 24 a8 00 00 00     movss xmm1, dword [rsp+0xa8]
00000001782308ed f3 0f 5a c9                    cvtss2sd xmm1, xmm1
00000001782308f1 48 63 84 24 b8 00 00 00        movsxd rax, dword [rsp+0xb8]
00000001782308f9 89 44 24 38                    mov [rsp+0x38], eax
00000001782308fd 48 63 84 24 bc 00 00 00        movsxd rax, dword [rsp+0xbc]
0000000178230905 89 44 24 3c                    mov [rsp+0x3c], eax
0000000178230909 48 63 84 24 c0 00 00 00        movsxd rax, dword [rsp+0xc0]
0000000178230911 89 44 24 40                    mov [rsp+0x40], eax
0000000178230915 48 63 84 24 c4 00 00 00        movsxd rax, dword [rsp+0xc4]
000000017823091d 89 44 24 44                    mov [rsp+0x44], eax
0000000178230921 8b 44 24 44                    mov eax, [rsp+0x44]
0000000178230925 c7 44 24 30 00 00 00 00        mov dword [rsp+0x30], 0x0
000000017823092d 66 0f 57 c0                    xorpd xmm0, xmm0
0000000178230931 f2 44 0f 5a f8                 cvtsd2ss xmm15, xmm0
0000000178230936 f3 44 0f 11 7c 24 30           movss [rsp+0x30], xmm15
000000017823093d 89 44 24 30                    mov [rsp+0x30], eax
0000000178230941 89 44 24 28                    mov [rsp+0x28], eax
0000000178230945 f3 0f 10 44 24 28              movss xmm0, dword [rsp+0x28]
000000017823094b f3 0f 5a c0                    cvtss2sd xmm0, xmm0
000000017823094f f2 44 0f 5a f8                 cvtsd2ss xmm15, xmm0
0000000178230954 f3 44 0f 11 bc 24 a8 00 00 00  movss [rsp+0xa8], xmm15
000000017823095e f3 0f 10 84 24 a8 00 00 00     movss xmm0, dword [rsp+0xa8]
0000000178230967 f3 0f 5a c0                    cvtss2sd xmm0, xmm0
000000017823096b f2 44 0f 5a f8                 cvtsd2ss xmm15, xmm0
0000000178230970 f3 44 0f 11 bc 24 a8 00 00 00  movss [rsp+0xa8], xmm15
000000017823097a f3 0f 10 84 24 a8 00 00 00     movss xmm0, dword [rsp+0xa8]
0000000178230983 f3 0f 5a c0                    cvtss2sd xmm0, xmm0
0000000178230987 f2 44 0f 5a fb                 cvtsd2ss xmm15, xmm3
000000017823098c f3 44 0f 11 bc 24 a8 00 00 00  movss [rsp+0xa8], xmm15
0000000178230996 f2 44 0f 5a fa                 cvtsd2ss xmm15, xmm2
000000017823099b f3 44 0f 11 bc 24 ac 00 00 00  movss [rsp+0xac], xmm15
00000001782309a5 f2 44 0f 5a f9                 cvtsd2ss xmm15, xmm1
00000001782309aa f3 44 0f 11 bc 24 b0 00 00 00  movss [rsp+0xb0], xmm15
00000001782309b4 f2 44 0f 5a f8                 cvtsd2ss xmm15, xmm0
00000001782309b9 f3 44 0f 11 bc 24 b4 00 00 00  movss [rsp+0xb4], xmm15
00000001782309c3 f3 0f 10 9c 24 a8 00 00 00     movss xmm3, dword [rsp+0xa8]
00000001782309cc f3 0f 5a db                    cvtss2sd xmm3, xmm3
00000001782309d0 f3 0f 10 94 24 ac 00 00 00     movss xmm2, dword [rsp+0xac]
00000001782309d9 f3 0f 5a d2                    cvtss2sd xmm2, xmm2
00000001782309dd f3 0f 10 8c 24 b0 00 00 00     movss xmm1, dword [rsp+0xb0]
00000001782309e6 f3 0f 5a c9                    cvtss2sd xmm1, xmm1
00000001782309ea f3 0f 10 84 24 b4 00 00 00     movss xmm0, dword [rsp+0xb4]
00000001782309f3 f3 0f 5a c0                    cvtss2sd xmm0, xmm0
00000001782309f7 c7 44 24 18 00 00 00 00        mov dword [rsp+0x18], 0x0
00000001782309ff c7 44 24 1c 00 00 00 00        mov dword [rsp+0x1c], 0x0
0000000178230a07 c7 44 24 20 00 00 00 00        mov dword [rsp+0x20], 0x0
0000000178230a0f c7 44 24 24 00 00 00 00        mov dword [rsp+0x24], 0x0
0000000178230a17 f2 44 0f 5a fb                 cvtsd2ss xmm15, xmm3
0000000178230a1c f3 44 0f 11 bc 24 b4 00 00 00  movss [rsp+0xb4], xmm15
0000000178230a26 f2 44 0f 5a fa                 cvtsd2ss xmm15, xmm2
0000000178230a2b f3 44 0f 11 bc 24 b0 00 00 00  movss [rsp+0xb0], xmm15
0000000178230a35 f2 44 0f 5a f9                 cvtsd2ss xmm15, xmm1
0000000178230a3a f3 44 0f 11 bc 24 ac 00 00 00  movss [rsp+0xac], xmm15
0000000178230a44 f2 44 0f 5a f8                 cvtsd2ss xmm15, xmm0
0000000178230a49 f3 44 0f 11 bc 24 a8 00 00 00  movss [rsp+0xa8], xmm15
0000000178230a53 f3 0f 10 84 24 b4 00 00 00     movss xmm0, dword [rsp+0xb4]
0000000178230a5c f3 0f 5a c0                    cvtss2sd xmm0, xmm0
0000000178230a60 f2 44 0f 5a f8                 cvtsd2ss xmm15, xmm0
0000000178230a65 f3 44 0f 11 7c 24 18           movss [rsp+0x18], xmm15
0000000178230a6c f3 0f 10 84 24 b0 00 00 00     movss xmm0, dword [rsp+0xb0]
0000000178230a75 f3 0f 5a c0                    cvtss2sd xmm0, xmm0
0000000178230a79 f2 44 0f 5a f8                 cvtsd2ss xmm15, xmm0
0000000178230a7e f3 44 0f 11 7c 24 1c           movss [rsp+0x1c], xmm15
0000000178230a85 f3 0f 10 84 24 ac 00 00 00     movss xmm0, dword [rsp+0xac]
0000000178230a8e f3 0f 5a c0                    cvtss2sd xmm0, xmm0
0000000178230a92 f2 44 0f 5a f8                 cvtsd2ss xmm15, xmm0
0000000178230a97 f3 44 0f 11 7c 24 20           movss [rsp+0x20], xmm15
0000000178230a9e f3 0f 10 84 24 a8 00 00 00     movss xmm0, dword [rsp+0xa8]
0000000178230aa7 f3 0f 5a c0                    cvtss2sd xmm0, xmm0
0000000178230aab f2 44 0f 5a f8                 cvtsd2ss xmm15, xmm0
0000000178230ab0 f3 44 0f 11 7c 24 24           movss [rsp+0x24], xmm15
0000000178230ab7 48 63 44 24 18                 movsxd rax, dword [rsp+0x18]
0000000178230abc 89 44 24 08                    mov [rsp+0x8], eax
0000000178230ac0 48 63 44 24 1c                 movsxd rax, dword [rsp+0x1c]
0000000178230ac5 89 44 24 0c                    mov [rsp+0xc], eax
0000000178230ac9 48 63 44 24 20                 movsxd rax, dword [rsp+0x20]
0000000178230ace 89 44 24 10                    mov [rsp+0x10], eax
0000000178230ad2 48 63 44 24 24                 movsxd rax, dword [rsp+0x24]
0000000178230ad7 89 44 24 14                    mov [rsp+0x14], eax
0000000178230adb 48 8b 04 24                    mov rax, [rsp]               ; read singlestep trampoline
0000000178230adf 48 63 4c 24 08                 movsxd rcx, dword [rsp+0x8]
0000000178230ae4 89 08                          mov [rax], ecx
0000000178230ae6 48 63 4c 24 0c                 movsxd rcx, dword [rsp+0xc]
0000000178230aeb 89 48 04                       mov [rax+0x4], ecx
0000000178230aee 48 63 4c 24 10                 movsxd rcx, dword [rsp+0x10]
0000000178230af3 89 48 08                       mov [rax+0x8], ecx
0000000178230af6 48 63 4c 24 14                 movsxd rcx, dword [rsp+0x14]
0000000178230afb 89 48 0c                       mov [rax+0xc], ecx
0000000178230afe 48 81 c4 c8 00 00 00           add rsp, 0xc8                ; 200
0000000178230b05 c3                             ret

With this change, mono generates much better code:

public static Unity.Mathematics.float4 asfloat(Unity.Mathematics.uint4 x)
Address: 0000000177F76640
Code Size in Bytes: 94
Debug mode: enabled

0000000177f76640 48 83 ec 28                    sub rsp, 0x28
0000000177f76644 48 89 3c 24                    mov [rsp], rdi
0000000177f76648 48 89 74 24 18                 mov [rsp+0x18], rsi
0000000177f7664d 48 89 54 24 20                 mov [rsp+0x20], rdx
0000000177f76652 48 8d 44 24 18                 lea rax, [rsp+0x18]
0000000177f76657 48 63 08                       movsxd rcx, dword [rax]
0000000177f7665a 89 4c 24 08                    mov [rsp+0x8], ecx
0000000177f7665e 48 63 48 04                    movsxd rcx, dword [rax+0x4]
0000000177f76662 89 4c 24 0c                    mov [rsp+0xc], ecx
0000000177f76666 48 63 48 08                    movsxd rcx, dword [rax+0x8]
0000000177f7666a 89 4c 24 10                    mov [rsp+0x10], ecx
0000000177f7666e 48 63 40 0c                    movsxd rax, dword [rax+0xc]
0000000177f76672 89 44 24 14                    mov [rsp+0x14], eax
0000000177f76676 48 8b 04 24                    mov rax, [rsp]               ; read singlestep trampoline
0000000177f7667a 48 63 4c 24 08                 movsxd rcx, dword [rsp+0x8]
0000000177f7667f 89 08                          mov [rax], ecx
0000000177f76681 48 63 4c 24 0c                 movsxd rcx, dword [rsp+0xc]
0000000177f76686 89 48 04                       mov [rax+0x4], ecx
0000000177f76689 48 63 4c 24 10                 movsxd rcx, dword [rsp+0x10]
0000000177f7668e 89 48 08                       mov [rax+0x8], ecx
0000000177f76691 48 63 4c 24 14                 movsxd rcx, dword [rsp+0x14]
0000000177f76696 89 48 0c                       mov [rax+0xc], ecx
0000000177f76699 48 83 c4 28                    add rsp, 0x28                ; 40
0000000177f7669d c3                             ret

Some select perf numbers from my MacBook Pro, which has an Intel(R) Core(TM) i9-9880H CPU (times are in microseconds; the Burst times tend to be noisier because the Burst-compiled code runs so fast):

| Method name | Baseline time | New time | Speedup | Bursted method | Baseline time | New time | Speedup | Baseline mono to burst speedup | New mono to burst speedup |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| quaternion(float3x3) | 61798 | 32110.8 | 1.924523836 | quaternion(float3x3) | 769.3 | 719.6 | 1.069066148 | 80.33017028 | 44.62312396 |
| float3x3(quaternion) | 58635.2 | 28391.3 | 2.065252384 | float3x3(quaternion) | 431.3 | 423.3 | 1.018899126 | 135.9499188 | 67.0713442 |
| asdouble(long) | 7618.4 | 4769.9 | 1.597182331 | asdouble(long) | 2382 | 2330.4 | 1.022142122 | 3.198320739 | 2.046815997 |
| asdouble(ulong) | 8511.9 | 5011.7 | 1.698405731 | asdouble(ulong) | 2698.1 | 2786.3 | 0.9683451172 | 3.154775583 | 1.798693608 |
| asfloat(int) | 19374.9 | 18299.4 | 1.058772419 | asfloat(int) | 2401.9 | 2392 | 1.004138796 | 8.06648903 | 7.650250836 |
| asfloat(int2) | 51035.4 | 9435.6 | 5.40881343 | asfloat(int2) | 3237.4 | 2718.8 | 1.190745917 | 15.76431704 | 3.470501692 |
| asfloat(int3) | 64999.7 | 13369.1 | 4.861935358 | asfloat(int3) | 4275.2 | 4036.9 | 1.059030444 | 15.20389689 | 3.311724343 |
| asfloat(int4) | 89697.4 | 16437.9 | 5.456743258 | asfloat(int4) | 4424.5 | 4429.2 | 0.9989388603 | 20.27288959 | 3.711257112 |
| asfloat(uint) | 34714.1 | 18243.4 | 1.902830613 | asfloat(uint) | 2379 | 2447 | 0.9722108705 | 14.59188735 | 7.455414794 |
| asfloat(uint2) | 77753.7 | 9116.3 | 8.529085265 | asfloat(uint2) | 3338.8 | 3365.5 | 0.9920665577 | 23.28791781 | 2.708750557 |
| asfloat(uint3) | 94821 | 11836.8 | 8.010695458 | asfloat(uint3) | 4442.3 | 5008.8 | 0.8868990577 | 21.34502397 | 2.363200767 |
| asfloat(uint4) | 126617.6 | 14791.6 | 8.560101679 | asfloat(uint4) | 3438.9 | 3796.2 | 0.9058795638 | 36.81921545 | 3.896422739 |
| asint(float) | 18490.1 | 5898.4 | 3.13476536 | asint(float) | 2419.3 | 2374.7 | 1.01878132 | 7.642747902 | 2.483850592 |
| asint(float2) | 34440.1 | 8332.7 | 4.133126118 | asint(float2) | 2790.6 | 2839.5 | 0.9827786582 | 12.34146778 | 2.934565945 |
| asint(float3) | 50039 | 12297.4 | 4.069071511 | asint(float3) | 4216 | 4045.8 | 1.042068318 | 11.86883302 | 3.039547185 |
| asint(float4) | 70182.5 | 15752.9 | 4.455211421 | asint(float4) | 4576.6 | 3754.5 | 1.21896391 | 15.33507407 | 4.195738447 |
| aslong(double) | 7773.6 | 4980.1 | 1.560932511 | aslong(double) | 2645.7 | 2800 | 0.9448928571 | 2.93820161 | 1.778607143 |
| asuint(float) | 32331.1 | 8161.3 | 3.961513484 | asuint(float) | 2393.5 | 2373.7 | 1.008341408 | 13.5078755 | 3.438218815 |
| asuint(float2) | 61567.8 | 7720.1 | 7.975000324 | asuint(float2) | 2781.9 | 2836 | 0.9809238364 | 22.13156476 | 2.722179126 |
| asuint(float3) | 91175.7 | 11956.7 | 7.625490311 | asuint(float3) | 4323.1 | 4622.7 | 0.9351893915 | 21.09035183 | 2.586518701 |
| asuint(float4) | 123315 | 15099.6 | 8.16677263 | asuint(float4) | 4155.6 | 3510.1 | 1.183897895 | 29.67441525 | 4.301757785 |
| asulong(double) | 8134.1 | 4848.5 | 1.677652882 | asulong(double) | 2644.4 | 2614.2 | 1.011552291 | 3.075971865 | 1.854678295 |

sschoener commented 2 years ago

Nice! Can you show the ASM difference between the old and new asuint(int)? You'd expect a cast to be free, so I'm curious what it did before and what it does now. I'm curious as to whether you checked the codegen for Burst as well? It's not clear to me what N you are using for your perf tests and whether some of the perf degradations for Burst are noise or actual real things (for which there would be a codegen explanation). My first thought was "we should have a define for whether this is compiled with Burst!" but then I realized that this is not how Burst works at all, so we'll have to settle for a compromise between Mono and Burst (and that compromise had better not hit Mono as badly as our previous compromise did).

unpacklo commented 2 years ago

> Nice! Can you show the ASM difference between the old and new asuint(int)?

Old:

public static uint asuint(int x)
Address: 000000017CE19200
Code Size in Bytes: 16
Debug mode: enabled

000000017ce19200 48 83 ec 08                    sub rsp, 0x8
000000017ce19204 48 89 3c 24                    mov [rsp], rdi
000000017ce19208 48 8b c7                       mov rax, rdi
000000017ce1920b 48 83 c4 08                    add rsp, 0x8                 ; 8
000000017ce1920f c3                             ret

New:

public static uint asuint(int x)
Address: 000000017CA2CA80
Code Size in Bytes: 16
Debug mode: enabled

000000017ca2ca80 48 83 ec 08                    sub rsp, 0x8
000000017ca2ca84 48 89 3c 24                    mov [rsp], rdi
000000017ca2ca88 8b 04 24                       mov eax, [rsp]               ; read singlestep trampoline
000000017ca2ca8b 48 83 c4 08                    add rsp, 0x8                 ; 8
000000017ca2ca8f c3                             ret

I don't quite understand why casting vs. type punning makes any difference here, but I also just realized I don't have perf tests for these, so I'll add those and see how they perform for this case.

> I'm curious as to whether you checked the codegen for Burst as well?

Not for all, but some. For example, here's asfloat(uint3).

Old:

        .text
        .intel_syntax noprefix
        .file        "main"
        .globl        "Unity.Mathematics.PerformanceTests.TestMath.asfloat_uint3.BurstTestFunction(ref Unity.Mathematics.PerformanceTests.TestMath.asfloat_uint3.Arguments args)_5B581B7AAD6CC3EF" # -- Begin function Unity.Mathematics.PerformanceTests.TestMath.asfloat_uint3.BurstTestFunction(ref Unity.Mathematics.PerformanceTests.TestMath.asfloat_uint3.Arguments args)_5B581B7AAD6CC3EF
        .p2align        4, 0x90
        .type        "Unity.Mathematics.PerformanceTests.TestMath.asfloat_uint3.BurstTestFunction(ref Unity.Mathematics.PerformanceTests.TestMath.asfloat_uint3.Arguments args)_5B581B7AAD6CC3EF",@function
"Unity.Mathematics.PerformanceTests.TestMath.asfloat_uint3.BurstTestFunction(ref Unity.Mathematics.PerformanceTests.TestMath.asfloat_uint3.Arguments args)_5B581B7AAD6CC3EF": # @"Unity.Mathematics.PerformanceTests.TestMath.asfloat_uint3.BurstTestFunction(ref Unity.Mathematics.PerformanceTests.TestMath.asfloat_uint3.Arguments args)_5B581B7AAD6CC3EF"
.Lfunc_begin0:
        .file        1 "/Users/dale/code/Unity.Mathematics/src/Unity.Mathematics.PerformanceTests/TestMath.gen.cs"
        .loc        1 1192 0                # TestMath.gen.cs:1192:0
        .cfi_sections .debug_frame
        .cfi_startproc
# %bb.0:                                # %entry
        mov        rax, -4800000
        .p2align        4, 0x90
.LBB0_1:                                # %BL.0004.i
                                        # =>This Inner Loop Header: Depth=1
.Ltmp0:
        .loc        1 1178 49 prologue_end  # TestMath.gen.cs:1178:49
        mov        rcx, qword ptr [rdi + 8]
        mov        rdx, qword ptr [rdi + 16]
        movss        xmm0, dword ptr [rdx + rax + 4800008] # xmm0 = mem[0],zero,zero,zero
        movsd        xmm1, qword ptr [rdx + rax + 4800000] # xmm1 = mem[0],zero
        movss        dword ptr [rcx + rax + 4800000], xmm1
        extractps        dword ptr [rcx + rax + 4800004], xmm1, 1
        movss        dword ptr [rcx + rax + 4800008], xmm0
        .loc        1 1182 13               # TestMath.gen.cs:1182:13
        add        rax, 12
        jne        .LBB0_1
.Ltmp1:
# %bb.2:                                # %"Unity.Mathematics.PerformanceTests.TestMath.asfloat_uint3.CommonTestFunction(ref Unity.Mathematics.PerformanceTests.TestMath.asfloat_uint3.Arguments args)_A3E8969154AF482D.exit"
        .loc        1 1193 13               # TestMath.gen.cs:1193:13
        ret

New:

        .text
        .intel_syntax noprefix
        .file        "main"
        .globl        "Unity.Mathematics.PerformanceTests.TestMath.asfloat_uint3.BurstTestFunction(ref Unity.Mathematics.PerformanceTests.TestMath.asfloat_uint3.Arguments args)_5B581B7AAD6CC3EF" # -- Begin function Unity.Mathematics.PerformanceTests.TestMath.asfloat_uint3.BurstTestFunction(ref Unity.Mathematics.PerformanceTests.TestMath.asfloat_uint3.Arguments args)_5B581B7AAD6CC3EF
        .p2align        4, 0x90
        .type        "Unity.Mathematics.PerformanceTests.TestMath.asfloat_uint3.BurstTestFunction(ref Unity.Mathematics.PerformanceTests.TestMath.asfloat_uint3.Arguments args)_5B581B7AAD6CC3EF",@function
"Unity.Mathematics.PerformanceTests.TestMath.asfloat_uint3.BurstTestFunction(ref Unity.Mathematics.PerformanceTests.TestMath.asfloat_uint3.Arguments args)_5B581B7AAD6CC3EF": # @"Unity.Mathematics.PerformanceTests.TestMath.asfloat_uint3.BurstTestFunction(ref Unity.Mathematics.PerformanceTests.TestMath.asfloat_uint3.Arguments args)_5B581B7AAD6CC3EF"
.Lfunc_begin0:
        .file        1 "/Users/dale/code/Unity.Mathematics/src/Unity.Mathematics.PerformanceTests/TestMath.gen.cs"
        .loc        1 1192 0                # TestMath.gen.cs:1192:0
        .cfi_sections .debug_frame
        .cfi_startproc
# %bb.0:                                # %entry
        mov        rax, -4800000
        .p2align        4, 0x90
.LBB0_1:                                # %BL.0004.i
                                        # =>This Inner Loop Header: Depth=1
.Ltmp0:
        .loc        1 1178 49 prologue_end  # TestMath.gen.cs:1178:49
        mov        rcx, qword ptr [rdi + 8]
        mov        rdx, qword ptr [rdi + 16]
        movss        xmm0, dword ptr [rdx + rax + 4800008] # xmm0 = mem[0],zero,zero,zero
        movsd        xmm1, qword ptr [rdx + rax + 4800000] # xmm1 = mem[0],zero
        movss        dword ptr [rcx + rax + 4800000], xmm1
        extractps        dword ptr [rcx + rax + 4800004], xmm1, 1
        movss        dword ptr [rcx + rax + 4800008], xmm0
        .loc        1 1182 13               # TestMath.gen.cs:1182:13
        add        rax, 12
        jne        .LBB0_1
.Ltmp1:
# %bb.2:                                # %"Unity.Mathematics.PerformanceTests.TestMath.asfloat_uint3.CommonTestFunction(ref Unity.Mathematics.PerformanceTests.TestMath.asfloat_uint3.Arguments args)_A3E8969154AF482D.exit"
        .loc        1 1193 13               # TestMath.gen.cs:1193:13
        ret

The code is identical in this case, which is good, but I don't know if this is true for all the code that got changed. I'm going to grab all the Burst-compiled code and then see if a diff surfaces anything of interest.

unpacklo commented 2 years ago

@sschoener I've compared all the Burst codegen and there are no diffs with this code change. Regarding the Mono-generated code for type punning vs. casting in the case of asuint(int), I haven't been able to determine whether one version is in fact faster than the other on my MacBook Pro, but it seems pretty safe to assume that accessing the stack can, at best, only be as fast as the register move.