[Perf Optimization] Optimized Interlocked.CompareExchange JIT/CodeGeneration with ZF flag respect

dmitriyse commented 4 years ago

Lock-free algorithms are very popular and all of them are basing on the CAS-like CPU instructions:

    public class RefCounterTests
    {
        private long _state = 10;

        [MethodImpl(MethodImplOptions.NoInlining)]
        public void LockFreeMethod()
        {
            Debugger.Break();

            // Classic lock-free state update pattern
            long observedValue = _state;
            long updatedValue, compareValue;
            do
            {
                // Some lock-free update operation

                updatedValue = observedValue;
                if (observedValue % 2 == 0)
                {
                    updatedValue = observedValue * 3 + 11;
                }

                compareValue = observedValue;
                observedValue = Interlocked.CompareExchange(ref _state, updatedValue, compareValue);
            }
            while (observedValue != compareValue);
        }
    }

Release, x86-64 CPU, .Net Core 3.1 JIT Tier1 compiles it to the next CPU instructions:

            Debugger.Break();
00007FF9CCC33C62  sub         esp,30h  
00007FF9CCC33C65  mov         rsi,rcx  
00007FF9CCC33C68  call        00007FF9CB3760A8  

            long observedValue = _state;
00007FF9CCC33C6D  mov         rax,qword ptr [rsi+8]  
00007FF9CCC33C71  add         rsi,8  
            long updatedValue, compareValue;
            do
            {
                // Some lock-free update operation

                updatedValue = observedValue;
00007FF9CCC33C75  mov         rdx,rax  
                if (observedValue % 2 == 0)
00007FF9CCC33C78  mov         rcx,rdx  
00007FF9CCC33C7B  shr         rcx,3Fh  
00007FF9CCC33C7F  add         rcx,rdx  
00007FF9CCC33C82  and         rcx,0FFFFFFFFFFFFFFFEh  
00007FF9CCC33C86  mov         r8,rdx  
00007FF9CCC33C89  sub         r8,rcx  
00007FF9CCC33C8C  jne         00007FF9CCC33C96  
                {
                    updatedValue = observedValue * 3 + 11;
00007FF9CCC33C8E  lea         rdx,[rax+rax*2]  
                {
                    updatedValue = observedValue * 3 + 11;
00007FF9CCC33C92  add         rdx,0Bh  
                }

                compareValue = observedValue;
00007FF9CCC33C96  mov         qword ptr [rsp+28h],rax  
00007FF9CCC33C9B  lock cmpxchg qword ptr [rsi],rdx  
            }
            while (observedValue != compareValue);
00007FF9CCC33CA0  cmp         rax,qword ptr [rsp+28h]  
00007FF9CCC33CA5  jne         00007FF9CCC33C75  
00007FF9CCC33CA7  add         rsp,30h  
00007FF9CCC33CAB  pop         rsi  
00007FF9CCC33CAC  ret

Currently JIT does not re-use ZF (ZFlag) that is set by LOCK CMPXCHG instruction and put redundant CMP instruction. So the current Interlocked.CompareExchange have some room for optimizations at CPU instruction level.

With the additional Interlocked.CompareExchange overload:

    public static class Interlocked
    {
        [MethodImplAttribute(MethodImplOptions.InternalCall)]        
        public static bool CompareExchange(ref long location1, long @value, ref long comparand)
        {
            // Just logic, it's a Hardware intrinsics method.
            var comparandLocal = comparand;
            comparand = Interlocked.CompareExchange(ref location1, value, comparandLocal);
            return comparandLocal == comparand;
        }
    }

The classic lock-free state update loop can be implemented this way:

            var observedValue = Volatile.Read(ref _state);
            long updatedValue;
            do
            {
                // Some lock-free update operation
                updatedValue = observedValue;
                if (observedValue % 2 == 0)
                {
                    updatedValue = observedValue * 3 + 11;
                }
            }
            while (!Interlocked.CompareExchange(ref _state, updatedValue, ref observedValue));

And the JIT could easily map the new bool Interlocked.CompareExchange(ref long, long, ref long) version of CAS to the optimal CPU instructions:

            Debugger.Break();
00007FF9CCC33C62  sub         esp,30h  
00007FF9CCC33C65  mov         rsi,rcx  
00007FF9CCC33C68  call        00007FF9CB3760A8  

            long observedValue = _state;
00007FF9CCC33C6D  mov         rax,qword ptr [rsi+8]  
00007FF9CCC33C71  add         rsi,8  
            long updatedValue, compareValue;
            do
            {
                // Some lock-free update operation

                updatedValue = observedValue;
00007FF9CCC33C75  mov         rdx,rax  
                if (observedValue % 2 == 0)
00007FF9CCC33C78  mov         rcx,rdx  
00007FF9CCC33C7B  shr         rcx,3Fh  
00007FF9CCC33C7F  add         rcx,rdx  
00007FF9CCC33C82  and         rcx,0FFFFFFFFFFFFFFFEh  
00007FF9CCC33C86  mov         r8,rdx  
00007FF9CCC33C89  sub         r8,rcx  
00007FF9CCC33C8C  jne         00007FF9CCC33C96  
                {
                    updatedValue = observedValue * 3 + 11;
00007FF9CCC33C8E  lea         rdx,[rax+rax*2]  
                {
                    updatedValue = observedValue * 3 + 11;
00007FF9CCC33C92  add         rdx,0Bh  
                }

            }
            while (!InterlockedEx.CompareExchange(ref _state, updatedValue, ref observedValue));
// Optimized-Out 00007FF9CCC33C96  mov         qword ptr [rsp+28h],rax  
00007FF9CCC33C9B  lock cmpxchg qword ptr [rsi],rdx  
// Optimized-Out 00007FF9CCC33CA0  cmp         rax,qword ptr [rsp+28h]  
00007FF9CCC33CA5  jne         00007FF9CCC33C75  
00007FF9CCC33CA7  add         rsp,30h  
00007FF9CCC33CAB  pop         rsi  
00007FF9CCC33CAC  ret

The proposed Interlockd.CompareExchange overloads in any case could give benefits to simplify lock-free code. But also could help JIT to generate more optimal CPU instructions. (It's very hard to predict how this JIT improvement will improve the performance, but probably non more than 1%).

dmitriyse commented 4 years ago

This issue is also related to the https://github.com/dotnet/corefx/issues/10481

scalablecory commented 4 years ago

Your "classic pattern" isn't optimal and is inflating the gains. I would instead implement it as:

            long observedValue = _state;
            long updatedValue, compareValue;
            do
            {
                // Some lock-free update operation

                updatedValue = observedValue;
                if (observedValue % 2 == 0)
                {
                    updatedValue = observedValue * 3 + 11;
                }

                compareValue = observedValue;
                observedValue = Interlocked.CompareExchange(ref _state, updatedValue, compareValue);
            }
            while (observedValue != compareValue);

I think ideally JIT would optimize this properly, but even VC++ had a hard time doing it for a long time.

dmitriyse commented 4 years ago

@scalablecory, thank you for the tip about lock-free update pattern. (I have updated the issue)

            long observedValue = _state;
00007FF9CCC33C6D  mov         rax,qword ptr [rsi+8]  
00007FF9CCC33C71  add         rsi,8  
            long updatedValue, compareValue;
            do
            {
                // Some lock-free update operation

                updatedValue = observedValue;
00007FF9CCC33C75  mov         rdx,rax  
                if (observedValue % 2 == 0)
00007FF9CCC33C78  mov         rcx,rdx  
00007FF9CCC33C7B  shr         rcx,3Fh  
00007FF9CCC33C7F  add         rcx,rdx  
00007FF9CCC33C82  and         rcx,0FFFFFFFFFFFFFFFEh  
00007FF9CCC33C86  mov         r8,rdx  
00007FF9CCC33C89  sub         r8,rcx  
00007FF9CCC33C8C  jne         00007FF9CCC33C96  
                {
                    updatedValue = observedValue * 3 + 11;
00007FF9CCC33C8E  lea         rdx,[rax+rax*2]  
                {
                    updatedValue = observedValue * 3 + 11;
00007FF9CCC33C92  add         rdx,0Bh  
                }

                compareValue = observedValue;
00007FF9CCC33C96  mov         qword ptr [rsp+28h],rax  
00007FF9CCC33C9B  lock cmpxchg qword ptr [rsi],rdx  
            }
            while (observedValue != compareValue);
00007FF9CCC33CA0  cmp         rax,qword ptr [rsp+28h]  
00007FF9CCC33CA5  jne         00007FF9CCC33C75  
00007FF9CCC33CA7  add         rsp,30h  
00007FF9CCC33CAB  pop         rsi  
00007FF9CCC33CAC  ret

Your version is little bit more optimal in the beginning of the loop, but the ending part compiles totally the same way.

dotnet-policy-service[bot] commented 1 month ago

Due to lack of recent activity, this issue has been marked as a candidate for backlog cleanup. It will be closed if no further activity occurs within 14 more days. Any new comment (by anyone, not necessarily the author) will undo this process.

This process is part of our issue cleanup automation.

dmitriyse commented 1 month ago

@stephentoub, can you please look at this proposal? .Net runtime uses LOCK CMPXCHG extensively. This proposal can save up to two instructions per Interlocked.CompareExchange call. Even if it's not a major runtime optimization, it may be worth implementing it. Together with other optimizations of a similar kind, each .Net version has a significant performance jump each version.

dotnet / runtime

[Perf Optimization] Optimized Interlocked.CompareExchange JIT/CodeGeneration with ZF flag respect #1435