dotnet / runtime

.NET is a cross-platform runtime for cloud, mobile, desktop, and IoT apps.
https://docs.microsoft.com/dotnet/core/
MIT License
14.82k stars 4.61k forks source link

RyuJIT is not eliminating known constant typed branches (dead code) #6209

Closed redknightlois closed 4 years ago

redknightlois commented 8 years ago

For some reason this code is not being optimized:

        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static uint LZ4_hashPosition<TTableType>(byte* sequence)
            where TTableType : ITableTypeDirective
        {
            ulong element = *((ulong*)sequence);
            if (typeof(TTableType) == typeof(ByU16))
            {
                int value = (int)(element * prime5bytes >> (40 - ByU16HashLog));
                return (uint)(value & ByU16HashMask);
            }
            else if (typeof(TTableType) == typeof(ByU32))
            {
                int value = (int)(element * prime5bytes >> (40 - ByU32HashLog));
                return (uint)(value & ByU32HashMask);
            }

            throw new NotSupportedException("TTableType directive is not supported.");
        }

As you can see in the assembler, it should have evicted the whole thing when generating the code for this method. What it is strange is that TTableType is known at the caller site to be a constant already.

            ulong element = *((ulong*)sequence);
00007FFD5F175E22  sub         esp,30h  
00007FFD5F175E25  mov         qword ptr [rsp+28h],rcx  
00007FFD5F175E2A  mov         rdx,qword ptr [rdx]  
            if (typeof(TTableType) == typeof(ByU16))
00007FFD5F175E2D  mov         rax,qword ptr [rcx+10h]  
00007FFD5F175E31  mov         rax,qword ptr [rax]  
00007FFD5F175E34  test        eax,1  
00007FFD5F175E39  jne         00007FFD5F175E3D  
00007FFD5F175E3B  jmp         00007FFD5F175E44  
00007FFD5F175E3D  mov         rax,qword ptr [rax-1]  
00007FFD5F175E44  mov         r8,7FFD5F25AC28h  
00007FFD5F175E4E  cmp         rax,r8  
00007FFD5F175E51  jne         00007FFD5F175E70  
            {
                int value = (int)(element * prime5bytes >> (40 - ByU16HashLog));
00007FFD5F175E53  mov         rax,0CF1BBCDCBBh  
00007FFD5F175E5D  imul        rax,rdx  
00007FFD5F175E61  shr         rax,1Bh  
00007FFD5F175E65  and         eax,1FFFh  
00007FFD5F175E6A  add         rsp,30h  
00007FFD5F175E6E  pop         rsi  
00007FFD5F175E6F  ret  
            }
            else if (typeof(TTableType) == typeof(ByU32))
00007FFD5F175E70  mov         rax,qword ptr [rcx+10h]  
00007FFD5F175E74  mov         rax,qword ptr [rax]  
00007FFD5F175E77  test        eax,1  
00007FFD5F175E7C  jne         00007FFD5F175E80  
00007FFD5F175E7E  jmp         00007FFD5F175E87  
00007FFD5F175E80  mov         rax,qword ptr [rax-1]  
00007FFD5F175E87  mov         rcx,7FFD5F25A900h  
00007FFD5F175E91  cmp         rax,rcx  
00007FFD5F175E94  jne         00007FFD5F175EB5  
            {
                int value = (int)(element * prime5bytes >> (40 - ByU32HashLog));
00007FFD5F175E96  mov         rax,0CF1BBCDCBBh  
00007FFD5F175EA0  imul        rdx,rax  
00007FFD5F175EA4  shr         rdx,1Ch  
00007FFD5F175EA8  mov         eax,edx  
00007FFD5F175EAA  and         eax,0FFFh  
00007FFD5F175EAF  add         rsp,30h  
00007FFD5F175EB3  pop         rsi  
00007FFD5F175EB4  ret  
            }

            throw new NotSupportedException("TTableType directive is not supported.");
00007FFD5F175EB5  mov         rcx,7FFDB7516780h  
00007FFD5F175EBF  call        00007FFDBEE63050  
00007FFD5F175EC4  mov         rsi,rax  
00007FFD5F175EC7  mov         ecx,171Eh  
00007FFD5F175ECC  mov         rdx,7FFD5F1BD678h  
00007FFD5F175ED6  call        00007FFDBEB7C6E8  
00007FFD5F175EDB  mov         rdx,rax  
00007FFD5F175EDE  mov         rcx,rsi  
00007FFD5F175EE1  call        00007FFDB716CDE0  
00007FFD5F175EE6  mov         rcx,rsi  
00007FFD5F175EE9  call        00007FFDBEB7EEA8  
00007FFD5F175EEE  int         3  

And do the call with something like this instead:

               ulong element = *((ulong*)sequence);
00007FFD5F175E22  sub         esp,30h  
00007FFD5F175E2A  mov         rdx,qword ptr [rdx]  
                int value = (int)(element * prime5bytes >> (40 - ByU16HashLog));
00007FFD5F175E53  mov         rax,0CF1BBCDCBBh  
00007FFD5F175E5D  imul        rax,rdx  
00007FFD5F175E61  shr         rax,1Bh  
00007FFD5F175E65  and         eax,1FFFh  
00007FFD5F175E6A  add         rsp,30h  
00007FFD5F175E6E  pop         rsi  
00007FFD5F175E6F  ret  

So the question remains, could it be the way I am defining the 'directive' classes the problem?

        private interface ITableTypeDirective { };
        private sealed class ByU32 : ITableTypeDirective { };
        private sealed class ByU16 : ITableTypeDirective { };

I know this is a somewhat 'unorthodox' use of C# generics, but it is just too tempting to be able to optimize an algorithm that we need it to be portable without having to write C++ for all the architectures supported by CoreCLR.

@CarolEidt If you want to look for potential general optimization opportunities, the actual code can be found at: https://github.com/Corvalius/ravendb/blob/lz4-131/src/Sparrow/Compression/LZ4.cs

mikedn commented 8 years ago

Given that the method is not inlined and those "directives" are classes rather than structs there's no (reasonable) way to optimize this. This trick requires structs to work the way you want it to work.

redknightlois commented 8 years ago

I'll try that with structs in a few hours and see if it works.

redknightlois commented 8 years ago

The struct trick did it. Probably this should be documented properly :) BTW the assembler emitted now is "beautiful" :dart:

cmckinsey commented 8 years ago

When the type is instantiated based on non value-types, shared code is generated which is not possible to specialize for the particular type. Hence you saw the code that was required to do runtime type checking to determine which branch to execute. However when the type was a value-type, the runtime creates specialized generic methods for the type. The JIT in morph has an optimization to fold away such checks at code-generation time and eliminate the branch. Closing this as this is the expected behavior.