Open StephanTLavavej opened 4 years ago
@statementreply wrote:
On x86, double hex to_chars performance: MSVC generates 1.7x faster code with /arch:SSE. Oh, I guess I found the culprit. MSVC's SSE2 codegen writes two adjacent 32-bit memory locations, and then performs a 64-bit load from them. https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80833 — looks similar to this GCC bug report.
@statementreply wrote:
On x86, double hex to_chars performance: MSVC generates 1.7x faster code with /arch:SSE. Oh, I guess I found the culprit. MSVC's SSE2 codegen writes two adjacent 32-bit memory locations, and then performs a 64-bit load from them. https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80833 — looks similar to this GCC bug report.
Filed DevCom-1167526.
C:\Users\He\source\test\perf>type shift64_repo.cpp
// Minimal reproduction for the MSVC x86 codegen issue filed as DevCom-1167526:
// the 64-bit shift in the loop below is compiled to two adjacent 32-bit stores
// followed by a 64-bit SIMD load, which defeats store-to-load forwarding.
// NOTE(review): keep the exact shape of this code — it exists to reproduce the
// compiler's output, not to be idiomatic.
using uint64 = unsigned long long;

// Nibble-to-character lookup table, defined elsewhere.
// Assumed to have at least 16 entries ("0123456789abcdef"-style) — TODO confirm.
extern const char* const digits;

// Writes `precision` hex digits of `value` to `str`, most significant nibble
// first (starting at bit 60), and returns the past-the-end output pointer.
// Precondition (implied by the loop): precision >= 1.
char* test_func(char* str, uint64 value, int precision) noexcept {
    int shift = 60;
    for (;;) {
        *str++ = digits[value >> shift]; // emit the digit for the current nibble
        --precision;
        if (precision == 0) {
            break;
        }
        // Clear the nibble just emitted so the next `value >> shift` stays < 16.
        const uint64 mask = (uint64{1} << shift) - 1;
        value &= mask;
        shift -= 4;
    }
    return str;
}
C:\Users\He\source\test\perf>cl /EHsc /W4 /WX /O2 /c /Fa shift64_repo.cpp
用于 x86 的 Microsoft (R) C/C++ 优化编译器 19.28.29213 版
版权所有(C) Microsoft Corporation。保留所有权利。
shift64_repo.cpp
C:\Users\He\source\test\perf>type shift64_repo.asm
; Listing generated by Microsoft (R) Optimizing Compiler Version 19.28.29213.0
TITLE C:\Users\He\source\test\perf\shift64_repo.cpp
.686P
.XMM
include listing.inc
.model flat
INCLUDELIB LIBCMT
INCLUDELIB OLDNAMES
PUBLIC ?test_func@@YAPADPAD_KH@Z ; test_func
EXTRN ?digits@@3QBDB:DWORD ; digits
; Function compile flags: /Ogtpy
; COMDAT ?test_func@@YAPADPAD_KH@Z
_TEXT SEGMENT
_str$ = 8 ; size = 4
_value$ = 12 ; size = 8
_precision$ = 20 ; size = 4
?test_func@@YAPADPAD_KH@Z PROC ; test_func, COMDAT
; File C:\Users\He\source\test\perf\shift64_repo.cpp
;
; MSVC-emitted 32-bit x86 listing for:
;     char* test_func(char* str, uint64 value, int precision)
; Register roles (from the loads below): esi = str, ebp = precision,
; edi = shift, ebx = high dword of value (its low dword lives in memory
; at _value$[esp+...]).
; The first iteration (value >> 60, which only needs the high dword) is
; peeled off before the loop.
; Line 7
mov eax, DWORD PTR ?digits@@3QBDB ; digits
push ebx
mov ebx, DWORD PTR _value$[esp+4] ; ebx = high dword of value
mov ecx, ebx
push ebp
mov ebp, DWORD PTR _precision$[esp+4]
push esi
mov esi, DWORD PTR _str$[esp+8]
shr ecx, 28 ; 0000001cH ; value >> 60 == high dword >> 28
push edi
mov edi, 60 ; 0000003cH ; shift = 60
mov al, BYTE PTR [ecx+eax]
mov BYTE PTR [esi], al ; *str = digits[value >> 60]
inc esi
sub ebp, 1 ; --precision
; Line 10
je SHORT $LN13@test_func
npad 6
$LL2@test_func:
; Line 14
; Emulate the 64-bit (uint64{1} << shift) - 1 with 32-bit ops:
; edx:ecx = 1 << shift (bts sets the bit; cmovae moves it to the high
; dword when shift >= 32), then subtract 1 across both dwords.
xor ecx, ecx
xor edx, edx
mov eax, edi
bts ecx, eax
cmp eax, 32 ; 00000020H
cmovae edx, ecx
xor ecx, edx
cmp eax, 64 ; 00000040H
cmovae edx, ecx
add ecx, -1 ; edx:ecx = mask = (1 << shift) - 1
adc edx, -1
; Line 15
; value &= mask: low dword is masked directly in memory (32-bit store)...
and DWORD PTR _value$[esp+12], ecx
mov ecx, DWORD PTR ?digits@@3QBDB ; digits
and ebx, edx
; Line 16
sub edi, 4 ; shift -= 4
; ...high dword is masked in ebx and stored next to it (second 32-bit store).
mov DWORD PTR _value$[esp+16], ebx
; 64-bit reload of the two dwords just stored: this load cannot be
; store-forwarded, so it waits for both stores to complete — the stall
; this issue is about (10-12 cycles per the linked GCC PR discussion).
movq xmm1, QWORD PTR _value$[esp+12]
movd xmm0, edi
psrlq xmm1, xmm0 ; value >> shift, done in SSE2
movd edx, xmm1
mov cl, BYTE PTR [edx+ecx]
mov BYTE PTR [esi], cl ; *str++ = digits[value >> shift]
inc esi
sub ebp, 1 ; loop while --precision != 0
jne SHORT $LL2@test_func
$LN13@test_func:
pop edi
; Line 20
mov eax, esi ; return str
pop esi
pop ebp
pop ebx
ret 0
?test_func@@YAPADPAD_KH@Z ENDP ; test_func
_TEXT ENDS
END
The following instructions in the loop perform two adjacent 32-bit stores, and then perform a 64-bit load of the stored values. The load doesn't take advantage of store forwarding, and is blocked until the stored values are available. The discussion in the GCC bug report says that the stall may cost 10-12 cycles.
and DWORD PTR _value$[esp+12], ecx
mov ecx, DWORD PTR ?digits@@3QBDB
and ebx, edx
sub edi, 4
mov DWORD PTR _value$[esp+16], ebx
movq xmm1, QWORD PTR _value$[esp+12]
- x86
double
hex 💥. This codepath performs bitwise operations with uint64_t
. Something terrible is happening here, causing code that should be ultra-fast to instead be extremely slow (relatively speaking). This seems specific to the integer type being wider than the machine's preferred width.
Yes, on MSVC in this loop https://github.com/microsoft/STL/blob/b90b3e05d2eb77e3178fb83c2e1eb7d539debf21/stl/inc/charconv#L2260-L2291 This line: https://github.com/microsoft/STL/blob/b90b3e05d2eb77e3178fb83c2e1eb7d539debf21/stl/inc/charconv#L2267
Produced the following call to a helper function:
00254506 call _aullshr (02561D0h)
clang-cl:
0061256C mov eax,ebx
0061256E mov edi,dword ptr [esp+10h]
00612572 shld eax,edi,1Ch
00612576 mov dword ptr [esp+18h],eax
0061257A mov eax,ebx
0061257C shr eax,4
0061257F mov dword ptr [esp+34h],eax
00612583 mov eax,ebx
00612585 shld eax,edi,18h
00612589 mov dword ptr [esp+1Ch],eax
0061258D mov eax,ebx
0061258F shr eax,8
00612592 mov dword ptr [esp+8],eax
00612596 mov eax,ebx
00612598 shld eax,edi,14h
0061259C mov dword ptr [esp+20h],eax
006125A0 mov eax,ebx
006125A2 shld eax,edi,10h
006125A6 mov dword ptr [esp+24h],eax
006125AA mov eax,ebx
006125AC shr eax,0Ch
006125AF mov dword ptr [esp+0Ch],eax
006125B3 mov ecx,ebx
006125B5 shr ecx,10h
006125B8 mov eax,ebx
006125BA shld eax,edi,0Ch
006125BE mov dword ptr [esp+28h],eax
006125C2 mov eax,ebx
006125C4 shld eax,edi,8
006125C8 mov dword ptr [esp+2Ch],eax
006125CC mov eax,edi
006125CE shrd eax,ebx,1Ch
006125D2 mov dword ptr [esp+30h],eax
@StephanTLavavej, do you think it worth further refining and reporting?
@AlexGuteniev Thanks! Yes, I think this is definitely worth refining/reporting, as the perf impact was so significant and (presumably) the fix would be fairly targeted. Generating a call to a library function _aullshr
which performs an Unsigned Long Long Shift Right is definitely unnecessary/harmful, as the clang-cl codegen proves.
See DevCom-1533263
<charconv>
is very fast, but it could be faster. One issue is that MSVC's optimizer often generates slower code for <charconv>
than Clang/LLVM. We should create reduced test cases and file bugs on Developer Community for the optimizer team to investigate and fix. At the bottom of this issue, I've prepared a self-contained test case using
<charconv>
. Here's a table of my timing results (Intel Core i7-8700, VS 2019 16.8 Preview 2, Clang/LLVM 10.0.0). The measured times are average nanoseconds per floating-point value.
float
plain shortestdouble
plain shortestfloat
scientific shortestdouble
scientific shortestfloat
fixed shortestdouble
fixed shortestfloat
general shortestdouble
general shortestfloat
hex shortestdouble
hex shortestfloat
scientific 8double
scientific 16float
fixed 6 (lossy)double
fixed 6 (lossy)float
general 9double
general 17float
hex 6double
hex 13. The "LLVM Speedup Ratio" is the MSVC time divided by the LLVM time, so higher values mean that MSVC is slower than what LLVM has proven is possible on this hardware. I've marked values >= 2.0 with :boom:, values >= 1.2 with :x:, values >= 1.1 with :warning:, and values < 1.0 with :heavy_check_mark: (this is when MSVC is faster).
There are at least two separate issues here:
x86
double
hex :boom:. This codepath performs bitwise operations with uint64_t
. Something terrible is happening here, causing code that should be ultra-fast to instead be extremely slow (relatively speaking). This seems specific to the integer type being wider than the machine's preferred width. The good news is that because this codepath is so simple, it should be easy to extract into a minimal test case, and because the issue is so severe, the cause will hopefully be easy for the optimizer devs to find and fix. The functions are: https://github.com/microsoft/STL/blob/b90b3e05d2eb77e3178fb83c2e1eb7d539debf21/stl/inc/charconv#L2079-L2432
Decimal slowness, from moderate :warning: to significant :x:. These optimizer issues affect @ulfjack's amazing Ryu and Ryu Printf algorithms (used for the "shortest" and "precision" scenarios, respectively, although shortest does need to invoke Ryu Printf in a specific situation). When I worked on importing them to
<charconv>
, I reported as many optimizer issues as I could, MSVC (and LLVM) were able to fix some of them, and I was able to improve the source workarounds for other issues. However, there are still persistent issues with MSVC's optimizer.Bonus issue: Conversely, MSVC appears to be generating noticeably faster code for the
double
hex 13 scenario on x64 :heavy_check_mark:. This indicates that LLVM has room for improvement; an issue could be reduced and reported to them.
<charconv>
test case ("C1XX/C2" refers to the MSVC compiler front-end/back-end, like "Clang/LLVM"):