carsongoodwin32 / rosetta2_avx_dive

Rosetta2 AVX Implementation Deep Dive
4 stars 0 forks source link

X86 Codegen differences between Clang and GCC #1

Open vyuuui opened 5 months ago

vyuuui commented 5 months ago

This post mistakenly assumes that registers will not be clobbered between two inline assembler blocks. If you inspect the output from clang++ versus g++, you'll notice that clang injects vzeroupper instructions between the inline assembler blocks in sum_int_avx2. Here are the two for comparison:

Output from g++ (GCC) 14.1.1 20240522: g++ -msse2 -mavx -mavx2 avxbench.cpp -o benchmark; objdump -D benchmark -Mintel | grep -A 48 '<_Z12sum_int_avx2RKSt6vectorIiSaIiEE>:'

00000000000013fa <_Z12sum_int_avx2RKSt6vectorIiSaIiEE>:
    13fa:   55                      push   rbp
    13fb:   48 89 e5                mov    rbp,rsp
    13fe:   48 83 ec 50             sub    rsp,0x50
    1402:   48 89 7d b8             mov    QWORD PTR [rbp-0x48],rdi
    1406:   64 48 8b 04 25 28 00    mov    rax,QWORD PTR fs:0x28
    140d:   00 00 
    140f:   48 89 45 f8             mov    QWORD PTR [rbp-0x8],rax
    1413:   31 c0                   xor    eax,eax
    1415:   c5 f9 ef c0             vpxor  xmm0,xmm0,xmm0
    1419:   c5 f9 7f 45 d0          vmovdqa XMMWORD PTR [rbp-0x30],xmm0
    141e:   c5 f9 7f 45 e0          vmovdqa XMMWORD PTR [rbp-0x20],xmm0
    1423:   c5 f5 ef c9             vpxor  ymm1,ymm1,ymm1
    1427:   c7 45 cc 00 00 00 00    mov    DWORD PTR [rbp-0x34],0x0
    142e:   eb 21                   jmp    1451 <_Z12sum_int_avx2RKSt6vectorIiSaIiEE+0x57>
    1430:   8b 45 cc                mov    eax,DWORD PTR [rbp-0x34]
    1433:   48 63 d0                movsxd rdx,eax
    1436:   48 8b 45 b8             mov    rax,QWORD PTR [rbp-0x48]
    143a:   48 89 d6                mov    rsi,rdx
    143d:   48 89 c7                mov    rdi,rax
    1440:   e8 49 09 00 00          call   1d8e <_ZNKSt6vectorIiSaIiEEixEm>
    1445:   c5 fe 6f 00             vmovdqu ymm0,YMMWORD PTR [rax]
    1449:   c5 f5 fe c8             vpaddd ymm1,ymm1,ymm0
    144d:   83 45 cc 08             add    DWORD PTR [rbp-0x34],0x8
    1451:   81 7d cc ff 07 af 2f    cmp    DWORD PTR [rbp-0x34],0x2faf07ff
    1458:   7e d6                   jle    1430 <_Z12sum_int_avx2RKSt6vectorIiSaIiEE+0x36>
    145a:   c5 fe 7f 4d d0          vmovdqu YMMWORD PTR [rbp-0x30],ymm1
    145f:   8b 55 d0                mov    edx,DWORD PTR [rbp-0x30]
    1462:   8b 45 d4                mov    eax,DWORD PTR [rbp-0x2c]
    1465:   01 c2                   add    edx,eax
    1467:   8b 45 d8                mov    eax,DWORD PTR [rbp-0x28]
    146a:   01 c2                   add    edx,eax
    146c:   8b 45 dc                mov    eax,DWORD PTR [rbp-0x24]
    146f:   01 c2                   add    edx,eax
    1471:   8b 45 e0                mov    eax,DWORD PTR [rbp-0x20]
    1474:   01 c2                   add    edx,eax
    1476:   8b 45 e4                mov    eax,DWORD PTR [rbp-0x1c]
    1479:   01 c2                   add    edx,eax
    147b:   8b 45 e8                mov    eax,DWORD PTR [rbp-0x18]
    147e:   01 c2                   add    edx,eax
    1480:   8b 45 ec                mov    eax,DWORD PTR [rbp-0x14]
    1483:   01 d0                   add    eax,edx
    1485:   48 8b 55 f8             mov    rdx,QWORD PTR [rbp-0x8]
    1489:   64 48 2b 14 25 28 00    sub    rdx,QWORD PTR fs:0x28
    1490:   00 00 
    1492:   74 05                   je     1499 <_Z12sum_int_avx2RKSt6vectorIiSaIiEE+0x9f>
    1494:   e8 47 fc ff ff          call   10e0 <__stack_chk_fail@plt>
    1499:   c9                      leave
    149a:   c3                      ret

Output from clang: g++ -msse2 -mavx -mavx2 avxbench.cpp -o benchmark; objdump -D benchmark -Mintel | grep -A 48 '<__Z12sum_int_avx2RKNSt3__16vectorIiNS_9allocatorIiEEEE>:'

0000000100001330 <__Z12sum_int_avx2RKNSt3__16vectorIiNS_9allocatorIiEEEE>:
100001330: 55                           push    rbp
100001331: 48 89 e5                     mov rbp, rsp
100001334: 48 83 ec 40                  sub rsp, 0x40
100001338: 48 8b 05 11 2e 00 00         mov rax, qword ptr [rip + 0x2e11] ## 0x100004150 <_time+0x100004150>
10000133f: 48 8b 00                     mov rax, qword ptr [rax]
100001342: 48 89 45 f8                  mov qword ptr [rbp - 0x8], rax
100001346: 48 89 7d c8                  mov qword ptr [rbp - 0x38], rdi
10000134a: 48 8d 7d d0                  lea rdi, [rbp - 0x30]
10000134e: 31 f6                        xor esi, esi
100001350: ba 20 00 00 00               mov edx, 0x20
100001355: e8 04 29 00 00               call    0x100003c5e <_time+0x100003c5e>
10000135a: c5 f5 ef c9                  vpxor   ymm1, ymm1, ymm1
10000135e: c7 45 c4 00 00 00 00         mov dword ptr [rbp - 0x3c], 0x0
100001365: 81 7d c4 00 08 af 2f         cmp dword ptr [rbp - 0x3c], 0x2faf0800
10000136c: 0f 8d 26 00 00 00            jge 0x100001398 <__Z12sum_int_avx2RKNSt3__16vectorIiNS_9allocatorIiEEEE+0x68>
100001372: 48 8b 7d c8                  mov rdi, qword ptr [rbp - 0x38]
100001376: 48 63 75 c4                  movsxd  rsi, dword ptr [rbp - 0x3c]
10000137a: c5 f8 77                     vzeroupper
10000137d: e8 ee fe ff ff               call    0x100001270 <__ZNKSt3__16vectorIiNS_9allocatorIiEEEixB7v160006Em>
100001382: c5 fe 6f 00                  vmovdqu ymm0, ymmword ptr [rax]
100001386: c5 f5 fe c8                  vpaddd  ymm1, ymm1, ymm0
10000138a: 8b 45 c4                     mov eax, dword ptr [rbp - 0x3c]
10000138d: 83 c0 08                     add eax, 0x8
100001390: 89 45 c4                     mov dword ptr [rbp - 0x3c], eax
100001393: e9 cd ff ff ff               jmp 0x100001365 <__Z12sum_int_avx2RKNSt3__16vectorIiNS_9allocatorIiEEEE+0x35>
100001398: c5 fe 7f 4d d0               vmovdqu ymmword ptr [rbp - 0x30], ymm1
10000139d: 8b 45 d0                     mov eax, dword ptr [rbp - 0x30]
1000013a0: 03 45 d4                     add eax, dword ptr [rbp - 0x2c]
1000013a3: 03 45 d8                     add eax, dword ptr [rbp - 0x28]
1000013a6: 03 45 dc                     add eax, dword ptr [rbp - 0x24]
1000013a9: 03 45 e0                     add eax, dword ptr [rbp - 0x20]
1000013ac: 03 45 e4                     add eax, dword ptr [rbp - 0x1c]
1000013af: 03 45 e8                     add eax, dword ptr [rbp - 0x18]
1000013b2: 03 45 ec                     add eax, dword ptr [rbp - 0x14]
1000013b5: 89 45 c0                     mov dword ptr [rbp - 0x40], eax
1000013b8: 48 8b 05 91 2d 00 00         mov rax, qword ptr [rip + 0x2d91] ## 0x100004150 <_time+0x100004150>
1000013bf: 48 8b 00                     mov rax, qword ptr [rax]
1000013c2: 48 8b 4d f8                  mov rcx, qword ptr [rbp - 0x8]
1000013c6: 48 39 c8                     cmp rax, rcx
1000013c9: 0f 85 0c 00 00 00            jne 0x1000013db <__Z12sum_int_avx2RKNSt3__16vectorIiNS_9allocatorIiEEEE+0xab>
1000013cf: 8b 45 c0                     mov eax, dword ptr [rbp - 0x40]
1000013d2: 48 83 c4 40                  add rsp, 0x40
1000013d6: 5d                           pop rbp
1000013d7: c5 f8 77                     vzeroupper
1000013da: c3                           ret
1000013db: c5 f8 77                     vzeroupper
1000013de: e8 75 28 00 00               call    0x100003c58 <_time+0x100003c58>
1000013e3: 66 66 66 66 2e 0f 1f 84 00 00 00 00 00       nop word ptr cs:[rax + rax]

Take note of the following:

; Within the summation loop
10000137a: c5 f8 77                     vzeroupper ; This zeroes the upper 128 bits of every ymm register, including the accumulator (ymm1)
10000137d: e8 ee fe ff ff               call    0x100001270 <__ZNKSt3__16vectorIiNS_9allocatorIiEEEixB7v160006Em>
100001382: c5 fe 6f 00                  vmovdqu ymm0, ymmword ptr [rax]
100001386: c5 f5 fe c8                  vpaddd  ymm1, ymm1, ymm0

Instead of splitting the work across several inline assembly blocks, it's probably best to put the entire benchmark in a separate assembly routine.

vyuuui commented 5 months ago

I should also note that the SysV ABI marks all SIMD registers as "not preserved across function calls", and since vector<int>::operator[] is a function (albeit a very simple one), the call to it is yet another place where those registers can end up clobbered. At the very least, the liveness of ymm0/ymm1 does not persist across the loop, let alone the whole function. All this to say, this benchmark is relying on undefined behavior to produce the correct sum values.