X86 Codegen differences between Clang and GCC

This post mistakenly assumes that registers are preserved between two separate inline assembler blocks; the compiler is free to emit its own instructions in between. If you inspect the output from clang++ versus g++, you'll notice that clang injects vzeroupper instructions between the inline assembler blocks in sum_int_avx2. Here are the two for comparison:

Output from g++ (GCC) 14.1.1 20240522: g++ -msse2 -mavx -mavx2 avxbench.cpp -o benchmark; objdump -D benchmark -Mintel | grep -A 48 '<_Z12sum_int_avx2RKSt6vectorIiSaIiEE>:'

00000000000013fa <_Z12sum_int_avx2RKSt6vectorIiSaIiEE>:
    13fa:   55                      push   rbp
    13fb:   48 89 e5                mov    rbp,rsp
    13fe:   48 83 ec 50             sub    rsp,0x50
    1402:   48 89 7d b8             mov    QWORD PTR [rbp-0x48],rdi
    1406:   64 48 8b 04 25 28 00    mov    rax,QWORD PTR fs:0x28
    140d:   00 00 
    140f:   48 89 45 f8             mov    QWORD PTR [rbp-0x8],rax
    1413:   31 c0                   xor    eax,eax
    1415:   c5 f9 ef c0             vpxor  xmm0,xmm0,xmm0
    1419:   c5 f9 7f 45 d0          vmovdqa XMMWORD PTR [rbp-0x30],xmm0
    141e:   c5 f9 7f 45 e0          vmovdqa XMMWORD PTR [rbp-0x20],xmm0
    1423:   c5 f5 ef c9             vpxor  ymm1,ymm1,ymm1
    1427:   c7 45 cc 00 00 00 00    mov    DWORD PTR [rbp-0x34],0x0
    142e:   eb 21                   jmp    1451 <_Z12sum_int_avx2RKSt6vectorIiSaIiEE+0x57>
    1430:   8b 45 cc                mov    eax,DWORD PTR [rbp-0x34]
    1433:   48 63 d0                movsxd rdx,eax
    1436:   48 8b 45 b8             mov    rax,QWORD PTR [rbp-0x48]
    143a:   48 89 d6                mov    rsi,rdx
    143d:   48 89 c7                mov    rdi,rax
    1440:   e8 49 09 00 00          call   1d8e <_ZNKSt6vectorIiSaIiEEixEm>
    1445:   c5 fe 6f 00             vmovdqu ymm0,YMMWORD PTR [rax]
    1449:   c5 f5 fe c8             vpaddd ymm1,ymm1,ymm0
    144d:   83 45 cc 08             add    DWORD PTR [rbp-0x34],0x8
    1451:   81 7d cc ff 07 af 2f    cmp    DWORD PTR [rbp-0x34],0x2faf07ff
    1458:   7e d6                   jle    1430 <_Z12sum_int_avx2RKSt6vectorIiSaIiEE+0x36>
    145a:   c5 fe 7f 4d d0          vmovdqu YMMWORD PTR [rbp-0x30],ymm1
    145f:   8b 55 d0                mov    edx,DWORD PTR [rbp-0x30]
    1462:   8b 45 d4                mov    eax,DWORD PTR [rbp-0x2c]
    1465:   01 c2                   add    edx,eax
    1467:   8b 45 d8                mov    eax,DWORD PTR [rbp-0x28]
    146a:   01 c2                   add    edx,eax
    146c:   8b 45 dc                mov    eax,DWORD PTR [rbp-0x24]
    146f:   01 c2                   add    edx,eax
    1471:   8b 45 e0                mov    eax,DWORD PTR [rbp-0x20]
    1474:   01 c2                   add    edx,eax
    1476:   8b 45 e4                mov    eax,DWORD PTR [rbp-0x1c]
    1479:   01 c2                   add    edx,eax
    147b:   8b 45 e8                mov    eax,DWORD PTR [rbp-0x18]
    147e:   01 c2                   add    edx,eax
    1480:   8b 45 ec                mov    eax,DWORD PTR [rbp-0x14]
    1483:   01 d0                   add    eax,edx
    1485:   48 8b 55 f8             mov    rdx,QWORD PTR [rbp-0x8]
    1489:   64 48 2b 14 25 28 00    sub    rdx,QWORD PTR fs:0x28
    1490:   00 00 
    1492:   74 05                   je     1499 <_Z12sum_int_avx2RKSt6vectorIiSaIiEE+0x9f>
    1494:   e8 47 fc ff ff          call   10e0 <__stack_chk_fail@plt>
    1499:   c9                      leave
    149a:   c3                      ret

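Output from clang (invoked here as g++, which appears to be an alias for clang on this macOS system, judging by the Mach-O addresses and libc++ symbol names): g++ -msse2 -mavx -mavx2 avxbench.cpp -o benchmark; objdump -D benchmark -Mintel | grep -A 48 '<__Z12sum_int_avx2RKNSt3__16vectorIiNS_9allocatorIiEEEE>:'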

0000000100001330 <__Z12sum_int_avx2RKNSt3__16vectorIiNS_9allocatorIiEEEE>:
100001330: 55                           push    rbp
100001331: 48 89 e5                     mov rbp, rsp
100001334: 48 83 ec 40                  sub rsp, 0x40
100001338: 48 8b 05 11 2e 00 00         mov rax, qword ptr [rip + 0x2e11] ## 0x100004150 <_time+0x100004150>
10000133f: 48 8b 00                     mov rax, qword ptr [rax]
100001342: 48 89 45 f8                  mov qword ptr [rbp - 0x8], rax
100001346: 48 89 7d c8                  mov qword ptr [rbp - 0x38], rdi
10000134a: 48 8d 7d d0                  lea rdi, [rbp - 0x30]
10000134e: 31 f6                        xor esi, esi
100001350: ba 20 00 00 00               mov edx, 0x20
100001355: e8 04 29 00 00               call    0x100003c5e <_time+0x100003c5e>
10000135a: c5 f5 ef c9                  vpxor   ymm1, ymm1, ymm1
10000135e: c7 45 c4 00 00 00 00         mov dword ptr [rbp - 0x3c], 0x0
100001365: 81 7d c4 00 08 af 2f         cmp dword ptr [rbp - 0x3c], 0x2faf0800
10000136c: 0f 8d 26 00 00 00            jge 0x100001398 <__Z12sum_int_avx2RKNSt3__16vectorIiNS_9allocatorIiEEEE+0x68>
100001372: 48 8b 7d c8                  mov rdi, qword ptr [rbp - 0x38]
100001376: 48 63 75 c4                  movsxd  rsi, dword ptr [rbp - 0x3c]
10000137a: c5 f8 77                     vzeroupper
10000137d: e8 ee fe ff ff               call    0x100001270 <__ZNKSt3__16vectorIiNS_9allocatorIiEEEixB7v160006Em>
100001382: c5 fe 6f 00                  vmovdqu ymm0, ymmword ptr [rax]
100001386: c5 f5 fe c8                  vpaddd  ymm1, ymm1, ymm0
10000138a: 8b 45 c4                     mov eax, dword ptr [rbp - 0x3c]
10000138d: 83 c0 08                     add eax, 0x8
100001390: 89 45 c4                     mov dword ptr [rbp - 0x3c], eax
100001393: e9 cd ff ff ff               jmp 0x100001365 <__Z12sum_int_avx2RKNSt3__16vectorIiNS_9allocatorIiEEEE+0x35>
100001398: c5 fe 7f 4d d0               vmovdqu ymmword ptr [rbp - 0x30], ymm1
10000139d: 8b 45 d0                     mov eax, dword ptr [rbp - 0x30]
1000013a0: 03 45 d4                     add eax, dword ptr [rbp - 0x2c]
1000013a3: 03 45 d8                     add eax, dword ptr [rbp - 0x28]
1000013a6: 03 45 dc                     add eax, dword ptr [rbp - 0x24]
1000013a9: 03 45 e0                     add eax, dword ptr [rbp - 0x20]
1000013ac: 03 45 e4                     add eax, dword ptr [rbp - 0x1c]
1000013af: 03 45 e8                     add eax, dword ptr [rbp - 0x18]
1000013b2: 03 45 ec                     add eax, dword ptr [rbp - 0x14]
1000013b5: 89 45 c0                     mov dword ptr [rbp - 0x40], eax
1000013b8: 48 8b 05 91 2d 00 00         mov rax, qword ptr [rip + 0x2d91] ## 0x100004150 <_time+0x100004150>
1000013bf: 48 8b 00                     mov rax, qword ptr [rax]
1000013c2: 48 8b 4d f8                  mov rcx, qword ptr [rbp - 0x8]
1000013c6: 48 39 c8                     cmp rax, rcx
1000013c9: 0f 85 0c 00 00 00            jne 0x1000013db <__Z12sum_int_avx2RKNSt3__16vectorIiNS_9allocatorIiEEEE+0xab>
1000013cf: 8b 45 c0                     mov eax, dword ptr [rbp - 0x40]
1000013d2: 48 83 c4 40                  add rsp, 0x40
1000013d6: 5d                           pop rbp
1000013d7: c5 f8 77                     vzeroupper
1000013da: c3                           ret
1000013db: c5 f8 77                     vzeroupper
1000013de: e8 75 28 00 00               call    0x100003c58 <_time+0x100003c58>
1000013e3: 66 66 66 66 2e 0f 1f 84 00 00 00 00 00       nop word ptr cs:[rax + rax]

Take note of the following:

; Within the summation loop
10000137a: c5 f8 77                     vzeroupper ; This zeroes the upper 128 bits of every ymm register, including the accumulator ymm1, on each loop iteration
10000137d: e8 ee fe ff ff               call    0x100001270 <__ZNKSt3__16vectorIiNS_9allocatorIiEEEixB7v160006Em>
100001382: c5 fe 6f 00                  vmovdqu ymm0, ymmword ptr [rax]
100001386: c5 f5 fe c8                  vpaddd  ymm1, ymm1, ymm0

Instead of using several separate inline assembly blocks, it's probably best to put the entire benchmark in a separate assembly routine.

carsongoodwin32 / rosetta2_avx_dive

X86 Codegen differences between Clang and GCC #1