llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.
http://llvm.org
Other
28.47k stars 11.77k forks source link

Unoptimized constexpr code for std::array #62021

Open StefanoD opened 1 year ago

StefanoD commented 1 year ago

Hello, forgive me if I can't be very precise. I pasted some test code into Godbolt and compared the assembly output of GCC 12.2 and Clang 16.0.0. The assembly generated by Clang was about 3 times larger, so I guess that this code is less optimized.

Here's the code:

#include <array>
#include <algorithm>
#include <stdexcept>
#include <iostream> 

// Fixed-size associative table backed by a std::array of key/value pairs.
// Lookup is a linear scan over the Size stored entries.
template <typename Key, typename Value, std::size_t Size>
struct FlatMap {
    // Takes the entry array by value and moves it into the member.
    // explicit: a bare array does not implicitly convert to a FlatMap.
    constexpr explicit FlatMap(std::array<std::pair<Key, Value>, Size> data) :
            m_data(std::move(data)) {}

    // Returns a copy of the value stored with the first entry whose key
    // compares equal to `key`.
    // Throws std::range_error("Not Found") when no entry matches.
    constexpr Value at(const Key &key) const {
        // Unqualified begin/end are resolved through argument-dependent
        // lookup on the std::array member.
        const auto itr =
                std::find_if(begin(m_data), end(m_data),
                                [&key](const auto &v) { return v.first == key; });
        if (itr != end(m_data)) {
            return itr->second;
        }

        // Reached only when the scan hit end() without a match.
        throw std::range_error("Not Found");
    }

private:
    // Entries in the order they were supplied to the constructor.
    std::array<std::pair<Key, Value>, Size> m_data;
};

// Unscoped enumeration; the enumerators take the implicit values 0..11 in
// declaration order (P == 0 ... k == 11).
// NOTE(review): the names look like chess piece letters — not confirmed here.
enum Foo
{
    P, N, B, R, Q, K, p, n, b, r, q, k
};

// Compile-time lookup table with 12 entries, pairing each character with the
// Foo enumerator of the same spelling ('P' -> P, ..., 'k' -> k).
// The outer braces direct-list-initialize the FlatMap through its explicit
// constructor; the nested braces initialize the std::array argument and its
// element list.
constexpr FlatMap<char, Foo, 12> map {
        {{
                    {'P', P},
                    {'N', N},
                    {'B', B},
                    {'R', R},
                    {'Q', Q},
                    {'K', K},
                    {'p', p},
                    {'n', n},
                    {'b', b},
                    {'r', r},
                    {'q', q},
                    {'k', k}
            }}
};

// Looks up the constant key 'P' in the constexpr table and prints the result.
int main()
{
    // Foo is an unscoped enum, so the value is streamed as an integer;
    // std::endl writes a newline and flushes the stream.
    std::cout << map.at('P') << std::endl;
    return 0;
}

The assembly output of x86_64 GCC 12.2 with -O3 -std=c++20:

main:
  pushq %rbp
  xorl %esi, %esi
  movl $_ZSt4cout, %edi
  pushq %rbx
  subq $8, %rsp
  call std::basic_ostream<char, std::char_traits<char> >::operator<<(int)
  movq %rax, %rbx
  movq (%rax), %rax
  movq -24(%rax), %rax
  movq 240(%rbx,%rax), %rbp
  testq %rbp, %rbp
  je .L7
  cmpb $0, 56(%rbp)
  je .L3
  movsbl 67(%rbp), %esi
.L4:
  movq %rbx, %rdi
  call std::basic_ostream<char, std::char_traits<char> >::put(char)
  movq %rax, %rdi
  call std::basic_ostream<char, std::char_traits<char> >::flush()
  addq $8, %rsp
  xorl %eax, %eax
  popq %rbx
  popq %rbp
  ret
.L3:
  movq %rbp, %rdi
  call std::ctype<char>::_M_widen_init() const
  movq 0(%rbp), %rax
  movl $10, %esi
  movq %rbp, %rdi
  call *48(%rax)
  movsbl %al, %esi
  jmp .L4
.L7:
  call std::__throw_bad_cast()
_GLOBAL__sub_I_main:
  subq $8, %rsp
  movl $_ZStL8__ioinit, %edi
  call std::ios_base::Init::Init() [complete object constructor]
  movl $__dso_handle, %edx
  movl $_ZStL8__ioinit, %esi
  movl $_ZNSt8ios_base4InitD1Ev, %edi
  addq $8, %rsp
  jmp __cxa_atexit

The assembly output of x86_64 Clang 16.0.0 with -O3 -std=c++20:

main: # @main
  pushq %r14
  pushq %rbx
  pushq %rax
  movb $80, 7(%rsp)
  leaq _ZL3map(%rip), %rdi
  leaq 7(%rsp), %rsi
  callq _ZNK7FlatMapIc3FooLm12EE2atERKc
  movq _ZSt4cout@GOTPCREL(%rip), %rdi
  movl %eax, %esi
  callq _ZNSolsEi@PLT
  movq %rax, %rbx
  movq (%rax), %rax
  movq -24(%rax), %rax
  movq 240(%rbx,%rax), %r14
  testq %r14, %r14
  je .LBB0_5
  cmpb $0, 56(%r14)
  je .LBB0_3
  movzbl 67(%r14), %eax
  jmp .LBB0_4
.LBB0_3:
  movq %r14, %rdi
  callq _ZNKSt5ctypeIcE13_M_widen_initEv@PLT
  movq (%r14), %rax
  movq %r14, %rdi
  movl $10, %esi
  callq *48(%rax)
.LBB0_4:
  movsbl %al, %esi
  movq %rbx, %rdi
  callq _ZNSo3putEc@PLT
  movq %rax, %rdi
  callq _ZNSo5flushEv@PLT
  xorl %eax, %eax
  addq $8, %rsp
  popq %rbx
  popq %r14
  retq
.LBB0_5:
  callq _ZSt16__throw_bad_castv@PLT
_ZNK7FlatMapIc3FooLm12EE2atERKc: # @_ZNK7FlatMapIc3FooLm12EE2atERKc
  pushq %r14
  pushq %rbx
  pushq %rax
  movzbl (%rsi), %eax
  cmpb %al, (%rdi)
  jne .LBB1_3
  xorl %ecx, %ecx
  jmp .LBB1_2
.LBB1_3:
  movl $1, %ecx
  cmpb %al, 8(%rdi)
  je .LBB1_2
  movl $2, %ecx
  cmpb %al, 16(%rdi)
  je .LBB1_2
  movl $3, %ecx
  cmpb %al, 24(%rdi)
  je .LBB1_2
  movl $4, %ecx
  cmpb %al, 32(%rdi)
  je .LBB1_2
  movl $5, %ecx
  cmpb %al, 40(%rdi)
  je .LBB1_2
  movl $6, %ecx
  cmpb %al, 48(%rdi)
  je .LBB1_2
  movl $7, %ecx
  cmpb %al, 56(%rdi)
  je .LBB1_2
  movl $8, %ecx
  cmpb %al, 64(%rdi)
  je .LBB1_2
  movl $9, %ecx
  cmpb %al, 72(%rdi)
  je .LBB1_2
  movl $10, %ecx
  cmpb %al, 80(%rdi)
  je .LBB1_2
  movl $11, %ecx
  cmpb %al, 88(%rdi)
  jne .LBB1_14
.LBB1_2:
  movl 4(%rdi,%rcx,8), %eax
  addq $8, %rsp
  popq %rbx
  popq %r14
  retq
.LBB1_14:
  movl $16, %edi
  callq __cxa_allocate_exception@PLT
  movq %rax, %rbx
  leaq .L.str(%rip), %rsi
  movq %rax, %rdi
  callq _ZNSt11range_errorC1EPKc@PLT
  movq _ZTISt11range_error@GOTPCREL(%rip), %rsi
  movq _ZNSt11range_errorD1Ev@GOTPCREL(%rip), %rdx
  movq %rbx, %rdi
  callq __cxa_throw@PLT
  movq %rax, %r14
  movq %rbx, %rdi
  callq __cxa_free_exception@PLT
  movq %r14, %rdi
  callq _Unwind_Resume@PLT
_GLOBAL__sub_I_example.cpp: # @_GLOBAL__sub_I_example.cpp
  pushq %rbx
  leaq _ZStL8__ioinit(%rip), %rbx
  movq %rbx, %rdi
  callq _ZNSt8ios_base4InitC1Ev@PLT
  movq _ZNSt8ios_base4InitD1Ev@GOTPCREL(%rip), %rdi
  leaq __dso_handle(%rip), %rdx
  movq %rbx, %rsi
  popq %rbx
  jmp __cxa_atexit@PLT # TAILCALL
_ZL3map:
  .byte 80 # 0x50
  .zero 3
  .long 0 # 0x0
  .byte 78 # 0x4e
  .zero 3
  .long 1 # 0x1
  .byte 66 # 0x42
  .zero 3
  .long 2 # 0x2
  .byte 82 # 0x52
  .zero 3
  .long 3 # 0x3
  .byte 81 # 0x51
  .zero 3
  .long 4 # 0x4
  .byte 75 # 0x4b
  .zero 3
  .long 5 # 0x5
  .byte 112 # 0x70
  .zero 3
  .long 6 # 0x6
  .byte 110 # 0x6e
  .zero 3
  .long 7 # 0x7
  .byte 98 # 0x62
  .zero 3
  .long 8 # 0x8
  .byte 114 # 0x72
  .zero 3
  .long 9 # 0x9
  .byte 113 # 0x71
  .zero 3
  .long 10 # 0xa
  .byte 107 # 0x6b
  .zero 3
  .long 11 # 0xb

.L.str:
  .asciz "Not Found"

DW.ref.__gxx_personality_v0:
  .quad __gxx_personality_v0
sfjhkjdsh commented 1 year ago

That's just Clang unrolling the loop, which increases the size of the generated asm.

The thing that worries me is that the optimizer couldn't collapse all the instructions here like GCC does: https://godbolt.org/z/5Pfhc8YhT