Benchmark for measuring the performance of sparse and irregular memory access.
🐛 [BUG] - Not Generating AVX Instructions #209

Open radelja opened 2 months ago

radelja commented 2 months ago

💿 OS

Ubuntu 22

🏗️ Architecture

x86-64

📝 Description

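The #pragma omp simd OpenMP directives in Configuration.cc do not result in AVX instructions when building with the GNU C++ compiler on ADL and CSL, even when -O3 is specified (-DCMAKE_CXX_FLAGS="-O3"). As the disassembly in the Logs section shows, the inner loop of the scatter kernel still uses scalar SSE MOVSD loads and stores.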

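To generate AVX instructions, I had to pass the -mavx flag explicitly by building with -DCMAKE_CXX_FLAGS="-O3 -mavx", because GCC targets the baseline x86-64 instruction set (SSE2) unless an -m/-march option enables AVX. With that flag, the GNU C++ compiler generates VEX-encoded VMOVSD instructions instead of MOVSD instructions in the Spatter kernels.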

🔄 Reproduction steps

  1. Go to the project root directory.
  2. Configure and build the OpenMP backend with cmake -DCMAKE_CXX_FLAGS="-O3" -DUSE_OPENMP=1 -B build_openmp -S . && make -j$(nproc) -C build_openmp
  3. Disassemble the OpenMP scatter kernel with objdump --disassemble=_ZN7Spatter13ConfigurationINS_6OpenMPEE7scatterEbm._omp_fn.0 ./build_openmp/spatter

💬 Logs

./build_openmp/spatter:     file format elf64-x86-64

Disassembly of section .init:

Disassembly of section .plt:

Disassembly of section .plt.got:

Disassembly of section .plt.sec:

Disassembly of section .text:

00000000000122f0 <_ZN7Spatter13ConfigurationINS_6OpenMPEE7scatterEbm._omp_fn.0>:
   122f0:   f3 0f 1e fa             endbr64 
   122f4:   41 55                   push   %r13
   122f6:   41 54                   push   %r12
   122f8:   55                      push   %rbp
   122f9:   48 89 fd                mov    %rdi,%rbp
   122fc:   53                      push   %rbx
   122fd:   48 83 ec 08             sub    $0x8,%rsp
   12301:   4c 8b 2f                mov    (%rdi),%r13
   12304:   e8 07 4a ff ff          call   6d10 <omp_get_thread_num@plt>
   12309:   49 8b 9d 28 01 00 00    mov    0x128(%r13),%rbx
   12310:   48 85 db                test   %rbx,%rbx
   12313:   0f 84 be 00 00 00       je     123d7 <_ZN7Spatter13ConfigurationINS_6OpenMPEE7scatterEbm._omp_fn.0+0xe7>
   12319:   41 89 c4                mov    %eax,%r12d
   1231c:   e8 ff 4a ff ff          call   6e20 <omp_get_num_threads@plt>
   12321:   31 d2                   xor    %edx,%edx
   12323:   49 63 cc                movslq %r12d,%rcx
   12326:   48 63 f0                movslq %eax,%rsi
   12329:   48 89 d8                mov    %rbx,%rax
   1232c:   48 f7 f6                div    %rsi
   1232f:   48 39 d1                cmp    %rdx,%rcx
   12332:   0f 82 aa 00 00 00       jb     123e2 <_ZN7Spatter13ConfigurationINS_6OpenMPEE7scatterEbm._omp_fn.0+0xf2>
   12338:   48 89 c6                mov    %rax,%rsi
   1233b:   48 0f af f1             imul   %rcx,%rsi
   1233f:   4c 8d 14 16             lea    (%rsi,%rdx,1),%r10
   12343:   4e 8d 1c 10             lea    (%rax,%r10,1),%r11
   12347:   4d 39 da                cmp    %r11,%r10
   1234a:   0f 83 87 00 00 00       jae    123d7 <_ZN7Spatter13ConfigurationINS_6OpenMPEE7scatterEbm._omp_fn.0+0xe7>
   12350:   49 8b 85 98 00 00 00    mov    0x98(%r13),%rax
   12357:   49 8b 95 e8 00 00 00    mov    0xe8(%r13),%rdx
   1235e:   48 8b 75 08             mov    0x8(%rbp),%rsi
   12362:   49 8b 9d 20 01 00 00    mov    0x120(%r13),%rbx
   12369:   4c 8b 00                mov    (%rax),%r8
   1236c:   48 8d 04 49             lea    (%rcx,%rcx,2),%rax
   12370:   48 8b 0a                mov    (%rdx),%rcx
   12373:   49 8b ad 00 01 00 00    mov    0x100(%r13),%rbp
   1237a:   48 8d 04 c1             lea    (%rcx,%rax,8),%rax
   1237e:   4c 8b 20                mov    (%rax),%r12
   12381:   48 85 f6                test   %rsi,%rsi
   12384:   74 51                   je     123d7 <_ZN7Spatter13ConfigurationINS_6OpenMPEE7scatterEbm._omp_fn.0+0xe7>
   12386:   4c 89 d1                mov    %r10,%rcx
   12389:   4d 8b 4d 50             mov    0x50(%r13),%r9
   1238d:   48 0f af cd             imul   %rbp,%rcx
   12391:   0f 1f 80 00 00 00 00    nopl   0x0(%rax)
   12398:   4c 89 d0                mov    %r10,%rax
   1239b:   31 d2                   xor    %edx,%edx
   1239d:   48 f7 f3                div    %rbx
   123a0:   31 c0                   xor    %eax,%eax
   123a2:   48 0f af d6             imul   %rsi,%rdx
   123a6:   49 8d 3c d4             lea    (%r12,%rdx,8),%rdi
   123aa:   66 0f 1f 44 00 00       nopw   0x0(%rax,%rax,1)
   123b0:   49 8b 14 c1             mov    (%r9,%rax,8),%rdx
   123b4:   f2 0f 10 04 c7          movsd  (%rdi,%rax,8),%xmm0
   123b9:   48 83 c0 01             add    $0x1,%rax
   123bd:   48 01 ca                add    %rcx,%rdx
   123c0:   f2 41 0f 11 04 d0       movsd  %xmm0,(%r8,%rdx,8)
   123c6:   48 39 c6                cmp    %rax,%rsi
   123c9:   75 e5                   jne    123b0 <_ZN7Spatter13ConfigurationINS_6OpenMPEE7scatterEbm._omp_fn.0+0xc0>
   123cb:   49 83 c2 01             add    $0x1,%r10
   123cf:   48 01 e9                add    %rbp,%rcx
   123d2:   4d 39 d3                cmp    %r10,%r11
   123d5:   75 c1                   jne    12398 <_ZN7Spatter13ConfigurationINS_6OpenMPEE7scatterEbm._omp_fn.0+0xa8>
   123d7:   48 83 c4 08             add    $0x8,%rsp
   123db:   5b                      pop    %rbx
   123dc:   5d                      pop    %rbp
   123dd:   41 5c                   pop    %r12
   123df:   41 5d                   pop    %r13
   123e1:   c3                      ret    
   123e2:   48 83 c0 01             add    $0x1,%rax
   123e6:   31 d2                   xor    %edx,%edx
   123e8:   e9 4b ff ff ff          jmp    12338 <_ZN7Spatter13ConfigurationINS_6OpenMPEE7scatterEbm._omp_fn.0+0x48>

