llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.
http://llvm.org
Other
26.68k stars 10.93k forks source link

[X86] -march=native causes performance regression on znver3 #96701

Open jabraham17 opened 5 days ago

jabraham17 commented 5 days ago

I am noticing a performance regression when comparing -march=none to -march=native with the following kernel.

// compiled with -ffast-math
int64_t nums[N];
float kernel() {
  float sum = 0;
  for (long i = 0; i < N; i ++) {
      const float x = nums[i] % 3 == 0 ?
          (nums[i] % 5 == 0 ? A : B) :
          (nums[i] % 4 == 0 ? C : D);
      const float rx = 1.0f / x;
      sum += rx;
  }
  return sum;
}

Compiling with -march=native results in about a 1.3x slowdown on my system (AMD znver3).

See this link (https://godbolt.org/z/3h8bo36Wq) for an assembly comparison. I compiled all with -mavx2 so that it was more of an apples-to-apples comparison, this way both the versions run with VEX instructions (vdivss/vaddss). I think the issue is that with -march=native/-march=znver3, more aggressive unrolling is done that hurts performance.

Full kernel that runs timing ```c #include <stdio.h> #include <stdlib.h> #include <time.h> #include <sys/time.h> #include <stdint.h> #define A 1.0f #define B 20.0f #define C 25.6f #define D 24.0f #ifndef N #define N 100000000 #endif #ifndef iters #define iters 100 #endif #ifndef ARRAY_TYPE #define ARRAY_TYPE int64_t #endif ARRAY_TYPE nums[N]; float kernel() { float sum = 0; for (long i = 0; i < N; i ++) { const float x = nums[i] % 3 == 0 ? (nums[i] % 5 == 0 ? A : B) : (nums[i] % 4 == 0 ? C : D); const float rx = 1.0f / x; sum += rx; } return sum; } void c_version(int initArray, int printTime, int printCorrectness) { if (initArray) { #ifdef seed srand(seed); #endif for (long i = N-1; i >= N; i--) { nums[i] = rand(); } } float dest[iters]; struct timespec start_time, end_time; clock_gettime(CLOCK_MONOTONIC_RAW, &start_time); for (int i = 0; i < iters; i++) { dest[i] = kernel(); } clock_gettime(CLOCK_MONOTONIC_RAW, &end_time); float elapsed = (end_time.tv_sec - start_time.tv_sec)+ (end_time.tv_nsec - start_time.tv_nsec) / 1000000000.0; if (printTime) printf("c Time: %f\n", elapsed); float sum = 0.0; for (int i = 0; i < iters; i++) { sum += dest[i]; } if (printCorrectness) printf("%f\n", sum); } int main() { c_version(1, 1, 1); return 0; } ```
llvmbot commented 5 days ago

@llvm/issue-subscribers-backend-x86

Author: Jade Abraham (jabraham17)

I am noticing a performance regression when comparing `-march=none` to `-march=native` with the following kernel. ```c // compiled with -ffast-math int64_t nums[N]; float kernel() { float sum = 0; for (long i = 0; i < N; i ++) { const float x = nums[i] % 3 == 0 ? (nums[i] % 5 == 0 ? A : B) : (nums[i] % 4 == 0 ? C : D); const float rx = 1.0f / x; sum += rx; } return sum; } ``` Compiling with`-march=native` results in about a 1.3x slowdown on my system (AMD `znver3`). See [this link](https://godbolt.org/z/3h8bo36Wq) for an assembly comparison. I compiled all with `-mavx2` so that it was more of an apples-to-apples comparison, this way both the versions run with VEX instructions (`vdivss`/`vaddss`). I think the issue is that with `-march=native`/`-march=znver3`, more aggressive unrolling is done that hurts performance. <details> <summary>Full kernel that runs timing</summary> ```c #include <stdio.h> #include <stdlib.h> #include <time.h> #include <sys/time.h> #include <stdint.h> #define A 1.0f #define B 20.0f #define C 25.6f #define D 24.0f #ifndef N #define N 100000000 #endif #ifndef iters #define iters 100 #endif #ifndef ARRAY_TYPE #define ARRAY_TYPE int64_t #endif ARRAY_TYPE nums[N]; float kernel() { float sum = 0; for (long i = 0; i < N; i ++) { const float x = nums[i] % 3 == 0 ? (nums[i] % 5 == 0 ? A : B) : (nums[i] % 4 == 0 ? 
C : D); const float rx = 1.0f / x; sum += rx; } return sum; } void c_version(int initArray, int printTime, int printCorrectness) { if (initArray) { #ifdef seed srand(seed); #endif for (long i = N-1; i >= N; i--) { nums[i] = rand(); } } float dest[iters]; struct timespec start_time, end_time; clock_gettime(CLOCK_MONOTONIC_RAW, &start_time); for (int i = 0; i < iters; i++) { dest[i] = kernel(); } clock_gettime(CLOCK_MONOTONIC_RAW, &end_time); float elapsed = (end_time.tv_sec - start_time.tv_sec)+ (end_time.tv_nsec - start_time.tv_nsec) / 1000000000.0; if (printTime) printf("c Time: %f\n", elapsed); float sum = 0.0; for (int i = 0; i < iters; i++) { sum += dest[i]; } if (printCorrectness) printf("%f\n", sum); } int main() { c_version(1, 1, 1); return 0; } ``` </details>
dtcxzyw commented 5 days ago

cc @topperc @RKSimon

ganeshgit commented 5 days ago

I will take a look.