I am noticing a performance regression when comparing `-march=none` to `-march=native` with the following kernel.
```c
// compiled with -ffast-math
int64_t nums[N];

float kernel() {
    float sum = 0;
    for (long i = 0; i < N; i++) {
        const float x = nums[i] % 3 == 0 ?
            (nums[i] % 5 == 0 ? A : B) :
            (nums[i] % 4 == 0 ? C : D);
        const float rx = 1.0f / x;
        sum += rx;
    }
    return sum;
}
```
Compiling with `-march=native` results in about a 1.3x slowdown on my system (AMD `znver3`).
See [this link](https://godbolt.org/z/3h8bo36Wq) for an assembly comparison. I compiled both versions with `-mavx2` to keep the comparison apples-to-apples; this way both run with VEX instructions (`vdivss`/`vaddss`). I think the issue is that with `-march=native`/`-march=znver3`, the compiler unrolls the loop more aggressively, and that hurts performance.
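One way to test the unrolling hypothesis (a minimal sketch; it assumes GCC 8 or newer, where `#pragma GCC unroll` is available, and `kernel_no_unroll` is just an illustrative name) is to pin the unroll factor to 1 and re-time the kernel under `-march=znver3`:

```c
// Same kernel, with unrolling explicitly disabled for this loop (GCC 8+).
// If the -march=znver3 regression disappears, unrolling is the culprit.
float kernel_no_unroll(void) {
    float sum = 0;
#pragma GCC unroll 1  // 0 or 1 means "do not unroll"
    for (long i = 0; i < N; i++) {
        const float x = nums[i] % 3 == 0 ?
            (nums[i] % 5 == 0 ? A : B) :
            (nums[i] % 4 == 0 ? C : D);
        sum += 1.0f / x;
    }
    return sum;
}
```

If the rolled version matches the baseline timing, that would point at unrolling rather than, say, a different division strategy.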
<details>
<summary>Full kernel that runs timing</summary>

```c
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <stdint.h>

#define A 1.0f
#define B 20.0f
#define C 25.6f
#define D 24.0f

#ifndef N
#define N 100000000
#endif
#ifndef iters
#define iters 100
#endif
#ifndef ARRAY_TYPE
#define ARRAY_TYPE int64_t
#endif

ARRAY_TYPE nums[N];

float kernel() {
    float sum = 0;
    for (long i = 0; i < N; i++) {
        const float x = nums[i] % 3 == 0 ?
            (nums[i] % 5 == 0 ? A : B) :
            (nums[i] % 4 == 0 ? C : D);
        const float rx = 1.0f / x;
        sum += rx;
    }
    return sum;
}

void c_version(int initArray, int printTime, int printCorrectness) {
    if (initArray) {
#ifdef seed
        srand(seed);
#endif
        // fill the whole array with random values
        for (long i = N - 1; i >= 0; i--) {
            nums[i] = rand();
        }
    }
    float dest[iters];
    struct timespec start_time, end_time;
    clock_gettime(CLOCK_MONOTONIC_RAW, &start_time);
    for (int i = 0; i < iters; i++) {
        dest[i] = kernel();
    }
    clock_gettime(CLOCK_MONOTONIC_RAW, &end_time);
    float elapsed = (end_time.tv_sec - start_time.tv_sec) + (end_time.tv_nsec - start_time.tv_nsec) / 1000000000.0;
    if (printTime) printf("c Time: %f\n", elapsed);
    // sum the per-iteration results so the kernel calls cannot be optimized away
    float sum = 0.0;
    for (int i = 0; i < iters; i++) {
        sum += dest[i];
    }
    if (printCorrectness) printf("%f\n", sum);
}

int main() {
    c_version(1, 1, 1);
    return 0;
}
```
</details>
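For what it's worth, here is my understanding of why unrolling and `-ffast-math` interact here at all: `-ffast-math` lets the compiler reassociate the floating-point reduction, so an unrolled loop can split `sum` across several independent accumulators instead of one serial dependency chain. A hand-written sketch of that transformation (the 4-way factor and the `x_of` helper are illustrative, not necessarily what GCC emits):

```c
// Hypothetical helper: the same select as in the original kernel.
static inline float x_of(int64_t v) {
    return v % 3 == 0 ? (v % 5 == 0 ? A : B)
                      : (v % 4 == 0 ? C : D);
}

// Illustrative 4-way unrolled, reassociated reduction. -ffast-math permits
// this rewrite; without it the additions would have to stay in source order.
float kernel_unrolled4(void) {
    float s0 = 0, s1 = 0, s2 = 0, s3 = 0;
    long i = 0;
    for (; i + 4 <= N; i += 4) {
        s0 += 1.0f / x_of(nums[i + 0]);
        s1 += 1.0f / x_of(nums[i + 1]);
        s2 += 1.0f / x_of(nums[i + 2]);
        s3 += 1.0f / x_of(nums[i + 3]);
    }
    float sum = (s0 + s1) + (s2 + s3);
    for (; i < N; i++) {
        sum += 1.0f / x_of(nums[i]);  // scalar tail
    }
    return sum;
}
```

So the surprising part is not that unrolling happens, but that the `-march=znver3` unroll choice ends up slower than the generic one.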