Open vfdff opened 4 months ago
#include <complex.h> #include <math.h> using namespace std;
void cmul2 (complex __restrict a, complex restrict b, complex * restrict c) { for (int i=0; i<1000; i++) { c[i] = a[i] * b[i]; } }
* gcc:
cmul2(std::complex, std::complex, std::complex*): mov x3, 16000 add x3, x2, x3 .L2: ld2 {v30.2d - v31.2d}, [x0], 32 ld2 {v28.2d - v29.2d}, [x1], 32 fmul v27.2d, v31.2d, v28.2d fmul v26.2d, v30.2d, v28.2d fmla v27.2d, v30.2d, v29.2d fmls v26.2d, v29.2d, v31.2d st2 {v26.2d - v27.2d}, [x2], 32 cmp x3, x2 bne .L2 ret
* llvm:
cmul2(std::complex, std::complex, std::complex): // @cmul2(std::complex, std::complex, std::complex) mov x8, #-16000 // =0xffffffffffffc180 mov w9, #16000 // =0x3e80 .LBB0_1: // =>This Inner Loop Header: Depth=1 add x10, x0, x8 add x11, x1, x8 add x10, x10, x9 add x11, x11, x9 ld2 { v0.2d, v1.2d }, [x10] ld2 { v2.2d, v3.2d }, [x11] add x10, x2, x8 add x10, x10, x9 adds x8, x8, #32 fmul v4.2d, v2.2d, v0.2d fmul v5.2d, v2.2d, v1.2d fmls v4.2d, v3.2d, v1.2d fmla v5.2d, v3.2d, v0.2d st2 { v4.2d, v5.2d }, [x10] b.ne .LBB0_1 ret
* It seems the issue is related to the type **std::complex<double>**: when we change it to **int**, the issue doesn't appear. See https://gcc.godbolt.org/z/zzxeKPsvo
@llvm/issue-subscribers-backend-aarch64
Author: Allen (vfdff)
Have you measured the perf difference?
// Element-wise complex multiply: c[i] = a[i] * b[i] for the first 1000
// elements.
//
// The element type and the pointer declarators were lost when this page was
// extracted. They are restored from the report itself, which names
// std::complex<double> as the triggering type and whose assembly listings
// walk a, b and c as three arrays.
//
// a, b: input arrays, each holding at least 1000 elements.
// c:    output array holding at least 1000 elements; __restrict promises the
//       compiler that it does not overlap a or b.
void cmul2 (std::complex<double> *__restrict a,
            std::complex<double> *__restrict b,
            std::complex<double> *__restrict c) {
  for (int i=0; i<1000; i++) {
    c[i] = a[i] * b[i];
  }
}
// GCC output for cmul2 (AArch64, AAPCS64): x0 = a, x1 = b, x2 = c.
// Each loop iteration multiplies two complex<double> elements (32 bytes per
// array) and advances all three pointers with post-indexed ld2/st2.
cmul2(std::complex, std::complex , std::complex*):
mov x3, 16000 // 16000 bytes = 1000 elements * 16 bytes each
add x3, x2, x3 // x3 = c + 16000: end-of-output pointer used as the loop bound
.L2:
ld2 {v30.2d - v31.2d}, [x0], 32 // de-interleave a: v30 = real parts, v31 = imaginary parts; x0 += 32
ld2 {v28.2d - v29.2d}, [x1], 32 // de-interleave b: v28 = real parts, v29 = imaginary parts; x1 += 32
fmul v27.2d, v31.2d, v28.2d // v27 = im(a) * re(b)
fmul v26.2d, v30.2d, v28.2d // v26 = re(a) * re(b)
fmla v27.2d, v30.2d, v29.2d // v27 += re(a) * im(b)  -> imaginary part of the product
fmls v26.2d, v29.2d, v31.2d // v26 -= im(b) * im(a)  -> real part of the product
st2 {v26.2d - v27.2d}, [x2], 32 // re-interleave real/imaginary and store to c; x2 += 32
cmp x3, x2 // has the output pointer reached the end?
bne .L2 // no: process the next two elements
ret
// LLVM output for cmul2 (AArch64, AAPCS64): x0 = a, x1 = b, x2 = c.
// The loop keeps a single byte offset in x8 that runs from -16000 up to 0 and
// rebuilds every address as base + x8 + 16000, which costs two adds per
// pointer per iteration; the ld2/st2 here use plain [reg] addressing with no
// post-increment.
cmul2(std::complex, std::complex , std::complex): // @cmul2(std::complex , std::complex, std::complex )
mov x8, #-16000 // =0xffffffffffffc180 -- byte offset, counts up to 0
mov w9, #16000 // =0x3e80 -- bias added back to x8 (writing w9 zeroes the upper half of x9)
.LBB0_1: // =>This Inner Loop Header: Depth=1
add x10, x0, x8 // x10 = a + offset
add x11, x1, x8 // x11 = b + offset
add x10, x10, x9 // x10 = a + offset + 16000: address of the current a elements
add x11, x11, x9 // x11 = b + offset + 16000: address of the current b elements
ld2 { v0.2d, v1.2d }, [x10] // de-interleave a: v0 = real parts, v1 = imaginary parts
ld2 { v2.2d, v3.2d }, [x11] // de-interleave b: v2 = real parts, v3 = imaginary parts
add x10, x2, x8 // x10 = c + offset
add x10, x10, x9 // x10 = c + offset + 16000: address of the current c elements
adds x8, x8, #32 // advance by two elements; sets Z once the offset reaches 0
fmul v4.2d, v2.2d, v0.2d // v4 = re(b) * re(a)
fmul v5.2d, v2.2d, v1.2d // v5 = re(b) * im(a)
fmls v4.2d, v3.2d, v1.2d // v4 -= im(b) * im(a)  -> real part of the product
fmla v5.2d, v3.2d, v0.2d // v5 += im(b) * re(a)  -> imaginary part of the product
st2 { v4.2d, v5.2d }, [x10] // re-interleave real/imaginary and store to c
b.ne .LBB0_1 // uses the flags from adds above; the vector ops in between leave NZCV alone
ret