Open fbarchard opened 2 years ago
idk 🤷🏻
Benchmark of an end-to-end application, where kissfft accounts for about 60% of the profile:
32-bit Cortex-A53 — Original: 53.3 µs; 4 loads: 45.8 µs; NEON: 41.9 µs
"4 loads" is this change: https://github.com/mborgerding/kissfft/issues/79. "NEON" replaces the complex operators in _kiss_fft_guts so that each uses a single NEON instruction on the complex structure.
68a69,99
#if defined(KISS_FFT_USE_NEON) && (FIXED_POINT==16)
#define C_MUL( res, a,b) asm volatile( \
    "vmull.s16 q2, %1, %2 \n" /* a.r * b.r, a.i * b.i */ \
    "vrev32.16 d6, %2 \n" /* b.i, b.r */ \
    "vmull.s16 q1, %1, d6 \n" /* a.r * b.i, a.i * b.r */ \
    "vneg.s32 d6, d4 \n" /* -a.r * b.r, -a.i * b.i */ \
    "vmov s9, s13 \n" /* a.r * b.r, -a.i * b.i */ \
    "vpadd.s32 d4, d4, d2 \n" /* a.r * b.r - a.i * b.i, a.r * b.i + a.i * b.r */ \
    "vqrshrn.s32 %0, q2, #15 \n" /* 32 -> 16 bit */ \
    : "=w"(res): "w"(a), "w"(b): "q1", "q2", "q3")
#define DIVSCALAR(x,k) asm volatile( \
    "vdup.16 d2, %1 \n" /* 1 / k */ \
    "vmull.s16 q1, %0, d2 \n" /* x * 1 / k */ \
    "vqrshrn.s32 %0, q1, #15 \n" /* 32 -> 16 bit */ \
    : "+w"(x): "r"(SAMP_MAX/k): "q1")
#define C_FIXDIV(c,div) \
    if (div == 4) asm volatile( "vrshr.s16 %0, %0, #2\n" : "+w"(c)); /* div 4 = shr 2 */ \
    else asm volatile( \
    "vdup.16 d2, %1 \n" /* 1 / div */ \
    "vmull.s16 q1, %0, d2 \n" /* c * 1 / div */ \
    "vqrshrn.s32 %0, q1, #15 \n" /* 32 -> 16 bit */ \
    : "+w"(c): "r"(SAMP_MAX/div): "q1")
#define C_MULBYSCALAR(x,s) asm volatile( \
    "vdup.16 d2, %1 \n" /* s */ \
    "vmull.s16 q1, %0, d2 \n" /* x * s */ \
    "vqrshrn.s32 %0, q1, #15 \n" /* 32 -> 16 bit */ \
    : "+w"(x): "r"(s): "q1")
#else
82a114,115
#endif
99a133,138
#if defined(KISS_FFT_USE_NEON) && (FIXED_POINT==16)
#define C_ADD( res, a,b) asm volatile( "vaddq.i16 %0, %1, %2": "=w"(res): "w"(a), "w"(b))
#define C_SUB( res, a,b) asm volatile( "vsubq.i16 %0, %1, %2": "=w"(res): "w"(a), "w"(b))
#define C_ADDTO( res , a) asm volatile( "vaddq.i16 %0, %0, %1": "+w"(res): "w"(a))
#define C_SUBFROM( res , a) asm volatile( "vsubq.i16 %0, %0, %1": "+w"(res): "w"(a))
#else //defined(KISS_FFT_USE_NEON) && (FIXED_POINT==16)
124a164
#endif // !(defined(KISS_FFT_USE_NEON) && (FIXED_POINT==16))
kissfft on ARMv7 with fixed point is slow. A NEON version would improve performance quite a bit.
The float version has neither of those issues. The main loop of bfly4 is 68 instructions for float vs. 150 for 16-bit fixed point.