Neon - Githubissues

fbarchard commented 2 years ago

kissfft on ARMv7 with fixed point is slow. A NEON version would improve performance quite a bit.

- There aren't enough registers, so the stack is used.
- Shifts and rounding add a lot of overhead.

The float version has neither of those issues. The main loop of bfly4 is 68 instructions for float vs 150 for 16 bit fixed point.

NEON could also process more than one value at a time. kiss_fft_cpx holds 2 values, and the bfly loops process more than one complex value in places, for example: C_MUL(scratch[0],Fout1 , tw1 ); C_MUL(scratch[1],Fout2 , tw2 );

j4m3s-101 commented 2 years ago

idk 🤷🏻

fbarchard commented 2 years ago

Benchmark of an end-to-end application, where kissfft is about 60% of the profile:

32-bit Cortex-A53: Original 53.3 us | 4 loads 45.8 us | NEON 41.9 us

"4 loads" is this change: https://github.com/mborgerding/kissfft/issues/79. "NEON" replaces the complex operators in _kiss_fft_guts so that each uses a single NEON instruction on the complex structure.

68a69,99

if defined(KISS_FFT_USE_NEON) && (FIXED_POINT==16)

define C_MUL( res, a,b) asm volatile( \

"vmull.s16 q2, %1, %2 \n" / a.r b.r, a.i b.i / \ "vrev32.16 d6, %2 \n" / b.i, b.r / \ "vmull.s16 q1, %1, d6 \n" / a.r b.i, a.i b.r / \ "vneg.s32 d6, d4 \n" / -a.r b.r, -a.i b.i / \ "vmov s9, s13 \n" / a.r b.r, -a.i b.i / \ "vpadd.s32 d4, d4, d2 \n" / a.r b.r - a.i b.i, a.r b.i + a.i b.r / \ "vqrshrn.s32 %0, q2, #15 \n" / 32 -> 16 bit / \ : "=w"(res): "w"(a), "w"(b): "q1", "q2", "q3")

define DIVSCALAR(x,k) asm volatile( \

"vdup.16 d2, %1 \n" / 1 / k / \ "vmull.s16 q1, %0, d2 \n" / x 1 / k / \ "vqrshrn.s32 %0, q1, #15 \n" / 32 -> 16 bit */ \ : "+w"(x): "r"(SAMP_MAX/k): "q1")

define C_FIXDIV(c,div) \

if (div == 4) asm volatile( "vrshr.s16 %0, %0, #2\n" : "+w"(c)); / div 4 = shr 2 / \ else asm volatile( \ "vdup.16 d2, %1 \n" / 1 / div / \ "vmull.s16 q1, %0, d2 \n" / c 1 / div / \ "vqrshrn.s32 %0, q1, #15 \n" / 32 -> 16 bit */ \ : "+w"(c): "r"(SAMP_MAX/div): "q1")

define C_MULBYSCALAR(x,s) asm volatile( \

"vdup.16 d2, %1 \n" / s / \ "vmull.s16 q1, %0, d2 \n" / x s / \ "vqrshrn.s32 %0, q1, #15 \n" / 32 -> 16 bit */ \ : "+w"(x): "r"(s): "q1")

else

82a114,115

endif

99a133,138

if defined(KISS_FFT_USE_NEON) && (FIXED_POINT==16)

define C_ADD( res, a,b) asm volatile( "vaddq.i16 %0, %1, %2": "=w"(res): "w"(a), "w"(b))

define C_SUB( res, a,b) asm volatile( "vsubq.i16 %0, %1, %2": "=w"(res): "w"(a), "w"(b))

define C_ADDTO( res , a) asm volatile( "vaddq.i16 %0, %0, %1": "+w"(res): "w"(a))

define C_SUBFROM( res , a) asm volatile( "vsubq.i16 %0, %0, %1": "+w"(res): "w"(a))

else //defined(KISS_FFT_USE_NEON) && (FIXED_POINT==16)

124a164

endif // !(defined(KISS_FFT_USE_NEON) && (FIXED_POINT==16))

mborgerding / kissfft

Neon #80

if defined(KISS_FFT_USE_NEON) && (FIXED_POINT==16)

define C_MUL( res, a,b) asm volatile( \

define DIVSCALAR(x,k) asm volatile( \

define C_FIXDIV(c,div) \

define C_MULBYSCALAR(x,s) asm volatile( \

else

endif

if defined(KISS_FFT_USE_NEON) && (FIXED_POINT==16)

define C_ADD( res, a,b) asm volatile( "vaddq.i16 %0, %1, %2": "=w"(res): "w"(a), "w"(b))

define C_SUB( res, a,b) asm volatile( "vsubq.i16 %0, %1, %2": "=w"(res): "w"(a), "w"(b))

define C_ADDTO( res , a) asm volatile( "vaddq.i16 %0, %0, %1": "+w"(res): "w"(a))

define C_SUBFROM( res , a) asm volatile( "vsubq.i16 %0, %0, %1": "+w"(res): "w"(a))

else //defined(KISS_FFT_USE_NEON) && (FIXED_POINT==16)

endif // !(defined(KISS_FFT_USE_NEON) && (FIXED_POINT==16))