FFTW / fftw3

DO NOT CHECK OUT THESE FILES FROM GITHUB UNLESS YOU KNOW WHAT YOU ARE DOING. (See below.)
GNU General Public License v2.0
2.66k stars 651 forks source link

armv7 #268

Open Karutsenko opened 2 years ago

Karutsenko commented 2 years ago

I am trying to run the code below on an STM32MP1 (dual-core, ARMv7) and get an exception: Child terminated with signal = 0xb (SIGSEGV)

//=======================================

#include <stdio.h>

#include <math.h>

#include <fftw3.h>

#define FFT_SIZE 1024

#define REAL 0

#define IMAG 1

#define I256 256

#define I128 128

#define F128 128.0f

/*
 * File-scope FFTW state used by fft_f_test(): the transform plan and the
 * input/output sample buffers. src and dst must be pointers, because
 * fft_f_test() assigns the result of fftwf_malloc() to them and indexes
 * them as arrays of fftwf_complex.
 */
static fftwf_plan plan;
static fftwf_complex *src, *dst;

/*
 * Runs a forward, single-precision, FFT_SIZE-point complex DFT of a synthetic
 * two-tone test signal and stores the magnitude of every output bin in mgn.
 *
 * mgn: caller-owned output buffer; FFT_SIZE floats are written to it.
 *
 * Uses the file-scope plan/src/dst variables as working state and resets them
 * to NULL before returning. If thread initialisation, buffer allocation or
 * planning fails, a message is printed to stderr, everything acquired so far
 * is released and mgn is left untouched. "fft test success!" is printed only
 * after the transform has completed.
 */
void fft_f_test(float* mgn) {
    int nbthreads = 1;//omp_get_max_threads();

    plan = NULL;
    src = NULL;
    dst = NULL;

    if (mgn == NULL) {
        fprintf(stderr, "fft test: output buffer is NULL\n");
        return;
    }

    /* The threads layer has to be set up before other FFTW routines are
     * used; fftwf_init_threads() returns zero on failure. */
    if (fftwf_init_threads() == 0) {
        fprintf(stderr, "fft test: fftwf_init_threads failed\n");
        return;
    }
    fftwf_plan_with_nthreads(nbthreads);

    src = (fftwf_complex*) fftwf_malloc(sizeof(fftwf_complex) * FFT_SIZE);
    dst = (fftwf_complex*) fftwf_malloc(sizeof(fftwf_complex) * FFT_SIZE);
    if (src == NULL || dst == NULL) {
        fprintf(stderr, "fft test: fftwf_malloc failed\n");
        goto cleanup;
    }

    /* Synthetic input: two complex exponentials with amplitudes 1.0 and 0.5. */
    for (int i = 0; i < FFT_SIZE; i++) {
        float theta  = (float)i/(float)FFT_SIZE * M_PI;
        src[i][REAL] = 1.0 * cos(10.0 * theta) + 0.5 * cos(25.0 * theta);
        src[i][IMAG] = 1.0 * sin(10.0 * theta) + 0.5 * sin(25.0 * theta);
    }

    plan = fftwf_plan_dft_1d(FFT_SIZE, src, dst, FFTW_FORWARD, FFTW_ESTIMATE);
    if (plan == NULL) {
        fprintf(stderr, "fft test: fftwf_plan_dft_1d failed\n");
        goto cleanup;
    }

    fftwf_execute(plan); /* the reported SIGSEGV is raised inside this call */

    /* Magnitude of each complex output bin. */
    for (int i = 0; i < FFT_SIZE; i++) {
        mgn[i] = sqrt(dst[i][REAL]*dst[i][REAL] + dst[i][IMAG]*dst[i][IMAG]);
    }

    printf("fft test success!\n");

cleanup:
    /* Destroy the plan before fftwf_cleanup_threads(): cleanup invalidates
     * any plan that still exists. The NULL guards are deliberate, since the
     * FFTW manual does not promise that these calls accept NULL. */
    if (plan != NULL) {
        fftwf_destroy_plan(plan);
        plan = NULL;
    }
    if (src != NULL) {
        fftwf_free(src);
        src = NULL;
    }
    if (dst != NULL) {
        fftwf_free(dst);
        dst = NULL;
    }
    fftwf_cleanup_threads();
}

//=======================================================================================

Disassembly:

      fftwf_codelet_n2fv_16_neon:

b6f32c74: ldr r2, [pc, #8] ; (0xb6f32c80 <fftwf_codelet_n2fv_16_neon+12>) b6f32c76: ldr r1, [pc, #12] ; (0xb6f32c84 <fftwf_codelet_n2fv_16_neon+16>) b6f32c78: add r2, pc b6f32c7a: add r1, pc b6f32c7c: b.w 0xb6e72ca4 fftwf_kdft_register@plt b6f32c80: ldmia r2, {r2, r7} b6f32c82: movs r0, r1 b6f32c84: ; instruction: 0xfb43ffff b6f32c88: stmdb sp!, {r4, r5, r6, r7, r8, r9, r10, r11, lr} b6f32c8c: mov r11, r2 b6f32c8e: vpush {d8-d15} b6f32c92: ldr.w r2, [pc, #1600] ; 0xb6f332d4 b6f32c96: sub.w sp, sp, #524 ; 0x20c b6f32c9a: ldr r1, [sp, #632] ; 0x278 b6f32c9c: add r2, pc b6f32c9e: str r0, [sp, #0] b6f32ca0: cmp r1, #0 b6f32ca2: ldr r4, [sp, #624] ; 0x270 b6f32ca4: ldr r0, [sp, #640] ; 0x280 b6f32ca6: ble.w 0xb6f33c0e b6f32caa: ldr.w r3, [pc, #1580] ; 0xb6f332d8 b6f32cae: mov r10, r4 b6f32cb0: str r1, [sp, #20] b6f32cb2: ldr r3, [r2, r3] b6f32cb4: ldr r3, [r3, #0] b6f32cb6: str r3, [sp, #516] ; 0x204 b6f32cb8: lsls r3, r0, #2 b6f32cba: str r3, [sp, #512] ; 0x200 b6f32cbc: lsls r3, r0, #3 b6f32cbe: str r3, [sp, #508] ; 0x1fc b6f32cc0: ldr r3, [sp, #636] ; 0x27c b6f32cc2: lsls r3, r3, #3 b6f32cc4: str r3, [sp, #504] ; 0x1f8 b6f32cc6: ldr r2, [sp, #0] b6f32cc8: mov.w r9, #76 ; 0x4c b6f32ccc: mov.w r0, r10, lsl #6 b6f32cd0: add.w r5, r11, #224 ; 0xe0 b6f32cd4: mvn.w r3, #31 b6f32cd8: mul.w r12, r3, r10 b6f32cdc: adds r3, r2, r0 b6f32cde: movs r2, #24 b6f32ce0: mvn.w r6, #79 ; 0x4f b6f32ce4: mov r1, r3 b6f32ce6: add.w r8, r1, r12 b6f32cea: movs r3, #56 ; 0x38 b6f32cec: mul.w r6, r6, r10 b6f32cf0: vld1.32 {d14-d15}, [r8], r0 b6f32cf4: str r1, [sp, #200] ; 0xc8 b6f32cf6: mvn.w r1, #63 ; 0x3f b6f32cfa: add.w r7, r8, r6 b6f32cfe: mul.w r1, r1, r10 b6f32d02: vld1.32 {d12-d13}, [r7], r0 b6f32d06: vld1.32 {d18-d19}, [r8] b6f32d0a: vld1.32 {d16-d17}, [r7], r7 b6f32d0e: mvn.w lr, #119 ; 0x77 b6f32d12: vstr d14, [sp, #472] ; 0x1d8 b6f32d16: vstr d15, [sp, #480] ; 0x1e0 b6f32d1a: add.w r8, r7, r1 b6f32d1e: mla r9, r9, r10, r8 b6f32d22: mla lr, lr, r10, r9 b6f32d26: mla
r3, r3, r10, lr b6f32d2a: vld1.32 {d28-d29}, [r9] <- THIS PROBLEM, Break at address "0xb6f32d2a" with no debug information available, or outside of program code. RETURN Child terminated with signal = 0xb (SIGSEGV)

Karutsenko commented 2 years ago

I did some additional work and found one peculiarity: with an FFT size (n) of 256, FFTW works! For clarity, I printed the plan and recorded the result:

/*
 * Plans and runs a forward, single-precision, n-point complex DFT (n = 512)
 * of a synthetic two-tone test signal using as many FFTW threads as
 * omp_get_max_threads() reports, prints the chosen plan to stdout and stores
 * the magnitude of every output bin in mgn.
 *
 * mgn: caller-owned output buffer; n floats are written to it.
 *
 * If thread initialisation, buffer allocation or planning fails, a message is
 * printed to stderr, everything acquired so far is released and mgn is left
 * untouched. "fft test success!" is printed only after the transform has
 * completed.
 */
void fft_f_test2(float* mgn) {
    int n = 512;

    fftwf_plan p = NULL;
    fftwf_complex *s = NULL, *d = NULL;

    if (mgn == NULL) {
        fprintf(stderr, "fft test: output buffer is NULL\n");
        return;
    }

    int nbthreads  = omp_get_max_threads();
    /* fftwf_init_threads() returns zero on failure. */
    if (fftwf_init_threads() == 0) {
        fprintf(stderr, "fft test: fftwf_init_threads failed\n");
        return;
    }
    fftwf_plan_with_nthreads(nbthreads);

    s = (fftwf_complex*) fftwf_malloc(sizeof(fftwf_complex) * n);
    d = (fftwf_complex*) fftwf_malloc(sizeof(fftwf_complex) * n);
    if (s == NULL || d == NULL) {
        fprintf(stderr, "fft test: fftwf_malloc failed\n");
        goto cleanup;
    }

    //fftwf_set_timelimit(FFTW_NO_TIMELIMIT);
    //fftwf_make_planner_thread_safe();
    p = fftwf_plan_dft_1d(n, s, d, FFTW_FORWARD, FFTW_ESTIMATE);
    if (p == NULL) {
        fprintf(stderr, "fft test: fftwf_plan_dft_1d failed\n");
        goto cleanup;
    }
    printf(" "); fftwf_print_plan(p); printf("\n");

    /*  Plans printed for different sizes, and whether execution succeeded:
     *
     *  n=64 => this works
     *    (dft-thr-ct-dit-x2/8
     *      (dftw-direct-8/12 "t3fv_8_neon")
     *      (dftw-direct-8/12 "t3fv_8_neon")
     *      (dft-thr-vrank>=1-x2/1
     *        (dft-direct-8-x4 "n2fv_8_neon")
     *        (dft-direct-8-x4 "n2fv_8_neon")))
     *
     *  n=128 => this works
     *    (dft-thr-ct-dit-x2/8
     *      (dftw-direct-8/12 "t3fv_8_neon")
     *      (dftw-direct-8/12 "t3fv_8_neon")
     *      (dft-thr-vrank>=1-x2/1
     *        (dft-direct-16-x4 "n2fv_16_neon")
     *        (dft-direct-16-x4 "n2fv_16_neon")))
     *
     *  n=256 => this works
     *    (dft-thr-ct-dit-x2/16
     *      (dftw-direct-16/16 "t3fv_16_neon")
     *      (dftw-direct-16/16 "t3fv_16_neon")
     *      (dft-thr-vrank>=1-x2/1
     *        (dft-direct-16-x8 "n2fv_16_neon")
     *        (dft-direct-16-x8 "n2fv_16_neon")))
     *
     *  n=512 => this does NOT work
     *    (dft-thr-ct-dit-x2/16
     *      (dftw-direct-16/16 "t3fv_16_neon")
     *      (dftw-direct-16/16 "t3fv_16_neon")
     *      (dft-thr-vrank>=1-x2/1
     *        (dft-direct-32-x8 "n2fv_32_neon")
     *        (dft-direct-32-x8 "n2fv_32_neon")))
     *    ===========================================
     *    Child terminated with signal = 0xb (SIGSEGV)
     */

    /* Synthetic input: two complex exponentials with amplitudes 1.0 and 0.5. */
    for (int i = 0; i < n; i++) {
        float theta  = (float)i/(float)n * M_PI;
        s[i][REAL] = 1.0 * cos(10.0 * theta) + 0.5 * cos(25.0 * theta);
        s[i][IMAG] = 1.0 * sin(10.0 * theta) + 0.5 * sin(25.0 * theta);
    }

    //fftwf_execute(p);
    fftwf_execute_dft(p, s, d);

    /* Magnitude of each complex output bin, written out explicitly so the
     * function does not rely on a SQR macro. */
    for (int i = 0; i < n; i++) {
        mgn[i] = sqrt(d[i][REAL]*d[i][REAL] + d[i][IMAG]*d[i][IMAG]);
    }

    printf("fft test success!\n");

cleanup:
    /* The plan must be destroyed BEFORE fftwf_cleanup_threads(): cleanup
     * invalidates every existing plan, so destroying one afterwards is
     * undefined. The NULL guards are deliberate, since the FFTW manual does
     * not promise that these calls accept NULL. */
    if (p != NULL) {
        fftwf_destroy_plan(p);
    }
    if (s != NULL) {
        fftwf_free(s);
    }
    if (d != NULL) {
        fftwf_free(d);
    }
    fftwf_cleanup_threads();
}