FFTW / fftw3

DO NOT CHECK OUT THESE FILES FROM GITHUB UNLESS YOU KNOW WHAT YOU ARE DOING. (See below.)
GNU General Public License v2.0
2.67k stars 652 forks source link

Bug with plans using avx/avx2 kernels followed by generic simd256 plans #294

Open IceTDrinker opened 1 year ago

IceTDrinker commented 1 year ago

Hello,

A coworker found a bug in fftw linked to genric simd 256 kernels following avx/avx2 kernels and we were able to narrow down the issue.

Repro code (we observed the issue in Ubuntu 21.10, Ubuntu 22.04 docker but not Ubuntu 20.04 which seems to always generated plans without simd256 kernels):

#include <fftw3.h>
#include <stddef.h>
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdbool.h>

#define N 16384

int main() {
    fftw_complex *src, *fw, *bw;
    fftw_plan fw_plan;
    fftw_plan bw_plan;

    src = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * N);
    fw = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * N);
    bw = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * N);

    fw_plan = fftw_plan_dft_1d(N, src, fw, FFTW_FORWARD, FFTW_MEASURE | FFTW_PRESERVE_INPUT);
    bw_plan = fftw_plan_dft_1d(N, fw, bw, FFTW_BACKWARD, FFTW_MEASURE | FFTW_PRESERVE_INPUT);

    for (size_t i = 0; i < N; ++i) {
        src[i][0] = 0.0;
        src[i][1] = 0.0;
    }

    src[1][0] = 1.0;

    fftw_execute_dft(fw_plan, src, fw);
    fftw_execute_dft(bw_plan, fw, bw);

    for (size_t i = 0; i < N; ++i) {
        bw[i][0] /= N;
        bw[i][1] /= N;
    }

    puts("");
    fftw_print_plan(fw_plan);
    puts("");
    fftw_print_plan(bw_plan);
    puts("");

    for (size_t i = 0; i < N; ++i) {
        bool is_close_re = fabs(src[i][0] - bw[i][0]) < 1e-6;
        bool is_close_im = fabs(src[i][1] - bw[i][1]) < 1e-6;

        if (!is_close_re || !is_close_im) {
            printf(
                "at iteration i = %d\n"
                "expected: %f + %fi\n"
                "found: %f + %fi\n",
                (int)(i), src[i][0], src[i][1], bw[i][0], bw[i][1]
            );
            return 1;
        }
    }

    fftw_destroy_plan(fw_plan);
    fftw_destroy_plan(bw_plan);

    fftw_free(src);
    fftw_free(fw);
    fftw_free(bw);
}

This C code compiled with

gcc fftw_bug.c -O3 path/to/libfftw3.a -lm

yields the sample output:

for i in {1..1000}; do ./a.out; done

(dft-ct-dit/8
  (dftw-direct-8/28 "t1fv_8_avx2")
  (dft-vrank>=1-x8/1
    (dft-ct-dit/64
      (dftw-direct-64/252 "t1fv_64_avx2")
      (dft-direct-32-x64 "n2fv_32_avx2_128"))))
(dft-ct-dit/8
  (dftw-direct-8/28 "t1bv_8_avx2")
  (dft-vrank>=1-x8/1
    (dft-ct-dit/32
      (dftw-direct-32/124 "t1bv_32_avx2")
      (dft-direct-64-x32 "n2bv_64_avx2_128"))))

(dft-ct-dit/32
  (dftw-direct-32/16 "t3fv_32_avx2")
  (dft-vrank>=1-x32/1
    (dft-ct-dit/32
      (dftw-direct-32/124 "t1fv_32_avx2")
      (dft-direct-16-x32 "n2fv_16_generic_simd256"))))
(dft-ct-dit/8
  (dftw-direct-8/56 "t2bv_8_avx2")
  (dft-vrank>=1-x8/1
    (dft-ct-dit/64
      (dftw-direct-64/252 "t1bv_64_avx2")
      (dft-direct-32-x64 "n2bv_32_avx2_128"))))
at iteration i = 32
expected: 0.000000 + 0.000000i
found: 1.000000 + -0.000000i

(dft-ct-dit/32
  (dftw-direct-32/124 "t1fv_32_avx")
  (dft-vrank>=1-x32/1
    (dft-ct-dit/32
      (dftw-direct-32/124 "t1fv_32_avx2")
      (dft-direct-16-x32 "n2fv_16_generic_simd256"))))
(dft-ct-dit/8
  (dftw-direct-8/12 "t3bv_8_avx2")
  (dft-vrank>=1-x8/1
    (dft-ct-dit/32
      (dftw-direct-32/124 "t1bv_32_avx2")
      (dft-direct-64-x32 "n2bv_64_avx2_128"))))
at iteration i = 32
expected: 0.000000 + 0.000000i
found: 1.000000 + -0.000000i

(dft-ct-dit/16
  (dftw-direct-16/16 "t3fv_16_avx2")
  (dft-vrank>=1-x16/1
    (dft-ct-dit/64
      (dftw-direct-64/252 "t1fv_64_avx2")
      (dft-direct-16-x64 "n2fv_16_generic_simd256"))))
(dft-ct-dit/8
  (dftw-direct-8/28 "t1bv_8_avx2")
  (dft-vrank>=1-x8/1
    (dft-ct-dit/32
      (dftw-direct-32/124 "t1bv_32_avx2")
      (dft-direct-64-x32 "n2bv_64_avx2_128"))))
at iteration i = 16
expected: 0.000000 + 0.000000i
found: 1.000000 + -0.000000i

when compiling libfftw3.a from the fftw-3.3.10 source directory:

./configure --prefix=$(pwd)/build --with-pic --enable-static --disable-doc --enable-avx --enable-avx2 --enable-sse2 --enable-generic-simd128 --enable-generic-simd256

make -j $(nproc)

make install

However, when disabling the generic simd256 i.e. :

./configure --prefix=$(pwd)/build --with-pic --enable-static --disable-doc --enable-avx --enable-avx2 --enable-sse2 --enable-generic-simd128

make -j $(nproc)

make install

The 1000 test runs will run without any issue in an Ubuntu 22.04 docker image which would have precision issue with the simd256 plans.

Cheers