mpip / pfft

Parallel fast Fourier transforms
GNU General Public License v3.0

In-place transform fails on some platforms #5

Open rainwoodman opened 9 years ago

rainwoodman commented 9 years ago

In-place transform fails on edison.nersc.gov with Intel 14.0.2 20140120.

I'll look deeper into this. Could it be a problem with FFTW? The domain decomposition is 1-d in this simple test.

#include <complex.h>
#include <stdio.h>   /* printf */
#include <string.h>  /* memcpy */
#include <pfft.h>

int main(int argc, char **argv)
{
  int np[2];
  ptrdiff_t n[3];
  ptrdiff_t alloc_local;
  ptrdiff_t local_ni[3], local_i_start[3];
  ptrdiff_t local_no[3], local_o_start[3];
  double err;
  pfft_complex *in, *out;
  pfft_plan plan_forw=NULL, plan_back=NULL;
  MPI_Comm comm_cart_2d;
  double data[] = {
        -0.51503939,  0.59189672,  0.0478734,  -0.48840469, -0.35495284, -0.39181335,
  1.86426106, -1.37148975,  2.22627536, -0.11810965,  0.11984837,  0.18259889,
 -0.65773926, -1.64623164, -1.14158407, -1.43908939,
  } ;
  /* Set size of FFT and process mesh */
//  n[0] = 29; n[1] = 27; n[2] = 31;
  n[0] = 2; n[1] = 3; n[2] = 2;
  np[0] = 2; np[1] = 1;

  /* Initialize MPI and PFFT */
  MPI_Init(&argc, &argv);
  pfft_init();
  int rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  /* Create a one-dimensional process mesh of size np[0], if possible */
  if( pfft_create_procmesh_1d(MPI_COMM_WORLD, np[0], &comm_cart_2d) ){
    pfft_fprintf(MPI_COMM_WORLD, stderr, "Error: This test file only works with %d processes.\n", np[0]*np[1]);
    MPI_Finalize();
    return 1;
  }

  /* Get parameters of data distribution */
  alloc_local = pfft_local_size_dft_3d(n, comm_cart_2d, PFFT_TRANSPOSED_NONE,
      local_ni, local_i_start, local_no, local_o_start);

  /* Allocate memory */
  in  = pfft_alloc_complex(alloc_local);
  out = pfft_alloc_complex(alloc_local);
  out = in;  /* alias out to in: this makes the transform in-place
                (the second buffer is leaked; throwaway test) */
  /* Plan parallel forward FFT */
  plan_forw = pfft_plan_dft_3d(
      n, in, out, comm_cart_2d, PFFT_FORWARD, PFFT_TRANSPOSED_NONE| PFFT_MEASURE| PFFT_DESTROY_INPUT);

  /* Plan parallel backward FFT */
  plan_back = pfft_plan_dft_3d(
      n, out, in, comm_cart_2d, PFFT_BACKWARD, PFFT_TRANSPOSED_NONE| PFFT_MEASURE| PFFT_DESTROY_INPUT);

  /* Initialize input with random numbers, then overwrite it with the
     fixed data above so the output can be compared across machines */
  pfft_init_input_complex_3d(n, local_ni, local_i_start,
      in);

  memcpy(in, data, sizeof(double) * alloc_local * 2);
  /* execute parallel forward FFT */
  pfft_execute(plan_forw);

  ptrdiff_t l;

  int r;
  for (r = 0; r < 2; r++) {
      MPI_Barrier(MPI_COMM_WORLD);
      if (r != rank) continue;
      printf ("out on rank %d :", rank);
      for(l=0; l < alloc_local * 2; l ++) {
            printf("%g\n", ((double*)out)[l]);
      }
  }

  /* clear the old input */
//  pfft_clear_input_complex_3d(n, local_ni, local_i_start,
 //     in);

  /* execute parallel backward FFT */
  pfft_execute(plan_back);

  /* Print back-transformed (not yet normalized) data next to the reference */
  for (r = 0; r < 2; r++) {
      MPI_Barrier(MPI_COMM_WORLD);
      if (r != rank) continue;
      printf ("on rank %d :", rank);
      for(l=0; l < alloc_local * 2; l ++) {
            printf("%g %g\n", ((double*)in)[l], data[l]);
      }
  }
  /* Scale data */
  for(l=0; l < local_ni[0] * local_ni[1] * local_ni[2]; l++)
    in[l] /= (n[0]*n[1]*n[2]);
  /* Print error of back transformed data */
  err = pfft_check_output_complex_3d(n, local_ni, local_i_start, in, comm_cart_2d);
  pfft_printf(comm_cart_2d, "Error after one forward and backward trafo of size n=(%td, %td, %td):\n", n[0], n[1], n[2]); 
  pfft_printf(comm_cart_2d, "maxerror = %6.2e;\n", err);

  /* free mem and finalize */
  pfft_destroy_plan(plan_forw);
  pfft_destroy_plan(plan_back);
  MPI_Comm_free(&comm_cart_2d);
 // pfft_free(in); pfft_free(out);
  MPI_Finalize();
  return 0;
}
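
For the record, the test can be built and run with two processes (matching np[0]*np[1]) along these lines; the file name and exact link order may differ per installation:

mpicc -o inplace_test inplace_test.c -lpfft -lfftw3_mpi -lfftw3 -lm
mpirun -np 2 ./inplace_test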
rainwoodman commented 9 years ago

Out-of-place transforms seem to be fine on edison.

mpip commented 9 years ago

Looks like you forgot to divide in[l] by n[0]*n[1]*n[2] before you compare it with data[l]. With that fixed, the test works on my desktop.
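
In other words, the normalization loop that already exists further down in the test just has to run before the comparison. A minimal sketch of the missing step:

  ptrdiff_t l;
  /* an unnormalized forward+backward transform scales every element
     by n[0]*n[1]*n[2], so divide before comparing against data[l] */
  for(l=0; l < local_ni[0] * local_ni[1] * local_ni[2]; l++)
    in[l] /= (n[0]*n[1]*n[2]);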

mpip commented 9 years ago

I just added two in-place c2c checks to the test suite.

rainwoodman commented 9 years ago

Thanks for putting in the tests. I hacked the code together to compare the output of PFFT on Edison with that on a different computer -- hence the normalization got hacked away.

With the Intel compiler, the problem seems to be that some values end up on the wrong process. I will attach a log listing the passed/failed flag combinations later (they all passed on my local cluster).

Also, with GNU 4.9.2 the code crashes in the patched FFTW file transpose-pairwise.c, at a memcpy around line 70. Somehow, after MPI_Sendrecv, buf is overwritten with 0. It certainly looks like a compiler bug to me.

But I wonder if we can avoid triggering these bugs by simplifying the if-branching a bit in places.

rainwoodman commented 9 years ago

The first 78 passed, the rest failed. Three mesh sizes were used: [2, 3, 2], [32, 32, 32], and [29, 30, 31].

PASS 78 / 216
NP [2] PFFT_C2C PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace True Nmesh [32, 32, 32]
NP [2] PFFT_C2C PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace False Nmesh [2, 3, 2]
NP [2] PFFT_C2C PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace False Nmesh [32, 32, 32]
NP [2] PFFT_C2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R|PFFT_DESTROY_INPUT InPlace True Nmesh [32, 32, 32]
NP [2] PFFT_C2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R|PFFT_DESTROY_INPUT InPlace False Nmesh [2, 3, 2]
NP [2] PFFT_C2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R|PFFT_DESTROY_INPUT InPlace False Nmesh [32, 32, 32]
NP [2] PFFT_C2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace True Nmesh [32, 32, 32]
NP [2] PFFT_C2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace False Nmesh [32, 32, 32]
NP [2] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE InPlace True Nmesh [32, 32, 32]
NP [2] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE InPlace False Nmesh [32, 32, 32]
NP [2] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace True Nmesh [32, 32, 32]
NP [2] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace False Nmesh [2, 3, 2]
NP [2] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace False Nmesh [32, 32, 32]
NP [2] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace True Nmesh [32, 32, 32]
NP [2] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace False Nmesh [32, 32, 32]
NP [2] PFFT_R2C PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace True Nmesh [32, 32, 32]
NP [2] PFFT_R2C PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace False Nmesh [2, 3, 2]
NP [2] PFFT_R2C PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace False Nmesh [32, 32, 32]
NP [2] PFFT_R2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R|PFFT_DESTROY_INPUT InPlace True Nmesh [32, 32, 32]
NP [2] PFFT_R2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R|PFFT_DESTROY_INPUT InPlace False Nmesh [2, 3, 2]
NP [2] PFFT_R2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R|PFFT_DESTROY_INPUT InPlace False Nmesh [32, 32, 32]
NP [2] PFFT_R2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace True Nmesh [32, 32, 32]
NP [2] PFFT_R2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace False Nmesh [32, 32, 32]
NP [2] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE InPlace True Nmesh [32, 32, 32]
NP [2] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE InPlace False Nmesh [32, 32, 32]
NP [2] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace True Nmesh [32, 32, 32]
NP [2] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace False Nmesh [2, 3, 2]
NP [2] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace False Nmesh [32, 32, 32]
NP [2] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace True Nmesh [32, 32, 32]
NP [2] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace False Nmesh [32, 32, 32]
NP [1, 2] PFFT_C2C PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace True Nmesh [32, 32, 32]
NP [1, 2] PFFT_C2C PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace False Nmesh [32, 32, 32]
NP [1, 2] PFFT_C2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R|PFFT_DESTROY_INPUT InPlace True Nmesh [32, 32, 32]
NP [1, 2] PFFT_C2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R|PFFT_DESTROY_INPUT InPlace False Nmesh [32, 32, 32]
NP [1, 2] PFFT_C2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace True Nmesh [32, 32, 32]
NP [1, 2] PFFT_C2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace False Nmesh [32, 32, 32]
NP [1, 2] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE InPlace True Nmesh [32, 32, 32]
NP [1, 2] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE InPlace False Nmesh [32, 32, 32]
NP [1, 2] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace True Nmesh [32, 32, 32]
NP [1, 2] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace False Nmesh [32, 32, 32]
NP [1, 2] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace True Nmesh [32, 32, 32]
NP [1, 2] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace False Nmesh [32, 32, 32]
NP [1, 2] PFFT_R2C PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace True Nmesh [29, 30, 31]
NP [1, 2] PFFT_R2C PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace False Nmesh [29, 30, 31]
NP [1, 2] PFFT_R2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R|PFFT_DESTROY_INPUT InPlace True Nmesh [29, 30, 31]
NP [1, 2] PFFT_R2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R|PFFT_DESTROY_INPUT InPlace False Nmesh [29, 30, 31]
NP [1, 2] PFFT_R2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace True Nmesh [29, 30, 31]
NP [1, 2] PFFT_R2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace False Nmesh [29, 30, 31]
NP [1, 2] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE InPlace True Nmesh [29, 30, 31]
NP [1, 2] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE InPlace False Nmesh [29, 30, 31]
NP [1, 2] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace True Nmesh [29, 30, 31]
NP [1, 2] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace False Nmesh [29, 30, 31]
NP [1, 2] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace True Nmesh [29, 30, 31]
NP [1, 2] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace False Nmesh [29, 30, 31]
NP [2, 1] PFFT_C2C PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace True Nmesh [32, 32, 32]
NP [2, 1] PFFT_C2C PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace False Nmesh [32, 32, 32]
NP [2, 1] PFFT_C2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R|PFFT_DESTROY_INPUT InPlace True Nmesh [32, 32, 32]
NP [2, 1] PFFT_C2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R|PFFT_DESTROY_INPUT InPlace False Nmesh [32, 32, 32]
NP [2, 1] PFFT_C2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace True Nmesh [32, 32, 32]
NP [2, 1] PFFT_C2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace False Nmesh [32, 32, 32]
NP [2, 1] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE InPlace True Nmesh [32, 32, 32]
NP [2, 1] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE InPlace False Nmesh [32, 32, 32]
NP [2, 1] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace True Nmesh [32, 32, 32]
NP [2, 1] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace False Nmesh [32, 32, 32]
NP [2, 1] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace True Nmesh [32, 32, 32]
NP [2, 1] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace False Nmesh [32, 32, 32]
NP [2, 1] PFFT_R2C PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace True Nmesh [32, 32, 32]
NP [2, 1] PFFT_R2C PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace False Nmesh [32, 32, 32]
NP [2, 1] PFFT_R2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R|PFFT_DESTROY_INPUT InPlace True Nmesh [32, 32, 32]
NP [2, 1] PFFT_R2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R|PFFT_DESTROY_INPUT InPlace False Nmesh [32, 32, 32]
NP [2, 1] PFFT_R2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace True Nmesh [32, 32, 32]
NP [2, 1] PFFT_R2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace False Nmesh [32, 32, 32]
NP [2, 1] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE InPlace True Nmesh [32, 32, 32]
NP [2, 1] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE InPlace False Nmesh [32, 32, 32]
NP [2, 1] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace True Nmesh [32, 32, 32]
NP [2, 1] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace False Nmesh [32, 32, 32]
NP [2, 1] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace True Nmesh [32, 32, 32]
NP [2, 1] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace False Nmesh [32, 32, 32]
FAIL 138 / 216
NP [2] PFFT_C2C PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace True Nmesh [29, 30, 31]
NP [2] PFFT_C2C PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace True Nmesh [2, 3, 2]
NP [2] PFFT_C2C PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace False Nmesh [29, 30, 31]
NP [2] PFFT_C2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R|PFFT_DESTROY_INPUT InPlace True Nmesh [29, 30, 31]
NP [2] PFFT_C2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R|PFFT_DESTROY_INPUT InPlace True Nmesh [2, 3, 2]
NP [2] PFFT_C2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R|PFFT_DESTROY_INPUT InPlace False Nmesh [29, 30, 31]
NP [2] PFFT_C2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace True Nmesh [29, 30, 31]
NP [2] PFFT_C2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace True Nmesh [2, 3, 2]
NP [2] PFFT_C2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace False Nmesh [29, 30, 31]
NP [2] PFFT_C2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace False Nmesh [2, 3, 2]
NP [2] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE InPlace True Nmesh [29, 30, 31]
NP [2] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE InPlace True Nmesh [2, 3, 2]
NP [2] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE InPlace False Nmesh [29, 30, 31]
NP [2] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE InPlace False Nmesh [2, 3, 2]
NP [2] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace True Nmesh [29, 30, 31]
NP [2] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace True Nmesh [2, 3, 2]
NP [2] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace False Nmesh [29, 30, 31]
NP [2] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace True Nmesh [29, 30, 31]
NP [2] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace True Nmesh [2, 3, 2]
NP [2] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace False Nmesh [29, 30, 31]
NP [2] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace False Nmesh [2, 3, 2]
NP [2] PFFT_R2C PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace True Nmesh [29, 30, 31]
NP [2] PFFT_R2C PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace True Nmesh [2, 3, 2]
NP [2] PFFT_R2C PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace False Nmesh [29, 30, 31]
NP [2] PFFT_R2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R|PFFT_DESTROY_INPUT InPlace True Nmesh [29, 30, 31]
NP [2] PFFT_R2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R|PFFT_DESTROY_INPUT InPlace True Nmesh [2, 3, 2]
NP [2] PFFT_R2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R|PFFT_DESTROY_INPUT InPlace False Nmesh [29, 30, 31]
NP [2] PFFT_R2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace True Nmesh [29, 30, 31]
NP [2] PFFT_R2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace True Nmesh [2, 3, 2]
NP [2] PFFT_R2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace False Nmesh [29, 30, 31]
NP [2] PFFT_R2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace False Nmesh [2, 3, 2]
NP [2] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE InPlace True Nmesh [29, 30, 31]
NP [2] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE InPlace True Nmesh [2, 3, 2]
NP [2] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE InPlace False Nmesh [29, 30, 31]
NP [2] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE InPlace False Nmesh [2, 3, 2]
NP [2] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace True Nmesh [29, 30, 31]
NP [2] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace True Nmesh [2, 3, 2]
NP [2] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace False Nmesh [29, 30, 31]
NP [2] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace True Nmesh [29, 30, 31]
NP [2] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace True Nmesh [2, 3, 2]
NP [2] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace False Nmesh [29, 30, 31]
NP [2] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace False Nmesh [2, 3, 2]
NP [1, 2] PFFT_C2C PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace True Nmesh [29, 30, 31]
NP [1, 2] PFFT_C2C PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace True Nmesh [2, 3, 2]
NP [1, 2] PFFT_C2C PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace False Nmesh [29, 30, 31]
NP [1, 2] PFFT_C2C PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace False Nmesh [2, 3, 2]
NP [1, 2] PFFT_C2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R|PFFT_DESTROY_INPUT InPlace True Nmesh [29, 30, 31]
NP [1, 2] PFFT_C2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R|PFFT_DESTROY_INPUT InPlace True Nmesh [2, 3, 2]
NP [1, 2] PFFT_C2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R|PFFT_DESTROY_INPUT InPlace False Nmesh [29, 30, 31]
NP [1, 2] PFFT_C2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R|PFFT_DESTROY_INPUT InPlace False Nmesh [2, 3, 2]
NP [1, 2] PFFT_C2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace True Nmesh [29, 30, 31]
NP [1, 2] PFFT_C2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace True Nmesh [2, 3, 2]
NP [1, 2] PFFT_C2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace False Nmesh [29, 30, 31]
NP [1, 2] PFFT_C2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace False Nmesh [2, 3, 2]
NP [1, 2] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE InPlace True Nmesh [29, 30, 31]
NP [1, 2] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE InPlace True Nmesh [2, 3, 2]
NP [1, 2] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE InPlace False Nmesh [29, 30, 31]
NP [1, 2] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE InPlace False Nmesh [2, 3, 2]
NP [1, 2] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace True Nmesh [29, 30, 31]
NP [1, 2] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace True Nmesh [2, 3, 2]
NP [1, 2] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace False Nmesh [29, 30, 31]
NP [1, 2] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace False Nmesh [2, 3, 2]
NP [1, 2] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace True Nmesh [29, 30, 31]
NP [1, 2] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace True Nmesh [2, 3, 2]
NP [1, 2] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace False Nmesh [29, 30, 31]
NP [1, 2] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace False Nmesh [2, 3, 2]
NP [1, 2] PFFT_R2C PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace True Nmesh [2, 3, 2]
NP [1, 2] PFFT_R2C PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace True Nmesh [32, 32, 32]
NP [1, 2] PFFT_R2C PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace False Nmesh [2, 3, 2]
NP [1, 2] PFFT_R2C PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace False Nmesh [32, 32, 32]
NP [1, 2] PFFT_R2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R|PFFT_DESTROY_INPUT InPlace True Nmesh [2, 3, 2]
NP [1, 2] PFFT_R2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R|PFFT_DESTROY_INPUT InPlace True Nmesh [32, 32, 32]
NP [1, 2] PFFT_R2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R|PFFT_DESTROY_INPUT InPlace False Nmesh [2, 3, 2]
NP [1, 2] PFFT_R2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R|PFFT_DESTROY_INPUT InPlace False Nmesh [32, 32, 32]
NP [1, 2] PFFT_R2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace True Nmesh [2, 3, 2]
NP [1, 2] PFFT_R2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace True Nmesh [32, 32, 32]
NP [1, 2] PFFT_R2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace False Nmesh [2, 3, 2]
NP [1, 2] PFFT_R2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace False Nmesh [32, 32, 32]
NP [1, 2] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE InPlace True Nmesh [2, 3, 2]
NP [1, 2] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE InPlace True Nmesh [32, 32, 32]
NP [1, 2] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE InPlace False Nmesh [2, 3, 2]
NP [1, 2] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE InPlace False Nmesh [32, 32, 32]
NP [1, 2] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace True Nmesh [2, 3, 2]
NP [1, 2] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace True Nmesh [32, 32, 32]
NP [1, 2] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace False Nmesh [2, 3, 2]
NP [1, 2] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace False Nmesh [32, 32, 32]
NP [1, 2] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace True Nmesh [2, 3, 2]
NP [1, 2] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace True Nmesh [32, 32, 32]
NP [1, 2] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace False Nmesh [2, 3, 2]
NP [1, 2] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace False Nmesh [32, 32, 32]
NP [2, 1] PFFT_C2C PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace True Nmesh [29, 30, 31]
NP [2, 1] PFFT_C2C PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace True Nmesh [2, 3, 2]
NP [2, 1] PFFT_C2C PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace False Nmesh [29, 30, 31]
NP [2, 1] PFFT_C2C PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace False Nmesh [2, 3, 2]
NP [2, 1] PFFT_C2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R|PFFT_DESTROY_INPUT InPlace True Nmesh [29, 30, 31]
NP [2, 1] PFFT_C2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R|PFFT_DESTROY_INPUT InPlace True Nmesh [2, 3, 2]
NP [2, 1] PFFT_C2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R|PFFT_DESTROY_INPUT InPlace False Nmesh [29, 30, 31]
NP [2, 1] PFFT_C2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R|PFFT_DESTROY_INPUT InPlace False Nmesh [2, 3, 2]
NP [2, 1] PFFT_C2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace True Nmesh [29, 30, 31]
NP [2, 1] PFFT_C2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace True Nmesh [2, 3, 2]
NP [2, 1] PFFT_C2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace False Nmesh [29, 30, 31]
NP [2, 1] PFFT_C2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace False Nmesh [2, 3, 2]
NP [2, 1] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE InPlace True Nmesh [29, 30, 31]
NP [2, 1] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE InPlace True Nmesh [2, 3, 2]
NP [2, 1] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE InPlace False Nmesh [29, 30, 31]
NP [2, 1] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE InPlace False Nmesh [2, 3, 2]
NP [2, 1] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace True Nmesh [29, 30, 31]
NP [2, 1] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace True Nmesh [2, 3, 2]
NP [2, 1] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace False Nmesh [29, 30, 31]
NP [2, 1] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace False Nmesh [2, 3, 2]
NP [2, 1] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace True Nmesh [29, 30, 31]
NP [2, 1] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace True Nmesh [2, 3, 2]
NP [2, 1] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace False Nmesh [29, 30, 31]
NP [2, 1] PFFT_C2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace False Nmesh [2, 3, 2]
NP [2, 1] PFFT_R2C PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace True Nmesh [29, 30, 31]
NP [2, 1] PFFT_R2C PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace True Nmesh [2, 3, 2]
NP [2, 1] PFFT_R2C PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace False Nmesh [29, 30, 31]
NP [2, 1] PFFT_R2C PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace False Nmesh [2, 3, 2]
NP [2, 1] PFFT_R2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R|PFFT_DESTROY_INPUT InPlace True Nmesh [29, 30, 31]
NP [2, 1] PFFT_R2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R|PFFT_DESTROY_INPUT InPlace True Nmesh [2, 3, 2]
NP [2, 1] PFFT_R2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R|PFFT_DESTROY_INPUT InPlace False Nmesh [29, 30, 31]
NP [2, 1] PFFT_R2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R|PFFT_DESTROY_INPUT InPlace False Nmesh [2, 3, 2]
NP [2, 1] PFFT_R2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace True Nmesh [29, 30, 31]
NP [2, 1] PFFT_R2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace True Nmesh [2, 3, 2]
NP [2, 1] PFFT_R2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace False Nmesh [29, 30, 31]
NP [2, 1] PFFT_R2C PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace False Nmesh [2, 3, 2]
NP [2, 1] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE InPlace True Nmesh [29, 30, 31]
NP [2, 1] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE InPlace True Nmesh [2, 3, 2]
NP [2, 1] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE InPlace False Nmesh [29, 30, 31]
NP [2, 1] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE InPlace False Nmesh [2, 3, 2]
NP [2, 1] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace True Nmesh [29, 30, 31]
NP [2, 1] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace True Nmesh [2, 3, 2]
NP [2, 1] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace False Nmesh [29, 30, 31]
NP [2, 1] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_DESTROY_INPUT InPlace False Nmesh [2, 3, 2]
NP [2, 1] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace True Nmesh [29, 30, 31]
NP [2, 1] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace True Nmesh [2, 3, 2]
NP [2, 1] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace False Nmesh [29, 30, 31]
NP [2, 1] PFFT_R2C PFFT_TRANSPOSED_OUT|PFFT_ESTIMATE|PFFT_PADDED_R2C|PFFT_PADDED_C2R InPlace False Nmesh [2, 3, 2]
mpip commented 9 years ago

Could something be going wrong with the bitwise operation (my_pe * n_pes + pe) & 0xffff in line 73 of transpose-pairwise.c? It is used to generate the message tags. Do you see why & 0xffff is necessary at this place?
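
For reference, the tag arithmetic in question is:

  /* message tags for the pairwise exchange in transpose-pairwise.c */
  int tag_send = (my_pe * n_pes + pe) & 0xffff; /* tag on our send */
  int tag_recv = (pe * n_pes + my_pe) & 0xffff; /* tag on the matching receive */

One plausible reason for the mask (not documented in FFTW) is to bound the tag for large process counts, since MPI only guarantees tags up to MPI_TAG_UB, which may be as small as 32767 -- so even the masked 16-bit value is not strictly portable.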

rainwoodman commented 9 years ago

I don't see why. But the test case is very small (2 or 4 PEs), so the & 0xffff should not change anything.

I did more tests with GCC (4.8.1, Cray), which is the compiler Cray used to compile FFTW.

I tweaked simple_test_c2c.c to use the exact setup (np = [4], 1-d procmesh) that was giving the buf = NULL core dump at memcpy.

It didn't crash with simple_test_c2c.c. An identical setup through the Python API does crash, and only on a Cray.

The only significant difference I see here is whether the objects are linked into a static binary or a dynamic library. The dynamic library is quite messy too, with all of FFTW's symbols in it (and a size of 10 MB).

Stock FFTW has an almost identical transpose-pairwise.c; can you think of an FFTW test case that is guaranteed to trigger that code? I would then test whether dynamically linking against a Cray FFTW .so file triggers the same issue too.

mpip commented 9 years ago

If you compile FFTW on your own, the easiest way is to remove the other transpose algorithms from the solvtab list in mpi/conf.c and recompile. Otherwise, we can look at the applicable(...) functions in transpose-alltoall.c, transpose-pairwise.c, and transpose-recurse.c and figure out a setting that rules out every algorithm but one. The alltoall variant cannot be executed in-place, cf. line 107 in transpose-alltoall.c. The recursive variant requires that the second dimension divide equally among all processes, cf. line 131 in transpose-recurse.c.

So an in-place transposition of size n0 x n1 on P processes, where P divides neither n0 nor n1, should end up in the pairwise variant. Strictly, only n1 needs to be indivisible; I also assumed that P is no divisor of n0 to be sure it works for TRANSPOSED_IN and TRANSPOSED_OUT, where the roles of n0 and n1 may swap.
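
For concreteness, a minimal sketch of such a case using the stock FFTW MPI transpose interface (the 9 x 7 size and the two-process run are arbitrary choices; with P = 2 dividing neither dimension, the rules above should leave only the pairwise solver applicable):

#include <fftw3-mpi.h>

int main(int argc, char **argv)
{
  const ptrdiff_t n[2] = {9, 7};
  ptrdiff_t local_n0, local_0_start, local_n1, local_1_start;

  MPI_Init(&argc, &argv);
  fftw_mpi_init();

  /* element counts for a transpose of scalar reals (howmany = 1) */
  ptrdiff_t alloc_local = fftw_mpi_local_size_many_transposed(
      2, n, 1, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
      MPI_COMM_WORLD,
      &local_n0, &local_0_start, &local_n1, &local_1_start);

  double *data = fftw_alloc_real(alloc_local);

  /* in == out forces an in-place transpose; alltoall is then ruled
     out, and recurse is ruled out because 2 divides neither 9 nor 7 */
  fftw_plan p = fftw_mpi_plan_transpose(n[0], n[1], data, data,
                                        MPI_COMM_WORLD, FFTW_ESTIMATE);
  if (p) fftw_execute(p);

  fftw_destroy_plan(p);
  fftw_free(data);
  MPI_Finalize();
  return 0;
}

Run with mpirun -np 2; printing the plan with fftw_print_plan(p) should confirm which transpose solver was chosen.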

mpip commented 9 years ago

Did I get it right that the problems only occur in transpose-pairwise-transposed.c?

rainwoodman commented 9 years ago

The crash occurs in both transpose-pairwise and transpose-pairwise-transposed -- and I've found the fix.

It was because the size of MPI_Status is 20 bytes, and the compiler puts `buf' right after it. On this particular computer and setup, MPI_Sendrecv would clobber the 4 bytes after the status -- it must be an alignment issue. The solution is to replace &status with MPI_STATUS_IGNORE. It doesn't happen if we link against the static version of the MPI library. I've attached a new install_fftw.sh that contains the updated patches to FFTW 3.3.3.
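
The change, in essence (this is the core of the patch below):

  /* before: MPI_Sendrecv fills a stack-allocated MPI_Status, and on
     this system it also clobbered the 4 bytes after it, i.e. `buf' */
  MPI_Status status;
  MPI_Sendrecv(buf, (int) (sbs[pe]), FFTW_MPI_TYPE,
               pe, (my_pe * n_pes + pe) & 0xffff,
               O + rbo[pe], (int) (rbs[pe]), FFTW_MPI_TYPE,
               pe, (pe * n_pes + my_pe) & 0xffff,
               comm, &status);

  /* after: the status is never read, so let MPI skip writing it */
  MPI_Sendrecv(buf, (int) (sbs[pe]), FFTW_MPI_TYPE,
               pe, (my_pe * n_pes + pe) & 0xffff,
               O + rbo[pe], (int) (rbs[pe]), FFTW_MPI_TYPE,
               pe, (pe * n_pes + my_pe) & 0xffff,
               comm, MPI_STATUS_IGNORE);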

The code still crashes if I create a procmesh too many times (around 128 to 256), at a call to MPI_Comm_size. But I believe that's a Cray MPI issue we can't work around, and it is rare to plan that many times anyway.
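
If someone does need to plan repeatedly, freeing each procmesh communicator once its plans are destroyed should at least bound the number of live communicators. A sketch mirroring the cleanup in the test above (np0 here just stands for the process-mesh size):

  MPI_Comm comm_cart;
  if (!pfft_create_procmesh_1d(MPI_COMM_WORLD, np0, &comm_cart)) {
      /* ... plan, execute, pfft_destroy_plan(...) ... */
      MPI_Comm_free(&comm_cart); /* release before creating the next mesh */
  }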

Here is the new install_fftw.sh (it takes the install prefix as its first argument).

#!/bin/sh -e

FFTW_VERSION="3.3.3"
PREFIX="$1"
TMP="tmp-fftw-${FFTW_VERSION}"
LOGFILE="build.log"

# remove a stale build directory if it exists
if [ -d $TMP ]; then
    echo "Directory $TMP already exists. Delete it? (y/n)"
    answer=y  # hardcoded: always delete without prompting
    if [ ${answer} = "y" ]; then
        rm -rf $TMP
    else
        echo "Program aborted."
        exit 1
    fi
fi

mkdir $TMP && cd $TMP

wget http://www.fftw.org/fftw-$FFTW_VERSION.tar.gz
gzip -dc fftw-$FFTW_VERSION.tar.gz | tar xvf -
cd fftw-$FFTW_VERSION

# add two more transpose algorithms to the planner and fix double free
patch -b -p1 <<\EOF
diff -rupN fftw-3.3.3/mpi/Makefile.am fftw-3.3.3-patched/mpi/Makefile.am
--- fftw-3.3.3/mpi/Makefile.am  2013-03-28 13:52:19.000000000 +0100
+++ fftw-3.3.3-patched/mpi/Makefile.am  2013-03-28 12:36:08.000000000 +0100
@@ -16,6 +16,7 @@ BUILT_SOURCES = fftw3-mpi.f03.in fftw3-m
 CLEANFILES = fftw3-mpi.f03 fftw3l-mpi.f03

 TRANSPOSE_SRC = transpose-alltoall.c transpose-pairwise.c transpose-recurse.c transpose-problem.c transpose-solve.c mpi-transpose.h
+TRANSPOSE_SRC += transpose-alltoall-transposed.c transpose-pairwise-transposed.c
 DFT_SRC = dft-serial.c dft-rank-geq2.c dft-rank-geq2-transposed.c dft-rank1.c dft-rank1-bigvec.c dft-problem.c dft-solve.c mpi-dft.h
 RDFT_SRC = rdft-serial.c rdft-rank-geq2.c rdft-rank-geq2-transposed.c rdft-rank1-bigvec.c rdft-problem.c rdft-solve.c mpi-rdft.h
 RDFT2_SRC = rdft2-serial.c rdft2-rank-geq2.c rdft2-rank-geq2-transposed.c rdft2-problem.c rdft2-solve.c mpi-rdft2.h
diff -rupN fftw-3.3.3/mpi/conf.c fftw-3.3.3-patched/mpi/conf.c
--- fftw-3.3.3/mpi/conf.c   2013-03-28 13:52:19.000000000 +0100
+++ fftw-3.3.3-patched/mpi/conf.c   2013-03-28 12:36:08.000000000 +0100
@@ -29,6 +29,8 @@ static const solvtab s =
      SOLVTAB(XM(transpose_pairwise_register)),
      SOLVTAB(XM(transpose_alltoall_register)),
      SOLVTAB(XM(transpose_recurse_register)),
+     SOLVTAB(XM(transpose_pairwise_transposed_register)),
+     SOLVTAB(XM(transpose_alltoall_transposed_register)),
      SOLVTAB(XM(dft_rank_geq2_register)),
      SOLVTAB(XM(dft_rank_geq2_transposed_register)),
      SOLVTAB(XM(dft_serial_register)),
diff -rupN fftw-3.3.3/mpi/mpi-transpose.h fftw-3.3.3-patched/mpi/mpi-transpose.h
--- fftw-3.3.3/mpi/mpi-transpose.h  2013-03-28 13:52:19.000000000 +0100
+++ fftw-3.3.3-patched/mpi/mpi-transpose.h  2013-03-28 12:36:09.000000000 +0100
@@ -59,3 +59,5 @@ int XM(mkplans_posttranspose)(const prob
 void XM(transpose_pairwise_register)(planner *p);
 void XM(transpose_alltoall_register)(planner *p);
 void XM(transpose_recurse_register)(planner *p);
+void XM(transpose_pairwise_transposed_register)(planner *p);
+void XM(transpose_alltoall_transposed_register)(planner *p);
diff -rupN fftw-3.3.3/mpi/transpose-alltoall-transposed.c fftw-3.3.3-patched/mpi/transpose-alltoall-transposed.c
--- fftw-3.3.3/mpi/transpose-alltoall-transposed.c  1970-01-01 01:00:00.000000000 +0100
+++ fftw-3.3.3-patched/mpi/transpose-alltoall-transposed.c  2013-03-28 12:36:09.000000000 +0100
@@ -0,0 +1,280 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ * Copyright (c) 2012 Michael Pippig
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* plans for distributed out-of-place transpose using MPI_Alltoall,
+   and which destroy the input array (also if TRANSPOSED_IN is used) */
+
+#include "mpi-transpose.h"
+#include <string.h>
+
+typedef struct {
+     solver super;
+     int copy_transposed_out; /* whether to copy the output for TRANSPOSED_OUT,
+               which makes the first transpose out-of-place
+               but costs an extra copy and requires us
+               to destroy the input */
+} S;
+
+typedef struct {
+     plan_mpi_transpose super;
+
+     plan *cld1, *cld2, *cld2rest, *cld3;
+
+     MPI_Comm comm;
+     int *send_block_sizes, *send_block_offsets;
+     int *recv_block_sizes, *recv_block_offsets;
+
+     INT rest_Ioff, rest_Ooff;
+
+     int equal_blocks;
+} P;
+
+/* transpose locally to get contiguous chunks
+   this may take two transposes if the block sizes are unequal
+   (3 subplans, two of which operate on disjoint data) */
+static void apply_pretranspose(
+    const P *ego, R *I, R *O
+    )
+{
+  plan_rdft *cld2, *cld2rest, *cld3;
+
+  cld3 = (plan_rdft *) ego->cld3;
+  if (cld3)
+       cld3->apply(ego->cld3, O, O);
+  /* else TRANSPOSED_IN is true and user wants I transposed */
+
+  cld2 = (plan_rdft *) ego->cld2;
+  cld2->apply(ego->cld2, I, O);
+  cld2rest = (plan_rdft *) ego->cld2rest;
+  if (cld2rest) {
+       cld2rest->apply(ego->cld2rest,
+                  I + ego->rest_Ioff, O + ego->rest_Ooff);
+  }
+}
+
+static void apply(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     plan_rdft *cld1 = (plan_rdft *) ego->cld1;
+
+     if (cld1) {
+          /* transpose locally to get contiguous chunks */
+          apply_pretranspose(ego, I, O);
+
+     /* transpose chunks globally */
+     if (ego->equal_blocks)
+          MPI_Alltoall(O, ego->send_block_sizes[0], FFTW_MPI_TYPE,
+               I, ego->recv_block_sizes[0], FFTW_MPI_TYPE,
+               ego->comm);
+     else
+          MPI_Alltoallv(O, ego->send_block_sizes, ego->send_block_offsets,
+                FFTW_MPI_TYPE,
+                I, ego->recv_block_sizes, ego->recv_block_offsets,
+                FFTW_MPI_TYPE,
+                ego->comm);
+
+          /* transpose locally to get non-transposed output */
+          cld1->apply(ego->cld1, I, O);
+     } /* else TRANSPOSED_OUT is true and user wants O transposed */
+     else {
+          /* transpose locally to get contiguous chunks */
+          apply_pretranspose(ego, I, I);
+
+          /* transpose chunks globally */
+     if (ego->equal_blocks)
+          MPI_Alltoall(I, ego->send_block_sizes[0], FFTW_MPI_TYPE,
+               O, ego->recv_block_sizes[0], FFTW_MPI_TYPE,
+               ego->comm);
+     else
+          MPI_Alltoallv(I, ego->send_block_sizes, ego->send_block_offsets,
+                FFTW_MPI_TYPE,
+                O, ego->recv_block_sizes, ego->recv_block_offsets,
+                FFTW_MPI_TYPE,
+                ego->comm);
+     }
+}
+
+static int applicable(const S *ego, const problem *p_,
+             const planner *plnr)
+{
+     /* in contrast to transpose-alltoall this algorithm can not preserve the input,
+      * since we need at least one transpose before the (out-of-place) Alltoall */
+     const problem_mpi_transpose *p = (const problem_mpi_transpose *) p_;
+     return (1
+        && p->I != p->O
+        && (!NO_DESTROY_INPUTP(plnr))  
+        && ((p->flags & TRANSPOSED_OUT) || !ego->copy_transposed_out)
+        && ONLY_TRANSPOSEDP(p->flags)
+     );
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cld1, wakefulness);
+     X(plan_awake)(ego->cld2, wakefulness);
+     X(plan_awake)(ego->cld2rest, wakefulness);
+     X(plan_awake)(ego->cld3, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(ifree0)(ego->send_block_sizes);
+     MPI_Comm_free(&ego->comm);
+     X(plan_destroy_internal)(ego->cld3);
+     X(plan_destroy_internal)(ego->cld2rest);
+     X(plan_destroy_internal)(ego->cld2);
+     X(plan_destroy_internal)(ego->cld1);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(mpi-transpose-alltoall-transposed%s%(%p%)%(%p%)%(%p%)%(%p%))",
+         ego->equal_blocks ? "/e" : "",
+         ego->cld1, ego->cld2, ego->cld2rest, ego->cld3);
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const S *ego = (const S *) ego_;
+     const problem_mpi_transpose *p;
+     P *pln;
+     plan *cld1 = 0, *cld2 = 0, *cld2rest = 0, *cld3 = 0;
+     INT b, bt, vn, rest_Ioff, rest_Ooff;
+     R *O;
+     int *sbs, *sbo, *rbs, *rbo;
+     int pe, my_pe, n_pes;
+     int equal_blocks = 1;
+     static const plan_adt padt = {
+          XM(transpose_solve), awake, print, destroy
+     };
+
+     if (!applicable(ego, p_, plnr))
+          return (plan *) 0;
+
+     p = (const problem_mpi_transpose *) p_;
+     vn = p->vn;
+
+     MPI_Comm_rank(p->comm, &my_pe);
+     MPI_Comm_size(p->comm, &n_pes);
+
+     bt = XM(block)(p->ny, p->tblock, my_pe);
+
+     if (p->flags & TRANSPOSED_OUT) { /* O stays transposed */
+     if (ego->copy_transposed_out) {
+          cld1 = X(mkplan_f_d)(plnr,
+                 X(mkproblem_rdft_0_d)(X(mktensor_1d)
+                           (bt * p->nx * vn, 1, 1),
+                           p->I, O = p->O),
+                   0, 0, NO_SLOW);
+          if (XM(any_true)(!cld1, p->comm)) goto nada;
+     }
+     else /* first transpose is in-place */
+              O = p->I;
+     }
+     else { /* transpose nx x bt x vn -> bt x nx x vn */
+     cld1 = X(mkplan_f_d)(plnr, 
+                  X(mkproblem_rdft_0_d)(X(mktensor_3d)
+                            (bt, vn, p->nx * vn,
+                             p->nx, bt * vn, vn,
+                             vn, 1, 1),
+                            p->I, O = p->O),
+                  0, 0, NO_SLOW);
+     if (XM(any_true)(!cld1, p->comm)) goto nada;
+     }
+
+     if (XM(any_true)(!XM(mkplans_pretranspose)(p, plnr, p->I, O, my_pe,
+                       &cld2, &cld2rest, &cld3,
+                       &rest_Ioff, &rest_Ooff),
+             p->comm)) goto nada;
+
+
+     pln = MKPLAN_MPI_TRANSPOSE(P, &padt, apply);
+
+     pln->cld1 = cld1;
+     pln->cld2 = cld2;
+     pln->cld2rest = cld2rest;
+     pln->rest_Ioff = rest_Ioff;
+     pln->rest_Ooff = rest_Ooff;
+     pln->cld3 = cld3;
+
+     MPI_Comm_dup(p->comm, &pln->comm);
+
+     /* Compute sizes/offsets of blocks to send for all-to-all command. */
+     sbs = (int *) MALLOC(4 * n_pes * sizeof(int), PLANS);
+     sbo = sbs + n_pes;
+     rbs = sbo + n_pes;
+     rbo = rbs + n_pes;
+     b = XM(block)(p->nx, p->block, my_pe);
+     bt = XM(block)(p->ny, p->tblock, my_pe);
+     for (pe = 0; pe < n_pes; ++pe) {
+     INT db, dbt; /* destination block sizes */
+     db = XM(block)(p->nx, p->block, pe);
+     dbt = XM(block)(p->ny, p->tblock, pe);
+     if (db != p->block || dbt != p->tblock)
+          equal_blocks = 0;
+
+     /* MPI requires type "int" here; apparently it
+        has no 64-bit API?  Grrr. */
+     sbs[pe] = (int) (b * dbt * vn);
+     sbo[pe] = (int) (pe * (b * p->tblock) * vn);
+     rbs[pe] = (int) (db * bt * vn);
+     rbo[pe] = (int) (pe * (p->block * bt) * vn);
+     }
+     pln->send_block_sizes = sbs;
+     pln->send_block_offsets = sbo;
+     pln->recv_block_sizes = rbs;
+     pln->recv_block_offsets = rbo;
+     pln->equal_blocks = equal_blocks;
+
+     X(ops_zero)(&pln->super.super.ops);
+     if (cld1) X(ops_add2)(&cld1->ops, &pln->super.super.ops);
+     if (cld2) X(ops_add2)(&cld2->ops, &pln->super.super.ops);
+     if (cld2rest) X(ops_add2)(&cld2rest->ops, &pln->super.super.ops);
+     if (cld3) X(ops_add2)(&cld3->ops, &pln->super.super.ops);
+     /* FIXME: should MPI operations be counted in "other" somehow? */
+
+     return &(pln->super.super);
+
+ nada:
+     X(plan_destroy_internal)(cld3);
+     X(plan_destroy_internal)(cld2rest);
+     X(plan_destroy_internal)(cld2);
+     X(plan_destroy_internal)(cld1);
+     return (plan *) 0;
+}
+
+static solver *mksolver(int copy_transposed_out)
+{
+     static const solver_adt sadt = { PROBLEM_MPI_TRANSPOSE, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     slv->copy_transposed_out = copy_transposed_out;
+     return &(slv->super);
+}
+
+void XM(transpose_alltoall_transposed_register)(planner *p)
+{
+     int cto;
+     for (cto = 0; cto <= 1; ++cto)
+     REGISTER_SOLVER(p, mksolver(cto));
+}
diff -rupN fftw-3.3.3/mpi/transpose-pairwise-transposed.c fftw-3.3.3-patched/mpi/transpose-pairwise-transposed.c
--- fftw-3.3.3/mpi/transpose-pairwise-transposed.c  1970-01-01 01:00:00.000000000 +0100
+++ fftw-3.3.3-patched/mpi/transpose-pairwise-transposed.c  2013-03-28 13:49:35.000000000 +0100
@@ -0,0 +1,510 @@
+/*
+ * Copyright (c) 2003, 2007-11 Matteo Frigo
+ * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
+ * Copyright (c) 2012 Michael Pippig
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+/* Distributed transposes using a sequence of carefully scheduled
+   pairwise exchanges.  This has the advantage that it can be done
+   in-place, or out-of-place while preserving the input, using buffer
+   space proportional to the local size divided by the number of
+   processes (i.e. to the total array size divided by the number of
+   processes squared). */
+
+#include "mpi-transpose.h"
+#include <string.h>
+
+typedef struct {
+     solver super;
+     int preserve_input; /* preserve input even if DESTROY_INPUT was passed */
+} S;
+
+typedef struct {
+     plan_mpi_transpose super;
+
+     plan *cld1, *cld2, *cld2rest, *cld3;
+     INT rest_Ioff, rest_Ooff;
+     
+     int n_pes, my_pe, *sched;
+     INT *send_block_sizes, *send_block_offsets;
+     INT *recv_block_sizes, *recv_block_offsets;
+     MPI_Comm comm;
+     int preserve_input;
+} P;
+
+static void transpose_chunks(int *sched, int n_pes, int my_pe,
+                INT *sbs, INT *sbo, INT *rbs, INT *rbo,
+                MPI_Comm comm,
+                R *I, R *O)
+{
+     if (sched) {
+     int i;
+
+     /* TODO: explore non-synchronous send/recv? */
+
+     if (I == O) {
+          R *buf = (R*) MALLOC(sizeof(R) * sbs[0], BUFFERS);
+          
+          for (i = 0; i < n_pes; ++i) {
+           int pe = sched[i];
+           if (my_pe == pe) {
+            if (rbo[pe] != sbo[pe])
+                 memmove(O + rbo[pe], O + sbo[pe],
+                     sbs[pe] * sizeof(R));
+           }
+           else {
+            memcpy(buf, O + sbo[pe], sbs[pe] * sizeof(R));
+            MPI_Sendrecv(buf, (int) (sbs[pe]), FFTW_MPI_TYPE,
+                     pe, (my_pe * n_pes + pe) & 0xffff,
+                     O + rbo[pe], (int) (rbs[pe]),
+                     FFTW_MPI_TYPE,
+                     pe, (pe * n_pes + my_pe) & 0xffff,
+                     comm, MPI_STATUS_IGNORE);
+           }
+          }
+
+          X(ifree)(buf);
+     }
+     else { /* I != O */
+          for (i = 0; i < n_pes; ++i) {
+           int pe = sched[i];
+           if (my_pe == pe)
+            memcpy(O + rbo[pe], I + sbo[pe], sbs[pe] * sizeof(R));
+           else
+            MPI_Sendrecv(I + sbo[pe], (int) (sbs[pe]),
+                     FFTW_MPI_TYPE,
+                     pe, (my_pe * n_pes + pe) & 0xffff,
+                     O + rbo[pe], (int) (rbs[pe]),
+                     FFTW_MPI_TYPE,
+                     pe, (pe * n_pes + my_pe) & 0xffff,
+                     comm, MPI_STATUS_IGNORE);
+          }
+     }
+     }
+}
+
+/* transpose locally to get contiguous chunks
+   this may take two transposes if the block sizes are unequal
+   (3 subplans, two of which operate on disjoint data) */
+static void apply_pretranspose(
+    const P *ego, R *I, R *O
+    )
+{
+  plan_rdft *cld2, *cld2rest, *cld3;
+
+  cld3 = (plan_rdft *) ego->cld3;
+  if (cld3)
+       cld3->apply(ego->cld3, O, O);
+  /* else TRANSPOSED_IN is true and user wants I transposed */
+
+  cld2 = (plan_rdft *) ego->cld2;
+  cld2->apply(ego->cld2, I, O);
+  cld2rest = (plan_rdft *) ego->cld2rest;
+  if (cld2rest) {
+       cld2rest->apply(ego->cld2rest,
+                  I + ego->rest_Ioff, O + ego->rest_Ooff);
+  }
+}
+
+static void apply(const plan *ego_, R *I, R *O)
+{
+     const P *ego = (const P *) ego_;
+     plan_rdft *cld1 = (plan_rdft *) ego->cld1;
+     
+     if (cld1) {
+          /* transpose locally to get contiguous chunks */
+          apply_pretranspose(ego, I, O);
+
+          if(ego->preserve_input) I = O;
+
+          /* transpose chunks globally */
+          transpose_chunks(ego->sched, ego->n_pes, ego->my_pe,
+                           ego->send_block_sizes, ego->send_block_offsets,
+              ego->recv_block_sizes, ego->recv_block_offsets,
+              ego->comm, O, I);
+
+          /* transpose locally to get non-transposed output */
+          cld1->apply(ego->cld1, I, O);
+     } /* else TRANSPOSED_OUT is true and user wants O transposed */
+     else if (ego->preserve_input) {
+          /* transpose locally to get contiguous chunks */
+          apply_pretranspose(ego, I, O);
+
+          /* transpose chunks globally */
+          transpose_chunks(ego->sched, ego->n_pes, ego->my_pe,
+                          ego->send_block_sizes, ego->send_block_offsets,
+              ego->recv_block_sizes, ego->recv_block_offsets,
+              ego->comm, O, O);
+     }
+     else {
+          /* transpose locally to get contiguous chunks */
+          apply_pretranspose(ego, I, I);
+
+          /* transpose chunks globally */
+          transpose_chunks(ego->sched, ego->n_pes, ego->my_pe,
+                          ego->send_block_sizes, ego->send_block_offsets,
+              ego->recv_block_sizes, ego->recv_block_offsets,
+              ego->comm, I, O);
+     }
+}
+
+static int applicable(const S *ego, const problem *p_,
+             const planner *plnr)
+{
+     const problem_mpi_transpose *p = (const problem_mpi_transpose *) p_;
+     /* Note: this is *not* UGLY for out-of-place, destroy-input plans;
+   the planner often prefers transpose-pairwise to transpose-alltoall,
+   at least with LAM MPI on my machine. */
+     return (1
+        && (!ego->preserve_input || (!NO_DESTROY_INPUTP(plnr)
+                     && p->I != p->O))
+        && ONLY_TRANSPOSEDP(p->flags));
+}
+
+static void awake(plan *ego_, enum wakefulness wakefulness)
+{
+     P *ego = (P *) ego_;
+     X(plan_awake)(ego->cld1, wakefulness);
+     X(plan_awake)(ego->cld2, wakefulness);
+     X(plan_awake)(ego->cld2rest, wakefulness);
+     X(plan_awake)(ego->cld3, wakefulness);
+}
+
+static void destroy(plan *ego_)
+{
+     P *ego = (P *) ego_;
+     X(ifree0)(ego->sched);
+     X(ifree0)(ego->send_block_sizes);
+     MPI_Comm_free(&ego->comm);
+     X(plan_destroy_internal)(ego->cld3);
+     X(plan_destroy_internal)(ego->cld2rest);
+     X(plan_destroy_internal)(ego->cld2);
+     X(plan_destroy_internal)(ego->cld1);
+}
+
+static void print(const plan *ego_, printer *p)
+{
+     const P *ego = (const P *) ego_;
+     p->print(p, "(mpi-transpose-pairwise-transposed%s%(%p%)%(%p%)%(%p%)%(%p%))", 
+         ego->preserve_input==2 ?"/p":"",
+         ego->cld1, ego->cld2, ego->cld2rest, ego->cld3);
+}
+
+/* Given a process which_pe and a number of processes npes, fills
+   the array sched[npes] with a sequence of processes to communicate
+   with for a deadlock-free, optimum-overlap all-to-all communication.
+   (All processes must call this routine to get their own schedules.)
+   The schedule can be re-ordered arbitrarily as long as all processes
+   apply the same permutation to their schedules.
+
+   The algorithm here is based upon the one described in:
+       J. A. M. Schreuder, "Constructing timetables for sport
+       competitions," Mathematical Programming Study 13, pp. 58-67 (1980). 
+   In a sport competition, you have N teams and want every team to
+   play every other team in as short a time as possible (maximum overlap
+   between games).  This timetabling problem is therefore identical
+   to that of an all-to-all communications problem.  In our case, there
+   is one wrinkle: as part of the schedule, the process must do
+   some data transfer with itself (local data movement), analogous
+   to a requirement that each team "play itself" in addition to other
+   teams.  With this wrinkle, it turns out that an optimal timetable
+   (N parallel games) can be constructed for any N, not just for even
+   N as in the original problem described by Schreuder.
+*/
+static void fill1_comm_sched(int *sched, int which_pe, int npes)
+{
+     int pe, i, n, s = 0;
+     A(which_pe >= 0 && which_pe < npes);
+     if (npes % 2 == 0) {
+     n = npes;
+     sched[s++] = which_pe;
+     }
+     else
+     n = npes + 1;
+     for (pe = 0; pe < n - 1; ++pe) {
+     if (npes % 2 == 0) {
+          if (pe == which_pe) sched[s++] = npes - 1;
+          else if (npes - 1 == which_pe) sched[s++] = pe;
+     }
+     else if (pe == which_pe) sched[s++] = pe;
+
+     if (pe != which_pe && which_pe < n - 1) {
+          i = (pe - which_pe + (n - 1)) % (n - 1);
+          if (i < n/2)
+           sched[s++] = (pe + i) % (n - 1);
+          
+          i = (which_pe - pe + (n - 1)) % (n - 1);
+          if (i < n/2)
+           sched[s++] = (pe - i + (n - 1)) % (n - 1);
+     }
+     }
+     A(s == npes);
+}
+
+/* Sort the communication schedule sched for npes so that the schedule
+   on process sortpe is ascending or descending (!ascending).  This is
+   necessary to allow in-place transposes when the problem does not
+   divide equally among the processes.  In this case there is one
+   process where the incoming blocks are bigger/smaller than the
+   outgoing blocks and thus have to be received in
+   descending/ascending order, respectively, to avoid overwriting data
+   before it is sent. */
+static void sort1_comm_sched(int *sched, int npes, int sortpe, int ascending)
+{
+     int *sortsched, i;
+     sortsched = (int *) MALLOC(npes * sizeof(int) * 2, OTHER);
+     fill1_comm_sched(sortsched, sortpe, npes);
+     if (ascending)
+     for (i = 0; i < npes; ++i)
+          sortsched[npes + sortsched[i]] = sched[i];
+     else
+     for (i = 0; i < npes; ++i)
+          sortsched[2*npes - 1 - sortsched[i]] = sched[i];
+     for (i = 0; i < npes; ++i)
+     sched[i] = sortsched[npes + i];
+     X(ifree)(sortsched);
+}
+
+/* make the plans to do the pre-MPI transpositions (shared with
+   transpose-alltoall-transposed) */
+int XM(mkplans_pretranspose)(const problem_mpi_transpose *p, planner *plnr,
+                 R *I, R *O, int my_pe,
+                 plan **cld2, plan **cld2rest, plan **cld3,
+                 INT *rest_Ioff, INT *rest_Ooff)
+{
+     INT vn = p->vn;
+     INT b = XM(block)(p->nx, p->block, my_pe);
+     INT bt = p->tblock;
+     INT nyb = p->ny / bt; /* number of equal-sized blocks */
+     INT nyr = p->ny - nyb * bt; /* leftover rows after equal blocks */
+
+     *cld2 = *cld2rest = *cld3 = NULL;
+     *rest_Ioff = *rest_Ooff = 0;
+
+     if (!(p->flags & TRANSPOSED_IN) && (nyr == 0 || I != O)) {
+     INT ny = p->ny * vn;
+     bt *= vn;
+     *cld2 = X(mkplan_f_d)(plnr, 
+               X(mkproblem_rdft_0_d)(X(mktensor_3d)
+                             (nyb, bt, b * bt,
+                              b, ny, bt,
+                              bt, 1, 1),
+                             I, O),
+               0, 0, NO_SLOW);
+     if (!*cld2) goto nada;
+
+     if (nyr > 0) {
+          *rest_Ioff = nyb * bt;
+          *rest_Ooff = nyb * b * bt;
+          bt = nyr * vn;
+          *cld2rest = X(mkplan_f_d)(plnr,
+                    X(mkproblem_rdft_0_d)(X(mktensor_2d)
+                                  (b, ny, bt,
+                               bt, 1, 1),
+                                  I + *rest_Ioff,
+                                  O + *rest_Ooff),
+                                        0, 0, NO_SLOW);
+               if (!*cld2rest) goto nada;
+     }
+     }
+     else {
+     *cld2 = X(mkplan_f_d)(plnr,
+               X(mkproblem_rdft_0_d)(
+                    X(mktensor_4d)
+                    (nyb, b * bt * vn, b * bt * vn,
+                     b, vn, bt * vn,
+                     bt, b * vn, vn,
+                     vn, 1, 1),
+                    I, O),
+               0, 0, NO_SLOW);
+     if (!*cld2) goto nada;
+
+     *rest_Ioff = *rest_Ooff = nyb * bt * b * vn;
+     *cld2rest = X(mkplan_f_d)(plnr,
+                   X(mkproblem_rdft_0_d)(
+                    X(mktensor_3d)
+                    (b, vn, nyr * vn,
+                     nyr, b * vn, vn,
+                     vn, 1, 1),
+                    I + *rest_Ioff, O + *rest_Ooff),
+                   0, 0, NO_SLOW);
+     if (!*cld2rest) goto nada;
+
+     if (!(p->flags & TRANSPOSED_IN)) {
+          *cld3 = X(mkplan_f_d)(plnr,
+                    X(mkproblem_rdft_0_d)(
+                     X(mktensor_3d)
+                     (p->ny, vn, b * vn,
+                      b, p->ny * vn, vn,
+                      vn, 1, 1),
+                     I, I),
+                    0, 0, NO_SLOW);
+          if (!*cld3) goto nada;
+     }
+     }
+
+     return 1;
+
+nada:
+     X(plan_destroy_internal)(*cld3);
+     X(plan_destroy_internal)(*cld2rest);
+     X(plan_destroy_internal)(*cld2);
+     *cld2 = *cld2rest = *cld3 = NULL;
+     return 0;
+}
+
+static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
+{
+     const S *ego = (const S *) ego_;
+     const problem_mpi_transpose *p;
+     P *pln;
+     plan *cld1 = 0, *cld2 = 0, *cld2rest = 0, *cld3 = 0;
+     INT b, bt, vn, rest_Ioff, rest_Ooff;
+     INT *sbs, *sbo, *rbs, *rbo;
+     int pe, my_pe, n_pes, sort_pe = -1, ascending = 1;
+     R *I, *O;
+     static const plan_adt padt = {
+          XM(transpose_solve), awake, print, destroy
+     };
+
+     UNUSED(ego);
+
+     if (!applicable(ego, p_, plnr))
+          return (plan *) 0;
+
+     p = (const problem_mpi_transpose *) p_;
+     vn = p->vn;
+     I = p->I; O = p->O;
+
+     MPI_Comm_rank(p->comm, &my_pe);
+     MPI_Comm_size(p->comm, &n_pes);
+
+     bt = XM(block)(p->ny, p->tblock, my_pe);
+
+
+     if (ego->preserve_input || NO_DESTROY_INPUTP(plnr)) I = p->O;
+     
+     if (!(p->flags & TRANSPOSED_OUT)) { /* nx x bt x vn -> bt x nx x vn */
+     cld1 = X(mkplan_f_d)(plnr, 
+                  X(mkproblem_rdft_0_d)(X(mktensor_3d)
+                            (bt, vn, p->nx * vn,
+                             p->nx, bt * vn, vn,
+                             vn, 1, 1),
+                            I, O = p->O),
+                  0, 0, NO_SLOW);
+     if (XM(any_true)(!cld1, p->comm)) goto nada;
+
+     }
+     else {
+       if (ego->preserve_input || NO_DESTROY_INPUTP(plnr)) 
+         O = p->O;
+       else
+         O = p->I;
+     }
+
+     if (XM(any_true)(!XM(mkplans_pretranspose)(p, plnr, p->I, O, my_pe,
+                       &cld2, &cld2rest, &cld3,
+                       &rest_Ioff, &rest_Ooff),
+             p->comm)) goto nada;
+
+     pln = MKPLAN_MPI_TRANSPOSE(P, &padt, apply);
+
+     pln->cld1 = cld1;
+     pln->cld2 = cld2;
+     pln->cld2rest = cld2rest;
+     pln->rest_Ioff = rest_Ioff;
+     pln->rest_Ooff = rest_Ooff;
+     pln->cld3 = cld3;
+     pln->preserve_input = ego->preserve_input ? 2 : NO_DESTROY_INPUTP(plnr);
+
+     MPI_Comm_dup(p->comm, &pln->comm);
+
+     n_pes = (int) X(imax)(XM(num_blocks)(p->nx, p->block),
+              XM(num_blocks)(p->ny, p->tblock));
+
+     /* Compute sizes/offsets of blocks to exchange between processors */
+     sbs = (INT *) MALLOC(4 * n_pes * sizeof(INT), PLANS);
+     sbo = sbs + n_pes;
+     rbs = sbo + n_pes;
+     rbo = rbs + n_pes;
+     b = XM(block)(p->nx, p->block, my_pe);
+     bt = XM(block)(p->ny, p->tblock, my_pe);
+     for (pe = 0; pe < n_pes; ++pe) {
+     INT db, dbt; /* destination block sizes */
+     db = XM(block)(p->nx, p->block, pe);
+     dbt = XM(block)(p->ny, p->tblock, pe);
+
+     sbs[pe] = b * dbt * vn;
+     sbo[pe] = pe * (b * p->tblock) * vn;
+     rbs[pe] = db * bt * vn;
+     rbo[pe] = pe * (p->block * bt) * vn;
+
+     if (db * dbt > 0 && db * p->tblock != p->block * dbt) {
+          A(sort_pe == -1); /* only one process should need sorting */
+          sort_pe = pe;
+          ascending = db * p->tblock > p->block * dbt;
+     }
+     }
+     pln->n_pes = n_pes;
+     pln->my_pe = my_pe;
+     pln->send_block_sizes = sbs;
+     pln->send_block_offsets = sbo;
+     pln->recv_block_sizes = rbs;
+     pln->recv_block_offsets = rbo;
+
+     if (my_pe >= n_pes) {
+     pln->sched = 0; /* this process is not doing anything */
+     }
+     else {
+     pln->sched = (int *) MALLOC(n_pes * sizeof(int), PLANS);
+     fill1_comm_sched(pln->sched, my_pe, n_pes);
+     if (sort_pe >= 0)
+          sort1_comm_sched(pln->sched, n_pes, sort_pe, ascending);
+     }
+
+     X(ops_zero)(&pln->super.super.ops);
+     if (cld1) X(ops_add2)(&cld1->ops, &pln->super.super.ops);
+     if (cld2) X(ops_add2)(&cld2->ops, &pln->super.super.ops);
+     if (cld2rest) X(ops_add2)(&cld2rest->ops, &pln->super.super.ops);
+     if (cld3) X(ops_add2)(&cld3->ops, &pln->super.super.ops);
+     /* FIXME: should MPI operations be counted in "other" somehow? */
+
+     return &(pln->super.super);
+
+ nada:
+     X(plan_destroy_internal)(cld3);
+     X(plan_destroy_internal)(cld2rest);
+     X(plan_destroy_internal)(cld2);
+     X(plan_destroy_internal)(cld1);
+     return (plan *) 0;
+}
+
+static solver *mksolver(int preserve_input)
+{
+     static const solver_adt sadt = { PROBLEM_MPI_TRANSPOSE, mkplan, 0 };
+     S *slv = MKSOLVER(S, &sadt);
+     slv->preserve_input = preserve_input;
+     return &(slv->super);
+}
+
+void XM(transpose_pairwise_transposed_register)(planner *p)
+{
+     int preserve_input;
+     for (preserve_input = 0; preserve_input <= 1; ++preserve_input)
+     REGISTER_SOLVER(p, mksolver(preserve_input));
+}
diff -rupN fftw-3.3.3/mpi/transpose-pairwise.c fftw-3.3.3-patched/mpi/transpose-pairwise.c
--- fftw-3.3.3/mpi/transpose-pairwise.c 2013-03-28 13:52:19.000000000 +0100
+++ fftw-3.3.3-patched/mpi/transpose-pairwise.c 2013-03-28 13:49:08.000000000 +0100
@@ -53,7 +53,6 @@ static void transpose_chunks(int *sched,
 {
      if (sched) {
      int i;
-     MPI_Status status;

      /* TODO: explore non-synchronous send/recv? */

@@ -74,7 +73,7 @@ static void transpose_chunks(int *sched,
                      O + rbo[pe], (int) (rbs[pe]),
                      FFTW_MPI_TYPE,
                      pe, (pe * n_pes + my_pe) & 0xffff,
-                     comm, &status);
+                     comm, MPI_STATUS_IGNORE);
            }
           }

@@ -92,7 +91,7 @@ static void transpose_chunks(int *sched,
                      O + rbo[pe], (int) (rbs[pe]),
                      FFTW_MPI_TYPE,
                      pe, (pe * n_pes + my_pe) & 0xffff,
-                     comm, &status);
+                     comm, MPI_STATUS_IGNORE);
           }
      }
      }
@@ -350,6 +349,7 @@ nada:
      X(plan_destroy_internal)(*cld3);
      X(plan_destroy_internal)(*cld2rest);
      X(plan_destroy_internal)(*cld2);
+     *cld2 = *cld2rest = *cld3 = NULL;
      return 0;
 }

EOF

# the patch changed a Makefile.am, so we need to call autoreconf
autoreconf --verbose --install --symlink --force

./configure --prefix=$PREFIX --enable-mpi --disable-shared \
--enable-threads --enable-static --enable-openmp --disable-fortran 2>&1 | tee $LOGFILE

# append (tee -a) so the configure log is not overwritten
make -j 4 2>&1 | tee -a $LOGFILE
make install 2>&1 | tee -a $LOGFILE
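
Assuming the script is saved as install_fftw.sh, a typical invocation is (the prefix path is only an example):

sh install_fftw.sh $HOME/opt/fftw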