easybuilders / easybuild-easyconfigs

A collection of easyconfig files that describe which software to build using which build options with EasyBuild.
https://easybuild.io
GNU General Public License v2.0
380 stars 703 forks source link

FFTW 3.3.10 iimpi 2022a MPI gets stuck in the testing phase when the tests use more than 1 core #17581

Open satishskamath opened 1 year ago

satishskamath commented 1 year ago
[satishk@int6 mpi]$ make check
make  check-am
make[1]: Entering directory '/gpfs/scratch1/shared/satishk/fftw-3.3.10/mpi'
make  check-local
make[2]: Entering directory '/gpfs/scratch1/shared/satishk/fftw-3.3.10/mpi'
perl -w ../tests/check.pl --verbose --random --maxsize=10000 -c=10  --mpi "mpirun -np 1 `pwd`/mpi-bench"
Executing "mpirun -np 1 /scratch-shared/satishk/fftw-3.3.10/mpi/mpi-bench --verbose=1   --verify 'okd10o01x72b' --verify 'ikd10o01x72b' --verify 'okd]12o11x9e10v15' --verify 'ikd]12
o11x9e10v15' --verify 'okd]3e00x5o01x6e11x2e00' --verify 'ikd]3e00x5o01x6e11x2e00' --verify 'obr[6x7x7x6' --verify 'ibr[6x7x7x6' --verify 'obc[6x7x7x6' --verify 'ibc[6x7x7x6' --veri
fy 'ofc[6x7x7x6' --verify 'ifc[6x7x7x6' --verify 'obrd10x4x10' --verify 'ibrd10x4x10' --verify 'ofrd10x4x10' --verify 'ifrd10x4x10' --verify 'obcd10x4x10' --verify 'ibcd10x4x10' --v
erify 'ofcd10x4x10' --verify 'ifcd10x4x10' --verify 'ok12o01x2e01x7e01x10o11v2' --verify 'ik12o01x2e01x7e01x10o11v2' --verify 'ofr]9x11' --verify 'ifr]9x11' --verify 'obc]9x11' --ve
rify 'ibc]9x11' --verify 'ofc]9x11' --verify 'ifc]9x11' --verify 'ofr]10x7' --verify 'ifr]10x7' --verify 'obc]10x7' --verify 'ibc]10x7' --verify 'ofc]10x7' --verify 'ifc]10x7'"
sh: mpirun: command not found
FAILED mpirun -np 1 /scratch-shared/satishk/fftw-3.3.10/mpi/mpi-bench:  --verify 'okd10o01x72b' --verify 'ikd10o01x72b' --verify 'okd]12o11x9e10v15' --verify 'ikd]12o11x9e10v15' --v
erify 'okd]3e00x5o01x6e11x2e00' --verify 'ikd]3e00x5o01x6e11x2e00' --verify 'obr[6x7x7x6' --verify 'ibr[6x7x7x6' --verify 'obc[6x7x7x6' --verify 'ibc[6x7x7x6' --verify 'ofc[6x7x7x6'
 --verify 'ifc[6x7x7x6' --verify 'obrd10x4x10' --verify 'ibrd10x4x10' --verify 'ofrd10x4x10' --verify 'ifrd10x4x10' --verify 'obcd10x4x10' --verify 'ibcd10x4x10' --verify 'ofcd10x4x
10' --verify 'ifcd10x4x10' --verify 'ok12o01x2e01x7e01x10o11v2' --verify 'ik12o01x2e01x7e01x10o11v2' --verify 'ofr]9x11' --verify 'ifr]9x11' --verify 'obc]9x11' --verify 'ibc]9x11' 
--verify 'ofc]9x11' --verify 'ifc]9x11' --verify 'ofr]10x7' --verify 'ifr]10x7' --verify 'obc]10x7' --verify 'ibc]10x7' --verify 'ofc]10x7' --verify 'ifc]10x7'
make[2]: *** [Makefile:993: check-local] Error 1
make[2]: Leaving directory '/gpfs/scratch1/shared/satishk/fftw-3.3.10/mpi'
make[1]: *** [Makefile:786: check-am] Error 2
make[1]: Leaving directory '/gpfs/scratch1/shared/satishk/fftw-3.3.10/mpi'
make: *** [Makefile:788: check] Error 2
[satishk@int6 mpi]$ module load 2022
[satishk@int6 mpi]$ module load iimpi/2022a 
[satishk@int6 mpi]$ make check
make  check-am
make[1]: Entering directory '/gpfs/scratch1/shared/satishk/fftw-3.3.10/mpi'
make  check-local
make[2]: Entering directory '/gpfs/scratch1/shared/satishk/fftw-3.3.10/mpi'
perl -w ../tests/check.pl --verbose --random --maxsize=10000 -c=10  --mpi "mpirun -np 1 `pwd`/mpi-bench"
Executing "mpirun -np 1 /scratch-shared/satishk/fftw-3.3.10/mpi/mpi-bench --verbose=1   --verify 'ofc[5x2x4v5' --verify 'ifc[5x2x4v5' --verify 'ok[12e10x10o11x9e00x3e00' --verify 'i
k[12e10x10o11x9e00x3e00' --verify 'ok]5bx16bx8e01x10o10' --verify 'ik]5bx16bx8e01x10o10' --verify 'obr7x39' --verify 'ibr7x39' --verify 'ofr7x39' --verify 'ifr7x39' --verify 'obc7x3
9' --verify 'ibc7x39' --verify 'ofc7x39' --verify 'ifc7x39' --verify 'ok13o01x12bx12e11' --verify 'ik13o01x12bx12e11' --verify 'obr8x12x20' --verify 'ibr8x12x20' --verify 'ofr8x12x2
0' --verify 'ifr8x12x20' --verify 'obc8x12x20' --verify 'ibc8x12x20' --verify 'ofc8x12x20' --verify 'ifc8x12x20' --verify 'ok9o11x15e01x12bx3e01v2' --verify 'ik9o11x15e01x12bx3e01v2
' --verify 'obr13x4x6x8' --verify 'ibr13x4x6x8' --verify 'ofr13x4x6x8' --verify 'ifr13x4x6x8' --verify 'obc13x4x6x8' --verify 'ibc13x4x6x8' --verify 'ofc13x4x6x8' --verify 'ifc13x4x
6x8' --verify 'ok]4o01x14bx10o01x10o01' --verify 'ik]4o01x14bx10o01x10o01' --verify 'obr10x4' --verify 'ibr10x4' --verify 'ofr10x4' --verify 'ifr10x4' --verify 'obc10x4' --verify 'i
bc10x4' --verify 'ofc10x4' --verify 'ifc10x4'"
MPI startup(): Warning: I_MPI_PMI_LIBRARY will be ignored since the hydra process manager was found
ofc[5x2x4v5 2.86185e-16 3.3704e-16 3.42382e-16
ifc[5x2x4v5 3.18851e-16 3.3704e-16 3.42469e-16
ok[12e10x10o11x9e00x3e00 4.06264e-16 3.14051e-14 7.01813e-16
ik[12e10x10o11x9e00x3e00 5.32723e-16 3.87274e-14 4.61566e-16
ok]5bx16bx8e01x10o10 3.66698e-16 5.68434e-14 4.46936e-16
ik]5bx16bx8e01x10o10 3.48889e-16 8.91232e-14 4.05621e-16
obr7x39 4.21147e-16 8.6008e-16 4.21645e-16
ibr7x39 5.25907e-16 6.4506e-16 4.20287e-16
ofr7x39 4.05213e-16 6.4506e-16 7.96314e-16
ifr7x39 4.19171e-16 6.4506e-16 7.53344e-16
obc7x39 3.53707e-16 6.4506e-16 8.82617e-16
ibc7x39 3.77171e-16 6.4506e-16 8.15679e-16
ofc7x39 3.96623e-16 6.4506e-16 8.96221e-16
ifc7x39 4.03986e-16 6.4506e-16 7.67852e-16
ok13o01x12bx12e11 4.86749e-16 1.71341e-14 4.49351e-16
ik13o01x12bx12e11 4.82642e-16 1.76348e-14 4.64008e-16
obr8x12x20 3.80235e-16 1.29727e-15 5.28623e-16
ibr8x12x20 3.99768e-16 1.29727e-15 5.21374e-16
ofr8x12x20 4.1034e-16 1.29727e-15 8.48407e-16
ifr8x12x20 3.68379e-16 1.13511e-15 8.60514e-16
obc8x12x20 4.29003e-16 1.13511e-15 7.75874e-16
ibc8x12x20 3.96171e-16 1.29727e-15 8.54497e-16
ofc8x12x20 4.50204e-16 1.13511e-15 7.59909e-16
ifc8x12x20 3.8462e-16 9.72951e-16 8.74819e-16
ok9o11x15e01x12bx3e01v2 5.25458e-16 6.33155e-14 5.09957e-16
ik9o11x15e01x12bx3e01v2 4.59139e-16 6.6032e-14 5.13709e-16
obr13x4x6x8 3.65894e-16 8.53334e-16 5.16076e-16
ibr13x4x6x8 3.55054e-16 8.53334e-16 5.04552e-16
ofr13x4x6x8 3.55178e-16 5.68889e-16 8.82728e-16
ifr13x4x6x8 4.40036e-16 5.68889e-16 8.80091e-16
obc13x4x6x8 3.32391e-16 8.53334e-16 8.54903e-16
ibc13x4x6x8 4.66775e-16 7.11112e-16 9.62202e-16
ofc13x4x6x8 4.34012e-16 5.68889e-16 9.29134e-16
ifc13x4x6x8 4.65314e-16 8.53334e-16 9.0392e-16
ok]4o01x14bx10o01x10o01 4.41587e-16 2.34666e-14 5.30994e-16
ik]4o01x14bx10o01x10o01 4.08943e-16 2.40825e-14 5.18843e-16
obr10x4 3.84017e-16 4.213e-16 2.60873e-16
ibr10x4 3.19984e-16 4.213e-16 3.52765e-16
ofr10x4 2.73929e-16 2.80867e-16 3.54971e-16
ifr10x4 2.40833e-16 2.80867e-16 2.9243e-16
obc10x4 3.97692e-16 4.213e-16 3.33344e-16
ibc10x4 3.53257e-16 4.213e-16 4.39754e-16
ofc10x4 2.51045e-16 4.213e-16 2.89078e-16
ifc10x4 2.92424e-16 4.213e-16 4.71599e-16
Executing "mpirun -np 1 /scratch-shared/satishk/fftw-3.3.10/mpi/mpi-bench --verbose=1   --verify 'okd[8hx4o11x5e01x11o01' --verify 'ikd[8hx4o11x5e01x11o01' --verify 'obr[5x2x4v5' --
verify 'ibr[5x2x4v5' --verify 'obc[5x2x4v5' --verify 'ibc[5x2x4v5'"
MPI startup(): Warning: I_MPI_PMI_LIBRARY will be ignored since the hydra process manager was found
okd[8hx4o11x5e01x11o01 4.68273e-16 1.38101e-14 6.04121e-16
ikd[8hx4o11x5e01x11o01 4.28172e-16 1.7772e-14 4.84704e-16
obr[5x2x4v5 3.36356e-16 3.3704e-16 2.641e-16
ibr[5x2x4v5 2.92387e-16 5.0556e-16 2.8255e-16
obc[5x2x4v5 2.73883e-16 3.3704e-16 3.73067e-16
ibc[5x2x4v5 2.80644e-16 3.3704e-16 3.43517e-16
--------------------------------------------------------------
     MPI FFTW transforms passed 10 tests, 1 CPU
--------------------------------------------------------------
perl -w ../tests/check.pl --verbose --random --maxsize=10000 -c=10  --mpi "mpirun -np 2 `pwd`/mpi-bench"
Executing "mpirun -np 2 /scratch-shared/satishk/fftw-3.3.10/mpi/mpi-bench --verbose=1   --verify 'ok]6e00x9hx5e01x10e11' --verify 'ik]6e00x9hx5e01x10e11' --verify 'obr24x11x8' --ver
ify 'ibr24x11x8' --verify 'ofr24x11x8' --verify 'ifr24x11x8' --verify 'obc24x11x8' --verify 'ibc24x11x8' --verify 'ofc24x11x8' --verify 'ifc24x11x8' --verify 'ok]10o00x6o11x8e00' --
verify 'ik]10o00x6o11x8e00' --verify 'okd9o11x88b' --verify 'ikd9o11x88b' --verify 'ok[70o11x36h' --verify 'ik[70o11x36h' --verify 'ok[12e01x11o01' --verify 'ik[12e01x11o01' --verif
y 'ofrd]5x11v13' --verify 'ifrd]5x11v13' --verify 'obcd]5x11v13' --verify 'ibcd]5x11v13' --verify 'ofcd]5x11v13' --verify 'ifcd]5x11v13' --verify 'obrd[10x3x4x4' --verify 'ibrd[10x3
x4x4' --verify 'obcd[10x3x4x4' --verify 'ibcd[10x3x4x4' --verify 'ofcd[10x3x4x4' --verify 'ifcd[10x3x4x4' --verify 'obr[77x30' --verify 'ibr[77x30' --verify 'obc[77x30' --verify 'ib
c[77x30' --verify 'ofc[77x30' --verify 'ifc[77x30' --verify 'okd[18o01x3o10x10o11' --verify 'ikd[18o01x3o10x10o11' --verify 'ofr]12x5x5v4' --verify 'ifr]12x5x5v4' --verify 'obc]12x5
x5v4' --verify 'ibc]12x5x5v4' --verify 'ofc]12x5x5v4' --verify 'ifc]12x5x5v4'"
MPI startup(): Warning: I_MPI_PMI_LIBRARY will be ignored since the hydra process manager was found
MPI startup(): Warning: I_MPI_PMI_LIBRARY will be ignored since the hydra process manager was found

^C[mpiexec@int6] Sending Ctrl-C to processes as requested
[mpiexec@int6] Press Ctrl-C again to force abort

The test gets stuck with more than 1 MPI process. This is on AMD zen2 7H12.

boegel commented 1 year ago

Hmm, I haven't seen this myself...

Do you have some more information about the environment in which you're seeing this? Is it in an interactive Slurm job? If so, which Slurm version, how was the session started?

satishskamath commented 1 year ago

It is related to this thread on Slack: https://easybuild.slack.com/archives/C34UA1HT7/p1677104754651969

satishskamath commented 1 year ago

> Hmm, I haven't seen this myself...
>
> Do you have some more information about the environment in which you're seeing this? Is it in an interactive Slurm job? If so, which Slurm version, how was the session started?

This is not a Slurm job. I tried installing it without EasyBuild and got the same result. Therefore, something seems to be wrong with the installation of either the MPI library or the Intel compilers. I will report this on the FFTW GitHub as well.

boegel commented 1 year ago

Any chance this is related to #15651?