Writing a field using large chunks and compression crashes the program #438

Open DusanJovic-NOAA opened 5 months ago

DusanJovic-NOAA commented 5 months ago

The following test program:

$ cat test_netcdf_chunking.F90
!> Abort the MPI job when a netCDF call has failed.
!> If status is not nf90_noerr, the macro writes the source file name, the
!> line number and the nf90_strerror text for status to unit 0, and then
!> calls MPI_Abort on MPI_COMM_WORLD with error code 1.
!> It expands to two separate if statements and uses a variable named ierr
!> that must be declared in the scope where the macro is invoked.
#define NC_ERR_STOP(status) \
    if (status /= nf90_noerr) write(0,*) "file: ", __FILE__, " line: ", __LINE__, trim(nf90_strerror(status)); \
    if (status /= nf90_noerr) call MPI_Abort(MPI_COMM_WORLD,1,ierr)

!> Reproducer: parallel, compressed write of one 3-D field stored as a
!> single chunk that covers the whole variable.
!>
!> Every MPI rank writes its own contiguous slab of lm/nproc levels of the
!> variable "field" (im x jm x lm, NF90_FLOAT) into test.nc using collective
!> parallel access. The chunk size and the deflate setting are the
!> configuration under test and must not be reduced.
program test_netcdf_chunking

  use mpi
  use netcdf

    implicit none

    integer           :: mype, nproc

    ! integer, parameter :: im=3600, jm=1800, lm=256
    integer, parameter :: im=3600, jm=1800, lm=128

    ! Local slab of the field owned by this rank: (im, jm, lm/nproc).
    real, dimension(:,:,:), allocatable   :: array

    integer :: ncerr,ierr
    integer :: ncid
    integer :: oldMode
    integer :: im_dimid, jm_dimid, lm_dimid
    integer :: varid

    call MPI_Init ( ierr )
    call MPI_Comm_rank ( MPI_COMM_WORLD, mype, ierr )
    call MPI_Comm_size ( MPI_COMM_WORLD, nproc, ierr )

    ! The vertical dimension is split evenly across ranks; refuse to run
    ! with a rank count that would leave a remainder.
    if (mod(lm,nproc) /= 0) then
       write(0,*)'MPI_Comm_size=',nproc,' must evenly divide lm=', lm
       call MPI_Abort(MPI_COMM_WORLD,1,ierr)
    end if

    ! Compression and chunking need the netCDF-4 format, so the creation
    ! mode is given explicitly together with the communicator and info.
    ncerr = nf90_create('test.nc',&
            cmode=ior(NF90_CLOBBER,NF90_NETCDF4),&
            comm=MPI_COMM_WORLD, info = MPI_INFO_NULL, ncid=ncid); NC_ERR_STOP(ncerr)

    ! disable auto filling.
    ncerr = nf90_set_fill(ncid, NF90_NOFILL, oldMode); NC_ERR_STOP(ncerr)

    ncerr = nf90_def_dim(ncid, "im", im, im_dimid); NC_ERR_STOP(ncerr)
    ncerr = nf90_def_dim(ncid, "jm", jm, jm_dimid); NC_ERR_STOP(ncerr)
    ncerr = nf90_def_dim(ncid, "lm", lm, lm_dimid); NC_ERR_STOP(ncerr)

    ncerr = nf90_def_var(ncid, "field", NF90_FLOAT, [im_dimid,jm_dimid,lm_dimid], varid) ; NC_ERR_STOP(ncerr)
    ! One chunk spans the entire variable: im*jm*lm = 829,440,000 values.
    ! NOTE(review): at 4 bytes per NF90_FLOAT that is 3,317,760,000 bytes,
    ! more than a 32-bit signed integer can hold; presumably this is what
    ! leads to the negative count seen in MPI_Type_contiguous - confirm.
    ncerr = nf90_def_var_chunking(ncid, varid, NF90_CHUNKED, [im,jm,lm]) ; NC_ERR_STOP(ncerr)
    ! No shuffle filter, deflate enabled at level 1.
    ncerr = nf90_def_var_deflate(ncid, varid, NF90_NOSHUFFLE, 1, 1) ; NC_ERR_STOP(ncerr)
    ncerr = nf90_var_par_access(ncid, varid, NF90_COLLECTIVE); NC_ERR_STOP(ncerr)

    ncerr = nf90_enddef(ncid); NC_ERR_STOP(ncerr)

    ! Allocate this rank's slab before filling it; the array was previously
    ! used without ever being allocated.
    allocate(array(im, jm, lm/nproc))
    call random_number(array)

    ! Rank mype writes levels mype*(lm/nproc)+1 .. (mype+1)*(lm/nproc).
    ncerr = nf90_put_var(ncid, varid, values=array,&
            start=[1,1,(mype*(lm/nproc))+1],&
            count=[im,jm,lm/nproc]); NC_ERR_STOP(ncerr)

    ncerr = nf90_close(ncid=ncid); NC_ERR_STOP(ncerr)

    call MPI_Finalize(ierr)

end program test_netcdf_chunking

crashes with the following error on an HPC system using the Intel compiler and Intel MPI:

0: Abort(671744002) on node 0 (rank 0 in comm 0): Fatal error in PMPI_Type_contiguous: Invalid count, error stack:
0: PMPI_Type_contiguous(271): MPI_Type_contiguous(count=-1299292371, MPI_BYTE, new_type_p=0x7fff1d3777d4) failed
0: PMPI_Type_contiguous(238): Negative count, value is -1299292371
0: slurmstepd: error: *** STEP 58146089.1 ON h21c07 CANCELLED AT 2024-04-10T22:45:59 ***
srun: Job step aborted: Waiting up to 32 seconds for job step to finish.
srun: error: h21c07: tasks 0-3: Killed
srun: Terminating StepId=58146089.1

and on my laptop using gcc/13 and either OpenMPI or mpich:

$ mpirun -n 4 ./test_netcdf_chunking
[fedora:1007948] *** An error occurred in MPI_Type_contiguous
[fedora:1007948] *** reported by process [2256207873,0]
[fedora:1007948] *** on communicator MPI_COMM_WORLD
[fedora:1007948] *** MPI_ERR_COUNT: invalid count argument
[fedora:1007948] *** MPI_ERRORS_ARE_FATAL (processes in this communicator will now abort,
[fedora:1007948] ***    and potentially your MPI job)
$ mpirun -n 4 ./test_netcdf_chunking
Abort(205157890) on node 0 (rank 0 in comm 0): Fatal error in internal_Type_contiguous: Invalid count, error stack:
internal_Type_contiguous(75): MPI_Type_contiguous(count=-1311205156, MPI_BYTE, newtype=0x7ffdbae07a20) failed
internal_Type_contiguous(43): Negative count, value is -1311205156