E3SM-Project / scorpio

A high-level Parallel I/O Library for structured grid applications
18 stars 16 forks source link

A long ne30+FC5AV1C-L run fails with Scorpio on Compy #319

Open jayeshkrishna opened 4 years ago

jayeshkrishna commented 4 years ago

@JS-WRF-SBM reported this issue when running ne30 + FC5AV1C-L for 1+ years on Compy. The run fails with the following error:

1080: PIO: FATAL ERROR: Aborting... An error occured, Writing variables (number of variables = 374) to file (compy_FC5AV1C-L_ne30_SCREAM-github-16thMay20-wrk_scream_p3.cam.h0.2011-03.nc, ncid=459) using PIO_IOTYPE_PNETCDF iotype failed. Non blocking write for variable (WD_H2O2, varid=385) failed (Number of s. err=-60. Aborting since the error handler was set to PIO_INTERNAL_ERROR... (/qfs/people/shpu881/E3SM/SCREAM-github-16thMay20-wrk/externals/scorpio/src/clib/pio_darray_int.c: 389)

The script below (from @JS-WRF-SBM) reproduces the issue:

#!/bin/csh
# Driver script: optionally creates, sets up, builds and submits the case
# configured below. Each stage is guarded by one of the flags set here.

# Print a start timestamp (runs before command tracing is switched on).
date

# Set the csh `echo` and `verbose` shell variables so that commands are
# printed as the script executes them.
set echo verbose

# Stage switches: a stage runs only when its flag is > 0.
# NOTE(review): fetch_code is not read anywhere in the visible script.
set fetch_code              = 0   # 0 = No, >0 = Yes
set create_newcase          = 1
set case_setup              = 1
set case_build_all          = 1   # 0 = No, >0 = Yes
set case_build_incremental = 0
set case_run                = 1   # 0 = No, >0 = Yes

####################################################################
# Source tree
####################################################################
# CCSMTAG names the checkout; CCSMROOT is the path to that checkout.
# Alternative kept for reference:
#setenv CCSMTAG E3SM_20190418
setenv CCSMTAG  SCREAM-github-16thMay20-wrk
setenv CCSMROOT /qfs/people/shpu881/E3SM/${CCSMTAG}
# Alternative kept for reference:
#setenv CCSMROOT /compyfs/${USER}/${CCSMTAG}

####################################################################
# Machine, compset, resolution and PE layout
####################################################################

# Other compsets left commented out by the author:
#   F20TRC5-CMIP6, F2010C5-CMIP6-LR, FSCREAM-LR
setenv COMPSET    FC5AV1C-L
setenv RESOLUTION ne30_ne30
setenv MRES       ne30
setenv MACH       compy
setenv PTMP       /compyfs/${USER}/bld

# Task and thread counts applied to every component during case setup.
setenv ntasks 1600
setenv nthrds 1

# Optional source-modification directories (disabled).
#setenv MYSRC     ${CCSMROOT}/mods_v1_p3_cmdv
#setenv MYCLM     ${CCSMROOT}/mods_clm

# Case name; COMCASE is set to the same string as CASE.
setenv CASE    ${MACH}_${COMPSET}_${MRES}_${CCSMTAG}_scream_p3
setenv COMCASE ${MACH}_${COMPSET}_${MRES}_${CCSMTAG}_scream_p3

# Case directory (inside the source tree) and run directory.
setenv CASEROOT ${CCSMROOT}/cases/${CASE}
setenv RUNDIR   /compyfs/${USER}/csmruns/${CASE}

####################################################################
# Create the case (removes any existing case directory first)
####################################################################
if ( $create_newcase > 0 ) then

   # Start from a clean slate: delete a previous case directory, if any.
   rm -rf ${CASEROOT}

   # create_newcase is invoked from the CIME scripts directory.
   cd ${CCSMROOT}/cime/scripts

   ./create_newcase --case ${CASEROOT} \
                    --project e3sm \
                    --mach ${MACH} \
                    --res ${RESOLUTION} \
                    --compset ${COMPSET}
endif

#====================================================================
# Set up case: run directory, PE layout, wallclock/queue, case.setup
#====================================================================
if ($case_setup > 0) then

   cd ${CASEROOT}

   ./xmlchange -file env_run.xml   -id RUNDIR  -val ${RUNDIR}

   # Give every component the same layout: ${ntasks} tasks, ${nthrds}
   # threads, root PE 0. The loop issues the same xmlchange calls, in the
   # same order, as writing the three settings out once per component.
   foreach comp (ATM LND ROF ICE OCN GLC WAV CPL)
      ./xmlchange -file env_mach_pes.xml -id NTASKS_${comp} -val ${ntasks}
      ./xmlchange -file env_mach_pes.xml -id NTHRDS_${comp} -val ${nthrds}
      ./xmlchange -file env_mach_pes.xml -id ROOTPE_${comp} -val '0'
   end

   ./xmlchange -file env_workflow.xml -id JOB_WALLCLOCK_TIME -val '10:00:00'
   # The flag must be spelled with an ASCII hyphen ("-val"); an en dash
   # here is not recognised as an option by xmlchange.
   ./xmlchange -file env_workflow.xml -id JOB_QUEUE -val 'slurm'

   # Clean any previous setup, then set the case up again.
   ./case.setup --clean
   ./case.setup
endif

#====================================================================
# Full build: set build options, then build the case
#====================================================================
if ($case_build_all > 0) then

    cd ${CASEROOT}

    # Optional source modifications (disabled).
    # ln -s ${MYSRC}/* SourceMods/src.cam    # put your mods in here
    # ln -s ${MYCLM}/* SourceMods/src.clm    # put your mods in here

    # Turn DEBUG off and append ' -cosp' to the CAM configure options.
    ./xmlchange -file env_build.xml -id DEBUG                   -val 'FALSE'
    ./xmlchange -file env_build.xml -id CAM_CONFIG_OPTS -append -val ' -cosp'

    # Still in ${CASEROOT} from the cd above, so no second cd is needed.
    #./case.build --clean-all
    ./case.build

endif

# Incremental build: rebuild the case without changing any build options.
if ( $case_build_incremental > 0 ) then
    cd ${CASEROOT}
    ./case.build
endif

#####################################################################
# Run: configure the simulation in env_run.xml, then submit the job
#####################################################################
if ( $case_run > 0 ) then

   cd ${CASEROOT}

   # Start date and run length: 1825 days (5 x 365).
   ./xmlchange -file env_run.xml -id RUN_STARTDATE -val '2010-01-01'
   #./xmlchange -file env_run.xml -id RESUBMIT -val '0'
   ./xmlchange -file env_run.xml -id STOP_N -val '1825'
   ./xmlchange -file env_run.xml -id STOP_OPTION -val 'ndays'

   # Restart interval: every 30 days.
   ./xmlchange -file env_run.xml -id REST_N -val '30'
   ./xmlchange -file env_run.xml -id REST_OPTION -val 'ndays'

   ./xmlchange -file env_run.xml -id DOUT_S -val 'FALSE'

   # Optional overrides (disabled): PIO I/O type and NetCDF format,
   # wallclock limit, and queue.
   #./xmlchange -file env_run.xml -id PIO_TYPENAME -val 'netcdf'
   #./xmlchange -file env_run.xml -id PIO_NETCDF_FORMAT -val '64bit_data'
   #./xmlchange -file env_workflow.xml -id JOB_WALLCLOCK_TIME -val '12:00:00'
   #./xmlchange -file env_workflow.xml -id USER_REQUESTED_QUEUE -val 'slurm'

   # Submit the job from the case directory.
   ./case.submit

endif
fdongyu commented 2 years ago

Hello, has this issue been resolved? I am seeing a similar error when running on Cori. Any suggestions would be helpful. Thanks in advance!