[Bug]: TC Analysis failing for complete_run test

forsyth2 commented 4 months ago

What happened?

complete_run can't finish because tc_analysis_1850-1851 is not completing successfully. The status file says "RUNNING" but there is no corresponding job running. The .o file shows:

Parameters:
  --res <integer> [30] 
  --file <string> ["/lcrc/globalscratch/ac.forsyth2//tc-analysis_1850_1851/outCSne30.g"] 
  --out_format <string> ["Netcdf4"] 
  --alt <bool> [true] 
=========================================================
..Generating mesh with resolution [30]
..Writing mesh to file [/lcrc/globalscratch/ac.forsyth2//tc-analysis_1850_1851/outCSne30.g] 
Nodes per element
..Block 1 (4 nodes): 5400
NetCDF: HDF error
/var/spool/slurmd/job552617/slurm_script: line 80: 1328058 Killed                  GenerateCSMesh --res $res --alt --file ${result_dir}outCSne$res.g

Furthermore, this is happening on two different pull requests: #607/#612 and #421/#602 (interestingly, this was happening on only one of the two testing branches).

What machine were you running on?

Chrysalis

Environment

zppy_dev_n600

What command did you run?

zppy -c tests/integration/generated/test_complete_run_chrysalis.cfg

Copy your cfg file

### 1st cfg with this issue
[default]
case = v2.LR.historical_0201
constraint = ""
dry_run = "True"
environment_commands = ""
input = "/lcrc/group/e3sm/ac.forsyth2//E3SMv2/v2.LR.historical_0201"
input_subdir = archive/atm/hist
mapping_file = "map_ne30pg2_to_cmip6_180x360_aave.20200201.nc"
# To run this test, edit `output` and `www` in this file, along with `actual_images_dir` in test_complete_run.py
output = "/lcrc/group/e3sm/ac.forsyth2/zppy_test_complete_run_output/issue-421-post-600-1st-commit-only/v2.LR.historical_0201"
partition = "debug"
qos = "regular"
www = "/lcrc/group/e3sm/public_html/diagnostic_output/ac.forsyth2/zppy_test_complete_run_www/issue-421-post-600-1st-commit-only"

[climo]
active = True
walltime = "00:30:00"
years = "1850:1854:2", "1850:1854:4",

  [[ atm_monthly_180x360_aave ]]
  frequency = "monthly"
  input_files = "eam.h0"
  input_subdir = "archive/atm/hist"
  vars = ""

  [[ atm_monthly_diurnal_8xdaily_180x360_aave ]]
  frequency = "diurnal_8xdaily"
  input_files = "eam.h4"
  input_subdir = "archive/atm/hist"
  vars = "PRECT"

  [[ land_monthly_climo ]]
  frequency = "monthly"
  input_files = "elm.h0"
  input_subdir = "archive/lnd/hist"
  vars = ""

[ts]
active = True
e3sm_to_cmip_environment_commands = ""
walltime = "00:30:00"
years = "1850:1854:2",

  [[ atm_monthly_180x360_aave ]]
  frequency = "monthly"
  input_files = "eam.h0"
  input_subdir = "archive/atm/hist"
  ts_fmt = "cmip"

  [[ atm_daily_180x360_aave ]]
  frequency = "daily"
  input_files = "eam.h1"
  input_subdir = "archive/atm/hist"
  vars = "PRECT"

  [[ atm_monthly_glb ]]
  # Note global average won't work for 3D variables.
  frequency = "monthly"
  input_files = "eam.h0"
  input_subdir = "archive/atm/hist"
  mapping_file = "glb"
  years = "1850:1860:5",

  [[ land_monthly ]]
  e3sm_to_cmip_environment_commands = ""
  extra_vars = "landfrac"
  frequency = "monthly"
  input_files = "elm.h0"
  input_subdir = "archive/lnd/hist"
  vars = "LAISHA,LAISUN"
  ts_fmt = "cmip"

  [[ lnd_monthly_glb ]]
  frequency = "monthly"
  input_files = "elm.h0"
  input_subdir = "archive/lnd/hist"
  mapping_file = "glb"
  vars = "LAISHA,LAISUN"
  years = "1850:1860:5",

  [[ rof_monthly ]]
  extra_vars = 'areatotal2'
  frequency = "monthly"
  input_files = "mosart.h0"
  input_subdir = "archive/rof/hist"
  mapping_file = ""
  vars = "RIVER_DISCHARGE_OVER_LAND_LIQ"

[tc_analysis]
active = True
scratch = "/lcrc/globalscratch/ac.forsyth2/"
walltime = "00:30:00"
years = "1850:1854:2",

[e3sm_diags]
active = True
grid = '180x360_aave'
ref_final_yr = 2014
ref_start_yr = 1985
# TODO: this directory is missing OMI-MLS
sets = "lat_lon","zonal_mean_xy","zonal_mean_2d","polar","cosp_histogram","meridional_mean_2d","enso_diags","qbo","diurnal_cycle","annual_cycle_zonal_mean","streamflow", "zonal_mean_2d_stratosphere", "tc_analysis",
short_name = 'v2.LR.historical_0201'
ts_num_years = 2
walltime = "00:30:00"
years = "1850:1854:2", "1850:1854:4",

  [[ atm_monthly_180x360_aave ]]
  climo_diurnal_frequency = "diurnal_8xdaily"
  climo_diurnal_subsection = "atm_monthly_diurnal_8xdaily_180x360_aave"
  partition = "compute"
  qos = "regular"
  sets = "lat_lon","zonal_mean_xy","zonal_mean_2d","polar","cosp_histogram","meridional_mean_2d","enso_diags","qbo","diurnal_cycle","annual_cycle_zonal_mean","streamflow", "zonal_mean_2d_stratosphere",
  walltime = "5:00:00"

  [[ atm_monthly_180x360_aave_environment_commands ]]
  environment_commands = "source /home/ac.forsyth2/miniconda3/etc/profile.d/conda.sh; conda activate e3sm_diags_20240610"
  sets = "qbo",
  ts_subsection = "atm_monthly_180x360_aave"

  [[ atm_monthly_180x360_aave_tc_analysis ]]
  # Running as its own subtask because tc_analysis requires jobs to run sequentially, which slows down testing
  sets = "tc_analysis",
  years = "1850:1852:2",

  [[ atm_monthly_180x360_aave_mvm ]]
  # Test model-vs-model using the same files as the reference
  climo_diurnal_frequency = "diurnal_8xdaily"
  climo_diurnal_subsection = "atm_monthly_diurnal_8xdaily_180x360_aave"
  climo_subsection = "atm_monthly_180x360_aave"
  diff_title = "Difference"
  partition = "compute"
  qos = "regular"
  ref_final_yr = 1851
  ref_name = "v2.LR.historical_0201"
  ref_start_yr = 1850
  ref_years = "1850-1851",
  reference_data_path = "/lcrc/group/e3sm/ac.forsyth2/zppy_test_complete_run_output/issue-421-post-600-1st-commit-only/v2.LR.historical_0201/post/atm/180x360_aave/clim"
  run_type = "model_vs_model"
  short_ref_name = "v2.LR.historical_0201"
  swap_test_ref = False
  tag = "model_vs_model"
  ts_num_years_ref = 2
  ts_subsection = "atm_monthly_180x360_aave"
  walltime = "5:00:00"
  years = "1852-1853",

  [[ lnd_monthly_mvm_lnd ]]
  # Test model-vs-model using the same files as the reference
  climo_subsection = "land_monthly_climo"
  diff_title = "Difference"
  #grid = 'native'
  partition = "compute"
  qos = "regular"
  ref_final_yr = 1851
  ref_name = "v2.LR.historical_0201"
  ref_start_yr = 1850
  ref_years = "1850-1851",
  reference_data_path = "/lcrc/group/e3sm/ac.forsyth2/zppy_test_complete_run_output/issue-421-post-600-1st-commit-only/v2.LR.historical_0201/post/lnd/180x360_aave/clim"
  run_type = "model_vs_model"
  sets = "lat_lon_land",
  short_ref_name = "same simulation"
  swap_test_ref = False
  tag = "model_vs_model"
  ts_num_years_ref = 2

[mpas_analysis]
active = True
anomalyRefYear = 1850
climo_years ="1850-1854", "1855-1860",
enso_years = "1850-1854", "1855-1860",
mesh = "EC30to60E2r2"
parallelTaskCount = 6
partition = "compute"
qos = "regular"
ts_years = "1850-1854", "1850-1860",
walltime = "00:30:00"

[global_time_series]
active = True
climo_years ="1850-1854", "1855-1860",
experiment_name = "v2.LR.historical_0201"
figstr = "v2_historical_0201"
moc_file=mocTimeSeries_1850-1860.nc
plots_lnd = "LAISHA,LAISUN"
ts_num_years = 5
ts_years = "1850-1854", "1850-1860",
walltime = "00:30:00"
years = "1850-1860",

[ilamb]
active = True
grid = '180x360_aave'
nodes = 8
partition = "compute"
short_name = 'v2.LR.historical_0201'
ts_num_years = 2
years = "1850:1854:2",

  [[ land_monthly ]]

### 2nd cfg with this issue
[default]
case = v2.LR.historical_0201
constraint = ""
dry_run = "False"
environment_commands = ""
input = "/lcrc/group/e3sm/ac.forsyth2//E3SMv2/v2.LR.historical_0201"
input_subdir = archive/atm/hist
mapping_file = "map_ne30pg2_to_cmip6_180x360_aave.20200201.nc"
# To run this test, edit `output` and `www` in this file, along with `actual_images_dir` in test_complete_run.py
output = "/lcrc/group/e3sm/ac.forsyth2/zppy_test_complete_run_output/test-607v6/v2.LR.historical_0201"
partition = "debug"
qos = "regular"
www = "/lcrc/group/e3sm/public_html/diagnostic_output/ac.forsyth2/zppy_test_complete_run_www/test-607v6"

[climo]
active = True
walltime = "00:30:00"
years = "1850:1854:2", "1850:1854:4",

  [[ atm_monthly_180x360_aave ]]
  frequency = "monthly"
  input_files = "eam.h0"
  input_subdir = "archive/atm/hist"
  vars = ""

  [[ atm_monthly_diurnal_8xdaily_180x360_aave ]]
  frequency = "diurnal_8xdaily"
  input_files = "eam.h4"
  input_subdir = "archive/atm/hist"
  vars = "PRECT"

  [[ land_monthly_climo ]]
  frequency = "monthly"
  input_files = "elm.h0"
  input_subdir = "archive/lnd/hist"
  vars = ""

[ts]
active = True
e3sm_to_cmip_environment_commands = ""
walltime = "00:30:00"
years = "1850:1854:2",

  [[ atm_monthly_180x360_aave ]]
  frequency = "monthly"
  input_files = "eam.h0"
  input_subdir = "archive/atm/hist"
  ts_fmt = "cmip"

  [[ atm_daily_180x360_aave ]]
  frequency = "daily"
  input_files = "eam.h1"
  input_subdir = "archive/atm/hist"
  vars = "PRECT"

  [[ atm_monthly_glb ]]
  # Note global average won't work for 3D variables.
  frequency = "monthly"
  input_files = "eam.h0"
  input_subdir = "archive/atm/hist"
  mapping_file = "glb"
  years = "1850:1860:5",

  [[ land_monthly ]]
  e3sm_to_cmip_environment_commands = ""
  extra_vars = "landfrac"
  frequency = "monthly"
  input_files = "elm.h0"
  input_subdir = "archive/lnd/hist"
  vars = "LAISHA,LAISUN"
  ts_fmt = "cmip"

  [[ lnd_monthly_glb ]]
  frequency = "monthly"
  input_files = "elm.h0"
  input_subdir = "archive/lnd/hist"
  mapping_file = "glb"
  vars = "LAISHA,LAISUN"
  years = "1850:1860:5",

  [[ rof_monthly ]]
  extra_vars = 'areatotal2'
  frequency = "monthly"
  input_files = "mosart.h0"
  input_subdir = "archive/rof/hist"
  mapping_file = ""
  vars = "RIVER_DISCHARGE_OVER_LAND_LIQ"

[tc_analysis]
active = True
scratch = "/lcrc/globalscratch/ac.forsyth2/"
walltime = "00:30:00"
years = "1850:1854:2",

[e3sm_diags]
active = True
grid = '180x360_aave'
ref_final_yr = 2014
ref_start_yr = 1985
# TODO: this directory is missing OMI-MLS
sets = "lat_lon","zonal_mean_xy","zonal_mean_2d","polar","cosp_histogram","meridional_mean_2d","enso_diags","qbo","diurnal_cycle","annual_cycle_zonal_mean","streamflow", "zonal_mean_2d_stratosphere", "tc_analysis",
short_name = 'v2.LR.historical_0201'
ts_num_years = 2
walltime = "00:30:00"
years = "1850:1854:2", "1850:1854:4",

  [[ atm_monthly_180x360_aave ]]
  climo_diurnal_frequency = "diurnal_8xdaily"
  climo_diurnal_subsection = "atm_monthly_diurnal_8xdaily_180x360_aave"
  partition = "compute"
  qos = "regular"
  sets = "lat_lon","zonal_mean_xy","zonal_mean_2d","polar","cosp_histogram","meridional_mean_2d","enso_diags","qbo","diurnal_cycle","annual_cycle_zonal_mean","streamflow", "zonal_mean_2d_stratosphere",
  walltime = "5:00:00"

  [[ atm_monthly_180x360_aave_environment_commands ]]
  environment_commands = "source /home/ac.forsyth2/miniconda3/etc/profile.d/conda.sh; conda activate e3sm_diags_20240610"
  sets = "qbo",
  ts_subsection = "atm_monthly_180x360_aave"

  [[ atm_monthly_180x360_aave_tc_analysis ]]
  # Running as its own subtask because tc_analysis requires jobs to run sequentially, which slows down testing
  sets = "tc_analysis",
  years = "1850:1852:2",

  [[ atm_monthly_180x360_aave_mvm ]]
  # Test model-vs-model using the same files as the reference
  climo_diurnal_frequency = "diurnal_8xdaily"
  climo_diurnal_subsection = "atm_monthly_diurnal_8xdaily_180x360_aave"
  climo_subsection = "atm_monthly_180x360_aave"
  diff_title = "Difference"
  partition = "compute"
  qos = "regular"
  ref_final_yr = 1851
  ref_name = "v2.LR.historical_0201"
  ref_start_yr = 1850
  ref_years = "1850-1851",
  reference_data_path = "/lcrc/group/e3sm/ac.forsyth2/zppy_test_complete_run_output/test-607v6/v2.LR.historical_0201/post/atm/180x360_aave/clim"
  run_type = "model_vs_model"
  short_ref_name = "v2.LR.historical_0201"
  swap_test_ref = False
  tag = "model_vs_model"
  ts_num_years_ref = 2
  ts_subsection = "atm_monthly_180x360_aave"
  walltime = "5:00:00"
  years = "1852-1853",

  [[ lnd_monthly_mvm_lnd ]]
  # Test model-vs-model using the same files as the reference
  climo_subsection = "land_monthly_climo"
  diff_title = "Difference"
  #grid = 'native'
  partition = "compute"
  qos = "regular"
  ref_final_yr = 1851
  ref_name = "v2.LR.historical_0201"
  ref_start_yr = 1850
  ref_years = "1850-1851",
  reference_data_path = "/lcrc/group/e3sm/ac.forsyth2/zppy_test_complete_run_output/test-607v6/v2.LR.historical_0201/post/lnd/180x360_aave/clim"
  run_type = "model_vs_model"
  sets = "lat_lon_land",
  short_ref_name = "same simulation"
  swap_test_ref = False
  tag = "model_vs_model"
  ts_num_years_ref = 2

[mpas_analysis]
active = True
anomalyRefYear = 1850
climo_years ="1850-1854", "1855-1860",
enso_years = "1850-1854", "1855-1860",
mesh = "EC30to60E2r2"
parallelTaskCount = 6
partition = "compute"
qos = "regular"
ts_years = "1850-1854", "1850-1860",
walltime = "00:30:00"

[global_time_series]
active = True
climo_years ="1850-1854", "1855-1860",
experiment_name = "v2.LR.historical_0201"
figstr = "v2_historical_0201"
moc_file=mocTimeSeries_1850-1860.nc
plots_lnd = "LAISHA,LAISUN"
ts_num_years = 5
ts_years = "1850-1854", "1850-1860",
walltime = "00:30:00"
years = "1850-1860",

[ilamb]
active = True
grid = '180x360_aave'
nodes = 8
partition = "compute"
short_name = 'v2.LR.historical_0201'
ts_num_years = 2
years = "1850:1854:2",

  [[ land_monthly ]]

What jobs are failing?

$ cd /lcrc/group/e3sm/ac.forsyth2/zppy_test_complete_run_output/test-607v6/v2.LR.historical_0201/post/scripts
$ grep -v "OK" *status
e3sm_diags_atm_monthly_180x360_aave_mvm_model_vs_model_1852-1853_vs_1850-1851.status:WAITING 552626
e3sm_diags_atm_monthly_180x360_aave_tc_analysis_model_vs_obs_1850-1851.status:WAITING 552625
e3sm_diags_lnd_monthly_mvm_lnd_model_vs_model_1850-1851_vs_1850-1851.status:WAITING 552627
tc_analysis_1850-1851.status:RUNNING 552617
tc_analysis_1852-1853.status:WAITING 552618

What stack trace are you encountering?

Parameters:
  --res <integer> [30] 
  --file <string> ["/lcrc/globalscratch/ac.forsyth2//tc-analysis_1850_1851/outCSne30.g"] 
  --out_format <string> ["Netcdf4"] 
  --alt <bool> [true] 
=========================================================
..Generating mesh with resolution [30]
..Writing mesh to file [/lcrc/globalscratch/ac.forsyth2//tc-analysis_1850_1851/outCSne30.g] 
Nodes per element
..Block 1 (4 nodes): 5400
NetCDF: HDF error
/var/spool/slurmd/job552617/slurm_script: line 80: 1328058 Killed                  GenerateCSMesh --res $res --alt --file ${result_dir}outCSne$res.g

forsyth2 commented 4 months ago

@chengzhuzhang As mentioned on #602, this issue seems to be independent of the code changes on #602. The stack trace really doesn't give me much to go on:

NetCDF: HDF error
/var/spool/slurmd/job552617/slurm_script: line 80: 1328058 Killed                  GenerateCSMesh --res $res --alt --file ${result_dir}outCSne$res.g

forsyth2 commented 4 months ago

The main branch seems unaffected by this. That means 2 things -- 1) this isn't an issue with storage space on scratch or anything like that affecting me specifically, 2) these two separate pull requests (#612, #602 with 2nd commit only) are somehow independently producing the TC analysis error.

| test output dir | branch | base commit | conda env | Ran `pip install . && python tests/integration/utils.py`? | lessons learned |
| --- | --- | --- | --- | --- | --- |
| `/lcrc/group/e3sm/ac.forsyth2/zppy_test_debug_output/test-main-613v2/v2.LR.historical_0201/post/scripts` | test-main-613 | Add center times (611) | zppy_dev_n600 | y | `grep -v "OK" *status` shows no errors |
| `/lcrc/group/e3sm/ac.forsyth2/zppy_test_complete_run_output/test-main-613v2/v2.LR.historical_0201/post/scripts` | test-main-613 | Add center times (611) | zppy_dev_n600 | y | `grep -v "OK" *status` shows no errors |

forsyth2 commented 4 months ago

I ran a debug version of #602 with the 1st commit dropped and the TC analysis tasks worked. This leads me to believe there's some sort of concurrency issue happening when more jobs are run simultaneously.

Or actually, rather than too many jobs running in a single zppy run, it's possible this issue has been coming up because I've been testing multiple branches of zppy simultaneously, meaning the TC analysis tasks could be trying to write to the same location in scratch, resulting in race conditions.

forsyth2 commented 4 months ago

There have been run-in-parallel issues with tc_analysis before. E.g., in zppy/tc_analysis.py we have:

    # There is a `GenerateConnectivityFile: error while loading shared libraries: libnetcdf.so.11: cannot open shared object file: No such file or directory` error
    # when multiple year_sets are run simultaneously. Therefore, we will wait for the completion of one year_set before moving on to the next.

forsyth2 commented 4 months ago

Running #602 without the 1st commit does in fact produce no failures when I run complete_run without any other tests running. I think that pretty much confirms TC Analysis needs something more to avoid concurrency issues / race conditions -- maybe subdirectories based on test unique-id?

forsyth2 commented 3 months ago

I think adding more specificity via subdirectories resolves this issue. See #615. Closing.

E3SM-Project / zppy