E3SM-Project / zppy

E3SM post-processing toolchain
BSD 3-Clause "New" or "Revised" License
6 stars 15 forks source link

[Bug]: Issues with `tc_analysis` in zppy #622

Closed chengzhuzhang closed 4 weeks ago

chengzhuzhang commented 2 months ago

What happened?

I ran into a few issues with setting up tc_analysis with a fully configured zppy run.

  1. Status file shows "RUNNING" after slurm error: slurmstepd: error: *** JOB 588238 ON chr-0471 CANCELLED AT 2024-09-20T16:27:45 DUE TO TIME LIMIT ***
  2. All three e3sm_diags runs (model vs obs, model vs model, model vs model [land-only]) are waiting for tc_analysis even though only the model vs obs task depends on tc_analysis.

What machine were you running on?




What command did you run?

zppy -c

Copy your cfg file

input = /lcrc/group/e3sm2/ac.wlin/E3SMv3/v3.LR.historical_0051
output = /lcrc/group/e3sm2/ac.zhang40/E3SMv3/v3.LR.historical_0920
case = v3.LR.historical_0051
www = /lcrc/group/e3sm/public_html/diagnostic_output/ac.zhang40/E3SMv3_0920
partition = compute
environment_commands = "source /lcrc/soft/climate/e3sm-unified/load_latest_e3sm_unified_chrysalis.sh"
#environment_commands = "source /lcrc/soft/climate/e3sm-unified/test_e3sm_unified_1.10.0rc5_chrysalis.sh"
campaign = "water_cycle"

active = True
#years = "0001:0100:50", "0001:0100:100"
#years = "1985:2014:30", "1985:2014:15"
years = "1985:2014:30",
walltime = "1:00:00"

  [[ atm_monthly_180x360_aave ]]
  input_subdir = "archive/atm/hist"
  mapping_file = map_ne30pg2_to_cmip6_180x360_aave.20200201.nc
  frequency = "monthly"

  [[ atm_monthly_diurnal_8xdaily_180x360_aave ]]
  input_subdir = "archive/atm/hist"
  input_files = "eam.h3"
  mapping_file = map_ne30pg2_to_cmip6_180x360_aave.20200201.nc
  vars = "PRECT"
  frequency = "diurnal_8xdaily"

  [[ land_monthly_climo ]]
  frequency = "monthly"
  input_files = "elm.h0"
  input_subdir = archive/lnd/hist
  vars = ""

active = True
years = "1985:2014:30"
walltime = "00:50:00"

  [[ atm_monthly_180x360_aave ]]
  input_subdir = "archive/atm/hist"
  input_files = "eam.h0"
  frequency = "monthly"
  mapping_file = /home/ac.zender/data/maps/map_ne30pg2_to_cmip6_180x360_aave.20200201.nc
# Needed for mixed-phase partition
  ts_fmt = "cmip"

  [[ atm_daily_180x360_aave ]]
  input_subdir = "archive/atm/hist"
  input_files = "eam.h1"
  frequency = "daily"
  mapping_file = /home/ac.zender/data/maps/map_ne30pg2_to_cmip6_180x360_aave.20200201.nc
  # Needed for Wheeler Kiladis
  vars = "FLUT,PRECT,U850"

  [[ atm_monthly_glb ]]
  input_subdir = "archive/atm/hist"
  input_files = "eam.h0"
  frequency = "monthly"
  mapping_file = "glb"

  [[ land_monthly ]]
  input_subdir = "archive/lnd/hist"
  input_files = "elm.h0"
  frequency = "monthly"
  #mapping_file = ""
  mapping_file = map_r05_to_cmip6_180x360_aave.20231110.nc
  extra_vars = "landfrac"
  ts_fmt = "cmip"
  [[ rof_monthly ]]
  input_subdir = "archive/rof/hist"
  input_files = "mosart.h0"
  mapping_file = ""
  frequency = "monthly"
  extra_vars = 'areatotal2'

  [[ land_monthly ]]
  input_subdir = "archive/lnd/hist"
  input_files = "elm.h0"
  frequency = "monthly"
  #mapping_file = ""
  mapping_file = map_r05_to_cmip6_180x360_aave.20231110.nc
  extra_vars = "landfrac"
  ts_fmt = "cmip"
  [[ rof_monthly ]]
  input_subdir = "archive/rof/hist"
  input_files = "mosart.h0"
  mapping_file = ""
  frequency = "monthly"
  extra_vars = 'areatotal2'

  [[ lnd_monthly_glb ]]
  frequency = "monthly"
  input_files = "elm.h0"
  input_subdir = "archive/lnd/hist"
  mapping_file = "glb"

active = True
scratch = "/lcrc/globalscratch/$USER"
# Make walltime very short to reproduce this error
walltime = "00:10:00"
years = "1985:2014:30",

active = True
walltime = "4:00:00"
#years = "0001:0100:50", "0001:0100:100"
years = "1985:2014:30",
ts_num_years = 30
ref_start_yr = 1985
ref_final_yr = 2014
multiprocessing = True
num_workers = 8

  [[ atm_monthly_180x360_aave ]]
#  environment_commands = "source /home/ac.zhang40/y/etc/profile.d/conda.sh; conda activate edv2110"
  short_name = 'v3.LR.historical_0051'
  grid = '180x360_aave'
  reference_data_path = '/lcrc/soft/climate/e3sm_diags_data/obs_for_e3sm_diags/climatology'
  obs_ts = '/lcrc/soft/climate/e3sm_diags_data/obs_for_e3sm_diags/time-series'
  dc_obs_climo = '/lcrc/group/e3sm/public_html/e3sm_diags_test_data/unit_test_complete_run/obs/climatology'
  climo_diurnal_subsection = "atm_monthly_diurnal_8xdaily_180x360_aave"
  climo_diurnal_frequency = "diurnal_8xdaily"
  ts_daily_subsection = "atm_daily_180x360_aave"
#  sets="tropical_subseasonal",
  output_format_subplot = "pdf",

  [[ lnd_monthly_mvm_lnd ]]
  # Test model-vs-model using the same files as the reference
  #environment_commands = "source /home/ac.zhang40/y/etc/profile.d/conda.sh; conda activate edv290"
  grid = 'native'
  climo_subsection = "land_monthly_climo"
  diff_title = "Difference"
  partition = "compute"
  qos = "regular"
  short_name = v3.LR.piControl
  ref_name = "20231209.v3.LR.piControl-spinup.chrysalis"
  ref_start_yr = 0051
  ref_final_yr = 0100
  ref_years = "0051-0100",
  reference_data_path = "/lcrc/group/e3sm/ac.zhang40/tests/20231209.v3.LR.piControl-spinup.chrysalis_land_diags/post/lnd/native/clim"
  run_type = "model_vs_model"
  sets = "lat_lon_land",
  short_ref_name = "20231209.v3.LR.piControl-spinup"
  swap_test_ref = False
  tag = "model_vs_model"
  ts_num_years_ref = 50

  #years = "0001-0050", 
  #years = "1985-2014", 
  ref_years = "0001-0050",
  ref_start_yr = 1
  ref_final_yr = 50
  ts_num_years = 30 
  ts_num_years_ref = 10
  ts_subsection = "atm_monthly_180x360_aave"
  short_name = 'v3alpha04-COARE.piControl'
  grid = '180x360_aave'
  ref_name = '20230924.v3alpha04_trigrid.piControl.chrysalis'
  short_ref_name = 'v3alpha04-CTL.piControl'
  tag = 'v3alpha04i-COARE_vs_CTL'
  run_type = "model_vs_model"
  reference_data_path = '/lcrc/group/e3sm2/ac.xzheng/E3SMv3_dev/20230924.v3alpha04_trigrid.piControl.chrysalis/post/atm/180x360_aave/clim'
  climo_diurnal_subsection = "atm_monthly_diurnal_8xdaily_180x360_aave"
  climo_diurnal_frequency = "diurnal_8xdaily"
  climo_subsection = "atm_monthly_180x360_aave"
  diff_title = 'Difference'
#  output_format_subplot = "pdf",

active = True 
nodes = 8
walltime = "2:00:00"
partition = compute 
short_name = 'v3.LR.historical_0051'
#ts_land_grid = 'native'
ts_num_years = 30
years = "1985:2014:30"

active = True
experiment_name = "v3.LR.historical_0051"
figstr = "v3.LR.historical_0051"
#plots_original = "net_toa_flux_restom,global_surface_air_temperature,toa_radiation,net_atm_energy_imbalance,net_atm_water_imbalance"
plots_atm = "TREFHT,AODDUST"
ts_num_years = 30
walltime = "00:30:00"
years = "1985-2014",
climo_years ="1985-2014",
ts_years ="1985-2014",
moc_file = "mocTimeSeries_1985-2014.nc"

What jobs are failing?

No response

What stack trace are you encountering?

No response

forsyth2 commented 2 months ago

Yes I ran into this issue too on main. Looking into it.

forsyth2 commented 2 months ago

tl;dr dependencies need to be defined per-task in the .py files. I'll make a PR.

I ran zppy -c issue_622.cfg, which uses this cfg:

input = /lcrc/group/e3sm2/ac.wlin/E3SMv3/v3.LR.historical_0051
#output = /lcrc/group/e3sm2/ac.zhang40/E3SMv3/v3.LR.historical_0920
output = /lcrc/group/e3sm/ac.forsyth2/issue_622
case = v3.LR.historical_0051
#www = /lcrc/group/e3sm/public_html/diagnostic_output/ac.zhang40/E3SMv3_0920
www = /lcrc/group/e3sm/public_html/diagnostic_output/ac.forsyth2
partition = compute
environment_commands = "source /lcrc/soft/climate/e3sm-unified/load_latest_e3sm_unified_chrysalis.sh"
campaign = "water_cycle"

active = True
years = "1985:2014:30",
walltime = "1:00:00"

  [[ atm_monthly_180x360_aave ]]
  input_subdir = "archive/atm/hist"
  mapping_file = map_ne30pg2_to_cmip6_180x360_aave.20200201.nc
  frequency = "monthly"

  [[ atm_monthly_diurnal_8xdaily_180x360_aave ]]
  input_subdir = "archive/atm/hist"
  input_files = "eam.h3"
  mapping_file = map_ne30pg2_to_cmip6_180x360_aave.20200201.nc
  vars = "PRECT"
  frequency = "diurnal_8xdaily"

  [[ land_monthly_climo ]]
  frequency = "monthly"
  input_files = "elm.h0"
  input_subdir = archive/lnd/hist
  vars = ""

active = True
years = "1985:2014:30"
walltime = "00:50:00"

  [[ atm_monthly_180x360_aave ]]
  input_subdir = "archive/atm/hist"
  input_files = "eam.h0"
  frequency = "monthly"
  mapping_file = /home/ac.zender/data/maps/map_ne30pg2_to_cmip6_180x360_aave.20200201.nc
  ts_fmt = "cmip"

  [[ atm_daily_180x360_aave ]]
  input_subdir = "archive/atm/hist"
  input_files = "eam.h1"
  frequency = "daily"
  mapping_file = /home/ac.zender/data/maps/map_ne30pg2_to_cmip6_180x360_aave.20200201.nc
  # Needed for Wheeler Kiladis
  vars = "FLUT,PRECT,U850"

  [[ atm_monthly_glb ]]
  input_subdir = "archive/atm/hist"
  input_files = "eam.h0"
  frequency = "monthly"
  mapping_file = "glb"

  [[ land_monthly ]]
  input_subdir = "archive/lnd/hist"
  input_files = "elm.h0"
  frequency = "monthly"
  mapping_file = map_r05_to_cmip6_180x360_aave.20231110.nc
  extra_vars = "landfrac"
  ts_fmt = "cmip"

  [[ rof_monthly ]]
  input_subdir = "archive/rof/hist"
  input_files = "mosart.h0"
  mapping_file = ""
  frequency = "monthly"
  extra_vars = 'areatotal2'

  [[ lnd_monthly_glb ]]
  frequency = "monthly"
  input_files = "elm.h0"
  input_subdir = "archive/lnd/hist"
  mapping_file = "glb"

active = True
scratch = "/lcrc/globalscratch/$USER"
# Make walltime very short to reproduce this error
walltime = "00:10:00"
years = "1985:2014:30",

active = True
walltime = "4:00:00"
years = "1985:2014:30",
ts_num_years = 30
ref_start_yr = 1985
ref_final_yr = 2014
multiprocessing = True
num_workers = 8

  [[ atm_monthly_180x360_aave ]]
  short_name = 'v3.LR.historical_0051'
  grid = '180x360_aave'
  reference_data_path = '/lcrc/soft/climate/e3sm_diags_data/obs_for_e3sm_diags/climatology'
  obs_ts = '/lcrc/soft/climate/e3sm_diags_data/obs_for_e3sm_diags/time-series'
  dc_obs_climo = '/lcrc/group/e3sm/public_html/e3sm_diags_test_data/unit_test_complete_run/obs/climatology'
  climo_diurnal_subsection = "atm_monthly_diurnal_8xdaily_180x360_aave"
  climo_diurnal_frequency = "diurnal_8xdaily"
  ts_daily_subsection = "atm_daily_180x360_aave"
  output_format_subplot = "pdf",

  [[ lnd_monthly_mvm_lnd ]]
  # Test model-vs-model using the same files as the reference
  grid = 'native'
  climo_subsection = "land_monthly_climo"
  diff_title = "Difference"
  partition = "compute"
  qos = "regular"
  short_name = v3.LR.piControl
  ref_name = "20231209.v3.LR.piControl-spinup.chrysalis"
  ref_start_yr = 0051
  ref_final_yr = 0100
  ref_years = "0051-0100",
  reference_data_path = "/lcrc/group/e3sm/ac.zhang40/tests/20231209.v3.LR.piControl-spinup.chrysalis_land_diags/post/lnd/native/clim"
  run_type = "model_vs_model"
  sets = "lat_lon_land",
  short_ref_name = "20231209.v3.LR.piControl-spinup"
  swap_test_ref = False
  tag = "model_vs_model"
  ts_num_years_ref = 50

  ref_years = "0001-0050",
  ref_start_yr = 1
  ref_final_yr = 50
  ts_num_years = 30 
  ts_num_years_ref = 10
  ts_subsection = "atm_monthly_180x360_aave"
  short_name = 'v3alpha04-COARE.piControl'
  grid = '180x360_aave'
  ref_name = '20230924.v3alpha04_trigrid.piControl.chrysalis'
  short_ref_name = 'v3alpha04-CTL.piControl'
  tag = 'v3alpha04i-COARE_vs_CTL'
  run_type = "model_vs_model"
  reference_data_path = '/lcrc/group/e3sm2/ac.xzheng/E3SMv3_dev/20230924.v3alpha04_trigrid.piControl.chrysalis/post/atm/180x360_aave/clim'
  climo_diurnal_subsection = "atm_monthly_diurnal_8xdaily_180x360_aave"
  climo_diurnal_frequency = "diurnal_8xdaily"
  climo_subsection = "atm_monthly_180x360_aave"
  diff_title = 'Difference'

active = True 
nodes = 8
walltime = "2:00:00"
partition = compute 
short_name = 'v3.LR.historical_0051'
ts_num_years = 30
years = "1985:2014:30"

active = True
experiment_name = "v3.LR.historical_0051"
figstr = "v3.LR.historical_0051"
plots_atm = "TREFHT,AODDUST"
ts_num_years = 30
walltime = "00:30:00"
years = "1985-2014",
climo_years ="1985-2014",
ts_years ="1985-2014",
moc_file = "mocTimeSeries_1985-2014.nc"

Note I had to remove duplicated subblocks from your cfg.

$ squeue -o "%8u %.7a %.4D %.9P %7i %.2t %.10r %.10M %.10l %j" --sort=P,-t,-p -u ac.forsyth2
ac.forsy    e3sm    1   compute 591484  PD Dependency       0:00    4:00:00 e3sm_diags_atm_monthly_180x360_aave_model_vs_obs_1985-2014
ac.forsy    e3sm    1   compute 591485  PD Dependency       0:00    4:00:00 e3sm_diags_lnd_monthly_mvm_lnd_model_vs_model_1985-2014_vs_0051-0100
ac.forsy    e3sm    1   compute 591486  PD Dependency       0:00    4:00:00 e3sm_diags_atm_monthly_180x360_aave_mvm_v3alpha04i-COARE_vs_CTL_1985-2014_vs_0001-0050

We see here that the three E3SM Diags tasks are all waiting on a dependency.

$ cd /lcrc/group/e3sm/ac.forsyth2/issue_622/post/scripts

$ grep -v "OK" *status
e3sm_diags_atm_monthly_180x360_aave_model_vs_obs_1985-2014.status:WAITING 591484
e3sm_diags_atm_monthly_180x360_aave_mvm_v3alpha04i-COARE_vs_CTL_1985-2014_vs_0001-0050.status:WAITING 591486
e3sm_diags_lnd_monthly_mvm_lnd_model_vs_model_1985-2014_vs_0051-0100.status:WAITING 591485
tc_analysis_1985-2014.status:RUNNING 591483

$ tail -n 1 tc_analysis_1985-2014.o591483 
slurmstepd: error: *** JOB 591483 ON chr-0229 CANCELLED AT 2024-09-23T13:41:46 DUE TO TIME LIMIT ***

Note, your first point (the status file shows "RUNNING" after a slurm error) is unfortunately something I've never figured out a way around. That is, when SLURM hits a time limit, there is no longer any time left on the job to update the status file to "ERROR." It said "RUNNING" when the time limit hit, and at that point no more changes can be made.

For your second point, on all three e3sm_diags runs waiting:

e3sm_diags_atm_monthly_180x360_aave_model_vs_obs_1985-2014 has:


This includes "tc_analysis", so naturally it will not run.

e3sm_diags_atm_monthly_180x360_aave_mvm_v3alpha04i-COARE_vs_CTL_1985-2014_vs_0001-0050 has:


e3sm_diags_lnd_monthly_mvm_lnd_model_vs_model_1985-2014_vs_0051-0100 has:

sets = "lat_lon_land",

I added print(f"dependencies={dependencies}") in e3sm_diags.py. That gives me the following dependency lists:




It looks like dependencies get added as we go through the cfg, but we never start fresh for each task.

In e3sm_diags.py, we have:

    dependencies: List[str] = []

    for c in tasks:

Really, we should be resetting dependencies for each task...

I will make a PR for that.

forsyth2 commented 1 month ago

Resolved by #631.

chengzhuzhang commented 1 month ago

@forsyth2 I'm just wondering whether you have tested that the zppy configuration (https://github.com/E3SM-Project/zppy/issues/622#issuecomment-2369252993) works with 3 e3sm_diags tasks running in parallel after the two recent PRs?

forsyth2 commented 1 month ago

I haven't tested 3 tasks in parallel explicitly. However, I did just run multiple tests in parallel for #632:

zppy -c tests/integration/generated/test_min_case_e3sm_diags_tc_analysis_v2_chrysalis.cfg # Runs 1 `e3sm_diags` task.
zppy -c tests/integration/generated/test_min_case_e3sm_diags_tc_analysis_v2_parallel_chrysalis.cfg # Runs 2 `e3sm_diags` tasks in parallel

I did run into some problems on the v3 side though, as described in https://github.com/E3SM-Project/zppy/pull/632#issuecomment-2417430619.

I can also increase the number of parallel tasks in min_case_e3sm_diags_tc_analysis_v2_parallel to 3 or more if that would be valuable.

Or are you asking if I've run issue_622.cfg (from the linked comment) post-#623? I can also do that.

chengzhuzhang commented 1 month ago

> Or are you asking if I've run issue_622.cfg (from the linked comment) post-#623? I can also do that.

Yes, I mean that we should make sure the second issue in 622 is resolved as intended.

forsyth2 commented 1 month ago

Ok, with the tc_analysis task failing due to the time limit being reached, only the e3sm_diags_atm_monthly_180x360_aave_model_vs_obs_1985-2014 task gets blocked, as intended. So I would say that #622 is indeed resolved.

I did however notice an unrelated error on the land task:

$ cd /lcrc/group/e3sm/ac.forsyth2/issue_622v3/v3.LR.historical_0051/post/scripts

$ cat e3sm_diags_lnd_monthly_mvm_lnd_model_vs_model_1985-2014_vs_0051-0100.status

$ cat e3sm_diags_lnd_monthly_mvm_lnd_model_vs_model_1985-2014_vs_0051-0100.o607287
cp: cannot stat '/lcrc/group/e3sm/ac.forsyth2/issue_622v3/v3.LR.historical_0051/post/lnd/native/clim/30yr/v3.LR.historical_0051_*_1985??_2014??_climo.nc': No such file or directory

# But:
$ ls /lcrc/group/e3sm/ac.forsyth2/issue_622v3/v3.LR.historical_0051/post/lnd/native/clim/30yr/
v3.LR.historical_0051_01_198501_201401_climo.nc  v3.LR.historical_0051_07_198507_201407_climo.nc  v3.LR.historical_0051_ANN_198501_201412_climo.nc
v3.LR.historical_0051_02_198502_201402_climo.nc  v3.LR.historical_0051_08_198508_201408_climo.nc  v3.LR.historical_0051_DJF_198501_201412_climo.nc
v3.LR.historical_0051_03_198503_201403_climo.nc  v3.LR.historical_0051_09_198509_201409_climo.nc  v3.LR.historical_0051_JJA_198506_201408_climo.nc
v3.LR.historical_0051_04_198504_201404_climo.nc  v3.LR.historical_0051_10_198510_201410_climo.nc  v3.LR.historical_0051_MAM_198503_201405_climo.nc
v3.LR.historical_0051_05_198505_201405_climo.nc  v3.LR.historical_0051_11_198511_201411_climo.nc  v3.LR.historical_0051_SON_198509_201411_climo.nc
v3.LR.historical_0051_06_198506_201406_climo.nc  v3.LR.historical_0051_12_198512_201412_climo.nc
chengzhuzhang commented 1 month ago

Could it be a dependency issue? Does the e3sm_diags_lnd_monthly_mvm_lnd task start before the climo files are generated?

forsyth2 commented 1 month ago

So it is. This is bizarre. I'll look into it; it should be depending on climo.

$ grep dependencies e3sm_diags_lnd_monthly_mvm_lnd_model_vs_model_1985-2014_vs_0051-0100.settings 
  'dependencies': [],