Closed chengzhuzhang closed 4 weeks ago
Yes, I ran into this issue too on main. Looking into it.
tl;dr dependencies need to be defined per-task in the .py
files. I'll make a PR.
I ran zppy -c issue_622.cfg, which uses this cfg:
[default]
input = /lcrc/group/e3sm2/ac.wlin/E3SMv3/v3.LR.historical_0051
#output = /lcrc/group/e3sm2/ac.zhang40/E3SMv3/v3.LR.historical_0920
output = /lcrc/group/e3sm/ac.forsyth2/issue_622
case = v3.LR.historical_0051
#www = /lcrc/group/e3sm/public_html/diagnostic_output/ac.zhang40/E3SMv3_0920
www = /lcrc/group/e3sm/public_html/diagnostic_output/ac.forsyth2
partition = compute
environment_commands = "source /lcrc/soft/climate/e3sm-unified/load_latest_e3sm_unified_chrysalis.sh"
campaign = "water_cycle"
[climo]
active = True
years = "1985:2014:30",
walltime = "1:00:00"
[[ atm_monthly_180x360_aave ]]
input_subdir = "archive/atm/hist"
mapping_file = map_ne30pg2_to_cmip6_180x360_aave.20200201.nc
frequency = "monthly"
[[ atm_monthly_diurnal_8xdaily_180x360_aave ]]
input_subdir = "archive/atm/hist"
input_files = "eam.h3"
mapping_file = map_ne30pg2_to_cmip6_180x360_aave.20200201.nc
vars = "PRECT"
frequency = "diurnal_8xdaily"
[[ land_monthly_climo ]]
frequency = "monthly"
input_files = "elm.h0"
input_subdir = archive/lnd/hist
vars = ""
[ts]
active = True
years = "1985:2014:30"
walltime = "00:50:00"
[[ atm_monthly_180x360_aave ]]
input_subdir = "archive/atm/hist"
input_files = "eam.h0"
frequency = "monthly"
mapping_file = /home/ac.zender/data/maps/map_ne30pg2_to_cmip6_180x360_aave.20200201.nc
vars = "FSNTOA,FLUT,FSNT,FLNT,FSNS,FLNS,SHFLX,QFLX,TAUX,TAUY,PRECC,PRECL,PRECSC,PRECSL,TS,TREFHT,CLDTOT,CLDHGH,CLDMED,CLDLOW,U,ICEFRAC,LANDFRAC,OCNFRAC,PS,CLDICE,CLDLIQ,T,AODDUST"
ts_fmt = "cmip"
[[ atm_daily_180x360_aave ]]
input_subdir = "archive/atm/hist"
input_files = "eam.h1"
frequency = "daily"
mapping_file = /home/ac.zender/data/maps/map_ne30pg2_to_cmip6_180x360_aave.20200201.nc
# Needed for Wheeler Kiladis
vars = "FLUT,PRECT,U850"
[[ atm_monthly_glb ]]
input_subdir = "archive/atm/hist"
input_files = "eam.h0"
frequency = "monthly"
mapping_file = "glb"
[[ land_monthly ]]
input_subdir = "archive/lnd/hist"
input_files = "elm.h0"
frequency = "monthly"
mapping_file = map_r05_to_cmip6_180x360_aave.20231110.nc
vars = "FSH,RH2M,LAISHA,LAISUN,QINTR,QOVER,QRUNOFF,QSOIL,QVEGE,QVEGT,SOILICE,SOILLIQ,SOILWATER_10CM,TSA,TSOI,H2OSNO,TOTLITC,CWDC,SOIL1C,SOIL2C,SOIL3C,SOIL4C,WOOD_HARVESTC,TOTVEGC,NBP,GPP,AR,HR"
extra_vars = "landfrac"
ts_fmt = "cmip"
[[ rof_monthly ]]
input_subdir = "archive/rof/hist"
input_files = "mosart.h0"
mapping_file = ""
frequency = "monthly"
vars = "RIVER_DISCHARGE_OVER_LAND_LIQ"
extra_vars = 'areatotal2'
[[ lnd_monthly_glb ]]
frequency = "monthly"
input_files = "elm.h0"
input_subdir = "archive/lnd/hist"
mapping_file = "glb"
vars = "FSH,RH2M,LAISHA,LAISUN,QINTR,QOVER,QRUNOFF,QSOIL,QVEGE,QVEGT,SOILWATER_10CM,TSA,H2OSNO,TOTLITC,CWDC,SOIL1C,SOIL2C,SOIL3C,SOIL4C,WOOD_HARVESTC,TOTVEGC,NBP,GPP,AR,HR"
[tc_analysis]
active = True
scratch = "/lcrc/globalscratch/$USER"
# Make walltime very short to reproduce this error
walltime = "00:10:00"
years = "1985:2014:30",
[e3sm_diags]
active = True
walltime = "4:00:00"
years = "1985:2014:30",
ts_num_years = 30
ref_start_yr = 1985
ref_final_yr = 2014
multiprocessing = True
num_workers = 8
[[ atm_monthly_180x360_aave ]]
short_name = 'v3.LR.historical_0051'
grid = '180x360_aave'
reference_data_path = '/lcrc/soft/climate/e3sm_diags_data/obs_for_e3sm_diags/climatology'
obs_ts = '/lcrc/soft/climate/e3sm_diags_data/obs_for_e3sm_diags/time-series'
dc_obs_climo = '/lcrc/group/e3sm/public_html/e3sm_diags_test_data/unit_test_complete_run/obs/climatology'
climo_diurnal_subsection = "atm_monthly_diurnal_8xdaily_180x360_aave"
climo_diurnal_frequency = "diurnal_8xdaily"
ts_daily_subsection = "atm_daily_180x360_aave"
sets="lat_lon","zonal_mean_xy","zonal_mean_2d","polar","cosp_histogram","meridional_mean_2d","annual_cycle_zonal_mean","qbo","diurnal_cycle","zonal_mean_2d_stratosphere","aerosol_aeronet","tropical_subseasonal","tc_analysis",
output_format_subplot = "pdf",
[[ lnd_monthly_mvm_lnd ]]
# Test model-vs-model using the same files as the reference
grid = 'native'
climo_subsection = "land_monthly_climo"
diff_title = "Difference"
partition = "compute"
qos = "regular"
short_name = v3.LR.piControl
ref_name = "20231209.v3.LR.piControl-spinup.chrysalis"
ref_start_yr = 0051
ref_final_yr = 0100
ref_years = "0051-0100",
reference_data_path = "/lcrc/group/e3sm/ac.zhang40/tests/20231209.v3.LR.piControl-spinup.chrysalis_land_diags/post/lnd/native/clim"
run_type = "model_vs_model"
sets = "lat_lon_land",
short_ref_name = "20231209.v3.LR.piControl-spinup"
swap_test_ref = False
tag = "model_vs_model"
ts_num_years_ref = 50
[[atm_monthly_180x360_aave_mvm]]
ref_years = "0001-0050",
ref_start_yr = 1
ref_final_yr = 50
ts_num_years = 30
ts_num_years_ref = 10
ts_subsection = "atm_monthly_180x360_aave"
short_name = 'v3alpha04-COARE.piControl'
grid = '180x360_aave'
ref_name = '20230924.v3alpha04_trigrid.piControl.chrysalis'
short_ref_name = 'v3alpha04-CTL.piControl'
tag = 'v3alpha04i-COARE_vs_CTL'
run_type = "model_vs_model"
reference_data_path = '/lcrc/group/e3sm2/ac.xzheng/E3SMv3_dev/20230924.v3alpha04_trigrid.piControl.chrysalis/post/atm/180x360_aave/clim'
climo_diurnal_subsection = "atm_monthly_diurnal_8xdaily_180x360_aave"
climo_diurnal_frequency = "diurnal_8xdaily"
climo_subsection = "atm_monthly_180x360_aave"
sets="lat_lon","zonal_mean_xy","zonal_mean_2d","polar","cosp_histogram","meridional_mean_2d","annual_cycle_zonal_mean","qbo","diurnal_cycle","zonal_mean_2d_stratosphere","aerosol_budget"
diff_title = 'Difference'
[ilamb]
active = True
nodes = 8
walltime = "2:00:00"
partition = compute
short_name = 'v3.LR.historical_0051'
ts_num_years = 30
years = "1985:2014:30"
[global_time_series]
active = True
experiment_name = "v3.LR.historical_0051"
figstr = "v3.LR.historical_0051"
plots_atm = "TREFHT,AODDUST"
plots_lnd = "FSH,RH2M,LAISHA,LAISUN,QINTR,QOVER,QRUNOFF,QSOIL,QVEGE,QVEGT,SOILWATER_10CM,TSA,H2OSNO,TOTLITC,CWDC,SOIL1C,SOIL2C,SOIL3C,SOIL4C,WOOD_HARVESTC,TOTVEGC,NBP,GPP,AR,HR"
ts_num_years = 30
walltime = "00:30:00"
years = "1985-2014",
climo_years ="1985-2014",
ts_years ="1985-2014",
moc_file = "mocTimeSeries_1985-2014.nc"
Note I had to remove duplicated subblocks from your cfg.
$ squeue -o "%8u %.7a %.4D %.9P %7i %.2t %.10r %.10M %.10l %j" --sort=P,-t,-p -u ac.forsyth2
USER ACCOUNT NODE PARTITION JOBID ST REASON TIME TIME_LIMIT NAME
ac.forsy e3sm 1 compute 591484 PD Dependency 0:00 4:00:00 e3sm_diags_atm_monthly_180x360_aave_model_vs_obs_1985-2014
ac.forsy e3sm 1 compute 591485 PD Dependency 0:00 4:00:00 e3sm_diags_lnd_monthly_mvm_lnd_model_vs_model_1985-2014_vs_0051-0100
ac.forsy e3sm 1 compute 591486 PD Dependency 0:00 4:00:00 e3sm_diags_atm_monthly_180x360_aave_mvm_v3alpha04i-COARE_vs_CTL_1985-2014_vs_0001-0050
We see here that the three E3SM Diags tasks are all waiting on a dependency.
$ cd /lcrc/group/e3sm/ac.forsyth2/issue_622/post/scripts
$ grep -v "OK" *status
e3sm_diags_atm_monthly_180x360_aave_model_vs_obs_1985-2014.status:WAITING 591484
e3sm_diags_atm_monthly_180x360_aave_mvm_v3alpha04i-COARE_vs_CTL_1985-2014_vs_0001-0050.status:WAITING 591486
e3sm_diags_lnd_monthly_mvm_lnd_model_vs_model_1985-2014_vs_0051-0100.status:WAITING 591485
tc_analysis_1985-2014.status:RUNNING 591483
$ tail -n 1 tc_analysis_1985-2014.o591483
slurmstepd: error: *** JOB 591483 ON chr-0229 CANCELLED AT 2024-09-23T13:41:46 DUE TO TIME LIMIT ***
Note, your first point (Status file shows "RUNNING" after slurm error) is unfortunately something I've never figured out a way around. That is, when SLURM hits a time limit, there is no longer any time left on the job to update the status file to "ERROR." It said "RUNNING" when the time limit hit and at that point, no more changes can be made.
For your second point, on all three e3sm_diags runs waiting:
e3sm_diags_atm_monthly_180x360_aave_model_vs_obs_1985-2014
has:
sets="lat_lon","zonal_mean_xy","zonal_mean_2d","polar","cosp_histogram","meridional_mean_2d","annual_cycle_zonal_mean","qbo","diurnal_cycle","zonal_mean_2d_stratosphere","aerosol_aeronet","tropical_subseasonal","tc_analysis",
This includes "tc_analysis", so naturally it will not run.
e3sm_diags_atm_monthly_180x360_aave_mvm_v3alpha04i-COARE_vs_CTL_1985-2014_vs_0001-0050
has:
sets="lat_lon","zonal_mean_xy","zonal_mean_2d","polar","cosp_histogram","meridional_mean_2d","annual_cycle_zonal_mean","qbo","diurnal_cycle","zonal_mean_2d_stratosphere","aerosol_budget"
e3sm_diags_lnd_monthly_mvm_lnd_model_vs_model_1985-2014_vs_0051-0100
has:
sets = "lat_lon_land",
I added print(f"dependencies={dependencies}") in e3sm_diags.py. That gives me the following dependency lists:
e3sm_diags_atm_monthly_180x360_aave_model_vs_obs_1985-2014
dependencies=[
'climo_atm_monthly_180x360_aave_1985-2014.status',
'climo_atm_monthly_diurnal_8xdaily_180x360_aave_1985-2014.status',
'tc_analysis_1985-2014.status',
'ts_atm_monthly_180x360_aave_1985-2014-0030.status',
'ts_atm_daily_180x360_aave_1985-2014-0030.status']
e3sm_diags_lnd_monthly_mvm_lnd_model_vs_model_1985-2014_vs_0051-0100
dependencies=[
'climo_atm_monthly_180x360_aave_1985-2014.status',
'climo_atm_monthly_diurnal_8xdaily_180x360_aave_1985-2014.status',
'tc_analysis_1985-2014.status',
'ts_atm_monthly_180x360_aave_1985-2014-0030.status',
'ts_atm_daily_180x360_aave_1985-2014-0030.status']
e3sm_diags_atm_monthly_180x360_aave_mvm_v3alpha04i-COARE_vs_CTL_1985-2014_vs_0001-0050
dependencies=[
'climo_atm_monthly_180x360_aave_1985-2014.status',
'climo_atm_monthly_diurnal_8xdaily_180x360_aave_1985-2014.status',
'tc_analysis_1985-2014.status',
'ts_atm_monthly_180x360_aave_1985-2014-0030.status',
'ts_atm_daily_180x360_aave_1985-2014-0030.status',
'climo_atm_monthly_180x360_aave_1985-2014.status',
'climo_atm_monthly_diurnal_8xdaily_180x360_aave_1985-2014.status',
'ts_atm_monthly_180x360_aave_1985-2014-0030.status']
It looks like dependencies get added as we go through the cfg, but we never start fresh.
In e3sm_diags.py, we have:
dependencies: List[str] = []
for c in tasks:
Really, we should be resetting dependencies
for each task...
I will make a PR for that.
Resolved by #631.
@forsyth2 I'm just wondering: have you tested whether the zppy configuration (https://github.com/E3SM-Project/zppy/issues/622#issuecomment-2369252993) works with 3 e3sm_diags tasks running in parallel after the two recent PRs?
I haven't tested 3 tasks in parallel explicitly. However, I did just run multiple tests in parallel for #632:
zppy -c tests/integration/generated/test_min_case_e3sm_diags_tc_analysis_v2_chrysalis.cfg # Runs 1 `e3sm_diags` task.
zppy -c tests/integration/generated/test_min_case_e3sm_diags_tc_analysis_v2_parallel_chrysalis.cfg # Runs 2 `e3sm_diags` tasks in parallel
I did run into some problems on the v3 side though, as described in https://github.com/E3SM-Project/zppy/pull/632#issuecomment-2417430619.
I can also increase the number of parallel tasks in min_case_e3sm_diags_tc_analysis_v2_parallel
to 3 or more if that would be valuable.
Or are you asking if I've run issue_622.cfg (from the linked comment) post-#623? I can also do that.
Yes, I mean that we should make sure the second issue in 622 is resolved as intended.
Ok, with the tc_analysis
task failing due to time limit reached, only the e3sm_diags_atm_monthly_180x360_aave_model_vs_obs_1985-2014
gets blocked, as intended. So I would say that #622 is indeed resolved.
I did however notice an unrelated error on the land task:
$ cd /lcrc/group/e3sm/ac.forsyth2/issue_622v3/v3.LR.historical_0051/post/scripts
$ cat e3sm_diags_lnd_monthly_mvm_lnd_model_vs_model_1985-2014_vs_0051-0100.status
ERROR (1)
$ cat e3sm_diags_lnd_monthly_mvm_lnd_model_vs_model_1985-2014_vs_0051-0100.o607287
cp: cannot stat '/lcrc/group/e3sm/ac.forsyth2/issue_622v3/v3.LR.historical_0051/post/lnd/native/clim/30yr/v3.LR.historical_0051_*_1985??_2014??_climo.nc': No such file or directory
# But:
$ ls /lcrc/group/e3sm/ac.forsyth2/issue_622v3/v3.LR.historical_0051/post/lnd/native/clim/30yr/
v3.LR.historical_0051_01_198501_201401_climo.nc v3.LR.historical_0051_07_198507_201407_climo.nc v3.LR.historical_0051_ANN_198501_201412_climo.nc
v3.LR.historical_0051_02_198502_201402_climo.nc v3.LR.historical_0051_08_198508_201408_climo.nc v3.LR.historical_0051_DJF_198501_201412_climo.nc
v3.LR.historical_0051_03_198503_201403_climo.nc v3.LR.historical_0051_09_198509_201409_climo.nc v3.LR.historical_0051_JJA_198506_201408_climo.nc
v3.LR.historical_0051_04_198504_201404_climo.nc v3.LR.historical_0051_10_198510_201410_climo.nc v3.LR.historical_0051_MAM_198503_201405_climo.nc
v3.LR.historical_0051_05_198505_201405_climo.nc v3.LR.historical_0051_11_198511_201411_climo.nc v3.LR.historical_0051_SON_198509_201411_climo.nc
v3.LR.historical_0051_06_198506_201406_climo.nc v3.LR.historical_0051_12_198512_201412_climo.nc
Could it be a dependency issue? Does the e3sm_diags_lnd_monthly_mvm_lnd task start before the climo files are generated?
So it is. This is bizarre. I'll look into it; it should be depending on climo.
$ grep dependencies e3sm_diags_lnd_monthly_mvm_lnd_model_vs_model_1985-2014_vs_0051-0100.settings
'dependencies': [],
What happened?
I ran into a few issues with setting up
tc_analysis
with a fully configured zppy run.
slurmstepd: error: *** JOB 588238 ON chr-0471 CANCELLED AT 2024-09-20T16:27:45 DUE TO TIME LIMIT ***
tc_analysis
even though only the model vs obs task depends on tc_analysis.
What machine were you running on?
Chrysalis
Environment
e3sm_unified_1.10
What command did you run?
Copy your cfg file
What jobs are failing?
No response
What stack trace are you encountering?
No response