Closed mfdeakin-sandia closed 6 years ago
Bowman performance looks unchanged with `ne=8`, `nmax=300`, `qsize=40`, run by `mpiexec -n 32 --bind-to core -x OMP_PROC_BIND=spread -x OMP_PLACES=threads -x OMP_NUM_THREADS=2 numactl --membind 1 ./prtcB_c.${b} < ./namelist.nl; mv HommeTime_stats HommeTime_stats.${b}.${i}`:
HommeTime_stats.master.1
prim_main_loop 64 64 6.400000e+01 1.693318e+03 26.460 ( 0 0) 26.458 ( 24 0)
tl-ae U3-5stage_timestep 64 64 1.920000e+04 1.777258e+02 3.554 ( 22 0) 2.171 ( 61 0)
tl-ae advance_hypervis_dp 64 64 1.920000e+04 1.367438e+02 2.157 ( 8 0) 2.119 ( 34 0)
tl-at prim_advec_tracers_remap_RK2 64 64 1.920000e+04 1.147721e+03 18.555 ( 61 0) 17.109 ( 22 0)
tl-sc vertical_remap 64 64 6.400000e+03 2.032544e+02 3.230 ( 23 0) 3.136 ( 17 0)
HommeTime_stats.master.2
prim_main_loop 64 64 6.400000e+01 1.694443e+03 26.477 ( 0 0) 26.475 ( 54 0)
tl-ae U3-5stage_timestep 64 64 1.920000e+04 1.784818e+02 3.581 ( 22 0) 2.200 ( 61 0)
tl-ae advance_hypervis_dp 64 64 1.920000e+04 1.362424e+02 2.158 ( 54 0) 2.113 ( 29 0)
tl-at prim_advec_tracers_remap_RK2 64 64 1.920000e+04 1.149122e+03 18.558 ( 61 0) 17.134 ( 22 0)
tl-sc vertical_remap 64 64 6.400000e+03 2.027860e+02 3.215 ( 52 0) 3.129 ( 7 0)
HommeTime_stats.master.3
prim_main_loop 64 64 6.400000e+01 1.693215e+03 26.458 ( 0 0) 26.456 ( 55 0)
tl-ae U3-5stage_timestep 64 64 1.920000e+04 1.780096e+02 3.582 ( 22 0) 2.202 ( 2 0)
tl-ae advance_hypervis_dp 64 64 1.920000e+04 1.364554e+02 2.147 ( 56 0) 2.114 ( 29 0)
tl-at prim_advec_tracers_remap_RK2 64 64 1.920000e+04 1.148356e+03 18.559 ( 61 0) 17.125 ( 22 0)
tl-sc vertical_remap 64 64 6.400000e+03 2.024812e+02 3.222 ( 26 0) 3.122 ( 12 0)
HommeTime_stats.master.4
prim_main_loop 64 64 6.400000e+01 1.689872e+03 26.405 ( 10 0) 26.403 ( 18 0)
tl-ae U3-5stage_timestep 64 64 1.920000e+04 1.762812e+02 3.531 ( 22 0) 2.145 ( 61 0)
tl-ae advance_hypervis_dp 64 64 1.920000e+04 1.358155e+02 2.143 ( 56 0) 2.103 ( 39 0)
tl-at prim_advec_tracers_remap_RK2 64 64 1.920000e+04 1.147367e+03 18.563 ( 61 0) 17.123 ( 22 0)
tl-sc vertical_remap 64 64 6.400000e+03 2.026072e+02 3.215 ( 48 0) 3.132 ( 58 0)
HommeTime_stats.master.5
prim_main_loop 64 64 6.400000e+01 1.695210e+03 26.489 ( 0 0) 26.487 ( 51 0)
tl-ae U3-5stage_timestep 64 64 1.920000e+04 1.788868e+02 3.582 ( 22 0) 2.183 ( 2 0)
tl-ae advance_hypervis_dp 64 64 1.920000e+04 1.366075e+02 2.153 ( 54 0) 2.116 ( 34 0)
tl-at prim_advec_tracers_remap_RK2 64 64 1.920000e+04 1.149168e+03 18.568 ( 2 0) 17.129 ( 22 0)
tl-sc vertical_remap 64 64 6.400000e+03 2.026960e+02 3.238 ( 43 0) 3.127 ( 35 0)
HommeTime_stats.rm_ivdep.1
prim_main_loop 64 64 6.400000e+01 1.691562e+03 26.432 ( 0 0) 26.430 ( 55 0)
tl-ae U3-5stage_timestep 64 64 1.920000e+04 1.767413e+02 3.567 ( 22 0) 2.139 ( 61 0)
tl-ae advance_hypervis_dp 64 64 1.920000e+04 1.370599e+02 2.161 ( 50 0) 2.123 ( 36 0)
tl-at prim_advec_tracers_remap_RK2 64 64 1.920000e+04 1.148107e+03 18.568 ( 61 0) 17.119 ( 22 0)
tl-sc vertical_remap 64 64 6.400000e+03 2.017358e+02 3.228 ( 43 0) 3.117 ( 15 0)
HommeTime_stats.rm_ivdep.2
prim_main_loop 64 64 6.400000e+01 1.693325e+03 26.459 ( 0 0) 26.457 ( 1 0)
tl-ae U3-5stage_timestep 64 64 1.920000e+04 1.780904e+02 3.572 ( 22 0) 2.202 ( 2 0)
tl-ae advance_hypervis_dp 64 64 1.920000e+04 1.365242e+02 2.159 ( 54 0) 2.117 ( 42 0)
tl-at prim_advec_tracers_remap_RK2 64 64 1.920000e+04 1.148245e+03 18.558 ( 61 0) 17.121 ( 22 0)
tl-sc vertical_remap 64 64 6.400000e+03 2.026869e+02 3.218 ( 30 0) 3.116 ( 15 0)
HommeTime_stats.rm_ivdep.3
prim_main_loop 64 64 6.400000e+01 1.693206e+03 26.457 ( 0 0) 26.456 ( 31 0)
tl-ae U3-5stage_timestep 64 64 1.920000e+04 1.780412e+02 3.535 ( 22 0) 2.202 ( 61 0)
tl-ae advance_hypervis_dp 64 64 1.920000e+04 1.368372e+02 2.164 ( 54 0) 2.124 ( 29 0)
tl-at prim_advec_tracers_remap_RK2 64 64 1.920000e+04 1.148828e+03 18.563 ( 61 0) 17.153 ( 22 0)
tl-sc vertical_remap 64 64 6.400000e+03 2.016693e+02 3.193 ( 22 0) 3.114 ( 8 0)
HommeTime_stats.rm_ivdep.4
prim_main_loop 64 64 6.400000e+01 1.695621e+03 26.495 ( 0 0) 26.494 ( 35 0)
tl-ae U3-5stage_timestep 64 64 1.920000e+04 1.782750e+02 3.527 ( 22 0) 2.208 ( 61 0)
tl-ae advance_hypervis_dp 64 64 1.920000e+04 1.369757e+02 2.170 ( 56 0) 2.126 ( 29 0)
tl-at prim_advec_tracers_remap_RK2 64 64 1.920000e+04 1.148152e+03 18.564 ( 61 0) 17.117 ( 22 0)
tl-sc vertical_remap 64 64 6.400000e+03 2.036603e+02 3.263 ( 22 0) 3.125 ( 2 0)
HommeTime_stats.rm_ivdep.5
prim_main_loop 64 64 6.400000e+01 1.693923e+03 26.469 ( 0 0) 26.467 ( 62 0)
tl-ae U3-5stage_timestep 64 64 1.920000e+04 1.777885e+02 3.567 ( 22 0) 2.177 ( 61 0)
tl-ae advance_hypervis_dp 64 64 1.920000e+04 1.365856e+02 2.159 ( 56 0) 2.119 ( 41 0)
tl-at prim_advec_tracers_remap_RK2 64 64 1.920000e+04 1.148133e+03 18.544 ( 61 0) 17.101 ( 22 0)
tl-sc vertical_remap 64 64 6.400000e+03 2.034738e+02 3.227 ( 49 0) 3.131 ( 58 0)
White performance looks unchanged with the same namelist, run as `./prtcB_c.${b}`:
HommeTime_stats.master.1
prim_main_loop 1 1 1.000000e+00 1.821733e+01 18.217 ( 0 0) 18.217 ( 0 0)
tl-ae U3-5stage_timestep 1 1 3.000000e+02 2.061110e+00 2.061 ( 0 0) 2.061 ( 0 0)
tl-ae advance_hypervis_dp 1 1 3.000000e+02 1.864861e+00 1.865 ( 0 0) 1.865 ( 0 0)
tl-at prim_advec_tracers_remap_RK2 1 1 3.000000e+02 1.221237e+01 12.212 ( 0 0) 12.212 ( 0 0)
tl-sc vertical_remap 1 1 1.000000e+02 1.003912e+00 1.004 ( 0 0) 1.004 ( 0 0)
HommeTime_stats.master.2
prim_main_loop 1 1 1.000000e+00 1.817709e+01 18.177 ( 0 0) 18.177 ( 0 0)
tl-ae U3-5stage_timestep 1 1 3.000000e+02 2.054405e+00 2.054 ( 0 0) 2.054 ( 0 0)
tl-ae advance_hypervis_dp 1 1 3.000000e+02 1.859587e+00 1.860 ( 0 0) 1.860 ( 0 0)
tl-at prim_advec_tracers_remap_RK2 1 1 3.000000e+02 1.219150e+01 12.191 ( 0 0) 12.191 ( 0 0)
tl-sc vertical_remap 1 1 1.000000e+02 1.003266e+00 1.003 ( 0 0) 1.003 ( 0 0)
HommeTime_stats.master.3
prim_main_loop 1 1 1.000000e+00 1.817403e+01 18.174 ( 0 0) 18.174 ( 0 0)
tl-ae U3-5stage_timestep 1 1 3.000000e+02 2.055666e+00 2.056 ( 0 0) 2.056 ( 0 0)
tl-ae advance_hypervis_dp 1 1 3.000000e+02 1.860825e+00 1.861 ( 0 0) 1.861 ( 0 0)
tl-at prim_advec_tracers_remap_RK2 1 1 3.000000e+02 1.217964e+01 12.180 ( 0 0) 12.180 ( 0 0)
tl-sc vertical_remap 1 1 1.000000e+02 1.001659e+00 1.002 ( 0 0) 1.002 ( 0 0)
HommeTime_stats.master.4
prim_main_loop 1 1 1.000000e+00 1.814866e+01 18.149 ( 0 0) 18.149 ( 0 0)
tl-ae U3-5stage_timestep 1 1 3.000000e+02 2.052539e+00 2.053 ( 0 0) 2.053 ( 0 0)
tl-ae advance_hypervis_dp 1 1 3.000000e+02 1.856485e+00 1.856 ( 0 0) 1.856 ( 0 0)
tl-at prim_advec_tracers_remap_RK2 1 1 3.000000e+02 1.216939e+01 12.169 ( 0 0) 12.169 ( 0 0)
tl-sc vertical_remap 1 1 1.000000e+02 1.001021e+00 1.001 ( 0 0) 1.001 ( 0 0)
HommeTime_stats.master.5
prim_main_loop 1 1 1.000000e+00 1.814186e+01 18.142 ( 0 0) 18.142 ( 0 0)
tl-ae U3-5stage_timestep 1 1 3.000000e+02 2.054287e+00 2.054 ( 0 0) 2.054 ( 0 0)
tl-ae advance_hypervis_dp 1 1 3.000000e+02 1.860821e+00 1.861 ( 0 0) 1.861 ( 0 0)
tl-at prim_advec_tracers_remap_RK2 1 1 3.000000e+02 1.218011e+01 12.180 ( 0 0) 12.180 ( 0 0)
tl-sc vertical_remap 1 1 1.000000e+02 1.001351e+00 1.001 ( 0 0) 1.001 ( 0 0)
HommeTime_stats.rm_ivdep.1
prim_main_loop 1 1 1.000000e+00 1.819436e+01 18.194 ( 0 0) 18.194 ( 0 0)
tl-ae U3-5stage_timestep 1 1 3.000000e+02 2.057172e+00 2.057 ( 0 0) 2.057 ( 0 0)
tl-ae advance_hypervis_dp 1 1 3.000000e+02 1.863304e+00 1.863 ( 0 0) 1.863 ( 0 0)
tl-at prim_advec_tracers_remap_RK2 1 1 3.000000e+02 1.220191e+01 12.202 ( 0 0) 12.202 ( 0 0)
tl-sc vertical_remap 1 1 1.000000e+02 1.002508e+00 1.003 ( 0 0) 1.003 ( 0 0)
HommeTime_stats.rm_ivdep.2
prim_main_loop 1 1 1.000000e+00 1.818752e+01 18.188 ( 0 0) 18.188 ( 0 0)
tl-ae U3-5stage_timestep 1 1 3.000000e+02 2.056879e+00 2.057 ( 0 0) 2.057 ( 0 0)
tl-ae advance_hypervis_dp 1 1 3.000000e+02 1.861107e+00 1.861 ( 0 0) 1.861 ( 0 0)
tl-at prim_advec_tracers_remap_RK2 1 1 3.000000e+02 1.219423e+01 12.194 ( 0 0) 12.194 ( 0 0)
tl-sc vertical_remap 1 1 1.000000e+02 1.002711e+00 1.003 ( 0 0) 1.003 ( 0 0)
HommeTime_stats.rm_ivdep.3
prim_main_loop 1 1 1.000000e+00 1.815621e+01 18.156 ( 0 0) 18.156 ( 0 0)
tl-ae U3-5stage_timestep 1 1 3.000000e+02 2.054692e+00 2.055 ( 0 0) 2.055 ( 0 0)
tl-ae advance_hypervis_dp 1 1 3.000000e+02 1.858160e+00 1.858 ( 0 0) 1.858 ( 0 0)
tl-at prim_advec_tracers_remap_RK2 1 1 3.000000e+02 1.219419e+01 12.194 ( 0 0) 12.194 ( 0 0)
tl-sc vertical_remap 1 1 1.000000e+02 1.002419e+00 1.002 ( 0 0) 1.002 ( 0 0)
HommeTime_stats.rm_ivdep.4
prim_main_loop 1 1 1.000000e+00 1.817671e+01 18.177 ( 0 0) 18.177 ( 0 0)
tl-ae U3-5stage_timestep 1 1 3.000000e+02 2.058005e+00 2.058 ( 0 0) 2.058 ( 0 0)
tl-ae advance_hypervis_dp 1 1 3.000000e+02 1.865705e+00 1.866 ( 0 0) 1.866 ( 0 0)
tl-at prim_advec_tracers_remap_RK2 1 1 3.000000e+02 1.219704e+01 12.197 ( 0 0) 12.197 ( 0 0)
tl-sc vertical_remap 1 1 1.000000e+02 1.003191e+00 1.003 ( 0 0) 1.003 ( 0 0)
HommeTime_stats.rm_ivdep.5
prim_main_loop 1 1 1.000000e+00 1.815682e+01 18.157 ( 0 0) 18.157 ( 0 0)
tl-ae U3-5stage_timestep 1 1 3.000000e+02 2.056175e+00 2.056 ( 0 0) 2.056 ( 0 0)
tl-ae advance_hypervis_dp 1 1 3.000000e+02 1.860681e+00 1.861 ( 0 0) 1.861 ( 0 0)
tl-at prim_advec_tracers_remap_RK2 1 1 3.000000e+02 1.219514e+01 12.195 ( 0 0) 12.195 ( 0 0)
tl-sc vertical_remap 1 1 1.000000e+02 1.002413e+00 1.002 ( 0 0) 1.002 ( 0 0)
Blake performance looks unchanged with the same namelist and the run command `mpiexec -n 32 --bind-to core -x OMP_PROC_BIND=spread -x OMP_PLACES=threads -x OMP_NUM_THREADS=2 ./prtcB_c.${b}`:
HommeTime_stats.master.1
prim_main_loop 32 32 3.200000e+01 8.692271e+02 27.164 ( 0 0) 27.163 ( 5 0)
tl-ae U3-5stage_timestep 32 32 9.600000e+03 5.505940e+01 2.702 ( 16 0) 1.090 ( 19 0)
tl-ae advance_hypervis_dp 32 32 9.600000e+03 3.432108e+01 1.089 ( 5 0) 1.058 ( 31 0)
tl-at prim_advec_tracers_remap_RK2 32 32 9.600000e+03 7.225926e+02 23.189 ( 19 0) 21.643 ( 16 0)
tl-sc vertical_remap 32 32 3.200000e+03 4.837127e+01 1.542 ( 13 0) 1.496 ( 28 0)
HommeTime_stats.master.2
prim_main_loop 32 32 3.200000e+01 8.469057e+02 26.466 ( 0 0) 26.466 ( 12 0)
tl-ae U3-5stage_timestep 32 32 9.600000e+03 5.534785e+01 2.724 ( 12 0) 1.100 ( 1 0)
tl-ae advance_hypervis_dp 32 32 9.600000e+03 3.382453e+01 1.064 ( 0 0) 1.048 ( 15 0)
tl-at prim_advec_tracers_remap_RK2 32 32 9.600000e+03 7.004775e+02 22.500 ( 1 0) 20.943 ( 12 0)
tl-sc vertical_remap 32 32 3.200000e+03 4.839876e+01 1.543 ( 13 0) 1.494 ( 28 0)
HommeTime_stats.master.3
prim_main_loop 32 32 3.200000e+01 8.627611e+02 26.961 ( 0 0) 26.961 ( 4 0)
tl-ae U3-5stage_timestep 32 32 9.600000e+03 5.530597e+01 2.732 ( 16 0) 1.083 ( 1 0)
tl-ae advance_hypervis_dp 32 32 9.600000e+03 3.365557e+01 1.059 ( 6 0) 1.043 ( 1 0)
tl-at prim_advec_tracers_remap_RK2 32 32 9.600000e+03 7.164708e+02 23.016 ( 1 0) 21.421 ( 16 0)
tl-sc vertical_remap 32 32 3.200000e+03 4.842331e+01 1.541 ( 13 0) 1.496 ( 8 0)
HommeTime_stats.master.4
prim_main_loop 32 32 3.200000e+01 8.614437e+02 26.920 ( 0 0) 26.920 ( 20 0)
tl-ae U3-5stage_timestep 32 32 9.600000e+03 5.726757e+01 2.776 ( 16 0) 1.083 ( 9 0)
tl-ae advance_hypervis_dp 32 32 9.600000e+03 3.400087e+01 1.071 ( 17 0) 1.054 ( 31 0)
tl-at prim_advec_tracers_remap_RK2 32 32 9.600000e+03 7.131761e+02 22.968 ( 9 0) 21.340 ( 16 0)
tl-sc vertical_remap 32 32 3.200000e+03 4.839463e+01 1.551 ( 13 0) 1.494 ( 18 0)
HommeTime_stats.master.5
prim_main_loop 32 32 3.200000e+01 8.611633e+02 26.912 ( 0 0) 26.911 ( 23 0)
tl-ae U3-5stage_timestep 32 32 9.600000e+03 5.690941e+01 2.916 ( 16 0) 1.120 ( 9 0)
tl-ae advance_hypervis_dp 32 32 9.600000e+03 3.397308e+01 1.070 ( 17 0) 1.053 ( 11 0)
tl-at prim_advec_tracers_remap_RK2 32 32 9.600000e+03 7.129033e+02 22.904 ( 9 0) 21.190 ( 16 0)
tl-sc vertical_remap 32 32 3.200000e+03 4.845042e+01 1.564 ( 13 0) 1.495 ( 6 0)
HommeTime_stats.rm_ivdep.1
prim_main_loop 32 32 3.200000e+01 8.538242e+02 26.682 ( 0 0) 26.682 ( 17 0)
tl-ae U3-5stage_timestep 32 32 9.600000e+03 5.732231e+01 2.878 ( 16 0) 1.130 ( 9 0)
tl-ae advance_hypervis_dp 32 32 9.600000e+03 3.417739e+01 1.084 ( 20 0) 1.052 ( 31 0)
tl-at prim_advec_tracers_remap_RK2 32 32 9.600000e+03 7.048339e+02 22.678 ( 9 0) 20.983 ( 16 0)
tl-sc vertical_remap 32 32 3.200000e+03 4.829977e+01 1.533 ( 25 0) 1.491 ( 6 0)
HommeTime_stats.rm_ivdep.2
prim_main_loop 32 32 3.200000e+01 8.596679e+02 26.865 ( 0 0) 26.865 ( 10 0)
tl-ae U3-5stage_timestep 32 32 9.600000e+03 5.723510e+01 2.912 ( 12 0) 1.062 ( 1 0)
tl-ae advance_hypervis_dp 32 32 9.600000e+03 3.420961e+01 1.080 ( 9 0) 1.058 ( 19 0)
tl-at prim_advec_tracers_remap_RK2 32 32 9.600000e+03 7.109998e+02 22.943 ( 1 0) 21.155 ( 12 0)
tl-sc vertical_remap 32 32 3.200000e+03 4.829973e+01 1.558 ( 25 0) 1.492 ( 18 0)
HommeTime_stats.rm_ivdep.3
prim_main_loop 32 32 3.200000e+01 8.602280e+02 26.882 ( 0 0) 26.882 ( 18 0)
tl-ae U3-5stage_timestep 32 32 9.600000e+03 5.538318e+01 2.851 ( 16 0) 1.084 ( 9 0)
tl-ae advance_hypervis_dp 32 32 9.600000e+03 3.406384e+01 1.074 ( 18 0) 1.053 ( 31 0)
tl-at prim_advec_tracers_remap_RK2 32 32 9.600000e+03 7.132896e+02 22.910 ( 9 0) 21.226 ( 16 0)
tl-sc vertical_remap 32 32 3.200000e+03 4.830456e+01 1.555 ( 13 0) 1.492 ( 18 0)
HommeTime_stats.rm_ivdep.4
prim_main_loop 32 32 3.200000e+01 8.506545e+02 26.583 ( 0 0) 26.583 ( 11 0)
tl-ae U3-5stage_timestep 32 32 9.600000e+03 5.540626e+01 2.650 ( 16 0) 1.052 ( 1 0)
tl-ae advance_hypervis_dp 32 32 9.600000e+03 3.353541e+01 1.058 ( 9 0) 1.040 ( 13 0)
tl-at prim_advec_tracers_remap_RK2 32 32 9.600000e+03 7.042963e+02 22.686 ( 1 0) 21.136 ( 16 0)
tl-sc vertical_remap 32 32 3.200000e+03 4.826906e+01 1.531 ( 25 0) 1.492 ( 28 0)
HommeTime_stats.rm_ivdep.5
prim_main_loop 32 32 3.200000e+01 8.506496e+02 26.583 ( 0 0) 26.583 ( 28 0)
tl-ae U3-5stage_timestep 32 32 9.600000e+03 5.560170e+01 2.769 ( 16 0) 1.104 ( 1 0)
tl-ae advance_hypervis_dp 32 32 9.600000e+03 3.376761e+01 1.062 ( 10 0) 1.046 ( 19 0)
tl-at prim_advec_tracers_remap_RK2 32 32 9.600000e+03 7.039295e+02 22.609 ( 1 0) 21.012 ( 16 0)
tl-sc vertical_remap 32 32 3.200000e+03 4.822914e+01 1.535 ( 25 0) 1.489 ( 6 0)
These `ivdep` directives shouldn't be needed, as `ivdep` is a subset of `simd`.
Removing them should also fix the gcc build errors.