Add option to use HPMG to solve Poisson equations

This PR enables the use of HPMG as a Poisson solver for Psi, Ez, and Bz using the new HPMG system type 3. Additionally, gsrb_shared was extended to system types 2 and 3, which makes the laser MG solver about 2x faster.

fields.poisson_solver = MGDirichlet # or FFTDirichlet, FFTPeriodic

Note that the fastest Poisson solve is still achieved by using the FFT solver with a resolution of 2^n-1.

Performance of this PR with amr.n_cell = 1023 8191 2000 on an A100 GPU:

TinyProfiler total time across processes [min...avg...max]: 93.89 ... 93.89 ... 93.89

--------------------------------------------------------------------------------------------------
Name                                               NCalls  Excl. Min  Excl. Avg  Excl. Max   Max %
--------------------------------------------------------------------------------------------------
hpmg::MultiGrid::solve3()                            6000      31.73      31.73      31.73  33.79%
hpmg::MultiGrid::solve1()                            2000       16.6       16.6       16.6  17.68%
hpmg::MultiGrid::solve2()                            2000      16.22      16.22      16.22  17.27%
ExplicitDeposition()                                 2000      7.393      7.393      7.393   7.87%

Development branch (same setup, for comparison):

TinyProfiler total time across processes [min...avg...max]: 93.83 ... 93.83 ... 93.83

--------------------------------------------------------------------------------------------------
Name                                               NCalls  Excl. Min  Excl. Avg  Excl. Max   Max %
--------------------------------------------------------------------------------------------------
hpmg::MultiGrid::solve2()                            2000       34.7       34.7       34.7  36.99%
hpmg::MultiGrid::solve1()                            2000      16.73      16.73      16.73  17.83%
AnyDST::Execute()                                   12000         11         11         11  11.72%
ExplicitDeposition()                                 2000      7.394      7.394      7.394   7.88%

Performance of this PR with amr.n_cell = 1024 8192 2000 on an A100 GPU:

TinyProfiler total time across processes [min...avg...max]: 95.76 ... 95.76 ... 95.76

--------------------------------------------------------------------------------------------------
Name                                               NCalls  Excl. Min  Excl. Avg  Excl. Max   Max %
--------------------------------------------------------------------------------------------------
hpmg::MultiGrid::solve3()                            6000       33.4       33.4       33.4  34.88%
hpmg::MultiGrid::solve2()                            2000      17.08      17.08      17.08  17.83%
hpmg::MultiGrid::solve1()                            2000      16.61      16.61      16.61  17.34%
ExplicitDeposition()                                 2000      6.911      6.911      6.911   7.22%

Development branch (same setup, for comparison):

TinyProfiler total time across processes [min...avg...max]: 111.9 ... 111.9 ... 111.9

--------------------------------------------------------------------------------------------------
Name                                               NCalls  Excl. Min  Excl. Avg  Excl. Max   Max %
--------------------------------------------------------------------------------------------------
hpmg::MultiGrid::solve2()                            2000      34.66      34.66      34.66  30.97%
AnyDST::Execute()                                   12000      28.93      28.93      28.93  25.85%
hpmg::MultiGrid::solve1()                            2000       17.2       17.2       17.2  15.37%
ExplicitDeposition()                                 2000      6.897      6.897      6.897   6.16%

--- 64 registers, 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads, function name:
void amrex::launch_global<(int)1024, void amrex::launch<(int)1024, void hpmg::<unnamed>::gsrb_shared<(int)2, (bool)0, (bool)0, (bool)1>(const amrex::Box &, const amrex::Array4<double> &, const amrex::Array4<const double> &, const amrex::Array4<const double> &, const amrex::Array4<double> &, double, double)::[lambda() (instance 1)]>(int, CUstream_st *, T2 &&)::[lambda() (instance 1)]>(T2)
--- 64 registers, 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads, function name:
void amrex::launch_global<(int)1024, void amrex::launch<(int)1024, void hpmg::<unnamed>::gsrb_shared<(int)2, (bool)1, (bool)1, (bool)0>(const amrex::Box &, const amrex::Array4<double> &, const amrex::Array4<const double> &, const amrex::Array4<const double> &, const amrex::Array4<double> &, double, double)::[lambda() (instance 1)]>(int, CUstream_st *, T2 &&)::[lambda() (instance 1)]>(T2)
--- 64 registers, 8 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads, function name:
void amrex::launch_global<(int)1024, void amrex::launch<(int)1024, void hpmg::<unnamed>::gsrb_shared<(int)2, (bool)1, (bool)1, (bool)1>(const amrex::Box &, const amrex::Array4<double> &, const amrex::Array4<const double> &, const amrex::Array4<const double> &, const amrex::Array4<double> &, double, double)::[lambda() (instance 1)]>(int, CUstream_st *, T2 &&)::[lambda() (instance 1)]>(T2)
--- 64 registers, 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads, function name:
void amrex::launch_global<(int)1024, void amrex::launch<(int)1024, void hpmg::<unnamed>::gsrb_shared<(int)1, (bool)1, (bool)1, (bool)1>(const amrex::Box &, const amrex::Array4<double> &, const amrex::Array4<const double> &, const amrex::Array4<const double> &, const amrex::Array4<double> &, double, double)::[lambda() (instance 1)]>(int, CUstream_st *, T2 &&)::[lambda() (instance 1)]>(T2)
--- 60 registers, 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads, function name:
void amrex::launch_global<(int)1024, void amrex::launch<(int)1024, void hpmg::<unnamed>::gsrb_shared<(int)1, (bool)0, (bool)0, (bool)1>(const amrex::Box &, const amrex::Array4<double> &, const amrex::Array4<const double> &, const amrex::Array4<const double> &, const amrex::Array4<double> &, double, double)::[lambda() (instance 1)]>(int, CUstream_st *, T2 &&)::[lambda() (instance 1)]>(T2)
--- 56 registers, 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads, function name:
void amrex::launch_global<(int)1024, void amrex::launch<(int)1024, void hpmg::<unnamed>::gsrb_shared<(int)2, (bool)0, (bool)0, (bool)0>(const amrex::Box &, const amrex::Array4<double> &, const amrex::Array4<const double> &, const amrex::Array4<const double> &, const amrex::Array4<double> &, double, double)::[lambda() (instance 1)]>(int, CUstream_st *, T2 &&)::[lambda() (instance 1)]>(T2)
--- 55 registers, 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads, function name:
void amrex::launch_global<(int)1024, void amrex::launch<(int)1024, void hpmg::<unnamed>::gsrb_shared<(int)1, (bool)1, (bool)1, (bool)0>(const amrex::Box &, const amrex::Array4<double> &, const amrex::Array4<const double> &, const amrex::Array4<const double> &, const amrex::Array4<double> &, double, double)::[lambda() (instance 1)]>(int, CUstream_st *, T2 &&)::[lambda() (instance 1)]>(T2)
--- 52 registers, 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads, function name:
void amrex::launch_global<(int)1024, void amrex::launch<(int)1024, void hpmg::<unnamed>::gsrb_shared<(int)1, (bool)0, (bool)0, (bool)0>(const amrex::Box &, const amrex::Array4<double> &, const amrex::Array4<const double> &, const amrex::Array4<const double> &, const amrex::Array4<double> &, double, double)::[lambda() (instance 1)]>(int, CUstream_st *, T2 &&)::[lambda() (instance 1)]>(T2)
--- 49 registers, 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads, function name:
void amrex::launch_global<(int)1024, void amrex::launch<(int)1024, void hpmg::<unnamed>::gsrb_shared<(int)3, (bool)1, (bool)1, (bool)1>(const amrex::Box &, const amrex::Array4<double> &, const amrex::Array4<const double> &, const amrex::Array4<const double> &, const amrex::Array4<double> &, double, double)::[lambda() (instance 1)]>(int, CUstream_st *, T2 &&)::[lambda() (instance 1)]>(T2)
--- 40 registers, 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads, function name:
void amrex::launch_global<(int)1024, void amrex::launch<(int)1024, void hpmg::<unnamed>::gsrb_shared<(int)3, (bool)1, (bool)1, (bool)0>(const amrex::Box &, const amrex::Array4<double> &, const amrex::Array4<const double> &, const amrex::Array4<const double> &, const amrex::Array4<double> &, double, double)::[lambda() (instance 1)]>(int, CUstream_st *, T2 &&)::[lambda() (instance 1)]>(T2)
--- 39 registers, 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads, function name:
void amrex::launch_global<(int)1024, void amrex::launch<(int)1024, void hpmg::<unnamed>::gsrb_shared<(int)3, (bool)0, (bool)0, (bool)1>(const amrex::Box &, const amrex::Array4<double> &, const amrex::Array4<const double> &, const amrex::Array4<const double> &, const amrex::Array4<double> &, double, double)::[lambda() (instance 1)]>(int, CUstream_st *, T2 &&)::[lambda() (instance 1)]>(T2)
--- 37 registers, 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads, function name:
void amrex::launch_global<(int)1024, void amrex::launch<(int)1024, void hpmg::<unnamed>::gsrb_shared<(int)3, (bool)0, (bool)0, (bool)0>(const amrex::Box &, const amrex::Array4<double> &, const amrex::Array4<const double> &, const amrex::Array4<const double> &, const amrex::Array4<double> &, double, double)::[lambda() (instance 1)]>(int, CUstream_st *, T2 &&)::[lambda() (instance 1)]>(T2)

--- 64 registers, 8 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads, function name:
void amrex::launch_global<(int)1024, void hpmg::<unnamed>::bottomsolve_gpu<(int)16, (int)2, hpmg::MultiGrid::bottomsolve()::[lambda(int, int, int, int, int, int, const amrex::Array4<double> &, const amrex::Array4<double> &, const amrex::Array4<double> &, double, double) (instance 2)], hpmg::MultiGrid::bottomsolve()::[lambda(int, int, const amrex::Array4<double> &, int, int, int, int, const amrex::Array4<double> &, const amrex::Array4<double> &, const amrex::Array4<double> &, double, double) (instance 2)]>(double, double, const amrex::Array4<double> *, const amrex::Array4<double> *, const amrex::Array4<double> *, const amrex::Array4<double> *, int, int, T3 &&, T4 &&)::[lambda() (instance 1)]>(T2)
--- 64 registers, 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads, function name:
void amrex::launch_global<(int)1024, void hpmg::<unnamed>::bottomsolve_gpu<(int)16, (int)1, hpmg::MultiGrid::bottomsolve()::[lambda(int, int, int, int, int, int, const amrex::Array4<double> &, const amrex::Array4<double> &, const amrex::Array4<double> &, double, double) (instance 1)], hpmg::MultiGrid::bottomsolve()::[lambda(int, int, const amrex::Array4<double> &, int, int, int, int, const amrex::Array4<double> &, const amrex::Array4<double> &, const amrex::Array4<double> &, double, double) (instance 1)]>(double, double, const amrex::Array4<double> *, const amrex::Array4<double> *, const amrex::Array4<double> *, const amrex::Array4<double> *, int, int, T3 &&, T4 &&)::[lambda() (instance 1)]>(T2)
--- 63 registers, 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads, function name:
void amrex::launch_global<(int)1024, void hpmg::<unnamed>::bottomsolve_gpu<(int)16, (int)3, hpmg::MultiGrid::bottomsolve()::[lambda(int, int, int, int, int, int, const amrex::Array4<double> &, const amrex::Array4<double> &, const amrex::Array4<double> &, double, double) (instance 3)], hpmg::MultiGrid::bottomsolve()::[lambda(int, int, const amrex::Array4<double> &, int, int, int, int, const amrex::Array4<double> &, const amrex::Array4<double> &, const amrex::Array4<double> &, double, double) (instance 3)]>(double, double, const amrex::Array4<double> *, const amrex::Array4<double> *, const amrex::Array4<double> *, const amrex::Array4<double> *, int, int, T3 &&, T4 &&)::[lambda() (instance 1)]>(T2)

--- 64 registers, 8 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads, function name:
void amrex::launch_global<(int)1024, void hpmg::<unnamed>::bottomsolve_gpu<(int)16, hpmg::MultiGrid::bottomsolve()::[lambda(int, int, int, int, int, int, const amrex::Array4<double> &, double, double, const amrex::Array4<double> &, double, double) (instance 2)], hpmg::MultiGrid::bottomsolve()::[lambda(int, int, double &, double &, int, int, int, int, const amrex::Array4<double> &, double, double, const amrex::Array4<double> &, double, double) (instance 2)]>(double, double, const amrex::Array4<double> *, const amrex::Array4<double> *, const amrex::Array4<double> *, const amrex::Array4<double> *, int, int, T2 &&, T3 &&)::[lambda() (instance 1)]>(T2)
--- 64 registers, 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads, function name:
void amrex::launch_global<(int)1024, void hpmg::<unnamed>::bottomsolve_gpu<(int)16, hpmg::MultiGrid::bottomsolve()::[lambda(int, int, int, int, int, int, const amrex::Array4<double> &, double, double, const amrex::Array4<double> &, double, double) (instance 1)], hpmg::MultiGrid::bottomsolve()::[lambda(int, int, double &, double &, int, int, int, int, const amrex::Array4<double> &, double, double, const amrex::Array4<double> &, double, double) (instance 1)]>(double, double, const amrex::Array4<double> *, const amrex::Array4<double> *, const amrex::Array4<double> *, const amrex::Array4<double> *, int, int, T2 &&, T3 &&)::[lambda() (instance 1)]>(T2)

Hi-PACE / hipace

Add option to use HPMG to solve Poisson equations #1063