
Benchmarking portable staggered fermion kernel written in Kokkos and MPI #36

Open crtrott opened 7 months ago

simon-schlepphorst commented 7 months ago

Simulations of Lattice Quantum Chromodynamics (LQCD) are an important application (consuming a two-digit percentage of cycles) on major High Performance Computing (HPC) installations, including systems at and near the top of the Top500 list. In the rapidly changing hardware landscape of HPC, tying up manpower to optimize simulation software for every architecture becomes a sustainability issue. In this work we explore the feasibility of using performance-portable parallel code for an important LQCD kernel. Fusing the Kokkos C++ Performance Portability EcoSystem with MPI allows applications to scale on massively parallel machines while still being able to target a multitude of different architectures with the same simple code. We report benchmarking results for a range of currently deployed and recently introduced systems, including AMD EPYC 7742, AMD MI250, Fujitsu A64FX, Nvidia A100 and Nvidia H100 components, with mostly encouraging results.
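The staggered fermion kernel itself is not reproduced here, but for readers unfamiliar with the programming model, the Kokkos + MPI combination the abstract describes follows a pattern roughly like the sketch below (hypothetical lattice size and placeholder per-site arithmetic, not the benchmarked kernel): each rank owns a chunk of lattice sites, a Kokkos kernel runs the per-site work on whatever backend Kokkos was compiled for, and MPI combines the rank-local results.

#include <Kokkos_Core.hpp>
#include <mpi.h>
#include <cstdio>

int main(int argc, char* argv[]) {
  MPI_Init(&argc, &argv);
  Kokkos::initialize(argc, argv);
  {
    const int n_local = 1 << 20;  // sites owned by this rank (made-up size)
    Kokkos::View<double*> phi("phi", n_local);

    // Per-site update; a real staggered fermion kernel would gather
    // neighbor sites and gauge links here instead of this placeholder.
    Kokkos::parallel_for("site_update", n_local, KOKKOS_LAMBDA(const int i) {
      phi(i) = 2.0 * static_cast<double>(i % 7);
    });

    // Rank-local reduction on the device ...
    double local_sum = 0.0;
    Kokkos::parallel_reduce("site_sum", n_local,
        KOKKOS_LAMBDA(const int i, double& s) { s += phi(i); }, local_sum);

    // ... combined across ranks with MPI.
    double global_sum = 0.0;
    MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM,
                  MPI_COMM_WORLD);

    int rank = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    if (rank == 0) std::printf("global sum = %f\n", global_sum);
  }
  Kokkos::finalize();
  MPI_Finalize();
  return 0;
}

The same source compiles unchanged for the CPU and GPU systems listed in the abstract; only the Kokkos backend selected at build time differs.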

simon-schlepphorst commented 6 months ago

Slides: pdf

simon-schlepphorst commented 6 months ago

Patch against Kokkos 4.0.1 to emulate different effective cache sizes on Nvidia GPUs:

diff --git a/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
index 5afad7a6a..2935b85ef 100644
--- a/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
+++ b/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
@@ -216,6 +216,7 @@ inline void configure_shmem_preference(const KernelFuncPtr& func,
                                min_shmem_size_per_sm) /
                               max_shmem_per_sm;
   if (carveout > 100) carveout = 100;
+  carveout = CARVEOUT;

   // Set the carveout, but only call it once per kernel or when it changes
   auto set_cache_config = [&] {
@@ -379,15 +380,11 @@ struct CudaParallelLaunchKernelInvoker<

     if (!Impl::is_empty_launch(grid, block)) {
       Impl::check_shmem_request(cuda_instance, shmem);
-      if constexpr (DriverType::Policy::
-                        experimental_contains_desired_occupancy) {
-        int desired_occupancy =
-            driver.get_policy().impl_get_desired_occupancy().value();
+        int desired_occupancy = 100;
         size_t block_size = block.x * block.y * block.z;
         Impl::configure_shmem_preference<DriverType, LaunchBounds>(
             base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
             shmem, desired_occupancy);
-      }

       void const* args[] = {&driver};

@@ -478,15 +475,11 @@ struct CudaParallelLaunchKernelInvoker<

     if (!Impl::is_empty_launch(grid, block)) {
       Impl::check_shmem_request(cuda_instance, shmem);
-      if constexpr (DriverType::Policy::
-                        experimental_contains_desired_occupancy) {
-        int desired_occupancy =
-            driver.get_policy().impl_get_desired_occupancy().value();
+        int desired_occupancy = 100;
         size_t block_size = block.x * block.y * block.z;
         Impl::configure_shmem_preference<DriverType, LaunchBounds>(
             base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
             shmem, desired_occupancy);
-      }

       auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver);

@@ -652,17 +645,13 @@ struct CudaParallelLaunchImpl<

       Impl::check_shmem_request(cuda_instance, shmem);

-      if constexpr (DriverType::Policy::
-                        experimental_contains_desired_occupancy) {
-        int desired_occupancy =
-            driver.get_policy().impl_get_desired_occupancy().value();
+        int desired_occupancy = 100;
         size_t block_size = block.x * block.y * block.z;
         Impl::configure_shmem_preference<
             DriverType,
             Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
             base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
             shmem, desired_occupancy);
-      }

       ensure_cuda_lock_arrays_on_device();

The CARVEOUT macro is filled in with a value between 0 and 100 at configure time, e.g. -DCMAKE_CXX_FLAGS="-DCARVEOUT=100"
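For context: Kokkos passes the carveout value computed in configure_shmem_preference (and overridden by this patch) to the CUDA runtime as the preferred shared-memory carveout, i.e. the percentage of the SM's unified L1/shared-memory resource to reserve as shared memory; whatever is not carved out remains available as L1 cache. A standalone sketch of that underlying CUDA call (my_kernel is a placeholder, not Kokkos code):

#include <cuda_runtime.h>
#include <cstdio>

__global__ void my_kernel() {}

int main() {
  // Ask for 100% of the configurable on-chip memory as shared memory,
  // i.e. the smallest possible L1 cache for this kernel.
  cudaError_t err = cudaFuncSetAttribute(
      my_kernel, cudaFuncAttributePreferredSharedMemoryCarveout, 100);
  if (err != cudaSuccess) std::printf("%s\n", cudaGetErrorString(err));

  my_kernel<<<1, 32>>>();
  cudaDeviceSynchronize();
  return 0;
}

Note that the attribute is only a hint and the hardware supports only a discrete set of L1/shared-memory splits, so the L1 size actually realized for a given percentage has to be measured.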

Results: pdf

The actual mapping between CARVEOUT values and effective cache sizes still needs to be determined.