peterrum / dealii

The development repository for the deal.II finite element library.
https://www.dealii.org/
Other
1 stars 0 forks source link

LinearAlgebra::SharedMPI::Vector #28

Closed peterrum closed 2 years ago

peterrum commented 3 years ago

Goal

Introduce a new vector (LinearAlgebra::SharedMPI::Vector) in deal.II in such a way that it can be used in dealii-based DG-application (DG, ghost cells), in hyper.deal (DG, ghost, faces), and in the CEED benchmarks (CG).

related to https://github.com/dealii/dealii/pull/10872, https://github.com/hyperdeal/hyperdeal/pull/18, and https://github.com/peterrum/ceed_benchmarks_dealii/tree/sm_vector_mf

Layered approach

Initialize partitioner

Get partitioner

Initialize vector

Update vector

Accessing vector during cell/face integrals

via read_dof_values(), distribute_local_to_global(), gather_evaluate(), integrate_scatter()

  if (n_filled_lanes == VectorizedArrayType::size() &&
      n_lanes == VectorizedArrayType::size())
    {
      if (this->dof_info->index_storage_variants[ind][this->cell] ==
          internal::MatrixFreeFunctions::DoFInfo::IndexStorageVariants::
            contiguous)
        {
          if (n_components == 1 || n_fe_components == 1)
            for (unsigned int comp = 0; comp < n_components; ++comp)
              operation.process_dofs_vectorized_transpose(
                this->data->dofs_per_component_on_cell,
                dof_indices,
                *src[comp],
                values_dofs[comp],
                vector_selector);
          else
            operation.process_dofs_vectorized_transpose(
              this->data->dofs_per_component_on_cell * n_components,
              dof_indices,
              *src[0],
              &values_dofs[0][0],
              vector_selector);
        }

https://github.com/dealii/dealii/blob/4cf8f26cbf26f12e630aff124cd112fb3f24180e/include/deal.II/matrix_free/fe_evaluation.h#L4618-L4640

    for (unsigned int comp = 0; comp < n_components; ++comp)
      {
        for (unsigned int i = 0; i < this->data->dofs_per_component_on_cell;
             ++i)
          operation.process_empty(values_dofs[comp][i]);
        if (this->dof_info->index_storage_variants[ind][this->cell] ==
            internal::MatrixFreeFunctions::DoFInfo::IndexStorageVariants::
              contiguous)
          {
            if (n_components == 1 || n_fe_components == 1)
              {
                for (unsigned int v = 0; v < n_filled_lanes; ++v)
                  if (mask[v] == true)
                    for (unsigned int i = 0;
                         i < this->data->dofs_per_component_on_cell;
                         ++i)
                      operation.process_dof(dof_indices[v] + i,
                                            *src[comp],
                                            values_dofs[comp][i][v]);
              }
            else
              {
                for (unsigned int v = 0; v < n_filled_lanes; ++v)
                  if (mask[v] == true)
                    for (unsigned int i = 0;
                         i < this->data->dofs_per_component_on_cell;
                         ++i)
                      operation.process_dof(
                        dof_indices[v] + i +
                          comp * this->data->dofs_per_component_on_cell,
                        *src[0],
                        values_dofs[comp][i][v]);
              }
          }

https://github.com/dealii/dealii/blob/4cf8f26cbf26f12e630aff124cd112fb3f24180e/include/deal.II/matrix_free/fe_evaluation.h#L4712-L4745

            // case 4: contiguous indices without interleaving
            else if (n_face_orientations > 1 ||
                     dof_info.index_storage_variants[dof_access_index][cell] ==
                       MatrixFreeFunctions::DoFInfo::IndexStorageVariants::
                         contiguous)
              {
                const unsigned int *indices =
                  &dof_info.dof_indices_contiguous[dof_access_index]
                                                  [cell *
                                                   VectorizedArrayType::size()];
                Number2_ *vector_ptr =
                  global_vector_ptr + comp * static_dofs_per_component +
                  dof_info
                    .component_dof_indices_offset[active_fe_index]
                                                 [first_selected_component];

                if (do_gradients == true &&
                    data.element_type ==
                      MatrixFreeFunctions::tensor_symmetric_hermite)
                  {
                    if (n_face_orientations == 1 &&
                        dof_info.n_vectorization_lanes_filled[dof_access_index]
                                                             [cell] ==
                          VectorizedArrayType::size())
                      for (unsigned int i = 0; i < dofs_per_face; ++i)
                        {
                          const unsigned int ind1 =
                            index_array_hermite[0][2 * i];
                          const unsigned int ind2 =
                            index_array_hermite[0][2 * i + 1];
                          const unsigned int i_ = reorientate(0, i);

                          proc.function_2a(temp1[i_],
                                           temp1[i_ + dofs_per_face],
                                           vector_ptr + ind1,
                                           vector_ptr + ind2,
                                           grad_weight,
                                           indices,
                                           indices);
                        }
                    else if (n_face_orientations == 1)
                      for (unsigned int i = 0; i < dofs_per_face; ++i)
                        {
                          const unsigned int ind1 =
                            index_array_hermite[0][2 * i];
                          const unsigned int ind2 =
                            index_array_hermite[0][2 * i + 1];
                          const unsigned int i_ = reorientate(0, i);

                          const unsigned int n_filled_lanes =
                            dof_info
                              .n_vectorization_lanes_filled[dof_access_index]
                                                           [cell];

                          for (unsigned int v = 0; v < n_filled_lanes; ++v)
                            proc.function_3a(temp1[i_][v],
                                             temp1[i_ + dofs_per_face][v],
                                             vector_ptr[ind1 + indices[v]],
                                             vector_ptr[ind2 + indices[v]],
                                             grad_weight[v]);

                          if (integrate == false)
                            for (unsigned int v = n_filled_lanes;
                                 v < VectorizedArrayType::size();
                                 ++v)
                              {
                                temp1[i_][v]                 = 0.0;
                                temp1[i_ + dofs_per_face][v] = 0.0;
                              }
                        }
                    else
                      {
                        Assert(false, ExcNotImplemented());

                        const unsigned int n_filled_lanes =
                          dof_info
                            .n_vectorization_lanes_filled[dof_access_index]
                                                         [cell];

                        for (unsigned int v = 0; v < n_filled_lanes; ++v)
                          for (unsigned int i = 0; i < dofs_per_face; ++i)
                            proc.function_3a(
                              temp1[reorientate(v, i)][v],
                              temp1[reorientate(v, i) + dofs_per_face][v],
                              vector_ptr[index_array_hermite[v][2 * i] +
                                         indices[v]],
                              vector_ptr[index_array_hermite[v][2 * i + 1] +
                                         indices[v]],
                              grad_weight[v]);
                      }
                  }
                else
                  {
                    if (n_face_orientations == 1 &&
                        dof_info.n_vectorization_lanes_filled[dof_access_index]
                                                             [cell] ==
                          VectorizedArrayType::size())
                      for (unsigned int i = 0; i < dofs_per_face; ++i)
                        {
                          const unsigned int ind = index_array_nodal[0][i];
                          const unsigned int i_  = reorientate(0, i);

                          proc.function_2b(temp1[i_],
                                           vector_ptr + ind,
                                           indices);
                        }
                    else if (n_face_orientations == 1)
                      for (unsigned int i = 0; i < dofs_per_face; ++i)
                        {
                          const unsigned int ind = index_array_nodal[0][i];
                          const unsigned int i_  = reorientate(0, i);

                          const unsigned int n_filled_lanes =
                            dof_info
                              .n_vectorization_lanes_filled[dof_access_index]
                                                           [cell];

                          for (unsigned int v = 0; v < n_filled_lanes; ++v)
                            proc.function_3b(temp1[i_][v],
                                             vector_ptr[ind + indices[v]]);

                          if (integrate == false)
                            for (unsigned int v = n_filled_lanes;
                                 v < VectorizedArrayType::size();
                                 ++v)
                              temp1[i_][v] = 0.0;
                        }
                    else
                      for (unsigned int i = 0; i < dofs_per_face; ++i)
                        {
                          for (unsigned int v = 0;
                               v < VectorizedArrayType::size();
                               ++v)
                            if (cells[v] != numbers::invalid_unsigned_int)
                              proc.function_3b(
                                temp1[reorientate(v, i)][v],
                                vector_ptr[index_array_nodal[v][i] +
                                           dof_info.dof_indices_contiguous
                                             [dof_access_index][cells[v]]]);
                        }
                  }
              }
            else
              {
                // case 5: default vector access
                AssertDimension(n_face_orientations, 1);

                // for the integrate_scatter path (integrate == true), we
                // need to only prepare the data in this function for all
                // components to later call distribute_local_to_global();
                // for the gather_evaluate path (integrate == false), we
                // instead want to leave early because we need to get the
                // vector data from somewhere else
                proc.function_5(temp1, comp);
                if (integrate)
                  accesses_global_vector = false;
                else
                  return false;
              }
          }

https://github.com/dealii/dealii/blob/4cf8f26cbf26f12e630aff124cd112fb3f24180e/include/deal.II/matrix_free/evaluation_kernels.h#L2980-L3139

      template <typename T0, typename T1, typename T2, typename T3>
      void
      function_2a(T0 &      temp_1,
                  T0 &      temp_2,
                  const T1  src_ptr_1,
                  const T1  src_ptr_2,
                  const T2 &grad_weight,
                  const T3 &indices_1,
                  const T3 &indices_2)
      {
        // case 2a)
        do_vectorized_gather(src_ptr_1, indices_1, temp_1);
        do_vectorized_gather(src_ptr_2, indices_2, temp_2);
        temp_2 = grad_weight * (temp_1 - temp_2);
      }

      template <typename T0, typename T1, typename T2>
      void
      function_2b(T0 &temp, const T1 src_ptr, const T2 &indices)
      {
        // case 2b)
        do_vectorized_gather(src_ptr, indices, temp);
      }

      template <typename T0, typename T1, typename T2>
      void
      function_3a(T0 &      temp_1,
                  T0 &      temp_2,
                  const T1 &src_ptr_1,
                  const T2 &src_ptr_2,
                  const T2 &grad_weight)
      {
        // case 3a)
        temp_1 = src_ptr_1;
        temp_2 = grad_weight * (temp_1 - src_ptr_2);
      }

      template <typename T1, typename T2>
      void
      function_3b(T1 &temp, const T2 &src_ptr)
      {
        // case 3b)
        temp = src_ptr;
      }

https://github.com/dealii/dealii/blob/4cf8f26cbf26f12e630aff124cd112fb3f24180e/include/deal.II/matrix_free/evaluation_kernels.h#L3310-L3353

      template <typename T0, typename T1, typename T2, typename T3>
      void
      function_2a(const T0 &temp_1,
                  const T0 &temp_2,
                  T1        dst_ptr_1,
                  T1        dst_ptr_2,
                  const T2 &grad_weight,
                  const T3 &indices_1,
                  const T3 &indices_2)
      {
        // case 2a)
        const VectorizedArrayType val  = temp_1 - grad_weight * temp_2;
        const VectorizedArrayType grad = grad_weight * temp_2;
        do_vectorized_scatter_add(val, indices_1, dst_ptr_1);
        do_vectorized_scatter_add(grad, indices_2, dst_ptr_2);
      }

      template <typename T0, typename T1, typename T2>
      void
      function_2b(const T0 &temp, T1 dst_ptr, const T2 &indices)
      {
        // case 2b)
        do_vectorized_scatter_add(temp, indices, dst_ptr);
      }

      template <typename T0, typename T1, typename T2>
      void
      function_3a(const T0 &temp_1,
                  const T0 &temp_2,
                  T1 &      dst_ptr_1,
                  T1 &      dst_ptr_2,
                  const T2 &grad_weight)
      {
        // case 3a)
        const Number val  = temp_1 - grad_weight * temp_2;
        const Number grad = grad_weight * temp_2;
        dst_ptr_1 += val;
        dst_ptr_2 += grad;
      }

      template <typename T0, typename T1>
      void
      function_3b(const T0 &temp, T1 &dst_ptr)
      {
        // case 3b)
        dst_ptr += temp;
      }

https://github.com/dealii/dealii/blob/4cf8f26cbf26f12e630aff124cd112fb3f24180e/include/deal.II/matrix_free/evaluation_kernels.h#L3586-L3632

peterrum commented 3 years ago

Reference: hyper.deal

internal::MatrixFreeFunctions::DoFInfo

support of contiguous compressed index storage (in spirit of dealii::internal::MatrixFreeFunctions::DoFInfo::dof_indices_contiguous):

        std::array<std::vector<std::pair<unsigned int, unsigned int>>, 4>
          dof_indices_contiguous_ptr;

https://github.com/hyperdeal/hyperdeal/blob/1fadb385133a67922f793db61f7665d1db87a31f/include/hyper.deal/matrix_free/dof_info.h#L46-L47

ReadWriteOperation::process_cell

cases to consider: all lanes filled, some lanes filled

      template <typename Number>
      template <int dim,
                int degree,
                typename VectorOperation,
                typename VectorizedArrayType>
      void
      ReadWriteOperation<Number>::process_cell(
        const VectorOperation &      operation,
        const std::vector<double *> &global,
        VectorizedArrayType *        local,
        const unsigned int           cell_batch_number) const
      {
        static const unsigned int v_len = VectorizedArrayType::size();
        static const unsigned int n_dofs_per_cell =
          dealii::Utilities::pow<unsigned int>(degree + 1, dim);

        // step 1: get pointer to the first dof of the cell in the sm-domain
        std::array<Number *, v_len> global_ptr;
        std::fill(global_ptr.begin(), global_ptr.end(), nullptr);

        for (unsigned int v = 0;
             v < n_vectorization_lanes_filled[2][cell_batch_number] &&
             v < v_len;
             v++)
          {
            const auto sm_ptr =
              dof_indices_contiguous_ptr[2][v_len * cell_batch_number + v];
            global_ptr[v] = global[sm_ptr.first] + sm_ptr.second;
          }

        // step 2: process dofs
        if (n_vectorization_lanes_filled[2][cell_batch_number] == v_len)
          // case 1: all lanes are filled -> use optimized function
          operation.process_dofs_vectorized_transpose(n_dofs_per_cell,
                                                      global_ptr,
                                                      local);
        else
          // case 2: some lanes are empty
          for (unsigned int i = 0; i < n_dofs_per_cell; ++i)
            for (unsigned int v = 0;
                 v < n_vectorization_lanes_filled[2][cell_batch_number] &&
                 v < v_len;
                 v++)
              operation.process_dof(global_ptr[v][i], local[i][v]);
      }

https://github.com/hyperdeal/hyperdeal/blob/0ae55a2dba9803cda3f8a40792309d1738178dde/include/hyper.deal/matrix_free/read_write_operation.h#L128-L172

ReadWriteOperation::process_face

cases to consider: all lanes filled, some lanes filled, lanes correspond to cells or (ghost) faces

      template <typename Number>
      template <int dim_x,
                int dim_v,
                int degree,
                typename VectorOperation,
                typename VectorizedArrayType>
      void
      ReadWriteOperation<Number>::process_face(
        const VectorOperation &      operation,
        const std::vector<double *> &global,
        VectorizedArrayType *        local,
        const unsigned int *         face_no,
        const unsigned int *         face_orientation,
        const unsigned int           face_orientation_offset,
        const unsigned int           cell_batch_number,
        const unsigned int           cell_side,
        const unsigned int           face_batch_number,
        const unsigned int           face_side) const
      {
        static const unsigned int dim   = dim_x + dim_v;
        static const unsigned int v_len = VectorizedArrayType::size();
        static const unsigned int n_dofs_per_face =
          dealii::Utilities::pow<unsigned int>(degree + 1, dim - 1);

        std::array<Number *, v_len> global_ptr;
        std::fill(global_ptr.begin(), global_ptr.end(), nullptr);

        for (unsigned int v = 0;
             v < n_vectorization_lanes_filled[cell_side][cell_batch_number] &&
             v < v_len;
             v++)
          {
            const auto sm_ptr =
              dof_indices_contiguous_ptr[face_side]
                                        [v_len * face_batch_number + v];
            global_ptr[v] = global[sm_ptr.first] + sm_ptr.second;
          }

        if (n_vectorization_lanes_filled[cell_side][cell_batch_number] ==
              v_len &&
            (face_side == 2 || face_all[face_side][face_batch_number]))
          {
            const auto &face_orientations_ =
              face_orientations[face_orientation[0] + face_orientation_offset];

            if (face_side != 2 &&
                face_type[face_side][v_len * face_batch_number])
              {
                // case 1: read from buffers
                for (unsigned int i = 0; i < n_dofs_per_face; ++i)
                  {
                    const unsigned int i_ = (((dim_x <= 2) && (dim_v <= 2)) ||
                                             face_orientation[0] == 0) ?
                                              i :
                                              face_orientations_[i];

                    for (unsigned int v = 0; v < v_len; ++v)
                      operation.process_dof(global_ptr[v][i], local[i_][v]);
                  }
              }
            else
              {
                // case 2: read from shared memory
                for (unsigned int i = 0; i < n_dofs_per_face; ++i)
                  {
                    const unsigned int i_ = (((dim_x <= 2) && (dim_v <= 2)) ||
                                             face_orientation[0] == 0) ?
                                              i :
                                              face_orientations_[i];

                    for (unsigned int v = 0; v < v_len; ++v)
                      operation.process_dof(
                        global_ptr[v][face_to_cell_index_nodal[face_no[0]][i]],
                        local[i_][v]);
                  }
              }
          }
        else
          for (unsigned int v = 0;
               v < n_vectorization_lanes_filled[cell_side][cell_batch_number] &&
               v < v_len;
               v++)
            {
              const auto &face_orientations_ =
                face_orientations[face_orientation[face_side == 3 ? v : 0] +
                                  face_orientation_offset];

              if (face_side != 2 &&
                  face_type[face_side][v_len * face_batch_number + v])
                {
                  // case 1: read from buffers
                  for (unsigned int i = 0; i < n_dofs_per_face; ++i)
                    {
                      const unsigned int i_ =
                        (((dim_x <= 2) && (dim_v <= 2)) ||
                         face_orientation[face_side == 3 ? v : 0] == 0) ?
                          i :
                          face_orientations_[i];

                      operation.process_dof(global_ptr[v][i], local[i_][v]);
                    }
                }
              else
                {
                  // case 2: read from shared memory
                  for (unsigned int i = 0; i < n_dofs_per_face; ++i)
                    {
                      const unsigned int i_ =
                        (((dim_x <= 2) && (dim_v <= 2)) ||
                         face_orientation[face_side == 3 ? v : 0] == 0) ?
                          i :
                          face_orientations_[i];

                      operation.process_dof(
                        global_ptr[v][face_to_cell_index_nodal
                                        [face_no[face_side == 3 ? v : 0]][i]],
                        local[i_][v]);
                    }
                }
            }
      }

https://github.com/hyperdeal/hyperdeal/blob/0ae55a2dba9803cda3f8a40792309d1738178dde/include/hyper.deal/matrix_free/read_write_operation.h#L176-L296

VectorReader/VectorDistributorLocalToGlobal/VectorSetter

    namespace MatrixFreeFunctions
    {
      template <typename Number, typename VectorizedArrayType>
      struct VectorReader
      {
        void
        process_dof(const Number &global, Number &local) const
        {
          local = global;
        }

        void
        process_dofs_vectorized_transpose(
          const unsigned int dofs_per_cell,
          const std::array<Number *, VectorizedArrayType::size()> &global_ptr,
          VectorizedArrayType *                                    local) const
        {
          vectorized_load_and_transpose(dofs_per_cell, global_ptr, local);
        }
      };

      template <typename Number, typename VectorizedArrayType>
      struct VectorDistributorLocalToGlobal
      {
        void
        process_dof(Number &global, const Number &local) const
        {
          global += local;
        }

        void
        process_dofs_vectorized_transpose(
          const unsigned int                                 dofs_per_cell,
          std::array<Number *, VectorizedArrayType::size()> &global_ptr,
          const VectorizedArrayType *                        local) const
        {
          vectorized_transpose_and_store(true,
                                         dofs_per_cell,
                                         local,
                                         global_ptr);
        }
      };

      template <typename Number, typename VectorizedArrayType>
      struct VectorSetter
      {
        void
        process_dof(Number &global, const Number &local) const
        {
          global = local;
        }

        void
        process_dofs_vectorized_transpose(
          const unsigned int                                 dofs_per_cell,
          std::array<Number *, VectorizedArrayType::size()> &global_ptr,
          const VectorizedArrayType *                        local) const
        {
          vectorized_transpose_and_store(false,
                                         dofs_per_cell,
                                         local,
                                         global_ptr);
        }
      };

    } // namespace MatrixFreeFunctions

https://github.com/hyperdeal/hyperdeal/blob/0ae55a2dba9803cda3f8a40792309d1738178dde/include/hyper.deal/matrix_free/vector_access_internal.h#L25-L94

peterrum commented 3 years ago

A simple matrix-free deal.II code using the new vector:

#include <deal.II/base/conditional_ostream.h>
#include <deal.II/base/convergence_table.h>
#include <deal.II/base/mpi_compute_index_owner_internal.h>
#include <deal.II/base/quadrature_lib.h>

#include <deal.II/distributed/tria.h>

#include <deal.II/dofs/dof_handler.h>

#include <deal.II/fe/fe_dgq.h>
#include <deal.II/fe/mapping_q_generic.h>

#include <deal.II/grid/grid_generator.h>

#include <deal.II/lac/la_parallel_vector.h>
#include <deal.II/lac/la_sm_vector.h>

#include <deal.II/matrix_free/fe_evaluation.h>
#include <deal.II/matrix_free/matrix_free.h>

#include <chrono>
#include <vector>

#include "../sm-util/mpi.h"
#include "../sm-util/timers.h"

using namespace dealii;

using Number = double;

const MPI_Comm comm = MPI_COMM_WORLD;

template <int dim>
void
test(const MPI_Comm     comm,
     const MPI_Comm     comm_sm,
     const unsigned int degree,
     const unsigned int n_refinements)
{
  ConditionalOStream pcout(std::cout, Utilities::MPI::this_mpi_process(comm) == 0);

  // 1) create Triangulation
  parallel::distributed::Triangulation<dim> tria(comm);
  GridGenerator::hyper_cube(tria);
  tria.refine_global(n_refinements);

  // 2) create DoFHandler
  DoFHandler<dim> dof_handler(tria);
  FE_DGQ<dim>     fe(degree);
  dof_handler.distribute_dofs(fe);

  // 3) create MatrixFree
  AffineConstraints<Number> constraint;
  constraint.close();

  QGauss<1> quad(degree + 1);

  MappingQGeneric<dim> mapping(1);

  typename dealii::MatrixFree<dim, Number>::AdditionalData additional_data;
  additional_data.mapping_update_flags =
    update_gradients | update_JxW_values | update_quadrature_points;
  additional_data.mapping_update_flags_inner_faces =
    update_gradients | update_JxW_values | update_quadrature_points;
  additional_data.mapping_update_flags_boundary_faces =
    update_gradients | update_JxW_values | update_quadrature_points;

  dealii::MatrixFree<dim, Number> matrix_free;
  matrix_free.reinit(mapping, dof_handler, constraint, quad, additional_data);

  // 4) create shared-memory Partitioner and Vector
  using VectorType = LinearAlgebra::SharedMPI::Vector<Number>;
  VectorType vec_sm_src, vec_sm_dst;
  matrix_free.initialize_dof_vector(vec_sm_src, comm_sm);
  matrix_free.initialize_dof_vector(vec_sm_dst, comm_sm);

  // 5) create distributed Vector
  LinearAlgebra::distributed::Vector<Number> vec;
  matrix_free.initialize_dof_vector(vec);

  const auto partitioner = matrix_free.get_vector_partitioner(comm_sm);

  std::cout << partitioner->local_size() << " " << partitioner->n_ghost_indices() << std::endl;

  // test read_dof_values() & distribute_local_to_global()
  matrix_free.template loop<VectorType, VectorType>(
    [](const auto &data, auto &dst, const auto &src, const auto range) {
      FEEvaluation<dim, -1, 0, 1, Number> phi(data);

      for (auto cell = range.first; cell < range.second; ++cell)
        {
          phi.reinit(cell);
          phi.read_dof_values(src);

          phi.distribute_local_to_global(dst);
        }
    },
    [](const auto &data, auto &dst, const auto &src, const auto range) {
      FEFaceEvaluation<dim, -1, 0, 1, Number> phi_m(data, true);
      FEFaceEvaluation<dim, -1, 0, 1, Number> phi_p(data, false);

      for (auto cell = range.first; cell < range.second; ++cell)
        {
          phi_m.reinit(cell);
          phi_p.reinit(cell);
          phi_m.read_dof_values(src);
          phi_p.read_dof_values(src);

          phi_m.distribute_local_to_global(dst);
          phi_p.distribute_local_to_global(dst);
        }
    },
    [](const auto &, auto &, const auto &, const auto) {},
    vec_sm_dst,
    vec_sm_src,
    true,
    MatrixFree<dim, Number>::DataAccessOnFaces::unspecified,
    MatrixFree<dim, Number>::DataAccessOnFaces::unspecified);

  // test gather_evaluate() & integrate_scatter()
  matrix_free.template loop<VectorType, VectorType>(
    [](const auto &data, auto &dst, const auto &src, const auto range) {
      FEEvaluation<dim, -1, 0, 1, Number> phi(data);

      for (auto cell = range.first; cell < range.second; ++cell)
        {
          phi.reinit(cell);
          phi.gather_evaluate(src, true, true);

          for (unsigned int q = 0; q < phi.n_q_points; ++q)
            {
              phi.submit_value(phi.get_value(q), q);
              phi.submit_gradient(phi.get_gradient(q), q);
            }

          phi.integrate_scatter(true, true, dst);
        }
    },
    [](const auto &data, auto &dst, const auto &src, const auto range) {
      FEFaceEvaluation<dim, -1, 0, 1, Number> phi_m(data, true);
      FEFaceEvaluation<dim, -1, 0, 1, Number> phi_p(data, false);

      for (auto cell = range.first; cell < range.second; ++cell)
        {
          phi_m.reinit(cell);
          phi_p.reinit(cell);
          phi_m.gather_evaluate(src, true, true);
          phi_p.gather_evaluate(src, true, true);

          for (unsigned int q = 0; q < phi_m.n_q_points; ++q)
            {
              phi_m.submit_value(phi_m.get_value(q), q);
              phi_m.submit_gradient(phi_m.get_gradient(q), q);
              phi_p.submit_value(phi_p.get_value(q), q);
              phi_p.submit_gradient(phi_p.get_gradient(q), q);
            }

          phi_m.integrate_scatter(true, true, dst);
          phi_p.integrate_scatter(true, true, dst);
        }
    },
    [](const auto &, auto &, const auto &, const auto) {},
    vec_sm_dst,
    vec_sm_src,
    true,
    MatrixFree<dim, Number>::DataAccessOnFaces::unspecified,
    MatrixFree<dim, Number>::DataAccessOnFaces::unspecified);
}

int
main(int argc, char **argv)
{
  Utilities::MPI::MPI_InitFinalize mpi(argc, argv, 1);

  const unsigned int rank = Utilities::MPI::this_mpi_process(comm);

  ConditionalOStream pcout(std::cout, rank == 0);

  // print help page if requested
  if (argc > 1 && std::string(argv[1]) == "--help")
    {
      pcout << "mpirun -np 40 ./bench group_size dim degree n_refinements" << std::endl;
      return 0;
    }

  // read parameters form command line
  const unsigned int group_size    = argc < 2 ? 1 : atoi(argv[1]);
  const unsigned int dim           = argc < 3 ? 2 : atoi(argv[2]);
  const unsigned int degree        = argc < 4 ? 3 : atoi(argv[3]);
  const unsigned int n_refinements = argc < 5 ? 2 : atoi(argv[4]);

  hyperdeal::ScopedLikwidInitFinalize likwid;

  // run tests for different group sizes
  {
    // create sm communicator
    MPI_Comm comm_sm;
    MPI_Comm_split(comm, rank / group_size, rank, &comm_sm);

    // perform tests
    if (dim == 2)
      test<2>(comm, comm_sm, degree, n_refinements);
    else if (dim == 3)
      test<3>(comm, comm_sm, degree, n_refinements);

    // free communicator
    MPI_Comm_free(&comm_sm);
  }
}

... basis for discussion.