Closed peterrum closed 2 years ago
support of contiguous compressed index storage (in spirit of dealii::internal::MatrixFreeFunctions::DoFInfo::dof_indices_contiguous
):
std::array<std::vector<std::pair<unsigned int, unsigned int>>, 4>
dof_indices_contiguous_ptr;
cases to consider: all lanes filled, some lanes filled
template <typename Number>
template <int dim,
int degree,
typename VectorOperation,
typename VectorizedArrayType>
void
ReadWriteOperation<Number>::process_cell(
const VectorOperation & operation,
const std::vector<double *> &global,
VectorizedArrayType * local,
const unsigned int cell_batch_number) const
{
static const unsigned int v_len = VectorizedArrayType::size();
static const unsigned int n_dofs_per_cell =
dealii::Utilities::pow<unsigned int>(degree + 1, dim);
// step 1: get pointer to the first dof of the cell in the sm-domain
std::array<Number *, v_len> global_ptr;
std::fill(global_ptr.begin(), global_ptr.end(), nullptr);
for (unsigned int v = 0;
v < n_vectorization_lanes_filled[2][cell_batch_number] &&
v < v_len;
v++)
{
const auto sm_ptr =
dof_indices_contiguous_ptr[2][v_len * cell_batch_number + v];
global_ptr[v] = global[sm_ptr.first] + sm_ptr.second;
}
// step 2: process dofs
if (n_vectorization_lanes_filled[2][cell_batch_number] == v_len)
// case 1: all lanes are filled -> use optimized function
operation.process_dofs_vectorized_transpose(n_dofs_per_cell,
global_ptr,
local);
else
// case 2: some lanes are empty
for (unsigned int i = 0; i < n_dofs_per_cell; ++i)
for (unsigned int v = 0;
v < n_vectorization_lanes_filled[2][cell_batch_number] &&
v < v_len;
v++)
operation.process_dof(global_ptr[v][i], local[i][v]);
}
cases to consider: all lanes filled, some lanes filled, lanes correspond to cells or (ghost) faces
template <typename Number>
template <int dim_x,
int dim_v,
int degree,
typename VectorOperation,
typename VectorizedArrayType>
void
ReadWriteOperation<Number>::process_face(
const VectorOperation & operation,
const std::vector<double *> &global,
VectorizedArrayType * local,
const unsigned int * face_no,
const unsigned int * face_orientation,
const unsigned int face_orientation_offset,
const unsigned int cell_batch_number,
const unsigned int cell_side,
const unsigned int face_batch_number,
const unsigned int face_side) const
{
static const unsigned int dim = dim_x + dim_v;
static const unsigned int v_len = VectorizedArrayType::size();
static const unsigned int n_dofs_per_face =
dealii::Utilities::pow<unsigned int>(degree + 1, dim - 1);
std::array<Number *, v_len> global_ptr;
std::fill(global_ptr.begin(), global_ptr.end(), nullptr);
for (unsigned int v = 0;
v < n_vectorization_lanes_filled[cell_side][cell_batch_number] &&
v < v_len;
v++)
{
const auto sm_ptr =
dof_indices_contiguous_ptr[face_side]
[v_len * face_batch_number + v];
global_ptr[v] = global[sm_ptr.first] + sm_ptr.second;
}
if (n_vectorization_lanes_filled[cell_side][cell_batch_number] ==
v_len &&
(face_side == 2 || face_all[face_side][face_batch_number]))
{
const auto &face_orientations_ =
face_orientations[face_orientation[0] + face_orientation_offset];
if (face_side != 2 &&
face_type[face_side][v_len * face_batch_number])
{
// case 1: read from buffers
for (unsigned int i = 0; i < n_dofs_per_face; ++i)
{
const unsigned int i_ = (((dim_x <= 2) && (dim_v <= 2)) ||
face_orientation[0] == 0) ?
i :
face_orientations_[i];
for (unsigned int v = 0; v < v_len; ++v)
operation.process_dof(global_ptr[v][i], local[i_][v]);
}
}
else
{
// case 2: read from shared memory
for (unsigned int i = 0; i < n_dofs_per_face; ++i)
{
const unsigned int i_ = (((dim_x <= 2) && (dim_v <= 2)) ||
face_orientation[0] == 0) ?
i :
face_orientations_[i];
for (unsigned int v = 0; v < v_len; ++v)
operation.process_dof(
global_ptr[v][face_to_cell_index_nodal[face_no[0]][i]],
local[i_][v]);
}
}
}
else
for (unsigned int v = 0;
v < n_vectorization_lanes_filled[cell_side][cell_batch_number] &&
v < v_len;
v++)
{
const auto &face_orientations_ =
face_orientations[face_orientation[face_side == 3 ? v : 0] +
face_orientation_offset];
if (face_side != 2 &&
face_type[face_side][v_len * face_batch_number + v])
{
// case 1: read from buffers
for (unsigned int i = 0; i < n_dofs_per_face; ++i)
{
const unsigned int i_ =
(((dim_x <= 2) && (dim_v <= 2)) ||
face_orientation[face_side == 3 ? v : 0] == 0) ?
i :
face_orientations_[i];
operation.process_dof(global_ptr[v][i], local[i_][v]);
}
}
else
{
// case 2: read from shared memory
for (unsigned int i = 0; i < n_dofs_per_face; ++i)
{
const unsigned int i_ =
(((dim_x <= 2) && (dim_v <= 2)) ||
face_orientation[face_side == 3 ? v : 0] == 0) ?
i :
face_orientations_[i];
operation.process_dof(
global_ptr[v][face_to_cell_index_nodal
[face_no[face_side == 3 ? v : 0]][i]],
local[i_][v]);
}
}
}
}
namespace MatrixFreeFunctions
{
template <typename Number, typename VectorizedArrayType>
struct VectorReader
{
void
process_dof(const Number &global, Number &local) const
{
local = global;
}
void
process_dofs_vectorized_transpose(
const unsigned int dofs_per_cell,
const std::array<Number *, VectorizedArrayType::size()> &global_ptr,
VectorizedArrayType * local) const
{
vectorized_load_and_transpose(dofs_per_cell, global_ptr, local);
}
};
template <typename Number, typename VectorizedArrayType>
struct VectorDistributorLocalToGlobal
{
void
process_dof(Number &global, const Number &local) const
{
global += local;
}
void
process_dofs_vectorized_transpose(
const unsigned int dofs_per_cell,
std::array<Number *, VectorizedArrayType::size()> &global_ptr,
const VectorizedArrayType * local) const
{
vectorized_transpose_and_store(true,
dofs_per_cell,
local,
global_ptr);
}
};
template <typename Number, typename VectorizedArrayType>
struct VectorSetter
{
void
process_dof(Number &global, const Number &local) const
{
global = local;
}
void
process_dofs_vectorized_transpose(
const unsigned int dofs_per_cell,
std::array<Number *, VectorizedArrayType::size()> &global_ptr,
const VectorizedArrayType * local) const
{
vectorized_transpose_and_store(false,
dofs_per_cell,
local,
global_ptr);
}
};
} // namespace MatrixFreeFunctions
A simple matrix-free deal.II code using the new vector:
#include <deal.II/base/conditional_ostream.h>
#include <deal.II/base/convergence_table.h>
#include <deal.II/base/mpi_compute_index_owner_internal.h>
#include <deal.II/base/quadrature_lib.h>
#include <deal.II/distributed/tria.h>
#include <deal.II/dofs/dof_handler.h>
#include <deal.II/fe/fe_dgq.h>
#include <deal.II/fe/mapping_q_generic.h>
#include <deal.II/grid/grid_generator.h>
#include <deal.II/lac/la_parallel_vector.h>
#include <deal.II/lac/la_sm_vector.h>
#include <deal.II/matrix_free/fe_evaluation.h>
#include <deal.II/matrix_free/matrix_free.h>
#include <chrono>
#include <vector>
#include "../sm-util/mpi.h"
#include "../sm-util/timers.h"
using namespace dealii;
using Number = double;
const MPI_Comm comm = MPI_COMM_WORLD;
template <int dim>
void
test(const MPI_Comm comm,
const MPI_Comm comm_sm,
const unsigned int degree,
const unsigned int n_refinements)
{
ConditionalOStream pcout(std::cout, Utilities::MPI::this_mpi_process(comm) == 0);
// 1) create Triangulation
parallel::distributed::Triangulation<dim> tria(comm);
GridGenerator::hyper_cube(tria);
tria.refine_global(n_refinements);
// 2) create DoFHandler
DoFHandler<dim> dof_handler(tria);
FE_DGQ<dim> fe(degree);
dof_handler.distribute_dofs(fe);
// 3) create MatrixFree
AffineConstraints<Number> constraint;
constraint.close();
QGauss<1> quad(degree + 1);
MappingQGeneric<dim> mapping(1);
typename dealii::MatrixFree<dim, Number>::AdditionalData additional_data;
additional_data.mapping_update_flags =
update_gradients | update_JxW_values | update_quadrature_points;
additional_data.mapping_update_flags_inner_faces =
update_gradients | update_JxW_values | update_quadrature_points;
additional_data.mapping_update_flags_boundary_faces =
update_gradients | update_JxW_values | update_quadrature_points;
dealii::MatrixFree<dim, Number> matrix_free;
matrix_free.reinit(mapping, dof_handler, constraint, quad, additional_data);
// 4) create shared-memory Partitioner and Vector
using VectorType = LinearAlgebra::SharedMPI::Vector<Number>;
VectorType vec_sm_src, vec_sm_dst;
matrix_free.initialize_dof_vector(vec_sm_src, comm_sm);
matrix_free.initialize_dof_vector(vec_sm_dst, comm_sm);
// 5) create distributed Vector
LinearAlgebra::distributed::Vector<Number> vec;
matrix_free.initialize_dof_vector(vec);
const auto partitioner = matrix_free.get_vector_partitioner(comm_sm);
std::cout << partitioner->local_size() << " " << partitioner->n_ghost_indices() << std::endl;
// test read_dof_values() & distribute_local_to_global()
matrix_free.template loop<VectorType, VectorType>(
[](const auto &data, auto &dst, const auto &src, const auto range) {
FEEvaluation<dim, -1, 0, 1, Number> phi(data);
for (auto cell = range.first; cell < range.second; ++cell)
{
phi.reinit(cell);
phi.read_dof_values(src);
phi.distribute_local_to_global(dst);
}
},
[](const auto &data, auto &dst, const auto &src, const auto range) {
FEFaceEvaluation<dim, -1, 0, 1, Number> phi_m(data, true);
FEFaceEvaluation<dim, -1, 0, 1, Number> phi_p(data, false);
for (auto cell = range.first; cell < range.second; ++cell)
{
phi_m.reinit(cell);
phi_p.reinit(cell);
phi_m.read_dof_values(src);
phi_p.read_dof_values(src);
phi_m.distribute_local_to_global(dst);
phi_p.distribute_local_to_global(dst);
}
},
[](const auto &, auto &, const auto &, const auto) {},
vec_sm_dst,
vec_sm_src,
true,
MatrixFree<dim, Number>::DataAccessOnFaces::unspecified,
MatrixFree<dim, Number>::DataAccessOnFaces::unspecified);
// test gather_evaluate() & integrate_scatter()
matrix_free.template loop<VectorType, VectorType>(
[](const auto &data, auto &dst, const auto &src, const auto range) {
FEEvaluation<dim, -1, 0, 1, Number> phi(data);
for (auto cell = range.first; cell < range.second; ++cell)
{
phi.reinit(cell);
phi.gather_evaluate(src, true, true);
for (unsigned int q = 0; q < phi.n_q_points; ++q)
{
phi.submit_value(phi.get_value(q), q);
phi.submit_gradient(phi.get_gradient(q), q);
}
phi.integrate_scatter(true, true, dst);
}
},
[](const auto &data, auto &dst, const auto &src, const auto range) {
FEFaceEvaluation<dim, -1, 0, 1, Number> phi_m(data, true);
FEFaceEvaluation<dim, -1, 0, 1, Number> phi_p(data, false);
for (auto cell = range.first; cell < range.second; ++cell)
{
phi_m.reinit(cell);
phi_p.reinit(cell);
phi_m.gather_evaluate(src, true, true);
phi_p.gather_evaluate(src, true, true);
for (unsigned int q = 0; q < phi_m.n_q_points; ++q)
{
phi_m.submit_value(phi_m.get_value(q), q);
phi_m.submit_gradient(phi_m.get_gradient(q), q);
phi_p.submit_value(phi_p.get_value(q), q);
phi_p.submit_gradient(phi_p.get_gradient(q), q);
}
phi_m.integrate_scatter(true, true, dst);
phi_p.integrate_scatter(true, true, dst);
}
},
[](const auto &, auto &, const auto &, const auto) {},
vec_sm_dst,
vec_sm_src,
true,
MatrixFree<dim, Number>::DataAccessOnFaces::unspecified,
MatrixFree<dim, Number>::DataAccessOnFaces::unspecified);
}
int
main(int argc, char **argv)
{
Utilities::MPI::MPI_InitFinalize mpi(argc, argv, 1);
const unsigned int rank = Utilities::MPI::this_mpi_process(comm);
ConditionalOStream pcout(std::cout, rank == 0);
// print help page if requested
if (argc > 1 && std::string(argv[1]) == "--help")
{
pcout << "mpirun -np 40 ./bench group_size dim degree n_refinements" << std::endl;
return 0;
}
// read parameters form command line
const unsigned int group_size = argc < 2 ? 1 : atoi(argv[1]);
const unsigned int dim = argc < 3 ? 2 : atoi(argv[2]);
const unsigned int degree = argc < 4 ? 3 : atoi(argv[3]);
const unsigned int n_refinements = argc < 5 ? 2 : atoi(argv[4]);
hyperdeal::ScopedLikwidInitFinalize likwid;
// run tests for different group sizes
{
// create sm communicator
MPI_Comm comm_sm;
MPI_Comm_split(comm, rank / group_size, rank, &comm_sm);
// perform tests
if (dim == 2)
test<2>(comm, comm_sm, degree, n_refinements);
else if (dim == 3)
test<3>(comm, comm_sm, degree, n_refinements);
// free communicator
MPI_Comm_free(&comm_sm);
}
}
... basis for discussion.
Goal
Introduce a new vector (
LinearAlgebra::SharedMPI::Vector
) in deal.II in such a way that it can be used in dealii-based DG-application (DG, ghost cells), inhyper.deal
(DG, ghost, faces), and in the CEED benchmarks (CG).related to https://github.com/dealii/dealii/pull/10872, https://github.com/hyperdeal/hyperdeal/pull/18, and https://github.com/peterrum/ceed_benchmarks_dealii/tree/sm_vector_mf
Layered approach
LinearAlgebra::SharedMPI::Vector
-> storing the raw pointers and providing RW access to arrays and its entries + has aLinearAlgebra::SharedMPI::Partitioner
for defaultupdate_ghost_values()
/compression()
(aka.DataAccessOnFaces::unspecified
)LinearAlgebra::SharedMPI::Partitioner
implementation for communicationIndexSet
-> https://github.com/dealii/dealii/blob/067a4365c2848766320e07fc4f6c9ed7b880ac8f/include/deal.II/lac/la_sm_partitioner.h#L1-L483 and https://github.com/dealii/dealii/blob/067a4365c2848766320e07fc4f6c9ed7b880ac8f/source/lac/la_sm_partitioner.cc#L1-L784 -> CGsetup via
global_cell_index
-> https://github.com/hyperdeal/hyperdeal/blob/0ae55a2dba9803cda3f8a40792309d1738178dde/include/hyper.deal/matrix_free/vector_partitioner.h#L1-L1499 with functions translating cells and faces to pairs (rank, offset):DoFInfo
(owned byMatrixFree
) pre-computing and storing pair for each lane of macro cell using the above functions of the partitioner ->dof_indices_contiguous_ptr
FEEvaluation
usesDoFInfo::dof_indices_contiguous_ptr
Initialize partitioner
DoFInfo
?DoFInfo::vector_partitioner_sm
(x1),DoFInfo::vector_partitioner_face_variants_sm
(x5)?MatrixFree::AdditionalData
?IndexSet
/dealii::Utilties::MPI::Partitioner
vs. DG ->global_cell_index
Get partitioner
MatrixFree::get_vector_partitioner()
Initialize vector
MatrixFree::initialize_dof_vector()
-> remove sm comm as argument?Update vector
VectorDataExchange::update_ghost_values_start()
,update_ghost_values_finish()
,compress_start()
,compress_finish()
,reset_ghost_values()
VectorDataExchange::zero_vector_region()
-> use normal partitioner?Accessing vector during cell/face integrals
via
read_dof_values()
,distribute_local_to_global()
,gather_evaluate()
,integrate_scatter()
read_dof_values()
/distribute_local_to_global()
useVectorReader
/read_write_operation
(CG/DG) /read_write_operation_contiguous
(DG -> to be specialized?, rule outinterleaved_contiguous
? - see code snippets below)https://github.com/dealii/dealii/blob/4cf8f26cbf26f12e630aff124cd112fb3f24180e/include/deal.II/matrix_free/fe_evaluation.h#L4618-L4640
https://github.com/dealii/dealii/blob/4cf8f26cbf26f12e630aff124cd112fb3f24180e/include/deal.II/matrix_free/fe_evaluation.h#L4712-L4745
for cell
gather_evaluate()
/integrate_scatter()
callread_dof_values()
/distribute_local_to_global()
for faces
gather_evaluate()
/integrate_scatter()
->fe_face_evaluation_process_and_io()
:MatrixFreeFunctions::tensor_symmetric_hermite
+do_gradients
:n_face_orientations
/n_vectorization_lanes_filled
do_gradients
:n_face_orientations
/n_vectorization_lanes_filled
https://github.com/dealii/dealii/blob/4cf8f26cbf26f12e630aff124cd112fb3f24180e/include/deal.II/matrix_free/evaluation_kernels.h#L2980-L3139
https://github.com/dealii/dealii/blob/4cf8f26cbf26f12e630aff124cd112fb3f24180e/include/deal.II/matrix_free/evaluation_kernels.h#L3310-L3353
https://github.com/dealii/dealii/blob/4cf8f26cbf26f12e630aff124cd112fb3f24180e/include/deal.II/matrix_free/evaluation_kernels.h#L3586-L3632