In this example, the Kokkos::deep_copy call marked below causes the program to hang when running with multiple MPI ranks. Commenting out the deep_copy and uncommenting the for-loop to copy the equivalent entries makes the program work as expected.
Kokkos::deep_copy is not a collective operation when one of the arguments is a view in a remote space, correct?
// clang-format off
#include <fstream>
#include <algorithm>
#include <numeric>
#include <unistd.h>
#include <Kokkos_RemoteSpaces.hpp>
// clang-format on

int main(int argc, char *argv[]) {
  using RemoteSpace_t = Kokkos::Experimental::DefaultRemoteMemorySpace;
  constexpr size_t M = 8;

  int mpi_thread_level_available;
  int mpi_thread_level_required = MPI_THREAD_MULTIPLE;
  MPI_Init_thread(&argc, &argv, mpi_thread_level_required,
                  &mpi_thread_level_available);
  assert(mpi_thread_level_available >= mpi_thread_level_required);
  if (!(mpi_thread_level_available >= mpi_thread_level_required)) {
    // If asserts are disabled, we don't want to move forward.
    std::cout << "mpi_thread_level_available >= mpi_thread_level_required failed\n";
    exit(1);
  }

  Kokkos::initialize(argc, argv);
  {
    using namespace Kokkos;
    using PartitionedView1D =
        Kokkos::View<double **, PartitionedLayoutRight, RemoteSpace_t>;
    using Local1DView  = typename PartitionedView1D::HostMirror;
    using TeamPolicy_t = Kokkos::TeamPolicy<>;

    int size, rank;
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    if (rank == 0) std::cout << "MPI_COMM_WORLD size: " << size << '\n';

    // One row of M entries per rank, distributed across the remote space.
    auto A = PartitionedView1D("RemoteView", size, M);
    RemoteSpace_t().fence();

    auto Alocal = Local1DView("LocalView", 1, M);
    auto lr     = Experimental::get_local_range(M);

    // Each rank initializes its own row.
    parallel_for(
        "init", (A.extent(1)),
        KOKKOS_LAMBDA(auto i) { A(rank, i) = rank * M + i; });
    RemoteSpace_t().fence();

    for (int i = 0; i < size; i++) {
      if (rank == 0) {
        std::cout << "MPI_COMM_WORLD rank: " << i << '\n';
        auto range = std::make_pair(size_t(0), M);
        // Remote subview of rank i's row and subview of rank 0's own row.
        auto ar = Kokkos::subview(A, std::make_pair(i, i + 1), range);
        auto al = Kokkos::subview(A, std::make_pair(rank, rank + 1), range);

        // Pull rank i's row into rank 0's partition.
        Kokkos::parallel_for(
            "Team", TeamPolicy_t(1, 1),
            KOKKOS_LAMBDA(typename TeamPolicy_t::member_type team) {
              Kokkos::single(Kokkos::PerTeam(team), [&]() {
                Kokkos::Experimental::RemoteSpaces::local_deep_copy(al, ar);
              });
            });

        // for (int i = 0; i < al.extent_int(1); i++)
        //   Alocal(0, i) = al(0, i);
        Kokkos::deep_copy(Alocal, al); // <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< HERE

        for (size_t j = range.first; j < range.second; j++)
          std::cout << Alocal(0, j) << ' ';
        std::cout << '\n';
      }
      RemoteSpace_t().fence();
    }
  }
  Kokkos::finalize();
  MPI_Finalize();
  return 0;
}
Output with 4 ranks and the deep_copy uncommented:
With the for-loop uncommented instead of the deep_copy (correct behavior), the program terminates normally:

Thanks. We do a global fence in the deep_copy, which makes it a collective operation and hence causes the deadlock. This behavior is an artifact from previous versions, and I will add the fix to one of the PRs.
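For context, the hang follows the usual MPI rule that an operation entered collectively by a communicator must be entered by every rank in it. Because the deep_copy in the reproducer sits inside the `if (rank == 0)` branch, its internal global fence is executed only on rank 0 and has no matching call on the other ranks. A minimal sketch of the same failure mode with a plain MPI_Barrier (this standalone program is illustrative, not part of the reproducer above):

// Run with more than one rank to observe the hang, e.g. mpirun -np 4 ./a.out
#include <mpi.h>
#include <iostream>

int main(int argc, char *argv[]) {
  MPI_Init(&argc, &argv);

  int rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  if (rank == 0) {
    // Rank 0 enters the collective, but the other ranks never do, so rank 0
    // blocks here forever -- the same shape of deadlock as a fencing
    // (collective) deep_copy reached by only one rank.
    MPI_Barrier(MPI_COMM_WORLD);
    std::cout << "never reached with more than one rank\n";
  }

  MPI_Finalize();
  return 0;
}

The commented-out for-loop works precisely because it copies the entries element by element on rank 0 only, without any collective synchronization.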