I seem to have found a bug in the get_all_data(...) routine that appears only on certain process grid configurations.
I provide a minimal working example below, as well as a Makefile, that can be used to reproduce the problem.
In the code snippet below, please replace the CTF header file path, and in the Makefile, the library path to CTF.
Please run on an interactive job on a single node of Stampede2 as follows:
mpiexec -np 64 ./ctf_cholinv 512
This will print two matrices to standard out.
The first is the result of the LAPACK dtrtri routine (triangular inverse) called from the base case. Every process calls that routine with exactly the same data (that data was essentially Allgathered onto each process via get_all_data at the start of the base case). Before returning from the base case, each process writes a portion of that data back to its global indices, as specified by get_local_data.
The second is the result of calling get_all_data on the same matrix that was written directly before (see above). You will see that whereas the first is triangular, the second is not; this is apparent from the second column, which contains three elements, the first two of which are exactly the same. Similar issues occur for the rest of the columns.
I've noticed that this bug appears only when the number of processes is >= 64 and the matrix dimension is >= 512.
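To make the suspected failure mode easier to see in isolation, here is a distilled sketch of the access pattern the MWE exercises on a non-symmetric (NS) matrix: gather with get_all_data, write the same values back through the indices from get_local_data, gather again, and compare. The helper name roundtrip_check and its signature are mine and are not part of the reproducer; the CTF calls are the same ones used in the MWE below.
// Distilled sketch (not a standalone program) of the pattern the MWE exercises:
// read a matrix with get_all_data, write the same values back through
// get_local_data/write, then read it again; the two reads should match.
// Assumes an already-populated non-symmetric (NS) CTF::Matrix<> M on world dw.
void roundtrip_check(CTF::Matrix<>& M, CTF::World& dw){
  double* before; int64_t num_before;
  M.get_all_data(&num_before, &before, false);    // gather the full matrix on every rank
  int64_t* indices; int64_t num_pairs; double* pairs;
  M.get_local_data(&num_pairs, &indices, &pairs); // this rank's global indices
  std::vector<double> local(num_pairs);
  for (int64_t j=0; j<num_pairs; j++)
    local[j] = before[indices[j]];                // same values, routed back through write
  M.write(num_pairs, indices, &local[0]);
  double* after; int64_t num_after;
  M.get_all_data(&num_after, &after, false);      // expected: identical to 'before'
  if (dw.rank == 0)
    for (int64_t i=0; i<num_before; i++)
      if (before[i] != after[i])
        std::cout << "mismatch at flat index " << i << std::endl;
  delete[] before; delete[] after;
  delete[] pairs; free(indices);
}
The full reproducer follows.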
#include "../ctf_true/ctf/include/ctf.hpp"
#include "mkl.h"
#include <float.h>
#include <cassert>
#include <cmath>
#include <cstdlib>
#include <iostream>
#include <vector>
using namespace CTF;
class cholinv{
public:
template<typename TensorType>
static void invoke(TensorType& A, TensorType& R, TensorType& Rinv, int s, int e, int n, int n_bc, World& dw){
if (dw.rank==0){
std::cout << "Cholinv: n-" << s << " e-" << e << " n-" << n << std::endl;
}
if (n<=n_bc){
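// Base case: replicate the whole n x n block on every rank, factor and invert it
// redundantly with LAPACK, then have each rank write back only its local portion
// of R and Rinv (as determined by get_local_data).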
//allgather A into a single tensor
double* A_data; int64_t A_num_elems;
std::vector<double> R_data(n*n,0.);
std::vector<double> Rinv_data(n*n,0.);
A.get_all_data(&A_num_elems, &A_data, false);
// Keep explicit zeros in the strictly lower part when unpacking; otherwise the factorization product computed downstream gets corrupted
int A_cnt=0;
for (int i=0; i<n; i++){
for (int j=0; j<=i; j++){
R_data[i*n+j] = A_data[A_cnt++];
}
}
assert(A_cnt == A_num_elems);
auto info = LAPACKE_dpotrf(LAPACK_COL_MAJOR,'U',n,&R_data[0],n);
assert(info==0);
Rinv_data = R_data;
info = LAPACKE_dtrtri(LAPACK_COL_MAJOR,'U','N',n,&Rinv_data[0],n);
assert(info==0);
// debug check
if ((s==0) and (e==32)){
if (dw.rank==0){
// every rank here shares the same data, so we will just print from rank 0. (VERIFY THAT EACH PROCESS SHARES SAME DATA?)
for (int i=0; i<n; i++){
for (int j=0; j<n; j++){
std::cout << "R^-1 data after dtrtri - (" << i << "," << j << ") - " << Rinv_data[i*n+j] << std::endl;
}
}
}
MPI_Barrier(MPI_COMM_WORLD);
}
int64_t* indices1; int64_t num_pairs1; double* pairs1;
int64_t* indices2; int64_t num_pairs2; double* pairs2;
R.get_local_data(&num_pairs1, &indices1, &pairs1);
Rinv.get_local_data(&num_pairs2, &indices2, &pairs2);
std::vector<double> R_local(num_pairs1);
std::vector<double> Rinv_local(num_pairs2);
for (int j=0; j<num_pairs1; j++){
R_local[j] = R_data[indices1[j]];
}
for (int j=0; j<num_pairs2; j++){
Rinv_local[j] = Rinv_data[indices2[j]];
}
R.write(num_pairs1, indices1, &R_local[0]);
Rinv.write(num_pairs2, indices2, &Rinv_local[0]);
delete[] A_data; delete[] pairs1; free(indices1);
delete[] pairs2; free(indices2);
return;
}
assert(n%2==0);
int k=n/2;
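// 2x2 block partition of the n x n matrix: the off_* arrays are the top-left
// corners and the end_* arrays the (exclusive) bottom-right corners passed to slice().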
int off_00[2] = {0,0};
int off_01[2] = {0,k};
int off_10[2] = {k,0};
int off_11[2] = {k,k};
int end_11[2] = {k,k};
int end_21[2] = {n,k};
int end_12[2] = {k,n};
int end_22[2] = {n,n};
Matrix<> A_11(k,k,SY,dw);
Matrix<> R_11(k,k,NS,dw);
Matrix<> Rinv_11(k,k,NS,dw);
Matrix<> A_12(k,k,NS,dw);
Matrix<> R_12(k,k,NS,dw);
Matrix<> Rinv_12(k,k,NS,dw);
Matrix<> A_22(k,k,SY,dw);
Matrix<> R_22(k,k,NS,dw);
Matrix<> Rinv_22(k,k,NS,dw);
Matrix<> S_12(k,k,NS,dw);
Matrix<> S_22(k,k,NS,dw);
Matrix<> Sinv_12(k,k,NS,dw);
A_11 = A.slice(off_00, end_11);// will be symmetric with special storage
A_12 = A.slice(off_01, end_12);
A_22 = A.slice(off_11, end_22);
S_22 = A_22;// will be symmetric with special storage
R_11 = R.slice(off_00, end_11);
Rinv_11 = Rinv.slice(off_00, end_11);
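// Recurse on the leading k x k block only; the full algorithm would go on to update
// the trailing blocks, but this MWE stops after the first recursion so the
// Rinv_11 read-back below can be inspected.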
invoke(A_11,R_11,Rinv_11,s,s+k,k,n_bc,dw);
// debug check
// lets debug Rinv_11
if ((s==0) and (e==64)){
double* temp_data; int64_t temp_num_elems;
Rinv_11.get_all_data(&temp_num_elems, &temp_data, false);
if (dw.rank==0){
std::cout << "Print Rinv_11 after get_all_data: start-" << s << " end-" << e << " n-" << n << std::endl;
// every rank here shares the same data, so we will just print from rank 0. (VERIFY THAT EACH PROCESS SHARES SAME DATA?)
for (int i=0; i<k; i++){
for (int j=0; j<k; j++){
std::cout << "Rinv_11 after get_all_dat (" << i << "," << j << ") - " << temp_data[i*k+j] << std::endl;
}
}
}
delete[] temp_data;
MPI_Barrier(MPI_COMM_WORLD);
}
}
};
int main(int argc, char** argv){
int rank, np, np_cube_root, n, n_bc;
int64_t num_pairs;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &np);
n = atoi(argv[1]);
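// Base-case dimension: the recursion stops once the block size reaches
// n / ceil(np^(1/3))^2.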
np_cube_root = std::nearbyint(std::ceil(pow(np,1./3.)));
n_bc = n/(np_cube_root*np_cube_root);
{
World dw(argc, argv);
if (rank == 0){
printf("Factoring A via Cholesky inverse with n-%d, np_cube_root-%d, n_bc-%d\n",n,np_cube_root,n_bc);
}
Matrix<> A(n,n,SY,dw);
Matrix<> R(n,n,NS,dw);
Matrix<> Rinv(n,n,NS,dw);
double* pairs; int64_t* indices;
srand48(13*rank);
A.get_local_data(&num_pairs, &indices, &pairs);
for (int64_t i=0; i<num_pairs; i++){
pairs[i] = 0.5;
// Diagonal elements in the global tensor are given extra weight (same as what I did in camfs)
if ((indices[i]/n) == (indices[i]%n)){
pairs[i] = n;
}
}
A.write(num_pairs, indices, pairs);
delete[] pairs; free(indices);
cholinv::invoke(A,R,Rinv,0,n,n,n_bc,dw);
}
MPI_Finalize();
return 0;
}
My CTF build was configured as follows: