cyclops-community / ctf

Cyclops Tensor Framework: parallel arithmetic on multidimensional arrays

Possible bug in get_all_data routine #81

Closed huttered40 closed 5 years ago

huttered40 commented 5 years ago

I seem to have found a bug in the get_all_data(...) routine that manifests on certain process grid configurations.

Below I provide a minimal working example, along with a Makefile, that can be used to reproduce the problem.

In the code snippets below, please replace the CTF header path and, in the Makefile, the CTF library path with those of your own installation.

Please run on an interactive job on a single node of Stampede2 as follows:

mpiexec -np 64 ./ctf_cholinv 512

This will print two matrices to standard output.

I've noticed that the bug appears only when the number of processes is >= 64 and the matrix dimension is 512 or larger.
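To bracket that threshold, a quick parameter sweep may help; here is a minimal sketch (the specific process counts, matrix sizes, and output file names are illustrative, not part of the original reproduction recipe):

for np in 16 32 64; do
  for n in 256 512 1024; do
    echo "=== np=$np n=$n ==="
    mpiexec -np $np ./ctf_cholinv $n > out_np${np}_n${n}.txt
  done
done
# Compare the printed matrices across configurations, e.g.:
# diff out_np32_n512.txt out_np64_n512.txt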

#include "../ctf_true/ctf/include/ctf.hpp"
#include "mkl.h"
#include <float.h>
#include <vector>
using namespace CTF;

class cholinv{
public:
  template<typename TensorType>
  static void invoke(TensorType& A, TensorType& R, TensorType& Rinv, int s, int e, int n, int n_bc, World& dw){
    if (dw.rank==0){
      std::cout << "Cholinv: n-" << s << " e-" << e << " n-" << n << std::endl;
    }
    if (n<=n_bc){
      //allgather A into a single tensor
      double* A_data; int64_t A_num_elems;
      std::vector<double> R_data(n*n,0.);
      std::vector<double> Rinv_data(n*n,0.);
      A.get_all_data(&A_num_elems, &A_data, false);
      // Unpack into the upper triangle (column-major); keep zeros in the
      // lower-triangular part, else the factorization product in main is corrupted
      int A_cnt=0;
      for (int i=0; i<n; i++){
        for (int j=0; j<=i; j++){
          R_data[i*n+j] = A_data[A_cnt++];
        }
      }
      assert(A_cnt == A_num_elems);
      auto info = LAPACKE_dpotrf(LAPACK_COL_MAJOR,'U',n,&R_data[0],n);
      assert(info==0);
      Rinv_data = R_data;
      info = LAPACKE_dtrtri(LAPACK_COL_MAJOR,'U','N',n,&Rinv_data[0],n);
      assert(info==0);

      // debug check
      if ((s==0) and (e==32)){
        if (dw.rank==0){
          // every rank here shares the same data, so we will just print from rank 0
          // (VERIFY THAT EACH PROCESS SHARES SAME DATA? -- see the sketch after this listing)
          for (int i=0; i<n; i++){
            for (int j=0; j<n; j++){
              std::cout << "R^-1 data after dtrtri - (" << i << "," << j << ") - " << Rinv_data[i*n+j] << std::endl;
            }
          }
        }
        MPI_Barrier(MPI_COMM_WORLD);
      }

      int64_t* indices1; int64_t num_pairs1; double* pairs1;
      int64_t* indices2; int64_t num_pairs2; double* pairs2;
      R.get_local_data(&num_pairs1, &indices1, &pairs1);
      Rinv.get_local_data(&num_pairs2, &indices2, &pairs2);
      std::vector<double> R_local(num_pairs1);
      std::vector<double> Rinv_local(num_pairs2);
      for (int j=0; j<num_pairs1; j++){
        R_local[j] = R_data[indices1[j]];
      }
      for (int j=0; j<num_pairs2; j++){
        Rinv_local[j] = Rinv_data[indices2[j]];
      }
      R.write(num_pairs1, indices1, &R_local[0]);
      Rinv.write(num_pairs2, indices2, &Rinv_local[0]);
      delete[] A_data; delete[] pairs1; free(indices1);
      delete[] pairs2; free(indices2);
      return;
    }
    assert(n%2==0);
    int k=n/2;
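    // Recursive step: partition A into k x k blocks,
    //   A = [A_11, A_12; A_12^T, A_22],
    // and factor A_11 recursively; the full algorithm would then form
    //   R_12 = R_11^{-T} A_12 and the Schur complement S_22 = A_22 - R_12^T R_12.
    // This MWE only recurses on A_11, which is enough to exhibit the issue.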
    int off_00[2] = {0,0};
    int off_01[2] = {0,k};
    int off_10[2] = {k,0};
    int off_11[2] = {k,k};
    int end_11[2] = {k,k};
    int end_21[2] = {n,k};
    int end_12[2] = {k,n};
    int end_22[2] = {n,n};
    Matrix<> A_11(k,k,SY,dw);
    Matrix<> R_11(k,k,NS,dw);
    Matrix<> Rinv_11(k,k,NS,dw);
    Matrix<> A_12(k,k,NS,dw);
    Matrix<> R_12(k,k,NS,dw);
    Matrix<> Rinv_12(k,k,NS,dw);
    Matrix<> A_22(k,k,SY,dw);
    Matrix<> R_22(k,k,NS,dw);
    Matrix<> Rinv_22(k,k,NS,dw);
    Matrix<> S_12(k,k,NS,dw);
    Matrix<> S_22(k,k,NS,dw);
    Matrix<> Sinv_12(k,k,NS,dw);

    A_11 = A.slice(off_00, end_11);// will be symmetric with special storage
    A_12 = A.slice(off_01, end_12);
    A_22 = A.slice(off_11, end_22);
    S_22 = A_22;// will be symmetric with special storage

    R_11 = R.slice(off_00, end_11);
    Rinv_11 = Rinv.slice(off_00, end_11);
    invoke(A_11,R_11,Rinv_11,s,s+k,k,n_bc,dw);

    // debug check: inspect Rinv_11
    if ((s==0) and (e==64)){
      double* temp_data; int64_t temp_num_elems;
      Rinv_11.get_all_data(&temp_num_elems, &temp_data, false);
      if (dw.rank==0){
        std::cout << "Print Rinv_11 after get_all_data: start-" << s << " end-" << e << " n-" << n << std::endl;
        // every rank here shares the same data, so we will just print from rank 0 (VERIFY THAT EACH PROCESS SHARES SAME DATA?)
        for (int i=0; i<k; i++){
          for (int j=0; j<k; j++){
            std::cout << "Rinv_11 after get_all_data (" << i << "," << j << ") - " << temp_data[i*k+j] << std::endl;
          }
        }
      }
      delete[] temp_data;
      MPI_Barrier(MPI_COMM_WORLD);
    }
  }
};

int main(int argc, char** argv){
  int rank, np, np_cube_root, n, n_bc;
  int64_t num_pairs;

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &np);

  n = atoi(argv[1]);
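  // With p = np processes, take ~p^(1/3) as the nominal process-grid edge and
  // stop the recursion once the block dimension reaches n / p^(2/3).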
  np_cube_root = std::nearbyint(std::ceil(pow(np,1./3.)));
  n_bc = n/(np_cube_root*np_cube_root);
{
  World dw(argc, argv);
  if (rank == 0){
    printf("Factoring A via Cholesky inverse with n-%d, np_cube_root-%d, n_bc-%d\n",n,np_cube_root,n_bc);
  }

  Matrix<> A(n,n,SY,dw);
  Matrix<> R(n,n,NS,dw);
  Matrix<> Rinv(n,n,NS,dw);

  double* pairs; int64_t* indices;
  srand48(13*rank);
  A.get_local_data(&num_pairs, &indices, &pairs);
  for (int64_t i=0; i<num_pairs; i++ ){
    pairs[i] = 0.5;
    // Diagonal elements in the global tensor are given extra weight (same as what I did in camfs)
    if ((indices[i]/n) == (indices[i]%n)){
      pairs[i] = n;
    }
  }
  A.write(num_pairs, indices, pairs);
  delete[] pairs; free(indices);

  cholinv::invoke(A,R,Rinv,0,n,n,n_bc,dw);
}
  MPI_Finalize();
  return 0;
}
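As an aside, the "(VERIFY THAT EACH PROCESS SHARES SAME DATA?)" question in the debug checks above can be settled mechanically. Below is a minimal sketch (check_replicated is my own helper, not a CTF routine; it assumes the buffer returned by get_all_data should be bitwise-identical on every rank):

#include <mpi.h>
#include <cstdint>
#include <vector>

// Returns true on all ranks iff every rank's buffer matches rank 0's copy.
static bool check_replicated(const double* data, int64_t n, MPI_Comm comm){
  int rank; MPI_Comm_rank(comm, &rank);
  std::vector<double> ref(data, data+n);               // start from the local copy
  MPI_Bcast(ref.data(), (int)n, MPI_DOUBLE, 0, comm);  // overwrite with rank 0's data (n fits in int at these sizes)
  int local_ok = 1;
  for (int64_t i=0; i<n; i++){
    if (data[i] != ref[i]){ local_ok = 0; break; }     // exact match expected for replicated output
  }
  int global_ok = 0;
  MPI_Allreduce(&local_ok, &global_ok, 1, MPI_INT, MPI_LAND, comm);
  return global_ok == 1;
}

One could then call, e.g., assert(check_replicated(A_data, A_num_elems, MPI_COMM_WORLD)); right after each get_all_data call; if it fires only for np >= 64 and n >= 512, that would localize the suspected inconsistency.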
Makefile:

HOST=$(shell hostname)
STAMPEDE2=$(shell echo $(HOST) |grep 'stampede2')
BIN=./

all: ctf_cholinv

MACHINE=STAMPEDE2
CCMPI = mpicxx
CFLAGS = -g -Wall -O3 -std=c++14 -mkl=parallel -xMIC-AVX512
LIB_PATH=-L../ctf_true/ctf/lib -L/opt/intel/compilers_and_libraries_2017.4.196/linux/mkl/lib/intel64 -Wl,--no-as-needed
LIBS=-lctf -lmkl_scalapack_lp64 -lmkl_gf_lp64 -lmkl_intel_thread -lmkl_core -lmkl_blacs_intelmpi_lp64 -lmkl_def -liomp5 -lpthread -lm -ldl

ctf_cholinv: ctf_cholinv.o
	$(CCMPI) $(CFLAGS) -o $(BIN)ctf_cholinv ctf_cholinv.o $(LIB_PATH) $(LIBS)
	rm *.o
ctf_cholinv.o:
	$(CCMPI) $(CFLAGS) -D$(MACHINE) -o ctf_cholinv.o -c cholinv.cxx
clean:
	-rm -f *.o *.gch $(BIN)ctf_cholinv


My CTF build was configured as follows:

./configure '--no-dynamic' 'LIB_PATH=-L/opt/intel/compilers_and_libraries_2018.2.199/linux/mkl/lib/intel64/' 'LIBS=-lmkl_scalapack_lp64 -Wl,--start-group -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -lmkl_blacs_intelmpi_lp64 -Wl,--end-group -lpthread -lm'
huttered40 commented 5 years ago

False alarm.