NVIDIA / nccl

Optimized primitives for collective multi-GPU communication

Why do I have to retrieve the uniqueID again before running ncclCommInitRank again? #1038

Open XHANYAO opened 1 year ago

XHANYAO commented 1 year ago

As the title indicates, I discovered that I have to obtain a new uniqueID before calling ncclCommInitRank a second time, or the program does not work. I am new to NCCL (I only started using it about a month ago) and was experimenting in a setup with a single GPU and one NIC. I ran into this issue when I tried to use two communicators in that setting. Here is the code:

#include <stdio.h>
#include "cuda_runtime.h"
#include "nccl.h"
#include "mpi.h"
#include <unistd.h>
#include <stdint.h>
#include <stdlib.h>

#define MPICHECK(cmd) do {                          \
  int e = cmd;                                      \
  if( e != MPI_SUCCESS ) {                          \
    printf("Failed: MPI error %s:%d '%d'\n",        \
        __FILE__,__LINE__, e);   \
    exit(EXIT_FAILURE);                             \
  }                                                 \
} while(0)

#define CUDACHECK(cmd) do {                         \
  cudaError_t e = cmd;                              \
  if( e != cudaSuccess ) {                          \
    printf("Failed: Cuda error %s:%d '%s'\n",             \
        __FILE__,__LINE__,cudaGetErrorString(e));   \
    exit(EXIT_FAILURE);                             \
  }                                                 \
} while(0)

#define NCCLCHECK(cmd) do {                         \
  ncclResult_t r = cmd;                             \
  if (r!= ncclSuccess) {                            \
    printf("Failed, NCCL error %s:%d '%s'\n",             \
        __FILE__,__LINE__,ncclGetErrorString(r));   \
    exit(EXIT_FAILURE);                             \
  }                                                 \
} while(0)

static uint64_t getHostHash(const char* string) {
  // Based on DJB2a, result = result * 33 ^ char
  uint64_t result = 5381;
  for (int c = 0; string[c] != '\0'; c++){
    result = ((result << 5) + result) ^ string[c];
  }
  return result;
}

static void getHostName(char* hostname, int maxlen) {
  gethostname(hostname, maxlen);
  for (int i=0; i< maxlen; i++) {
    if (hostname[i] == '.') {
        hostname[i] = '\0';
        return;
    }
  }
}

int main(int argc, char* argv[])
{
  int size = 1024;

  int myRank, nRanks, localRank = 0;

  //initializing MPI
  MPICHECK(MPI_Init(&argc, &argv));
  MPICHECK(MPI_Comm_rank(MPI_COMM_WORLD, &myRank));
  MPICHECK(MPI_Comm_size(MPI_COMM_WORLD, &nRanks));

  //calculating localRank based on hostname which is used in selecting a GPU
  uint64_t hostHashs[nRanks];
  char hostname[1024];
  getHostName(hostname, 1024);
  hostHashs[myRank] = getHostHash(hostname);
  MPICHECK(MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD));
  for (int p=0; p<nRanks; p++) {
     if (p == myRank) break;
     if (hostHashs[p] == hostHashs[myRank]) localRank++;
  }

  ncclUniqueId id;
  ncclComm_t comm;
  ncclComm_t comm1;
  float *sendbuff, *recvbuff;
  cudaStream_t s;
  cudaStream_t s1;

  //get NCCL unique ID at rank 0 and broadcast it to all others
  if (myRank == 0) ncclGetUniqueId(&id);
  MPICHECK(MPI_Bcast((void *)&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD));

  //picking a GPU based on localRank, allocate device buffers
  CUDACHECK(cudaSetDevice(localRank));
  CUDACHECK(cudaMalloc(&sendbuff, size * sizeof(float)));
  CUDACHECK(cudaMalloc(&recvbuff, size * sizeof(float)));
  CUDACHECK(cudaMemset(sendbuff, 1, size * sizeof(float)));
  CUDACHECK(cudaMemset(recvbuff, 0, size * sizeof(float)));
  CUDACHECK(cudaStreamCreate(&s));
  CUDACHECK(cudaStreamCreate(&s1));

  float *sendvalue;
  sendvalue = (float *)malloc(size * sizeof(float));
  for (int i=0; i<size; i++){
   sendvalue[i] = i%2; 
  }
  CUDACHECK(cudaMemcpy(sendbuff, sendvalue, size*sizeof(float), cudaMemcpyHostToDevice));

  printf("%d,%d\n",nRanks,myRank);
  //initializing NCCL
  NCCLCHECK(ncclCommInitRank(&comm, nRanks, id, myRank));
  //NCCLCHECK(ncclCommInitRank(&comm1, nRanks, id, myRank));

  NCCLCHECK(ncclGroupStart());
  //communicating using NCCL
  NCCLCHECK(ncclAllReduce((const void*)sendbuff, (void*)recvbuff, size, ncclFloat, ncclSum,comm, s));
  NCCLCHECK(ncclGroupEnd());
  CUDACHECK(cudaStreamSynchronize(s));

  NCCLCHECK(ncclCommDestroy(comm));

  // reuse the host buffer allocated above
  for (int i=0; i<size; i++){
   sendvalue[i] = i%2+2;
  }
  CUDACHECK(cudaMemcpy(sendbuff, sendvalue, size*sizeof(float), cudaMemcpyHostToDevice));

  //get NCCL unique ID at rank 0 and broadcast it to all others
  //if (myRank == 0) ncclGetUniqueId(&id);
  //MPICHECK(MPI_Bcast((void *)&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD));
  //finalizing NCCL
  //ncclCommAbort(comm);

  //if (myRank == 0) ncclGetUniqueId(&id);
  //MPICHECK(MPI_Bcast((void *)&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD));
  //initializing NCCL

  NCCLCHECK(ncclCommInitRank(&comm1, nRanks, id, myRank));

  NCCLCHECK(ncclGroupStart());
  NCCLCHECK(ncclAllReduce((const void*)sendbuff, (void*)recvbuff, size, ncclFloat, ncclSum,comm1, s1));
  //CUDACHECK(cudaStreamSynchronize(s));

  NCCLCHECK(ncclGroupEnd());

  //completing NCCL operation by synchronizing on the CUDA stream
  CUDACHECK(cudaStreamSynchronize(s1));

  float* hostRecvBuff = (float*)malloc(size * sizeof(float));
  CUDACHECK(cudaMemcpy(hostRecvBuff, recvbuff, size * sizeof(float), cudaMemcpyDeviceToHost));

  printf("Recv Data: \n");
  for(int i = 0; i < size; i++){
    printf("%f ", hostRecvBuff[i]);
  }
  printf("\n");

  free(hostRecvBuff);

  //free device buffers
  CUDACHECK(cudaFree(sendbuff));
  CUDACHECK(cudaFree(recvbuff));

  //finalizing NCCL
  ncclCommDestroy(comm1);

  //finalizing MPI
  MPICHECK(MPI_Finalize());

  printf("[MPI Rank %d] Success \n", myRank);
  return 0;
}

As you can see, I attempted the second initialization with comm1 and s1 after completing the first initialization with comm and s and releasing its resources with ncclCommDestroy. I also changed the code as follows to test the problem, reusing the same comm:

  ncclCommDestroy(comm);

  //if (myRank == 0) ncclGetUniqueId(&id);
  //MPICHECK(MPI_Bcast((void *)&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD));
  //initializing NCCL
  NCCLCHECK(ncclCommInitRank(&comm, nRanks, id, myRank));

The problem is the same. The DEBUG log is here:

host-2:38610:38610 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to enp179s0f0
host-2:38610:38610 [0] NCCL INFO Bootstrap : Using enp179s0f0:198.18.2.2<0>
host-2:38610:38610 [0] NCCL INFO NET/Plugin : Plugin load (libnccl-net.so) returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory
host-2:38610:38610 [0] NCCL INFO NET/Plugin : No plugin found, using internal implementation
host-2:38610:38610 [0] NCCL INFO cudaDriverVersion 12020
host-2:38610:38610 [0] NCCL INFO NCCL version 2.18.3+cuda12.2
host-2:38610:38610 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to enp179s0f0
host-2:38610:38610 [0] NCCL INFO NCCL_IB_HCA set to mlx5_0
host-2:38610:38610 [0] NCCL INFO NET/IB : ncclIbAsyncThreadMain start.
host-2:38610:38610 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB enp179s0f0:198.18.2.2<0>
host-2:38610:38610 [0] NCCL INFO Using network IB
host-2:38610:38610 [0] NCCL INFO socketStartConnect: Connect to 198.18.2.2<54298> 
host-2:38610:38621 [0] NCCL INFO socketStartConnect: Connect to 198.18.2.2<36116> 
host-2:38610:38610 [0] NCCL INFO socketStartConnect: Connect to 198.18.2.2<36730> 
host-2:38610:38610 [0] NCCL INFO comm 0x1277bf60 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 65000 commId 0xbd10cc2bcb56e132 - Init START
host-2:38610:38610 [0] NCCL INFO NCCL_TOPO_DUMP_FILE set by environment to /home/xhy/demo_test/xml/topo_ring.xml
host-2:38610:38610 [0] NCCL INFO Setting affinity for GPU 0 to 010001
host-2:38610:38610 [0] NCCL INFO NCCL_GRAPH_DUMP_FILE set by environment to /home/xhy/demo_test/xml/graph_ring.xml
host-2:38610:38610 [0] NCCL INFO Channel 00/32 :    0
host-2:38610:38610 [0] NCCL INFO Channel 01/32 :    0
host-2:38610:38610 [0] NCCL INFO Channel 02/32 :    0
host-2:38610:38610 [0] NCCL INFO Channel 03/32 :    0
host-2:38610:38610 [0] NCCL INFO Channel 04/32 :    0
host-2:38610:38610 [0] NCCL INFO Channel 05/32 :    0
host-2:38610:38610 [0] NCCL INFO Channel 06/32 :    0
host-2:38610:38610 [0] NCCL INFO Channel 07/32 :    0
host-2:38610:38610 [0] NCCL INFO Channel 08/32 :    0
host-2:38610:38610 [0] NCCL INFO Channel 09/32 :    0
host-2:38610:38610 [0] NCCL INFO Channel 10/32 :    0
host-2:38610:38610 [0] NCCL INFO Channel 11/32 :    0
host-2:38610:38610 [0] NCCL INFO Channel 12/32 :    0
host-2:38610:38610 [0] NCCL INFO Channel 13/32 :    0
host-2:38610:38610 [0] NCCL INFO Channel 14/32 :    0
host-2:38610:38610 [0] NCCL INFO Channel 15/32 :    0
host-2:38610:38610 [0] NCCL INFO Channel 16/32 :    0
host-2:38610:38610 [0] NCCL INFO Channel 17/32 :    0
host-2:38610:38610 [0] NCCL INFO Channel 18/32 :    0
host-2:38610:38610 [0] NCCL INFO Channel 19/32 :    0
host-2:38610:38610 [0] NCCL INFO Channel 20/32 :    0
host-2:38610:38610 [0] NCCL INFO Channel 21/32 :    0
host-2:38610:38610 [0] NCCL INFO Channel 22/32 :    0
host-2:38610:38610 [0] NCCL INFO Channel 23/32 :    0
host-2:38610:38610 [0] NCCL INFO Channel 24/32 :    0
host-2:38610:38610 [0] NCCL INFO Channel 25/32 :    0
host-2:38610:38610 [0] NCCL INFO Channel 26/32 :    0
host-2:38610:38610 [0] NCCL INFO Channel 27/32 :    0
host-2:38610:38610 [0] NCCL INFO Channel 28/32 :    0
host-2:38610:38610 [0] NCCL INFO Channel 29/32 :    0
host-2:38610:38610 [0] NCCL INFO Channel 30/32 :    0
host-2:38610:38610 [0] NCCL INFO Channel 31/32 :    0
host-2:38610:38610 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1
host-2:38610:38610 [0] NCCL INFO P2P Chunksize set to 131072
host-2:38610:38610 [0] NCCL INFO Connected all rings
host-2:38610:38610 [0] NCCL INFO Connected all trees
host-2:38610:38610 [0] NCCL INFO 32 coll channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
host-2:38610:38610 [0] NCCL INFO socketStartConnect: Connect to 198.18.2.2<44639> 
host-2:38610:38610 [0] NCCL INFO comm 0x1277bf60 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 65000 commId 0xbd10cc2bcb56e132 - Init COMPLETE
host-2:38610:38610 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to enp179s0f0
host-2:38610:38610 [0] NCCL INFO NCCL_IB_HCA set to mlx5_0
host-2:38610:38610 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB enp179s0f0:198.18.2.2<0>
host-2:38610:38625 [0] NCCL INFO [Service thread] Connection closed by localRank 0
host-2:38610:38610 [0] NCCL INFO comm 0x1277bf60 rank 0 nranks 1 cudaDev 0 busId 65000 - Abort COMPLETE

host-2:38610:38610 [0] init.cc:1976 NCCL WARN comm 0x1277bf60 has already been destroyed
host-2:38610:38610 [0] NCCL INFO Using network IB
host-2:38610:38610 [0] NCCL INFO socketStartConnect: Connect to 198.18.2.2<54298> 
host-2:38610:38610 [0] NCCL INFO socketStartConnect: Connect to 198.18.2.2<54298> 

host-2:38610:38610 [0] misc/socket.cc:571 NCCL WARN socketStartConnect: Connect to 198.18.2.2<54298> failed : Software caused connection abort
host-2:38610:38610 [0] NCCL INFO misc/socket.cc:652 -> 2
host-2:38610:38610 [0] NCCL INFO misc/socket.cc:706 -> 2
host-2:38610:38610 [0] NCCL INFO bootstrap.cc:273 -> 2
host-2:38610:38610 [0] NCCL INFO init.cc:1377 -> 2
host-2:38610:38610 [0] NCCL INFO init.cc:1630 -> 2
host-2:38610:38610 [0] NCCL INFO init.cc:1671 -> 2

The debug log shows that the program gets stuck in bootstrap init during the second initialization, specifically in socketStartConnect; this is where the issue appears. It seems to be connecting to the same port as before, so I am not sure what went wrong.

I apologize if this is a very basic question. I can confirm that after obtaining the unique ID again, everything works, and that the port number has changed. However, I am confused about why obtaining a new uniqueID is required, since I expected ncclCommDestroy to be enough to release these resources.

Is it possible to initialize without having to obtain the uniqueID again?

Finally, if there are any other issues with the code, please ignore them; let's focus on this one.

KaimingOuyang commented 1 year ago

Is it possible to initialize without having to obtain the uniqueID again?

It is not possible for now. Actually, the uniqueID is not tied to any communicator, so ncclCommDestroy does not touch anything related to the uniqueID.

We keep it separate for each comm init so that we can reclaim everything associated with the uniqueID once that init is done.
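In other words, a ncclUniqueId behaves like a one-shot bootstrap token: each ncclCommInitRank consumes it, and its resources are reclaimed once that init completes. Under that model, the working pattern from the original post can be condensed as follows (a sketch only, not tested here; it assumes the same MPI setup and the MPICHECK/NCCLCHECK macros from the code above, and it needs a GPU and an MPI launcher to actually run):

```c
/* Sketch: generate one fresh uniqueId per communicator lifetime.
   Assumes myRank, nRanks, and the CHECK macros from the program above. */
ncclUniqueId id;
ncclComm_t comm;

for (int round = 0; round < 2; round++) {
  /* A new bootstrap ID must be generated and broadcast before every init:
     the listening port behind the previous ID is gone once that init finished. */
  if (myRank == 0) NCCLCHECK(ncclGetUniqueId(&id));
  MPICHECK(MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD));

  NCCLCHECK(ncclCommInitRank(&comm, nRanks, id, myRank));
  /* ... collectives on comm ... */
  NCCLCHECK(ncclCommDestroy(comm));
}
```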

XHANYAO commented 1 year ago

Thank you very much for your reply, I see. You are right that I can't find any use of the uniqueID in ncclCommDestroy. But there is still something I don't understand. Could you be more specific about

We make it separate for each comm init so that we can reclaim everything from uniqueID when the init is done.

So could you tell me more about what "reclaim everything from uniqueID" means?

XHANYAO commented 1 year ago

In fact, I also don't understand why the debug log shows the error in socketStartConnect, and why the error occurs there. Does this mean that even after ncclSocketClose and ncclCommDestroy have executed, this port on this IP is still occupied?