Hi, we have encountered a problem with the proxy channel's signal/wait. We have A100 GPUs connected via NVSwitch, and the proxy channels use CudaIpc connections. We are trying to do the following to synchronize across all GPUs before a collective communication:
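Roughly, the idea is the sketch below (a simplified sketch, not our exact kernel; the kernel name and launch shape are placeholders, and channels points to the device handle array built in setupProxyChannels further down): one thread per remote rank signals its peer through the proxy channel and then waits for that peer's signal.

// Sketch of the pre-collective barrier: one thread per remote rank.
__global__ void barrierKernel(mscclpp::DeviceHandle<mscclpp::SimpleProxyChannel>* channels, int nranks) {
  int peer = threadIdx.x;
  if (peer < nranks - 1) {
    channels[peer].signal();  // notify the remote rank via the proxy
    channels[peer].wait();    // wait until the remote rank has signaled us
  }
}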
However, the program hangs at wait(). After some debugging, we discovered that the cudaMemcpyAsync in CudaIpcConnection::updateAndSync seems to never complete. We tried adding MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream_)); immediately after the cudaMemcpyAsync, and then the program hangs at cudaStreamSynchronize instead. Changing cudaMemcpyAsync to cudaMemcpy also hangs.
We also found that if we call cudaDeviceSynchronize immediately after the allgather kernel, the program does not hang. It only hangs when another kernel is launched on the same stream after allgather without an intervening cudaDeviceSynchronize. Do you have any thoughts on this issue? Thanks!
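For clarity, the host-side launch patterns are roughly the following (a sketch; kernel names, arguments, and launch parameters are placeholders):

// Hangs: a second kernel is queued on the same stream right after allgather.
allgather<<<grid, block, 0, stream>>>(proxyChannelHandlesCuda, input, output);
nextKernel<<<grid, block, 0, stream>>>(output);

// Works: fully synchronizing the device between the two launches.
allgather<<<grid, block, 0, stream>>>(proxyChannelHandlesCuda, input, output);
CUDA_CHECK(cudaDeviceSynchronize());
nextKernel<<<grid, block, 0, stream>>>(output);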
Code to set up the proxy channels:
int main(int argc, char* argv[]) {
  // Initialize the MPI environment
  MPI_Init(&argc, &argv);
  // Get the number of processes
  int nranks;
  MPI_Comm_size(MPI_COMM_WORLD, &nranks);
  // Get the rank of the process
  int rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  CUDA_CHECK(cudaSetDevice(rank));
  // Print off a hello world message
  std::cout << "Hello world from rank " << rank << " out of " << nranks << " ranks" << std::endl;
  // Initialize Communicator
  auto bootstrap = std::make_shared<mscclpp::TcpBootstrap>(rank, nranks);
  mscclpp::UniqueId uniqueId;
  if (rank == 0) uniqueId = bootstrap->createUniqueId();
  MPI_Bcast(&uniqueId, sizeof(uniqueId), MPI_BYTE, 0, MPI_COMM_WORLD);
  bootstrap->initialize(uniqueId);
  auto comm = std::make_shared<mscclpp::Communicator>(bootstrap);
  // Initialize Connections
  std::vector<std::shared_ptr<mscclpp::Connection>> connections;
  std::vector<mscclpp::NonblockingFuture<std::shared_ptr<mscclpp::Connection>>> connectionFutures;
  for (int r = 0; r < nranks; ++r) {
    if (r == rank) continue;
    mscclpp::Transport transport = mscclpp::Transport::CudaIpc;
    connectionFutures.push_back(comm->connectOnSetup(r, 0, transport));
  }
  comm->setup();
  std::transform(
      connectionFutures.begin(), connectionFutures.end(), std::back_inserter(connections),
      [](const mscclpp::NonblockingFuture<std::shared_ptr<mscclpp::Connection>>& future) { return future.get(); });
  MPI_Barrier(MPI_COMM_WORLD);
  ...
}
// Note: rank, nranks, and the host-side proxyChannelHandles vector are defined elsewhere (not shown).
void setupProxyChannels(std::shared_ptr<mscclpp::ProxyService> service,
                        std::shared_ptr<mscclpp::Communicator> comm,
                        std::vector<std::shared_ptr<mscclpp::Connection>> connections,
                        mscclpp::DeviceHandle<mscclpp::SimpleProxyChannel>** proxyChannelHandlesCuda,
                        Element* input, Element* output, int input_size, int output_size) {
  const mscclpp::TransportFlags allTransports = mscclpp::Transport::CudaIpc;
  mscclpp::RegisteredMemory inputBuffRegMem = comm->registerMemory(input, input_size * sizeof(Element), allTransports);
  mscclpp::RegisteredMemory outputBuffRegMem;
  if (input != output) outputBuffRegMem = comm->registerMemory(output, output_size * sizeof(Element), allTransports);
  std::vector<mscclpp::NonblockingFuture<mscclpp::RegisteredMemory>> remoteRegMemories;
  mscclpp::RegisteredMemory& localRegMemory = (input != output) ? outputBuffRegMem : inputBuffRegMem;
  for (int r = 0; r < nranks; ++r) {
    if (r == rank) continue;
    comm->sendMemoryOnSetup(localRegMemory, r, 0);
    auto remoteMemory = comm->recvMemoryOnSetup(r, 0);
    remoteRegMemories.push_back(remoteMemory);
  }
  comm->setup();
  for (size_t i = 0; i < connections.size(); ++i) {
    proxyChannelHandles.push_back(mscclpp::deviceHandle(mscclpp::SimpleProxyChannel(
        service->proxyChannel(service->buildAndAddSemaphore(*comm, connections[i])),
        service->addMemory(remoteRegMemories[i].get()), service->addMemory(inputBuffRegMem))));
  }
  comm->setup();
  assert(connections.size() == nranks - 1);
  CUDA_CHECK(cudaMalloc(proxyChannelHandlesCuda, (nranks - 1) * sizeof(mscclpp::DeviceHandle<mscclpp::SimpleProxyChannel>)));
  CUDA_CHECK(cudaMemcpy(*proxyChannelHandlesCuda, &proxyChannelHandles[proxyChannelHandles.size() - (nranks - 1)],
                        (nranks - 1) * sizeof(mscclpp::DeviceHandle<mscclpp::SimpleProxyChannel>),
                        cudaMemcpyHostToDevice));
}
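For context, the call site looks roughly like this (a sketch; the buffer pointers and sizes are placeholders, and service is the ProxyService instance passed to the helper above). The proxy is started before any kernel uses the channels:

// Sketch of how setupProxyChannels is invoked (placeholder buffers/sizes).
mscclpp::DeviceHandle<mscclpp::SimpleProxyChannel>* proxyChannelHandlesCuda = nullptr;
setupProxyChannels(service, comm, connections, &proxyChannelHandlesCuda,
                   input, output, input_size, output_size);
service->startProxy();  // the proxy thread must be running before the kernel calls signal()/wait()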
Hi @liangyuRain, if cudaMemcpyAsync hangs, it is highly likely that your application prevents other parallel streams from running concurrently. Are you using the default stream in your application?
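For instance (just a sketch, not specific to MSCCLPP; kernel names and launch parameters are placeholders), you could try launching your kernels on a stream created with cudaStreamNonBlocking, so they do not implicitly serialize with work on the legacy default stream:

// A non-blocking stream does not synchronize with the legacy default stream,
// so work queued on other streams is not implicitly serialized behind it.
cudaStream_t appStream;
CUDA_CHECK(cudaStreamCreateWithFlags(&appStream, cudaStreamNonBlocking));
allgather<<<grid, block, 0, appStream>>>(proxyChannelHandlesCuda, input, output);
nextKernel<<<grid, block, 0, appStream>>>(output);
CUDA_CHECK(cudaStreamSynchronize(appStream));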