sstsimulator / sst-macro

SST Macro Element Library
http://sst-simulator.org/
Other
33 stars 41 forks source link

MVAPICH AllGather test deadlocks #668

Open jpkenny opened 2 years ago

jpkenny commented 2 years ago

The MPI all gather app in skeletons/tests:

#include <mpi.h>
#include <stddef.h>
#include <iostream>

/**
 * Minimal MPI_Allgather reproducer.
 *
 * Every rank contributes `nelems` ints and gathers `nelems` ints from each
 * rank in MPI_COMM_WORLD. Rank 0 prints a progress marker before the
 * collective, after the collective, and after MPI_Finalize, so the point at
 * which a run stalls can be read off the output.
 *
 * Define VALIDATE_BUFFERS to use real buffers and check that the block
 * received from rank i contains only the value i.
 *
 * @param argc argument count, forwarded to MPI_Init
 * @param argv argument vector, forwarded to MPI_Init
 * @return 0 always
 */
int main(int argc, char** argv)
{
  MPI_Init(&argc, &argv);
  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  if (rank == 0){
    // std::endl flushes, so the marker is visible even if the run hangs later
    std::cout << "Starting collective" << std::endl;
  }

  // Number of ints each rank contributes; also the per-rank receive count.
  const int nelems = 100;
//#define VALIDATE_BUFFERS
#ifdef VALIDATE_BUFFERS
  int* send_buf = new int[nelems];
  int* recv_buf = new int[nelems * size];
  // Each rank sends its own rank id in every element.
  for (int i=0; i < nelems; ++i){
    send_buf[i] = rank;
  }
  // Poison the receive buffer so elements that were never written stand out.
  for (int i=0; i < size; ++i){
    for (int j=0; j < nelems; ++j){
      recv_buf[i*nelems + j] = -1;
    }
  }
#else
  // NOTE(review): sstmac_nullptr_send/recv are declared outside this file;
  // presumably placeholder buffers for runs without real payloads - confirm.
  void* send_buf = sstmac_nullptr_send;
  void* recv_buf = sstmac_nullptr_recv;
#endif

  // Counts come from nelems so they always match the buffer sizes above.
  MPI_Allgather(send_buf, nelems, MPI_INT,
                recv_buf, nelems, MPI_INT, MPI_COMM_WORLD);

  if (rank == 0){
    std::cout << "Cleared collective" << std::endl;
  }
#ifdef VALIDATE_BUFFERS
  // Block i of the receive buffer must hold the value i in every element.
  for (int i=0; i < size; ++i){
    int* values = recv_buf + i*nelems;
    for (int j=0; j < nelems; ++j){
      if (values[j] != i){
        printf("V[%d][%d] = %d != %d\n", i, j, values[j], i);
      }
    }
  }
  // Release the validation buffers allocated above.
  delete[] send_buf;
  delete[] recv_buf;
#endif

  MPI_Finalize();

  if (rank == 0){
    std::cout << "Cleared finalize" << std::endl;
  }

  return 0;
}

Deadlocks with the following param file when run over MVAPICH:

node {
 os {
  stack_size = 1MB
 }
 app1 { 
  exe = ./run
  argv = 
  launch_cmd = aprun -n 32 -N 1
  apis = [libfabric, pmi:libfabric]
  env {
    SLURM_NPROCS = 32
  }
 }
 nic {
  name = snappr
  credits = 64KB
  mtu = 4096
  bandwidth = 10.0GB/s
  injection {
   bandwidth = 10.0GB/s
   latency = 50ns
   mtu = 1024
   credits = 64KB
   send_state {
     group = state
     type = ftq_calendar
     output = ftq
     epoch_length = 1us
    }
   recv_state {
     group = state
     type = ftq_calendar
     output = ftq
     epoch_length = 1us
    }
  }
  ejection {
   latency = 50ns
  }
 }
 memory {
  name = snappr
  channel_bandwidth = 7GB/s
  num_channels = 10
  latency = 10ns
 }
 proc {
  ncores = 4
  frequency = 2GHz
 }
 name = simple
}

switch {
 name = snappr
 credits = 64KB
 link {
  bandwidth = 10.0GB/s
  latency = 100ns
  credits = 64KB
  xmit_active {
   group = test
   type = accumulator
  }
  xmit_idle {
   group = test
   type = accumulator
  }
  xmit_stall {
   group = test
   type = accumulator
  }
 }
 logp {
  bandwidth = 1GB/s
  out_in_latency = 100ns
  hop_latency = 100ns
 }
}

topology {
  name = dragonfly
  geometry = [32,9]
  h = 16
  inter_group = circulant
  concentration = 16
}

switch {
  router { 
    name = dragonfly_minimal
  }
}

I used the following Makefile:

# Executable to build and the single source file it is built from.
TARGET := run
SRC := allgather.cc

# Compilers and flags.
# NOTE(review): sst++/sstcc are presumably the SST/macro compiler wrappers - confirm.
CXX := sst++
CC := sstcc
CXXFLAGS := --disable-mpi -fPIC -O0 -g
CPPFLAGS := -I. -I/home/jpkenny/install/mv2-ofi-netmod/include
LIBDIR :=
# PREFIX is empty here, so $(PREFIX)/lib below expands to /lib.
PREFIX :=
# The last -rpath entry previously read "$/home/..."; make expands "$/" as an
# (empty) variable reference, which turned it into the relative path "home/...".
LDFLAGS := /home/jpkenny/install/mv2-ofi-netmod/lib/libmpi.so /home/jpkenny/install/mv2-ofi-netmod/lib/libmpi.a /usr/lib64/libhwloc.so -Wl,-rpath,$(PREFIX)/lib -Wl,-rpath,/home/jpkenny/install/mv2-ofi-netmod/lib -Wl,-rpath,/home/jpkenny/install/sst-transports/lib

# Map every source file (.cc, .cpp or .c) to its object file.
OBJ := $(SRC:.cc=.o)
OBJ := $(OBJ:.cpp=.o)
OBJ := $(OBJ:.c=.o)

# Targets that do not name real files. "all" is included so a stray file
# called "all" cannot suppress the build.
.PHONY: all clean install

# Default goal: build the executable.
all: $(TARGET)

# Link step. Recipe lines must begin with a TAB character, not spaces.
# LIBS is not set in this Makefile; it expands to nothing unless supplied
# on the command line or in the environment.
$(TARGET): $(OBJ)
	$(CXX) -o $@ $+ $(LDFLAGS) $(LIBS)  $(CXXFLAGS)

# Compile C++ sources.
%.o: %.cc
	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $< -o $@

# Compile C sources. CFLAGS is not set in this Makefile either.
%.o: %.c
	$(CC) $(CPPFLAGS) $(CFLAGS) -c $< -o $@

# Remove the executable and all object files.
clean:
	rm -f $(TARGET) $(OBJ)

# Copy the executable into $(PREFIX)/bin.
install: $(TARGET)
	cp $< $(PREFIX)/bin