The MPI allgather app in skeletons/tests:
```cpp
#include <mpi.h>
#include <stddef.h>
#include <iostream>

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);
  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  if (rank == 0){
    std::cout << "Starting collective" << std::endl;
  }

  int nelems = 100;
//#define VALIDATE_BUFFERS
#ifdef VALIDATE_BUFFERS
  int* send_buf = new int[nelems];
  int* recv_buf = new int[nelems * size];
  for (int i=0; i < nelems; ++i){
    send_buf[i] = rank;
  }
  for (int i=0; i < size; ++i){
    for (int j=0; j < nelems; ++j){
      recv_buf[i*nelems + j] = -1;
    }
  }
#else
  void* send_buf = sstmac_nullptr_send;
  void* recv_buf = sstmac_nullptr_recv;
#endif

  MPI_Allgather(send_buf, 100, MPI_INT, recv_buf, 100, MPI_INT, MPI_COMM_WORLD);

  if (rank == 0){
    std::cout << "Cleared collective" << std::endl;
  }

#ifdef VALIDATE_BUFFERS
  for (int i=0; i < size; ++i){
    int* values = recv_buf + i*nelems;
    for (int j=0; j < nelems; ++j){
      if (values[j] != i){
        printf("V[%d][%d] = %d != %d\n", i, j, values[j], i);
      }
    }
  }
#endif

  MPI_Finalize();

  if (rank == 0){
    std::cout << "Cleared finalize" << std::endl;
  }

  return 0;
}
```
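For comparison, here is a minimal sketch of the same collective with real payload buffers instead of the sstmac_nullptr_send/sstmac_nullptr_recv markers. This variant is not part of the test suite (the buffer handling here is my own), but it mirrors the element count above and can help narrow down whether the hang is specific to the null-buffer skeleton path:

```cpp
#include <mpi.h>
#include <cstdio>
#include <vector>

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);
  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  const int nelems = 100;
  // Real buffers rather than the skeleton's null markers
  std::vector<int> send_buf(nelems, rank);
  std::vector<int> recv_buf(nelems * size, -1);

  MPI_Allgather(send_buf.data(), nelems, MPI_INT,
                recv_buf.data(), nelems, MPI_INT, MPI_COMM_WORLD);

  // Each block of nelems entries should equal the contributing rank
  for (int i = 0; i < size; ++i) {
    for (int j = 0; j < nelems; ++j) {
      if (recv_buf[i * nelems + j] != i) {
        printf("V[%d][%d] = %d != %d\n", i, j, recv_buf[i * nelems + j], i);
      }
    }
  }

  MPI_Finalize();
  return 0;
}
```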
It deadlocks with the following parameter file when run over MVAPICH:
```
node {
  os {
    stack_size = 1MB
  }
  app1 {
    exe = ./run
    argv =
    launch_cmd = aprun -n 32 -N 1
    apis = [libfabric, pmi:libfabric]
    env {
      SLURM_NPROCS = 32
    }
  }
  nic {
    name = snappr
    credits = 64KB
    mtu = 4096
    bandwidth = 10.0GB/s
    injection {
      bandwidth = 10.0GB/s
      latency = 50ns
      mtu = 1024
      credits = 64KB
      send_state {
        group = state
        type = ftq_calendar
        output = ftq
        epoch_length = 1us
      }
      recv_state {
        group = state
        type = ftq_calendar
        output = ftq
        epoch_length = 1us
      }
    }
    ejection {
      latency = 50ns
    }
  }
  memory {
    name = snappr
    channel_bandwidth = 7GB/s
    num_channels = 10
    latency = 10ns
  }
  proc {
    ncores = 4
    frequency = 2GHz
  }
  name = simple
}
switch {
  name = snappr
  credits = 64KB
  link {
    bandwidth = 10.0GB/s
    latency = 100ns
    credits = 64KB
    xmit_active {
      group = test
      type = accumulator
    }
    xmit_idle {
      group = test
      type = accumulator
    }
    xmit_stall {
      group = test
      type = accumulator
    }
  }
  logp {
    bandwidth = 1GB/s
    out_in_latency = 100ns
    hop_latency = 100ns
  }
}
topology {
  name = dragonfly
  geometry = [32,9]
  h = 16
  inter_group = circulant
  concentration = 16
}
switch {
  router {
    name = dragonfly_minimal
  }
}
```
I used the following Makefile:
```makefile
TARGET := run
SRC := allgather.cc

CXX := sst++
CC := sstcc
CXXFLAGS := --disable-mpi -fPIC -O0 -g
CPPFLAGS := -I. -I/home/jpkenny/install/mv2-ofi-netmod/include
LIBDIR :=
PREFIX :=
LDFLAGS := /home/jpkenny/install/mv2-ofi-netmod/lib/libmpi.so \
           /home/jpkenny/install/mv2-ofi-netmod/lib/libmpi.a \
           /usr/lib64/libhwloc.so \
           -Wl,-rpath,$(PREFIX)/lib \
           -Wl,-rpath,/home/jpkenny/install/mv2-ofi-netmod/lib \
           -Wl,-rpath,$/home/jpkenny/install/sst-transports/lib

OBJ := $(SRC:.cc=.o)
OBJ := $(OBJ:.cpp=.o)
OBJ := $(OBJ:.c=.o)

.PHONY: clean install

all: $(TARGET)

$(TARGET): $(OBJ)
	$(CXX) -o $@ $+ $(LDFLAGS) $(LIBS) $(CXXFLAGS)

%.o: %.cc
	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $< -o $@

%.o: %.c
	$(CC) $(CPPFLAGS) $(CFLAGS) -c $< -o $@

clean:
	rm -f $(TARGET) $(OBJ)

install: $(TARGET)
	cp $< $(PREFIX)/bin
```