sstsimulator / sst-macro

SST Macro Element Library
http://sst-simulator.org/

Possible bug in the MPI_Waitall function #682

Open Anet18 opened 2 years ago

Anet18 commented 2 years ago

New Issue for sst-macro

I tried to run LULESH (https://github.com/LLNL/LULESH) in SST Macro and got the error below. I first collected traces of LULESH with the latest SST dumpi (https://github.com/sstsimulator/sst-dumpi) and then replayed those dumpi traces on SST Macro 12.0.0. In the configuration file I set debug = [mpi] to print the MPI debugging lines. I collected the traces with both OpenMPI/4.1.1 and MPICH/3.3.2, and both MPI implementations produced the same error.

Another thing to mention: for the miniVite proxy application (https://github.com/Exa-Graph/miniVite) we got lucky, and switching from OpenMPI/4.1.1 to MPICH/3.3.2 made the error go away. We would like to understand the reason behind this issue, since MPI_Request is an opaque handle and differences in the underlying implementation should not change the replay behavior.

sstmac \
  --debug="" \
  --configfile="dragonfly.ini"

MPI Rank 0   : MPI_Init()
MPI Rank 1   : MPI_Init()
MPI Rank 2   : MPI_Init()
MPI Rank 4   : MPI_Init()
MPI Rank 3   : MPI_Init()
MPI Rank 5   : MPI_Init()
MPI Rank 6   : MPI_Init()
MPI Rank 7   : MPI_Init()
MPI Rank 0   : MPI_Init finished
MPI Rank 0   : MPI_Irecv(961,MPI_DOUBLE=7,4:4,1024,MPI_COMM_WORLD;REQ=2)
MPI Rank 0   : MPI_Irecv finished
MPI Rank 0   : MPI_Irecv(961,MPI_DOUBLE=7,2:2,1024,MPI_COMM_WORLD;REQ=3)
MPI Rank 0   : MPI_Irecv finished
MPI Rank 0   : MPI_Irecv(961,MPI_DOUBLE=7,1:1,1024,MPI_COMM_WORLD;REQ=4)
MPI Rank 0   : MPI_Irecv finished
MPI Rank 0   : MPI_Irecv(31,MPI_DOUBLE=7,3:3,1024,MPI_COMM_WORLD;REQ=5)
MPI Rank 0   : MPI_Irecv finished
MPI Rank 0   : MPI_Irecv(31,MPI_DOUBLE=7,6:6,1024,MPI_COMM_WORLD;REQ=6)
MPI Rank 0   : MPI_Irecv finished
DUMPI trace   1 percent complete: dumpi-2022.07.07.17.49.05-0000.bin
MPI Rank 0   : MPI_Irecv(31,MPI_DOUBLE=7,5:5,1024,MPI_COMM_WORLD;REQ=7)
MPI Rank 0   : MPI_Irecv finished
MPI Rank 0   : MPI_Irecv(1,MPI_DOUBLE=7,7:7,1024,MPI_COMM_WORLD;REQ=8)
MPI Rank 0   : MPI_Irecv finished
MPI Rank 0   : MPI_Isend(961,MPI_DOUBLE=7,4,1024,MPI_COMM_WORLD;REQ=9)
MPI Rank 0   : MPI_Isend finished
MPI Rank 0   : MPI_Isend(961,MPI_DOUBLE=7,2,1024,MPI_COMM_WORLD;REQ=9)
MPI Rank 0   : MPI_Isend finished
MPI Rank 0   : MPI_Isend(961,MPI_DOUBLE=7,1,1024,MPI_COMM_WORLD;REQ=9)
MPI Rank 0   : MPI_Isend finished
MPI Rank 0   : MPI_Isend(31,MPI_DOUBLE=7,3,1024,MPI_COMM_WORLD;REQ=9)
MPI Rank 0   : MPI_Isend finished
MPI Rank 0   : MPI_Isend(31,MPI_DOUBLE=7,6,1024,MPI_COMM_WORLD;REQ=9)
MPI Rank 0   : MPI_Isend finished
MPI Rank 4   : MPI_Init finished
MPI Rank 4   : MPI_Irecv(961,MPI_DOUBLE=7,0:0,1024,MPI_COMM_WORLD;REQ=2)
MPI Rank 4   : MPI_Irecv finished
MPI Rank 4   : MPI_Irecv(961,MPI_DOUBLE=7,6:6,1024,MPI_COMM_WORLD;REQ=3)
MPI Rank 4   : MPI_Irecv finished
MPI Rank 4   : MPI_Irecv(961,MPI_DOUBLE=7,5:5,1024,MPI_COMM_WORLD;REQ=4)
MPI Rank 4   : MPI_Irecv finished
MPI Rank 4   : MPI_Irecv(31,MPI_DOUBLE=7,7:7,1024,MPI_COMM_WORLD;REQ=5)
MPI Rank 4   : MPI_Irecv finished
MPI Rank 4   : MPI_Irecv(31,MPI_DOUBLE=7,2:2,1024,MPI_COMM_WORLD;REQ=6)
MPI Rank 4   : MPI_Irecv finished
MPI Rank 4   : MPI_Irecv(31,MPI_DOUBLE=7,1:1,1024,MPI_COMM_WORLD;REQ=7)
MPI Rank 4   : MPI_Irecv finished
MPI Rank 4   : MPI_Irecv(1,MPI_DOUBLE=7,3:3,1024,MPI_COMM_WORLD;REQ=8)
MPI Rank 4   : MPI_Irecv finished
MPI Rank 4   : MPI_Isend(961,MPI_DOUBLE=7,0,1024,MPI_COMM_WORLD;REQ=9)
MPI Rank 4   : MPI_Isend finished
MPI Rank 4   : MPI_Isend(961,MPI_DOUBLE=7,6,1024,MPI_COMM_WORLD;REQ=9)
MPI Rank 4   : MPI_Isend finished
MPI Rank 4   : MPI_Isend(961,MPI_DOUBLE=7,5,1024,MPI_COMM_WORLD;REQ=9)
MPI Rank 4   : MPI_Isend finished
MPI Rank 2   : MPI_Init finished
MPI Rank 2   : MPI_Irecv(961,MPI_DOUBLE=7,6:6,1024,MPI_COMM_WORLD;REQ=2)
MPI Rank 2   : MPI_Irecv finished
MPI Rank 2   : MPI_Irecv(961,MPI_DOUBLE=7,0:0,1024,MPI_COMM_WORLD;REQ=3)
MPI Rank 2   : MPI_Irecv finished
MPI Rank 2   : MPI_Irecv(961,MPI_DOUBLE=7,3:3,1024,MPI_COMM_WORLD;REQ=4)
MPI Rank 2   : MPI_Irecv finished
MPI Rank 2   : MPI_Irecv(31,MPI_DOUBLE=7,7:7,1024,MPI_COMM_WORLD;REQ=5)
MPI Rank 2   : MPI_Irecv finished
MPI Rank 2   : MPI_Irecv(31,MPI_DOUBLE=7,4:4,1024,MPI_COMM_WORLD;REQ=6)
MPI Rank 2   : MPI_Irecv finished
MPI Rank 2   : MPI_Irecv(31,MPI_DOUBLE=7,1:1,1024,MPI_COMM_WORLD;REQ=7)
MPI Rank 2   : MPI_Irecv finished
MPI Rank 2   : MPI_Irecv(1,MPI_DOUBLE=7,5:5,1024,MPI_COMM_WORLD;REQ=8)
MPI Rank 2   : MPI_Irecv finished
MPI Rank 2   : MPI_Isend(961,MPI_DOUBLE=7,6,1024,MPI_COMM_WORLD;REQ=9)
MPI Rank 2   : MPI_Isend finished
MPI Rank 2   : MPI_Isend(961,MPI_DOUBLE=7,0,1024,MPI_COMM_WORLD;REQ=9)
MPI Rank 2   : MPI_Isend finished
MPI Rank 2   : MPI_Isend(961,MPI_DOUBLE=7,3,1024,MPI_COMM_WORLD;REQ=9)
MPI Rank 2   : MPI_Isend finished
MPI Rank 1   : MPI_Init finished
MPI Rank 1   : MPI_Irecv(961,MPI_DOUBLE=7,5:5,1024,MPI_COMM_WORLD;REQ=2)
MPI Rank 1   : MPI_Irecv finished
MPI Rank 1   : MPI_Irecv(961,MPI_DOUBLE=7,3:3,1024,MPI_COMM_WORLD;REQ=3)
MPI Rank 1   : MPI_Irecv finished
MPI Rank 1   : MPI_Irecv(961,MPI_DOUBLE=7,0:0,1024,MPI_COMM_WORLD;REQ=4)
MPI Rank 1   : MPI_Irecv finished
MPI Rank 1   : MPI_Irecv(31,MPI_DOUBLE=7,7:7,1024,MPI_COMM_WORLD;REQ=5)
MPI Rank 1   : MPI_Irecv finished
MPI Rank 1   : MPI_Irecv(31,MPI_DOUBLE=7,2:2,1024,MPI_COMM_WORLD;REQ=6)
MPI Rank 1   : MPI_Irecv finished
MPI Rank 1   : MPI_Irecv(31,MPI_DOUBLE=7,4:4,1024,MPI_COMM_WORLD;REQ=7)
MPI Rank 1   : MPI_Irecv finished
MPI Rank 1   : MPI_Irecv(1,MPI_DOUBLE=7,6:6,1024,MPI_COMM_WORLD;REQ=8)
MPI Rank 1   : MPI_Irecv finished
MPI Rank 1   : MPI_Isend(961,MPI_DOUBLE=7,5,1024,MPI_COMM_WORLD;REQ=9)
MPI Rank 1   : MPI_Isend finished
MPI Rank 1   : MPI_Isend(961,MPI_DOUBLE=7,3,1024,MPI_COMM_WORLD;REQ=9)
MPI Rank 1   : MPI_Isend finished
MPI Rank 1   : MPI_Isend(961,MPI_DOUBLE=7,0,1024,MPI_COMM_WORLD;REQ=9)
MPI Rank 1   : MPI_Isend finished
MPI Rank 0   : MPI_Isend(31,MPI_DOUBLE=7,5,1024,MPI_COMM_WORLD;REQ=9)
MPI Rank 0   : MPI_Isend finished
MPI Rank 0   : MPI_Isend(1,MPI_DOUBLE=7,7,1024,MPI_COMM_WORLD;REQ=9)
MPI Rank 0   : MPI_Isend finished
DUMPI trace   2 percent complete: dumpi-2022.07.07.17.49.05-0000.bin
MPI Rank 0   : MPI_Waitall(26,...)
MPI Rank 0   :    MPI_Wait_nonnull(9)
MPI Rank 0   :    MPI_Wait_nonnull(9)
thread terminated with exception: sprockit::SpktError: could not find mpi request 9 for rank 0
../../sumi-mpi/mpi_api.cc 498
aborting
Aborted (core dumped)
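
For reference, the call pattern in the trace above is the standard nonblocking neighbor exchange: each rank posts MPI_Irecv/MPI_Isend for its neighbors and then calls MPI_Waitall over a fixed-size request array (the MPI_Waitall(26, ...) presumably has one slot per potential neighbor of a 3D subdomain, with unused slots left as MPI_REQUEST_NULL). The sketch below is a hypothetical minimal reproduction of that pattern in plain MPI, not code taken from LULESH itself; the buffer sizes, tag, and neighbor bookkeeping are assumptions chosen for illustration.

// A minimal, hypothetical sketch (not code from LULESH) of the
// nonblocking exchange pattern seen in the trace above: fixed-size
// request arrays, unused slots left as MPI_REQUEST_NULL, and a single
// MPI_Waitall over the full array.
#include <mpi.h>
#include <vector>

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);

  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  const int maxNeighbors = 26;   // one slot per potential neighbor of a 3D subdomain
  const int count = 961;         // message size, illustrative only
  std::vector<double> recvBuf(maxNeighbors * count);
  std::vector<double> sendBuf(maxNeighbors * count, 1.0);
  std::vector<MPI_Request> recvReq(maxNeighbors, MPI_REQUEST_NULL);
  std::vector<MPI_Request> sendReq(maxNeighbors, MPI_REQUEST_NULL);

  // Post receives and sends only for neighbors that actually exist;
  // the remaining request slots stay MPI_REQUEST_NULL.
  int n = 0;
  for (int peer = 0; peer < size && n < maxNeighbors; ++peer) {
    if (peer == rank) continue;
    MPI_Irecv(&recvBuf[n * count], count, MPI_DOUBLE, peer, 1024,
              MPI_COMM_WORLD, &recvReq[n]);
    MPI_Isend(&sendBuf[n * count], count, MPI_DOUBLE, peer, 1024,
              MPI_COMM_WORLD, &sendReq[n]);
    ++n;
  }

  // Waiting on the full fixed-size arrays is legal MPI: MPI_Waitall
  // simply skips MPI_REQUEST_NULL entries. The request handles are
  // opaque, so the same code (and a dumpi trace of it) should replay
  // identically whether OpenMPI or MPICH produced the handles.
  MPI_Waitall(maxNeighbors, sendReq.data(), MPI_STATUSES_IGNORE);
  MPI_Waitall(maxNeighbors, recvReq.data(), MPI_STATUSES_IGNORE);

  MPI_Finalize();
  return 0;
}
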

I used the following dragonfly.ini configuration file to run the trace:

debug=[mpi]
topology {
 name = dragonfly
 geometry = [8,9]
 #group_connections = 8
 concentration = 8
 #inter_group = alltoall
 h = 8
 #redundant = [1,2]
}

switch {
 router {
  #pb_latency = 0.0
  #name = dragonfly_valiant
  #name = dragonfly_minimal
  name = dragonfly_par
  #name = dragonfly_scatter
  #name = dragonfly_ugal
  #name = dragonfly_rotate
  seed = 14
 }
}
node {
 app1 {
    ftq {
     type = ftq_calendar
     epoch_length = 1ms
     output = ftq
     group = app1
    }
  name = parsedumpi
  #random_allocation_seed = 116
  #indexing = random
  #allocation = random
  allocation = first_available
  size = 8
  launch_cmd = aprun -n 8 -N 1
  dumpi_metaname = dumpi-2022.07.07.17.49.05.meta   
  #coordinate_file = coords.txt
  start = 0ms
 }
 nic {
  name = pisces
  injection {
   mtu = 4096
   arbitrator = cut_through
   bandwidth = 1.0GB/s
   latency = 50ns
   credits = 64KB
  }
  ejection {
   latency = 50ns
  }
 }
 memory {
  name = pisces
  total_bandwidth = 10GB/s
  latency = 10ns
  max_single_bandwidth = 10GB/s
 }
 proc {
  ncores = 1
  frequency = 2GHz
 }
 name = simple
}

switch {
 name = pisces
 arbitrator = cut_through
 mtu = 4096
 link {
  bandwidth = 1.0GB/s
  latency = 100ns
  credits = 64KB
 }
 xbar {
  bandwidth = 10GB/s
 }
 logp {
  bandwidth = 1GB/s
  hop_latency = 100ns
  out_in_latency = 100ns
 }
}
sg0 commented 1 year ago

Is there any workaround for this issue? We are encountering it with multiple applications. (We have also tried the pnnl-branch, but we face the same error there.)