open-mpi / ompi

Open MPI main development repository
https://www.open-mpi.org
Other
2.13k stars 857 forks source link

problems with xpmem on master and v4.0.x #7030

Closed hppritcha closed 3 years ago

hppritcha commented 4 years ago

I'm observing problems using the XPMEM vader single copy mechanism on master and v4.0.x. Using XPMEM at modest numbers of processes leads to heap corruption and/or segfaults in the vader teardown code. I have reason to believe that there are also problems with the grdma rcache which compound the problem. I observed that using the udreg rcache on the Crays reduced some of the problems, but the XPMEM memory corruption problem remains.

I am able to reproduce the problem fairly reliably with IMB at 32 processes using the allreduce test.

Here's what I see on NERSC cori system using Open MPI 4.0.1 and a --enable-debug built library:

hpp@nid02310:~/mpi-benchmarks/src_c> (master)mpirun -np 32 -N 32 IMB-MPI1 -npmin 32 Allreduce
#------------------------------------------------------------
#    Intel(R) MPI Benchmarks 2018, MPI-1 part    
#------------------------------------------------------------
# Date                  : Tue Oct  1 12:34:01 2019
# Machine               : x86_64
# System                : Linux
# Release               : 4.12.14-25.22_5.0.79-cray_ari_c
# Version               : #1 SMP Fri Aug 9 16:20:09 UTC 2019 (d32c384)
# MPI Version           : 3.1
# MPI Thread Environment: 

# Calling sequence was: 

# IMB-MPI1 -npmin 32 Allreduce

# Minimum message length in bytes:   0
# Maximum message length in bytes:   4194304
#
# MPI_Datatype                   :   MPI_BYTE 
# MPI_Datatype for reductions    :   MPI_FLOAT
# MPI_Op                         :   MPI_SUM  
#
#

# List of Benchmarks to run:

# Allreduce

#----------------------------------------------------------------
# Benchmarking Allreduce 
# #processes = 32 
#----------------------------------------------------------------
       #bytes #repetitions  t_min[usec]  t_max[usec]  t_avg[usec]
            0         1000         0.62         0.68         0.62
            4         1000        39.27        41.41        40.22
            8         1000        39.74        41.59        40.53
           16         1000        39.50        41.24        40.14
           32         1000        39.53        41.51        40.40
           64         1000        39.49        41.25        40.36
          128         1000        40.69        42.24        41.37
          256         1000        45.22        48.07        46.70
          512         1000        90.12        93.04        91.53
         1024         1000       100.34       104.15       102.50
         2048         1000       119.18       122.61       121.19
         4096         1000       154.01       157.69       155.78
         8192         1000       226.40       230.61       228.71
        16384         1000       978.12      1004.68       989.02
        32768         1000      1043.86      1067.14      1053.55
        65536          640      1217.55      1247.16      1229.92
       131072          320      1487.61      1521.95      1500.50
       262144          160      2098.35      2141.39      2116.57
       524288           80      3308.40      3358.66      3328.20
      1048576           40      6082.23      6144.50      6113.68
      2097152           20     10486.34     10561.89     10526.78
      4194304           10     19357.93     19549.13     19457.65

# All processes entering MPI_Finalize

IMB-MPI1: btl_vader_xpmem.c:160: mca_btl_vader_endpoint_xpmem_rcache_cleanup: Assertion `OPAL_OBJ_MAGIC_ID == ((opal_object_t *) (reg))->obj_magic_id' failed.
[nid02310:132319] *** Process received signal ***
[nid02310:132319] Signal: Aborted (6)
[nid02310:132319] Signal code:  (-6)
IMB-MPI1: btl_vader_xpmem.c:160: mca_btl_vader_endpoint_xpmem_rcache_cleanup: Assertion `OPAL_OBJ_MAGIC_ID == ((opal_object_t *) (reg))->obj_magic_id' failed.
[nid02310:132309] *** Process received signal ***
[nid02310:132309] Signal: Aborted (6)
[nid02310:132309] Signal code:  (-6)
[nid02310:132319] [ 0] /lib64/libpthread.so.0(+0x12360)[0x2aaaab60c360]
[nid02310:132319] [ 1] /lib64/libc.so.6(gsignal+0x110)[0x2aaaab84e160]
[nid02310:132319] [ 2] /lib64/libc.so.6(abort+0x151)[0x2aaaab84f741]
[nid02310:132319] [ 3] /lib64/libc.so.6(+0x2e75a)[0x2aaaab84675a]
[nid02310:132319] [ 4] IMB-MPI1: btl_vader_xpmem.c:160: mca_btl_vader_endpoint_xpmem_rcache_cleanup: Assertion `OPAL_OBJ_MAGIC_ID == ((opal_object_t *) (reg))->obj_magic_id' failed.
/lib64/libc.so.6(+0x2e7d2)[0x2aaaab8467d2]
[nid02310:132319] [ 5] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/openmpi/mca_btl_vader.so(+0xc4b1)[0x2aaabdcb84b1]
[nid02310:132319] [ 6] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libopen-pal.so.40(+0xf6eb0)[0x2aaaac1e9eb0]
[nid02310:132319] [ 7] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libopen-pal.so.40(+0x3b332)[0x2aaaac12e332]
[nid02310:132319] [ 8] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libopen-pal.so.40(+0x3b371)[0x2aaaac12e371]
[nid02310:132319] [ 9] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libopen-pal.so.40(+0x3b22f)[0x2aaaac12e22f]
[nid02310:132319] [10] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libopen-pal.so.40(opal_interval_tree_traverse+0x72)[0x2aaaac12e4b0]
[nid02310:132319] [11] [nid02310:132321] *** Process received signal ***
[nid02310:132321] Signal: Aborted (6)
[nid02310:132321] Signal code:  (-6)
/global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libopen-pal.so.40(mca_rcache_base_vma_tree_iterate+0x6a)[0x2aaaac1e9f1c]
[nid02310:132319] [12] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libopen-pal.so.40(mca_rcache_base_vma_iterate+0x47)[0x2aaaac1e9afc]
[nid02310:132319] [13] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/openmpi/mca_btl_vader.so(mca_btl_vader_xpmem_cleanup_endpoint+0x41)[0x2aaabdcb8592]
[nid02310:132319] [14] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/openmpi/mca_btl_vader.so(+0x4a45)[0x2aaabdcb0a45]
[nid02310:132319] [15] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/openmpi/mca_btl_vader.so(+0x2d59)[0x2aaabdcaed59]
[nid02310:132319] [16] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/openmpi/mca_btl_vader.so(+0x3d66)[0x2aaabdcafd66]
[nid02310:132319] [17] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/openmpi/mca_btl_vader.so(+0x402a)[0x2aaabdcb002a]
[nid02310:132319] [18] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/openmpi/mca_bml_r2.so(+0x32b4)[0x2aaabdaa82b4]
[nid02310:132319] [19] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/openmpi/mca_pml_ob1.so(mca_pml_ob1_del_procs+0x2b)[0x2aaabf926c0c]
[nid02310:132319] [20] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libmpi.so.40(ompi_mpi_finalize+0xb1c)[0x2aaaaad3923c]
[nid02310:132319] [21] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libmpi.so.40(PMPI_Finalize+0x60)[0x2aaaaad77b4b]
[nid02310:132319] [22] IMB-MPI1[0x4031df]
[nid02310:132319] [23] /lib64/libc.so.6(__libc_start_main+0xea)[0x2aaaab838f8a]
[nid02310:132319] [24] IMB-MPI1[0x402eaa]
[nid02310:132319] *** End of error message ***
[nid02310:132309] [ 0] /lib64/libpthread.so.0(+0x12360)[0x2aaaab60c360]
[nid02310:132309] [ 1] /lib64/libc.so.6(gsignal+0x110)[0x2aaaab84e160]
[nid02310:132309] [ 2] /lib64/libc.so.6(abort+0x151)[0x2aaaab84f741]
[nid02310:132309] [ 3] /lib64/libc.so.6(+0x2e75a)[0x2aaaab84675a]
[nid02310:132309] [ 4] /lib64/libc.so.6(+0x2e7d2)[0x2aaaab8467d2]
[nid02310:132309] [ 5] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/openmpi/mca_btl_vader.so(+0xc4b1)[0x2aaabdcb84b1]
[nid02310:132309] [ 6] IMB-MPI1: btl_vader_xpmem.c:160: mca_btl_vader_endpoint_xpmem_rcache_cleanup: Assertion `OPAL_OBJ_MAGIC_ID == ((opal_object_t *) (reg))->obj_magic_id' failed.
[nid02310:132310] *** Process received signal ***
[nid02310:132310] Signal: Aborted (6)
[nid02310:132310] Signal code:  (-6)
[nid02310:132321] [ 0] /lib64/libpthread.so.0(+0x12360)[0x2aaaab60c360]
[nid02310:132321] [ 1] /lib64/libc.so.6(gsignal+0x110)[0x2aaaab84e160]
[nid02310:132321] [ 2] /lib64/libc.so.6(abort+0x151)[0x2aaaab84f741]
[nid02310:132321] [ 3] /lib64/libc.so.6(+0x2e75a)[0x2aaaab84675a]
[nid02310:132321] [ 4] /lib64/libc.so.6(+0x2e7d2)[0x2aaaab8467d2]
[nid02310:132321] [ 5] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/openmpi/mca_btl_vader.so(+0xc4b1)[0x2aaabdcb84b1]
[nid02310:132321] [ 6] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libopen-pal.so.40(+0xf6eb0)[0x2aaaac1e9eb0]
[nid02310:132321] [ 7] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libopen-pal.so.40(+0x3b332)[0x2aaaac12e332]
[nid02310:132321] [ 8] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libopen-pal.so.40(+0x3b371)[0x2aaaac12e371]
[nid02310:132321] [ 9] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libopen-pal.so.40(+0x3b22f)[0x2aaaac12e22f]
[nid02310:132321] [10] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libopen-pal.so.40(opal_interval_tree_traverse+0x72)[0x2aaaac12e4b0]
[nid02310:132321] [11] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libopen-pal.so.40(mca_rcache_base_vma_tree_iterate+0x6a)[0x2aaaac1e9f1c]
[nid02310:132321] [12] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libopen-pal.so.40(mca_rcache_base_vma_iterate+0x47)[0x2aaaac1e9afc]
[nid02310:132321] [13] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/openmpi/mca_btl_vader.so(mca_btl_vader_xpmem_cleanup_endpoint+0x41)[0x2aaabdcb8592]
[nid02310:132321] [14] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/openmpi/mca_btl_vader.so(+0x4a45)[0x2aaabdcb0a45]
[nid02310:132321] [15] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/openmpi/mca_btl_vader.so(+0x2d59)[0x2aaabdcaed59]
[nid02310:132321] [16] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/openmpi/mca_btl_vader.so(+0x3d66)[0x2aaabdcafd66]
[nid02310:132321] [17] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/openmpi/mca_btl_vader.so(+0x402a)[0x2aaabdcb002a]
[nid02310:132321] [18] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/openmpi/mca_bml_r2.so(+0x32b4)[0x2aaabdaa82b4]
[nid02310:132321] [19] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/openmpi/mca_pml_ob1.so(mca_pml_ob1_del_procs+0x2b)[0x2aaabf926c0c]
[nid02310:132321] [20] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libopen-pal.so.40(+0xf6eb0)[0x2aaaac1e9eb0]
[nid02310:132309] [ 7] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libopen-pal.so.40(+0x3b332)[0x2aaaac12e332]
[nid02310:132309] [ 8] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libopen-pal.so.40(+0x3b22f)[0x2aaaac12e22f]
[nid02310:132309] [ 9] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libopen-pal.so.40(+0x3b371)[0x2aaaac12e371]
[nid02310:132309] [10] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libopen-pal.so.40(opal_interval_tree_traverse+0x72)[0x2aaaac12e4b0]
[nid02310:132309] [11] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libopen-pal.so.40(mca_rcache_base_vma_tree_iterate+0x6a)[0x2aaaac1e9f1c]
[nid02310:132309] [12] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libopen-pal.so.40(mca_rcache_base_vma_iterate+0x47)[0x2aaaac1e9afc]
[nid02310:132309] [13] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/openmpi/mca_btl_vader.so(mca_btl_vader_xpmem_cleanup_endpoint+0x41)[0x2aaabdcb8592]
[nid02310:132309] [14] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/openmpi/mca_btl_vader.so(+0x4a45)[0x2aaabdcb0a45]
[nid02310:132309] [15] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/openmpi/mca_btl_vader.so(+0x2d59)[0x2aaabdcaed59]
[nid02310:132309] [nid02310:132310] [ 0] /lib64/libpthread.so.0(+0x12360)[0x2aaaab60c360]
[nid02310:132310] [ 1] /lib64/libc.so.6(gsignal+0x110)[0x2aaaab84e160]
[nid02310:132310] [ 2] /lib64/libc.so.6(abort+0x151)[0x2aaaab84f741]
[nid02310:132310] [ 3] /lib64/libc.so.6(+0x2e75a)[0x2aaaab84675a]
[nid02310:132310] [ 4] /lib64/libc.so.6(+0x2e7d2)[0x2aaaab8467d2]
[nid02310:132310] [ 5] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/openmpi/mca_btl_vader.so(+0xc4b1)[0x2aaabdcb84b1]
[nid02310:132310] [ 6] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libopen-pal.so.40(+0xf6eb0)[0x2aaaac1e9eb0]
[nid02310:132310] [ 7] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libopen-pal.so.40(+0x3b332)[0x2aaaac12e332]
[nid02310:132310] [ 8] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libopen-pal.so.40(+0x3b22f)[0x2aaaac12e22f]
[nid02310:132310] [ 9] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libopen-pal.so.40(+0x3b371)[0x2aaaac12e371]
[nid02310:132310] [10] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libopen-pal.so.40(opal_interval_tree_traverse+0x72)[0x2aaaac12e4b0]
[nid02310:132310] [11] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libmpi.so.40(ompi_mpi_finalize+0xb1c)[0x2aaaaad3923c]
[nid02310:132321] [21] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libmpi.so.40(PMPI_Finalize+0x60)[0x2aaaaad77b4b]
[nid02310:132321] [22] IMB-MPI1[0x4031df]
[nid02310:132321] [23] /lib64/libc.so.6(__libc_start_main+0xea)[0x2aaaab838f8a]
[nid02310:132321] [24] IMB-MPI1[0x402eaa]
[nid02310:132321] *** End of error message ***
/global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libopen-pal.so.40(mca_rcache_base_vma_tree_iterate+0x6a)[0x2aaaac1e9f1c]
[nid02310:132310] [12] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libopen-pal.so.40(mca_rcache_base_vma_iterate+0x47)[0x2aaaac1e9afc]
[nid02310:132310] [13] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/openmpi/mca_btl_vader.so(mca_btl_vader_xpmem_cleanup_endpoint+0x41)[0x2aaabdcb8592]
[nid02310:132310] [14] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/openmpi/mca_btl_vader.so(+0x4a45)[0x2aaabdcb0a45]
[nid02310:132310] [15] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/openmpi/mca_btl_vader.so(+0x2d59)[0x2aaabdcaed59]
[nid02310:132310] [16] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/openmpi/mca_btl_vader.so(+0x3d66)[0x2aaabdcafd66]
[nid02310:132310] [17] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/openmpi/mca_btl_vader.so(+0x402a)[0x2aaabdcb002a]
[nid02310:132310] [18] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/openmpi/mca_bml_r2.so(+0x32b4)[0x2aaabdaa82b4]
[nid02310:132310] [19] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/openmpi/mca_pml_ob1.so(mca_pml_ob1_del_procs+0x2b)[0x2aaabf926c0c]
[nid02310:132310] [20] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libmpi.so.40(ompi_mpi_finalize+0xb1c)[0x2aaaaad3923c]
[nid02310:132310] [21] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libmpi.so.40(PMPI_Finalize+0x60)[0x2aaaaad77b4b]
[nid02310:132310] [22] IMB-MPI1[0x4031df]
[nid02310:132310] [23] /lib64/libc.so.6(__libc_start_main+0xea)[0x2aaaab838f8a]
[nid02310:132310] [24] IMB-MPI1[0x402eaa]
[nid02310:132310] *** End of error message ***
[16] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/openmpi/mca_btl_vader.so(+0x3d66)[0x2aaabdcafd66]
[nid02310:132309] [17] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/openmpi/mca_btl_vader.so(+0x402a)[0x2aaabdcb002a]
[nid02310:132309] [18] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/openmpi/mca_bml_r2.so(+0x32b4)[0x2aaabdaa82b4]
[nid02310:132309] [19] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/openmpi/mca_pml_ob1.so(mca_pml_ob1_del_procs+0x2b)[0x2aaabf926c0c]
[nid02310:132309] [20] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libmpi.so.40(ompi_mpi_finalize+0xb1c)[0x2aaaaad3923c]
[nid02310:132309] [21] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libmpi.so.40(PMPI_Finalize+0x60)[0x2aaaaad77b4b]
[nid02310:132309] [22] IMB-MPI1[0x4031df]
[nid02310:132309] [23] /lib64/libc.so.6(__libc_start_main+0xea)[0x2aaaab838f8a]
[nid02310:132309] [24] IMB-MPI1[0x402eaa]
[nid02310:132309] *** End of error message ***
IMB-MPI1: btl_vader_xpmem.c:160: mca_btl_vader_endpoint_xpmem_rcache_cleanup: Assertion `OPAL_OBJ_MAGIC_ID == ((opal_object_t *) (reg))->obj_magic_id' failed.
[nid02310:132338] *** Process received signal ***
[nid02310:132338] Signal: Aborted (6)
[nid02310:132338] Signal code:  (-6)
[nid02310:132338] [ 0] /lib64/libpthread.so.0(+0x12360)[0x2aaaab60c360]
[nid02310:132338] [ 1] /lib64/libc.so.6(gsignal+0x110)[0x2aaaab84e160]
[nid02310:132338] [ 2] /lib64/libc.so.6(abort+0x151)[0x2aaaab84f741]
[nid02310:132338] [ 3] /lib64/libc.so.6(+0x2e75a)[0x2aaaab84675a]
[nid02310:132338] [ 4] /lib64/libc.so.6(+0x2e7d2)[0x2aaaab8467d2]
[nid02310:132338] [ 5] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/openmpi/mca_btl_vader.so(+0xc4b1)[0x2aaabdcb84b1]
[nid02310:132338] [ 6] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libopen-pal.so.40(+0xf6eb0)[0x2aaaac1e9eb0]
[nid02310:132338] [ 7] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libopen-pal.so.40(+0x3b332)[0x2aaaac12e332]
[nid02310:132338] [ 8] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libopen-pal.so.40(+0x3b22f)[0x2aaaac12e22f]
[nid02310:132338] [ 9] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libopen-pal.so.40(+0x3b371)[0x2aaaac12e371]
[nid02310:132338] [10] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libopen-pal.so.40(opal_interval_tree_traverse+0x72)[0x2aaaac12e4b0]
[nid02310:132338] [11] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libopen-pal.so.40(mca_rcache_base_vma_tree_iterate+0x6a)[0x2aaaac1e9f1c]
[nid02310:132338] [12] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libopen-pal.so.40(mca_rcache_base_vma_iterate+0x47)[0x2aaaac1e9afc]
[nid02310:132338] [13] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/openmpi/mca_btl_vader.so(mca_btl_vader_xpmem_cleanup_endpoint+0x41)[0x2aaabdcb8592]
[nid02310:132338] [14] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/openmpi/mca_btl_vader.so(+0x4a45)[0x2aaabdcb0a45]
[nid02310:132338] [15] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/openmpi/mca_btl_vader.so(+0x2d59)[0x2aaabdcaed59]
[nid02310:132338] [16] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/openmpi/mca_btl_vader.so(+0x3d66)[0x2aaabdcafd66]
[nid02310:132338] [17] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/openmpi/mca_btl_vader.so(+0x402a)[0x2aaabdcb002a]
[nid02310:132338] [18] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/openmpi/mca_bml_r2.so(+0x32b4)[0x2aaabdaa82b4]
[nid02310:132338] [19] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/openmpi/mca_pml_ob1.so(mca_pml_ob1_del_procs+0x2b)[0x2aaabf926c0c]
[nid02310:132338] [20] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libmpi.so.40(ompi_mpi_finalize+0xb1c)[0x2aaaaad3923c]
[nid02310:132338] [21] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libmpi.so.40(PMPI_Finalize+0x60)[0x2aaaaad77b4b]
[nid02310:132338] [22] IMB-MPI1[0x4031df]
[nid02310:132338] [23] /lib64/libc.so.6(__libc_start_main+0xea)[0x2aaaab838f8a]
[nid02310:132338] [24] IMB-MPI1[0x402eaa]
[nid02310:132338] *** End of error message ***
IMB-MPI1: btl_vader_xpmem.c:160: mca_btl_vader_endpoint_xpmem_rcache_cleanup: Assertion `OPAL_OBJ_MAGIC_ID == ((opal_object_t *) (reg))->obj_magic_id' failed.
[nid02310:132345] *** Process received signal ***
[nid02310:132345] Signal: Aborted (6)
[nid02310:132345] Signal code:  (-6)
[nid02310:132345] [ 0] /lib64/libpthread.so.0(+0x12360)[0x2aaaab60c360]
[nid02310:132345] [ 1] /lib64/libc.so.6(gsignal+0x110)[0x2aaaab84e160]
[nid02310:132345] [ 2] /lib64/libc.so.6(abort+0x151)[0x2aaaab84f741]
[nid02310:132345] [ 3] /lib64/libc.so.6(+0x2e75a)[0x2aaaab84675a]
[nid02310:132345] [ 4] /lib64/libc.so.6(+0x2e7d2)[0x2aaaab8467d2]
[nid02310:132345] [ 5] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/openmpi/mca_btl_vader.so(+0xc4b1)[0x2aaabdcb84b1]
[nid02310:132345] [ 6] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libopen-pal.so.40(+0xf6eb0)[0x2aaaac1e9eb0]
[nid02310:132345] [ 7] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libopen-pal.so.40(+0x3b332)[0x2aaaac12e332]
[nid02310:132345] [ 8] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libopen-pal.so.40(+0x3b371)[0x2aaaac12e371]
[nid02310:132345] [ 9] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libopen-pal.so.40(+0x3b22f)[0x2aaaac12e22f]
[nid02310:132345] [10] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libopen-pal.so.40(opal_interval_tree_traverse+0x72)[0x2aaaac12e4b0]
[nid02310:132345] [11] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libopen-pal.so.40(mca_rcache_base_vma_tree_iterate+0x6a)[0x2aaaac1e9f1c]
[nid02310:132345] [12] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libopen-pal.so.40(mca_rcache_base_vma_iterate+0x47)[0x2aaaac1e9afc]
[nid02310:132345] [13] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/openmpi/mca_btl_vader.so(mca_btl_vader_xpmem_cleanup_endpoint+0x41)[0x2aaabdcb8592]
[nid02310:132345] [14] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/openmpi/mca_btl_vader.so(+0x4a45)[0x2aaabdcb0a45]
[nid02310:132345] [15] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/openmpi/mca_btl_vader.so(+0x2d59)[0x2aaabdcaed59]
[nid02310:132345] [16] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/openmpi/mca_btl_vader.so(+0x3d66)[0x2aaabdcafd66]
[nid02310:132345] [17] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/openmpi/mca_btl_vader.so(+0x402a)[0x2aaabdcb002a]
[nid02310:132345] [18] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/openmpi/mca_bml_r2.so(+0x32b4)[0x2aaabdaa82b4]
[nid02310:132345] [19] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/openmpi/mca_pml_ob1.so(mca_pml_ob1_del_procs+0x2b)[0x2aaabf926c0c]
[nid02310:132345] [20] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libmpi.so.40(ompi_mpi_finalize+0xb1c)[0x2aaaaad3923c]
[nid02310:132345] [21] /global/common/software/m3169/openmpi/4.0.1_debug/gnu/lib/libmpi.so.40(PMPI_Finalize+0x60)[0x2aaaaad77b4b]
[nid02310:132345] [22] IMB-MPI1[0x4031df]
[nid02310:132345] [23] /lib64/libc.so.6(__libc_start_main+0xea)[0x2aaaab838f8a]
[nid02310:132345] [24] IMB-MPI1[0x402eaa]
[nid02310:132345] *** End of error message ***
--------------------------------------------------------------------------
Primary job  terminated normally, but 1 process returned
a non-zero exit code. Per user-direction, the job has been aborted.
--------------------------------------------------------------------------
--------------------------------------------------------------------------
mpirun noticed that process rank 1 with PID 132310 on node nid02310 exited on signal 6 (Aborted).
--------------------------------------------------------------------------

If I use the cma single copy mechanism, the application runs nominally.

If I don't use a debug library, the problem usually manifests itself as messages from glibc about corrupted heap, or segfaults.

hjelmn commented 4 years ago

Interesting. No problem on v3.1.x?

hjelmn commented 4 years ago

Will take a look this evening.

hppritcha commented 4 years ago

I should add that the udreg was actually only useful when running multi-node. That fixed "bad deregister" return calls from the uGNI library.

hjelmn commented 4 years ago

Can't reproduce the issue in a VM with the current xpmem HEAD. Does it occur if you run single-node with --mca btl self,vader? If not then it will help narrow down where the issue is.

hppritcha commented 4 years ago

this is on cori at NERSC. Yes it does occur on a single node with --mca btl self,vader. Note to reliably reproduce I need to use on the order of 32 ranks.

sekifjikkatsu commented 4 years ago

This problem is reproducible in our aarch64 architecture environment.

SEKI vader_get_registation reg=57ff20
SEKI vader_get_registation reg=59d710
SEKI vader_get_registation reg=59d7f0
SEKI vader_get_registation reg=59d8d0
SEKI vader_get_registation reg=59d9b0
SEKI vader_get_registation reg=59da90
SEKI vader_get_registation reg=59db70
SEKI vader_get_registation reg=5aab60
SEKI vader_get_registation reg=5aa1a0 *
SEKI vader_get_registation reg=5aa280
SEKI vader_get_registation reg=575620 #
SEKI vader_get_registation reg=5b5bc0
SEKI vader_get_registation reg=5b5ca0
SEKI vader_get_registation reg=575620 #
SEKI vader_get_registation reg=5aa1a0 *
SEKI vader_get_registation reg=5b64c0
SEKI vader_get_registation reg=5b65a0
SEKI vader_get_registation reg=5b6680
SEKI vader_get_registation reg=5b6760
SEKI vader_get_registation reg=5b6680
SEKI vader_get_registation reg=575620 #
SEKI vader_get_registation reg=5aa1a0 *
SEKI mca_btl_vader_endpoint_xpmem_rcache_cleanup ep=5930c8 reg=5aab60 alloc_base=4 peer_smp_rank=1
SEKI mca_btl_vader_endpoint_xpmem_rcache_cleanup ep=5930c8 reg=5aa280 alloc_base=2 peer_smp_rank=1
SEKI mca_btl_vader_endpoint_xpmem_rcache_cleanup ep=5930c8 reg=575620 alloc_base=7 peer_smp_rank=1 #
SEKI mca_btl_vader_endpoint_xpmem_rcache_cleanup ep=5930c8 reg=575620 alloc_base=7 peer_smp_rank=1 #
SEKI mca_btl_vader_endpoint_xpmem_rcache_cleanup ep=5930c8 reg=5aa1a0 alloc_base=1 peer_smp_rank=1 *
SEKI mca_btl_vader_endpoint_xpmem_rcache_cleanup ep=5930c8 reg=575620 alloc_base=7 peer_smp_rank=1 #
SEKI mca_btl_vader_endpoint_xpmem_rcache_cleanup ep=5930c8 reg=5aa1a0 alloc_base=1 peer_smp_rank=1 *

In the case of *, in mca_btl_vader_endpoint_xpmem_rcache_cleanup, the condition 'reg->alloc_base == ep->peer_smp_rank' becomes true, and OBJ_RELEASE for the reg at the same address runs twice.

In the case of #, in mca_btl_vader_endpoint_xpmem_rcache_cleanup, the condition 'reg->alloc_base == ep->peer_smp_rank' becomes false, and reg is not freed by OBJ_RELEASE.

Is it correct that mca_btl_vader_endpoint_xpmem_rcache_cleanup is called by the same address reg?

diff --git a/opal/mca/btl/vader/btl_vader_xpmem.c b/opal/mca/btl/vader/btl_vader_xpmem.c
index 219c0bd5f7..fe6c0c5760 100644
--- a/opal/mca/btl/vader/btl_vader_xpmem.c
+++ b/opal/mca/btl/vader/btl_vader_xpmem.c
@@ -115,6 +115,7 @@ mca_rcache_base_registration_t *vader_get_registation (struct mca_btl_base_endpo

     if (NULL == reg) {
         reg = OBJ_NEW(mca_rcache_base_registration_t);
+fprintf(stderr,"SEKI %s reg=%lx \n",__func__,reg);fflush(stderr);
         if (OPAL_LIKELY(NULL != reg)) {
             /* stick around for awhile */
             reg->ref_count = 2;
@@ -154,6 +155,7 @@ mca_rcache_base_registration_t *vader_get_registation (struct mca_btl_base_endpo
 static int mca_btl_vader_endpoint_xpmem_rcache_cleanup (mca_rcache_base_registration_t *reg, void *ctx)
 {
     mca_btl_vader_endpoint_t *ep = (mca_btl_vader_endpoint_t *) ctx;
+fprintf(stderr,"SEKI %s ep=%lx reg=%lx alloc_base=%ld peer_smp_rank=%ld \n",__func__,ep,reg,(intptr_t)reg->alloc_base,ep->peer_smp_rank);fflush(stderr);
     if ((intptr_t) reg->alloc_base == ep->peer_smp_rank) {
         /* otherwise dereg will fail on assert */
         reg->ref_count = 0;

I executed IMB-MPI1 at 4 processes on the single-node with the option "-npmin 10000 -iter 1 alltoall -msglen ./len.txt".

cat len.txt 1024 2048 4096 8192 16384 32768 65536

hppritcha commented 3 years ago

fixed via #7283