Closed lcebaman closed 3 years ago
@lcebaman Can you provide a simple reproducer?
@hjelmn Is this a race condition in some one-sided code?
@jsquyres I don't have a simple reproducer, but I can try to produce one. It seems this could well be a race condition.
@lcebaman Nathan (@hjelmn) just recently committed several one-sided fixes to master. Could you try a master nightly snapshot tarball, and see if the issue is resolved for you?
I vaguely remember fixing this. I don't know if I upstreamed the fix though.... Will take a look when I get back to the office on Friday.
@lcebaman Any news?
@hjelmn Did you get to look and see if you put the fix in release branches?
We just got a relevant system installed. Will try to look at it today.
@lcebaman could you try testing against one of 3.0.1 release candidates? There were fixes for multi-threaded RMA in this release.
It seems the issue can also be reproduced with Intel/OPA hardware. openmpi3 segfaults. The latest openmpi4 does not segfault, but hangs.
[root@rdma05 ~]# /usr/lib64/openmpi3/bin/mpirun --allow-run-as-root -mca pml ob1 -mca btl openib,self -mca btl_openib_if_include hfi1_0:1 --bind-to none -np 2 -H rdma05,rdma06 /usr/lib64/openmpi3/bin/mpitests-IMB-EXT
#------------------------------------------------------------
# Intel (R) MPI Benchmarks 2018 Update 1, MPI-2 part
#------------------------------------------------------------
# Date : Tue Jun 4 21:23:26 2019
# Machine : x86_64
# System : Linux
# Release : 3.10.0-1053.el7.x86_64
# Version : #1 SMP Sat Jun 1 09:11:33 EDT 2019
# MPI Version : 3.1
# MPI Thread Environment:
# Calling sequence was:
# /usr/lib64/openmpi3/bin/mpitests-IMB-EXT
# Minimum message length in bytes: 0
# Maximum message length in bytes: 4194304
#
# MPI_Datatype : MPI_BYTE
# MPI_Datatype for reductions : MPI_FLOAT
# MPI_Op : MPI_SUM
#
#
# List of Benchmarks to run:
# Window
# Unidir_Get
# Unidir_Put
# Bidir_Get
# Bidir_Put
# Accumulate
#----------------------------------------------------------------
# Benchmarking Window
# #processes = 2
#----------------------------------------------------------------
#bytes #repetitions t_min[usec] t_max[usec] t_avg[usec]
0 100 309.22 309.25 309.24
4 100 312.05 312.06 312.05
8 100 320.37 320.38 320.37
16 100 310.31 310.33 310.32
32 100 313.46 313.48 313.47
64 100 311.57 311.58 311.57
128 100 313.01 313.03 313.02
256 100 317.72 317.73 317.72
512 100 318.02 318.03 318.02
1024 100 323.31 323.33 323.32
2048 100 316.87 316.88 316.87
4096 100 318.84 318.85 318.84
8192 100 316.71 316.72 316.72
16384 100 319.34 319.35 319.34
32768 100 354.19 354.26 354.23
65536 100 319.15 319.18 319.16
131072 100 316.22 316.23 316.22
262144 100 320.43 320.45 320.44
524288 80 319.72 319.74 319.73
1048576 40 320.70 320.74 320.72
2097152 20 319.10 319.19 319.15
4194304 10 329.20 329.72 329.46
#---------------------------------------------------
# Benchmarking Unidir_Get
# #processes = 2
#---------------------------------------------------
#
# MODE: AGGREGATE
#
#bytes #repetitions t[usec] Mbytes/sec
0 1000 0.04 0.00
4 1000 3.77 1.06
8 1000 3.70 2.16
16 1000 3.72 4.31
32 1000 3.87 8.26
64 1000 3.67 17.43
128 1000 3.72 34.40
256 1000 3.79 67.55
512 1000 4.04 126.66
1024 1000 4.13 247.97
2048 1000 4.19 488.82
4096 1000 4.30 951.81
8192 1000 4.48 1828.61
16384 1000 5.15 3181.56
32768 1000 6.77 4838.10
65536 640 10.79 6072.77
131072 320 18.79 6977.02
262144 160 35.18 7451.71
524288 80 68.51 7653.20
1048576 40 139.07 7540.05
2097152 20 277.37 7560.95
4194304 10 551.66 7603.10
#---------------------------------------------------
# Benchmarking Unidir_Get
# #processes = 2
#---------------------------------------------------
#
# MODE: NON-AGGREGATE
#
#bytes #repetitions t[usec] Mbytes/sec
0 100 6.01 0.00
4 100 13.92 0.29
8 100 13.75 0.58
16 100 13.86 1.15
32 100 14.13 2.26
64 100 13.97 4.58
128 100 13.75 9.31
256 100 14.07 18.20
512 100 14.47 35.38
1024 100 14.81 69.16
2048 100 14.86 137.79
4096 100 15.73 260.33
8192 100 20.17 406.23
16384 100 21.90 748.18
32768 100 21.68 1511.18
65536 100 30.58 2142.92
131072 100 38.67 3389.73
262144 100 54.69 4793.52
524288 80 89.49 5858.36
1048576 40 158.37 6621.18
2097152 20 354.18 5921.20
4194304 10 607.61 6902.92
#---------------------------------------------------
# Benchmarking Unidir_Put
# #processes = 2
#---------------------------------------------------
#
# MODE: AGGREGATE
#
#bytes #repetitions t[usec] Mbytes/sec
0 1000 0.03 0.00
4 1000 3.26 1.23
8 1000 3.21 2.49
16 1000 3.18 5.03
32 1000 3.33 9.62
64 1000 3.32 19.26
128 1000 3.50 36.59
256 1000 3.75 68.33
512 1000 2.37 215.71
1024 1000 2.42 422.86
2048 1000 2.39 856.31
4096 1000 2.40 1708.07
8192 1000 2.71 3018.75
16384 1000 4.02 4074.74
32768 1000 6.30 5201.24
65536 640 10.71 6117.21
131072 320 20.96 6254.20
262144 160 41.63 6297.14
524288 80 82.95 6320.63
1048576 40 161.72 6483.93
2097152 20 315.88 6639.08
4194304 10 661.03 6345.08
#---------------------------------------------------
# Benchmarking Unidir_Put
# #processes = 2
#---------------------------------------------------
#
# MODE: NON-AGGREGATE
#
#bytes #repetitions t[usec] Mbytes/sec
0 100 5.92 0.00
4 100 10.98 0.36
8 100 10.84 0.74
16 100 10.93 1.46
32 100 10.94 2.92
64 100 11.11 5.76
128 100 11.05 11.58
256 100 11.14 22.97
512 100 13.81 37.07
1024 100 13.76 74.41
2048 100 14.16 144.63
4096 100 14.62 280.09
8192 100 19.16 427.62
16384 100 20.22 810.14
32768 100 21.92 1494.61
65536 100 27.74 2362.59
131072 100 37.86 3461.88
262144 100 58.78 4460.09
524288 80 100.46 5218.69
1048576 40 177.70 5900.81
2097152 20 362.20 5790.01
4194304 10 683.71 6134.61
#---------------------------------------------------
# Benchmarking Bidir_Get
# #processes = 2
#---------------------------------------------------
#
# MODE: AGGREGATE
#
#bytes #repetitions t[usec] Mbytes/sec
0 1000 0.03 0.00
4 1000 6.98 0.57
8 1000 6.89 1.16
16 1000 6.84 2.34
32 1000 7.03 4.55
64 1000 7.02 9.12
128 1000 7.02 18.22
256 1000 6.97 36.74
512 1000 7.77 65.86
1024 1000 7.70 132.92
2048 1000 8.04 254.83
4096 1000 8.51 481.14
8192 1000 8.49 964.62
16384 1000 10.03 1633.00
32768 1000 15.24 2149.52
65536 640 24.01 2729.18
131072 320 40.28 3253.94
262144 160 74.42 3522.60
524288 80 146.86 3570.05
1048576 40 289.99 3615.92
2097152 20 585.49 3581.86
4194304 10 1153.52 3636.10
#---------------------------------------------------
# Benchmarking Bidir_Get
# #processes = 2
#---------------------------------------------------
#
# MODE: NON-AGGREGATE
#
#bytes #repetitions t[usec] Mbytes/sec
0 100 6.00 0.00
4 100 25.99 0.15
8 100 26.02 0.31
16 100 26.00 0.62
32 100 25.50 1.25
64 100 25.59 2.50
128 100 26.85 4.77
256 100 26.57 9.63
512 100 26.71 19.17
1024 100 26.62 38.47
2048 100 27.08 75.64
4096 100 28.79 142.29
8192 100 36.64 223.56
16384 100 39.80 411.62
32768 100 38.35 854.36
65536 100 49.20 1332.06
131072 100 70.34 1863.29
262144 100 109.35 2397.37
524288 80 191.23 2741.65
1048576 40 329.95 3177.95
2097152 20 669.58 3132.05
4194304 10 1238.61 3386.31
#---------------------------------------------------
# Benchmarking Bidir_Put
# #processes = 2
#---------------------------------------------------
#
# MODE: AGGREGATE
#
#bytes #repetitions t[usec] Mbytes/sec
0 1000 0.03 0.00
4 1000 3.77 1.06
8 1000 3.50 2.28
16 1000 3.53 4.53
32 1000 3.84 8.33
64 1000 3.75 17.05
128 1000 3.76 34.08
256 1000 4.00 64.06
512 1000 2.86 178.94
1024 1000 2.89 353.76
2048 1000 2.96 691.32
4096 1000 2.87 1426.03
8192 1000 3.77 2173.03
16384 1000 5.39 3038.07
32768 1000 9.34 3506.89
65536 640 17.72 3698.51
131072 320 35.71 3670.56
262144 160 70.18 3735.28
524288 80 142.26 3685.46
1048576 40 272.11 3853.45
2097152 20 554.40 3782.75
4194304 10 1099.62 3814.33
#---------------------------------------------------
# Benchmarking Bidir_Put
# #processes = 2
#---------------------------------------------------
#
# MODE: NON-AGGREGATE
#
#bytes #repetitions t[usec] Mbytes/sec
0 100 6.00 0.00
4 100 11.55 0.35
8 100 11.50 0.70
16 100 29.16 0.55
32 100 11.49 2.79
64 100 11.76 5.44
128 100 11.55 11.08
256 100 11.85 21.60
512 100 15.14 33.82
1024 100 15.34 66.77
2048 100 15.72 130.25
4096 100 16.31 251.14
8192 100 20.72 395.40
16384 100 23.89 685.86
32768 100 28.71 1141.22
65536 100 37.67 1739.61
131072 100 53.10 2468.24
262144 100 90.86 2885.01
524288 80 168.10 3118.85
1048576 40 319.97 3277.07
2097152 20 601.83 3484.64
4194304 10 1121.62 3739.52
[rdma05:20630:0:20630] Caught signal 11 (Segmentation fault: address not mapped to object at address (nil))
==== backtrace ====
0 /lib64/libucs.so.0(+0x1a82c) [0x7fc53a78e82c]
1 /lib64/libucs.so.0(+0x1a9e2) [0x7fc53a78e9e2]
2 /usr/lib64/openmpi3/lib/openmpi/mca_btl_openib.so(mca_btl_openib_get+0x131) [0x7fc53d549bf1]
3 /usr/lib64/openmpi3/lib/openmpi/mca_osc_rdma.so(ompi_osc_get_data_blocking+0x1bc) [0x7fc53b44ec8c]
4 /usr/lib64/openmpi3/lib/openmpi/mca_osc_rdma.so(+0x11783) [0x7fc53b458783]
5 /usr/lib64/openmpi3/lib/openmpi/mca_osc_rdma.so(ompi_osc_rdma_accumulate+0x187) [0x7fc53b45d607]
6 /usr/lib64/openmpi3/lib/libmpi.so.40(PMPI_Accumulate+0x2d3) [0x7fc550de0c53]
7 /usr/lib64/openmpi3/bin/mpitests-IMB-EXT() [0x4088bd]
8 /usr/lib64/openmpi3/bin/mpitests-IMB-EXT() [0x4063c8]
9 /usr/lib64/openmpi3/bin/mpitests-IMB-EXT() [0x402016]
10 /lib64/libc.so.6(__libc_start_main+0xf5) [0x7fc550786545]
11 /usr/lib64/openmpi3/bin/mpitests-IMB-EXT() [0x402286]
===================
[rdma05:20630] *** Process received signal ***
[rdma05:20630] Signal: Segmentation fault (11)
[rdma05:20630] Signal code: (-6)
[rdma05:20630] Failing at address: 0x5096
[rdma05:20630] [ 0] /lib64/libpthread.so.0(+0xf630)[0x7fc550b41630]
[rdma05:20630] [ 1] /usr/lib64/openmpi3/lib/openmpi/mca_btl_openib.so(mca_btl_openib_get+0x131)[0x7fc53d549bf1]
[rdma05:20630] [ 2] /usr/lib64/openmpi3/lib/openmpi/mca_osc_rdma.so(ompi_osc_get_data_blocking+0x1bc)[0x7fc53b44ec8c]
[rdma05:20630] [ 3] /usr/lib64/openmpi3/lib/openmpi/mca_osc_rdma.so(+0x11783)[0x7fc53b458783]
[rdma05:20630] [ 4] /usr/lib64/openmpi3/lib/openmpi/mca_osc_rdma.so(ompi_osc_rdma_accumulate+0x187)[0x7fc53b45d607]
[rdma05:20630] [ 5] /usr/lib64/openmpi3/lib/libmpi.so.40(PMPI_Accumulate+0x2d3)[0x7fc550de0c53]
[rdma05:20630] [ 6] /usr/lib64/openmpi3/bin/mpitests-IMB-EXT[0x4088bd]
[rdma05:20630] [ 7] /usr/lib64/openmpi3/bin/mpitests-IMB-EXT[0x4063c8]
[rdma05:20630] [ 8] /usr/lib64/openmpi3/bin/mpitests-IMB-EXT[0x402016]
[rdma05:20630] [ 9] /lib64/libc.so.6(__libc_start_main+0xf5)[0x7fc550786545]
[rdma05:20630] [10] /usr/lib64/openmpi3/bin/mpitests-IMB-EXT[0x402286]
[rdma05:20630] *** End of error message ***
[[53556,1],1][btl_openib_component.c:3644:handle_wc] from rdma06 to: rdma05 error polling LP CQ with status REMOTE ACCESS ERROR status number 10 for wr_id 1ab6e58 opcode 3 vendor error 0 qp_idx 0
[root@rdma05 ~]#
The openib BTL has been removed. Closing.
Running OpenMPI 3.0.0 and RMA (MPI_THREAD_MULTIPLE) I get
I've noticed that this happens when the number of MPI processes per node is >= 4. Here is some more info that could be (or not) related to this issue: