open-mpi / ompi

Open MPI main development repository
https://www.open-mpi.org

Vader BTL crashes #473

Closed jsquyres closed 9 years ago

jsquyres commented 9 years ago

Per the thread starting here: http://www.open-mpi.org/community/lists/devel/2015/03/17131.php

@opoplawski is seeing crashes in the Open MPI test suite with openmpi-1.8.4-99-20150228 (the Feb 28 nightly tarball) and the vader BTL. If he disables the vader BTL, the crashes go away (a sample exclusion command follows the backtrace):

#0  mca_btl_vader_check_fboxes () at btl_vader_fbox.h:192
#1  mca_btl_vader_component_progress () at btl_vader_component.c:694
#2  0xf3971b69 in opal_progress () at runtime/opal_progress.c:187
#3  0xf40b4695 in opal_condition_wait (c=<optimized out>, m=<optimized out>) at ../opal/threads/condition.h:78
#4  ompi_request_default_wait_all (count=6, requests=<optimized out>, statuses=0x0) at request/req_wait.c:281
#5  0xf28bb5e7 in ompi_coll_tuned_alltoall_intra_basic_linear (sbuf=sbuf@entry=0xf7a2d328, scount=scount@entry=1, sdtype=sdtype@entry=0xf4148240 <ompi_mpi_int>, rbuf=rbuf@entry=0xf7af1920, rcount=rcount@entry=1, rdtype=rdtype@entry=0xf4148240 <ompi_mpi_int>, comm=comm@entry=0xf7b051d8, module=module@entry=0xf7a2b4d0) at coll_tuned_alltoall.c:700
#6  0xf28b4d08 in ompi_coll_tuned_alltoall_intra_dec_fixed (sbuf=0xf7a2d328, scount=1, sdtype=0xf4148240 <ompi_mpi_int>, rbuf=0xf7af1920, rcount=1, rdtype=0xf4148240 <ompi_mpi_int>, comm=0xf7b051d8, module=0xf7a2b4d0) at coll_tuned_decision_fixed.c:130
#7  0xf40c7899 in PMPI_Alltoall (sendbuf=sendbuf@entry=0xf7a2d328, sendcount=sendcount@entry=1, sendtype=sendtype@entry=0xf4148240 <ompi_mpi_int>, recvbuf=recvbuf@entry=0xf7af1920, recvcount=recvcount@entry=1, recvtype=recvtype@entry=0xf4148240 <ompi_mpi_int>, comm=0xf7b051d8) at palltoall.c:111
#8  0xe9780da0 in ADIOI_Calc_others_req (fd=fd@entry=0xf7b12640, count_my_req_procs=1, count_my_req_per_proc=0xf7a2d328, my_req=0xf7b00750, nprocs=4, myrank=0, count_others_req_procs_ptr=count_others_req_procs_ptr@entry=0xffbea6e8, others_req_ptr=others_req_ptr@entry=0xffbea6cc) at adio/common/ad_aggregate.c:453
#9  0xe9796a14 in ADIOI_GEN_WriteStridedColl (fd=0xf7b12640, buf=0xf7aa0148, count=2440, datatype=0xf4148840 <ompi_mpi_byte>, file_ptr_type=100, offset=0, status=0xffbea8b8, error_code=0xffbea790) at adio/common/ad_write_coll.c:192
#10 0xe97779e0 in MPIOI_File_write_all (fh=fh@entry=0xf7b12640, offset=offset@entry=0, file_ptr_type=file_ptr_type@entry=100, buf=buf@entry=0xf7aa0148, count=count@entry=2440, datatype=datatype@entry=0xf4148840 <ompi_mpi_byte>, myname=myname@entry=0xe97a9a1c <myname.9354> "MPI_FILE_WRITE_AT_ALL", status=status@entry=0xffbea8b8) at mpi-io/write_all.c:116
#11 0xe9778176 in mca_io_romio_dist_MPI_File_write_at_all (fh=0xf7b12640, offset=offset@entry=0, buf=buf@entry=0xf7aa0148, count=count@entry=2440, datatype=datatype@entry=0xf4148840 <ompi_mpi_byte>, status=status@entry=0xffbea8b8) at mpi-io/write_atall.c:55
#12 0xe9770bcc in mca_io_romio_file_write_at_all (fh=0xf7aa27c8, offset=0, buf=0xf7aa0148, count=2440, datatype=0xf4148840 <ompi_mpi_byte>, status=0xffbea8b8) at src/io_romio_file_write.c:61
#13 0xf40ff3ce in PMPI_File_write_at_all (fh=0xf7aa27c8, offset=0, buf=buf@entry=0xf7aa0148, count=count@entry=2440, e=0xf4148840 <ompi_mpi_byte>, status=status@entry=0xffbea8b8) at pfile_write_at_all.c:75
#14 0xf437a43c in H5FD_mpio_write (_file=_file@entry=0xf7b074a8, type=type@entry=H5FD_MEM_DRAW, dxpl_id=167772177, addr=31780, size=size@entry=2440, buf=buf@entry=0xf7aa0148) at ../../src/H5FDmpio.c:1840
#15 0xf4375cd5 in H5FD_write (file=0xf7b074a8, dxpl=0xf7a47d20, type=H5FD_MEM_DRAW, addr=31780, size=size@entry=2440, buf=buf@entry=0xf7aa0148) at ../../src/H5FDint.c:245
#16 0xf4360932 in H5F__accum_write (fio_info=fio_info@entry=0xffbea9d4, type=type@entry=H5FD_MEM_DRAW, addr=31780, size=size@entry=2440, buf=buf@entry=0xf7aa0148) at ../../src/H5Faccum.c:824
#17 0xf436430c in H5F_block_write (f=0xf7a31860, type=type@entry=H5FD_MEM_DRAW, addr=31780, size=size@entry=2440, dxpl_id=167772177, buf=0xf7aa0148) at ../../src/H5Fio.c:170
#18 0xf43413ee in H5D__mpio_select_write (io_info=0xffbeab60, type_info=0xffbeab1c, mpi_buf_count=2440, file_space=0x0, mem_space=0x0) at ../../src/H5Dmpio.c:296
#19 0xf4341f33 in H5D__final_collective_io (mpi_buf_type=0xffbeaa7c, mpi_file_type=0xffbeaa78, mpi_buf_count=<optimized out>, type_info=0xffbeab1c, io_info=0xffbeab60) at ../../src/H5Dmpio.c:1444
#20 H5D__inter_collective_io (mem_space=0xf7a38120, file_space=0xf7a55590, type_info=0xffbeab1c, io_info=0xffbeab60) at ../../src/H5Dmpio.c:1400
#21 H5D__contig_collective_write (io_info=0xffbeab60, type_info=0xffbeab1c, nelmts=610, file_space=0xf7a55590, mem_space=0xf7a38120, fm=0xffbeace0) at ../../src/H5Dmpio.c:528
#22 0xf433ae8d in H5D__write (buf=0xf7aa0148, dxpl_id=167772177, file_space=0xf7a55590, mem_space=0xf7a38120, mem_type_id=-140159600, dataset=0xf7a3eb40) at ../../src/H5Dio.c:787
#23 H5D__pre_write (dset=dset@entry=0xf7a3eb40, direct_write=<optimized out>, mem_type_id=mem_type_id@entry=50331747, mem_space=mem_space@entry=0xf7a38120, file_space=0xf7a55590, dxpl_id=dxpl_id@entry=167772177, buf=buf@entry=0xf7aa0148) at ../../src/H5Dio.c:351
#24 0xf433b74c in H5Dwrite (dset_id=83886085, mem_type_id=50331747, mem_space_id=mem_space_id@entry=67108867, file_space_id=file_space_id@entry=67108866, dxpl_id=dxpl_id@entry=167772177, buf=buf@entry=0xf7aa0148) at ../../src/H5Dio.c:270
#25 0xf466b603 in nc4_put_vara (nc=0xf7a05c58, ncid=ncid@entry=65536, varid=varid@entry=3, startp=startp@entry=0xffbf6a08, countp=countp@entry=0xffbf6a10, mem_nc_type=mem_nc_type@entry=5, is_long=is_long@entry=0, data=data@entry=0xf7a07c40) at ../../libsrc4/nc4hdf.c:788
#26 0xf4673c55 in nc4_put_vara_tc (mem_type_is_long=0, op=0xf7a07c40, countp=0xffbf6a10, startp=0xffbf6a08, mem_type=5, varid=3, ncid=65536) at ../../libsrc4/nc4var.c:1429
#27 NC4_put_vara (ncid=65536, varid=3, startp=0xffbf6a08, countp=0xffbf6a10, op=0xf7a07c40, memtype=5) at ../../libsrc4/nc4var.c:1565
#28 0xf460a377 in NC_put_vara (ncid=ncid@entry=65536, varid=varid@entry=3, start=start@entry=0xffbf6a08, edges=edges@entry=0xffbf6a10, value=value@entry=0xf7a07c40, memtype=memtype@entry=5) at ../../libdispatch/dvarput.c:79
#29 0xf460b541 in nc_put_vara_float (ncid=65536, varid=3, startp=0xffbf6a08, countp=0xffbf6a10, op=0xf7a07c40) at ../../libdispatch/dvarput.c:655
#30 0xf77d06ed in test_pio_2d (cache_size=67108864, facc_type=8192, access_flag=1, comm=0xf414d800 <ompi_mpi_comm_world>, info=0xf4154240 <ompi_mpi_info_null>, mpi_size=4, mpi_rank=0, chunk_size=0xffbf76f4) at ../../nc_test4/tst_nc4perf.c:96
#31 0xf77cfdb1 in main (argc=1, argv=0xffbf7804) at ../../nc_test4/tst_nc4perf.c:299
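
Excluding the vader BTL uses the standard MCA selection syntax on the mpirun command line; a minimal example (the rank count and test binary path are only illustrative) is:

    mpirun -np 4 -mca btl ^vader ./nc_test4/tst_nc4perf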

@hjelmn Can you have a look?

hjelmn commented 9 years ago

Trying to wrap my head around this stack trace. It should not be possible: mca_btl_vader_check_fboxes() reads through a pointer into a shared-memory segment owned by the sender, and the sender initializes its fast-box data pointer before sending it to the receiver. That rules out the possibility that the sending process ran out of room in its shared-memory segment.

The only thing I can think of is memory corruption that, by luck, does not show up with sm.

hjelmn commented 9 years ago

Even more suspicious. The line number of the crash is not the first read from either the fast box or the endpoint. This is not likely Vader's fault.

jsquyres commented 9 years ago

@opoplawski Can you post the program here that caused the problem?

opoplawski commented 9 years ago

Note that the stack trace is from one of the hang conditions, where each process spins in opal_condition_wait(). Other times it will trigger an MPI error. I'm afraid the test program is fairly involved. It's the nc_test4/tst_nc4perf.c (https://github.com/Unidata/netcdf-c/blob/master/nc_test4/tst_nc4perf.c) test program in netcdf 4.3.3.1, which makes use of HDF5 I/O - in this case it's hdf5 1.8.14. This is running on Fedora Rawhide, which is using gcc 5.0.0. So far I've only seen it on i686.

opoplawski commented 9 years ago

I should also note that I'm not entirely sure if this is a regression or not (or when it started). I've seen odd behavior for quite a while with netcdf's MPI tests.

hjelmn commented 9 years ago

Ah, ok. If the trace is from a hang rather than a crash, that makes more sense. I will take a look and see if I can figure out why that test is getting stuck.

hjelmn commented 9 years ago

Can't reproduce on master (same vader revision) on SLES11 with btl_vader_single_copy_mechanism set to either xpmem or none. tst_nc4perf runs to completion with 2, 4, and 10 ranks running on a single node. I used netcdf master with hdf5 1.8.14, gcc 4.8.2.

This could be a romio bug. The version in 1.8.4 lags behind trunk. Can you try running with -mca io ompio?

hjelmn commented 9 years ago

Another alternative would be to run with master.

opoplawski commented 9 years ago

That's worse:

$ mpirun -np 4 -mca io ompio ./openmpi/nc_test4/tst_nc4perf
*** Testing parallel IO for NASA...
num_proc MPI mode access cache (MB) grid size chunks avg. write time(s) avg. write bandwidth(MB/s) num_tries
*** stack smashing detected ***: /builddir/build/BUILD/netcdf-4.3.3.1/openmpi/nc_test4/.libs/tst_nc4perf terminated
[mock1:09190] *** Process received signal ***
[mock1:09190] Signal: Aborted (6)
[mock1:09190] Signal code: (-6)
======= Backtrace: =========
/lib/libc.so.6(+0x6e183)[0xf3ebd183] /lib/libc.so.6(fortify_fail+0x37)[0xf3f5dee7] [mock1:09190] /lib/libc.so.6(+0x10eea8)[0xf3f5dea8] /usr/lib/openmpi/lib/openmpi/mca_io_ompio.so([ 0] _fini+0x0)[0xf2a32654] /usr/lib/openmpi/lib/openmpi/mca_io_ompio.solinux-gate.so.1(kernel_rt_sigreturn+0x0)0xf7777cc0[0xf2a301a4] [mock1:09190] /usr/lib/openmpi/lib/libmpi.so.1(PMPI_File_set_size+0xa7)[0xf40c53a7] [ 1] linux-gate.so.1(__kernel_vsyscall+0x10)[0xf7777ce0] [mock1:09190] [ 2] /usr/lib/openmpi/lib/libhdf5.so.9(+0xd4b5f)[0xf4340b5f] /lib/libc.so.6(gsignal+0x46)[0xf3e86c36] [mock1:09190] [ 3] /usr/lib/openmpi/lib/libhdf5.so.9(H5FD_truncate+0x40)[0xf43374d0] /lib/libc.so.6(abort+0x145)[0xf3e884b5] [mock1:09190] [ 4] /usr/lib/openmpi/lib/libhdf5.so.9(H5F_dest+0x37a)[0xf432445a] /lib/libc.so.6(+0x6e188)[0xf3ec6188] [mock1:09190] /usr/lib/openmpi/lib/libhdf5.so.9(H5F_try_close+0x193)[0xf43255a3] [ 5] /lib/libc.so.6(__fortify_fail+0x37)[0xf3f66ee7] /usr/lib/openmpi/lib/libhdf5.so.9(H5F_close+0x3c)[0xf432590c[mock1:09190] ] [ 6] /usr/lib/openmpi/lib/libhdf5.so.9(H5I_dec_ref+0xb9)[0xf43aa299] /lib/libc.so.6(+0x10eea8)[0xf3f66ea8] [mock1:09190] /usr/lib/openmpi/lib/libhdf5.so.9(H5I_dec_app_ref[ 7] /usr/lib/openmpi/lib/openmpi/mca_io_ompio.so(_fini+0x0)[0xf2a39654] [mock1:09190] /builddir/build/BUILD/netcdf-4.3.3.1/openmpi/liblib/.libs/libnetcdf.so.7(+0x8ede7)[ 8] /usr/lib/openmpi/lib/openmpi/mca_io_ompio.so(+0x91a4)[0xf2a371a4] [mock1:09190] /builddir/build/BUILD/netcdf-4.3.3.1/openmpi/liblib/.libs/libnetcdf.so.7(nc_close+0x42)[0x[ 9] e9752000-e976e000 r-xp 00000000 08:03 5394869 /usr/lib/libgcc_s-5.0.0-20150226.so.1

opoplawski commented 9 years ago

(gdb) list *0x91a4
0x91a4 is at io_ompio_file_open.c:455.
450         }
451
452         ret = data->ompio_fh.f_fs->fs_file_set_size (&data->ompio_fh, size);
453
454         return ret;
455     }
456
457     int
458     mca_io_ompio_file_get_size (ompi_file_t *fh,
459                                 OMPI_MPI_OFFSET_TYPE *size)

edgargabriel commented 9 years ago

Is this test publicly available? I can have a look at it to see what is going on in ompio. I'm assuming this is master?

Thanks Edgar

edgargabriel commented 9 years ago

If this is the 1.8 series of Open MPI, the ompio module there does not have all the fixes necessary to pass the HDF5 test suite. That work was done last summer and includes too many changes to ompio for it to be feasible to backport to the 1.8 series.

opoplawski commented 9 years ago

@edgargabriel See earlier comments for what test and openmpi version this is.

Some more details:

The HDF5 call seems to be from ./hdf5-1.8.14/src/H5FDmpio.c:1091:

    if (MPI_SUCCESS != (mpi_code=MPI_File_set_size(fh, (MPI_Offset)0)))
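
If it helps to isolate the ompio failure from HDF5, the same call path can be exercised with a small standalone MPI program. The sketch below is only an illustration (the file name and error handling are made up), not part of the netcdf or HDF5 tests:

    /* truncate_test.c: minimal exercise of the MPI_File_set_size path,
     * mirroring what H5FD_truncate does. Compile with mpicc and run with
     * "-mca io ompio" or "-mca io romio" to compare the two components. */
    #include <mpi.h>
    #include <stdio.h>

    int main(int argc, char **argv)
    {
        MPI_File fh;
        int mpi_code;

        MPI_Init(&argc, &argv);
        MPI_File_open(MPI_COMM_WORLD, "setsize_test.dat",
                      MPI_MODE_CREATE | MPI_MODE_RDWR, MPI_INFO_NULL, &fh);

        /* Collective truncate to zero, as in H5FDmpio.c */
        mpi_code = MPI_File_set_size(fh, (MPI_Offset) 0);
        if (MPI_SUCCESS != mpi_code) {
            fprintf(stderr, "MPI_File_set_size failed: %d\n", mpi_code);
        }

        MPI_File_close(&fh);
        MPI_Finalize();
        return 0;
    }
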
opoplawski commented 9 years ago

Okay, so it's probably not worth worrying about the 1.8 ompio failure. It looks like the original issue may be fixed in master, but we have no idea what the fix may have been?

hjelmn commented 9 years ago

Can you verify that it indeed works for you with master? Just because I can't reproduce it doesn't mean it is fixed :). Once we know whether it is fixed, we can start the discussion about whether we should back-port the romio fixes from master to 1.8. The fix will likely be among those changes.

opoplawski commented 9 years ago

I'm trying to build the Fedora package with ompi master, but I'm running into issue #475.

hjelmn commented 9 years ago

Any luck with master?

opoplawski commented 9 years ago

Still compiling deps. Ran into issue #478 as well - disabled tests for now.

opoplawski commented 9 years ago

I can still reproduce the hang with the current dev snapshot. This may be triggered by gcc 5.0.0 as well.

hjelmn commented 9 years ago

That is my guess. I am trying to install a gcc 5 snapshot build to test this theory. Keep in mind that gcc 5.0 is still technically in beta so there is a good chance this is a gcc bug.

opoplawski commented 9 years ago

I've found another package that is entering a hang loop on Fedora rawhide i686: elpa. Haven't looked at it in detail yet, but this does seem to be a problem with more than just netcdf.

opoplawski commented 9 years ago

Yup, looks like the same cycle in opal_progress().

opoplawski commented 9 years ago

I've updated to 1.8.4-134-g9ad2aa8 and applied the atomic patch. It does not appear to affect this problem. I'm also seeing a (probably) similar failure on armv7hl.

hjelmn commented 9 years ago

Found the problem. Vader assumes atomic 64-bit loads/stores in the fast box code. With gcc 4.8 this doesn't seem to cause any issues, but with gcc 5.0 there is a data race between the process setting the fast box header and the process reading it. This causes the receiver to read an incomplete message, leading to a hang or a crash.

I am putting together a fix for the data race on master and will PR it to 1.8.
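
To illustrate the kind of race being described (a simplified sketch with made-up names, not the actual vader fast-box code or the eventual patch): on a 32-bit target the compiler may split a plain 64-bit store into two 32-bit stores, so a receiver polling the header can observe a half-written value. Publishing the header with a release store and reading it with an acquire load closes that window.

    #include <stdatomic.h>
    #include <stdint.h>
    #include <string.h>

    /* Illustrative fast-box-style mailbox in shared memory: one sender, one
     * receiver. Field names and sizes are invented for this sketch. */
    union fbox_hdr {
        uint64_t ival;                          /* header read/written as one 64-bit word */
        struct { uint32_t size; uint32_t tag; } fields;
    };

    struct fbox {
        _Atomic uint64_t hdr;                   /* published header word */
        unsigned char    data[256];             /* payload */
    };

    /* Sender: write the payload first, then publish the header. A plain
     * (non-atomic) 64-bit assignment of hdr on i686 may compile to two 32-bit
     * stores, which is exactly the torn header the receiver was hitting. */
    static void fbox_send(struct fbox *box, const void *buf,
                          uint32_t size, uint32_t tag)
    {
        union fbox_hdr hdr;

        memcpy(box->data, buf, size);
        hdr.fields.size = size;
        hdr.fields.tag  = tag;
        atomic_store_explicit(&box->hdr, hdr.ival, memory_order_release);
    }

    /* Receiver: acquire-load the header; either the complete header (and the
     * payload written before it) is visible, or nothing is. */
    static int fbox_recv(struct fbox *box, void *buf,
                         uint32_t *size, uint32_t *tag)
    {
        union fbox_hdr hdr;

        hdr.ival = atomic_load_explicit(&box->hdr, memory_order_acquire);
        if (0 == hdr.ival) {
            return 0;                           /* nothing posted yet */
        }
        *size = hdr.fields.size;
        *tag  = hdr.fields.tag;
        memcpy(buf, box->data, *size);
        atomic_store_explicit(&box->hdr, 0, memory_order_release);  /* mark the box free */
        return 1;
    }

The real fix presumably uses OPAL's own atomics rather than C11 stdatomic, but the ordering requirement is the same.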

hjelmn commented 9 years ago

@opoplawski #503 should fix the issue. I was able to reproduce crashes/hangs on Ubuntu 14.04 i386 with gcc 5.0-032215.

opoplawski commented 9 years ago

It does indeed appear to fix it for me, thanks! Now I just need to track down the armv7hl issue...

jsquyres commented 9 years ago

Closing, since @opoplawski confirms that it's fixed.