ofiwg / libfabric

Open Fabric Interfaces
http://libfabric.org/

prov/rxm: segfault during cleanup #6414

Closed: shefty closed this issue 3 years ago

shefty commented 3 years ago

To reproduce, run:

gdb --args fi_rdm_tagged_pingpong -p tcp 127.0.0.1

without the peer server running.

Program received signal SIGSEGV, Segmentation fault.
0x00007ffff7b0d059 in fi_close (fid=0x55555724eed0)
    at ./include/rdma/fabric.h:593
593             return fid->ops->close(fid);
(gdb) bt
#0  0x00007ffff7b0d059 in fi_close (fid=0x55555724eed0)
    at ./include/rdma/fabric.h:593
#1  0x00007ffff7b0ecf6 in rxm_conn_close (handle=0x555555794610)
    at prov/rxm/src/rxm_conn.c:328
#2  0x00007ffff7b12143 in rxm_conn_handle_notify (eq_entry=0x7fffffffdb58)
    at prov/rxm/src/rxm_conn.c:1177
#3  0x00007ffff7b126be in rxm_conn_handle_event (rxm_ep=0x555555790740, 
    entry=0x7fffffffdb10) at prov/rxm/src/rxm_conn.c:1255
#4  0x00007ffff7b103bf in rxm_msg_eq_progress (rxm_ep=0x555555790740)
    at prov/rxm/src/rxm_conn.c:668
#5  0x00007ffff7b106cb in rxm_cmap_connect (rxm_ep=0x555555790740, fi_addr=0, 
    handle=0x555555794610) at prov/rxm/src/rxm_conn.c:711
#6  0x00007ffff7b0e6a7 in rxm_get_conn (rxm_ep=0x555555790740, addr=0, 
    rxm_conn=0x7fffffffdca8) at prov/rxm/src/rxm_conn.c:196
#7  0x00007ffff7b1b216 in rxm_ep_tsend (ep_fid=0x555555790740, 
    buf=0x7fffd63a4010, len=16, desc=0x0, dest_addr=0, tag=0, 
    context=0x555555769700 <tx_ctx>) at prov/rxm/src/rxm_ep.c:2208
#8  0x00005555555576be in fi_tsend (ep=0x555555790740, buf=0x7fffd63a4010, 
    len=16, desc=0x0, dest_addr=0, tag=0, context=0x555555769700 <tx_ctx>)
    at /usr/local/include/rdma/fi_tagged.h:114
#9  0x000055555555d288 in ft_post_tx_buf (ep=0x555555790740, fi_addr=0, 
    size=16, data=0, ctx=0x555555769700 <tx_ctx>, op_buf=0x7fffd63a4010, 
    op_mr_desc=0x0, op_tag=0) at common/shared.c:1832
#10 0x000055555555d691 in ft_post_tx (ep=0x555555790740, fi_addr=0, size=16, 
    data=0, ctx=0x555555769700 <tx_ctx>) at common/shared.c:1854
#11 0x000055555555d700 in ft_tx (ep=0x555555790740, fi_addr=0, size=16, 
    ctx=0x555555769700 <tx_ctx>) at common/shared.c:1865
#12 0x000055555555af1d in ft_init_av_dst_addr (av_ptr=0x555555790600, 
    ep_ptr=0x555555790740, remote_addr=0x555555769020 <remote_fi_addr>)
    at common/shared.c:1278
#13 0x000055555555ac2c in ft_init_av () at common/shared.c:1219
#14 0x000055555555a23d in ft_init_fabric () at common/shared.c:1058
#15 0x0000555555555ba7 in run () at benchmarks/rdm_tagged_pingpong.c:43
#16 0x0000555555555ea2 in main (argc=4, argv=0x7fffffffe018)
    at benchmarks/rdm_tagged_pingpong.c:107
(gdb) up
#1  0x00007ffff7b0ecf6 in rxm_conn_close (handle=0x555555794610)
    at prov/rxm/src/rxm_conn.c:328
328             if (fi_close(&rxm_conn->msg_ep->fid))
(gdb) p *rxm_conn
$2 = {handle = {cmap = 0x555555791c50, state = RXM_CMAP_SHUTDOWN, 
    key = 1048577, remote_key = 0, fi_addr = 0, peer = 0x0}, 
  msg_ep = 0x55555724eed0, inject_pkt = 0x5555557946c0, 
  inject_data_pkt = 0x555555794750, tinject_pkt = 0x5555557947e0, 
  tinject_data_pkt = 0x555555794870, deferred_conn_entry = {
    next = 0x555555794668, prev = 0x555555794668}, deferred_tx_queue = {
    next = 0x555555794678, prev = 0x555555794678}, sar_rx_msg_list = {
    next = 0x555555794688, prev = 0x555555794688}, sar_deferred_rx_msg_list = {
    next = 0x555555794698, prev = 0x555555794698}, rndv_tx_credits = 0}
(gdb) p *rxm_conn->msg_ep
$3 = {fid = {fclass = 93824994844896, 
    context = 0x7ffff7a26ca0 <main_arena+96>, ops = 0x0}, ops = 0x0, 
  cm = 0x7ffff7dd2c00 <tcpx_cm_ops>, msg = 0x7ffff7dd2ba0 <tcpx_msg_ops>, 
  rma = 0x7ffff7dd2b40 <tcpx_rma_ops>, tagged = 0x0, atomic = 0x0, 
  collective = 0x0}
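
The dump shows fid.ops == 0x0 and fid.context pointing into glibc's main_arena, which is consistent with the msg_ep memory having already been freed by the time this close path dereferences it. As a minimal defensive sketch (a hypothetical helper built only on the public fi_close() API, not the actual rxm fix), one way to make a second cleanup pass harmless is to clear the cached endpoint pointer as soon as it is closed:

#include <stddef.h>
#include <rdma/fabric.h>
#include <rdma/fi_endpoint.h>

/* Hedged sketch, not the rxm code: close a cached msg endpoint at most
 * once by clearing the owner's pointer right after fi_close(), so a later
 * cleanup path (such as the RXM_CMAP_SHUTDOWN notify handler in the
 * backtrace) sees NULL instead of a dangling fid whose memory the
 * allocator has already recycled. */
static int close_msg_ep_once(struct fid_ep **msg_ep)
{
    int ret = 0;

    if (*msg_ep) {
        ret = fi_close(&(*msg_ep)->fid);
        *msg_ep = NULL;
    }
    return ret;
}
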
shefty commented 3 years ago

I've been able to 'move' the segfault to another area, with the same underlying issue.

(gdb) bt
#0  ofi_cq_readfrom (cq_fid=0x555555791020, buf=0x7fffffffda40, count=1, 
    src_addr=0x0) at prov/util/src/util_cq.c:260
#1  0x00007ffff7a77c68 in ofi_cq_read (cq_fid=0x555555791020, 
    buf=0x7fffffffda40, count=1) at prov/util/src/util_cq.c:314
#2  0x00007ffff7b0ca16 in fi_cq_read (cq=0x555555791020, buf=0x7fffffffda40, 
    count=1) at ./include/rdma/fi_eq.h:394
#3  0x00007ffff7b11128 in rxm_flush_msg_cq (rxm_ep=0x555555790740)
    at prov/rxm/src/rxm_conn.c:1144
#4  0x00007ffff7b11331 in rxm_conn_handle_notify (eq_entry=0x7fffffffdb48)
    at prov/rxm/src/rxm_conn.c:1183
#5  0x00007ffff7b1189d in rxm_conn_handle_event (rxm_ep=0x555555790740, 
    entry=0x7fffffffdb00) at prov/rxm/src/rxm_conn.c:1255
#6  0x00007ffff7b0f59e in rxm_msg_eq_progress (rxm_ep=0x555555790740)
    at prov/rxm/src/rxm_conn.c:668
#7  0x00007ffff7b0f8aa in rxm_cmap_connect (rxm_ep=0x555555790740, fi_addr=0, 
    handle=0x555555794610) at prov/rxm/src/rxm_conn.c:711
#8  0x00007ffff7b0d886 in rxm_get_conn (rxm_ep=0x555555790740, addr=0, 
    rxm_conn=0x7fffffffdc98) at prov/rxm/src/rxm_conn.c:196
#9  0x00007ffff7b1a552 in rxm_ep_tsend (ep_fid=0x555555790740, 
    buf=0x7fffd63a3010, len=16, desc=0x0, dest_addr=0, tag=0, 
    context=0x555555769700 <tx_ctx>) at prov/rxm/src/rxm_ep.c:2208
#10 0x00005555555576be in fi_tsend (ep=0x555555790740, buf=0x7fffd63a3010, 
    len=16, desc=0x0, dest_addr=0, tag=0, context=0x555555769700 <tx_ctx>)
    at /usr/local/include/rdma/fi_tagged.h:114
#11 0x000055555555d288 in ft_post_tx_buf (ep=0x555555790740, fi_addr=0, 
    size=16, data=0, ctx=0x555555769700 <tx_ctx>, op_buf=0x7fffd63a3010, 
    op_mr_desc=0x0, op_tag=0) at common/shared.c:1832
#12 0x000055555555d691 in ft_post_tx (ep=0x555555790740, fi_addr=0, size=16, 
    data=0, ctx=0x555555769700 <tx_ctx>) at common/shared.c:1854
#13 0x000055555555d700 in ft_tx (ep=0x555555790740, fi_addr=0, size=16, 
    ctx=0x555555769700 <tx_ctx>) at common/shared.c:1865
#14 0x000055555555af1d in ft_init_av_dst_addr (av_ptr=0x555555790600, 
    ep_ptr=0x555555790740, remote_addr=0x555555769020 <remote_fi_addr>)
    at common/shared.c:1278
#15 0x000055555555ac2c in ft_init_av () at common/shared.c:1219
#16 0x000055555555a23d in ft_init_fabric () at common/shared.c:1058
#17 0x0000555555555ba7 in run () at benchmarks/rdm_tagged_pingpong.c:43
#18 0x0000555555555ea2 in main (argc=4, argv=0x7fffffffe008)
    at benchmarks/rdm_tagged_pingpong.c:107

The problem is that tcp is reporting a zeroed-out CQ entry. The problem does not occur when rxm uses a shared rx context; it shows up when rxm instead posts receives directly to the msg ep.
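
To make the symptom easy to spot, here is a small diagnostic sketch (a hypothetical helper using only the public fi_cq_read() API, not the internal path in rxm_flush_msg_cq) that drains a CQ one entry at a time and flags a completion that comes back fully zeroed:

#include <stdio.h>
#include <rdma/fabric.h>
#include <rdma/fi_eq.h>
#include <rdma/fi_errno.h>

/* Diagnostic sketch only: read completions until the CQ is empty and
 * report any entry whose fields are all zero, like the one consumed by
 * the flush path in the backtrace above. */
static void drain_and_check(struct fid_cq *cq)
{
    struct fi_cq_data_entry comp;
    ssize_t ret;

    for (;;) {
        ret = fi_cq_read(cq, &comp, 1);
        if (ret == -FI_EAGAIN)
            break;                          /* queue drained */
        if (ret < 0) {
            fprintf(stderr, "fi_cq_read failed: %zd\n", ret);
            break;
        }
        if (!comp.op_context && !comp.flags && !comp.len)
            fprintf(stderr, "suspicious zeroed CQ entry\n");
    }
}
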

(gdb) p *cq->cirq
$6 = {size = 262144, size_mask = 262143, rcnt = 2064, wcnt = 264208, 
  buf = 0x7fffb57a2030}

I think the msg ep fails to connect, which results in tearing down the ep and flushing the posted receives to the CQ as completion entries. This process repeats several times until the CQ ends up corrupted somehow. Note that in the dump above wcnt - rcnt = 264208 - 2064 = 262144, which is exactly the queue size, so the circular queue reports itself completely full. It's possible there's an error in the CQ or circular queue logic that handles overflow.
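
For reference, a generic sketch of a counter-based, power-of-two circular queue with the same shape as the cirq dump above (an illustration under those assumptions, not the libfabric ofi_cirque code). It shows the full check a writer has to respect; skipping or mishandling it when the queue is exactly full is one way the reader could end up consuming an overwritten or never-written (zeroed) slot:

#include <stddef.h>
#include <stdint.h>

struct entry {
    uint64_t data[8];
};

/* Free-running rcnt/wcnt counters; the occupied count is wcnt - rcnt and
 * the buffer index is the counter masked by size - 1. */
struct cirq {
    size_t size;        /* power of two */
    size_t size_mask;   /* size - 1 */
    size_t rcnt;        /* total entries consumed */
    size_t wcnt;        /* total entries produced */
    struct entry *buf;
};

static int cirq_full(const struct cirq *q)
{
    return q->wcnt - q->rcnt >= q->size;
}

static int cirq_insert(struct cirq *q, const struct entry *e)
{
    if (cirq_full(q))
        return -1;      /* caller must drain or drop instead */
    q->buf[q->wcnt & q->size_mask] = *e;
    q->wcnt++;
    return 0;
}
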

shefty commented 3 years ago

Fixed in master.