ofiwg / libfabric

Open Fabric Interfaces
http://libfabric.org/

BUG: unsafe CXI <-> gdrcopy cleanup interactions #10041

Open tylerjereddy opened 3 months ago

tylerjereddy commented 3 months ago

While investigating https://github.com/NVIDIA/gdrcopy/issues/296#issuecomment-2111473932 I instrumented the libfabric dev-cxi branch from @thomasgillis as follows:

```diff
diff --git a/prov/cxi/src/cxip_atomic.c b/prov/cxi/src/cxip_atomic.c
index c71a762f0..3522c42d0 100644
--- a/prov/cxi/src/cxip_atomic.c
+++ b/prov/cxi/src/cxip_atomic.c
@@ -371,6 +371,7 @@ static int _cxip_amo_cb(struct cxip_req *req, const union c_event *event)
 	int ret_err;
 	int success_event = (req->flags & FI_COMPLETION);
 	struct cxip_txc *txc = req->amo.txc;
+	printf("[%d, %d] _cxip_amo_cb() checkpoint 1 req->amo=%p\n", getpid(), gettid(), req->amo);
 
 	/* When errors happen, send events can occur before the put/get event.
 	 * These events should just be dropped.
@@ -420,13 +421,23 @@ static int _cxip_amo_cb(struct cxip_req *req, const union c_event *event)
 	}
 
 	if (req->amo.result_md)
+	{
+		printf("[%d, %d] _cxip_amo_cb() checkpoint 2 before unmap req->amo.result_md=%p\n", getpid(), gettid(), req->amo.result_md);
 		cxip_unmap(req->amo.result_md);
+	}
 
 	if (req->amo.oper1_md)
+	{
+		printf("[%d, %d] _cxip_amo_cb() checkpoint 3 before unmap req->amo.oper1_md=%p\n", getpid(), gettid(), req->amo.oper1_md);
 		cxip_unmap(req->amo.oper1_md);
+	}
 
 	if (req->amo.ibuf)
+	{
+		printf("[%d, %d] _cxip_amo_cb() checkpoint 4 before free req->amo.ibuf=%p\n", getpid(), gettid(), req->amo.ibuf);
 		cxip_txc_ibuf_free(txc, req->amo.ibuf);
+		printf("[%d, %d] _cxip_amo_cb() checkpoint 4 after free\n", getpid(), gettid());
+	}
 
 	req->flags &= (FI_ATOMIC | FI_READ | FI_WRITE);
 
@@ -450,6 +461,7 @@ static int _cxip_amo_cb(struct cxip_req *req, const union c_event *event)
 	ofi_atomic_dec32(&req->amo.txc->otx_reqs);
 	cxip_evtq_req_free(req);
+	printf("[%d, %d] _cxip_amo_cb() checkpoint 5 before return\n", getpid(), gettid());
 
 	return FI_SUCCESS;
 }
diff --git a/prov/cxi/src/cxip_iomm.c b/prov/cxi/src/cxip_iomm.c
index f6116f39b..11ad2e7ce 100644
--- a/prov/cxi/src/cxip_iomm.c
+++ b/prov/cxi/src/cxip_iomm.c
@@ -5,6 +5,7 @@
  */
 #include "cxip.h"
+#include
 
 #define CXIP_DBG(...) _CXIP_DBG(FI_LOG_MR, __VA_ARGS__)
 #define CXIP_WARN(...) _CXIP_WARN(FI_LOG_MR, __VA_ARGS__)
@@ -586,15 +587,23 @@ static void cxip_unmap_cache(struct cxip_md *md)
 static void cxip_unmap_nocache(struct cxip_md *md)
 {
 	int ret;
+	printf("[%d, %d] cxip_unmap_nocache() checkpoint 1\n", getpid(), gettid());
 
 	if (md->handle_valid)
+	{
+		printf("[%d, %d] cxip_unmap_nocache() checkpoint 2 before unregister\n", getpid(), gettid());
 		ofi_hmem_dev_unregister(md->info.iface, md->handle);
+	}
 
+	printf("[%d, %d] cxip_unmap_nocache() checkpoint 3 before unmap\n", getpid(), gettid());
 	ret = cxil_unmap(md->md);
+	printf("[%d, %d] cxip_unmap_nocache() checkpoint 4 after unmap\n", getpid(), gettid());
 	if (ret)
 		CXIP_WARN("cxil_unmap failed: %d\n", ret);
 
+	printf("[%d, %d] cxip_unmap_nocache() checkpoint 5 before free\n", getpid(), gettid());
 	free(md);
+	printf("[%d, %d] cxip_unmap_nocache() checkpoint 6 after free\n", getpid(), gettid());
 }
 
 /*
diff --git a/prov/cxi/src/cxip_txc.c b/prov/cxi/src/cxip_txc.c
index a15ed8ee6..2d203f63c 100644
--- a/prov/cxi/src/cxip_txc.c
+++ b/prov/cxi/src/cxip_txc.c
@@ -11,6 +11,7 @@
 
 #include
 #include
+#include
 
 #include "cxip.h"
 
@@ -50,7 +51,9 @@ void *cxip_txc_ibuf_alloc(struct cxip_txc *txc)
  */
 void cxip_txc_ibuf_free(struct cxip_txc *txc, void *ibuf)
 {
+	printf("[%d, %d] cxip_txc_ibuf_free() checkpoint 1 before freeing ibuf=%p\n", getpid(), gettid(), ibuf);
 	ofi_buf_free(ibuf);
+	printf("[%d, %d] cxip_txc_ibuf_free() checkpoint 2 after freeing ibuf=%p\n", getpid(), gettid(), ibuf);
 	CXIP_DBG("Freed inject buffer: %p\n", ibuf);
 }
 
diff --git a/src/hmem_cuda_gdrcopy.c b/src/hmem_cuda_gdrcopy.c
index e013777df..8eb66b1d2 100644
--- a/src/hmem_cuda_gdrcopy.c
+++ b/src/hmem_cuda_gdrcopy.c
@@ -43,6 +43,7 @@
 #include
 #include
+#include
 
 struct gdrcopy_handle {
 	gdr_mh_t mh; /* memory handler */
@@ -354,10 +355,13 @@ int cuda_gdrcopy_dev_unregister(uint64_t handle)
 	gdrcopy = (struct gdrcopy_handle *)handle;
 	assert(gdrcopy);
 
+	printf("[%d, %d] cuda_gdrcopy_dev_unregister() checkpoint 1 before spin lock gdrcopy->mh=%p\n", getpid(), gettid(), gdrcopy->mh);
 	pthread_spin_lock(&global_gdr_lock);
+	printf("[%d, %d] cuda_gdrcopy_dev_unregister() checkpoint 2 after spin lock and before unmap gdrcopy->mh=%p\n", getpid(), gettid(), gdrcopy->mh);
 	err = global_gdrcopy_ops.gdr_unmap(global_gdr, gdrcopy->mh,
 					   gdrcopy->user_ptr, gdrcopy->length);
+	printf("[%d, %d] cuda_gdrcopy_dev_unregister() checkpoint 3 after unmap gdrcopy->mh=%p\n", getpid(), gettid(), gdrcopy->mh);
 	if (err) {
 		FI_WARN(&core_prov, FI_LOG_CORE,
 			"gdr_unmap failed! error: %s\n",
@@ -365,7 +369,9 @@ int cuda_gdrcopy_dev_unregister(uint64_t handle)
 		goto exit;
 	}
 
+	printf("[%d, %d] cuda_gdrcopy_dev_unregister() checkpoint 4 before unpin_buffer gdrcopy->mh=%p\n", getpid(), gettid(), gdrcopy->mh);
 	err = global_gdrcopy_ops.gdr_unpin_buffer(global_gdr, gdrcopy->mh);
+	printf("[%d, %d] cuda_gdrcopy_dev_unregister() checkpoint 4 after unpin_buffer gdrcopy->mh=%p\n", getpid(), gettid(), gdrcopy->mh);
 	if (err) {
 		FI_WARN(&core_prov, FI_LOG_MR,
 			"gdr_unmap failed! error: %s\n",
```

In a sample 2-node cuFFTMp run on a Slingshot 11 machine, I recorded the output: out12.txt

In particular, I was looking for evidence that libfabric was trying to free the same memory address multiple times, one level above where the failure was surfacing in gdrcopy. Checking the log with, for example, grep -E -i -n "0x7e1ddb0" out12.txt, it looks like this is indeed happening:

2492:===> [nid001252, 112673, 112673] GDRCopy Checkpoint gdr_pin_buffer: 3: mh=0x7e1ddb0
2493:===> [nid001252, 112673, 112673] GDRCopy Checkpoint gdr_pin_buffer: 4: mh=0x7e1ddb0, ret=0
2494:===> [nid001252, 112673, 112673] GDRCopy Checkpoint gdr_map: 1: mh=0x7e1ddb0
2508:===> [nid001252, 112673, 112673] GDRCopy Checkpoint gdr_map: 2: mh=0x7e1ddb0, ret=0
2511:[112673, 112723] cuda_gdrcopy_dev_unregister() checkpoint 1 before spin lock gdrcopy->mh=0x7e1ddb0
2512:[112673, 112723] cuda_gdrcopy_dev_unregister() checkpoint 2 after spin lock and before unmap gdrcopy->mh=0x7e1ddb0
2513:===> [nid001252, 112673, 112723] GDRCopy Checkpoint gdr_unmap: 1: mh=0x7e1ddb0
2514:===> [nid001252, 112673, 112723] GDRCopy Checkpoint gdr_unmap: 3: mh=0x7e1ddb0
2525:===> [nid001252, 112673, 112723] GDRCopy Checkpoint gdr_unmap: 5: mh=0x7e1ddb0
2526:===> [nid001252, 112673, 112723] GDRCopy Checkpoint gdr_unmap: 6: mh=0x7e1ddb0, ret=0
2527:[112673, 112723] cuda_gdrcopy_dev_unregister() checkpoint 3 after unmap gdrcopy->mh=0x7e1ddb0
2528:[112673, 112723] cuda_gdrcopy_dev_unregister() checkpoint 4 before unpin_buffer gdrcopy->mh=0x7e1ddb0
2535:[112673, 112673] cuda_gdrcopy_dev_unregister() checkpoint 1 before spin lock gdrcopy->mh=0x7e1ddb0
2550:===> [nid001252, 112673, 112723] GDRCopy Checkpoint _gdr_unpin_buffer: 1: mh=0x7e1ddb0
2551:===> [nid001252, 112673, 112723] GDRCopy Checkpoint _gdr_unpin_buffer: 2: mh=0x7e1ddb0
2553:[112673, 112723] cuda_gdrcopy_dev_unregister() checkpoint 4 after unpin_buffer gdrcopy->mh=0x7e1ddb0
2567:[112673, 112673] cuda_gdrcopy_dev_unregister() checkpoint 2 after spin lock and before unmap gdrcopy->mh=0x7e1ddb0
2568:===> [nid001252, 112673, 112673] GDRCopy Checkpoint gdr_unmap: 1: mh=0x7e1ddb0
2569:===> [nid001252, 112673, 112673] GDRCopy Checkpoint gdr_unmap: 2: mh=0x7e1ddb0
2584:===> [nid001252, 112673, 112673] GDRCopy Checkpoint gdr_unmap: 6: mh=0x7e1ddb0, ret=22
2585:[112673, 112673] cuda_gdrcopy_dev_unregister() checkpoint 3 after unmap gdrcopy->mh=0x7e1ddb0

The cuda_gdrcopy_dev_unregister() function in src/hmem_cuda_gdrcopy.c is getting called with the same handle memory address in the same process, but from a different thread. In another trial, the spin lock in cuda_gdrcopy_dev_unregister() appeared to behave slightly better, but there was ultimately still a crash, with an attempt to free the already-unmapped address:

2721 [15358, 15358] cuda_gdrcopy_dev_unregister() checkpoint 2 after spin lock and before unmap gdrcopy->mh=(nil)
2722 ===> [nid001252, 15358, 15358] GDRCopy Checkpoint gdr_unmap: 1: mh=(nil)

In cuda_gdrcopy_dev_unregister(), after pthread_spin_lock(&global_gdr_lock) returns, doesn't the code need to confirm that another thread hasn't already unmapped and unpinned the handle before proceeding? I started experimenting with that a bit:

```diff
diff --git a/src/hmem_cuda_gdrcopy.c b/src/hmem_cuda_gdrcopy.c
index e013777df..4b2d54dee 100644
--- a/src/hmem_cuda_gdrcopy.c
+++ b/src/hmem_cuda_gdrcopy.c
@@ -43,6 +43,10 @@
 #include
 #include
+#include
+#include
+#include
+
 struct gdrcopy_handle {
 	gdr_mh_t mh; /* memory handler */
@@ -51,6 +55,19 @@ struct gdrcopy_handle {
 	size_t length; /* page aligned length */
 };
 
+typedef enum gdr_mapping_type {
+	GDR_MAPPING_TYPE_NONE = 0,
+	GDR_MAPPING_TYPE_WC = 1,
+	GDR_MAPPING_TYPE_CACHING = 2,
+	GDR_MAPPING_TYPE_DEVICE = 3
+} gdr_mapping_type_t;
+
+typedef struct gdr_memh_t {
+	uint32_t handle;
+	LIST_ENTRY(gdr_memh_t) entries;
+	gdr_mapping_type_t mapping_type;
+} gdr_memh_t;
+
 struct gdrcopy_ops {
 	gdr_t (*gdr_open)();
 	int (*gdr_close)(gdr_t g);
@@ -72,6 +89,10 @@ enum gdrcopy_dir {
 };
 
 static gdr_t global_gdr;
+static gdr_memh_t *to_memh(gdr_mh_t mh) {
+	return (gdr_memh_t *)mh.h;
+}
+
 static pthread_spinlock_t global_gdr_lock;
 
 #if ENABLE_GDRCOPY_DLOPEN
@@ -354,10 +375,15 @@ int cuda_gdrcopy_dev_unregister(uint64_t handle)
 	gdrcopy = (struct gdrcopy_handle *)handle;
 	assert(gdrcopy);
 
+	printf("[%d, %d] cuda_gdrcopy_dev_unregister() checkpoint 1 before spin lock gdrcopy->mh=%p\n", getpid(), gettid(), gdrcopy->mh);
 	pthread_spin_lock(&global_gdr_lock);
+	if (to_memh(gdrcopy->mh)->mapping_type != 0)
+	{
+	printf("[%d, %d] cuda_gdrcopy_dev_unregister() checkpoint 2 after spin lock and before unmap gdrcopy->mh=%p\n", getpid(), gettid(), gdrcopy->mh);
 	err = global_gdrcopy_ops.gdr_unmap(global_gdr, gdrcopy->mh,
 					   gdrcopy->user_ptr, gdrcopy->length);
+	printf("[%d, %d] cuda_gdrcopy_dev_unregister() checkpoint 3 after unmap gdrcopy->mh=%p\n", getpid(), gettid(), gdrcopy->mh);
 	if (err) {
 		FI_WARN(&core_prov, FI_LOG_CORE,
 			"gdr_unmap failed! error: %s\n",
@@ -365,13 +391,17 @@ int cuda_gdrcopy_dev_unregister(uint64_t handle)
 		goto exit;
 	}
 
+	printf("[%d, %d] cuda_gdrcopy_dev_unregister() checkpoint 4 before unpin_buffer gdrcopy->mh=%p\n", getpid(), gettid(), gdrcopy->mh);
 	err = global_gdrcopy_ops.gdr_unpin_buffer(global_gdr, gdrcopy->mh);
+	printf("[%d, %d] cuda_gdrcopy_dev_unregister() checkpoint 4 after unpin_buffer gdrcopy->mh=%p\n", getpid(), gettid(), gdrcopy->mh);
 	if (err) {
 		FI_WARN(&core_prov, FI_LOG_MR,
 			"gdr_unmap failed! error: %s\n",
 			strerror(err));
 		goto exit;
 	}
+	}
+	else {pthread_spin_unlock(&global_gdr_lock);}
 
 exit:
 	pthread_spin_unlock(&global_gdr_lock);
```

It still wasn't enough to get my cuFFTMp example running, but looking at the output I no longer see evidence of a second thread traversing through gdr_unmap with the same address that was unpinned by the first thread. Is it plausible that this and other issues exist in this code, or am I way off base?
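For comparison, here is a minimal sketch of the same "check validity under the lock" idea, restructured so the spin lock is released exactly once on every path (in the experiment above, the else branch unlocks and then falls through to the exit: label, which unlocks a second time). handle_still_mapped() is a hypothetical placeholder for whatever validity test is actually appropriate, not an existing libfabric or gdrcopy function; the other names come from the diff above:

```c
/* Sketch only: handle_still_mapped() is a hypothetical placeholder for the
 * validity test; everything else reuses names from hmem_cuda_gdrcopy.c as
 * quoted in the diff above.
 */
int cuda_gdrcopy_dev_unregister(uint64_t handle)
{
	struct gdrcopy_handle *gdrcopy = (struct gdrcopy_handle *)handle;
	int err = 0;

	assert(gdrcopy);

	pthread_spin_lock(&global_gdr_lock);

	if (!handle_still_mapped(gdrcopy)) {
		/* Another thread already unmapped and unpinned this handle. */
		goto exit;
	}

	err = global_gdrcopy_ops.gdr_unmap(global_gdr, gdrcopy->mh,
					   gdrcopy->user_ptr, gdrcopy->length);
	if (err) {
		FI_WARN(&core_prov, FI_LOG_CORE,
			"gdr_unmap failed! error: %s\n", strerror(err));
		goto exit;
	}

	err = global_gdrcopy_ops.gdr_unpin_buffer(global_gdr, gdrcopy->mh);
	if (err) {
		FI_WARN(&core_prov, FI_LOG_MR,
			"gdr_unpin_buffer failed! error: %s\n", strerror(err));
		goto exit;
	}

exit:
	/* The lock is released exactly once, on every path. */
	pthread_spin_unlock(&global_gdr_lock);
	return err;
}
```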

tylerjereddy commented 3 months ago

cc @jswaro

jswaro commented 4 weeks ago

@tylerjereddy : Hi tyler, I missed this notification. I'll look into this.

jswaro commented 4 weeks ago

```
===> [nid001252, 112670, 112670] GDRCopy Checkpoint gdr_pin_buffer: 3: mh=0x7e1ce90
===> [nid001252, 112670, 112670] GDRCopy Checkpoint gdr_pin_buffer: 4: mh=0x7e1ce90, ret=0
===> [nid001252, 112670, 112670] GDRCopy Checkpoint gdr_map: 1: mh=0x7e1ce90
===> [nid001252, 112670, 112670] GDRCopy Checkpoint gdr_map: 2: mh=0x7e1ce90, ret=0
[112670, 112725] cxip_unmap_nocache() checkpoint 1
[112670, 112725] cxip_unmap_nocache() checkpoint 2 before unregister
[112670, 112670] cxip_unmap_nocache() checkpoint 1
[112670, 112670] cxip_unmap_nocache() checkpoint 2 before unregister
[112670, 112725] cuda_gdrcopy_dev_unregister() checkpoint 1 before spin lock gdrcopy->mh=0x7e1ce90
[112670, 112725] cuda_gdrcopy_dev_unregister() checkpoint 2 after spin lock and before unmap gdrcopy->mh=0x7e1ce90
===> [nid001252, 112670, 112725] GDRCopy Checkpoint gdr_unmap: 1: mh=0x7e1ce90
[112670, 112670] cuda_gdrcopy_dev_unregister() checkpoint 1 before spin lock gdrcopy->mh=0x7e1ce90
===> [nid001252, 112670, 112725] GDRCopy Checkpoint gdr_unmap: 3: mh=0x7e1ce90
===> [nid001252, 112670, 112725] GDRCopy Checkpoint gdr_unmap: 5: mh=0x7e1ce90
===> [nid001252, 112670, 112725] GDRCopy Checkpoint gdr_unmap: 6: mh=0x7e1ce90, ret=0
[112670, 112725] cuda_gdrcopy_dev_unregister() checkpoint 3 after unmap gdrcopy->mh=0x7e1ce90
[112670, 112725] cuda_gdrcopy_dev_unregister() checkpoint 4 before unpin_buffer gdrcopy->mh=0x7e1ce90
===> [nid001252, 112670, 112725] GDRCopy Checkpoint _gdr_unpin_buffer: 1: mh=0x7e1ce90
===> [nid001252, 112670, 112725] GDRCopy Checkpoint _gdr_unpin_buffer: 2: mh=0x7e1ce90
===> [nid001252, 112670, 112725] GDRCopy Checkpoint _gdr_unpin_buffer: 3
[112670, 112725] cuda_gdrcopy_dev_unregister() checkpoint 4 after unpin_buffer gdrcopy->mh=0x7e1ce90
[112670, 112725] cxip_unmap_nocache() checkpoint 3 before unmap
[112670, 112670] cuda_gdrcopy_dev_unregister() checkpoint 2 after spin lock and before unmap gdrcopy->mh=(nil)
===> [nid001252, 112670, 112670] GDRCopy Checkpoint gdr_unmap: 1: mh=(nil)
[112670, 112725] cxip_unmap_nocache() checkpoint 4 after unmap
[112670, 112725] cxip_unmap_nocache() checkpoint 5 before free
[112670, 112725] cxip_unmap_nocache() checkpoint 6 after free
```

So, I looked at a single thread fault, and tracked it back to two separate threads.

In the log above, what seems to have occurred is the following sequence of events:

In PID=112670, there are at least two threads: TID=112670 and TID=112725.

  1. TID=112670 registers the memory.
  2. TID=112725 begins to unmap a region, expected to be mh=0x7e1ce90.
  3. TID=112670, same as #2.
  4. TID=112725 beats TID=112670 to the lock, as evidenced by checkpoint 2, and starts to unmap the memory handle with the GDRCopy library (CP 1 in GDRCopy).
  5. TID=112670 hits the lock and waits for it.
  6. TID=112725 continues to unmap and unpin the buffer, hits CP3 and CP4 in libfabric, releases the lock, frees the handle, and continues on in parallel. As the local memory descriptor is freed and put back onto the list, it is probably cleared, or set to NULL at the beginning of a new transaction that pulls the newly freed descriptor off the list.
  7. TID=112670 acquires the lock and starts to unmap the region, but it has already been unmapped and the backing pointer has been modified.

Crash. The MH is NULL, which triggers the issue in the library below. Something in libfabric likely zeroed the gdrcopy struct after the operation; the zeroing likely happened while TID=112670 was waiting to acquire the global lock.
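For what it's worth, that general failure mode can be illustrated outside of libfabric with a toy destroy-under-lock example (none of these names are real libfabric or gdrcopy structures): the spin lock serializes the two callers, but it does not make destruction idempotent, so the loser observes a handle that has already been torn down and recycled, analogous to gdr_unmap being handed mh=(nil):

```c
/* Toy illustration only; none of these names are real libfabric or gdrcopy
 * structures. Build with: cc -pthread toy_double_unregister.c
 */
#include <pthread.h>
#include <stdio.h>
#include <string.h>

struct toy_handle {
	void *mh;	/* stands in for gdrcopy->mh */
};

static pthread_spinlock_t lock;
static struct toy_handle handle;

static void *destroy(void *name)
{
	pthread_spin_lock(&lock);
	/* No "already destroyed?" check here, mirroring the current path. */
	printf("%s: unmapping mh=%p%s\n", (const char *)name, handle.mh,
	       handle.mh ? "" : "   <-- stale handle; the real gdr_unmap fails here");
	/* The winner tears the handle down and its storage gets recycled
	 * (in libfabric: free()/buffer-pool reuse), leaving it zeroed for
	 * whoever was still waiting on the lock with a stale pointer. */
	memset(&handle, 0, sizeof(handle));
	pthread_spin_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_spin_init(&lock, PTHREAD_PROCESS_PRIVATE);
	handle.mh = (void *)0x7e1ddb0;	/* pretend gdr handle address */

	pthread_create(&a, NULL, destroy, "thread A");
	pthread_create(&b, NULL, destroy, "thread B");
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}
```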

If both threads are trying to unmap the same region, then we need one of the following solutions:

  1. reference counting on the gdrcopy object, to ensure that it isn't unmapped and freed until all references are gone, or
  2. a check in the code to verify that the handle is still valid; if it is invalid, release the lock and bail.
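A rough sketch of option 1, using plain C11 atomics for illustration (in-tree, libfabric's own ofi_atomic helpers would be the natural fit); the field and helper names below are made up, not existing libfabric API:

```c
/* Sketch of option 1 (reference counting). The refcnt field and the
 * helpers are illustrative only, not existing libfabric API.
 */
#include <stdatomic.h>
#include <stdbool.h>

struct gdr_handle_ref {
	void *mh;		/* stands in for the pinned/mapped gdrcopy state */
	atomic_int refcnt;	/* set to 1 when the region is registered */
};

/* Each holder of the handle (e.g. each memory descriptor that may later
 * trigger an unregister) takes a reference. */
static void gdr_handle_get(struct gdr_handle_ref *h)
{
	atomic_fetch_add(&h->refcnt, 1);
}

/* Returns true for exactly one caller: whoever drops the last reference,
 * and therefore owns the gdr_unmap/gdr_unpin_buffer/free of the handle. */
static bool gdr_handle_put(struct gdr_handle_ref *h)
{
	return atomic_fetch_sub(&h->refcnt, 1) == 1;
}
```

The unregister path would then only perform the unmap/unpin when the put returns true, so a second thread holding the same handle can never race into gdr_unmap with a stale pointer.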

@tylerjereddy Nice log capture. Checkpoints helped quite a bit.

jswaro commented 4 weeks ago

@j-xiong : Do you see this happening in general with other providers using GDRCOPY? Seems like it could be unique to CXI's RMA operation tracking, but it seems worth asking.

j-xiong commented 4 weeks ago

@jswaro I haven't seen this with other providers yet. While I can't say a similar thing won't happen with other providers, the sequence of events presented here is very much tied to the cxi provider.

iziemba commented 4 weeks ago

@tylerjereddy : A couple of questions:

  1. Is this application using the MR cache?
  2. Is this application using fi_mr_reg() and sharing the MR handle with multiple threads?

The CXI provider should have a 1-to-1 mapping between CXI memory region/descriptor and GDR handle. At face value, it seems like the same CXI memory region is being freed by two separate threads.

tylerjereddy commented 3 weeks ago

@iziemba @jswaro Do you both have access to Cray Slingshot 11 (>= 2 nodes) hardware? Probably the easiest way to debug would be to run the cuFFTMp multi-node example yourselves, per my instructions at the top of https://github.com/NVIDIA/gdrcopy/issues/296. If you don't have access to the hardware, I was hoping LANL HPC would help interface a bit; in the meantime it may be possible for me to do an "interactive" debug session where I set up a reproducer and run it on two nodes for you to see, add prints, and so on.

The problem is ultimately that we're trying to build out a toolchain that performs multi-node cuFFTMp on Cray Slingshot 11 hardware so that we can do physics simulations with, e.g., GROMACS, but this toolchain is quite fragile and often doesn't complain when component versions mismatch, other than by segfaulting.

You've got NVSHMEM and gdrcopy from NVIDIA, libfabric from open source, and the CXI support still living on a custom branch, so this ecosystem still needs some work before a build-out has a reasonable chance of "just working." For my trainees with little software training, it is effectively intractable.

The original report was for x86_64, but we now also want to do these builds on Grace Hopper (aarch64), so it would be most helpful if we could check build-outs on both architectures and have clear instructions for the builds (which versions of NVSHMEM, libfabric, gdrcopy, and so on should work together). Even better if it could happen via, e.g., spack install .. style packaged installs, though that seems farther off for now.

jswaro commented 3 weeks ago

Yes, one or the other of us has access to the hardware needed to reproduce this. We'll see if it can be reproduced internally following the instructions in the ticket above.