openucx / ucx

Unified Communication X (mailing list - https://elist.ornl.gov/mailman/listinfo/ucx-group)
http://www.openucx.org
Other
1.11k stars 417 forks source link

undefined sigaction() symbol #7340

Open carns opened 3 years ago

carns commented 3 years ago

Describe the bug

UCX 1.10.1 produces the following error at runtime on the Summit system at the OLCF following the recent upgrade to RHEL8:

/ccs/home/carns/working/install/bin/margo-p2p-bw: symbol lookup error: /autofs/nccs-svm1_home1/carns/working/src/spack/opt/spack/linux-rhel8-power9le/gcc-9.1.0/ucx-1.10.1-dzinsouik3mdgjnqjiqum7ypq5nf723x/lib/libucs.so.0: undefined symbol: sigaction

Steps to Reproduce

This can be reproduced with gcc (9.1 or 11.1), spack origin/develop as of September 1, 2021, and UCX version 1.10.1 (the default version supplied by the spack package). UCX, and programs linked against it, compile and link fine. The symbol error is produced at run time.

I was able to work around it by quickly hacking the UCX code as follows:

diff --git a/src/ucs/debug/debug.c b/src/ucs/debug/debug.c
index c1db45ba3..33172978b 100644
--- a/src/ucs/debug/debug.c
+++ b/src/ucs/debug/debug.c
@@ -1080,34 +1080,6 @@ void ucs_handle_error(const char *message)
     }
 }

-static int ucs_debug_is_error_signal(int signum)
-{
-    khiter_t hash_it;
-    int result;
-
-    if (!ucs_global_opts.handle_errors) {
-        return 0;
-    }
-
-    /* If this signal is error, but was disabled. */
-    ucs_recursive_spin_lock(&ucs_kh_lock);
-    hash_it = kh_get(ucs_signal_orig_action, &ucs_signal_orig_action_map, signum);
-    result = (hash_it != kh_end(&ucs_signal_orig_action_map));
-    ucs_recursive_spin_unlock(&ucs_kh_lock);
-    return result;
-}
-
-static void* ucs_debug_get_orig_func(const char *symbol, void *replacement)
-{
-    void *func_ptr;
-
-    func_ptr = dlsym(RTLD_NEXT, symbol);
-    if (func_ptr == NULL) {
-        func_ptr = dlsym(RTLD_DEFAULT, symbol);
-    }
-    return func_ptr;
-}
-
 #if !HAVE_SIGHANDLER_T
 #if HAVE___SIGHANDLER_T
 typedef __sighandler_t *sighandler_t;
@@ -1115,45 +1087,6 @@ typedef __sighandler_t *sighandler_t;
 #error "Port me"
 #endif
 #endif
-sighandler_t signal(int signum, sighandler_t handler)
-{
-    typedef sighandler_t (*sighandler_func_t)(int, sighandler_t);
-
-    static sighandler_func_t orig = NULL;
-
-    if (ucs_debug_initialized && ucs_debug_is_error_signal(signum)) {
-        return SIG_DFL;
-    }
-
-    if (orig == NULL) {
-        orig = (sighandler_func_t)ucs_debug_get_orig_func("signal", signal);
-    }
-
-    return orig(signum, handler);
-}
-
-static int orig_sigaction(int signum, const struct sigaction *act,
-                          struct sigaction *oact)
-{
-    typedef int (*sigaction_func_t)(int, const struct sigaction*, struct sigaction*);
-
-    static sigaction_func_t orig = NULL;
-
-    if (orig == NULL) {
-        orig = (sigaction_func_t)ucs_debug_get_orig_func("sigaction", sigaction);
-    }
-
-    return orig(signum, act, oact);
-}
-
-int sigaction(int signum, const struct sigaction *act, struct sigaction *oact)
-{
-    if (ucs_debug_initialized && ucs_debug_is_error_signal(signum)) {
-        return orig_sigaction(signum, NULL, oact); /* Return old, do not set new */
-    }
-
-    return orig_sigaction(signum, act, oact);
-}

 static void ucs_debug_signal_handler(int signo)
 {
@@ -1233,7 +1166,7 @@ static void ucs_set_signal_handler(void (*handler)(int, siginfo_t*, void *))
     sigemptyset(&sigact.sa_mask);

     for (i = 0; i < ucs_global_opts.error_signals.count; ++i) {
-        ret = orig_sigaction(ucs_global_opts.error_signals.signals[i], &sigact,
+        ret = sigaction(ucs_global_opts.error_signals.signals[i], &sigact,
                              &old_action);
         if (ret < 0) {
             ucs_warn("failed to set signal handler for sig %d : %m",
@@ -1323,7 +1256,7 @@ void ucs_debug_init()
         memset(&sigact, 0, sizeof(sigact));
         memset(&old_action, 0, sizeof(old_action));
         sigact.sa_handler = ucs_debug_signal_handler;
-        orig_sigaction(ucs_global_opts.debug_signo, &sigact, &old_action);
+        sigaction(ucs_global_opts.debug_signo, &sigact, &old_action);
         ucs_debug_save_original_sighandler(ucs_global_opts.debug_signo, &old_action);
     }

@@ -1370,7 +1303,7 @@ static inline void ucs_debug_disable_signal_nolock(int signum)
     }

     original_action = kh_val(&ucs_signal_orig_action_map, hash_it);
-    ret = orig_sigaction(signum, original_action, &ucs_action);
+    ret = sigaction(signum, original_action, &ucs_action);
     if (ret < 0) {
         ucs_warn("failed to set signal handler for sig %d : %m", signum);
     }

For some reason the interception (and subsequent dlsym() lookup) of sigaction isn't working in my environment. Possibly a problem with the link order? This did not happen on Summit prior to the recent RHEL8 upgrade, though it is possible that other coincidental changes were a factor as well.

The above patch is not a proper solution; I just did enough to confirm that the problem was related to interception/dlsym of the sigaction/signal function and then execute some benchmarks that don't depend on functionality provided by that interception.

eisenhauer commented 2 years ago

A little bit more information about this problem, which I am also encountering.

First, the root of the issue seems to be the failure of dlsym(RTLD_NEXT, symbol) in ucs_debug_get_orig_func(). Rather than failing quietly and returning NULL, it is generating the "symbol lookup error:" output and calling exit(). It looks like that is because it's not the system dlsym(), but instead some wrapper from libpami_cudahook.so:

Breakpoint 1, 0x00007ffff7f6596c in dlsym () from /sw/summit/spack-envs/base/opt/linux-rhel8-ppc64le/gcc-9.1.0/spectrum-mpi-10.4.0.3-20210112-6jbupg3thjwhsabgevk6xmwhd2bbyxdc/container/../lib/libpami_cudahook.so
Missing separate debuginfos, use: yum debuginfo-install libevent-2.1.8-5.el8.ppc64le libgcc-8.3.1-5.el8.ppc64le libstdc++-8.3.1-5.el8.ppc64le numactl-libs-2.0.12-9.el8.ppc64le openssl-libs-1.1.1c-15.el8.ppc64le ucx-1.8.0-1.49224.ppc64le zlib-1.2.11-16.el8_2.ppc64le
(gdb) where

#0  0x00007ffff7f6596c in dlsym ()
   from /sw/summit/spack-envs/base/opt/linux-rhel8-ppc64le/gcc-9.1.0/spectrum-mpi-10.4.0.3-20210112-6jbupg3thjwhsabgevk6xmwhd2bbyxdc/container/../lib/libpami_cudahook.so
#1  0x00007ffff6fdedc8 in ucs_debug_get_orig_func.isra ()
   from /usr/lib64/libucs.so.0
#2  0x00007ffff6fdeea4 in orig_sigaction () from /usr/lib64/libucs.so.0
#3  0x00007ffff6fe1f94 in ucs_debug_init () from /usr/lib64/libucs.so.0
#4  0x00007ffff6fca11c in ucs_init () from /usr/lib64/libucs.so.0
#5  0x00007ffff7fc4d30 in call_init () from /lib64/ld64.so.2
#6  0x00007ffff7fc4e8c in _dl_init () from /lib64/ld64.so.2
#7  0x00007ffff7fb14ac in _dl_start_user () from /lib64/ld64.so.2
(gdb)

According to https://github.com/flux-framework/flux-core-v0.11/issues/11, this is by design and part of the Spectrum MPI implementation on Summit: "Without getting into too much detail, this is an ugly optimization technique that IBM used to allow their MPI to be able to send buffers allocated by CUDA memory allocation routines. The interception of the CUDA driver calls was achieved by wrapping dlsym in, libpami_cudahook.so, that is preloaded to each MPI process. But this has had lots, lots of issues, least of which was compatibility with both performance and debugging tools."

IMHO, the upshot is that the installed UCX in /usr/lib64 on Summit is broken and unusable in MPI programs. I've gotten around this by building a personal UCX 1.8.0, changing the dlsym RTLD_NEXT to use RTLD_GLOBAL (which seems to work) and using this via LD_LIBRARY_PATH. I'll submit this as a support issue to OLCF.