openucx / ucx

Unified Communication X (mailing list - https://elist.ornl.gov/mailman/listinfo/ucx-group)
http://www.openucx.org
Other
1.15k stars 427 forks source link

ucp_context.c:1100 UCX WARN transport 'ud' is not available #8808

Open PurvangL opened 1 year ago

PurvangL commented 1 year ago

I have an 8 x A100 PCIe server, on which I have installed UCX with the following arguments.

./configure --prefix=/usr --enable-optimizations=1 --disable-logging --disable-debug --disable-assertions --disable-params-check --enable-mt=1 --with-cuda=/usr/local/cuda --disable-doxygen-doc --with-sysroot --with-avx=yes --with-mcpu=yes --with-march=yes --with-rc=yes --with-ud=yes --with-dc=yes --with-mlx5-dv=yes --with-ib-hw-tm=yes --with-dm=yes --with-devx=yes --with-xpmem=yes --with-iodemo-cuda=yes

and during the installation of Open MPI, I use

./configure --prefix=/usr --with-ucx=/usr/ --build=x86_64-linux-gnu --includedir=/usr/include --mandir=/usr/share/man --infodir=/usr/share/info --sysconfdir=/etc --localstatedir=/var --disable-silent-rules --libexecdir=/usr/lib --disable-maintainer-mode --disable-dependency-tracking --prefix=/usr/mpi/gcc/openmpi-4.1.4 --with-platform=contrib/platform/mellanox/optimized

But running the NCCL test with the installed MPI, as shown below, gives the warning: ucp_context.c:1100 UCX WARN transport 'ud' is not available, please use one or more of: cma, cuda, cuda_copy, cuda_ipc, mm, posix, self, shm, sm, sysv, tcp

export var_UCX_NET_DEVICES=ibs8f0:1; export var_NCCL_IB_HCA="ibs8f0"; time mpirun --allow-run-as-root --mca pml ucx --bind-to numa   -x NCCL_DEBUG=WARN   -x NCCL_IB_SL=0   -x NCCL_IB_TC=41   -x NCCL_IB_QPS_PER_CONNECTION=4   -x UCX_TLS=ud,self,sm   -x UCX_NET_DEVICES=${var_UCX_NET_DEVICES}   -x HCOLL_ENABLE_MCAST_ALL=0   -x coll_hcoll_enable=0   -x NCCL_IB_GID_INDEX=3   -x NCCL_ALGO=Ring   -x NCCL_IB_HCA="${var_NCCL_IB_HCA}" --np 8 -H localhost:8 -N 8 ./build/all_reduce_perf -b 1G -e 10G -i $((1024*1024*1024*9))

My question is: why can't my system detect the ud protocol even though I specified it in the UCX installation?

I want to enable the missing protocols from the following list, but I don't see them among the available ones. How can I enable or install them?

cma, dc, dc_mlx5, dc_x, ib, mm, posix, rc, rc_mlx5, rc_v, rc_verbs, rc_x, self, shm, sm, sysv, tcp, ud, ud_mlx5, ud_v, ud_verbs, ud_x

Thank you

yosefe commented 1 year ago
  1. Can you please remove the setting of "var_UCX_NET_DEVICES"?
  2. What is the output of ucx_info -bdv and ibv_devinfo?
PurvangL commented 1 year ago

sure.

ucx_info -bdv

# Library version: 1.15.0
# Library path: /lib/libucs.so.0
# API headers version: 1.15.0
# Git branch 'master', revision 89f3299
# Configured with: --disable-logging --disable-debug --disable-assertions --disable-params-check --prefix=/usr --enable-optimizations=1 --disable-logging --disable-debug --disable-assertions --disable-params-check --enable-mt=1 --with-cuda=/usr/local/cuda --disable-doxygen-doc --with-sysroot --with-avx=yes --with-mcpu=yes --with-march=yes --with-rc=yes --with-ud=yes --with-dc=yes --with-mlx5-dv=yes --with-ib-hw-tm=yes --with-dm=yes --with-devx=yes --with-xpmem=yes --with-iodemo-cuda=yes
#define UCX_CONFIG_H              
#define ENABLE_BUILTIN_MEMCPY     1
#define ENABLE_DEBUG_DATA         0
#define ENABLE_MT                 0
#define ENABLE_PARAMS_CHECK       0
#define HAVE_ALLOCA               1
#define HAVE_ALLOCA_H             1
#define HAVE_ATTRIBUTE_NOOPTIMIZE 1
#define HAVE_CLEARENV             1
#define HAVE_CPU_SET_T            1
#define HAVE_CUDA                 1
#define HAVE_CUDA_H               1
#define HAVE_CUDA_RUNTIME_H       1
#define HAVE_DECL_ASPRINTF        1
#define HAVE_DECL_BASENAME        1
#define HAVE_DECL_CPU_ISSET       1
#define HAVE_DECL_CPU_ZERO        1
#define HAVE_DECL_ETHTOOL_CMD_SPEED 1
#define HAVE_DECL_FMEMOPEN        1
#define HAVE_DECL_FUSE_MOUNT      0
#define HAVE_DECL_FUSE_OPEN_CHANNEL 0
#define HAVE_DECL_FUSE_UNMOUNT    0
#define HAVE_DECL_F_SETOWN_EX     1
#define HAVE_DECL_GETAUXVAL       1
#define HAVE_DECL_IBV_CREATE_SRQ  0
#define HAVE_DECL_IBV_EVENT_TYPE_STR 0
#define HAVE_DECL_IBV_GET_ASYNC_EVENT 0
#define HAVE_DECL_IBV_GET_DEVICE_NAME 0
#define HAVE_DECL_IBV_QUERY_GID   0
#define HAVE_DECL_IBV_WC_STATUS_STR 0
#define HAVE_DECL_INOTIFY_ADD_WATCH 1
#define HAVE_DECL_INOTIFY_INIT    1
#define HAVE_DECL_IN_ATTRIB       1
#define HAVE_DECL_IPPROTO_TCP     1
#define HAVE_DECL_MADV_FREE       1
#define HAVE_DECL_MADV_REMOVE     1
#define HAVE_DECL_POSIX_MADV_DONTNEED 1
#define HAVE_DECL_PR_SET_PTRACER  1
#define HAVE_DECL_SOL_SOCKET      1
#define HAVE_DECL_SO_KEEPALIVE    1
#define HAVE_DECL_SPEED_UNKNOWN   1
#define HAVE_DECL_STRERROR_R      1
#define HAVE_DECL_SYS_BRK         1
#define HAVE_DECL_SYS_IPC         0
#define HAVE_DECL_SYS_MADVISE     1
#define HAVE_DECL_SYS_MMAP        1
#define HAVE_DECL_SYS_MREMAP      1
#define HAVE_DECL_SYS_MUNMAP      1
#define HAVE_DECL_SYS_SHMAT       1
#define HAVE_DECL_SYS_SHMDT       1
#define HAVE_DECL_TCP_KEEPCNT     1
#define HAVE_DECL_TCP_KEEPIDLE    1
#define HAVE_DECL_TCP_KEEPINTVL   1
#define HAVE_DECL___PPC_GET_TIMEBASE 0
#define HAVE_DECL___PPC_GET_TIMEBASE_FREQ 0
#define HAVE_DLFCN_H              1
#define HAVE_HW_TIMER             1
#define HAVE_IN6_ADDR_S6_ADDR32   1
#define HAVE_INOTIFY              1
#define HAVE_INTTYPES_H           1
#define HAVE_IP_IP_DST            1
#define HAVE_LIBGEN_H             1
#define HAVE_LIBRT                1
#define HAVE_LINUX_FUTEX_H        1
#define HAVE_LINUX_IP_H           1
#define HAVE_LINUX_MMAN_H         1
#define HAVE_MALLOC_H             1
#define HAVE_MALLOC_HOOK          1
#define HAVE_MALLOC_TRIM          1
#define HAVE_MEMALIGN             1
#define HAVE_MEMORY_H             1
#define HAVE_MREMAP               1
#define HAVE_NETINET_IP_H         1
#define HAVE_NET_ETHERNET_H       1
#define HAVE_NVML_H               1
#define HAVE_POSIX_MEMALIGN       1
#define HAVE_SCHED_GETAFFINITY    1
#define HAVE_SCHED_SETAFFINITY    1
#define HAVE_SIGACTION_SA_RESTORER 1
#define HAVE_SIGEVENT_SIGEV_UN_TID 1
#define HAVE_SIGHANDLER_T         1
#define HAVE_STDINT_H             1
#define HAVE_STDLIB_H             1
#define HAVE_STRERROR_R           1
#define HAVE_STRINGS_H            1
#define HAVE_STRING_H             1
#define HAVE_STRUCT_DL_PHDR_INFO  1
#define HAVE_SYS_EPOLL_H          1
#define HAVE_SYS_EVENTFD_H        1
#define HAVE_SYS_STAT_H           1
#define HAVE_SYS_TYPES_H          1
#define HAVE_SYS_UIO_H            1
#define HAVE_UCM_PTMALLOC286      1
#define HAVE_UNISTD_H             1
#define HAVE___CLEAR_CACHE        1
#define HAVE___CURBRK             1
#define HAVE___SIGHANDLER_T       1
#define LT_OBJDIR                 ".libs/"
#define NVALGRIND                 1
#define PACKAGE                   "ucx"
#define PACKAGE_BUGREPORT         ""
#define PACKAGE_NAME              "ucx"
#define PACKAGE_STRING            "ucx 1.15"
#define PACKAGE_TARNAME           "ucx"
#define PACKAGE_URL               ""
#define PACKAGE_VERSION           "1.15"
#define STDC_HEADERS              1
#define STRERROR_R_CHAR_P         1
#define UCM_BISTRO_HOOKS          1
#define UCS_MAX_LOG_LEVEL         UCS_LOG_LEVEL_DEBUG
#define UCT_TCP_EP_KEEPALIVE      1
#define UCT_UD_EP_DEBUG_HOOKS     0
#define UCX_CONFIGURE_FLAGS       "--disable-logging --disable-debug --disable-assertions --disable-params-check --prefix=/usr --enable-optimizations=1 --disable-logging --disable-debug --disable-assertions --disable-params-check --enable-mt=1 --with-cuda=/usr/local/cuda --disable-doxygen-doc --with-sysroot --with-avx=yes --with-mcpu=yes --with-march=yes --with-rc=yes --with-ud=yes --with-dc=yes --with-mlx5-dv=yes --with-ib-hw-tm=yes --with-dm=yes --with-devx=yes --with-xpmem=yes --with-iodemo-cuda=yes"
#define UCX_MODULE_SUBDIR         "ucx"
#define VERSION                   "1.15"
#define WITH_IODEMO_CUDA          1
#define restrict                  __restrict
#define test_MODULES              ":module"
#define ucm_MODULES               ":cuda"
#define ucs_MODULES               ""
#define uct_MODULES               ":cuda:cma"
#define uct_cuda_MODULES          ""
#define uct_ib_MODULES            ""
#define uct_rocm_MODULES          ""
#define ucx_perftest_MODULES      ":cuda"
#
# Memory domain: self
#     Component: self
#             register: unlimited, cost: 0 nsec
#           remote key: 0 bytes
#         memory types: host (access,reg,cache)
#
#      Transport: self
#         Device: memory
#           Type: loopback
#  System device: <unknown>
#
#      capabilities:
#            bandwidth: 0.00/ppn + 6911.00 MB/sec
#              latency: 0 nsec
#             overhead: 10 nsec
#            put_short: <= 4294967295
#            put_bcopy: unlimited
#            get_bcopy: unlimited
#             am_short: <= 8K
#             am_bcopy: <= 8K
#               domain: cpu
#           atomic_add: 32, 64 bit
#           atomic_and: 32, 64 bit
#            atomic_or: 32, 64 bit
#           atomic_xor: 32, 64 bit
#          atomic_fadd: 32, 64 bit
#          atomic_fand: 32, 64 bit
#           atomic_for: 32, 64 bit
#          atomic_fxor: 32, 64 bit
#          atomic_swap: 32, 64 bit
#         atomic_cswap: 32, 64 bit
#           connection: to iface
#      device priority: 0
#     device num paths: 1
#              max eps: inf
#       device address: 0 bytes
#        iface address: 8 bytes
#       error handling: ep_check
#
#
# Memory domain: tcp
#     Component: tcp
#             register: unlimited, cost: 0 nsec
#           remote key: 0 bytes
#         memory types: host (access,reg,cache)
#
#      Transport: tcp
#         Device: ibs8f0
#           Type: network
#  System device: ibs8f0 (0)
#
#      capabilities:
#            bandwidth: 11142.51/ppn + 0.00 MB/sec
#              latency: 5206 nsec
#             overhead: 50000 nsec
#            put_zcopy: <= 18446744073709551590, up to 6 iov
#  put_opt_zcopy_align: <= 1
#        put_align_mtu: <= 0
#             am_short: <= 8K
#             am_bcopy: <= 8K
#             am_zcopy: <= 64K, up to 6 iov
#   am_opt_zcopy_align: <= 1
#         am_align_mtu: <= 0
#            am header: <= 8037
#           connection: to ep, to iface
#      device priority: 1
#     device num paths: 1
#              max eps: 256
#       device address: 6 bytes
#        iface address: 2 bytes
#           ep address: 10 bytes
#       error handling: peer failure, ep_check, keepalive
#
#      Transport: tcp
#         Device: ens8f1
#           Type: network
#  System device: ens8f1 (1)
#
#      capabilities:
#            bandwidth: 11818.05/ppn + 0.00 MB/sec
#              latency: 5206 nsec
#             overhead: 50000 nsec
#            put_zcopy: <= 18446744073709551590, up to 6 iov
#  put_opt_zcopy_align: <= 1
#        put_align_mtu: <= 0
#             am_short: <= 8K
#             am_bcopy: <= 8K
#             am_zcopy: <= 64K, up to 6 iov
#   am_opt_zcopy_align: <= 1
#         am_align_mtu: <= 0
#            am header: <= 8037
#           connection: to ep, to iface
#      device priority: 0
#     device num paths: 1
#              max eps: 256
#       device address: 6 bytes
#        iface address: 2 bytes
#           ep address: 10 bytes
#       error handling: peer failure, ep_check, keepalive
#
#      Transport: tcp
#         Device: lo
#           Type: network
#  System device: <unknown>
#
#      capabilities:
#            bandwidth: 11.91/ppn + 0.00 MB/sec
#              latency: 10960 nsec
#             overhead: 50000 nsec
#            put_zcopy: <= 18446744073709551590, up to 6 iov
#  put_opt_zcopy_align: <= 1
#        put_align_mtu: <= 0
#             am_short: <= 8K
#             am_bcopy: <= 8K
#             am_zcopy: <= 64K, up to 6 iov
#   am_opt_zcopy_align: <= 1
#         am_align_mtu: <= 0
#            am header: <= 8037
#           connection: to ep, to iface
#      device priority: 1
#     device num paths: 1
#              max eps: 256
#       device address: 18 bytes
#        iface address: 2 bytes
#           ep address: 10 bytes
#       error handling: peer failure, ep_check, keepalive
#
#
# Connection manager: tcp
#      max_conn_priv: 2064 bytes
#
# Memory domain: sysv
#     Component: sysv
#             allocate: unlimited
#           remote key: 12 bytes
#           rkey_ptr is supported
#         memory types: host (access,alloc,cache)
#
#      Transport: sysv
#         Device: memory
#           Type: intra-node
#  System device: <unknown>
#
#      capabilities:
#            bandwidth: 0.00/ppn + 15360.00 MB/sec
#              latency: 80 nsec
#             overhead: 10 nsec
#            put_short: <= 4294967295
#            put_bcopy: unlimited
#            get_bcopy: unlimited
#             am_short: <= 100
#             am_bcopy: <= 8256
#               domain: cpu
#           atomic_add: 32, 64 bit
#           atomic_and: 32, 64 bit
#            atomic_or: 32, 64 bit
#           atomic_xor: 32, 64 bit
#          atomic_fadd: 32, 64 bit
#          atomic_fand: 32, 64 bit
#           atomic_for: 32, 64 bit
#          atomic_fxor: 32, 64 bit
#          atomic_swap: 32, 64 bit
#         atomic_cswap: 32, 64 bit
#           connection: to iface
#      device priority: 0
#     device num paths: 1
#              max eps: inf
#       device address: 8 bytes
#        iface address: 8 bytes
#       error handling: ep_check
#
#
# Memory domain: posix
#     Component: posix
#             allocate: <= 264085196K
#           remote key: 32 bytes
#           rkey_ptr is supported
#         memory types: host (access,alloc,cache)
#
#      Transport: posix
#         Device: memory
#           Type: intra-node
#  System device: <unknown>
#
#      capabilities:
#            bandwidth: 0.00/ppn + 15360.00 MB/sec
#              latency: 80 nsec
#             overhead: 10 nsec
#            put_short: <= 4294967295
#            put_bcopy: unlimited
#            get_bcopy: unlimited
#             am_short: <= 100
#             am_bcopy: <= 8256
#               domain: cpu
#           atomic_add: 32, 64 bit
#           atomic_and: 32, 64 bit
#            atomic_or: 32, 64 bit
#           atomic_xor: 32, 64 bit
#          atomic_fadd: 32, 64 bit
#          atomic_fand: 32, 64 bit
#           atomic_for: 32, 64 bit
#          atomic_fxor: 32, 64 bit
#          atomic_swap: 32, 64 bit
#         atomic_cswap: 32, 64 bit
#           connection: to iface
#      device priority: 0
#     device num paths: 1
#              max eps: inf
#       device address: 8 bytes
#        iface address: 16 bytes
#       error handling: ep_check
#
#
# Memory domain: cuda_cpy
#     Component: cuda_cpy
#             allocate: unlimited
#             register: unlimited, cost: 0 nsec
#         memory types: host (reg), cuda (access,alloc,reg,cache,detect), cuda-managed (access,alloc,reg,cache,detect)
#
#      Transport: cuda_copy
#         Device: cuda
#           Type: accelerator
#  System device: <unknown>
#
#      capabilities:
#            bandwidth: 10000.00/ppn + 0.00 MB/sec
#              latency: 8000 nsec
#             overhead: 0 nsec
#            put_short: <= 4294967295
#            put_zcopy: unlimited, up to 1 iov
#  put_opt_zcopy_align: <= 1
#        put_align_mtu: <= 1
#            get_short: <= 4294967295
#            get_zcopy: unlimited, up to 1 iov
#  get_opt_zcopy_align: <= 1
#        get_align_mtu: <= 1
#           connection: to iface
#      device priority: 0
#     device num paths: 1
#              max eps: inf
#       device address: 0 bytes
#        iface address: 8 bytes
#       error handling: none
#
#
# Memory domain: cuda_ipc
#     Component: cuda_ipc
#             register: unlimited, cost: 0 nsec
#           remote key: 112 bytes
#           memory invalidation is supported
#         memory types: cuda (access,reg,cache)
#
#      Transport: cuda_ipc
#         Device: cuda
#           Type: intra-node
#  System device: <unknown>
#
#      capabilities:
#            bandwidth: 300000.00/ppn + 0.00 MB/sec
#              latency: 1000 nsec
#             overhead: 7000 nsec
#            put_zcopy: unlimited, up to 1 iov
#  put_opt_zcopy_align: <= 1
#        put_align_mtu: <= 1
#            get_zcopy: <= 0, up to 1 iov
#  get_opt_zcopy_align: <= 1
#        get_align_mtu: <= 1
#           connection: to iface
#      device priority: 0
#     device num paths: 1
#              max eps: inf
#       device address: 8 bytes
#        iface address: 4 bytes
#       error handling: peer failure, ep_check
#
#
# Memory domain: cma
#     Component: cma
#             register: unlimited, cost: 9 nsec
#         memory types: host (access,reg,cache)
#
#      Transport: cma
#         Device: memory
#           Type: intra-node
#  System device: <unknown>
#
#      capabilities:
#            bandwidth: 0.00/ppn + 11145.00 MB/sec
#              latency: 80 nsec
#             overhead: 2000 nsec
#            put_zcopy: unlimited, up to 16 iov
#  put_opt_zcopy_align: <= 1
#        put_align_mtu: <= 1
#            get_zcopy: unlimited, up to 16 iov
#  get_opt_zcopy_align: <= 1
#        get_align_mtu: <= 1
#           connection: to iface
#      device priority: 0
#     device num paths: 1
#              max eps: inf
#       device address: 8 bytes
#        iface address: 16 bytes
#       error handling: peer failure, ep_check
#
ibv_devinfo

hca_id: mlx5_0
    transport:          InfiniBand (0)
    fw_ver:             20.31.2354
    node_guid:          0c42:a103:0002:40c2
    sys_image_guid:         0c42:a103:0002:40c2
    vendor_id:          0x02c9
    vendor_part_id:         4123
    hw_ver:             0x0
    board_id:           MT_0000000224
    phys_port_cnt:          1
        port:   1
            state:          PORT_ACTIVE (4)
            max_mtu:        4096 (5)
            active_mtu:     4096 (5)
            sm_lid:         21
            port_lid:       10
            port_lmc:       0x00
            link_layer:     InfiniBand

hca_id: mlx5_1
    transport:          InfiniBand (0)
    fw_ver:             20.31.2354
    node_guid:          0c42:a103:0002:40c3
    sys_image_guid:         0c42:a103:0002:40c2
    vendor_id:          0x02c9
    vendor_part_id:         4123
    hw_ver:             0x0
    board_id:           MT_0000000224
    phys_port_cnt:          1
        port:   1
            state:          PORT_ACTIVE (4)
            max_mtu:        4096 (5)
            active_mtu:     4096 (5)
            sm_lid:         0
            port_lid:       0
            port_lmc:       0x00
            link_layer:     Ethernet
yosefe commented 1 year ago

It seems UCX was built without Verbs support. Perhaps the rdma-core development libraries were not installed on the build system.

PurvangL commented 1 year ago

Thank you. I am running in Docker and found out that ibv_devices wasn't showing any output.

So I installed rdma-core, and now its output is as follows.

ibv_devices

device                 node GUID
------              ----------------
mlx5_0              0c42a103000240c2
mlx5_1              0c42a103000240c3

while running nccl-test, I get following warnings:

ucp_context.c:1100 UCX  WARN  network device 'mlx5_0:1' is not available, please use one or more of: 'ens8f1'(tcp), 'ibs8f0'(tcp), 'lo'(tcp)
UCX  WARN  transport 'ud' is not available, please use one or more of: cma, cuda, cuda_copy, cuda_ipc, mm, posix, self, shm, sm, sysv, tcp

I still see the warning about 'ud' not being available. What other libraries are needed?

yosefe commented 1 year ago

@PurvangL you need to make sure /usr/include/infiniband/verbs.h is installed as well, then reconfigure and rebuild UCX.

PurvangL commented 1 year ago

I see. I am trying to run in a Docker container, where the mentioned file is not present, though I do see verbs.h on the host system. Any suggestions on how I can achieve this in a Docker container?

As part of the process, I installed Open MPI along with UCX on the host and tried to mount the Open MPI binaries folder into the Docker container, but then it starts complaining as below:

image

Also ompi_info output

                Built by: 
                Built on: Thu Jan 12 16:52:14 UTC 2023
              Built host: smc-gpu-01
              C bindings: yes
            C++ bindings: yes
             Fort mpif.h: no
            Fort use mpi: no
       Fort use mpi size: deprecated-ompi-info-value
        Fort use mpi_f08: no
 Fort mpi_f08 compliance: The mpi_f08 module was not built
  Fort mpi_f08 subarrays: no
           Java bindings: no
  Wrapper compiler rpath: runpath
              C compiler: gcc
     C compiler absolute: /usr/bin/gcc
  C compiler family name: GNU
      C compiler version: 9.4.0
            C++ compiler: g++
   C++ compiler absolute: /usr/bin/g++
           Fort compiler: none
       Fort compiler abs: none
         Fort ignore TKR: no
   Fort 08 assumed shape: no
      Fort optional args: no
          Fort INTERFACE: no
    Fort ISO_FORTRAN_ENV: no
       Fort STORAGE_SIZE: no
      Fort BIND(C) (all): no
      Fort ISO_C_BINDING: no
 Fort SUBROUTINE BIND(C): no
       Fort TYPE,BIND(C): no
 Fort T,BIND(C,name="a"): no
            Fort PRIVATE: no
          Fort PROTECTED: no
           Fort ABSTRACT: no
       Fort ASYNCHRONOUS: no
          Fort PROCEDURE: no
         Fort USE...ONLY: no
           Fort C_FUNLOC: no
 Fort f08 using wrappers: no
         Fort MPI_SIZEOF: no
             C profiling: yes
           C++ profiling: yes
   Fort mpif.h profiling: no
  Fort use mpi profiling: no
   Fort use mpi_f08 prof: no
          C++ exceptions: no
          Thread support: posix (MPI_THREAD_MULTIPLE: yes, OPAL support: yes, OMPI progress: no, ORTE progress: yes, Event lib: yes)
           Sparse Groups: no
  Internal debug support: no
  MPI interface warnings: yes
     MPI parameter check: never
Memory profiling support: no
Memory debugging support: no
              dl support: yes
   Heterogeneous support: no
 mpirun default --prefix: yes
       MPI_WTIME support: native
     Symbol vis. support: yes
   Host topology support: yes
            IPv6 support: yes
      MPI1 compatibility: no
          MPI extensions: affinity, cuda, pcollreq
   FT Checkpoint support: no (checkpoint thread: no)
   C/R Enabled Debugging: no
  MPI_MAX_PROCESSOR_NAME: 256
    MPI_MAX_ERROR_STRING: 256
     MPI_MAX_OBJECT_NAME: 64
        MPI_MAX_INFO_KEY: 36
        MPI_MAX_INFO_VAL: 256
       MPI_MAX_PORT_NAME: 1024
  MPI_MAX_DATAREP_STRING: 128
           MCA allocator: basic (MCA v2.1.0, API v2.0.0, Component v4.1.4)
           MCA allocator: bucket (MCA v2.1.0, API v2.0.0, Component v4.1.4)
           MCA backtrace: execinfo (MCA v2.1.0, API v2.0.0, Component v4.1.4)
                 MCA btl: tcp (MCA v2.1.0, API v3.1.0, Component v4.1.4)
                 MCA btl: self (MCA v2.1.0, API v3.1.0, Component v4.1.4)
                 MCA btl: vader (MCA v2.1.0, API v3.1.0, Component v4.1.4)
                 MCA btl: smcuda (MCA v2.1.0, API v3.1.0, Component v4.1.4)
            MCA compress: bzip (MCA v2.1.0, API v2.0.0, Component v4.1.4)
            MCA compress: gzip (MCA v2.1.0, API v2.0.0, Component v4.1.4)
                 MCA crs: none (MCA v2.1.0, API v2.0.0, Component v4.1.4)
                  MCA dl: dlopen (MCA v2.1.0, API v1.0.0, Component v4.1.4)
               MCA event: external (MCA v2.1.0, API v2.0.0, Component v4.1.4)
               MCA hwloc: external (MCA v2.1.0, API v2.0.0, Component v4.1.4)
                  MCA if: linux_ipv6 (MCA v2.1.0, API v2.0.0, Component v4.1.4)
                  MCA if: posix_ipv4 (MCA v2.1.0, API v2.0.0, Component v4.1.4)
         MCA installdirs: env (MCA v2.1.0, API v2.0.0, Component v4.1.4)
         MCA installdirs: config (MCA v2.1.0, API v2.0.0, Component v4.1.4)
              MCA memory: patcher (MCA v2.1.0, API v2.0.0, Component v4.1.4)
               MCA mpool: hugepage (MCA v2.1.0, API v3.0.0, Component v4.1.4)
             MCA patcher: overwrite (MCA v2.1.0, API v1.0.0, Component v4.1.4)
                MCA pmix: isolated (MCA v2.1.0, API v2.0.0, Component v4.1.4)
                MCA pmix: flux (MCA v2.1.0, API v2.0.0, Component v4.1.4)
                MCA pmix: pmix3x (MCA v2.1.0, API v2.0.0, Component v4.1.4)
               MCA pstat: linux (MCA v2.1.0, API v2.0.0, Component v4.1.4)
              MCA rcache: grdma (MCA v2.1.0, API v3.3.0, Component v4.1.4)
              MCA rcache: gpusm (MCA v2.1.0, API v3.3.0, Component v4.1.4)
              MCA rcache: rgpusm (MCA v2.1.0, API v3.3.0, Component v4.1.4)
           MCA reachable: weighted (MCA v2.1.0, API v2.0.0, Component v4.1.4)
           MCA reachable: netlink (MCA v2.1.0, API v2.0.0, Component v4.1.4)
               MCA shmem: posix (MCA v2.1.0, API v2.0.0, Component v4.1.4)
               MCA shmem: mmap (MCA v2.1.0, API v2.0.0, Component v4.1.4)
               MCA shmem: sysv (MCA v2.1.0, API v2.0.0, Component v4.1.4)
               MCA timer: linux (MCA v2.1.0, API v2.0.0, Component v4.1.4)
              MCA errmgr: default_orted (MCA v2.1.0, API v3.0.0, Component v4.1.4)
              MCA errmgr: default_app (MCA v2.1.0, API v3.0.0, Component v4.1.4)
              MCA errmgr: default_tool (MCA v2.1.0, API v3.0.0, Component v4.1.4)
              MCA errmgr: default_hnp (MCA v2.1.0, API v3.0.0, Component v4.1.4)
                 MCA ess: tool (MCA v2.1.0, API v3.0.0, Component v4.1.4)
                 MCA ess: hnp (MCA v2.1.0, API v3.0.0, Component v4.1.4)
                 MCA ess: pmi (MCA v2.1.0, API v3.0.0, Component v4.1.4)
                 MCA ess: singleton (MCA v2.1.0, API v3.0.0, Component v4.1.4)
                 MCA ess: env (MCA v2.1.0, API v3.0.0, Component v4.1.4)
                 MCA ess: slurm (MCA v2.1.0, API v3.0.0, Component v4.1.4)
               MCA filem: raw (MCA v2.1.0, API v2.0.0, Component v4.1.4)
             MCA grpcomm: direct (MCA v2.1.0, API v3.0.0, Component v4.1.4)
                 MCA iof: orted (MCA v2.1.0, API v2.0.0, Component v4.1.4)
                 MCA iof: tool (MCA v2.1.0, API v2.0.0, Component v4.1.4)
                 MCA iof: hnp (MCA v2.1.0, API v2.0.0, Component v4.1.4)
                MCA odls: pspawn (MCA v2.1.0, API v2.0.0, Component v4.1.4)
                MCA odls: default (MCA v2.1.0, API v2.0.0, Component v4.1.4)
                 MCA oob: tcp (MCA v2.1.0, API v2.0.0, Component v4.1.4)
                 MCA plm: rsh (MCA v2.1.0, API v2.0.0, Component v4.1.4)
                 MCA plm: isolated (MCA v2.1.0, API v2.0.0, Component v4.1.4)
                 MCA plm: slurm (MCA v2.1.0, API v2.0.0, Component v4.1.4)
                 MCA ras: slurm (MCA v2.1.0, API v2.0.0, Component v4.1.4)
                 MCA ras: simulator (MCA v2.1.0, API v2.0.0, Component v4.1.4)
                 MCA ras: gridengine (MCA v2.1.0, API v2.0.0, Component v4.1.4)
                MCA regx: naive (MCA v2.1.0, API v1.0.0, Component v4.1.4)
                MCA regx: fwd (MCA v2.1.0, API v1.0.0, Component v4.1.4)
                MCA regx: reverse (MCA v2.1.0, API v1.0.0, Component v4.1.4)
               MCA rmaps: seq (MCA v2.1.0, API v2.0.0, Component v4.1.4)
               MCA rmaps: rank_file (MCA v2.1.0, API v2.0.0, Component v4.1.4)
               MCA rmaps: round_robin (MCA v2.1.0, API v2.0.0, Component v4.1.4)
               MCA rmaps: mindist (MCA v2.1.0, API v2.0.0, Component v4.1.4)
               MCA rmaps: ppr (MCA v2.1.0, API v2.0.0, Component v4.1.4)
               MCA rmaps: resilient (MCA v2.1.0, API v2.0.0, Component v4.1.4)
                 MCA rml: oob (MCA v2.1.0, API v3.0.0, Component v4.1.4)
              MCA routed: direct (MCA v2.1.0, API v3.0.0, Component v4.1.4)
              MCA routed: radix (MCA v2.1.0, API v3.0.0, Component v4.1.4)
              MCA routed: binomial (MCA v2.1.0, API v3.0.0, Component v4.1.4)
                 MCA rtc: hwloc (MCA v2.1.0, API v1.0.0, Component v4.1.4)
              MCA schizo: ompi (MCA v2.1.0, API v1.0.0, Component v4.1.4)
              MCA schizo: orte (MCA v2.1.0, API v1.0.0, Component v4.1.4)
              MCA schizo: jsm (MCA v2.1.0, API v1.0.0, Component v4.1.4)
              MCA schizo: slurm (MCA v2.1.0, API v1.0.0, Component v4.1.4)
              MCA schizo: flux (MCA v2.1.0, API v1.0.0, Component v4.1.4)
               MCA state: novm (MCA v2.1.0, API v1.0.0, Component v4.1.4)
               MCA state: app (MCA v2.1.0, API v1.0.0, Component v4.1.4)
               MCA state: orted (MCA v2.1.0, API v1.0.0, Component v4.1.4)
               MCA state: tool (MCA v2.1.0, API v1.0.0, Component v4.1.4)
               MCA state: hnp (MCA v2.1.0, API v1.0.0, Component v4.1.4)
                 MCA bml: r2 (MCA v2.1.0, API v2.0.0, Component v4.1.4)
                MCA coll: monitoring (MCA v2.1.0, API v2.0.0, Component v4.1.4)
                MCA coll: basic (MCA v2.1.0, API v2.0.0, Component v4.1.4)
                MCA coll: sm (MCA v2.1.0, API v2.0.0, Component v4.1.4)
                MCA coll: self (MCA v2.1.0, API v2.0.0, Component v4.1.4)
                MCA coll: cuda (MCA v2.1.0, API v2.0.0, Component v4.1.4)
                MCA coll: inter (MCA v2.1.0, API v2.0.0, Component v4.1.4)
                MCA coll: tuned (MCA v2.1.0, API v2.0.0, Component v4.1.4)
                MCA coll: sync (MCA v2.1.0, API v2.0.0, Component v4.1.4)
                MCA coll: adapt (MCA v2.1.0, API v2.0.0, Component v4.1.4)
                MCA coll: libnbc (MCA v2.1.0, API v2.0.0, Component v4.1.4)
                MCA coll: han (MCA v2.1.0, API v2.0.0, Component v4.1.4)
                MCA fbtl: posix (MCA v2.1.0, API v2.0.0, Component v4.1.4)
               MCA fcoll: dynamic_gen2 (MCA v2.1.0, API v2.0.0, Component v4.1.4)
               MCA fcoll: vulcan (MCA v2.1.0, API v2.0.0, Component v4.1.4)
               MCA fcoll: dynamic (MCA v2.1.0, API v2.0.0, Component v4.1.4)
               MCA fcoll: individual (MCA v2.1.0, API v2.0.0, Component v4.1.4)
               MCA fcoll: two_phase (MCA v2.1.0, API v2.0.0, Component v4.1.4)
                  MCA fs: ufs (MCA v2.1.0, API v2.0.0, Component v4.1.4)
                  MCA io: ompio (MCA v2.1.0, API v2.0.0, Component v4.1.4)
                  MCA io: romio321 (MCA v2.1.0, API v2.0.0, Component v4.1.4)
                  MCA op: avx (MCA v2.1.0, API v1.0.0, Component v4.1.4)
                 MCA osc: pt2pt (MCA v2.1.0, API v3.0.0, Component v4.1.4)
                 MCA osc: ucx (MCA v2.1.0, API v3.0.0, Component v4.1.4)
                 MCA osc: monitoring (MCA v2.1.0, API v3.0.0, Component v4.1.4)
                 MCA osc: rdma (MCA v2.1.0, API v3.0.0, Component v4.1.4)
                 MCA osc: sm (MCA v2.1.0, API v3.0.0, Component v4.1.4)
                 MCA pml: v (MCA v2.1.0, API v2.0.0, Component v4.1.4)
                 MCA pml: cm (MCA v2.1.0, API v2.0.0, Component v4.1.4)
                 MCA pml: ob1 (MCA v2.1.0, API v2.0.0, Component v4.1.4)
                 MCA pml: ucx (MCA v2.1.0, API v2.0.0, Component v4.1.4)
                 MCA pml: monitoring (MCA v2.1.0, API v2.0.0, Component v4.1.4)
                 MCA rte: orte (MCA v2.1.0, API v2.0.0, Component v4.1.4)
            MCA sharedfp: individual (MCA v2.1.0, API v2.0.0, Component v4.1.4)
            MCA sharedfp: sm (MCA v2.1.0, API v2.0.0, Component v4.1.4)
            MCA sharedfp: lockedfile (MCA v2.1.0, API v2.0.0, Component v4.1.4)
                MCA topo: treematch (MCA v2.1.0, API v2.2.0, Component v4.1.4)
                MCA topo: basic (MCA v2.1.0, API v2.2.0, Component v4.1.4)
           MCA vprotocol: pessimist (MCA v2.1.0, API v2.0.0, Component v4.1.4)
idps@smc-gpu-01:/data/ASS/gpu3/Obstacle_Detection_ASS$ cat ompi_info.txt | grep btl
  Configure command line: '--prefix=/usr' '--with-ucx=/usr' '--with-cuda' '--includedir=/usr/include' '--mandir=/usr/share/man' '--infodir=/usr/share/info' '--sysconfdir=/etc' '--localstatedir=/var' '--disable-silent-rules' '--libexecdir=/usr/lib' '--disable-maintainer-mode' '--enable-dlopen' '--enable-ipv6' '--enable-mpirun-prefix-by-default' '--enable-mpi-cxx' '--enable-oshmem' '--enable-oshmem-compat' '--enable-oshmem-profile' '--enable-spc' '--enable-builtin-atomics' '--enable-openib-udcm' '--enable-openib-dynamic-sl' '--enable-openib-rdmacm' '--enable-openib-rdmacm-ibaddr' '--enable-btl' '--with-verbs' '--with-sge' '--with-memory-manager=none' '--with-hwloc' '--with-libltdl' '--with-devel-headers' '--enable-shared' '--prefix=/usr/mpi/gcc/openmpi-4.1.4' '--with-platform=contrib/platform/mellanox/optimized'
                 MCA btl: tcp (MCA v2.1.0, API v3.1.0, Component v4.1.4)
                 MCA btl: self (MCA v2.1.0, API v3.1.0, Component v4.1.4)
                 MCA btl: vader (MCA v2.1.0, API v3.1.0, Component v4.1.4)
                 MCA btl: smcuda (MCA v2.1.0, API v3.1.0, Component v4.1.4)
                MCA fbtl: posix (MCA v2.1.0, API v2.0.0, Component v4.1.4)

and ucx_info output

# Library version: 1.15.0
# Library path: /lib/libucs.so.0
# API headers version: 1.15.0
# Git branch 'master', revision 89f3299
# Configured with: --disable-logging --disable-debug --disable-assertions --disable-params-check --prefix=/usr --enable-optimizations --disable-logging --disable-debug --disable-assertions --disable-params-check --enable-mt --with-verbs=/usr --with-hwloc=/usr --with-ucc=/usr --with-orte --with-sysroot --with-cuda=/usr/local/cuda --disable-doxygen-doc --with-avx --with-mcpu --with-march --with-rc --with-ud --with-dc --with-mlx5-dv --with-ib-hw-tm --with-dm --with-devx --with-xpmem --with-iodemo-cuda
#define UCX_CONFIG_H              
#define ENABLE_BUILTIN_MEMCPY     1
#define ENABLE_DEBUG_DATA         0
#define ENABLE_MT                 1
#define ENABLE_PARAMS_CHECK       0
#define HAVE_ALLOCA               1
#define HAVE_ALLOCA_H             1
#define HAVE_ATTRIBUTE_NOOPTIMIZE 1
#define HAVE_CLEARENV             1
#define HAVE_CPU_SET_T            1
#define HAVE_CUDA                 1
#define HAVE_CUDA_H               1
#define HAVE_CUDA_RUNTIME_H       1
#define HAVE_DECL_ASPRINTF        1
#define HAVE_DECL_BASENAME        1
#define HAVE_DECL_CPU_ISSET       1
#define HAVE_DECL_CPU_ZERO        1
#define HAVE_DECL_ETHTOOL_CMD_SPEED 1
#define HAVE_DECL_FMEMOPEN        1
#define HAVE_DECL_FUSE_MOUNT      0
#define HAVE_DECL_FUSE_OPEN_CHANNEL 0
#define HAVE_DECL_FUSE_UNMOUNT    0
#define HAVE_DECL_F_SETOWN_EX     1
#define HAVE_DECL_GETAUXVAL       1
#define HAVE_DECL_IBV_CREATE_SRQ  0
#define HAVE_DECL_IBV_EVENT_TYPE_STR 0
#define HAVE_DECL_IBV_GET_ASYNC_EVENT 0
#define HAVE_DECL_IBV_GET_DEVICE_NAME 0
#define HAVE_DECL_IBV_QUERY_GID   0
#define HAVE_DECL_IBV_WC_STATUS_STR 0
#define HAVE_DECL_INOTIFY_ADD_WATCH 1
#define HAVE_DECL_INOTIFY_INIT    1
#define HAVE_DECL_IN_ATTRIB       1
#define HAVE_DECL_IPPROTO_TCP     1
#define HAVE_DECL_MADV_FREE       1
#define HAVE_DECL_MADV_REMOVE     1
#define HAVE_DECL_POSIX_MADV_DONTNEED 1
#define HAVE_DECL_PR_SET_PTRACER  1
#define HAVE_DECL_SOL_SOCKET      1
#define HAVE_DECL_SO_KEEPALIVE    1
#define HAVE_DECL_SPEED_UNKNOWN   1
#define HAVE_DECL_STRERROR_R      1
#define HAVE_DECL_SYS_BRK         1
#define HAVE_DECL_SYS_IPC         0
#define HAVE_DECL_SYS_MADVISE     1
#define HAVE_DECL_SYS_MMAP        1
#define HAVE_DECL_SYS_MREMAP      1
#define HAVE_DECL_SYS_MUNMAP      1
#define HAVE_DECL_SYS_SHMAT       1
#define HAVE_DECL_SYS_SHMDT       1
#define HAVE_DECL_TCP_KEEPCNT     1
#define HAVE_DECL_TCP_KEEPIDLE    1
#define HAVE_DECL_TCP_KEEPINTVL   1
#define HAVE_DECL___PPC_GET_TIMEBASE 0
#define HAVE_DECL___PPC_GET_TIMEBASE_FREQ 0
#define HAVE_DLFCN_H              1
#define HAVE_HW_TIMER             1
#define HAVE_IN6_ADDR_S6_ADDR32   1
#define HAVE_INOTIFY              1
#define HAVE_INTTYPES_H           1
#define HAVE_IP_IP_DST            1
#define HAVE_LIBGEN_H             1
#define HAVE_LIBRT                1
#define HAVE_LINUX_FUTEX_H        1
#define HAVE_LINUX_IP_H           1
#define HAVE_LINUX_MMAN_H         1
#define HAVE_MALLOC_H             1
#define HAVE_MALLOC_HOOK          1
#define HAVE_MALLOC_TRIM          1
#define HAVE_MEMALIGN             1
#define HAVE_MEMORY_H             1
#define HAVE_MREMAP               1
#define HAVE_NETINET_IP_H         1
#define HAVE_NET_ETHERNET_H       1
#define HAVE_NVML_H               1
#define HAVE_POSIX_MEMALIGN       1
#define HAVE_SCHED_GETAFFINITY    1
#define HAVE_SCHED_SETAFFINITY    1
#define HAVE_SIGACTION_SA_RESTORER 1
#define HAVE_SIGEVENT_SIGEV_UN_TID 1
#define HAVE_SIGHANDLER_T         1
#define HAVE_STDINT_H             1
#define HAVE_STDLIB_H             1
#define HAVE_STRERROR_R           1
#define HAVE_STRINGS_H            1
#define HAVE_STRING_H             1
#define HAVE_STRUCT_DL_PHDR_INFO  1
#define HAVE_SYS_EPOLL_H          1
#define HAVE_SYS_EVENTFD_H        1
#define HAVE_SYS_STAT_H           1
#define HAVE_SYS_TYPES_H          1
#define HAVE_SYS_UIO_H            1
#define HAVE_UCM_PTMALLOC286      1
#define HAVE_UNISTD_H             1
#define HAVE___CLEAR_CACHE        1
#define HAVE___CURBRK             1
#define HAVE___SIGHANDLER_T       1
#define LT_OBJDIR                 ".libs/"
#define NVALGRIND                 1
#define PACKAGE                   "ucx"
#define PACKAGE_BUGREPORT         ""
#define PACKAGE_NAME              "ucx"
#define PACKAGE_STRING            "ucx 1.15"
#define PACKAGE_TARNAME           "ucx"
#define PACKAGE_URL               ""
#define PACKAGE_VERSION           "1.15"
#define STDC_HEADERS              1
#define STRERROR_R_CHAR_P         1
#define UCM_BISTRO_HOOKS          1
#define UCS_MAX_LOG_LEVEL         UCS_LOG_LEVEL_DEBUG
#define UCT_TCP_EP_KEEPALIVE      1
#define UCT_UD_EP_DEBUG_HOOKS     0
#define UCX_CONFIGURE_FLAGS       "--disable-logging --disable-debug --disable-assertions --disable-params-check --prefix=/usr --enable-optimizations --disable-logging --disable-debug --disable-assertions --disable-params-check --enable-mt --with-verbs=/usr --with-hwloc=/usr --with-ucc=/usr --with-orte --with-sysroot --with-cuda=/usr/local/cuda --disable-doxygen-doc --with-avx --with-mcpu --with-march --with-rc --with-ud --with-dc --with-mlx5-dv --with-ib-hw-tm --with-dm --with-devx --with-xpmem --with-iodemo-cuda"
#define UCX_MODULE_SUBDIR         "ucx"
#define VERSION                   "1.15"
#define WITH_IODEMO_CUDA          1
#define restrict                  __restrict
#define test_MODULES              ":module"
#define ucm_MODULES               ":cuda"
#define ucs_MODULES               ""
#define uct_MODULES               ":cuda:cma"
#define uct_cuda_MODULES          ""
#define uct_ib_MODULES            ""
#define uct_rocm_MODULES          ""
#define ucx_perftest_MODULES      ":cuda"