Open PurvangL opened 1 year ago
ucx_info -bdv
and ibv_devinfo
? Sure.
ucx_info -bdv
# Library version: 1.15.0
# Library path: /lib/libucs.so.0
# API headers version: 1.15.0
# Git branch 'master', revision 89f3299
# Configured with: --disable-logging --disable-debug --disable-assertions --disable-params-check --prefix=/usr --enable-optimizations=1 --disable-logging --disable-debug --disable-assertions --disable-params-check --enable-mt=1 --with-cuda=/usr/local/cuda --disable-doxygen-doc --with-sysroot --with-avx=yes --with-mcpu=yes --with-march=yes --with-rc=yes --with-ud=yes --with-dc=yes --with-mlx5-dv=yes --with-ib-hw-tm=yes --with-dm=yes --with-devx=yes --with-xpmem=yes --with-iodemo-cuda=yes
#define UCX_CONFIG_H
#define ENABLE_BUILTIN_MEMCPY 1
#define ENABLE_DEBUG_DATA 0
#define ENABLE_MT 0
#define ENABLE_PARAMS_CHECK 0
#define HAVE_ALLOCA 1
#define HAVE_ALLOCA_H 1
#define HAVE_ATTRIBUTE_NOOPTIMIZE 1
#define HAVE_CLEARENV 1
#define HAVE_CPU_SET_T 1
#define HAVE_CUDA 1
#define HAVE_CUDA_H 1
#define HAVE_CUDA_RUNTIME_H 1
#define HAVE_DECL_ASPRINTF 1
#define HAVE_DECL_BASENAME 1
#define HAVE_DECL_CPU_ISSET 1
#define HAVE_DECL_CPU_ZERO 1
#define HAVE_DECL_ETHTOOL_CMD_SPEED 1
#define HAVE_DECL_FMEMOPEN 1
#define HAVE_DECL_FUSE_MOUNT 0
#define HAVE_DECL_FUSE_OPEN_CHANNEL 0
#define HAVE_DECL_FUSE_UNMOUNT 0
#define HAVE_DECL_F_SETOWN_EX 1
#define HAVE_DECL_GETAUXVAL 1
#define HAVE_DECL_IBV_CREATE_SRQ 0
#define HAVE_DECL_IBV_EVENT_TYPE_STR 0
#define HAVE_DECL_IBV_GET_ASYNC_EVENT 0
#define HAVE_DECL_IBV_GET_DEVICE_NAME 0
#define HAVE_DECL_IBV_QUERY_GID 0
#define HAVE_DECL_IBV_WC_STATUS_STR 0
#define HAVE_DECL_INOTIFY_ADD_WATCH 1
#define HAVE_DECL_INOTIFY_INIT 1
#define HAVE_DECL_IN_ATTRIB 1
#define HAVE_DECL_IPPROTO_TCP 1
#define HAVE_DECL_MADV_FREE 1
#define HAVE_DECL_MADV_REMOVE 1
#define HAVE_DECL_POSIX_MADV_DONTNEED 1
#define HAVE_DECL_PR_SET_PTRACER 1
#define HAVE_DECL_SOL_SOCKET 1
#define HAVE_DECL_SO_KEEPALIVE 1
#define HAVE_DECL_SPEED_UNKNOWN 1
#define HAVE_DECL_STRERROR_R 1
#define HAVE_DECL_SYS_BRK 1
#define HAVE_DECL_SYS_IPC 0
#define HAVE_DECL_SYS_MADVISE 1
#define HAVE_DECL_SYS_MMAP 1
#define HAVE_DECL_SYS_MREMAP 1
#define HAVE_DECL_SYS_MUNMAP 1
#define HAVE_DECL_SYS_SHMAT 1
#define HAVE_DECL_SYS_SHMDT 1
#define HAVE_DECL_TCP_KEEPCNT 1
#define HAVE_DECL_TCP_KEEPIDLE 1
#define HAVE_DECL_TCP_KEEPINTVL 1
#define HAVE_DECL___PPC_GET_TIMEBASE 0
#define HAVE_DECL___PPC_GET_TIMEBASE_FREQ 0
#define HAVE_DLFCN_H 1
#define HAVE_HW_TIMER 1
#define HAVE_IN6_ADDR_S6_ADDR32 1
#define HAVE_INOTIFY 1
#define HAVE_INTTYPES_H 1
#define HAVE_IP_IP_DST 1
#define HAVE_LIBGEN_H 1
#define HAVE_LIBRT 1
#define HAVE_LINUX_FUTEX_H 1
#define HAVE_LINUX_IP_H 1
#define HAVE_LINUX_MMAN_H 1
#define HAVE_MALLOC_H 1
#define HAVE_MALLOC_HOOK 1
#define HAVE_MALLOC_TRIM 1
#define HAVE_MEMALIGN 1
#define HAVE_MEMORY_H 1
#define HAVE_MREMAP 1
#define HAVE_NETINET_IP_H 1
#define HAVE_NET_ETHERNET_H 1
#define HAVE_NVML_H 1
#define HAVE_POSIX_MEMALIGN 1
#define HAVE_SCHED_GETAFFINITY 1
#define HAVE_SCHED_SETAFFINITY 1
#define HAVE_SIGACTION_SA_RESTORER 1
#define HAVE_SIGEVENT_SIGEV_UN_TID 1
#define HAVE_SIGHANDLER_T 1
#define HAVE_STDINT_H 1
#define HAVE_STDLIB_H 1
#define HAVE_STRERROR_R 1
#define HAVE_STRINGS_H 1
#define HAVE_STRING_H 1
#define HAVE_STRUCT_DL_PHDR_INFO 1
#define HAVE_SYS_EPOLL_H 1
#define HAVE_SYS_EVENTFD_H 1
#define HAVE_SYS_STAT_H 1
#define HAVE_SYS_TYPES_H 1
#define HAVE_SYS_UIO_H 1
#define HAVE_UCM_PTMALLOC286 1
#define HAVE_UNISTD_H 1
#define HAVE___CLEAR_CACHE 1
#define HAVE___CURBRK 1
#define HAVE___SIGHANDLER_T 1
#define LT_OBJDIR ".libs/"
#define NVALGRIND 1
#define PACKAGE "ucx"
#define PACKAGE_BUGREPORT ""
#define PACKAGE_NAME "ucx"
#define PACKAGE_STRING "ucx 1.15"
#define PACKAGE_TARNAME "ucx"
#define PACKAGE_URL ""
#define PACKAGE_VERSION "1.15"
#define STDC_HEADERS 1
#define STRERROR_R_CHAR_P 1
#define UCM_BISTRO_HOOKS 1
#define UCS_MAX_LOG_LEVEL UCS_LOG_LEVEL_DEBUG
#define UCT_TCP_EP_KEEPALIVE 1
#define UCT_UD_EP_DEBUG_HOOKS 0
#define UCX_CONFIGURE_FLAGS "--disable-logging --disable-debug --disable-assertions --disable-params-check --prefix=/usr --enable-optimizations=1 --disable-logging --disable-debug --disable-assertions --disable-params-check --enable-mt=1 --with-cuda=/usr/local/cuda --disable-doxygen-doc --with-sysroot --with-avx=yes --with-mcpu=yes --with-march=yes --with-rc=yes --with-ud=yes --with-dc=yes --with-mlx5-dv=yes --with-ib-hw-tm=yes --with-dm=yes --with-devx=yes --with-xpmem=yes --with-iodemo-cuda=yes"
#define UCX_MODULE_SUBDIR "ucx"
#define VERSION "1.15"
#define WITH_IODEMO_CUDA 1
#define restrict __restrict
#define test_MODULES ":module"
#define ucm_MODULES ":cuda"
#define ucs_MODULES ""
#define uct_MODULES ":cuda:cma"
#define uct_cuda_MODULES ""
#define uct_ib_MODULES ""
#define uct_rocm_MODULES ""
#define ucx_perftest_MODULES ":cuda"
#
# Memory domain: self
# Component: self
# register: unlimited, cost: 0 nsec
# remote key: 0 bytes
# memory types: host (access,reg,cache)
#
# Transport: self
# Device: memory
# Type: loopback
# System device: <unknown>
#
# capabilities:
# bandwidth: 0.00/ppn + 6911.00 MB/sec
# latency: 0 nsec
# overhead: 10 nsec
# put_short: <= 4294967295
# put_bcopy: unlimited
# get_bcopy: unlimited
# am_short: <= 8K
# am_bcopy: <= 8K
# domain: cpu
# atomic_add: 32, 64 bit
# atomic_and: 32, 64 bit
# atomic_or: 32, 64 bit
# atomic_xor: 32, 64 bit
# atomic_fadd: 32, 64 bit
# atomic_fand: 32, 64 bit
# atomic_for: 32, 64 bit
# atomic_fxor: 32, 64 bit
# atomic_swap: 32, 64 bit
# atomic_cswap: 32, 64 bit
# connection: to iface
# device priority: 0
# device num paths: 1
# max eps: inf
# device address: 0 bytes
# iface address: 8 bytes
# error handling: ep_check
#
#
# Memory domain: tcp
# Component: tcp
# register: unlimited, cost: 0 nsec
# remote key: 0 bytes
# memory types: host (access,reg,cache)
#
# Transport: tcp
# Device: ibs8f0
# Type: network
# System device: ibs8f0 (0)
#
# capabilities:
# bandwidth: 11142.51/ppn + 0.00 MB/sec
# latency: 5206 nsec
# overhead: 50000 nsec
# put_zcopy: <= 18446744073709551590, up to 6 iov
# put_opt_zcopy_align: <= 1
# put_align_mtu: <= 0
# am_short: <= 8K
# am_bcopy: <= 8K
# am_zcopy: <= 64K, up to 6 iov
# am_opt_zcopy_align: <= 1
# am_align_mtu: <= 0
# am header: <= 8037
# connection: to ep, to iface
# device priority: 1
# device num paths: 1
# max eps: 256
# device address: 6 bytes
# iface address: 2 bytes
# ep address: 10 bytes
# error handling: peer failure, ep_check, keepalive
#
# Transport: tcp
# Device: ens8f1
# Type: network
# System device: ens8f1 (1)
#
# capabilities:
# bandwidth: 11818.05/ppn + 0.00 MB/sec
# latency: 5206 nsec
# overhead: 50000 nsec
# put_zcopy: <= 18446744073709551590, up to 6 iov
# put_opt_zcopy_align: <= 1
# put_align_mtu: <= 0
# am_short: <= 8K
# am_bcopy: <= 8K
# am_zcopy: <= 64K, up to 6 iov
# am_opt_zcopy_align: <= 1
# am_align_mtu: <= 0
# am header: <= 8037
# connection: to ep, to iface
# device priority: 0
# device num paths: 1
# max eps: 256
# device address: 6 bytes
# iface address: 2 bytes
# ep address: 10 bytes
# error handling: peer failure, ep_check, keepalive
#
# Transport: tcp
# Device: lo
# Type: network
# System device: <unknown>
#
# capabilities:
# bandwidth: 11.91/ppn + 0.00 MB/sec
# latency: 10960 nsec
# overhead: 50000 nsec
# put_zcopy: <= 18446744073709551590, up to 6 iov
# put_opt_zcopy_align: <= 1
# put_align_mtu: <= 0
# am_short: <= 8K
# am_bcopy: <= 8K
# am_zcopy: <= 64K, up to 6 iov
# am_opt_zcopy_align: <= 1
# am_align_mtu: <= 0
# am header: <= 8037
# connection: to ep, to iface
# device priority: 1
# device num paths: 1
# max eps: 256
# device address: 18 bytes
# iface address: 2 bytes
# ep address: 10 bytes
# error handling: peer failure, ep_check, keepalive
#
#
# Connection manager: tcp
# max_conn_priv: 2064 bytes
#
# Memory domain: sysv
# Component: sysv
# allocate: unlimited
# remote key: 12 bytes
# rkey_ptr is supported
# memory types: host (access,alloc,cache)
#
# Transport: sysv
# Device: memory
# Type: intra-node
# System device: <unknown>
#
# capabilities:
# bandwidth: 0.00/ppn + 15360.00 MB/sec
# latency: 80 nsec
# overhead: 10 nsec
# put_short: <= 4294967295
# put_bcopy: unlimited
# get_bcopy: unlimited
# am_short: <= 100
# am_bcopy: <= 8256
# domain: cpu
# atomic_add: 32, 64 bit
# atomic_and: 32, 64 bit
# atomic_or: 32, 64 bit
# atomic_xor: 32, 64 bit
# atomic_fadd: 32, 64 bit
# atomic_fand: 32, 64 bit
# atomic_for: 32, 64 bit
# atomic_fxor: 32, 64 bit
# atomic_swap: 32, 64 bit
# atomic_cswap: 32, 64 bit
# connection: to iface
# device priority: 0
# device num paths: 1
# max eps: inf
# device address: 8 bytes
# iface address: 8 bytes
# error handling: ep_check
#
#
# Memory domain: posix
# Component: posix
# allocate: <= 264085196K
# remote key: 32 bytes
# rkey_ptr is supported
# memory types: host (access,alloc,cache)
#
# Transport: posix
# Device: memory
# Type: intra-node
# System device: <unknown>
#
# capabilities:
# bandwidth: 0.00/ppn + 15360.00 MB/sec
# latency: 80 nsec
# overhead: 10 nsec
# put_short: <= 4294967295
# put_bcopy: unlimited
# get_bcopy: unlimited
# am_short: <= 100
# am_bcopy: <= 8256
# domain: cpu
# atomic_add: 32, 64 bit
# atomic_and: 32, 64 bit
# atomic_or: 32, 64 bit
# atomic_xor: 32, 64 bit
# atomic_fadd: 32, 64 bit
# atomic_fand: 32, 64 bit
# atomic_for: 32, 64 bit
# atomic_fxor: 32, 64 bit
# atomic_swap: 32, 64 bit
# atomic_cswap: 32, 64 bit
# connection: to iface
# device priority: 0
# device num paths: 1
# max eps: inf
# device address: 8 bytes
# iface address: 16 bytes
# error handling: ep_check
#
#
# Memory domain: cuda_cpy
# Component: cuda_cpy
# allocate: unlimited
# register: unlimited, cost: 0 nsec
# memory types: host (reg), cuda (access,alloc,reg,cache,detect), cuda-managed (access,alloc,reg,cache,detect)
#
# Transport: cuda_copy
# Device: cuda
# Type: accelerator
# System device: <unknown>
#
# capabilities:
# bandwidth: 10000.00/ppn + 0.00 MB/sec
# latency: 8000 nsec
# overhead: 0 nsec
# put_short: <= 4294967295
# put_zcopy: unlimited, up to 1 iov
# put_opt_zcopy_align: <= 1
# put_align_mtu: <= 1
# get_short: <= 4294967295
# get_zcopy: unlimited, up to 1 iov
# get_opt_zcopy_align: <= 1
# get_align_mtu: <= 1
# connection: to iface
# device priority: 0
# device num paths: 1
# max eps: inf
# device address: 0 bytes
# iface address: 8 bytes
# error handling: none
#
#
# Memory domain: cuda_ipc
# Component: cuda_ipc
# register: unlimited, cost: 0 nsec
# remote key: 112 bytes
# memory invalidation is supported
# memory types: cuda (access,reg,cache)
#
# Transport: cuda_ipc
# Device: cuda
# Type: intra-node
# System device: <unknown>
#
# capabilities:
# bandwidth: 300000.00/ppn + 0.00 MB/sec
# latency: 1000 nsec
# overhead: 7000 nsec
# put_zcopy: unlimited, up to 1 iov
# put_opt_zcopy_align: <= 1
# put_align_mtu: <= 1
# get_zcopy: <= 0, up to 1 iov
# get_opt_zcopy_align: <= 1
# get_align_mtu: <= 1
# connection: to iface
# device priority: 0
# device num paths: 1
# max eps: inf
# device address: 8 bytes
# iface address: 4 bytes
# error handling: peer failure, ep_check
#
#
# Memory domain: cma
# Component: cma
# register: unlimited, cost: 9 nsec
# memory types: host (access,reg,cache)
#
# Transport: cma
# Device: memory
# Type: intra-node
# System device: <unknown>
#
# capabilities:
# bandwidth: 0.00/ppn + 11145.00 MB/sec
# latency: 80 nsec
# overhead: 2000 nsec
# put_zcopy: unlimited, up to 16 iov
# put_opt_zcopy_align: <= 1
# put_align_mtu: <= 1
# get_zcopy: unlimited, up to 16 iov
# get_opt_zcopy_align: <= 1
# get_align_mtu: <= 1
# connection: to iface
# device priority: 0
# device num paths: 1
# max eps: inf
# device address: 8 bytes
# iface address: 16 bytes
# error handling: peer failure, ep_check
#
ibv_devinfo
hca_id: mlx5_0
transport: InfiniBand (0)
fw_ver: 20.31.2354
node_guid: 0c42:a103:0002:40c2
sys_image_guid: 0c42:a103:0002:40c2
vendor_id: 0x02c9
vendor_part_id: 4123
hw_ver: 0x0
board_id: MT_0000000224
phys_port_cnt: 1
port: 1
state: PORT_ACTIVE (4)
max_mtu: 4096 (5)
active_mtu: 4096 (5)
sm_lid: 21
port_lid: 10
port_lmc: 0x00
link_layer: InfiniBand
hca_id: mlx5_1
transport: InfiniBand (0)
fw_ver: 20.31.2354
node_guid: 0c42:a103:0002:40c3
sys_image_guid: 0c42:a103:0002:40c2
vendor_id: 0x02c9
vendor_part_id: 4123
hw_ver: 0x0
board_id: MT_0000000224
phys_port_cnt: 1
port: 1
state: PORT_ACTIVE (4)
max_mtu: 4096 (5)
active_mtu: 4096 (5)
sm_lid: 0
port_lid: 0
port_lmc: 0x00
link_layer: Ethernet
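It seems UCX was built without Verbs support (in the output above, `uct_MODULES ":cuda:cma"` has no `ib` entry and all `HAVE_DECL_IBV_*` checks are 0). Perhaps the rdma-core development libraries were not installed on the build system.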
Thank you. I am running in Docker and found that ibv_devices wasn't showing any output,
so I installed rdma-core, and now its output is as follows.
ibv_devices
device node GUID
------ ----------------
mlx5_0 0c42a103000240c2
mlx5_1 0c42a103000240c3
While running nccl-tests, I get the following warnings:
ucp_context.c:1100 UCX WARN network device 'mlx5_0:1' is not available, please use one or more of: 'ens8f1'(tcp), 'ibs8f0'(tcp), 'lo'(tcp)
UCX WARN transport 'ud' is not available, please use one or more of: cma, cuda, cuda_copy, cuda_ipc, mm, posix, self, shm, sm, sysv, tcp
I still see the warning about 'ud' not being available. What other libraries are needed?
@PurvangL you need to make sure /usr/include/infiniband/verbs.h is installed as well, then reconfigure and rebuild UCX.
I see. I am trying to run in a Docker container, where the mentioned file is not present, though I do see verbs.h on the host system. Any suggestions on how I can achieve this in the Docker container?
As part of the process, I installed Open MPI along with UCX on the host and tried to mount the Open MPI binaries folder into the Docker container, but then it starts complaining as below:
Also, here is the ompi_info output:
Built by:
Built on: Thu Jan 12 16:52:14 UTC 2023
Built host: smc-gpu-01
C bindings: yes
C++ bindings: yes
Fort mpif.h: no
Fort use mpi: no
Fort use mpi size: deprecated-ompi-info-value
Fort use mpi_f08: no
Fort mpi_f08 compliance: The mpi_f08 module was not built
Fort mpi_f08 subarrays: no
Java bindings: no
Wrapper compiler rpath: runpath
C compiler: gcc
C compiler absolute: /usr/bin/gcc
C compiler family name: GNU
C compiler version: 9.4.0
C++ compiler: g++
C++ compiler absolute: /usr/bin/g++
Fort compiler: none
Fort compiler abs: none
Fort ignore TKR: no
Fort 08 assumed shape: no
Fort optional args: no
Fort INTERFACE: no
Fort ISO_FORTRAN_ENV: no
Fort STORAGE_SIZE: no
Fort BIND(C) (all): no
Fort ISO_C_BINDING: no
Fort SUBROUTINE BIND(C): no
Fort TYPE,BIND(C): no
Fort T,BIND(C,name="a"): no
Fort PRIVATE: no
Fort PROTECTED: no
Fort ABSTRACT: no
Fort ASYNCHRONOUS: no
Fort PROCEDURE: no
Fort USE...ONLY: no
Fort C_FUNLOC: no
Fort f08 using wrappers: no
Fort MPI_SIZEOF: no
C profiling: yes
C++ profiling: yes
Fort mpif.h profiling: no
Fort use mpi profiling: no
Fort use mpi_f08 prof: no
C++ exceptions: no
Thread support: posix (MPI_THREAD_MULTIPLE: yes, OPAL support: yes, OMPI progress: no, ORTE progress: yes, Event lib: yes)
Sparse Groups: no
Internal debug support: no
MPI interface warnings: yes
MPI parameter check: never
Memory profiling support: no
Memory debugging support: no
dl support: yes
Heterogeneous support: no
mpirun default --prefix: yes
MPI_WTIME support: native
Symbol vis. support: yes
Host topology support: yes
IPv6 support: yes
MPI1 compatibility: no
MPI extensions: affinity, cuda, pcollreq
FT Checkpoint support: no (checkpoint thread: no)
C/R Enabled Debugging: no
MPI_MAX_PROCESSOR_NAME: 256
MPI_MAX_ERROR_STRING: 256
MPI_MAX_OBJECT_NAME: 64
MPI_MAX_INFO_KEY: 36
MPI_MAX_INFO_VAL: 256
MPI_MAX_PORT_NAME: 1024
MPI_MAX_DATAREP_STRING: 128
MCA allocator: basic (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA allocator: bucket (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA backtrace: execinfo (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA btl: tcp (MCA v2.1.0, API v3.1.0, Component v4.1.4)
MCA btl: self (MCA v2.1.0, API v3.1.0, Component v4.1.4)
MCA btl: vader (MCA v2.1.0, API v3.1.0, Component v4.1.4)
MCA btl: smcuda (MCA v2.1.0, API v3.1.0, Component v4.1.4)
MCA compress: bzip (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA compress: gzip (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA crs: none (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA dl: dlopen (MCA v2.1.0, API v1.0.0, Component v4.1.4)
MCA event: external (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA hwloc: external (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA if: linux_ipv6 (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA if: posix_ipv4 (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA installdirs: env (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA installdirs: config (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA memory: patcher (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA mpool: hugepage (MCA v2.1.0, API v3.0.0, Component v4.1.4)
MCA patcher: overwrite (MCA v2.1.0, API v1.0.0, Component v4.1.4)
MCA pmix: isolated (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA pmix: flux (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA pmix: pmix3x (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA pstat: linux (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA rcache: grdma (MCA v2.1.0, API v3.3.0, Component v4.1.4)
MCA rcache: gpusm (MCA v2.1.0, API v3.3.0, Component v4.1.4)
MCA rcache: rgpusm (MCA v2.1.0, API v3.3.0, Component v4.1.4)
MCA reachable: weighted (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA reachable: netlink (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA shmem: posix (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA shmem: mmap (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA shmem: sysv (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA timer: linux (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA errmgr: default_orted (MCA v2.1.0, API v3.0.0, Component v4.1.4)
MCA errmgr: default_app (MCA v2.1.0, API v3.0.0, Component v4.1.4)
MCA errmgr: default_tool (MCA v2.1.0, API v3.0.0, Component v4.1.4)
MCA errmgr: default_hnp (MCA v2.1.0, API v3.0.0, Component v4.1.4)
MCA ess: tool (MCA v2.1.0, API v3.0.0, Component v4.1.4)
MCA ess: hnp (MCA v2.1.0, API v3.0.0, Component v4.1.4)
MCA ess: pmi (MCA v2.1.0, API v3.0.0, Component v4.1.4)
MCA ess: singleton (MCA v2.1.0, API v3.0.0, Component v4.1.4)
MCA ess: env (MCA v2.1.0, API v3.0.0, Component v4.1.4)
MCA ess: slurm (MCA v2.1.0, API v3.0.0, Component v4.1.4)
MCA filem: raw (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA grpcomm: direct (MCA v2.1.0, API v3.0.0, Component v4.1.4)
MCA iof: orted (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA iof: tool (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA iof: hnp (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA odls: pspawn (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA odls: default (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA oob: tcp (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA plm: rsh (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA plm: isolated (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA plm: slurm (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA ras: slurm (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA ras: simulator (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA ras: gridengine (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA regx: naive (MCA v2.1.0, API v1.0.0, Component v4.1.4)
MCA regx: fwd (MCA v2.1.0, API v1.0.0, Component v4.1.4)
MCA regx: reverse (MCA v2.1.0, API v1.0.0, Component v4.1.4)
MCA rmaps: seq (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA rmaps: rank_file (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA rmaps: round_robin (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA rmaps: mindist (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA rmaps: ppr (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA rmaps: resilient (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA rml: oob (MCA v2.1.0, API v3.0.0, Component v4.1.4)
MCA routed: direct (MCA v2.1.0, API v3.0.0, Component v4.1.4)
MCA routed: radix (MCA v2.1.0, API v3.0.0, Component v4.1.4)
MCA routed: binomial (MCA v2.1.0, API v3.0.0, Component v4.1.4)
MCA rtc: hwloc (MCA v2.1.0, API v1.0.0, Component v4.1.4)
MCA schizo: ompi (MCA v2.1.0, API v1.0.0, Component v4.1.4)
MCA schizo: orte (MCA v2.1.0, API v1.0.0, Component v4.1.4)
MCA schizo: jsm (MCA v2.1.0, API v1.0.0, Component v4.1.4)
MCA schizo: slurm (MCA v2.1.0, API v1.0.0, Component v4.1.4)
MCA schizo: flux (MCA v2.1.0, API v1.0.0, Component v4.1.4)
MCA state: novm (MCA v2.1.0, API v1.0.0, Component v4.1.4)
MCA state: app (MCA v2.1.0, API v1.0.0, Component v4.1.4)
MCA state: orted (MCA v2.1.0, API v1.0.0, Component v4.1.4)
MCA state: tool (MCA v2.1.0, API v1.0.0, Component v4.1.4)
MCA state: hnp (MCA v2.1.0, API v1.0.0, Component v4.1.4)
MCA bml: r2 (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA coll: monitoring (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA coll: basic (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA coll: sm (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA coll: self (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA coll: cuda (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA coll: inter (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA coll: tuned (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA coll: sync (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA coll: adapt (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA coll: libnbc (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA coll: han (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA fbtl: posix (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA fcoll: dynamic_gen2 (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA fcoll: vulcan (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA fcoll: dynamic (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA fcoll: individual (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA fcoll: two_phase (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA fs: ufs (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA io: ompio (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA io: romio321 (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA op: avx (MCA v2.1.0, API v1.0.0, Component v4.1.4)
MCA osc: pt2pt (MCA v2.1.0, API v3.0.0, Component v4.1.4)
MCA osc: ucx (MCA v2.1.0, API v3.0.0, Component v4.1.4)
MCA osc: monitoring (MCA v2.1.0, API v3.0.0, Component v4.1.4)
MCA osc: rdma (MCA v2.1.0, API v3.0.0, Component v4.1.4)
MCA osc: sm (MCA v2.1.0, API v3.0.0, Component v4.1.4)
MCA pml: v (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA pml: cm (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA pml: ob1 (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA pml: ucx (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA pml: monitoring (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA rte: orte (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA sharedfp: individual (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA sharedfp: sm (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA sharedfp: lockedfile (MCA v2.1.0, API v2.0.0, Component v4.1.4)
MCA topo: treematch (MCA v2.1.0, API v2.2.0, Component v4.1.4)
MCA topo: basic (MCA v2.1.0, API v2.2.0, Component v4.1.4)
MCA vprotocol: pessimist (MCA v2.1.0, API v2.0.0, Component v4.1.4)
idps@smc-gpu-01:/data/ASS/gpu3/Obstacle_Detection_ASS$ cat ompi_info.txt | grep btl
Configure command line: '--prefix=/usr' '--with-ucx=/usr' '--with-cuda' '--includedir=/usr/include' '--mandir=/usr/share/man' '--infodir=/usr/share/info' '--sysconfdir=/etc' '--localstatedir=/var' '--disable-silent-rules' '--libexecdir=/usr/lib' '--disable-maintainer-mode' '--enable-dlopen' '--enable-ipv6' '--enable-mpirun-prefix-by-default' '--enable-mpi-cxx' '--enable-oshmem' '--enable-oshmem-compat' '--enable-oshmem-profile' '--enable-spc' '--enable-builtin-atomics' '--enable-openib-udcm' '--enable-openib-dynamic-sl' '--enable-openib-rdmacm' '--enable-openib-rdmacm-ibaddr' '--enable-btl' '--with-verbs' '--with-sge' '--with-memory-manager=none' '--with-hwloc' '--with-libltdl' '--with-devel-headers' '--enable-shared' '--prefix=/usr/mpi/gcc/openmpi-4.1.4' '--with-platform=contrib/platform/mellanox/optimized'
MCA btl: tcp (MCA v2.1.0, API v3.1.0, Component v4.1.4)
MCA btl: self (MCA v2.1.0, API v3.1.0, Component v4.1.4)
MCA btl: vader (MCA v2.1.0, API v3.1.0, Component v4.1.4)
MCA btl: smcuda (MCA v2.1.0, API v3.1.0, Component v4.1.4)
MCA fbtl: posix (MCA v2.1.0, API v2.0.0, Component v4.1.4)
And here is the ucx_info output:
# Library version: 1.15.0
# Library path: /lib/libucs.so.0
# API headers version: 1.15.0
# Git branch 'master', revision 89f3299
# Configured with: --disable-logging --disable-debug --disable-assertions --disable-params-check --prefix=/usr --enable-optimizations --disable-logging --disable-debug --disable-assertions --disable-params-check --enable-mt --with-verbs=/usr --with-hwloc=/usr --with-ucc=/usr --with-orte --with-sysroot --with-cuda=/usr/local/cuda --disable-doxygen-doc --with-avx --with-mcpu --with-march --with-rc --with-ud --with-dc --with-mlx5-dv --with-ib-hw-tm --with-dm --with-devx --with-xpmem --with-iodemo-cuda
#define UCX_CONFIG_H
#define ENABLE_BUILTIN_MEMCPY 1
#define ENABLE_DEBUG_DATA 0
#define ENABLE_MT 1
#define ENABLE_PARAMS_CHECK 0
#define HAVE_ALLOCA 1
#define HAVE_ALLOCA_H 1
#define HAVE_ATTRIBUTE_NOOPTIMIZE 1
#define HAVE_CLEARENV 1
#define HAVE_CPU_SET_T 1
#define HAVE_CUDA 1
#define HAVE_CUDA_H 1
#define HAVE_CUDA_RUNTIME_H 1
#define HAVE_DECL_ASPRINTF 1
#define HAVE_DECL_BASENAME 1
#define HAVE_DECL_CPU_ISSET 1
#define HAVE_DECL_CPU_ZERO 1
#define HAVE_DECL_ETHTOOL_CMD_SPEED 1
#define HAVE_DECL_FMEMOPEN 1
#define HAVE_DECL_FUSE_MOUNT 0
#define HAVE_DECL_FUSE_OPEN_CHANNEL 0
#define HAVE_DECL_FUSE_UNMOUNT 0
#define HAVE_DECL_F_SETOWN_EX 1
#define HAVE_DECL_GETAUXVAL 1
#define HAVE_DECL_IBV_CREATE_SRQ 0
#define HAVE_DECL_IBV_EVENT_TYPE_STR 0
#define HAVE_DECL_IBV_GET_ASYNC_EVENT 0
#define HAVE_DECL_IBV_GET_DEVICE_NAME 0
#define HAVE_DECL_IBV_QUERY_GID 0
#define HAVE_DECL_IBV_WC_STATUS_STR 0
#define HAVE_DECL_INOTIFY_ADD_WATCH 1
#define HAVE_DECL_INOTIFY_INIT 1
#define HAVE_DECL_IN_ATTRIB 1
#define HAVE_DECL_IPPROTO_TCP 1
#define HAVE_DECL_MADV_FREE 1
#define HAVE_DECL_MADV_REMOVE 1
#define HAVE_DECL_POSIX_MADV_DONTNEED 1
#define HAVE_DECL_PR_SET_PTRACER 1
#define HAVE_DECL_SOL_SOCKET 1
#define HAVE_DECL_SO_KEEPALIVE 1
#define HAVE_DECL_SPEED_UNKNOWN 1
#define HAVE_DECL_STRERROR_R 1
#define HAVE_DECL_SYS_BRK 1
#define HAVE_DECL_SYS_IPC 0
#define HAVE_DECL_SYS_MADVISE 1
#define HAVE_DECL_SYS_MMAP 1
#define HAVE_DECL_SYS_MREMAP 1
#define HAVE_DECL_SYS_MUNMAP 1
#define HAVE_DECL_SYS_SHMAT 1
#define HAVE_DECL_SYS_SHMDT 1
#define HAVE_DECL_TCP_KEEPCNT 1
#define HAVE_DECL_TCP_KEEPIDLE 1
#define HAVE_DECL_TCP_KEEPINTVL 1
#define HAVE_DECL___PPC_GET_TIMEBASE 0
#define HAVE_DECL___PPC_GET_TIMEBASE_FREQ 0
#define HAVE_DLFCN_H 1
#define HAVE_HW_TIMER 1
#define HAVE_IN6_ADDR_S6_ADDR32 1
#define HAVE_INOTIFY 1
#define HAVE_INTTYPES_H 1
#define HAVE_IP_IP_DST 1
#define HAVE_LIBGEN_H 1
#define HAVE_LIBRT 1
#define HAVE_LINUX_FUTEX_H 1
#define HAVE_LINUX_IP_H 1
#define HAVE_LINUX_MMAN_H 1
#define HAVE_MALLOC_H 1
#define HAVE_MALLOC_HOOK 1
#define HAVE_MALLOC_TRIM 1
#define HAVE_MEMALIGN 1
#define HAVE_MEMORY_H 1
#define HAVE_MREMAP 1
#define HAVE_NETINET_IP_H 1
#define HAVE_NET_ETHERNET_H 1
#define HAVE_NVML_H 1
#define HAVE_POSIX_MEMALIGN 1
#define HAVE_SCHED_GETAFFINITY 1
#define HAVE_SCHED_SETAFFINITY 1
#define HAVE_SIGACTION_SA_RESTORER 1
#define HAVE_SIGEVENT_SIGEV_UN_TID 1
#define HAVE_SIGHANDLER_T 1
#define HAVE_STDINT_H 1
#define HAVE_STDLIB_H 1
#define HAVE_STRERROR_R 1
#define HAVE_STRINGS_H 1
#define HAVE_STRING_H 1
#define HAVE_STRUCT_DL_PHDR_INFO 1
#define HAVE_SYS_EPOLL_H 1
#define HAVE_SYS_EVENTFD_H 1
#define HAVE_SYS_STAT_H 1
#define HAVE_SYS_TYPES_H 1
#define HAVE_SYS_UIO_H 1
#define HAVE_UCM_PTMALLOC286 1
#define HAVE_UNISTD_H 1
#define HAVE___CLEAR_CACHE 1
#define HAVE___CURBRK 1
#define HAVE___SIGHANDLER_T 1
#define LT_OBJDIR ".libs/"
#define NVALGRIND 1
#define PACKAGE "ucx"
#define PACKAGE_BUGREPORT ""
#define PACKAGE_NAME "ucx"
#define PACKAGE_STRING "ucx 1.15"
#define PACKAGE_TARNAME "ucx"
#define PACKAGE_URL ""
#define PACKAGE_VERSION "1.15"
#define STDC_HEADERS 1
#define STRERROR_R_CHAR_P 1
#define UCM_BISTRO_HOOKS 1
#define UCS_MAX_LOG_LEVEL UCS_LOG_LEVEL_DEBUG
#define UCT_TCP_EP_KEEPALIVE 1
#define UCT_UD_EP_DEBUG_HOOKS 0
#define UCX_CONFIGURE_FLAGS "--disable-logging --disable-debug --disable-assertions --disable-params-check --prefix=/usr --enable-optimizations --disable-logging --disable-debug --disable-assertions --disable-params-check --enable-mt --with-verbs=/usr --with-hwloc=/usr --with-ucc=/usr --with-orte --with-sysroot --with-cuda=/usr/local/cuda --disable-doxygen-doc --with-avx --with-mcpu --with-march --with-rc --with-ud --with-dc --with-mlx5-dv --with-ib-hw-tm --with-dm --with-devx --with-xpmem --with-iodemo-cuda"
#define UCX_MODULE_SUBDIR "ucx"
#define VERSION "1.15"
#define WITH_IODEMO_CUDA 1
#define restrict __restrict
#define test_MODULES ":module"
#define ucm_MODULES ":cuda"
#define ucs_MODULES ""
#define uct_MODULES ":cuda:cma"
#define uct_cuda_MODULES ""
#define uct_ib_MODULES ""
#define uct_rocm_MODULES ""
#define ucx_perftest_MODULES ":cuda"
I have a server with 8 x A100 PCIe GPUs, on which I installed UCX with the following arguments:
And during the installation of Open MPI, I used:
But running the NCCL tests with the installed MPI, as shown below, gives this warning: ucp_context.c:1100 UCX WARN transport 'ud' is not available, please use one or more of: cma, cuda, cuda_copy, cuda_ipc, mm, posix, self, shm, sm, sysv, tcp
My question is: why can't my system detect the 'ud' transport even though I specified it when building UCX?
I want to enable the missing transports from the following list, but I don't see them among the available ones. How can I enable or install them?
Thank you