openucx / ucx

Unified Communication X (mailing list - https://elist.ornl.gov/mailman/listinfo/ucx-group)
http://www.openucx.org
Other
1.07k stars 409 forks source link

UCX installation done with OFED doesn't recognize cuda, cuda_cpy etc. #9950

Open RamHPC opened 3 weeks ago

RamHPC commented 3 weeks ago

Describe the bug

Installed ucx-1.16 and everything was working fine: the devices/transports recognized were in line with expectations. I then installed OFED (MLNX_OFED_LINUX-24.04-0.6.6.0-rhel8.9-x86_64), which automatically installed ucx-1.17. This version doesn't show cuda, cuda_copy and gdr_copy as devices/transports.

Steps to Reproduce

$ ucx_info -b

define UCX_CONFIG_H

define ENABLE_BUILTIN_MEMCPY 1

define ENABLE_DEBUG_DATA 0

define ENABLE_MT 1

define ENABLE_PARAMS_CHECK 0

define HAVE_1_ARG_BFD_SECTION_SIZE 0

define HAVE_ALLOCA 1

define HAVE_ALLOCA_H 1

define HAVE_ATTRIBUTE_NOOPTIMIZE 1

define HAVE_CLEARENV 1

define HAVE_CPLUS_DEMANGLE 1

define HAVE_CPU_SET_T 1

define HAVE_CUDA 1

define HAVE_CUDA_H 1

define HAVE_CUDA_RUNTIME_H 1

define HAVE_DC_DV 1

define HAVE_DECL_ASPRINTF 1

define HAVE_DECL_BASENAME 1

define HAVE_DECL_BFD_GET_SECTION_FLAGS 1

define HAVE_DECL_BFD_GET_SECTION_VMA 1

define HAVE_DECL_BFD_SECTION_FLAGS 0

define HAVE_DECL_BFD_SECTION_VMA 1

define HAVE_DECL_CPU_ISSET 1

define HAVE_DECL_CPU_ZERO 1

define HAVE_DECL_ETHTOOL_CMD_SPEED 1

define HAVE_DECL_FMEMOPEN 1

define HAVE_DECL_F_SETOWN_EX 1

define HAVE_DECL_GDR_COPY_TO_MAPPING 1

define HAVE_DECL_GETAUXVAL 1

define HAVE_DECL_IBV_ACCESS_ON_DEMAND 1

define HAVE_DECL_IBV_ACCESS_RELAXED_ORDERING 1

define HAVE_DECL_IBV_ADVISE_MR 1

define HAVE_DECL_IBV_ALLOC_DM 1

define HAVE_DECL_IBV_ALLOC_TD 1

define HAVE_DECL_IBV_CREATE_CQ_ATTR_IGNORE_OVERRUN 1

define HAVE_DECL_IBV_CREATE_CQ_EX 1

define HAVE_DECL_IBV_CREATE_QP_EX 1

define HAVE_DECL_IBV_CREATE_SRQ 1

define HAVE_DECL_IBV_CREATE_SRQ_EX 1

define HAVE_DECL_IBV_EVENT_GID_CHANGE 1

define HAVE_DECL_IBV_EVENT_TYPE_STR 1

define HAVE_DECL_IBV_GET_ASYNC_EVENT 1

define HAVE_DECL_IBV_GET_DEVICE_NAME 1

define HAVE_DECL_IBV_LINK_LAYER_ETHERNET 1

define HAVE_DECL_IBV_LINK_LAYER_INFINIBAND 1

define HAVE_DECL_IBV_QPF_GRH_REQUIRED 1

define HAVE_DECL_IBV_QUERY_DEVICE_EX 1

define HAVE_DECL_IBV_QUERY_GID 1

define HAVE_DECL_IBV_REG_DMABUF_MR 1

define HAVE_DECL_IBV_SET_ECE 1

define HAVE_DECL_IBV_TRANSPORT_UNSPECIFIED 1

define HAVE_DECL_IBV_TRANSPORT_USNIC 1

define HAVE_DECL_IBV_TRANSPORT_USNIC_UDP 1

define HAVE_DECL_IBV_WC_STATUS_STR 1

define HAVE_DECL_INOTIFY_ADD_WATCH 1

define HAVE_DECL_INOTIFY_INIT 1

define HAVE_DECL_IN_ATTRIB 1

define HAVE_DECL_IPPROTO_TCP 1

define HAVE_DECL_MADV_FREE 1

define HAVE_DECL_MADV_REMOVE 1

define HAVE_DECL_MLX5DV_CQ_INIT_ATTR_MASK_COMPRESSED_CQE 1

define HAVE_DECL_MLX5DV_CQ_INIT_ATTR_MASK_CQE_SIZE 1

define HAVE_DECL_MLX5DV_CREATE_QP 1

define HAVE_DECL_MLX5DV_DCTYPE_DCT 1

define HAVE_DECL_MLX5DV_DEVX_SUBSCRIBE_DEVX_EVENT 1

define HAVE_DECL_MLX5DV_DEVX_UMEM_REG_EX 1

define HAVE_DECL_MLX5DV_INIT_OBJ 1

define HAVE_DECL_MLX5DV_IS_SUPPORTED 1

define HAVE_DECL_MLX5DV_OBJ_AH 1

define HAVE_DECL_MLX5DV_QP_CREATE_ALLOW_SCATTER_TO_CQE 1

define HAVE_DECL_MLX5DV_UAR_ALLOC_TYPE_BF 1

define HAVE_DECL_MLX5DV_UAR_ALLOC_TYPE_NC 1

define HAVE_DECL_POSIX_MADV_DONTNEED 1

define HAVE_DECL_PR_SET_PTRACER 1

define HAVE_DECL_SOL_SOCKET 1

define HAVE_DECL_SO_KEEPALIVE 1

define HAVE_DECL_SPEED_UNKNOWN 1

define HAVE_DECL_STRERROR_R 1

define HAVE_DECL_SYS_BRK 1

define HAVE_DECL_SYS_IPC 0

define HAVE_DECL_SYS_MADVISE 1

define HAVE_DECL_SYS_MMAP 1

define HAVE_DECL_SYS_MREMAP 1

define HAVE_DECL_SYS_MUNMAP 1

define HAVE_DECL_SYS_SHMAT 1

define HAVE_DECL_SYS_SHMDT 1

define HAVE_DECL_TCP_KEEPCNT 1

define HAVE_DECL_TCP_KEEPIDLE 1

define HAVE_DECL_TCP_KEEPINTVL 1

define HAVE_DECL___PPC_GET_TIMEBASE 0

define HAVE_DECL___PPC_GET_TIMEBASE_FREQ 0

define HAVE_DETAILED_BACKTRACE 1

define HAVE_DEVX 1

define HAVE_DLFCN_H 1

define HAVE_GDRAPI_H 1

define HAVE_HW_TIMER 1

define HAVE_IB 1

define HAVE_IBV_DM 1

define HAVE_IN6_ADDR_S6_ADDR32 1

define HAVE_INFINIBAND_MLX5DV_H 1

define HAVE_INOTIFY 1

define HAVE_INTTYPES_H 1

define HAVE_IP_IP_DST 1

define HAVE_LIBGEN_H 1

define HAVE_LIBRT 1

define HAVE_LINUX_FUTEX_H 1

define HAVE_LINUX_IP_H 1

define HAVE_LINUX_MMAN_H 1

define HAVE_MALLOC_H 1

define HAVE_MALLOC_HOOK 1

define HAVE_MALLOC_TRIM 1

define HAVE_MEMALIGN 1

define HAVE_MEMORY_H 1

define HAVE_MLX5_DV 1

define HAVE_MLX5_HW_UD 1

define HAVE_MREMAP 1

define HAVE_NETINET_IP_H 1

define HAVE_NET_ETHERNET_H 1

define HAVE_NVML_H 1

define HAVE_POSIX_MEMALIGN 1

define HAVE_PREFETCH 1

define HAVE_SCHED_GETAFFINITY 1

define HAVE_SCHED_SETAFFINITY 1

define HAVE_SIGACTION_SA_RESTORER 1

define HAVE_SIGEVENT_SIGEV_UN_TID 1

define HAVE_SIGHANDLER_T 1

define HAVE_STDINT_H 1

define HAVE_STDLIB_H 1

define HAVE_STRERROR_R 1

define HAVE_STRINGS_H 1

define HAVE_STRING_H 1

define HAVE_STRUCT_DL_PHDR_INFO 1

define HAVE_STRUCT_IBV_DEVICE_ATTR_EX_ODP_CAPS 1

define HAVE_STRUCT_IBV_DEVICE_ATTR_EX_PCI_ATOMIC_CAPS 1

define HAVE_STRUCT_IBV_TM_CAPS_FLAGS 1

define HAVE_STRUCT_MLX5DV_CQ_CQ_UAR 1

define HAVE_SYS_EPOLL_H 1

define HAVE_SYS_EVENTFD_H 1

define HAVE_SYS_STAT_H 1

define HAVE_SYS_TYPES_H 1

define HAVE_SYS_UIO_H 1

define HAVE_TL_DC 1

define HAVE_TL_RC 1

define HAVE_TL_UD 1

define HAVE_UCM_PTMALLOC286 1

define HAVE_UNISTD_H 1

define HAVE___CLEAR_CACHE 1

define HAVE___CURBRK 1

define HAVE___SIGHANDLER_T 1

define IBV_HW_TM 1

define LT_OBJDIR ".libs/"

define NVALGRIND 1

define PACKAGE "ucx"

define PACKAGE_BUGREPORT ""

define PACKAGE_NAME "ucx"

define PACKAGE_STRING "ucx 1.17"

define PACKAGE_TARNAME "ucx"

define PACKAGE_URL ""

define PACKAGE_VERSION "1.17"

define STDC_HEADERS 1

define STRERROR_R_CHAR_P 1

define UCM_BISTRO_HOOKS 1

define UCS_MAX_LOG_LEVEL UCS_LOG_LEVEL_DEBUG

define UCT_TCP_EP_KEEPALIVE 1

define UCT_UD_EP_DEBUG_HOOKS 0

define UCX_CONFIGURE_FLAGS "--build=x86_64-redhat-linux-gnu --host=x86_64-redhat-linux-gnu --program-prefix= --disable-dependency-tracking --prefix=/usr --exec-prefix=/usr --bindir=/usr/bin --sbindir=/usr/sbin --sysconfdir=/etc --datadir=/usr/share --includedir=/usr/include --libdir=/usr/lib64 --libexecdir=/usr/libexec --localstatedir=/var --sharedstatedir=/var/lib --mandir=/usr/share/man --infodir=/usr/share/info --disable-optimizations --disable-logging --disable-debug --disable-assertions --enable-mt --disable-params-check --without-go --without-java --enable-cma --with-cuda --with-gdrcopy --with-verbs --with-knem --with-rdmacm --without-rocm --with-xpmem --without-fuse3 --without-ugni --without-mad --without-ze --with-cuda=/usr/local/cuda-12.4"

define UCX_MODULE_SUBDIR "ucx"

define VERSION "1.17"

define restrict __restrict

define test_MODULES ":module"

define ucm_MODULES ":cuda"

define ucs_MODULES ""

define uct_MODULES ":cuda:ib:rdmacm:cma:knem:xpmem"

define uct_cuda_MODULES ":gdrcopy"

define uct_ib_MODULES ""

define uct_rocm_MODULES ""

define ucx_perftest_MODULES ":cuda"

$ ucx_info -d #

Memory domain: self

Component: self

register: unlimited, cost: 0 nsec

remote key: 0 bytes

rkey_ptr is supported

memory types: host (access,reg_nonblock,reg,cache)

#

Transport: self

Device: memory

Type: loopback

System device:

#

capabilities:

bandwidth: 0.00/ppn + 19360.00 MB/sec

latency: 0 nsec

overhead: 10 nsec

put_short: <= 4294967295

put_bcopy: unlimited

get_bcopy: unlimited

am_short: <= 8K

am_bcopy: <= 8K

domain: cpu

atomic_add: 32, 64 bit

atomic_and: 32, 64 bit

atomic_or: 32, 64 bit

atomic_xor: 32, 64 bit

atomic_fadd: 32, 64 bit

atomic_fand: 32, 64 bit

atomic_for: 32, 64 bit

atomic_fxor: 32, 64 bit

atomic_swap: 32, 64 bit

atomic_cswap: 32, 64 bit

connection: to iface

device priority: 0

device num paths: 1

max eps: inf

device address: 0 bytes

iface address: 8 bytes

error handling: ep_check

# #

Memory domain: tcp

Component: tcp

register: unlimited, cost: 0 nsec

remote key: 0 bytes

memory types: host (access,reg_nonblock,reg,cache)

#

Transport: tcp

Device: ens21f0

Type: network

System device: ens21f0 (0)

#

capabilities:

bandwidth: 113.16/ppn + 0.00 MB/sec

latency: 5776 nsec

overhead: 50000 nsec

put_zcopy: <= 18446744073709551590, up to 6 iov

put_opt_zcopy_align: <= 1

put_align_mtu: <= 0

am_short: <= 8K

am_bcopy: <= 8K

am_zcopy: <= 64K, up to 6 iov

am_opt_zcopy_align: <= 1

am_align_mtu: <= 0

am header: <= 8037

connection: to ep, to iface

device priority: 0

device num paths: 1

max eps: 256

device address: 6 bytes

iface address: 2 bytes

ep address: 10 bytes

error handling: peer failure, ep_check, keepalive

#

Transport: tcp

Device: ens21f1

Type: network

System device: ens21f1 (1)

#

capabilities:

bandwidth: 113.16/ppn + 0.00 MB/sec

latency: 5776 nsec

overhead: 50000 nsec

put_zcopy: <= 18446744073709551590, up to 6 iov

put_opt_zcopy_align: <= 1

put_align_mtu: <= 0

am_short: <= 8K

am_bcopy: <= 8K

am_zcopy: <= 64K, up to 6 iov

am_opt_zcopy_align: <= 1

am_align_mtu: <= 0

am header: <= 8037

connection: to ep, to iface

device priority: 0

device num paths: 1

max eps: 256

device address: 6 bytes

iface address: 2 bytes

ep address: 10 bytes

error handling: peer failure, ep_check, keepalive

#

Transport: tcp

Device: ib0

Type: network

System device: ib0 (2)

#

capabilities:

bandwidth: 2200.00/ppn + 0.00 MB/sec

latency: 5203 nsec

overhead: 50000 nsec

put_zcopy: <= 18446744073709551590, up to 6 iov

put_opt_zcopy_align: <= 1

put_align_mtu: <= 0

am_short: <= 8K

am_bcopy: <= 8K

am_zcopy: <= 64K, up to 6 iov

am_opt_zcopy_align: <= 1

am_align_mtu: <= 0

am header: <= 8037

connection: to ep, to iface

device priority: 0

device num paths: 1

max eps: 256

device address: 6 bytes

iface address: 2 bytes

ep address: 10 bytes

error handling: peer failure, ep_check, keepalive

#

Transport: tcp

Device: ib1

Type: network

System device: ib1 (3)

#

capabilities:

bandwidth: 2200.00/ppn + 0.00 MB/sec

latency: 5203 nsec

overhead: 50000 nsec

put_zcopy: <= 18446744073709551590, up to 6 iov

put_opt_zcopy_align: <= 1

put_align_mtu: <= 0

am_short: <= 8K

am_bcopy: <= 8K

am_zcopy: <= 64K, up to 6 iov

am_opt_zcopy_align: <= 1

am_align_mtu: <= 0

am header: <= 8037

connection: to ep, to iface

device priority: 0

device num paths: 1

max eps: 256

device address: 6 bytes

iface address: 2 bytes

ep address: 10 bytes

error handling: peer failure, ep_check, keepalive

#

Transport: tcp

Device: lo

Type: network

System device:

#

capabilities:

bandwidth: 11.91/ppn + 0.00 MB/sec

latency: 10960 nsec

overhead: 50000 nsec

put_zcopy: <= 18446744073709551590, up to 6 iov

put_opt_zcopy_align: <= 1

put_align_mtu: <= 0

am_short: <= 8K

am_bcopy: <= 8K

am_zcopy: <= 64K, up to 6 iov

am_opt_zcopy_align: <= 1

am_align_mtu: <= 0

am header: <= 8037

connection: to ep, to iface

device priority: 1

device num paths: 1

max eps: 256

device address: 18 bytes

iface address: 2 bytes

ep address: 10 bytes

error handling: peer failure, ep_check, keepalive

# #

Connection manager: tcp

max_conn_priv: 2064 bytes

#

Memory domain: sysv

Component: sysv

allocate: unlimited

remote key: 12 bytes

rkey_ptr is supported

memory types: host (access,alloc,cache)

#

Transport: sysv

Device: memory

Type: intra-node

System device:

#

capabilities:

bandwidth: 0.00/ppn + 15360.00 MB/sec

latency: 80 nsec

overhead: 10 nsec

put_short: <= 4294967295

put_bcopy: unlimited

get_bcopy: unlimited

am_short: <= 100

am_bcopy: <= 8256

domain: cpu

atomic_add: 32, 64 bit

atomic_and: 32, 64 bit

atomic_or: 32, 64 bit

atomic_xor: 32, 64 bit

atomic_fadd: 32, 64 bit

atomic_fand: 32, 64 bit

atomic_for: 32, 64 bit

atomic_fxor: 32, 64 bit

atomic_swap: 32, 64 bit

atomic_cswap: 32, 64 bit

connection: to iface

device priority: 0

device num paths: 1

max eps: inf

device address: 8 bytes

iface address: 8 bytes

error handling: ep_check

# #

Memory domain: posix

Component: posix

allocate: <= 263740988K

remote key: 24 bytes

rkey_ptr is supported

memory types: host (access,alloc,cache)

#

Transport: posix

Device: memory

Type: intra-node

System device:

#

capabilities:

bandwidth: 0.00/ppn + 15360.00 MB/sec

latency: 80 nsec

overhead: 10 nsec

put_short: <= 4294967295

put_bcopy: unlimited

get_bcopy: unlimited

am_short: <= 100

am_bcopy: <= 8256

domain: cpu

atomic_add: 32, 64 bit

atomic_and: 32, 64 bit

atomic_or: 32, 64 bit

atomic_xor: 32, 64 bit

atomic_fadd: 32, 64 bit

atomic_fand: 32, 64 bit

atomic_for: 32, 64 bit

atomic_fxor: 32, 64 bit

atomic_swap: 32, 64 bit

atomic_cswap: 32, 64 bit

connection: to iface

device priority: 0

device num paths: 1

max eps: inf

device address: 8 bytes

iface address: 8 bytes

error handling: ep_check

# #

Memory domain: mlx5_0

Component: ib

register: unlimited, dmabuf, cost: 16000 + 0.060 * N nsec

remote key: 8 bytes

local memory handle is required for zcopy

memory invalidation is supported

memory types: host (access,reg,cache)

#

Transport: dc_mlx5

Device: mlx5_0:1

Type: network

System device: mlx5_0 (2)

#

capabilities:

bandwidth: 23588.47/ppn + 0.00 MB/sec

latency: 660 nsec

overhead: 40 nsec

put_short: <= 2K

put_bcopy: <= 8256

put_zcopy: <= 1G, up to 11 iov

put_opt_zcopy_align: <= 512

put_align_mtu: <= 4K

get_bcopy: <= 8256

get_zcopy: 65..1G, up to 11 iov

get_opt_zcopy_align: <= 512

get_align_mtu: <= 4K

am_short: <= 2046

am_bcopy: <= 8254

am_zcopy: <= 8254, up to 3 iov

am_opt_zcopy_align: <= 512

am_align_mtu: <= 4K

am header: <= 138

domain: device

atomic_add: 32, 64 bit

atomic_and: 32, 64 bit

atomic_or: 32, 64 bit

atomic_xor: 32, 64 bit

atomic_fadd: 32, 64 bit

atomic_fand: 32, 64 bit

atomic_for: 32, 64 bit

atomic_fxor: 32, 64 bit

atomic_swap: 32, 64 bit

atomic_cswap: 32, 64 bit

connection: to iface

device priority: 50

device num paths: 1

max eps: inf

device address: 3 bytes

iface address: 7 bytes

error handling: buffer (zcopy), remote access, peer failure, ep_check

# #

Transport: rc_verbs

Device: mlx5_0:1

Type: network

System device: mlx5_0 (2)

#

capabilities:

bandwidth: 23588.47/ppn + 0.00 MB/sec

latency: 600 + 1.000 * N nsec

overhead: 75 nsec

put_short: <= 124

put_bcopy: <= 8256

put_zcopy: <= 1G, up to 5 iov

put_opt_zcopy_align: <= 512

put_align_mtu: <= 4K

get_bcopy: <= 8256

get_zcopy: 65..1G, up to 5 iov

get_opt_zcopy_align: <= 512

get_align_mtu: <= 4K

am_short: <= 123

am_bcopy: <= 8255

am_zcopy: <= 8255, up to 4 iov

am_opt_zcopy_align: <= 512

am_align_mtu: <= 4K

am header: <= 127

domain: device

atomic_add: 64 bit

atomic_fadd: 64 bit

atomic_cswap: 64 bit

connection: to ep

device priority: 50

device num paths: 1

max eps: 256

device address: 3 bytes

ep address: 7 bytes

error handling: peer failure, ep_check

# #

Transport: rc_mlx5

Device: mlx5_0:1

Type: network

System device: mlx5_0 (2)

#

capabilities:

bandwidth: 23588.47/ppn + 0.00 MB/sec

latency: 600 + 1.000 * N nsec

overhead: 40 nsec

put_short: <= 2K

put_bcopy: <= 8256

put_zcopy: <= 1G, up to 14 iov

put_opt_zcopy_align: <= 512

put_align_mtu: <= 4K

get_bcopy: <= 8256

get_zcopy: 65..1G, up to 14 iov

get_opt_zcopy_align: <= 512

get_align_mtu: <= 4K

am_short: <= 2046

am_bcopy: <= 8254

am_zcopy: <= 8254, up to 3 iov

am_opt_zcopy_align: <= 512

am_align_mtu: <= 4K

am header: <= 186

domain: device

atomic_add: 32, 64 bit

atomic_and: 32, 64 bit

atomic_or: 32, 64 bit

atomic_xor: 32, 64 bit

atomic_fadd: 32, 64 bit

atomic_fand: 32, 64 bit

atomic_for: 32, 64 bit

atomic_fxor: 32, 64 bit

atomic_swap: 32, 64 bit

atomic_cswap: 32, 64 bit

connection: to ep

device priority: 50

device num paths: 1

max eps: 256

device address: 3 bytes

ep address: 10 bytes

error handling: buffer (zcopy), remote access, peer failure, ep_check

# #

Transport: ud_verbs

Device: mlx5_0:1

Type: network

System device: mlx5_0 (2)

#

capabilities:

bandwidth: 23588.47/ppn + 0.00 MB/sec

latency: 630 nsec

overhead: 105 nsec

am_short: <= 116

am_bcopy: <= 4088

am_zcopy: <= 4088, up to 5 iov

am_opt_zcopy_align: <= 512

am_align_mtu: <= 4K

am header: <= 3992

connection: to ep, to iface

device priority: 50

device num paths: 1

max eps: inf

device address: 3 bytes

iface address: 3 bytes

ep address: 6 bytes

error handling: peer failure, ep_check

# #

Transport: ud_mlx5

Device: mlx5_0:1

Type: network

System device: mlx5_0 (2)

#

capabilities:

bandwidth: 23588.47/ppn + 0.00 MB/sec

latency: 630 nsec

overhead: 80 nsec

am_short: <= 180

am_bcopy: <= 4088

am_zcopy: <= 4088, up to 3 iov

am_opt_zcopy_align: <= 512

am_align_mtu: <= 4K

am header: <= 132

connection: to ep, to iface

device priority: 50

device num paths: 1

max eps: inf

device address: 3 bytes

iface address: 3 bytes

ep address: 6 bytes

error handling: peer failure, ep_check

# #

Memory domain: mlx5_1

Component: ib

register: unlimited, dmabuf, cost: 16000 + 0.060 * N nsec

remote key: 8 bytes

local memory handle is required for zcopy

memory invalidation is supported

memory types: host (access,reg,cache)

#

Transport: dc_mlx5

Device: mlx5_1:1

Type: network

System device: mlx5_1 (3)

#

capabilities:

bandwidth: 23588.47/ppn + 0.00 MB/sec

latency: 660 nsec

overhead: 40 nsec

put_short: <= 2K

put_bcopy: <= 8256

put_zcopy: <= 1G, up to 11 iov

put_opt_zcopy_align: <= 512

put_align_mtu: <= 4K

get_bcopy: <= 8256

get_zcopy: 65..1G, up to 11 iov

get_opt_zcopy_align: <= 512

get_align_mtu: <= 4K

am_short: <= 2046

am_bcopy: <= 8254

am_zcopy: <= 8254, up to 3 iov

am_opt_zcopy_align: <= 512

am_align_mtu: <= 4K

am header: <= 138

domain: device

atomic_add: 32, 64 bit

atomic_and: 32, 64 bit

atomic_or: 32, 64 bit

atomic_xor: 32, 64 bit

atomic_fadd: 32, 64 bit

atomic_fand: 32, 64 bit

atomic_for: 32, 64 bit

atomic_fxor: 32, 64 bit

atomic_swap: 32, 64 bit

atomic_cswap: 32, 64 bit

connection: to iface

device priority: 50

device num paths: 1

max eps: inf

device address: 3 bytes

iface address: 7 bytes

error handling: buffer (zcopy), remote access, peer failure, ep_check

# #

Transport: rc_verbs

Device: mlx5_1:1

Type: network

System device: mlx5_1 (3)

#

capabilities:

bandwidth: 23588.47/ppn + 0.00 MB/sec

latency: 600 + 1.000 * N nsec

overhead: 75 nsec

put_short: <= 124

put_bcopy: <= 8256

put_zcopy: <= 1G, up to 5 iov

put_opt_zcopy_align: <= 512

put_align_mtu: <= 4K

get_bcopy: <= 8256

get_zcopy: 65..1G, up to 5 iov

get_opt_zcopy_align: <= 512

get_align_mtu: <= 4K

am_short: <= 123

am_bcopy: <= 8255

am_zcopy: <= 8255, up to 4 iov

am_opt_zcopy_align: <= 512

am_align_mtu: <= 4K

am header: <= 127

domain: device

atomic_add: 64 bit

atomic_fadd: 64 bit

atomic_cswap: 64 bit

connection: to ep

device priority: 50

device num paths: 1

max eps: 256

device address: 3 bytes

ep address: 7 bytes

error handling: peer failure, ep_check

# #

Transport: rc_mlx5

Device: mlx5_1:1

Type: network

System device: mlx5_1 (3)

#

capabilities:

bandwidth: 23588.47/ppn + 0.00 MB/sec

latency: 600 + 1.000 * N nsec

overhead: 40 nsec

put_short: <= 2K

put_bcopy: <= 8256

put_zcopy: <= 1G, up to 14 iov

put_opt_zcopy_align: <= 512

put_align_mtu: <= 4K

get_bcopy: <= 8256

get_zcopy: 65..1G, up to 14 iov

get_opt_zcopy_align: <= 512

get_align_mtu: <= 4K

am_short: <= 2046

am_bcopy: <= 8254

am_zcopy: <= 8254, up to 3 iov

am_opt_zcopy_align: <= 512

am_align_mtu: <= 4K

am header: <= 186

domain: device

atomic_add: 32, 64 bit

atomic_and: 32, 64 bit

atomic_or: 32, 64 bit

atomic_xor: 32, 64 bit

atomic_fadd: 32, 64 bit

atomic_fand: 32, 64 bit

atomic_for: 32, 64 bit

atomic_fxor: 32, 64 bit

atomic_swap: 32, 64 bit

atomic_cswap: 32, 64 bit

connection: to ep

device priority: 50

device num paths: 1

max eps: 256

device address: 3 bytes

ep address: 10 bytes

error handling: buffer (zcopy), remote access, peer failure, ep_check

# #

Transport: ud_verbs

Device: mlx5_1:1

Type: network

System device: mlx5_1 (3)

#

capabilities:

bandwidth: 23588.47/ppn + 0.00 MB/sec

latency: 630 nsec

overhead: 105 nsec

am_short: <= 116

am_bcopy: <= 4088

am_zcopy: <= 4088, up to 5 iov

am_opt_zcopy_align: <= 512

am_align_mtu: <= 4K

am header: <= 3992

connection: to ep, to iface

device priority: 50

device num paths: 1

max eps: inf

device address: 3 bytes

iface address: 3 bytes

ep address: 6 bytes

error handling: peer failure, ep_check

# #

Transport: ud_mlx5

Device: mlx5_1:1

Type: network

System device: mlx5_1 (3)

#

capabilities:

bandwidth: 23588.47/ppn + 0.00 MB/sec

latency: 630 nsec

overhead: 80 nsec

am_short: <= 180

am_bcopy: <= 4088

am_zcopy: <= 4088, up to 3 iov

am_opt_zcopy_align: <= 512

am_align_mtu: <= 4K

am header: <= 132

connection: to ep, to iface

device priority: 50

device num paths: 1

max eps: inf

device address: 3 bytes

iface address: 3 bytes

ep address: 6 bytes

error handling: peer failure, ep_check

# #

Memory domain: mlx5_0

Component: gga

register: unlimited, dmabuf, cost: 16000 + 0.060 * N nsec

remote key: 8 bytes

local memory handle is required for zcopy

memory invalidation is supported

memory types: host (access,reg,cache)

< no supported devices found >

#

Memory domain: mlx5_1

Component: gga

register: unlimited, dmabuf, cost: 16000 + 0.060 * N nsec

remote key: 8 bytes

local memory handle is required for zcopy

memory invalidation is supported

memory types: host (access,reg,cache)

< no supported devices found >

#

Connection manager: rdmacm

max_conn_priv: 54 bytes

#

Memory domain: cma

Component: cma

register: unlimited, cost: 9 nsec

memory types: host (access,reg_nonblock,reg,cache)

#

Transport: cma

Device: memory

Type: intra-node

System device:

#

capabilities:

bandwidth: 0.00/ppn + 11145.00 MB/sec

latency: 80 nsec

overhead: 2000 nsec

put_zcopy: unlimited, up to 16 iov

put_opt_zcopy_align: <= 1

put_align_mtu: <= 1

get_zcopy: unlimited, up to 16 iov

get_opt_zcopy_align: <= 1

get_align_mtu: <= 1

connection: to iface

device priority: 0

device num paths: 1

max eps: inf

device address: 8 bytes

iface address: 4 bytes

error handling: peer failure, ep_check

# #

Memory domain: knem

Component: knem

register: unlimited, cost: 1200 + 0.007 * N nsec

remote key: 16 bytes

memory types: host (access,reg,cache)

#

Transport: knem

Device: memory

Type: intra-node

System device:

#

capabilities:

bandwidth: 0.00/ppn + 13862.00 MB/sec

latency: 80 nsec

overhead: 2000 nsec

put_zcopy: unlimited, up to 16 iov

put_opt_zcopy_align: <= 1

put_align_mtu: <= 1

get_zcopy: unlimited, up to 16 iov

get_opt_zcopy_align: <= 1

get_align_mtu: <= 1

connection: to iface

device priority: 0

device num paths: 1

max eps: inf

device address: 8 bytes

iface address: 0 bytes

error handling: none

# #

Memory domain: xpmem

Component: xpmem

register: unlimited, cost: 60 nsec

remote key: 24 bytes

rkey_ptr is supported

memory types: host (access,alloc,reg_nonblock,reg,cache)

#

Transport: xpmem

Device: memory

Type: intra-node

System device:

#

capabilities:

bandwidth: 0.00/ppn + 15360.00 MB/sec

latency: 80 nsec

overhead: 10 nsec

put_short: <= 4294967295

put_bcopy: unlimited

get_bcopy: unlimited

am_short: <= 100

am_bcopy: <= 8256

domain: cpu

atomic_add: 32, 64 bit

atomic_and: 32, 64 bit

atomic_or: 32, 64 bit

atomic_xor: 32, 64 bit

atomic_fadd: 32, 64 bit

atomic_fand: 32, 64 bit

atomic_for: 32, 64 bit

atomic_fxor: 32, 64 bit

atomic_swap: 32, 64 bit

atomic_cswap: 32, 64 bit

connection: to iface

device priority: 0

device num paths: 1

max eps: inf

device address: 8 bytes

iface address: 16 bytes

error handling: none

#

- **Any UCX environment variables used**
export UCX_TLS=ib,sm,cuda,cuda_copy,cuda_ipc,gdr_copy

### Setup and versions
- OS version (e.g Linux distro) + CPU architecture (x86_64/aarch64/ppc64le/...)
   - `cat /etc/issue` or `cat /etc/redhat-release` + `uname -a`
  Red Hat Enterprise Linux release 8.9 (Ootpa) + Linux gpu2 4.18.0-513.24.1.el8_9.x86_64 #1 SMP Thu Mar 14 14:20:09 EDT 2024 x86_64 x86_64 x86_64 GNU/Linux
   - For Nvidia Bluefield SmartNIC include `cat /etc/mlnx-release` (the string identifies software and firmware setup)
- For RDMA/IB/RoCE related issues:
    - Driver version:
        - `rpm -q rdma-core` or `rpm -q libibverbs`
        - or: MLNX_OFED version `ofed_info -s`
        $ ofed_info -s
          MLNX_OFED_LINUX-24.04-0.6.6.0:
   - HW information from `ibstat` or `ibv_devinfo -vv` command

hca_id: mlx5_1 transport: InfiniBand (0) fw_ver: 20.37.1700 node_guid: 88e9:a4ff:ff25:a45b sys_image_guid: 88e9:a4ff:ff25:a45a vendor_id: 0x02c9 vendor_part_id: 4123 hw_ver: 0x0 board_id: MT_0000000594 phys_port_cnt: 1 max_mr_size: 0xffffffffffffffff page_size_cap: 0xfffffffffffff000 max_qp: 131072 max_qp_wr: 32768 device_cap_flags: 0x21361c36 BAD_PKEY_CNTR BAD_QKEY_CNTR AUTO_PATH_MIG CHANGE_PHY_PORT PORT_ACTIVE_EVENT SYS_IMAGE_GUID RC_RNR_NAK_GEN MEM_WINDOW UD_IP_CSUM XRC MEM_MGT_EXTENSIONS MEM_WINDOW_TYPE_2B MANAGED_FLOW_STEERING max_sge: 30 max_sge_rd: 30 max_cq: 16777216 max_cqe: 4194303 max_mr: 16777216 max_pd: 8388608 max_qp_rd_atom: 16 max_ee_rd_atom: 0 max_res_rd_atom: 2097152 max_qp_init_rd_atom: 16 max_ee_init_rd_atom: 0 atomic_cap: ATOMIC_HCA (1) max_ee: 0 max_rdd: 0 max_mw: 16777216 max_raw_ipv6_qp: 0 max_raw_ethy_qp: 0 max_mcast_grp: 2097152 max_mcast_qp_attach: 240 max_total_mcast_qp_attach: 503316480 max_ah: 2147483647 max_fmr: 0 max_srq: 8388608 max_srq_wr: 32767 max_srq_sge: 31 max_pkeys: 128 local_ca_ack_delay: 16 general_odp_caps: ODP_SUPPORT ODP_SUPPORT_IMPLICIT rc_odp_caps: SUPPORT_SEND SUPPORT_RECV SUPPORT_WRITE SUPPORT_READ SUPPORT_ATOMIC SUPPORT_SRQ uc_odp_caps: NO SUPPORT ud_odp_caps: SUPPORT_SEND xrc_odp_caps: SUPPORT_SEND SUPPORT_WRITE SUPPORT_READ SUPPORT_ATOMIC SUPPORT_SRQ completion timestamp_mask: 0x7fffffffffffffff hca_core_clock: 156250kHZ device_cap_flags_ex: 0x3000005021361C36 PCI_WRITE_END_PADDING Unknown flags: 0x3000004000000000 tso_caps: max_tso: 0 rss_caps: max_rwq_indirection_tables: 0 max_rwq_indirection_table_size: 0 rx_hash_function: 0x0 rx_hash_fields_mask: 0x0 max_wq_type_rq: 0 packet_pacing_caps: qp_rate_limit_min: 0kbps qp_rate_limit_max: 0kbps max_rndv_hdr_size: 64 max_num_tags: 127 max_ops: 32768 max_sge: 1 flags: IBV_TM_CAP_RC

cq moderation caps:
    max_cq_count:   65535
    max_cq_period:  4095 us

maximum available device memory:    131072Bytes

num_comp_vectors:       63
    port:   1
        state:          PORT_ACTIVE (4)
        max_mtu:        4096 (5)
        active_mtu:     4096 (5)
        sm_lid:         1
        port_lid:       6
        port_lmc:       0x00
        link_layer:     InfiniBand
        max_msg_sz:     0x40000000
        port_cap_flags:     0xa259e848
        port_cap_flags2:    0x0032
        max_vl_num:     4 (3)
        bad_pkey_cntr:      0x0
        qkey_viol_cntr:     0x0
        sm_sl:          0
        pkey_tbl_len:       128
        gid_tbl_len:        8
        subnet_timeout:     18
        init_type_reply:    0
        active_width:       4X (2)
        active_speed:       50.0 Gbps (64)
        phys_state:     LINK_UP (5)
        GID[  0]:       fe80:0000:0000:0000:88e9:a4ff:ff25:a45b


- For GPU related issues:
  - GPU type
  - Nvidia A100
  - Cuda: 
      - Drivers version
      Driver Version: 555.42.02
      - Check if peer-direct is loaded: `lsmod|grep nv_peer_mem` and/or gdrcopy: `lsmod|grep gdrdrv`
      - nv_peer_mem has to be loaded manually; it was not loaded when the problem occurred.
      $ lsmod|grep gdrdrv
gdrdrv                 24576  0
nvidia               8691712  365 nvidia_uvm,nvidia_fs,gdrdrv,nvidia_modeset

### Additional information (depending on the issue)
- OpenMPI version
- 5.0.3
- Output of `ucx_info -d` to show transports and devices recognized by UCX
- Configure result - config.log
- Log file - configure UCX with "--enable-logging" - and run with "UCX_LOG_LEVEL=data"
yosefe commented 3 weeks ago

@RamHPC Probably the UCX coming with MLNX_OFED bundle does not include the compiled-in cuda and gdrcopy support. Currently MLNX_OFED includes ucx/cuda support for most, but not all operating systems. I'd suggest to keep using UCX from GitHub distribution, that is to uninstall all ucx-* RPMs that came from MLNX_OFED.

RamHPC commented 2 weeks ago

> @RamHPC Probably the UCX coming with MLNX_OFED bundle does not include the compiled-in cuda and gdrcopy support. Currently MLNX_OFED includes ucx/cuda support for most, but not all operating systems. I'd suggest to keep using UCX from GitHub distribution, that is to uninstall all ucx-* RPMs that came from MLNX_OFED.

Thank you! Will give this a try. Judging by the configure flags of the UCX that MLNX_OFED installed, it looks like cuda and gdr_copy support was included, but somehow the transports don't show up. Is there no way to re-calibrate the existing installation?