Open RamHPC opened 3 weeks ago
@RamHPC The UCX that comes with the MLNX_OFED bundle probably does not include compiled-in cuda and gdrcopy support. Currently MLNX_OFED includes ucx/cuda support for most, but not all, operating systems. I'd suggest continuing to use UCX from the GitHub distribution, that is, uninstalling all ucx-* RPMs that came from MLNX_OFED.
@RamHPC The UCX that comes with the MLNX_OFED bundle probably does not include compiled-in cuda and gdrcopy support. Currently MLNX_OFED includes ucx/cuda support for most, but not all, operating systems. I'd suggest continuing to use UCX from the GitHub distribution, that is, uninstalling all ucx-* RPMs that came from MLNX_OFED.
Thank you! I will give this a try. When I look at the configure flags of the UCX installed by MLNX_OFED, it looks like cuda and gdrcopy were included, but somehow the transports don't show up. Is there no way to reconfigure the existing installation?
Describe the bug
Installed ucx-1.16 and everything was working fine. The devices/transports recognized were in line with expectations. Then installed OFED (MLNX_OFED_LINUX-24.04-0.6.6.0-rhel8.9-x86_64), which automatically installed ucx-1.17. This version doesn't show cuda, cuda_copy and gdr_copy as devices/transports.
Steps to Reproduce
ucx_info -v
$ ucx_info -b
define UCX_CONFIG_H
define ENABLE_BUILTIN_MEMCPY 1
define ENABLE_DEBUG_DATA 0
define ENABLE_MT 1
define ENABLE_PARAMS_CHECK 0
define HAVE_1_ARG_BFD_SECTION_SIZE 0
define HAVE_ALLOCA 1
define HAVE_ALLOCA_H 1
define HAVE_ATTRIBUTE_NOOPTIMIZE 1
define HAVE_CLEARENV 1
define HAVE_CPLUS_DEMANGLE 1
define HAVE_CPU_SET_T 1
define HAVE_CUDA 1
define HAVE_CUDA_H 1
define HAVE_CUDA_RUNTIME_H 1
define HAVE_DC_DV 1
define HAVE_DECL_ASPRINTF 1
define HAVE_DECL_BASENAME 1
define HAVE_DECL_BFD_GET_SECTION_FLAGS 1
define HAVE_DECL_BFD_GET_SECTION_VMA 1
define HAVE_DECL_BFD_SECTION_FLAGS 0
define HAVE_DECL_BFD_SECTION_VMA 1
define HAVE_DECL_CPU_ISSET 1
define HAVE_DECL_CPU_ZERO 1
define HAVE_DECL_ETHTOOL_CMD_SPEED 1
define HAVE_DECL_FMEMOPEN 1
define HAVE_DECL_F_SETOWN_EX 1
define HAVE_DECL_GDR_COPY_TO_MAPPING 1
define HAVE_DECL_GETAUXVAL 1
define HAVE_DECL_IBV_ACCESS_ON_DEMAND 1
define HAVE_DECL_IBV_ACCESS_RELAXED_ORDERING 1
define HAVE_DECL_IBV_ADVISE_MR 1
define HAVE_DECL_IBV_ALLOC_DM 1
define HAVE_DECL_IBV_ALLOC_TD 1
define HAVE_DECL_IBV_CREATE_CQ_ATTR_IGNORE_OVERRUN 1
define HAVE_DECL_IBV_CREATE_CQ_EX 1
define HAVE_DECL_IBV_CREATE_QP_EX 1
define HAVE_DECL_IBV_CREATE_SRQ 1
define HAVE_DECL_IBV_CREATE_SRQ_EX 1
define HAVE_DECL_IBV_EVENT_GID_CHANGE 1
define HAVE_DECL_IBV_EVENT_TYPE_STR 1
define HAVE_DECL_IBV_GET_ASYNC_EVENT 1
define HAVE_DECL_IBV_GET_DEVICE_NAME 1
define HAVE_DECL_IBV_LINK_LAYER_ETHERNET 1
define HAVE_DECL_IBV_LINK_LAYER_INFINIBAND 1
define HAVE_DECL_IBV_QPF_GRH_REQUIRED 1
define HAVE_DECL_IBV_QUERY_DEVICE_EX 1
define HAVE_DECL_IBV_QUERY_GID 1
define HAVE_DECL_IBV_REG_DMABUF_MR 1
define HAVE_DECL_IBV_SET_ECE 1
define HAVE_DECL_IBV_TRANSPORT_UNSPECIFIED 1
define HAVE_DECL_IBV_TRANSPORT_USNIC 1
define HAVE_DECL_IBV_TRANSPORT_USNIC_UDP 1
define HAVE_DECL_IBV_WC_STATUS_STR 1
define HAVE_DECL_INOTIFY_ADD_WATCH 1
define HAVE_DECL_INOTIFY_INIT 1
define HAVE_DECL_IN_ATTRIB 1
define HAVE_DECL_IPPROTO_TCP 1
define HAVE_DECL_MADV_FREE 1
define HAVE_DECL_MADV_REMOVE 1
define HAVE_DECL_MLX5DV_CQ_INIT_ATTR_MASK_COMPRESSED_CQE 1
define HAVE_DECL_MLX5DV_CQ_INIT_ATTR_MASK_CQE_SIZE 1
define HAVE_DECL_MLX5DV_CREATE_QP 1
define HAVE_DECL_MLX5DV_DCTYPE_DCT 1
define HAVE_DECL_MLX5DV_DEVX_SUBSCRIBE_DEVX_EVENT 1
define HAVE_DECL_MLX5DV_DEVX_UMEM_REG_EX 1
define HAVE_DECL_MLX5DV_INIT_OBJ 1
define HAVE_DECL_MLX5DV_IS_SUPPORTED 1
define HAVE_DECL_MLX5DV_OBJ_AH 1
define HAVE_DECL_MLX5DV_QP_CREATE_ALLOW_SCATTER_TO_CQE 1
define HAVE_DECL_MLX5DV_UAR_ALLOC_TYPE_BF 1
define HAVE_DECL_MLX5DV_UAR_ALLOC_TYPE_NC 1
define HAVE_DECL_POSIX_MADV_DONTNEED 1
define HAVE_DECL_PR_SET_PTRACER 1
define HAVE_DECL_SOL_SOCKET 1
define HAVE_DECL_SO_KEEPALIVE 1
define HAVE_DECL_SPEED_UNKNOWN 1
define HAVE_DECL_STRERROR_R 1
define HAVE_DECL_SYS_BRK 1
define HAVE_DECL_SYS_IPC 0
define HAVE_DECL_SYS_MADVISE 1
define HAVE_DECL_SYS_MMAP 1
define HAVE_DECL_SYS_MREMAP 1
define HAVE_DECL_SYS_MUNMAP 1
define HAVE_DECL_SYS_SHMAT 1
define HAVE_DECL_SYS_SHMDT 1
define HAVE_DECL_TCP_KEEPCNT 1
define HAVE_DECL_TCP_KEEPIDLE 1
define HAVE_DECL_TCP_KEEPINTVL 1
define HAVE_DECL___PPC_GET_TIMEBASE 0
define HAVE_DECL___PPC_GET_TIMEBASE_FREQ 0
define HAVE_DETAILED_BACKTRACE 1
define HAVE_DEVX 1
define HAVE_DLFCN_H 1
define HAVE_GDRAPI_H 1
define HAVE_HW_TIMER 1
define HAVE_IB 1
define HAVE_IBV_DM 1
define HAVE_IN6_ADDR_S6_ADDR32 1
define HAVE_INFINIBAND_MLX5DV_H 1
define HAVE_INOTIFY 1
define HAVE_INTTYPES_H 1
define HAVE_IP_IP_DST 1
define HAVE_LIBGEN_H 1
define HAVE_LIBRT 1
define HAVE_LINUX_FUTEX_H 1
define HAVE_LINUX_IP_H 1
define HAVE_LINUX_MMAN_H 1
define HAVE_MALLOC_H 1
define HAVE_MALLOC_HOOK 1
define HAVE_MALLOC_TRIM 1
define HAVE_MEMALIGN 1
define HAVE_MEMORY_H 1
define HAVE_MLX5_DV 1
define HAVE_MLX5_HW_UD 1
define HAVE_MREMAP 1
define HAVE_NETINET_IP_H 1
define HAVE_NET_ETHERNET_H 1
define HAVE_NVML_H 1
define HAVE_POSIX_MEMALIGN 1
define HAVE_PREFETCH 1
define HAVE_SCHED_GETAFFINITY 1
define HAVE_SCHED_SETAFFINITY 1
define HAVE_SIGACTION_SA_RESTORER 1
define HAVE_SIGEVENT_SIGEV_UN_TID 1
define HAVE_SIGHANDLER_T 1
define HAVE_STDINT_H 1
define HAVE_STDLIB_H 1
define HAVE_STRERROR_R 1
define HAVE_STRINGS_H 1
define HAVE_STRING_H 1
define HAVE_STRUCT_DL_PHDR_INFO 1
define HAVE_STRUCT_IBV_DEVICE_ATTR_EX_ODP_CAPS 1
define HAVE_STRUCT_IBV_DEVICE_ATTR_EX_PCI_ATOMIC_CAPS 1
define HAVE_STRUCT_IBV_TM_CAPS_FLAGS 1
define HAVE_STRUCT_MLX5DV_CQ_CQ_UAR 1
define HAVE_SYS_EPOLL_H 1
define HAVE_SYS_EVENTFD_H 1
define HAVE_SYS_STAT_H 1
define HAVE_SYS_TYPES_H 1
define HAVE_SYS_UIO_H 1
define HAVE_TL_DC 1
define HAVE_TL_RC 1
define HAVE_TL_UD 1
define HAVE_UCM_PTMALLOC286 1
define HAVE_UNISTD_H 1
define HAVE___CLEAR_CACHE 1
define HAVE___CURBRK 1
define HAVE___SIGHANDLER_T 1
define IBV_HW_TM 1
define LT_OBJDIR ".libs/"
define NVALGRIND 1
define PACKAGE "ucx"
define PACKAGE_BUGREPORT ""
define PACKAGE_NAME "ucx"
define PACKAGE_STRING "ucx 1.17"
define PACKAGE_TARNAME "ucx"
define PACKAGE_URL ""
define PACKAGE_VERSION "1.17"
define STDC_HEADERS 1
define STRERROR_R_CHAR_P 1
define UCM_BISTRO_HOOKS 1
define UCS_MAX_LOG_LEVEL UCS_LOG_LEVEL_DEBUG
define UCT_TCP_EP_KEEPALIVE 1
define UCT_UD_EP_DEBUG_HOOKS 0
define UCX_CONFIGURE_FLAGS "--build=x86_64-redhat-linux-gnu --host=x86_64-redhat-linux-gnu --program-prefix= --disable-dependency-tracking --prefix=/usr --exec-prefix=/usr --bindir=/usr/bin --sbindir=/usr/sbin --sysconfdir=/etc --datadir=/usr/share --includedir=/usr/include --libdir=/usr/lib64 --libexecdir=/usr/libexec --localstatedir=/var --sharedstatedir=/var/lib --mandir=/usr/share/man --infodir=/usr/share/info --disable-optimizations --disable-logging --disable-debug --disable-assertions --enable-mt --disable-params-check --without-go --without-java --enable-cma --with-cuda --with-gdrcopy --with-verbs --with-knem --with-rdmacm --without-rocm --with-xpmem --without-fuse3 --without-ugni --without-mad --without-ze --with-cuda=/usr/local/cuda-12.4"
define UCX_MODULE_SUBDIR "ucx"
define VERSION "1.17"
define restrict __restrict
define test_MODULES ":module"
define ucm_MODULES ":cuda"
define ucs_MODULES ""
define uct_MODULES ":cuda:ib:rdmacm:cma:knem:xpmem"
define uct_cuda_MODULES ":gdrcopy"
define uct_ib_MODULES ""
define uct_rocm_MODULES ""
define ucx_perftest_MODULES ":cuda"
$ ucx_info -d #
Memory domain: self
Component: self
register: unlimited, cost: 0 nsec
remote key: 0 bytes
rkey_ptr is supported
memory types: host (access,reg_nonblock,reg,cache)
#
Transport: self
Device: memory
Type: loopback
System device:
#
capabilities:
bandwidth: 0.00/ppn + 19360.00 MB/sec
latency: 0 nsec
overhead: 10 nsec
put_short: <= 4294967295
put_bcopy: unlimited
get_bcopy: unlimited
am_short: <= 8K
am_bcopy: <= 8K
domain: cpu
atomic_add: 32, 64 bit
atomic_and: 32, 64 bit
atomic_or: 32, 64 bit
atomic_xor: 32, 64 bit
atomic_fadd: 32, 64 bit
atomic_fand: 32, 64 bit
atomic_for: 32, 64 bit
atomic_fxor: 32, 64 bit
atomic_swap: 32, 64 bit
atomic_cswap: 32, 64 bit
connection: to iface
device priority: 0
device num paths: 1
max eps: inf
device address: 0 bytes
iface address: 8 bytes
error handling: ep_check
# #
Memory domain: tcp
Component: tcp
register: unlimited, cost: 0 nsec
remote key: 0 bytes
memory types: host (access,reg_nonblock,reg,cache)
#
Transport: tcp
Device: ens21f0
Type: network
System device: ens21f0 (0)
#
capabilities:
bandwidth: 113.16/ppn + 0.00 MB/sec
latency: 5776 nsec
overhead: 50000 nsec
put_zcopy: <= 18446744073709551590, up to 6 iov
put_opt_zcopy_align: <= 1
put_align_mtu: <= 0
am_short: <= 8K
am_bcopy: <= 8K
am_zcopy: <= 64K, up to 6 iov
am_opt_zcopy_align: <= 1
am_align_mtu: <= 0
am header: <= 8037
connection: to ep, to iface
device priority: 0
device num paths: 1
max eps: 256
device address: 6 bytes
iface address: 2 bytes
ep address: 10 bytes
error handling: peer failure, ep_check, keepalive
#
Transport: tcp
Device: ens21f1
Type: network
System device: ens21f1 (1)
#
capabilities:
bandwidth: 113.16/ppn + 0.00 MB/sec
latency: 5776 nsec
overhead: 50000 nsec
put_zcopy: <= 18446744073709551590, up to 6 iov
put_opt_zcopy_align: <= 1
put_align_mtu: <= 0
am_short: <= 8K
am_bcopy: <= 8K
am_zcopy: <= 64K, up to 6 iov
am_opt_zcopy_align: <= 1
am_align_mtu: <= 0
am header: <= 8037
connection: to ep, to iface
device priority: 0
device num paths: 1
max eps: 256
device address: 6 bytes
iface address: 2 bytes
ep address: 10 bytes
error handling: peer failure, ep_check, keepalive
#
Transport: tcp
Device: ib0
Type: network
System device: ib0 (2)
#
capabilities:
bandwidth: 2200.00/ppn + 0.00 MB/sec
latency: 5203 nsec
overhead: 50000 nsec
put_zcopy: <= 18446744073709551590, up to 6 iov
put_opt_zcopy_align: <= 1
put_align_mtu: <= 0
am_short: <= 8K
am_bcopy: <= 8K
am_zcopy: <= 64K, up to 6 iov
am_opt_zcopy_align: <= 1
am_align_mtu: <= 0
am header: <= 8037
connection: to ep, to iface
device priority: 0
device num paths: 1
max eps: 256
device address: 6 bytes
iface address: 2 bytes
ep address: 10 bytes
error handling: peer failure, ep_check, keepalive
#
Transport: tcp
Device: ib1
Type: network
System device: ib1 (3)
#
capabilities:
bandwidth: 2200.00/ppn + 0.00 MB/sec
latency: 5203 nsec
overhead: 50000 nsec
put_zcopy: <= 18446744073709551590, up to 6 iov
put_opt_zcopy_align: <= 1
put_align_mtu: <= 0
am_short: <= 8K
am_bcopy: <= 8K
am_zcopy: <= 64K, up to 6 iov
am_opt_zcopy_align: <= 1
am_align_mtu: <= 0
am header: <= 8037
connection: to ep, to iface
device priority: 0
device num paths: 1
max eps: 256
device address: 6 bytes
iface address: 2 bytes
ep address: 10 bytes
error handling: peer failure, ep_check, keepalive
#
Transport: tcp
Device: lo
Type: network
System device:
#
capabilities:
bandwidth: 11.91/ppn + 0.00 MB/sec
latency: 10960 nsec
overhead: 50000 nsec
put_zcopy: <= 18446744073709551590, up to 6 iov
put_opt_zcopy_align: <= 1
put_align_mtu: <= 0
am_short: <= 8K
am_bcopy: <= 8K
am_zcopy: <= 64K, up to 6 iov
am_opt_zcopy_align: <= 1
am_align_mtu: <= 0
am header: <= 8037
connection: to ep, to iface
device priority: 1
device num paths: 1
max eps: 256
device address: 18 bytes
iface address: 2 bytes
ep address: 10 bytes
error handling: peer failure, ep_check, keepalive
# #
Connection manager: tcp
max_conn_priv: 2064 bytes
#
Memory domain: sysv
Component: sysv
allocate: unlimited
remote key: 12 bytes
rkey_ptr is supported
memory types: host (access,alloc,cache)
#
Transport: sysv
Device: memory
Type: intra-node
System device:
#
capabilities:
bandwidth: 0.00/ppn + 15360.00 MB/sec
latency: 80 nsec
overhead: 10 nsec
put_short: <= 4294967295
put_bcopy: unlimited
get_bcopy: unlimited
am_short: <= 100
am_bcopy: <= 8256
domain: cpu
atomic_add: 32, 64 bit
atomic_and: 32, 64 bit
atomic_or: 32, 64 bit
atomic_xor: 32, 64 bit
atomic_fadd: 32, 64 bit
atomic_fand: 32, 64 bit
atomic_for: 32, 64 bit
atomic_fxor: 32, 64 bit
atomic_swap: 32, 64 bit
atomic_cswap: 32, 64 bit
connection: to iface
device priority: 0
device num paths: 1
max eps: inf
device address: 8 bytes
iface address: 8 bytes
error handling: ep_check
# #
Memory domain: posix
Component: posix
allocate: <= 263740988K
remote key: 24 bytes
rkey_ptr is supported
memory types: host (access,alloc,cache)
#
Transport: posix
Device: memory
Type: intra-node
System device:
#
capabilities:
bandwidth: 0.00/ppn + 15360.00 MB/sec
latency: 80 nsec
overhead: 10 nsec
put_short: <= 4294967295
put_bcopy: unlimited
get_bcopy: unlimited
am_short: <= 100
am_bcopy: <= 8256
domain: cpu
atomic_add: 32, 64 bit
atomic_and: 32, 64 bit
atomic_or: 32, 64 bit
atomic_xor: 32, 64 bit
atomic_fadd: 32, 64 bit
atomic_fand: 32, 64 bit
atomic_for: 32, 64 bit
atomic_fxor: 32, 64 bit
atomic_swap: 32, 64 bit
atomic_cswap: 32, 64 bit
connection: to iface
device priority: 0
device num paths: 1
max eps: inf
device address: 8 bytes
iface address: 8 bytes
error handling: ep_check
# #
Memory domain: mlx5_0
Component: ib
register: unlimited, dmabuf, cost: 16000 + 0.060 * N nsec
remote key: 8 bytes
local memory handle is required for zcopy
memory invalidation is supported
memory types: host (access,reg,cache)
#
Transport: dc_mlx5
Device: mlx5_0:1
Type: network
System device: mlx5_0 (2)
#
capabilities:
bandwidth: 23588.47/ppn + 0.00 MB/sec
latency: 660 nsec
overhead: 40 nsec
put_short: <= 2K
put_bcopy: <= 8256
put_zcopy: <= 1G, up to 11 iov
put_opt_zcopy_align: <= 512
put_align_mtu: <= 4K
get_bcopy: <= 8256
get_zcopy: 65..1G, up to 11 iov
get_opt_zcopy_align: <= 512
get_align_mtu: <= 4K
am_short: <= 2046
am_bcopy: <= 8254
am_zcopy: <= 8254, up to 3 iov
am_opt_zcopy_align: <= 512
am_align_mtu: <= 4K
am header: <= 138
domain: device
atomic_add: 32, 64 bit
atomic_and: 32, 64 bit
atomic_or: 32, 64 bit
atomic_xor: 32, 64 bit
atomic_fadd: 32, 64 bit
atomic_fand: 32, 64 bit
atomic_for: 32, 64 bit
atomic_fxor: 32, 64 bit
atomic_swap: 32, 64 bit
atomic_cswap: 32, 64 bit
connection: to iface
device priority: 50
device num paths: 1
max eps: inf
device address: 3 bytes
iface address: 7 bytes
error handling: buffer (zcopy), remote access, peer failure, ep_check
# #
Transport: rc_verbs
Device: mlx5_0:1
Type: network
System device: mlx5_0 (2)
#
capabilities:
bandwidth: 23588.47/ppn + 0.00 MB/sec
latency: 600 + 1.000 * N nsec
overhead: 75 nsec
put_short: <= 124
put_bcopy: <= 8256
put_zcopy: <= 1G, up to 5 iov
put_opt_zcopy_align: <= 512
put_align_mtu: <= 4K
get_bcopy: <= 8256
get_zcopy: 65..1G, up to 5 iov
get_opt_zcopy_align: <= 512
get_align_mtu: <= 4K
am_short: <= 123
am_bcopy: <= 8255
am_zcopy: <= 8255, up to 4 iov
am_opt_zcopy_align: <= 512
am_align_mtu: <= 4K
am header: <= 127
domain: device
atomic_add: 64 bit
atomic_fadd: 64 bit
atomic_cswap: 64 bit
connection: to ep
device priority: 50
device num paths: 1
max eps: 256
device address: 3 bytes
ep address: 7 bytes
error handling: peer failure, ep_check
# #
Transport: rc_mlx5
Device: mlx5_0:1
Type: network
System device: mlx5_0 (2)
#
capabilities:
bandwidth: 23588.47/ppn + 0.00 MB/sec
latency: 600 + 1.000 * N nsec
overhead: 40 nsec
put_short: <= 2K
put_bcopy: <= 8256
put_zcopy: <= 1G, up to 14 iov
put_opt_zcopy_align: <= 512
put_align_mtu: <= 4K
get_bcopy: <= 8256
get_zcopy: 65..1G, up to 14 iov
get_opt_zcopy_align: <= 512
get_align_mtu: <= 4K
am_short: <= 2046
am_bcopy: <= 8254
am_zcopy: <= 8254, up to 3 iov
am_opt_zcopy_align: <= 512
am_align_mtu: <= 4K
am header: <= 186
domain: device
atomic_add: 32, 64 bit
atomic_and: 32, 64 bit
atomic_or: 32, 64 bit
atomic_xor: 32, 64 bit
atomic_fadd: 32, 64 bit
atomic_fand: 32, 64 bit
atomic_for: 32, 64 bit
atomic_fxor: 32, 64 bit
atomic_swap: 32, 64 bit
atomic_cswap: 32, 64 bit
connection: to ep
device priority: 50
device num paths: 1
max eps: 256
device address: 3 bytes
ep address: 10 bytes
error handling: buffer (zcopy), remote access, peer failure, ep_check
# #
Transport: ud_verbs
Device: mlx5_0:1
Type: network
System device: mlx5_0 (2)
#
capabilities:
bandwidth: 23588.47/ppn + 0.00 MB/sec
latency: 630 nsec
overhead: 105 nsec
am_short: <= 116
am_bcopy: <= 4088
am_zcopy: <= 4088, up to 5 iov
am_opt_zcopy_align: <= 512
am_align_mtu: <= 4K
am header: <= 3992
connection: to ep, to iface
device priority: 50
device num paths: 1
max eps: inf
device address: 3 bytes
iface address: 3 bytes
ep address: 6 bytes
error handling: peer failure, ep_check
# #
Transport: ud_mlx5
Device: mlx5_0:1
Type: network
System device: mlx5_0 (2)
#
capabilities:
bandwidth: 23588.47/ppn + 0.00 MB/sec
latency: 630 nsec
overhead: 80 nsec
am_short: <= 180
am_bcopy: <= 4088
am_zcopy: <= 4088, up to 3 iov
am_opt_zcopy_align: <= 512
am_align_mtu: <= 4K
am header: <= 132
connection: to ep, to iface
device priority: 50
device num paths: 1
max eps: inf
device address: 3 bytes
iface address: 3 bytes
ep address: 6 bytes
error handling: peer failure, ep_check
# #
Memory domain: mlx5_1
Component: ib
register: unlimited, dmabuf, cost: 16000 + 0.060 * N nsec
remote key: 8 bytes
local memory handle is required for zcopy
memory invalidation is supported
memory types: host (access,reg,cache)
#
Transport: dc_mlx5
Device: mlx5_1:1
Type: network
System device: mlx5_1 (3)
#
capabilities:
bandwidth: 23588.47/ppn + 0.00 MB/sec
latency: 660 nsec
overhead: 40 nsec
put_short: <= 2K
put_bcopy: <= 8256
put_zcopy: <= 1G, up to 11 iov
put_opt_zcopy_align: <= 512
put_align_mtu: <= 4K
get_bcopy: <= 8256
get_zcopy: 65..1G, up to 11 iov
get_opt_zcopy_align: <= 512
get_align_mtu: <= 4K
am_short: <= 2046
am_bcopy: <= 8254
am_zcopy: <= 8254, up to 3 iov
am_opt_zcopy_align: <= 512
am_align_mtu: <= 4K
am header: <= 138
domain: device
atomic_add: 32, 64 bit
atomic_and: 32, 64 bit
atomic_or: 32, 64 bit
atomic_xor: 32, 64 bit
atomic_fadd: 32, 64 bit
atomic_fand: 32, 64 bit
atomic_for: 32, 64 bit
atomic_fxor: 32, 64 bit
atomic_swap: 32, 64 bit
atomic_cswap: 32, 64 bit
connection: to iface
device priority: 50
device num paths: 1
max eps: inf
device address: 3 bytes
iface address: 7 bytes
error handling: buffer (zcopy), remote access, peer failure, ep_check
# #
Transport: rc_verbs
Device: mlx5_1:1
Type: network
System device: mlx5_1 (3)
#
capabilities:
bandwidth: 23588.47/ppn + 0.00 MB/sec
latency: 600 + 1.000 * N nsec
overhead: 75 nsec
put_short: <= 124
put_bcopy: <= 8256
put_zcopy: <= 1G, up to 5 iov
put_opt_zcopy_align: <= 512
put_align_mtu: <= 4K
get_bcopy: <= 8256
get_zcopy: 65..1G, up to 5 iov
get_opt_zcopy_align: <= 512
get_align_mtu: <= 4K
am_short: <= 123
am_bcopy: <= 8255
am_zcopy: <= 8255, up to 4 iov
am_opt_zcopy_align: <= 512
am_align_mtu: <= 4K
am header: <= 127
domain: device
atomic_add: 64 bit
atomic_fadd: 64 bit
atomic_cswap: 64 bit
connection: to ep
device priority: 50
device num paths: 1
max eps: 256
device address: 3 bytes
ep address: 7 bytes
error handling: peer failure, ep_check
# #
Transport: rc_mlx5
Device: mlx5_1:1
Type: network
System device: mlx5_1 (3)
#
capabilities:
bandwidth: 23588.47/ppn + 0.00 MB/sec
latency: 600 + 1.000 * N nsec
overhead: 40 nsec
put_short: <= 2K
put_bcopy: <= 8256
put_zcopy: <= 1G, up to 14 iov
put_opt_zcopy_align: <= 512
put_align_mtu: <= 4K
get_bcopy: <= 8256
get_zcopy: 65..1G, up to 14 iov
get_opt_zcopy_align: <= 512
get_align_mtu: <= 4K
am_short: <= 2046
am_bcopy: <= 8254
am_zcopy: <= 8254, up to 3 iov
am_opt_zcopy_align: <= 512
am_align_mtu: <= 4K
am header: <= 186
domain: device
atomic_add: 32, 64 bit
atomic_and: 32, 64 bit
atomic_or: 32, 64 bit
atomic_xor: 32, 64 bit
atomic_fadd: 32, 64 bit
atomic_fand: 32, 64 bit
atomic_for: 32, 64 bit
atomic_fxor: 32, 64 bit
atomic_swap: 32, 64 bit
atomic_cswap: 32, 64 bit
connection: to ep
device priority: 50
device num paths: 1
max eps: 256
device address: 3 bytes
ep address: 10 bytes
error handling: buffer (zcopy), remote access, peer failure, ep_check
# #
Transport: ud_verbs
Device: mlx5_1:1
Type: network
System device: mlx5_1 (3)
#
capabilities:
bandwidth: 23588.47/ppn + 0.00 MB/sec
latency: 630 nsec
overhead: 105 nsec
am_short: <= 116
am_bcopy: <= 4088
am_zcopy: <= 4088, up to 5 iov
am_opt_zcopy_align: <= 512
am_align_mtu: <= 4K
am header: <= 3992
connection: to ep, to iface
device priority: 50
device num paths: 1
max eps: inf
device address: 3 bytes
iface address: 3 bytes
ep address: 6 bytes
error handling: peer failure, ep_check
# #
Transport: ud_mlx5
Device: mlx5_1:1
Type: network
System device: mlx5_1 (3)
#
capabilities:
bandwidth: 23588.47/ppn + 0.00 MB/sec
latency: 630 nsec
overhead: 80 nsec
am_short: <= 180
am_bcopy: <= 4088
am_zcopy: <= 4088, up to 3 iov
am_opt_zcopy_align: <= 512
am_align_mtu: <= 4K
am header: <= 132
connection: to ep, to iface
device priority: 50
device num paths: 1
max eps: inf
device address: 3 bytes
iface address: 3 bytes
ep address: 6 bytes
error handling: peer failure, ep_check
# #
Memory domain: mlx5_0
Component: gga
register: unlimited, dmabuf, cost: 16000 + 0.060 * N nsec
remote key: 8 bytes
local memory handle is required for zcopy
memory invalidation is supported
memory types: host (access,reg,cache)
< no supported devices found >
#
Memory domain: mlx5_1
Component: gga
register: unlimited, dmabuf, cost: 16000 + 0.060 * N nsec
remote key: 8 bytes
local memory handle is required for zcopy
memory invalidation is supported
memory types: host (access,reg,cache)
< no supported devices found >
#
Connection manager: rdmacm
max_conn_priv: 54 bytes
#
Memory domain: cma
Component: cma
register: unlimited, cost: 9 nsec
memory types: host (access,reg_nonblock,reg,cache)
#
Transport: cma
Device: memory
Type: intra-node
System device:
#
capabilities:
bandwidth: 0.00/ppn + 11145.00 MB/sec
latency: 80 nsec
overhead: 2000 nsec
put_zcopy: unlimited, up to 16 iov
put_opt_zcopy_align: <= 1
put_align_mtu: <= 1
get_zcopy: unlimited, up to 16 iov
get_opt_zcopy_align: <= 1
get_align_mtu: <= 1
connection: to iface
device priority: 0
device num paths: 1
max eps: inf
device address: 8 bytes
iface address: 4 bytes
error handling: peer failure, ep_check
# #
Memory domain: knem
Component: knem
register: unlimited, cost: 1200 + 0.007 * N nsec
remote key: 16 bytes
memory types: host (access,reg,cache)
#
Transport: knem
Device: memory
Type: intra-node
System device:
#
capabilities:
bandwidth: 0.00/ppn + 13862.00 MB/sec
latency: 80 nsec
overhead: 2000 nsec
put_zcopy: unlimited, up to 16 iov
put_opt_zcopy_align: <= 1
put_align_mtu: <= 1
get_zcopy: unlimited, up to 16 iov
get_opt_zcopy_align: <= 1
get_align_mtu: <= 1
connection: to iface
device priority: 0
device num paths: 1
max eps: inf
device address: 8 bytes
iface address: 0 bytes
error handling: none
# #
Memory domain: xpmem
Component: xpmem
register: unlimited, cost: 60 nsec
remote key: 24 bytes
rkey_ptr is supported
memory types: host (access,alloc,reg_nonblock,reg,cache)
#
Transport: xpmem
Device: memory
Type: intra-node
System device:
#
capabilities:
bandwidth: 0.00/ppn + 15360.00 MB/sec
latency: 80 nsec
overhead: 10 nsec
put_short: <= 4294967295
put_bcopy: unlimited
get_bcopy: unlimited
am_short: <= 100
am_bcopy: <= 8256
domain: cpu
atomic_add: 32, 64 bit
atomic_and: 32, 64 bit
atomic_or: 32, 64 bit
atomic_xor: 32, 64 bit
atomic_fadd: 32, 64 bit
atomic_fand: 32, 64 bit
atomic_for: 32, 64 bit
atomic_fxor: 32, 64 bit
atomic_swap: 32, 64 bit
atomic_cswap: 32, 64 bit
connection: to iface
device priority: 0
device num paths: 1
max eps: inf
device address: 8 bytes
iface address: 16 bytes
error handling: none
#
$ ibv_devinfo -vv hca_id: mlx5_0 transport: InfiniBand (0) fw_ver: 20.37.1700 node_guid: 88e9:a4ff:ff25:a45a sys_image_guid: 88e9:a4ff:ff25:a45a vendor_id: 0x02c9 vendor_part_id: 4123 hw_ver: 0x0 board_id: MT_0000000594 phys_port_cnt: 1 max_mr_size: 0xffffffffffffffff page_size_cap: 0xfffffffffffff000 max_qp: 131072 max_qp_wr: 32768 device_cap_flags: 0x21361c36 BAD_PKEY_CNTR BAD_QKEY_CNTR AUTO_PATH_MIG CHANGE_PHY_PORT PORT_ACTIVE_EVENT SYS_IMAGE_GUID RC_RNR_NAK_GEN MEM_WINDOW UD_IP_CSUM XRC MEM_MGT_EXTENSIONS MEM_WINDOW_TYPE_2B MANAGED_FLOW_STEERING max_sge: 30 max_sge_rd: 30 max_cq: 16777216 max_cqe: 4194303 max_mr: 16777216 max_pd: 8388608 max_qp_rd_atom: 16 max_ee_rd_atom: 0 max_res_rd_atom: 2097152 max_qp_init_rd_atom: 16 max_ee_init_rd_atom: 0 atomic_cap: ATOMIC_HCA (1) max_ee: 0 max_rdd: 0 max_mw: 16777216 max_raw_ipv6_qp: 0 max_raw_ethy_qp: 0 max_mcast_grp: 2097152 max_mcast_qp_attach: 240 max_total_mcast_qp_attach: 503316480 max_ah: 2147483647 max_fmr: 0 max_srq: 8388608 max_srq_wr: 32767 max_srq_sge: 31 max_pkeys: 128 local_ca_ack_delay: 16 general_odp_caps: ODP_SUPPORT ODP_SUPPORT_IMPLICIT rc_odp_caps: SUPPORT_SEND SUPPORT_RECV SUPPORT_WRITE SUPPORT_READ SUPPORT_ATOMIC SUPPORT_SRQ uc_odp_caps: NO SUPPORT ud_odp_caps: SUPPORT_SEND xrc_odp_caps: SUPPORT_SEND SUPPORT_WRITE SUPPORT_READ SUPPORT_ATOMIC SUPPORT_SRQ completion timestamp_mask: 0x7fffffffffffffff hca_core_clock: 156250kHZ device_cap_flags_ex: 0x3000005021361C36 PCI_WRITE_END_PADDING Unknown flags: 0x3000004000000000 tso_caps: max_tso: 0 rss_caps: max_rwq_indirection_tables: 0 max_rwq_indirection_table_size: 0 rx_hash_function: 0x0 rx_hash_fields_mask: 0x0 max_wq_type_rq: 0 packet_pacing_caps: qp_rate_limit_min: 0kbps qp_rate_limit_max: 0kbps max_rndv_hdr_size: 64 max_num_tags: 127 max_ops: 32768 max_sge: 1 flags: IBV_TM_CAP_RC
cq moderation caps: max_cq_count: 65535 max_cq_period: 4095 us
maximum available device memory: 131072Bytes
num_comp_vectors: 63 port: 1 state: PORT_ACTIVE (4) max_mtu: 4096 (5) active_mtu: 4096 (5) sm_lid: 1 port_lid: 3 port_lmc: 0x00 link_layer: InfiniBand max_msg_sz: 0x40000000 port_cap_flags: 0xa259e848 port_cap_flags2: 0x0032 max_vl_num: 4 (3) bad_pkey_cntr: 0x0 qkey_viol_cntr: 0x0 sm_sl: 0 pkey_tbl_len: 128 gid_tbl_len: 8 subnet_timeout: 18 init_type_reply: 0 active_width: 4X (2) active_speed: 50.0 Gbps (64) phys_state: LINK_UP (5) GID[ 0]: fe80:0000:0000:0000:88e9:a4ff:ff25:a45a
hca_id: mlx5_1 transport: InfiniBand (0) fw_ver: 20.37.1700 node_guid: 88e9:a4ff:ff25:a45b sys_image_guid: 88e9:a4ff:ff25:a45a vendor_id: 0x02c9 vendor_part_id: 4123 hw_ver: 0x0 board_id: MT_0000000594 phys_port_cnt: 1 max_mr_size: 0xffffffffffffffff page_size_cap: 0xfffffffffffff000 max_qp: 131072 max_qp_wr: 32768 device_cap_flags: 0x21361c36 BAD_PKEY_CNTR BAD_QKEY_CNTR AUTO_PATH_MIG CHANGE_PHY_PORT PORT_ACTIVE_EVENT SYS_IMAGE_GUID RC_RNR_NAK_GEN MEM_WINDOW UD_IP_CSUM XRC MEM_MGT_EXTENSIONS MEM_WINDOW_TYPE_2B MANAGED_FLOW_STEERING max_sge: 30 max_sge_rd: 30 max_cq: 16777216 max_cqe: 4194303 max_mr: 16777216 max_pd: 8388608 max_qp_rd_atom: 16 max_ee_rd_atom: 0 max_res_rd_atom: 2097152 max_qp_init_rd_atom: 16 max_ee_init_rd_atom: 0 atomic_cap: ATOMIC_HCA (1) max_ee: 0 max_rdd: 0 max_mw: 16777216 max_raw_ipv6_qp: 0 max_raw_ethy_qp: 0 max_mcast_grp: 2097152 max_mcast_qp_attach: 240 max_total_mcast_qp_attach: 503316480 max_ah: 2147483647 max_fmr: 0 max_srq: 8388608 max_srq_wr: 32767 max_srq_sge: 31 max_pkeys: 128 local_ca_ack_delay: 16 general_odp_caps: ODP_SUPPORT ODP_SUPPORT_IMPLICIT rc_odp_caps: SUPPORT_SEND SUPPORT_RECV SUPPORT_WRITE SUPPORT_READ SUPPORT_ATOMIC SUPPORT_SRQ uc_odp_caps: NO SUPPORT ud_odp_caps: SUPPORT_SEND xrc_odp_caps: SUPPORT_SEND SUPPORT_WRITE SUPPORT_READ SUPPORT_ATOMIC SUPPORT_SRQ completion timestamp_mask: 0x7fffffffffffffff hca_core_clock: 156250kHZ device_cap_flags_ex: 0x3000005021361C36 PCI_WRITE_END_PADDING Unknown flags: 0x3000004000000000 tso_caps: max_tso: 0 rss_caps: max_rwq_indirection_tables: 0 max_rwq_indirection_table_size: 0 rx_hash_function: 0x0 rx_hash_fields_mask: 0x0 max_wq_type_rq: 0 packet_pacing_caps: qp_rate_limit_min: 0kbps qp_rate_limit_max: 0kbps max_rndv_hdr_size: 64 max_num_tags: 127 max_ops: 32768 max_sge: 1 flags: IBV_TM_CAP_RC