openucx / ucx

Unified Communication X (mailing list - https://elist.ornl.gov/mailman/listinfo/ucx-group)
http://www.openucx.org
Other
1.17k stars 428 forks source link

mlx5dv_devx_obj_modify() error and hang #5657

Closed alex--m closed 4 years ago

alex--m commented 4 years ago

Thanks to @MichaelLaufer for bringing this to my attention.

Describe the bug

An error is reported during MPI_Init() and the application hangs in MPI_Finalize() when running a simple MPI app with the rc_x/dc_x transports:

[mlaufer@storm test_MPI]$ mpirun -np 2 --hostfile hostfile_09 -x UCX_TLS=self,sysv,rc_x -x UCX_LOG_LEVEL=trace -x UCX_IB_GID_INDEX=0 --mca coll ^ucx --mca fcoll ^vulcan ./pical
[1599217002.536877] [thunder09:322013:0]           sock.c:114  UCX  DIAG  failed to read from /sys/class/net/ens3f0np0/bonding/ad_num_ports: No such file or directory, assuming 802.3ad bonding is disabled
[1599217002.536879] [thunder09:322014:0]           sock.c:114  UCX  DIAG  failed to read from /sys/class/net/ens3f0np0/bonding/ad_num_ports: No such file or directory, assuming 802.3ad bonding is disabled
[1599217002.538940] [thunder09:322014:0]           sock.c:114  UCX  DIAG  failed to read from /sys/class/net/ens3f0np0/bonding/ad_num_ports: No such file or directory, assuming 802.3ad bonding is disabled
[1599217002.539004] [thunder09:322013:0]           sock.c:114  UCX  DIAG  failed to read from /sys/class/net/ens3f0np0/bonding/ad_num_ports: No such file or directory, assuming 802.3ad bonding is disabled
[1599217002.546716] [thunder09:322014:0]           sock.c:114  UCX  DIAG  failed to read from /sys/class/net/ens2f0np0/bonding/ad_num_ports: No such file or directory, assuming 802.3ad bonding is disabled
[1599217002.546724] [thunder09:322013:0]           sock.c:114  UCX  DIAG  failed to read from /sys/class/net/ens2f0np0/bonding/ad_num_ports: No such file or directory, assuming 802.3ad bonding is disabled
[1599217002.548892] [thunder09:322014:0]           sock.c:114  UCX  DIAG  failed to read from /sys/class/net/ens2f0np0/bonding/ad_num_ports: No such file or directory, assuming 802.3ad bonding is disabled
[1599217002.548961] [thunder09:322013:0]           sock.c:114  UCX  DIAG  failed to read from /sys/class/net/ens2f0np0/bonding/ad_num_ports: No such file or directory, assuming 802.3ad bonding is disabled
[1599217002.556681] [thunder09:322014:0]     ucp_worker.c:1648 UCX  INFO  ep_cfg[0]: tag(self/memory rc_mlx5/mlx5_0:1 rc_mlx5/mlx5_2:1);
[1599217002.556684] [thunder09:322013:0]     ucp_worker.c:1648 UCX  INFO  ep_cfg[0]: tag(self/memory rc_mlx5/mlx5_0:1 rc_mlx5/mlx5_2:1);
[1599217002.573537] [thunder09:322013:0]     ucp_worker.c:1648 UCX  INFO  ep_cfg[1]: tag(sysv_/memory rc_mlx5/mlx5_0:1 rc_mlx5/mlx5_2:1);
[1599217002.576122] [thunder09:322013:0]     ib_mlx5_dv.c:224  UCX  ERROR mlx5dv_devx_obj_modify(503) failed, syndrome 0: Invalid argument
[1599217002.577668] [thunder09:322014:0]     ucp_worker.c:1648 UCX  INFO  ep_cfg[1]: tag(sysv_/memory rc_mlx5/mlx5_0:1 rc_mlx5/mlx5_2:1);
Process 0 of 2 on thunder09
[1599217002.581119] [thunder09:322014:0]     ib_mlx5_dv.c:224  UCX  ERROR mlx5dv_devx_obj_modify(503) failed, syndrome 0: Invalid argument
[1599217002.581207] [thunder09:322014:0]     ib_mlx5_dv.c:224  UCX  ERROR mlx5dv_devx_obj_modify(503) failed, syndrome 0: Invalid argument
Process 1 of 2 on thunder09
pi is approximately 3.1415926544231318, Error is 0.0000000008333387
wall clock time = 0.002396
<HANG>

The app does nothing special:

/*  example from MPICH  */
#include "mpi.h"
#include <stdio.h>
#include <math.h>

double f(double);

/*
 * Integrand of the pi estimate: 4 / (1 + a^2).
 */
double f(double a)
{
    const double denominator = 1.0 + a * a;

    return 4.0 / denominator;
}

/*
 * Estimate pi by midpoint-rule summation of f over [0, 1], with the
 * intervals dealt out round-robin across the MPI ranks.
 *
 * Rank 0 chooses the interval count and broadcasts it: 10000 on the first
 * pass and 0 on the second; a broadcast value of 0 ends the loop on every
 * rank. Partial sums are combined on rank 0 with MPI_Reduce, and rank 0
 * prints the estimate, its deviation from PI25DT and the elapsed wall-clock
 * time.
 *
 * Always returns 0.
 */
int main(int argc,char *argv[])
{
    const double PI25DT = 3.141592653589793238462643;
    char processor_name[MPI_MAX_PROCESSOR_NAME];
    double startwtime = 0.0;
    double endwtime;
    int numprocs, myid, namelen;
    int n = 0;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);
    MPI_Get_processor_name(processor_name, &namelen);

    fprintf(stdout,"Process %d of %d on %s\n",
            myid, numprocs, processor_name);

    for (;;) {
        double h, sum, mypi, pi;
        int i;

        if (myid == 0) {
            /*
             * The interval count is hard-coded instead of being read from
             * the user: it alternates between 10000 and 0.
             */
            n = (n == 0) ? 10000 : 0;

            startwtime = MPI_Wtime();
        }

        /* Every rank receives rank 0's interval count. */
        MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);
        if (n == 0) {
            break;
        }

        h   = 1.0 / (double) n;
        sum = 0.0;
        /* A slightly better approach starts from large i and works back */
        for (i = myid + 1; i <= n; i += numprocs) {
            sum += f(h * ((double) i - 0.5));
        }
        mypi = h * sum;

        /* Sum the per-rank contributions into pi on rank 0. */
        MPI_Reduce(&mypi, &pi, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

        if (myid == 0) {
            printf("pi is approximately %.16f, Error is %.16f\n",
                   pi, fabs(pi - PI25DT));
            endwtime = MPI_Wtime();
            printf("wall clock time = %f\n", endwtime - startwtime);
            fflush( stdout );
        }
    }

    MPI_Finalize();
    return 0;
}

Setup and versions

[root@thunder09 ~]# /mnt/central/testing/x86_64/ucx/bin/ucx_info -d
#
# Memory domain: posix_
#     Component: posix_
#             allocate: unlimited
#           remote key: 24 bytes
#           rkey_ptr is supported
#
#   Transport: posix_
#      Device: memory
#
#      capabilities:
#            bandwidth: 0.00/ppn + 12179.00 MB/sec
#              latency: 80 nsec
#             overhead (short): 10 nsec
#             overhead (bcopy): 11 nsec
#            put_short: <= 4294967295
#            put_bcopy: unlimited
#            get_bcopy: unlimited
#             am_short: <= 100
#             am_bcopy: <= 8256
#               domain: cpu
#           atomic_add: 32, 64 bit
#           atomic_and: 32, 64 bit
#            atomic_or: 32, 64 bit
#           atomic_xor: 32, 64 bit
#          atomic_fadd: 32, 64 bit
#          atomic_fand: 32, 64 bit
#           atomic_for: 32, 64 bit
#          atomic_fxor: 32, 64 bit
#          atomic_swap: 32, 64 bit
#         atomic_cswap: 32, 64 bit
#           connection: to iface
#      device priority: 0
#     device num paths: 1
#              max eps: inf
#       device address: 8 bytes
#        iface address: 8 bytes
#       error handling: none
#
#
# Memory domain: posix_bcast_
#     Component: posix_bcast_
#             allocate: unlimited
#           remote key: 24 bytes
#           rkey_ptr is supported
#
#   Transport: posix_bcast_
#      Device: memory
#
#      capabilities:
#            bandwidth: 0.00/ppn + 0.00 MB/sec
#              latency: 0 nsec
#             overhead (short): 0 nsec
#             overhead (bcopy): 0 nsec
#           connection: none
#      device priority: 0
#     device num paths: 0
#              max eps: 0
#       device address: 0 bytes
#       error handling: none
#
#
# Memory domain: posix_incast_
#     Component: posix_incast_
#             allocate: unlimited
#           remote key: 24 bytes
#           rkey_ptr is supported
#
#   Transport: posix_incast_
#      Device: memory
#
#      capabilities:
#            bandwidth: 0.00/ppn + 0.00 MB/sec
#              latency: 0 nsec
#             overhead (short): 0 nsec
#             overhead (bcopy): 0 nsec
#           connection: none
#      device priority: 0
#     device num paths: 0
#              max eps: 0
#       device address: 0 bytes
#       error handling: none
#
#
# Memory domain: sysv_
#     Component: sysv_
#             allocate: unlimited
#           remote key: 12 bytes
#           rkey_ptr is supported
#
#   Transport: sysv_
#      Device: memory
#
#      capabilities:
#            bandwidth: 0.00/ppn + 12179.00 MB/sec
#              latency: 80 nsec
#             overhead (short): 10 nsec
#             overhead (bcopy): 11 nsec
#            put_short: <= 4294967295
#            put_bcopy: unlimited
#            get_bcopy: unlimited
#             am_short: <= 100
#             am_bcopy: <= 8256
#               domain: cpu
#           atomic_add: 32, 64 bit
#           atomic_and: 32, 64 bit
#            atomic_or: 32, 64 bit
#           atomic_xor: 32, 64 bit
#          atomic_fadd: 32, 64 bit
#          atomic_fand: 32, 64 bit
#           atomic_for: 32, 64 bit
#          atomic_fxor: 32, 64 bit
#          atomic_swap: 32, 64 bit
#         atomic_cswap: 32, 64 bit
#           connection: to iface
#      device priority: 0
#     device num paths: 1
#              max eps: inf
#       device address: 8 bytes
#        iface address: 8 bytes
#       error handling: none
#
#
# Memory domain: sysv_bcast_
#     Component: sysv_bcast_
#             allocate: unlimited
#           remote key: 12 bytes
#           rkey_ptr is supported
#
#   Transport: sysv_bcast_
#      Device: memory
#
#      capabilities:
#            bandwidth: 0.00/ppn + 0.00 MB/sec
#              latency: 0 nsec
#             overhead (short): 0 nsec
#             overhead (bcopy): 0 nsec
#           connection: none
#      device priority: 0
#     device num paths: 0
#              max eps: 0
#       device address: 0 bytes
#       error handling: none
#
#
# Memory domain: sysv_incast_
#     Component: sysv_incast_
#             allocate: unlimited
#           remote key: 12 bytes
#           rkey_ptr is supported
#
#   Transport: sysv_incast_
#      Device: memory
#
#      capabilities:
#            bandwidth: 0.00/ppn + 0.00 MB/sec
#              latency: 0 nsec
#             overhead (short): 0 nsec
#             overhead (bcopy): 0 nsec
#           connection: none
#      device priority: 0
#     device num paths: 0
#              max eps: 0
#       device address: 0 bytes
#       error handling: none
#
#
# Memory domain: self
#     Component: self
#             register: unlimited, cost: 0 nsec
#           remote key: 0 bytes
#
#   Transport: self
#      Device: memory
#
#      capabilities:
#            bandwidth: 0.00/ppn + 6911.00 MB/sec
#              latency: 0 nsec
#             overhead (short): 10 nsec
#             overhead (bcopy): 11 nsec
#            put_short: <= 4294967295
#            put_bcopy: unlimited
#            get_bcopy: unlimited
#             am_short: <= 8K
#             am_bcopy: <= 8K
#               domain: cpu
#           atomic_add: 32, 64 bit
#           atomic_and: 32, 64 bit
#            atomic_or: 32, 64 bit
#           atomic_xor: 32, 64 bit
#          atomic_fadd: 32, 64 bit
#          atomic_fand: 32, 64 bit
#           atomic_for: 32, 64 bit
#          atomic_fxor: 32, 64 bit
#          atomic_swap: 32, 64 bit
#         atomic_cswap: 32, 64 bit
#           connection: to iface
#      device priority: 0
#     device num paths: 1
#              max eps: inf
#       device address: 0 bytes
#        iface address: 8 bytes
#       error handling: none
#
#
# Memory domain: tcp
#     Component: tcp
#             register: unlimited, cost: 0 nsec
#           remote key: 0 bytes
#
#   Transport: tcp
#      Device: enp24s0f0
#
#      capabilities:
#            bandwidth: 113.16/ppn + 0.00 MB/sec
#              latency: 5776 nsec
#             overhead (short): 50000 nsec
#             overhead (bcopy): 51000 nsec
#            put_zcopy: <= 18446744073709551590, up to 6 iov
#  put_opt_zcopy_align: <= 1
#        put_align_mtu: <= 0
#             am_short: <= 8K
#             am_bcopy: <= 8K
#             am_zcopy: <= 64K, up to 6 iov
#   am_opt_zcopy_align: <= 1
#         am_align_mtu: <= 0
#            am header: <= 8037
#           connection: to iface
#      device priority: 1
#     device num paths: 1
#              max eps: 256
#       device address: 16 bytes
#        iface address: 2 bytes
#       error handling: peer failure
#
#   Transport: tcp
#      Device: ens3f0np0
#
#      capabilities:
#            bandwidth: 11316.36/ppn + 0.00 MB/sec
#              latency: 5206 nsec
#             overhead (short): 50000 nsec
#             overhead (bcopy): 51000 nsec
#            put_zcopy: <= 18446744073709551590, up to 6 iov
#  put_opt_zcopy_align: <= 1
#        put_align_mtu: <= 0
#             am_short: <= 8K
#             am_bcopy: <= 8K
#             am_zcopy: <= 64K, up to 6 iov
#   am_opt_zcopy_align: <= 1
#         am_align_mtu: <= 0
#            am header: <= 8037
#           connection: to iface
#      device priority: 1
#     device num paths: 1
#              max eps: 256
#       device address: 16 bytes
#        iface address: 2 bytes
#       error handling: peer failure
#
#   Transport: tcp
#      Device: ens2f0np0
#
#      capabilities:
#            bandwidth: 11316.36/ppn + 0.00 MB/sec
#              latency: 5206 nsec
#             overhead (short): 50000 nsec
#             overhead (bcopy): 51000 nsec
#            put_zcopy: <= 18446744073709551590, up to 6 iov
#  put_opt_zcopy_align: <= 1
#        put_align_mtu: <= 0
#             am_short: <= 8K
#             am_bcopy: <= 8K
#             am_zcopy: <= 64K, up to 6 iov
#   am_opt_zcopy_align: <= 1
#         am_align_mtu: <= 0
#            am header: <= 8037
#           connection: to iface
#      device priority: 1
#     device num paths: 1
#              max eps: 256
#       device address: 16 bytes
#        iface address: 2 bytes
#       error handling: peer failure
#
#
# Connection manager: tcp
#      max_conn_priv: 2032 bytes
#
# Memory domain: sockcm
#     Component: sockcm
#           supports client-server connection establishment via sockaddr
#   < no supported devices found >
#
# Memory domain: mlx5_0
#     Component: ib
#             register: unlimited, cost: 180 nsec
#           remote key: 8 bytes
#           local memory handle is required for zcopy
#
#   Transport: rc_verbs
#      Device: mlx5_0:1
#
#      capabilities:
#            bandwidth: 10957.84/ppn + 0.00 MB/sec
#              latency: 800 + 1.000 * N nsec
#             overhead (short): 75 nsec
#             overhead (bcopy): 76 nsec
#            put_short: <= 124
#            put_bcopy: <= 8256
#            put_zcopy: <= 1G, up to 3 iov
#  put_opt_zcopy_align: <= 512
#        put_align_mtu: <= 1K
#            get_bcopy: <= 8256
#            get_zcopy: 65..1G, up to 3 iov
#  get_opt_zcopy_align: <= 512
#        get_align_mtu: <= 1K
#             am_short: <= 123
#             am_bcopy: <= 8255
#             am_zcopy: <= 8255, up to 2 iov
#   am_opt_zcopy_align: <= 512
#         am_align_mtu: <= 1K
#            am header: <= 127
#               domain: device
#           atomic_add: 64 bit
#          atomic_fadd: 64 bit
#         atomic_cswap: 64 bit
#           connection: to ep
#      device priority: 50
#     device num paths: 1
#              max eps: 256
#       device address: 17 bytes
#           ep address: 17 bytes
#       error handling: peer failure
#
#
#   Transport: rc_mlx5
#      Device: mlx5_0:1
#
#      capabilities:
#            bandwidth: 10957.84/ppn + 0.00 MB/sec
#              latency: 800 + 1.000 * N nsec
#             overhead (short): 40 nsec
#             overhead (bcopy): 41 nsec
#            put_short: <= 2K
#            put_bcopy: <= 8256
#            put_zcopy: <= 1G, up to 14 iov
#  put_opt_zcopy_align: <= 512
#        put_align_mtu: <= 1K
#            get_bcopy: <= 8256
#            get_zcopy: 65..1G, up to 14 iov
#  get_opt_zcopy_align: <= 512
#        get_align_mtu: <= 1K
#             am_short: <= 2046
#             am_bcopy: <= 8254
#             am_zcopy: <= 8254, up to 3 iov
#   am_opt_zcopy_align: <= 512
#         am_align_mtu: <= 1K
#            am header: <= 186
#               domain: device
#           atomic_add: 32, 64 bit
#           atomic_and: 32, 64 bit
#            atomic_or: 32, 64 bit
#           atomic_xor: 32, 64 bit
#          atomic_fadd: 32, 64 bit
#          atomic_fand: 32, 64 bit
#           atomic_for: 32, 64 bit
#          atomic_fxor: 32, 64 bit
#          atomic_swap: 32, 64 bit
#         atomic_cswap: 32, 64 bit
#           connection: to ep
#      device priority: 50
#     device num paths: 1
#              max eps: 256
#       device address: 17 bytes
#           ep address: 7 bytes
#       error handling: buffer (zcopy), remote access, peer failure
#
#
#   Transport: dc_mlx5
#      Device: mlx5_0:1
#
#      capabilities:
#            bandwidth: 10957.84/ppn + 0.00 MB/sec
#              latency: 860 nsec
#             overhead (short): 40 nsec
#             overhead (bcopy): 41 nsec
#            put_short: <= 2K
#            put_bcopy: <= 8256
#            put_zcopy: <= 1G, up to 11 iov
#  put_opt_zcopy_align: <= 512
#        put_align_mtu: <= 1K
#            get_bcopy: <= 8256
#            get_zcopy: 65..1G, up to 11 iov
#  get_opt_zcopy_align: <= 512
#        get_align_mtu: <= 1K
#             am_short: <= 2046
#             am_bcopy: <= 8254
#             am_zcopy: <= 8254, up to 3 iov
#   am_opt_zcopy_align: <= 512
#         am_align_mtu: <= 1K
#            am header: <= 138
#               domain: device
#           atomic_add: 32, 64 bit
#           atomic_and: 32, 64 bit
#            atomic_or: 32, 64 bit
#           atomic_xor: 32, 64 bit
#          atomic_fadd: 32, 64 bit
#          atomic_fand: 32, 64 bit
#           atomic_for: 32, 64 bit
#          atomic_fxor: 32, 64 bit
#          atomic_swap: 32, 64 bit
#         atomic_cswap: 32, 64 bit
#           connection: to iface
#      device priority: 50
#     device num paths: 1
#              max eps: inf
#       device address: 17 bytes
#        iface address: 5 bytes
#       error handling: buffer (zcopy), remote access, peer failure
#
#
#   Transport: ud_verbs
#      Device: mlx5_0:1
#
#      capabilities:
#            bandwidth: 10957.84/ppn + 0.00 MB/sec
#              latency: 830 nsec
#             overhead (short): 105 nsec
#             overhead (bcopy): 106 nsec
#             am_short: <= 116
#             am_bcopy: <= 1016
#             am_zcopy: <= 1016, up to 1 iov
#   am_opt_zcopy_align: <= 512
#         am_align_mtu: <= 1K
#            am header: <= 880
#           connection: to ep, to iface
#      device priority: 50
#     device num paths: 1
#              max eps: inf
#       device address: 17 bytes
#        iface address: 3 bytes
#           ep address: 6 bytes
#       error handling: peer failure
#
#
#   Transport: ud_mlx5
#      Device: mlx5_0:1
#
#      capabilities:
#            bandwidth: 10957.84/ppn + 0.00 MB/sec
#              latency: 830 nsec
#             overhead (short): 80 nsec
#             overhead (bcopy): 81 nsec
#             am_short: <= 180
#             am_bcopy: <= 1016
#             am_zcopy: <= 1016, up to 3 iov
#   am_opt_zcopy_align: <= 512
#         am_align_mtu: <= 1K
#            am header: <= 132
#           connection: to ep, to iface
#      device priority: 50
#     device num paths: 1
#              max eps: inf
#       device address: 17 bytes
#        iface address: 3 bytes
#           ep address: 6 bytes
#       error handling: peer failure
#
#
# Memory domain: mlx5_1
#     Component: ib
#             register: unlimited, cost: 180 nsec
#           remote key: 8 bytes
#           local memory handle is required for zcopy
#   < no supported devices found >
#
# Memory domain: mlx5_2
#     Component: ib
#             register: unlimited, cost: 180 nsec
#           remote key: 8 bytes
#           local memory handle is required for zcopy
#
#   Transport: rc_verbs
#      Device: mlx5_2:1
#
#      capabilities:
#            bandwidth: 10957.84/ppn + 0.00 MB/sec
#              latency: 800 + 1.000 * N nsec
#             overhead (short): 75 nsec
#             overhead (bcopy): 76 nsec
#            put_short: <= 124
#            put_bcopy: <= 8256
#            put_zcopy: <= 1G, up to 3 iov
#  put_opt_zcopy_align: <= 512
#        put_align_mtu: <= 1K
#            get_bcopy: <= 8256
#            get_zcopy: 65..1G, up to 3 iov
#  get_opt_zcopy_align: <= 512
#        get_align_mtu: <= 1K
#             am_short: <= 123
#             am_bcopy: <= 8255
#             am_zcopy: <= 8255, up to 2 iov
#   am_opt_zcopy_align: <= 512
#         am_align_mtu: <= 1K
#            am header: <= 127
#               domain: device
#           atomic_add: 64 bit
#          atomic_fadd: 64 bit
#         atomic_cswap: 64 bit
#           connection: to ep
#      device priority: 50
#     device num paths: 1
#              max eps: 256
#       device address: 17 bytes
#           ep address: 17 bytes
#       error handling: peer failure
#
#
#   Transport: rc_mlx5
#      Device: mlx5_2:1
#
#      capabilities:
#            bandwidth: 10957.84/ppn + 0.00 MB/sec
#              latency: 800 + 1.000 * N nsec
#             overhead (short): 40 nsec
#             overhead (bcopy): 41 nsec
#            put_short: <= 2K
#            put_bcopy: <= 8256
#            put_zcopy: <= 1G, up to 14 iov
#  put_opt_zcopy_align: <= 512
#        put_align_mtu: <= 1K
#            get_bcopy: <= 8256
#            get_zcopy: 65..1G, up to 14 iov
#  get_opt_zcopy_align: <= 512
#        get_align_mtu: <= 1K
#             am_short: <= 2046
#             am_bcopy: <= 8254
#             am_zcopy: <= 8254, up to 3 iov
#   am_opt_zcopy_align: <= 512
#         am_align_mtu: <= 1K
#            am header: <= 186
#               domain: device
#           atomic_add: 32, 64 bit
#           atomic_and: 32, 64 bit
#            atomic_or: 32, 64 bit
#           atomic_xor: 32, 64 bit
#          atomic_fadd: 32, 64 bit
#          atomic_fand: 32, 64 bit
#           atomic_for: 32, 64 bit
#          atomic_fxor: 32, 64 bit
#          atomic_swap: 32, 64 bit
#         atomic_cswap: 32, 64 bit
#           connection: to ep
#      device priority: 50
#     device num paths: 1
#              max eps: 256
#       device address: 17 bytes
#           ep address: 7 bytes
#       error handling: buffer (zcopy), remote access, peer failure
#
#
#   Transport: dc_mlx5
#      Device: mlx5_2:1
#
#      capabilities:
#            bandwidth: 10957.84/ppn + 0.00 MB/sec
#              latency: 860 nsec
#             overhead (short): 40 nsec
#             overhead (bcopy): 41 nsec
#            put_short: <= 2K
#            put_bcopy: <= 8256
#            put_zcopy: <= 1G, up to 11 iov
#  put_opt_zcopy_align: <= 512
#        put_align_mtu: <= 1K
#            get_bcopy: <= 8256
#            get_zcopy: 65..1G, up to 11 iov
#  get_opt_zcopy_align: <= 512
#        get_align_mtu: <= 1K
#             am_short: <= 2046
#             am_bcopy: <= 8254
#             am_zcopy: <= 8254, up to 3 iov
#   am_opt_zcopy_align: <= 512
#         am_align_mtu: <= 1K
#            am header: <= 138
#               domain: device
#           atomic_add: 32, 64 bit
#           atomic_and: 32, 64 bit
#            atomic_or: 32, 64 bit
#           atomic_xor: 32, 64 bit
#          atomic_fadd: 32, 64 bit
#          atomic_fand: 32, 64 bit
#           atomic_for: 32, 64 bit
#          atomic_fxor: 32, 64 bit
#          atomic_swap: 32, 64 bit
#         atomic_cswap: 32, 64 bit
#           connection: to iface
#      device priority: 50
#     device num paths: 1
#              max eps: inf
#       device address: 17 bytes
#        iface address: 5 bytes
#       error handling: buffer (zcopy), remote access, peer failure
#
#
#   Transport: ud_verbs
#      Device: mlx5_2:1
#
#      capabilities:
#            bandwidth: 10957.84/ppn + 0.00 MB/sec
#              latency: 830 nsec
#             overhead (short): 105 nsec
#             overhead (bcopy): 106 nsec
#             am_short: <= 116
#             am_bcopy: <= 1016
#             am_zcopy: <= 1016, up to 1 iov
#   am_opt_zcopy_align: <= 512
#         am_align_mtu: <= 1K
#            am header: <= 880
#           connection: to ep, to iface
#      device priority: 50
#     device num paths: 1
#              max eps: inf
#       device address: 17 bytes
#        iface address: 3 bytes
#           ep address: 6 bytes
#       error handling: peer failure
#
#
#   Transport: ud_mlx5
#      Device: mlx5_2:1
#
#      capabilities:
#            bandwidth: 10957.84/ppn + 0.00 MB/sec
#              latency: 830 nsec
#             overhead (short): 80 nsec
#             overhead (bcopy): 81 nsec
#             am_short: <= 180
#             am_bcopy: <= 1016
#             am_zcopy: <= 1016, up to 3 iov
#   am_opt_zcopy_align: <= 512
#         am_align_mtu: <= 1K
#            am header: <= 132
#           connection: to ep, to iface
#      device priority: 50
#     device num paths: 1
#              max eps: inf
#       device address: 17 bytes
#        iface address: 3 bytes
#           ep address: 6 bytes
#       error handling: peer failure
#
#
# Memory domain: mlx5_3
#     Component: ib
#             register: unlimited, cost: 180 nsec
#           remote key: 8 bytes
#           local memory handle is required for zcopy
#   < no supported devices found >
#
# Memory domain: rdmacm
#     Component: rdmacm
#           supports client-server connection establishment via sockaddr
#   < no supported devices found >
#
# Connection manager: rdmacm
#      max_conn_priv: 54 bytes
#
# Memory domain: cma
#     Component: cma
#             register: unlimited, cost: 9 nsec
#
#   Transport: cma
#      Device: memory
#
#      capabilities:
#            bandwidth: 0.00/ppn + 11145.00 MB/sec
#              latency: 80 nsec
#             overhead (short): 400 nsec
#             overhead (bcopy): 400 nsec
#            put_zcopy: unlimited, up to 16 iov
#  put_opt_zcopy_align: <= 1
#        put_align_mtu: <= 1
#            get_zcopy: unlimited, up to 16 iov
#  get_opt_zcopy_align: <= 1
#        get_align_mtu: <= 1
#           connection: to iface
#      device priority: 0
#     device num paths: 1
#              max eps: inf
#       device address: 8 bytes
#        iface address: 4 bytes
#       error handling: none
#
alex--m commented 4 years ago

Q: Alex, why do you need GID_INDEX=0? A: See #5598.