amzn / amzn-drivers

Official AWS drivers repository for Elastic Network Adapter (ENA) and Elastic Fabric Adapter (EFA)
457 stars 176 forks source link

[Bug]: EFA: ibv_open_device() eventually fails when running in loop #306

Open tvegas1 opened 4 months ago

tvegas1 commented 4 months ago

Preliminary Actions

Driver Type

Linux kernel driver for Elastic Fabric Adapter (EFA)

Driver Tag/Commit

2.8.0g

Custom Code

No

OS Platform and Distribution

5.15.0-1055-aws #60~20.04.1-Ubuntu SMP

$ cat /sys/class/infiniband/rdmap79s0/device/driver/module/version
2.8.0g
$ cat /sys/class/infiniband/rdmap79s0/device/device
0xefa1

Bug description

The program below eventually fails with ENOMEM after a few loops. It can be reproduced at will by restarting the program. When the call to ibv_create_comp_channel() is removed, the failure no longer seems to reproduce.

Is the cq creation with comp_channel supported on EFA?

Reproduction steps

Source for ibv.c is at the end of the description:

$ gcc ./ibv.c -libverbs && ./a.out
Using rdmap79s0:
................................................................
................................................................
................................................................
................................................................
ibv_open_device(rdmap79s0) failed: Cannot allocate memory (12)

Expected Behavior

If a CQ with a completion channel is not supported: perhaps an ibv call should fail.
If a CQ with a completion channel is supported: no failure, even when running in a loop.

Actual Behavior

The call ibv_open_device(rdmap79s0) eventually fails with ENOMEM.

Additional Data

No response

Relevant log output


$ strace ./a.out
openat(AT_FDCWD, "/dev/infiniband/uverbs0", O_RDWR|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFCHR|0666, st_rdev=makedev(0xe7, 0xc0), ...}) = 0
ioctl(3, RDMA_VERBS_IOCTL, 0x7fffffffde70) = -1 ENOSPC (No space left on device)
ioctl(3, RDMA_VERBS_IOCTL, 0x7fffffffddc0) = 0
ioctl(3, RDMA_VERBS_IOCTL, 0x7fffffffd8f0) = 0
ioctl(3, RDMA_VERBS_IOCTL, 0x7fffffffdf50) = 0
ioctl(3, RDMA_VERBS_IOCTL, 0x7fffffffde30) = 0
ioctl(3, RDMA_VERBS_IOCTL, 0x7fffffffdc30) = 0
mmap(NULL, 4096, PROT_READ, MAP_SHARED, 3, 0) = 0x155555552000
mmap(NULL, 4096, PROT_WRITE, MAP_SHARED, 3, 0x1000) = 0x155555008000
ioctl(3, RDMA_VERBS_IOCTL, 0x7fffffffdf00) = 0
munmap(0x155555008004, 4096)            = -1 EINVAL (Invalid argument)
munmap(0x155555552000, 4096)            = 0
close(5)                                = 0
close(3)                                = 0
close(4)                                = 0
openat(AT_FDCWD, "/dev/infiniband/uverbs0", O_RDWR|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFCHR|0666, st_rdev=makedev(0xe7, 0xc0), ...}) = 0
ioctl(3, RDMA_VERBS_IOCTL, 0x7fffffffde70) = -1 ENOSPC (No space left on device)
ioctl(3, RDMA_VERBS_IOCTL, 0x7fffffffddc0) = 0 <============================ DID NOT FAIL
ioctl(3, RDMA_VERBS_IOCTL, 0x7fffffffd8f0) = 0
ioctl(3, RDMA_VERBS_IOCTL, 0x7fffffffdf50) = 0
ioctl(3, RDMA_VERBS_IOCTL, 0x7fffffffde30) = 0
ioctl(3, RDMA_VERBS_IOCTL, 0x7fffffffdc30) = 0
mmap(NULL, 4096, PROT_READ, MAP_SHARED, 3, 0) = 0x155555552000
mmap(NULL, 4096, PROT_WRITE, MAP_SHARED, 3, 0x1000) = 0x155555007000
ioctl(3, RDMA_VERBS_IOCTL, 0x7fffffffdf00) = 0
munmap(0x155555007004, 4096)            = -1 EINVAL (Invalid argument)
munmap(0x155555552000, 4096)            = 0
close(5)                                = 0
close(3)                                = 0
close(4)                                = 0
openat(AT_FDCWD, "/dev/infiniband/uverbs0", O_RDWR|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFCHR|0666, st_rdev=makedev(0xe7, 0xc0), ...}) = 0
ioctl(3, RDMA_VERBS_IOCTL, 0x7fffffffde70) = -1 ENOSPC (No space left on device)
ioctl(3, RDMA_VERBS_IOCTL, 0x7fffffffddc0) = -1 ENOMEM (Cannot allocate memory) <====================== FAILS
close(3)                                = 0
write(1, "Using rdmap79s0:\n..............."..., 340Using rdmap79s0:
................................................................
................................................................
................................................................
................................................................
ibv_open_device(rdmap79s0) failed: Cannot allocate memory (12)
) = 340
exit_group(1)                           = ?
+++ exited with 1 +++
$ cat ibv.c
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <infiniband/verbs.h>

/*
 * Report a failed verbs call and terminate the process.
 *
 * name: string naming the call that failed.
 * dev:  struct ibv_device * the call was issued against; only its name
 *       (via ibv_get_device_name()) is used.
 *
 * Prints "\n<name>(<device name>) failed: <strerror text> (<errno>)" to
 * stdout and exits with status 1. errno is captured before anything else so
 * that evaluating the remaining printf() arguments cannot disturb the value
 * being reported. The do { } while (0) wrapper makes the expansion behave as
 * a single statement wherever it is used.
 */
#define fatal(name, dev) do { \
        int fatal_errno_ = errno; \
        printf("\n%s(%s) failed: %s (%d)\n", \
               (name), ibv_get_device_name(dev), \
               strerror(fatal_errno_), fatal_errno_); \
        exit(1); \
} while (0)

/*
 * Run one full open/create/destroy/close pass against `device`:
 * open a context, create a completion channel (when enabled below), create
 * a 100-entry CQ attached to that channel, then tear everything down in
 * reverse order. Every failing step is reported through fatal(), which
 * exits the process.
 */
static void cycle(struct ibv_device *device)
{
        struct ibv_comp_channel *channel = NULL;
        struct ibv_context *ctx;
        struct ibv_cq *queue;

        ctx = ibv_open_device(device);
        if (ctx == NULL) {
                fatal("ibv_open_device", device);
        }

#if 1
        /* Switch to 0 to create the CQ without a completion channel. */
        channel = ibv_create_comp_channel(ctx);
        if (channel == NULL) {
                fatal("ibv_create_comp_channel", device);
        }
#endif

        queue = ibv_create_cq(ctx, 100, NULL, channel, 0);
        if (queue == NULL) {
                fatal("ibv_create_cq", device);
        }

        if (ibv_destroy_cq(queue) != 0) {
                fatal("ibv_destroy_cq", device);
        }

        /* The channel is only destroyed if it was actually created. */
        if (channel != NULL) {
                if (ibv_destroy_comp_channel(channel) != 0) {
                        fatal("ibv_destroy_comp_channel", device);
                }
        }

        if (ibv_close_device(ctx) != 0) {
                fatal("ibv_close_device", device);
        }
}

/*
 * Entry point of the reproducer.
 *
 * Picks the first device returned by ibv_get_device_list() and calls
 * cycle() on it in an endless loop, printing one '.' per iteration and a
 * line break every 64 iterations.
 *
 * Returns -1 when no device is available. Otherwise the loop never ends on
 * its own; the process terminates through fatal() inside cycle() on the
 * first failing verbs call.
 */
int main(void)
{
        int num_devices = 0;
        struct ibv_device **device_list = ibv_get_device_list(&num_devices);

        if (!device_list || num_devices <= 0) {
                printf("Cannot get device list\n");
                /* A non-NULL but empty list still has to be released. */
                if (device_list) {
                        ibv_free_device_list(device_list);
                }
                return -1;
        }

        printf("Using %s:\n", ibv_get_device_name(device_list[0]));

        for (unsigned i = 0;; i++) {
                /* Start a new line of dots before every 64th iteration. */
                printf("%s.", (i && (i % 64) == 0) ? "\n" : "");
                cycle(device_list[0]);
        }

        /*
         * Not reached: the loop above has no exit condition. Kept so the
         * list is released correctly if the loop is ever bounded.
         */
        ibv_free_device_list(device_list);
        return 0;
}

Contact Details

No response

mrgolin commented 4 months ago

@tvegas1 Thanks for your detailed description, we will look into it and update accordingly.