axboe / liburing

Library providing helpers for the Linux kernel io_uring support
MIT License
2.85k stars 402 forks source link

queue init fails when IORING_SETUP_NO_MMAP is used with IORING_SETUP_CQSIZE #1032

Closed bnbajwa closed 9 months ago

bnbajwa commented 9 months ago

I can't find anything in the docs to suggest IORING_SETUP_CQSIZE is incompatible with IORING_SETUP_NO_MMAP, so seems like a bug.

io_uring_queue_init_mem fails with -14 (-EFAULT). If you remove IORING_SETUP_CQSIZE from flags in the reproducer below, then I don't get this error. Note, that this seems to only occur if the memory required is greater than 1 page (e.g. if you use 4 entries for SQ ring and 16 for CQ ring in the reproducer below, then I don't get EFAULT).

I have modified the test 'buf-ring-nommap.c' to create a reproducer (see below). I'm using Ubuntu 23.10 (kernel version 6.5.0-14-generic)

/* SPDX-License-Identifier: MIT */
/*
 * Description: test IOU_PBUF_RING_MMAP with a ring setup with a ring
 *      setup without mmap'ing sq/cq arrays
 *
 */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>

#include "liburing.h"
#include "helpers.h"

static int bgid = 5;
static int bid = 89;

int main(int argc, char *argv[])
{
    struct io_uring_buf_ring *br;
    struct io_uring_sqe *sqe;
    struct io_uring_cqe *cqe;
    struct io_uring ring;
    size_t ring_size;
    int ret, ring_mask, fds[2];
    struct io_uring_buf_reg reg = {
        .ring_entries = 1,
        .bgid = bgid,
        .flags = IOU_PBUF_RING_MMAP,
    };
    struct io_uring_params p = { };
    void *ring_mem;
    char buf[32];
    off_t off;

    if (argc > 1)
        return T_EXIT_SKIP;

    if (posix_memalign(&ring_mem, 16384, 16384))
        return T_EXIT_FAIL;

    p.flags = IORING_SETUP_NO_MMAP | IORING_SETUP_CQSIZE;
    p.cq_entries = 256;

    ret = io_uring_queue_init_mem(64, &ring, &p, ring_mem, 16384);
    if (ret < 0) {
        if (ret == -EINVAL)
            return T_EXIT_SKIP;
        fprintf(stderr, "queue init failed %d\n", ret);
        return T_EXIT_FAIL;
    }

    if (pipe(fds) < 0) {
        perror("pipe");
        return T_EXIT_FAIL;
    }

    ring_size = sizeof(struct io_uring_buf);
    ring_mask = io_uring_buf_ring_mask(1);

    ret = io_uring_register_buf_ring(&ring, &reg, 0);
    if (ret) {
        if (ret == -EINVAL)
            return T_EXIT_SKIP;
        fprintf(stderr, "reg buf ring: %d\n", ret);
        return T_EXIT_FAIL;
    }

    off = IORING_OFF_PBUF_RING |
            (unsigned long long) bgid << IORING_OFF_PBUF_SHIFT;
    br = mmap(NULL, ring_size, PROT_READ | PROT_WRITE,
                MAP_SHARED | MAP_POPULATE, ring.ring_fd, off);
    if (br == MAP_FAILED) {
        perror("mmap");
        return T_EXIT_FAIL;
    }

    io_uring_buf_ring_add(br, buf, sizeof(buf), bid, ring_mask, 0);
    io_uring_buf_ring_advance(br, 1);

    sqe = io_uring_get_sqe(&ring);
    io_uring_prep_read(sqe, fds[0], NULL, 0, 0);
    sqe->flags |= IOSQE_BUFFER_SELECT;
    sqe->buf_group = bgid;

    io_uring_submit(&ring);

    ret = write(fds[1], "Hello", 5);
    if (ret < 0) {
        perror("write");
        return T_EXIT_FAIL;
    } else if (ret != 5) {
        fprintf(stderr, "short write %d\n", ret);
        return T_EXIT_FAIL;
    }

    ret = io_uring_wait_cqe(&ring, &cqe);
    if (ret) {
        fprintf(stderr, "wait %d\n", ret);
        return T_EXIT_FAIL;
    }
    if (cqe->res < 0) {
        fprintf(stderr, "cqe res %d\n", cqe->res);
        return T_EXIT_FAIL;
    }
    if (!(cqe->flags & IORING_CQE_F_BUFFER)) {
        fprintf(stderr, "buffer not selected in cqe\n");
        return T_EXIT_FAIL;
    }
    if ((cqe->flags >> IORING_CQE_BUFFER_SHIFT) != bid) {
        fprintf(stderr, "wrong buffer id returned\n");
        return T_EXIT_FAIL;
    }

    io_uring_cqe_seen(&ring, cqe);

    io_uring_queue_exit(&ring);
    return T_EXIT_PASS;
}
axboe commented 9 months ago

Using posix_memalign() does not guarantee that the pages will be contiguous, which is a requirement. This is why the man page references using a huge page for this, because that will always work. You're at the mercy of how pages are allocated otherwise.

bnbajwa commented 9 months ago

Thanks for the response.

I did try that too, i.e. allocated huge page with mmap, but got same error (EFAULT).

reg-fd-only.c also tests IORING_SETUP_NO_MMAP with huge page. That test also fails for me ("ring setup failed" and "test 512 failed"). EFAULT is also the reason why this test fails.

I can't seem to figure what's causing the EFAULT error code. I've done a fresh install of Fedora Server 39 (kernel: 6.5.6-300.fc39.x86_64) and re-tested to no avail.

I do have huge pages enabled:

sudo grep Huge /proc/meminfo AnonHugePages: 0 kB ShmemHugePages: 0 kB FileHugePages: 0 kB HugePages_Total: 2048 HugePages_Free: 2048 HugePages_Rsvd: 0 HugePages_Surp: 0 Hugepagesize: 2048 kB Hugetlb: 4194304 kB

I can allocate and use huge pages within my application as normal, no issues.

also:

ulimit -l 7600000000

Any ideas?

axboe commented 9 months ago

Can you send a reproducer with the huge pages? Funky.

bnbajwa commented 9 months ago

yep, see reproducer below. Both tests in the reproducer fail for me with error EFAULT. IORING_SETUP_CQSIZE is irrelevant -- IORING_SETUP_NO_MMAP fails on its own for me with sufficiently large number of entries for SQ ring.

(i compiled/ran it as follows: added new file in liburing/test directory, and then added entry in liburing/test/Makefile, then make command in liburing directory compiles it)

#include <stdio.h>
#include <sys/mman.h>
#include <linux/mman.h>

#include "liburing.h"
#include "helpers.h"

static int test_init1() {
    struct io_uring ring;
    int ret;
    struct io_uring_params p = { };
    void *ring_mem;
    size_t ring_mem_size;

    ring_mem_size = 2u * 1024u * 1024u; // 2MiB
    ring_mem = mmap(NULL,
            ring_mem_size,
            PROT_READ | PROT_WRITE,
            MAP_ANONYMOUS | MAP_SHARED | MAP_HUGETLB,
            -1,
            0);

    if (ring_mem == MAP_FAILED) {
        fprintf(stderr, "huge page allocation failed %d\n", errno);
        return T_EXIT_FAIL;
    }

    p.flags = IORING_SETUP_NO_MMAP;
    ret = io_uring_queue_init_mem(512, &ring, &p, ring_mem, ring_mem_size);
    if (ret < 0) {
        fprintf(stderr, "queue init failed %d\n", ret);
        return T_EXIT_FAIL;
    }

    io_uring_queue_exit(&ring);
    return T_EXIT_PASS;
}

static int test_init2() {
    struct io_uring ring;
    int ret;

    ret = io_uring_queue_init(512, &ring, IORING_SETUP_NO_MMAP);
    if (ret < 0) {
        fprintf(stderr, "queue init failed %d\n", ret);
        return T_EXIT_FAIL;
    }

    io_uring_queue_exit(&ring);
    return T_EXIT_PASS;
}

int main(int argc, char *argv[])
{
    int ret;
    bool failed = false;

    if (argc > 1)
        return T_EXIT_SKIP;

    ret = test_init1();
    if (ret != T_EXIT_PASS) {
        fprintf(stderr, "test 1 failed\n");
        failed = true;
    }

    ret = test_init2();
    if (ret != T_EXIT_PASS) {
        fprintf(stderr, "test 2 failed\n");
        failed = true;
    }

    if (failed)
        return T_EXIT_FAIL;
    else
        return T_EXIT_PASS;
}
axboe commented 9 months ago

Very odd, works fine for me on both arm64 and x86-64. I'll try 6.5-stable to see if it's related to that somehow.

axboe commented 9 months ago

Yep it's an issue with 6.5. 6.5-stable is no longer being maintained, for some reason ubuntu always sticks with those... So not much I can do, except you could possibly nudge ubuntu to include some patches. I can take a peek and figure out which one(s) we need.

bnbajwa commented 9 months ago

Thanks for checking and confirming. I will just use a newer kernel for now, so closing this.