axboe / liburing

Library providing helpers for the Linux kernel io_uring support
MIT License
2.83k stars 402 forks source link

EFAULT if not respecting IORING_FEAT_SINGLE_MMAP under 6.10-rc1 #1157

Closed Mulling closed 4 months ago

Mulling commented 4 months ago

A way to reproduce it is to ignore IORING_FEAT_SINGLE_MMAP. This does work under <= 6.9.* and if setup with less than 65 entries.

diff --git a/src/setup.c b/src/setup.c
index 1997d25..75ef341 100644
--- a/src/setup.c
+++ b/src/setup.c
@@ -106,31 +106,34 @@ static int io_uring_mmap(int fd, struct io_uring_params *p,
                size += sizeof(struct io_uring_cqe);

        sq->ring_sz = p->sq_off.array + p->sq_entries * sizeof(unsigned);
+
        cq->ring_sz = p->cq_off.cqes + p->cq_entries * size;

-       if (p->features & IORING_FEAT_SINGLE_MMAP) {
-               if (cq->ring_sz > sq->ring_sz)
-                       sq->ring_sz = cq->ring_sz;
-               cq->ring_sz = sq->ring_sz;
-       }
+       // if (p->features & IORING_FEAT_SINGLE_MMAP) {
+       //      if (cq->ring_sz > sq->ring_sz)
+       //              sq->ring_sz = cq->ring_sz;
+       //      cq->ring_sz = sq->ring_sz;
+       // }
+
        sq->ring_ptr = __sys_mmap(0, sq->ring_sz, PROT_READ | PROT_WRITE,
                                  MAP_SHARED | MAP_POPULATE, fd,
                                  IORING_OFF_SQ_RING);
+
        if (IS_ERR(sq->ring_ptr))
                return PTR_ERR(sq->ring_ptr);

-       if (p->features & IORING_FEAT_SINGLE_MMAP) {
-               cq->ring_ptr = sq->ring_ptr;
-       } else {
-               cq->ring_ptr = __sys_mmap(0, cq->ring_sz, PROT_READ | PROT_WRITE,
-                                         MAP_SHARED | MAP_POPULATE, fd,
-                                         IORING_OFF_CQ_RING);
-               if (IS_ERR(cq->ring_ptr)) {
-                       ret = PTR_ERR(cq->ring_ptr);
-                       cq->ring_ptr = NULL;
-                       goto err;
-               }
-       }
+       // if (p->features & IORING_FEAT_SINGLE_MMAP) {
+       //      cq->ring_ptr = sq->ring_ptr;
+       // } else {
+    cq->ring_ptr = __sys_mmap(0, cq->ring_sz, PROT_READ | PROT_WRITE,
+                  MAP_SHARED | MAP_POPULATE, fd,
+                  IORING_OFF_CQ_RING);
+    if (IS_ERR(cq->ring_ptr)) {
+        ret = PTR_ERR(cq->ring_ptr);
+        cq->ring_ptr = NULL;
+        goto err;
+    }
+       // }

        size = sizeof(struct io_uring_sqe);
        if (p->flags & IORING_SETUP_SQE128)
diff --git a/test/nop.c b/test/nop.c
index 1aa88fc..046c0be 100644
--- a/test/nop.c
+++ b/test/nop.c
@@ -129,7 +129,7 @@ static int test_ring(unsigned flags)
        int ret, i;

        p.flags = flags;
-       ret = io_uring_queue_init_params(8, &ring, &p);
+       ret = io_uring_queue_init_params(65, &ring, &p);
        if (ret) {
                if (ret == -EINVAL)
                        return 0;

Fails this check in vm_insert_pages:

    if (addr < vma->vm_start || end_addr >= vma->vm_end)
        return -EFAULT;

with:

[11383.099643] vm_start = 140108537937920,     addr = 140108537937920
[11383.099646] vm_end   = 140108537942016, end_addr = 140108537946111
axboe commented 4 months ago

Was just poking at this based on your earlier tweet, this is what I see here too. liburing seems to be underestimating the memory required, or the kernel overestimating. I'll poke a bit and figure out what is going on.

Mulling commented 4 months ago

I think it's the Kernel, since I came to this by doing things by hand, without liburing.

axboe commented 4 months ago

You sure? Looks like liburing passing in a too small value here. For example, for 128 sq entries, which is 256 entries, liburing passes in 4096. But that covers only the CQEs themselves, not the shared ring space which is 64 bytes. So the kernel thinks it should be 4096+64, liburing thinks it's 4096.

axboe commented 4 months ago

Hmm maybe it is the kernel...

Mulling commented 4 months ago

Not sure, weird that is works on previous kernels using the same ring size.

axboe commented 4 months ago

Agree, the most likely suspect is the kernel not honoring old userspace which still does separate mmap's rather than take advantage for the single mmap.

axboe commented 4 months ago

Try this:

diff --git a/io_uring/memmap.c b/io_uring/memmap.c
index 4785d6af5fee..e5f634a1218e 100644
--- a/io_uring/memmap.c
+++ b/io_uring/memmap.c
@@ -244,17 +244,20 @@ __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
    struct io_ring_ctx *ctx = file->private_data;
    size_t sz = vma->vm_end - vma->vm_start;
    long offset = vma->vm_pgoff << PAGE_SHIFT;
+   unsigned int npages;
    void *ptr;

    ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
    if (IS_ERR(ptr))
        return PTR_ERR(ptr);

+   npages = ctx->n_ring_pages;
    switch (offset & IORING_OFF_MMAP_MASK) {
    case IORING_OFF_SQ_RING:
+       npages = min(npages, (sz + PAGE_SIZE - 1) >> PAGE_SHIFT);
+       fallthrough;
    case IORING_OFF_CQ_RING:
-       return io_uring_mmap_pages(ctx, vma, ctx->ring_pages,
-                       ctx->n_ring_pages);
+       return io_uring_mmap_pages(ctx, vma, ctx->ring_pages, npages);
    case IORING_OFF_SQES:
        return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages,
                        ctx->n_sqe_pages);
axboe commented 4 months ago

Tweaked it a bit and pushed it here:

https://git.kernel.dk/cgit/linux/commit/?h=io_uring-6.10&id=6767af20a5ea99889a64ef90a7f22c23c7d0b962

I like to add a Reported-by: tag to the commits to attribute the person who reported the issue. If you don't mind that, please provide me with an identity and email that I can use for that, eg Name <foo@bar.baz> that I can put in there. If you don't want.

Mulling commented 4 months ago

The patch worked!

Sure, can use Lucas Mülling <lmulling@proton.me>

axboe commented 4 months ago

Great! Updated:

https://git.kernel.dk/cgit/linux/commit/?h=io_uring-6.10&id=06fe9b1df1086b42718d632aa57e8f7cd1a66a21

Closing this one, it'll go into Linus's tree later this week, before 6.10-rc2. Thanks for the report, and for testing -rc kernels! Much better to find these kinds of things before they hit an actual release.