summaryrefslogtreecommitdiff
path: root/io_uring/kbuf.c
diff options
context:
space:
mode:
authorJens Axboe <axboe@kernel.dk>2023-03-14 11:07:19 -0600
committerJens Axboe <axboe@kernel.dk>2023-04-03 07:14:21 -0600
commitc56e022c0a27142b7b59ae6bdf45f86bf4b298a1 (patch)
treefe2dd7047b6231430a8cac684dd1a0aed2eb4fde /io_uring/kbuf.c
parent81cf17cd3ab3e5441e876a8e9e9c38ae9920cecb (diff)
downloadlwn-c56e022c0a27142b7b59ae6bdf45f86bf4b298a1.tar.gz
lwn-c56e022c0a27142b7b59ae6bdf45f86bf4b298a1.zip
io_uring: add support for user mapped provided buffer ring
The ring mapped provided buffer rings rely on the application allocating the memory for the ring, and then the kernel will map it. This generally works fine, but runs into issues on some architectures where we need to be able to ensure that the kernel and application virtual address for the ring play nicely together. This at least impacts architectures that set SHM_COLOUR, but potentially also anyone setting SHMLBA. To use this variant of ring provided buffers, the application need not allocate any memory for the ring. Instead the kernel will do so, and the allocation must subsequently call mmap(2) on the ring with the offset set to: IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT) to get a virtual address for the buffer ring. Normally the application would allocate a suitable piece of memory (and correctly aligned) and simply pass that in via io_uring_buf_reg.ring_addr and the kernel would map it. Outside of the setup differences, the kernel allocate + user mapped provided buffer ring works exactly the same. Acked-by: Helge Deller <deller@gmx.de> Signed-off-by: Jens Axboe <axboe@kernel.dk>
Diffstat (limited to 'io_uring/kbuf.c')
-rw-r--r--io_uring/kbuf.c99
1 files changed, 76 insertions, 23 deletions
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 4b2f4a0ee962..cd1d9dddf58e 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -137,7 +137,8 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
return NULL;
head &= bl->mask;
- if (head < IO_BUFFER_LIST_BUF_PER_PAGE) {
+ /* mmaped buffers are always contig */
+ if (bl->is_mmap || head < IO_BUFFER_LIST_BUF_PER_PAGE) {
buf = &br->bufs[head];
} else {
int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1);
@@ -214,15 +215,27 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
if (!nbufs)
return 0;
- if (bl->is_mapped && bl->buf_nr_pages) {
- int j;
-
+ if (bl->is_mapped) {
i = bl->buf_ring->tail - bl->head;
- for (j = 0; j < bl->buf_nr_pages; j++)
- unpin_user_page(bl->buf_pages[j]);
- kvfree(bl->buf_pages);
- bl->buf_pages = NULL;
- bl->buf_nr_pages = 0;
+ if (bl->is_mmap) {
+ if (bl->buf_ring) {
+ struct page *page;
+
+ page = virt_to_head_page(bl->buf_ring);
+ if (put_page_testzero(page))
+ free_compound_page(page);
+ bl->buf_ring = NULL;
+ }
+ bl->is_mmap = 0;
+ } else if (bl->buf_nr_pages) {
+ int j;
+
+ for (j = 0; j < bl->buf_nr_pages; j++)
+ unpin_user_page(bl->buf_pages[j]);
+ kvfree(bl->buf_pages);
+ bl->buf_pages = NULL;
+ bl->buf_nr_pages = 0;
+ }
/* make sure it's seen as empty */
INIT_LIST_HEAD(&bl->buf_list);
bl->is_mapped = 0;
@@ -482,6 +495,25 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
bl->buf_nr_pages = nr_pages;
bl->buf_ring = br;
bl->is_mapped = 1;
+ bl->is_mmap = 0;
+ return 0;
+}
+
+static int io_alloc_pbuf_ring(struct io_uring_buf_reg *reg,
+ struct io_buffer_list *bl)
+{
+ gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
+ size_t ring_size;
+ void *ptr;
+
+ ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);
+ ptr = (void *) __get_free_pages(gfp, get_order(ring_size));
+ if (!ptr)
+ return -ENOMEM;
+
+ bl->buf_ring = ptr;
+ bl->is_mapped = 1;
+ bl->is_mmap = 1;
return 0;
}
@@ -496,12 +528,18 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
if (reg.resv[0] || reg.resv[1] || reg.resv[2])
return -EINVAL;
- if (reg.flags)
- return -EINVAL;
- if (!reg.ring_addr)
- return -EFAULT;
- if (reg.ring_addr & ~PAGE_MASK)
+ if (reg.flags & ~IOU_PBUF_RING_MMAP)
return -EINVAL;
+ if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
+ if (!reg.ring_addr)
+ return -EFAULT;
+ if (reg.ring_addr & ~PAGE_MASK)
+ return -EINVAL;
+ } else {
+ if (reg.ring_addr)
+ return -EINVAL;
+ }
+
if (!is_power_of_2(reg.ring_entries))
return -EINVAL;
@@ -526,17 +564,21 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
return -ENOMEM;
}
- ret = io_pin_pbuf_ring(&reg, bl);
- if (ret) {
- kfree(free_bl);
- return ret;
- }
+ if (!(reg.flags & IOU_PBUF_RING_MMAP))
+ ret = io_pin_pbuf_ring(&reg, bl);
+ else
+ ret = io_alloc_pbuf_ring(&reg, bl);
- bl->nr_entries = reg.ring_entries;
- bl->mask = reg.ring_entries - 1;
+ if (!ret) {
+ bl->nr_entries = reg.ring_entries;
+ bl->mask = reg.ring_entries - 1;
- io_buffer_add_list(ctx, bl, reg.bgid);
- return 0;
+ io_buffer_add_list(ctx, bl, reg.bgid);
+ return 0;
+ }
+
+ kfree(free_bl);
+ return ret;
}
int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
@@ -564,3 +606,14 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
}
return 0;
}
+
+void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid)
+{
+ struct io_buffer_list *bl;
+
+ bl = io_buffer_get_list(ctx, bgid);
+ if (!bl || !bl->is_mmap)
+ return NULL;
+
+ return bl->buf_ring;
+}