diff options
Diffstat (limited to 'io_uring/kbuf.c')
-rw-r--r-- | io_uring/kbuf.c | 177 |
1 files changed, 156 insertions, 21 deletions
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index a1e4239c7d75..72b6af1d2ed3 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -33,19 +33,42 @@ struct io_provide_buf { __u16 bid; }; +struct io_buf_free { + struct hlist_node list; + void *mem; + size_t size; + int inuse; +}; + +static struct io_buffer_list *__io_buffer_get_list(struct io_ring_ctx *ctx, + struct io_buffer_list *bl, + unsigned int bgid) +{ + if (bl && bgid < BGID_ARRAY) + return &bl[bgid]; + + return xa_load(&ctx->io_bl_xa, bgid); +} + static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, unsigned int bgid) { - if (ctx->io_bl && bgid < BGID_ARRAY) - return &ctx->io_bl[bgid]; + lockdep_assert_held(&ctx->uring_lock); - return xa_load(&ctx->io_bl_xa, bgid); + return __io_buffer_get_list(ctx, ctx->io_bl, bgid); } static int io_buffer_add_list(struct io_ring_ctx *ctx, struct io_buffer_list *bl, unsigned int bgid) { + /* + * Store buffer group ID and finally mark the list as visible. + * The normal lookup doesn't care about the visibility as we're + * always under the ->uring_lock, but the RCU lookup from mmap does. + */ bl->bgid = bgid; + smp_store_release(&bl->is_ready, 1); + if (bgid < BGID_ARRAY) return 0; @@ -196,21 +219,40 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len, static __cold int io_init_bl_list(struct io_ring_ctx *ctx) { + struct io_buffer_list *bl; int i; - ctx->io_bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list), - GFP_KERNEL); - if (!ctx->io_bl) + bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list), GFP_KERNEL); + if (!bl) return -ENOMEM; for (i = 0; i < BGID_ARRAY; i++) { - INIT_LIST_HEAD(&ctx->io_bl[i].buf_list); - ctx->io_bl[i].bgid = i; + INIT_LIST_HEAD(&bl[i].buf_list); + bl[i].bgid = i; } + smp_store_release(&ctx->io_bl, bl); return 0; } +/* + * Mark the given mapped range as free for reuse + */ +static void io_kbuf_mark_free(struct io_ring_ctx *ctx, struct io_buffer_list *bl) +{ + struct io_buf_free *ibf; + + hlist_for_each_entry(ibf, &ctx->io_buf_list, list) { + if (bl->buf_ring == ibf->mem) { + ibf->inuse = 0; + return; + } + } + + /* can't happen... */ + WARN_ON_ONCE(1); +} + static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer_list *bl, unsigned nbufs) { @@ -223,7 +265,11 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, if (bl->is_mapped) { i = bl->buf_ring->tail - bl->head; if (bl->is_mmap) { - folio_put(virt_to_folio(bl->buf_ring)); + /* + * io_kbuf_list_free() will free the page(s) at + * ->release() time. + */ + io_kbuf_mark_free(ctx, bl); bl->buf_ring = NULL; bl->is_mmap = 0; } else if (bl->buf_nr_pages) { @@ -274,9 +320,17 @@ void io_destroy_buffers(struct io_ring_ctx *ctx) xa_for_each(&ctx->io_bl_xa, index, bl) { xa_erase(&ctx->io_bl_xa, bl->bgid); __io_remove_buffers(ctx, bl, -1U); - kfree(bl); + kfree_rcu(bl, rcu); } + /* + * Move deferred locked entries to cache before pruning + */ + spin_lock(&ctx->completion_lock); + if (!list_empty(&ctx->io_buffers_comp)) + list_splice_init(&ctx->io_buffers_comp, &ctx->io_buffers_cache); + spin_unlock(&ctx->completion_lock); + list_for_each_safe(item, tmp, &ctx->io_buffers_cache) { buf = list_entry(item, struct io_buffer, list); kmem_cache_free(io_buf_cachep, buf); @@ -460,7 +514,16 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) INIT_LIST_HEAD(&bl->buf_list); ret = io_buffer_add_list(ctx, bl, p->bgid); if (ret) { - kfree(bl); + /* + * Doesn't need rcu free as it was never visible, but + * let's keep it consistent throughout. Also can't + * be a lower indexed array group, as adding one + * where lookup failed cannot happen. + */ + if (p->bgid >= BGID_ARRAY) + kfree_rcu(bl, rcu); + else + WARN_ON_ONCE(1); goto err; } } @@ -531,19 +594,63 @@ error_unpin: return -EINVAL; } -static int io_alloc_pbuf_ring(struct io_uring_buf_reg *reg, +/* + * See if we have a suitable region that we can reuse, rather than allocate + * both a new io_buf_free and mem region again. We leave it on the list as + * even a reused entry will need freeing at ring release. + */ +static struct io_buf_free *io_lookup_buf_free_entry(struct io_ring_ctx *ctx, + size_t ring_size) +{ + struct io_buf_free *ibf, *best = NULL; + size_t best_dist; + + hlist_for_each_entry(ibf, &ctx->io_buf_list, list) { + size_t dist; + + if (ibf->inuse || ibf->size < ring_size) + continue; + dist = ibf->size - ring_size; + if (!best || dist < best_dist) { + best = ibf; + if (!dist) + break; + best_dist = dist; + } + } + + return best; +} + +static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx, + struct io_uring_buf_reg *reg, struct io_buffer_list *bl) { - gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP; + struct io_buf_free *ibf; size_t ring_size; void *ptr; ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring); - ptr = (void *) __get_free_pages(gfp, get_order(ring_size)); - if (!ptr) - return -ENOMEM; - bl->buf_ring = ptr; + /* Reuse existing entry, if we can */ + ibf = io_lookup_buf_free_entry(ctx, ring_size); + if (!ibf) { + ptr = io_mem_alloc(ring_size); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + + /* Allocate and store deferred free entry */ + ibf = kmalloc(sizeof(*ibf), GFP_KERNEL_ACCOUNT); + if (!ibf) { + io_mem_free(ptr); + return -ENOMEM; + } + ibf->mem = ptr; + ibf->size = ring_size; + hlist_add_head(&ibf->list, &ctx->io_buf_list); + } + ibf->inuse = 1; + bl->buf_ring = ibf->mem; bl->is_mapped = 1; bl->is_mmap = 1; return 0; @@ -555,6 +662,8 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) struct io_buffer_list *bl, *free_bl = NULL; int ret; + lockdep_assert_held(&ctx->uring_lock); + if (copy_from_user(®, arg, sizeof(reg))) return -EFAULT; @@ -599,7 +708,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (!(reg.flags & IOU_PBUF_RING_MMAP)) ret = io_pin_pbuf_ring(®, bl); else - ret = io_alloc_pbuf_ring(®, bl); + ret = io_alloc_pbuf_ring(ctx, ®, bl); if (!ret) { bl->nr_entries = reg.ring_entries; @@ -609,7 +718,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) return 0; } - kfree(free_bl); + kfree_rcu(free_bl, rcu); return ret; } @@ -618,6 +727,8 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) struct io_uring_buf_reg reg; struct io_buffer_list *bl; + lockdep_assert_held(&ctx->uring_lock); + if (copy_from_user(®, arg, sizeof(reg))) return -EFAULT; if (reg.resv[0] || reg.resv[1] || reg.resv[2]) @@ -634,7 +745,7 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) __io_remove_buffers(ctx, bl, -1U); if (bl->bgid >= BGID_ARRAY) { xa_erase(&ctx->io_bl_xa, bl->bgid); - kfree(bl); + kfree_rcu(bl, rcu); } return 0; } @@ -643,9 +754,33 @@ void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid) { struct io_buffer_list *bl; - bl = io_buffer_get_list(ctx, bgid); + bl = __io_buffer_get_list(ctx, smp_load_acquire(&ctx->io_bl), bgid); + if (!bl || !bl->is_mmap) return NULL; + /* + * Ensure the list is fully setup. Only strictly needed for RCU lookup + * via mmap, and in that case only for the array indexed groups. For + * the xarray lookups, it's either visible and ready, or not at all. + */ + if (!smp_load_acquire(&bl->is_ready)) + return NULL; return bl->buf_ring; } + +/* + * Called at or after ->release(), free the mmap'ed buffers that we used + * for memory mapped provided buffer rings. + */ +void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx) +{ + struct io_buf_free *ibf; + struct hlist_node *tmp; + + hlist_for_each_entry_safe(ibf, tmp, &ctx->io_buf_list, list) { + hlist_del(&ibf->list); + io_mem_free(ibf->mem); + kfree(ibf); + } +} |