diff options
Diffstat (limited to 'io_uring')
74 files changed, 9473 insertions, 4153 deletions
diff --git a/io_uring/Kconfig b/io_uring/Kconfig new file mode 100644 index 000000000000..a283d9e53787 --- /dev/null +++ b/io_uring/Kconfig @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# io_uring configuration +# + +config IO_URING_ZCRX + def_bool y + depends on IO_URING + depends on PAGE_POOL + depends on INET + depends on NET_RX_BUSY_POLL + +config IO_URING_BPF + def_bool y + depends on BPF + depends on NET + +config IO_URING_BPF_OPS + def_bool y + depends on IO_URING + depends on BPF_SYSCALL && BPF_JIT && DEBUG_INFO_BTF diff --git a/io_uring/Makefile b/io_uring/Makefile index d695b60dba4f..c54e328d1410 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -7,13 +7,22 @@ GCOV_PROFILE := y endif obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \ - tctx.o filetable.o rw.o net.o poll.o \ - eventfd.o uring_cmd.o openclose.o \ - sqpoll.o xattr.o nop.o fs.o splice.o \ - sync.o msg_ring.o advise.o openclose.o \ - epoll.o statx.o timeout.o fdinfo.o \ + tctx.o filetable.o rw.o poll.o \ + tw.o wait.o eventfd.o uring_cmd.o \ + openclose.o sqpoll.o xattr.o nop.o \ + fs.o splice.o sync.o msg_ring.o \ + advise.o openclose.o statx.o timeout.o \ cancel.o waitid.o register.o \ - truncate.o memmap.o alloc_cache.o + truncate.o memmap.o alloc_cache.o \ + query.o loop.o + +obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o obj-$(CONFIG_IO_WQ) += io-wq.o obj-$(CONFIG_FUTEX) += futex.o -obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o +obj-$(CONFIG_EPOLL) += epoll.o +obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o +obj-$(CONFIG_NET) += net.o cmd_net.o +obj-$(CONFIG_PROC_FS) += fdinfo.o +obj-$(CONFIG_IO_URING_MOCK_FILE) += mock_file.o +obj-$(CONFIG_IO_URING_BPF) += bpf_filter.o +obj-$(CONFIG_IO_URING_BPF_OPS) += bpf-ops.o diff --git a/io_uring/advise.c b/io_uring/advise.c index cb7b881665e5..0073f74e3658 100644 --- a/io_uring/advise.c +++ b/io_uring/advise.c @@ -58,7 +58,7 @@ int io_madvise(struct io_kiocb *req, unsigned int issue_flags) ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; #else return -EOPNOTSUPP; #endif @@ -104,5 +104,5 @@ int io_fadvise(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h index 0dd17d8ba93a..962b6e2d04cc 100644 --- a/io_uring/alloc_cache.h +++ b/io_uring/alloc_cache.h @@ -1,7 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef IOU_ALLOC_CACHE_H #define IOU_ALLOC_CACHE_H #include <linux/io_uring_types.h> +#include <linux/kasan.h> /* * Don't allow the cache to grow beyond this size. @@ -16,15 +18,6 @@ bool io_alloc_cache_init(struct io_alloc_cache *cache, void *io_cache_alloc_new(struct io_alloc_cache *cache, gfp_t gfp); -static inline void io_alloc_cache_kasan(struct iovec **iov, int *nr) -{ - if (IS_ENABLED(CONFIG_KASAN)) { - kfree(*iov); - *iov = NULL; - *nr = 0; - } -} - static inline bool io_alloc_cache_put(struct io_alloc_cache *cache, void *entry) { @@ -68,4 +61,10 @@ static inline void *io_cache_alloc(struct io_alloc_cache *cache, gfp_t gfp) return io_cache_alloc_new(cache, gfp); } +static inline void io_cache_free(struct io_alloc_cache *cache, void *obj) +{ + if (!io_alloc_cache_put(cache, obj)) + kvfree(obj); +} + #endif diff --git a/io_uring/bpf-ops.c b/io_uring/bpf-ops.c new file mode 100644 index 000000000000..937e48bef40b --- /dev/null +++ b/io_uring/bpf-ops.c @@ -0,0 +1,270 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/mutex.h> +#include <linux/bpf.h> +#include <linux/bpf_verifier.h> + +#include "io_uring.h" +#include "register.h" +#include "loop.h" +#include "memmap.h" +#include "bpf-ops.h" + +static DEFINE_MUTEX(io_bpf_ctrl_mutex); +static const struct btf_type *loop_params_type; + +__bpf_kfunc_start_defs(); + +__bpf_kfunc int bpf_io_uring_submit_sqes(struct io_ring_ctx *ctx, u32 nr) +{ + return io_submit_sqes(ctx, nr); +} + +__bpf_kfunc +__u8 *bpf_io_uring_get_region(struct io_ring_ctx *ctx, __u32 region_id, + const size_t rdwr_buf_size) +{ + struct io_mapped_region *r; + + lockdep_assert_held(&ctx->uring_lock); + + switch (region_id) { + case IOU_REGION_MEM: + r = &ctx->param_region; + break; + case IOU_REGION_CQ: + r = &ctx->ring_region; + break; + case IOU_REGION_SQ: + r = &ctx->sq_region; + break; + default: + return NULL; + } + + if (unlikely(rdwr_buf_size > io_region_size(r))) + return NULL; + return io_region_get_ptr(r); +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(io_uring_kfunc_set) +BTF_ID_FLAGS(func, bpf_io_uring_submit_sqes, KF_SLEEPABLE); +BTF_ID_FLAGS(func, bpf_io_uring_get_region, KF_RET_NULL); +BTF_KFUNCS_END(io_uring_kfunc_set) + +static const struct btf_kfunc_id_set bpf_io_uring_kfunc_set = { + .owner = THIS_MODULE, + .set = &io_uring_kfunc_set, +}; + +static int io_bpf_ops__loop_step(struct io_ring_ctx *ctx, + struct iou_loop_params *lp) +{ + return IOU_LOOP_STOP; +} + +static struct io_uring_bpf_ops io_bpf_ops_stubs = { + .loop_step = io_bpf_ops__loop_step, +}; + +static bool bpf_io_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (type != BPF_READ) + return false; + if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) + return false; + if (off % size != 0) + return false; + + return btf_ctx_access(off, size, type, prog, info); +} + +static int bpf_io_btf_struct_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, int off, + int size) +{ + const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id); + + if (t == loop_params_type) { + if (off + size <= offsetofend(struct iou_loop_params, cq_wait_idx)) + return SCALAR_VALUE; + } + + return -EACCES; +} + +static const struct bpf_verifier_ops bpf_io_verifier_ops = { + .get_func_proto = bpf_base_func_proto, + .is_valid_access = bpf_io_is_valid_access, + .btf_struct_access = bpf_io_btf_struct_access, +}; + +static const struct btf_type * +io_lookup_struct_type(struct btf *btf, const char *name) +{ + s32 type_id; + + type_id = btf_find_by_name_kind(btf, name, BTF_KIND_STRUCT); + if (type_id < 0) + return NULL; + return btf_type_by_id(btf, type_id); +} + +static int bpf_io_init(struct btf *btf) +{ + int ret; + + loop_params_type = io_lookup_struct_type(btf, "iou_loop_params"); + if (!loop_params_type) { + pr_err("io_uring: Failed to locate iou_loop_params\n"); + return -EINVAL; + } + + ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &bpf_io_uring_kfunc_set); + if (ret) { + pr_err("io_uring: Failed to register kfuncs (%d)\n", ret); + return ret; + } + return 0; +} + +static int bpf_io_check_member(const struct btf_type *t, + const struct btf_member *member, + const struct bpf_prog *prog) +{ + return 0; +} + +static int bpf_io_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) +{ + u32 moff = __btf_member_bit_offset(t, member) / 8; + const struct io_uring_bpf_ops *uops = udata; + struct io_uring_bpf_ops *ops = kdata; + + switch (moff) { + case offsetof(struct io_uring_bpf_ops, ring_fd): + ops->ring_fd = uops->ring_fd; + return 1; + } + return 0; +} + +static int io_install_bpf(struct io_ring_ctx *ctx, struct io_uring_bpf_ops *ops) +{ + if (ctx->flags & (IORING_SETUP_SQPOLL | IORING_SETUP_IOPOLL)) + return -EOPNOTSUPP; + if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) + return -EOPNOTSUPP; + + if (ctx->bpf_ops) + return -EBUSY; + if (WARN_ON_ONCE(!ops->loop_step)) + return -EINVAL; + + ops->priv = ctx; + ctx->bpf_ops = ops; + ctx->loop_step = ops->loop_step; + return 0; +} + +static int bpf_io_reg(void *kdata, struct bpf_link *link) +{ + struct io_uring_bpf_ops *ops = kdata; + struct io_ring_ctx *ctx; + struct file *file; + int ret = -EBUSY; + + file = io_uring_ctx_get_file(ops->ring_fd, false); + if (IS_ERR(file)) + return PTR_ERR(file); + ctx = file->private_data; + + scoped_guard(mutex, &io_bpf_ctrl_mutex) { + guard(mutex)(&ctx->uring_lock); + ret = io_install_bpf(ctx, ops); + } + + fput(file); + return ret; +} + +static void io_eject_bpf(struct io_ring_ctx *ctx) +{ + struct io_uring_bpf_ops *ops = ctx->bpf_ops; + + if (WARN_ON_ONCE(!ops)) + return; + if (WARN_ON_ONCE(ops->priv != ctx)) + return; + + ops->priv = NULL; + ctx->bpf_ops = NULL; + ctx->loop_step = NULL; +} + +static void bpf_io_unreg(void *kdata, struct bpf_link *link) +{ + struct io_uring_bpf_ops *ops = kdata; + struct io_ring_ctx *ctx; + + guard(mutex)(&io_bpf_ctrl_mutex); + ctx = ops->priv; + if (ctx) { + guard(mutex)(&ctx->uring_lock); + if (WARN_ON_ONCE(ctx->bpf_ops != ops)) + return; + + io_eject_bpf(ctx); + } +} + +void io_unregister_bpf_ops(struct io_ring_ctx *ctx) +{ + /* + * ->bpf_ops is write protected by io_bpf_ctrl_mutex and uring_lock, + * and read protected by either. Try to avoid taking the global lock + * for rings that never had any bpf installed. + */ + scoped_guard(mutex, &ctx->uring_lock) { + if (!ctx->bpf_ops) + return; + } + + guard(mutex)(&io_bpf_ctrl_mutex); + guard(mutex)(&ctx->uring_lock); + if (ctx->bpf_ops) + io_eject_bpf(ctx); +} + +static struct bpf_struct_ops bpf_ring_ops = { + .verifier_ops = &bpf_io_verifier_ops, + .reg = bpf_io_reg, + .unreg = bpf_io_unreg, + .check_member = bpf_io_check_member, + .init_member = bpf_io_init_member, + .init = bpf_io_init, + .cfi_stubs = &io_bpf_ops_stubs, + .name = "io_uring_bpf_ops", + .owner = THIS_MODULE, +}; + +static int __init io_uring_bpf_init(void) +{ + int ret; + + ret = register_bpf_struct_ops(&bpf_ring_ops, io_uring_bpf_ops); + if (ret) { + pr_err("io_uring: Failed to register struct_ops (%d)\n", ret); + return ret; + } + + return 0; +} +__initcall(io_uring_bpf_init); diff --git a/io_uring/bpf-ops.h b/io_uring/bpf-ops.h new file mode 100644 index 000000000000..b39b3fd3acda --- /dev/null +++ b/io_uring/bpf-ops.h @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef IOU_BPF_OPS_H +#define IOU_BPF_OPS_H + +#include <linux/io_uring_types.h> + +enum { + IOU_REGION_MEM, + IOU_REGION_CQ, + IOU_REGION_SQ, +}; + +struct io_uring_bpf_ops { + int (*loop_step)(struct io_ring_ctx *ctx, struct iou_loop_params *lp); + + __u32 ring_fd; + void *priv; +}; + +#ifdef CONFIG_IO_URING_BPF_OPS +void io_unregister_bpf_ops(struct io_ring_ctx *ctx); +#else +static inline void io_unregister_bpf_ops(struct io_ring_ctx *ctx) +{ +} +#endif + +#endif /* IOU_BPF_OPS_H */ diff --git a/io_uring/bpf_filter.c b/io_uring/bpf_filter.c new file mode 100644 index 000000000000..c0037632b7af --- /dev/null +++ b/io_uring/bpf_filter.c @@ -0,0 +1,457 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * BPF filter support for io_uring. Supports SQE opcodes for now. + */ +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/io_uring.h> +#include <linux/filter.h> +#include <linux/bpf.h> +#include <uapi/linux/io_uring.h> + +#include "io_uring.h" +#include "bpf_filter.h" +#include "net.h" +#include "openclose.h" + +struct io_bpf_filter { + refcount_t refs; + struct bpf_prog *prog; + struct io_bpf_filter *next; +}; + +/* Deny if this is set as the filter */ +static const struct io_bpf_filter dummy_filter; + +static void io_uring_populate_bpf_ctx(struct io_uring_bpf_ctx *bctx, + struct io_kiocb *req) +{ + const struct io_issue_def *def = &io_issue_defs[req->opcode]; + + bctx->opcode = req->opcode; + bctx->sqe_flags = (__force int) req->flags & SQE_VALID_FLAGS; + bctx->user_data = req->cqe.user_data; + /* clear residual, anything from pdu_size and below */ + memset((void *) bctx + offsetof(struct io_uring_bpf_ctx, pdu_size), 0, + sizeof(*bctx) - offsetof(struct io_uring_bpf_ctx, pdu_size)); + + /* + * Opcodes can provide a handler for populating more data into bctx, + * for filters to use. + */ + if (def->filter_pdu_size) { + bctx->pdu_size = def->filter_pdu_size; + def->filter_populate(bctx, req); + } +} + +/* + * Run registered filters for a given opcode. For filters, a return of 0 denies + * execution of the request, a return of 1 allows it. If any filter for an + * opcode returns 0, filter processing is stopped, and the request is denied. + * This also stops the processing of filters. + * + * __io_uring_run_bpf_filters() returns 0 on success, allow running the + * request, and -EACCES when a request is denied. + */ +int __io_uring_run_bpf_filters(struct io_bpf_filter __rcu **filters, + struct io_kiocb *req) +{ + struct io_bpf_filter *filter; + struct io_uring_bpf_ctx bpf_ctx; + int ret; + + /* Fast check for existence of filters outside of RCU */ + if (!rcu_access_pointer(filters[req->opcode])) + return 0; + + /* + * req->opcode has already been validated to be within the range + * of what we expect, io_init_req() does this. + */ + guard(rcu)(); + filter = rcu_dereference(filters[req->opcode]); + if (!filter) + return 0; + else if (filter == &dummy_filter) + return -EACCES; + + io_uring_populate_bpf_ctx(&bpf_ctx, req); + + /* + * Iterate registered filters. The opcode is allowed IFF all filters + * return 1. If any filter returns denied, opcode will be denied. + */ + do { + if (filter == &dummy_filter) + return -EACCES; + ret = bpf_prog_run_pin_on_cpu(filter->prog, &bpf_ctx); + if (!ret) + return -EACCES; + filter = filter->next; + } while (filter); + + return 0; +} + +static void io_free_bpf_filters(struct rcu_head *head) +{ + struct io_bpf_filter __rcu **filter; + struct io_bpf_filters *filters; + int i; + + filters = container_of(head, struct io_bpf_filters, rcu_head); + scoped_guard(spinlock, &filters->lock) { + filter = filters->filters; + if (!filter) + return; + } + + for (i = 0; i < IORING_OP_LAST; i++) { + struct io_bpf_filter *f; + + rcu_read_lock(); + f = rcu_dereference(filter[i]); + while (f) { + struct io_bpf_filter *next = f->next; + + /* + * Even if stacked, dummy filter will always be last + * as it can only get installed into an empty spot. + */ + if (f == &dummy_filter) + break; + + /* Someone still holds a ref, stop iterating. */ + if (!refcount_dec_and_test(&f->refs)) + break; + + bpf_prog_destroy(f->prog); + kfree(f); + f = next; + } + rcu_read_unlock(); + } + kfree(filters->filters); + kfree(filters); +} + +static void __io_put_bpf_filters(struct io_bpf_filters *filters) +{ + if (refcount_dec_and_test(&filters->refs)) + call_rcu(&filters->rcu_head, io_free_bpf_filters); +} + +void io_put_bpf_filters(struct io_restriction *res) +{ + if (res->bpf_filters) + __io_put_bpf_filters(res->bpf_filters); +} + +static struct io_bpf_filters *io_new_bpf_filters(void) +{ + struct io_bpf_filters *filters __free(kfree) = NULL; + + filters = kzalloc_obj(*filters, GFP_KERNEL_ACCOUNT); + if (!filters) + return ERR_PTR(-ENOMEM); + + filters->filters = kzalloc_objs(struct io_bpf_filter *, IORING_OP_LAST, + GFP_KERNEL_ACCOUNT); + if (!filters->filters) + return ERR_PTR(-ENOMEM); + + refcount_set(&filters->refs, 1); + spin_lock_init(&filters->lock); + return no_free_ptr(filters); +} + +/* + * Validate classic BPF filter instructions. Only allow a safe subset of + * operations - no packet data access, just context field loads and basic + * ALU/jump operations. + */ +static int io_uring_check_cbpf_filter(struct sock_filter *filter, + unsigned int flen) +{ + int pc; + + for (pc = 0; pc < flen; pc++) { + struct sock_filter *ftest = &filter[pc]; + u16 code = ftest->code; + u32 k = ftest->k; + + switch (code) { + case BPF_LD | BPF_W | BPF_ABS: + ftest->code = BPF_LDX | BPF_W | BPF_ABS; + /* 32-bit aligned and not out of bounds. */ + if (k >= sizeof(struct io_uring_bpf_ctx) || k & 3) + return -EINVAL; + continue; + case BPF_LD | BPF_W | BPF_LEN: + ftest->code = BPF_LD | BPF_IMM; + ftest->k = sizeof(struct io_uring_bpf_ctx); + continue; + case BPF_LDX | BPF_W | BPF_LEN: + ftest->code = BPF_LDX | BPF_IMM; + ftest->k = sizeof(struct io_uring_bpf_ctx); + continue; + /* Explicitly include allowed calls. */ + case BPF_RET | BPF_K: + case BPF_RET | BPF_A: + case BPF_ALU | BPF_ADD | BPF_K: + case BPF_ALU | BPF_ADD | BPF_X: + case BPF_ALU | BPF_SUB | BPF_K: + case BPF_ALU | BPF_SUB | BPF_X: + case BPF_ALU | BPF_MUL | BPF_K: + case BPF_ALU | BPF_MUL | BPF_X: + case BPF_ALU | BPF_DIV | BPF_K: + case BPF_ALU | BPF_DIV | BPF_X: + case BPF_ALU | BPF_AND | BPF_K: + case BPF_ALU | BPF_AND | BPF_X: + case BPF_ALU | BPF_OR | BPF_K: + case BPF_ALU | BPF_OR | BPF_X: + case BPF_ALU | BPF_XOR | BPF_K: + case BPF_ALU | BPF_XOR | BPF_X: + case BPF_ALU | BPF_LSH | BPF_K: + case BPF_ALU | BPF_LSH | BPF_X: + case BPF_ALU | BPF_RSH | BPF_K: + case BPF_ALU | BPF_RSH | BPF_X: + case BPF_ALU | BPF_NEG: + case BPF_LD | BPF_IMM: + case BPF_LDX | BPF_IMM: + case BPF_MISC | BPF_TAX: + case BPF_MISC | BPF_TXA: + case BPF_LD | BPF_MEM: + case BPF_LDX | BPF_MEM: + case BPF_ST: + case BPF_STX: + case BPF_JMP | BPF_JA: + case BPF_JMP | BPF_JEQ | BPF_K: + case BPF_JMP | BPF_JEQ | BPF_X: + case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JGE | BPF_X: + case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JGT | BPF_X: + case BPF_JMP | BPF_JSET | BPF_K: + case BPF_JMP | BPF_JSET | BPF_X: + continue; + default: + return -EINVAL; + } + } + return 0; +} + +void io_bpf_filter_clone(struct io_restriction *dst, struct io_restriction *src) +{ + if (!src->bpf_filters) + return; + + rcu_read_lock(); + /* + * If the src filter is going away, just ignore it. + */ + if (refcount_inc_not_zero(&src->bpf_filters->refs)) { + dst->bpf_filters = src->bpf_filters; + dst->bpf_filters_cow = true; + } + rcu_read_unlock(); +} + +/* + * Allocate a new struct io_bpf_filters. Used when a filter is cloned and + * modifications need to be made. + */ +static struct io_bpf_filters *io_bpf_filter_cow(struct io_restriction *src) +{ + struct io_bpf_filters *filters; + struct io_bpf_filter *srcf; + int i; + + filters = io_new_bpf_filters(); + if (IS_ERR(filters)) + return filters; + + /* + * Iterate filters from src and assign in destination. Grabbing + * a reference is enough, we don't need to duplicate the memory. + * This is safe because filters are only ever appended to the + * front of the list, hence the only memory ever touched inside + * a filter is the refcount. + */ + rcu_read_lock(); + for (i = 0; i < IORING_OP_LAST; i++) { + srcf = rcu_dereference(src->bpf_filters->filters[i]); + if (!srcf) { + continue; + } else if (srcf == &dummy_filter) { + rcu_assign_pointer(filters->filters[i], &dummy_filter); + continue; + } + + /* + * Getting a ref on the first node is enough, putting the + * filter and iterating nodes to free will stop on the first + * one that doesn't hit zero when dropping. + */ + if (!refcount_inc_not_zero(&srcf->refs)) + goto err; + rcu_assign_pointer(filters->filters[i], srcf); + } + rcu_read_unlock(); + return filters; +err: + rcu_read_unlock(); + __io_put_bpf_filters(filters); + return ERR_PTR(-EBUSY); +} + +#define IO_URING_BPF_FILTER_FLAGS (IO_URING_BPF_FILTER_DENY_REST | \ + IO_URING_BPF_FILTER_SZ_STRICT) + +static int io_bpf_filter_import(struct io_uring_bpf *reg, + struct io_uring_bpf __user *arg) +{ + const struct io_issue_def *def; + int ret; + + if (copy_from_user(reg, arg, sizeof(*reg))) + return -EFAULT; + if (reg->cmd_type != IO_URING_BPF_CMD_FILTER) + return -EINVAL; + if (reg->cmd_flags || reg->resv) + return -EINVAL; + + if (reg->filter.opcode >= IORING_OP_LAST) + return -EINVAL; + if (reg->filter.flags & ~IO_URING_BPF_FILTER_FLAGS) + return -EINVAL; + if (!mem_is_zero(reg->filter.resv, sizeof(reg->filter.resv))) + return -EINVAL; + if (!mem_is_zero(reg->filter.resv2, sizeof(reg->filter.resv2))) + return -EINVAL; + if (!reg->filter.filter_len || reg->filter.filter_len > BPF_MAXINSNS) + return -EINVAL; + + /* Verify filter size */ + def = &io_issue_defs[array_index_nospec(reg->filter.opcode, IORING_OP_LAST)]; + + /* same size, always ok */ + ret = 0; + if (reg->filter.pdu_size == def->filter_pdu_size) + ; + /* size differs, fail in strict mode */ + else if (reg->filter.flags & IO_URING_BPF_FILTER_SZ_STRICT) + ret = -EMSGSIZE; + /* userspace filter is bigger, always disallow */ + else if (reg->filter.pdu_size > def->filter_pdu_size) + ret = -EMSGSIZE; + + /* copy back kernel filter size */ + reg->filter.pdu_size = def->filter_pdu_size; + if (copy_to_user(&arg->filter, ®->filter, sizeof(reg->filter))) + return -EFAULT; + + return ret; +} + +int io_register_bpf_filter(struct io_restriction *res, + struct io_uring_bpf __user *arg) +{ + struct io_bpf_filters *filters, *old_filters = NULL; + struct io_bpf_filter *filter, *old_filter; + struct io_uring_bpf reg; + struct bpf_prog *prog; + struct sock_fprog fprog; + int ret; + + ret = io_bpf_filter_import(®, arg); + if (ret) + return ret; + + fprog.len = reg.filter.filter_len; + fprog.filter = u64_to_user_ptr(reg.filter.filter_ptr); + + ret = bpf_prog_create_from_user(&prog, &fprog, + io_uring_check_cbpf_filter, false); + if (ret) + return ret; + + /* + * No existing filters, allocate set. + */ + filters = res->bpf_filters; + if (!filters) { + filters = io_new_bpf_filters(); + if (IS_ERR(filters)) { + ret = PTR_ERR(filters); + goto err_prog; + } + } else if (res->bpf_filters_cow) { + filters = io_bpf_filter_cow(res); + if (IS_ERR(filters)) { + ret = PTR_ERR(filters); + goto err_prog; + } + /* + * Stash old filters, we'll put them once we know we'll + * succeed. Until then, res->bpf_filters is left untouched. + */ + old_filters = res->bpf_filters; + } + + filter = kzalloc_obj(*filter, GFP_KERNEL_ACCOUNT); + if (!filter) { + ret = -ENOMEM; + goto err; + } + refcount_set(&filter->refs, 1); + filter->prog = prog; + + /* + * Success - install the new filter set now. If we did COW, put + * the old filters as we're replacing them. + */ + if (old_filters) { + __io_put_bpf_filters(old_filters); + res->bpf_filters_cow = false; + } + res->bpf_filters = filters; + + /* + * Insert filter - if the current opcode already has a filter + * attached, add to the set. + */ + rcu_read_lock(); + spin_lock_bh(&filters->lock); + old_filter = rcu_dereference(filters->filters[reg.filter.opcode]); + if (old_filter) + filter->next = old_filter; + rcu_assign_pointer(filters->filters[reg.filter.opcode], filter); + + /* + * If IO_URING_BPF_FILTER_DENY_REST is set, fill any unregistered + * opcode with the dummy filter. That will cause them to be denied. + */ + if (reg.filter.flags & IO_URING_BPF_FILTER_DENY_REST) { + for (int i = 0; i < IORING_OP_LAST; i++) { + if (i == reg.filter.opcode) + continue; + old_filter = rcu_dereference(filters->filters[i]); + if (old_filter) + continue; + rcu_assign_pointer(filters->filters[i], &dummy_filter); + } + } + + spin_unlock_bh(&filters->lock); + rcu_read_unlock(); + return 0; +err: + if (filters != res->bpf_filters) + __io_put_bpf_filters(filters); +err_prog: + bpf_prog_destroy(prog); + return ret; +} diff --git a/io_uring/bpf_filter.h b/io_uring/bpf_filter.h new file mode 100644 index 000000000000..66a776cf25b4 --- /dev/null +++ b/io_uring/bpf_filter.h @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef IO_URING_BPF_FILTER_H +#define IO_URING_BPF_FILTER_H + +#include <uapi/linux/io_uring/bpf_filter.h> + +#ifdef CONFIG_IO_URING_BPF + +int __io_uring_run_bpf_filters(struct io_bpf_filter __rcu **filters, struct io_kiocb *req); + +int io_register_bpf_filter(struct io_restriction *res, + struct io_uring_bpf __user *arg); + +void io_put_bpf_filters(struct io_restriction *res); + +void io_bpf_filter_clone(struct io_restriction *dst, struct io_restriction *src); + +static inline int io_uring_run_bpf_filters(struct io_bpf_filter __rcu **filters, + struct io_kiocb *req) +{ + if (filters) + return __io_uring_run_bpf_filters(filters, req); + + return 0; +} + +#else + +static inline int io_register_bpf_filter(struct io_restriction *res, + struct io_uring_bpf __user *arg) +{ + return -EINVAL; +} +static inline int io_uring_run_bpf_filters(struct io_bpf_filter __rcu **filters, + struct io_kiocb *req) +{ + return 0; +} +static inline void io_put_bpf_filters(struct io_restriction *res) +{ +} +static inline void io_bpf_filter_clone(struct io_restriction *dst, + struct io_restriction *src) +{ +} +#endif /* CONFIG_IO_URING_BPF */ + +#endif diff --git a/io_uring/cancel.c b/io_uring/cancel.c index 484193567839..4aa3103ba9c3 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -2,22 +2,24 @@ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/fs.h> -#include <linux/file.h> #include <linux/mm.h> #include <linux/slab.h> -#include <linux/namei.h> #include <linux/nospec.h> #include <linux/io_uring.h> #include <uapi/linux/io_uring.h> +#include "filetable.h" #include "io_uring.h" #include "tctx.h" +#include "sqpoll.h" +#include "uring_cmd.h" #include "poll.h" #include "timeout.h" #include "waitid.h" #include "futex.h" #include "cancel.h" +#include "wait.h" struct io_cancel { struct file *file; @@ -154,9 +156,16 @@ int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) cancel->fd = READ_ONCE(sqe->fd); } if (cancel->flags & IORING_ASYNC_CANCEL_OP) { + u32 op; + if (cancel->flags & IORING_ASYNC_CANCEL_ANY) return -EINVAL; - cancel->opcode = READ_ONCE(sqe->len); + + op = READ_ONCE(sqe->len); + if (op >= IORING_OP_LAST) + return -EINVAL; + + cancel->opcode = op; } return 0; @@ -181,7 +190,9 @@ static int __io_async_cancel(struct io_cancel_data *cd, } while (1); /* slow path, try all io-wq's */ + __set_current_state(TASK_RUNNING); io_ring_submit_lock(ctx, issue_flags); + mutex_lock(&ctx->tctx_lock); ret = -ENOENT; list_for_each_entry(node, &ctx->tctx_list, ctx_node) { ret = io_async_cancel_one(node->task->io_uring, cd); @@ -191,6 +202,7 @@ static int __io_async_cancel(struct io_cancel_data *cd, nr++; } } + mutex_unlock(&ctx->tctx_lock); io_ring_submit_unlock(ctx, issue_flags); return all ? nr : ret; } @@ -229,7 +241,7 @@ done: if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } static int __io_sync_cancel(struct io_uring_task *tctx, @@ -341,3 +353,315 @@ out: fput(file); return ret; } + +bool io_cancel_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, + struct hlist_head *list, bool cancel_all, + bool (*cancel)(struct io_kiocb *)) +{ + struct hlist_node *tmp; + struct io_kiocb *req; + bool found = false; + + lockdep_assert_held(&ctx->uring_lock); + + hlist_for_each_entry_safe(req, tmp, list, hash_node) { + if (!io_match_task_safe(req, tctx, cancel_all)) + continue; + hlist_del_init(&req->hash_node); + if (cancel(req)) + found = true; + } + + return found; +} + +int io_cancel_remove(struct io_ring_ctx *ctx, struct io_cancel_data *cd, + unsigned int issue_flags, struct hlist_head *list, + bool (*cancel)(struct io_kiocb *)) +{ + struct hlist_node *tmp; + struct io_kiocb *req; + int nr = 0; + + io_ring_submit_lock(ctx, issue_flags); + hlist_for_each_entry_safe(req, tmp, list, hash_node) { + if (!io_cancel_req_match(req, cd)) + continue; + if (cancel(req)) + nr++; + if (!(cd->flags & IORING_ASYNC_CANCEL_ALL)) + break; + } + io_ring_submit_unlock(ctx, issue_flags); + return nr ?: -ENOENT; +} + +static bool io_match_linked(struct io_kiocb *head) +{ + struct io_kiocb *req; + + io_for_each_link(req, head) { + if (req->flags & REQ_F_INFLIGHT) + return true; + } + return false; +} + +/* + * As io_match_task() but protected against racing with linked timeouts. + * User must not hold timeout_lock. + */ +bool io_match_task_safe(struct io_kiocb *head, struct io_uring_task *tctx, + bool cancel_all) +{ + bool matched; + + if (tctx && head->tctx != tctx) + return false; + if (cancel_all) + return true; + + if (head->flags & REQ_F_LINK_TIMEOUT) { + struct io_ring_ctx *ctx = head->ctx; + + /* protect against races with linked timeouts */ + raw_spin_lock_irq(&ctx->timeout_lock); + matched = io_match_linked(head); + raw_spin_unlock_irq(&ctx->timeout_lock); + } else { + matched = io_match_linked(head); + } + return matched; +} + +void __io_uring_cancel(bool cancel_all) +{ + io_uring_unreg_ringfd(); + io_uring_cancel_generic(cancel_all, NULL); +} + +struct io_task_cancel { + struct io_uring_task *tctx; + bool all; +}; + +static bool io_cancel_task_cb(struct io_wq_work *work, void *data) +{ + struct io_kiocb *req = container_of(work, struct io_kiocb, work); + struct io_task_cancel *cancel = data; + + return io_match_task_safe(req, cancel->tctx, cancel->all); +} + +static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx, + struct io_uring_task *tctx, + bool cancel_all) +{ + struct io_defer_entry *de; + LIST_HEAD(list); + + list_for_each_entry_reverse(de, &ctx->defer_list, list) { + if (io_match_task_safe(de->req, tctx, cancel_all)) { + list_cut_position(&list, &ctx->defer_list, &de->list); + break; + } + } + if (list_empty(&list)) + return false; + + while (!list_empty(&list)) { + de = list_first_entry(&list, struct io_defer_entry, list); + list_del_init(&de->list); + ctx->nr_drained -= io_linked_nr(de->req); + io_req_task_queue_fail(de->req, -ECANCELED); + kfree(de); + } + return true; +} + +__cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data) +{ + struct io_kiocb *req = container_of(work, struct io_kiocb, work); + + return req->ctx == data; +} + +static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) +{ + struct io_tctx_node *node; + enum io_wq_cancel cret; + bool ret = false; + + mutex_lock(&ctx->uring_lock); + mutex_lock(&ctx->tctx_lock); + list_for_each_entry(node, &ctx->tctx_list, ctx_node) { + struct io_uring_task *tctx = node->task->io_uring; + + /* + * io_wq will stay alive while we hold uring_lock, because it's + * killed after ctx nodes, which requires to take the lock. + */ + if (!tctx || !tctx->io_wq) + continue; + cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true); + ret |= (cret != IO_WQ_CANCEL_NOTFOUND); + } + mutex_unlock(&ctx->tctx_lock); + mutex_unlock(&ctx->uring_lock); + + return ret; +} + +__cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, + struct io_uring_task *tctx, + bool cancel_all, bool is_sqpoll_thread) +{ + struct io_task_cancel cancel = { .tctx = tctx, .all = cancel_all, }; + enum io_wq_cancel cret; + bool ret = false; + + /* set it so io_req_local_work_add() would wake us up */ + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { + atomic_set(&ctx->cq_wait_nr, 1); + smp_mb(); + } + + /* failed during ring init, it couldn't have issued any requests */ + if (!ctx->rings) + return false; + + if (!tctx) { + ret |= io_uring_try_cancel_iowq(ctx); + } else if (tctx->io_wq) { + /* + * Cancels requests of all rings, not only @ctx, but + * it's fine as the task is in exit/exec. + */ + cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb, + &cancel, true); + ret |= (cret != IO_WQ_CANCEL_NOTFOUND); + } + + /* SQPOLL thread does its own polling */ + if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) || + is_sqpoll_thread) { + while (!list_empty(&ctx->iopoll_list)) { + io_iopoll_try_reap_events(ctx); + ret = true; + cond_resched(); + } + } + + if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && + io_allowed_defer_tw_run(ctx)) + ret |= io_run_local_work(ctx, INT_MAX, INT_MAX) > 0; + mutex_lock(&ctx->uring_lock); + ret |= io_cancel_defer_files(ctx, tctx, cancel_all); + ret |= io_poll_remove_all(ctx, tctx, cancel_all); + ret |= io_waitid_remove_all(ctx, tctx, cancel_all); + ret |= io_futex_remove_all(ctx, tctx, cancel_all); + ret |= io_uring_try_cancel_uring_cmd(ctx, tctx, cancel_all); + ret |= io_kill_timeouts(ctx, tctx, cancel_all); + mutex_unlock(&ctx->uring_lock); + if (tctx) + ret |= io_run_task_work() > 0; + else + ret |= flush_delayed_work(&ctx->fallback_work); + return ret; +} + +static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked) +{ + if (tracked) + return atomic_read(&tctx->inflight_tracked); + return percpu_counter_sum(&tctx->inflight); +} + +/* + * Find any io_uring ctx that this task has registered or done IO on, and cancel + * requests. @sqd should be not-null IFF it's an SQPOLL thread cancellation. + */ +__cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) +{ + struct io_uring_task *tctx = current->io_uring; + struct io_ring_ctx *ctx; + struct io_tctx_node *node; + unsigned long index; + s64 inflight; + DEFINE_WAIT(wait); + + WARN_ON_ONCE(sqd && sqpoll_task_locked(sqd) != current); + + if (!current->io_uring) + return; + if (tctx->io_wq) + io_wq_exit_start(tctx->io_wq); + + atomic_inc(&tctx->in_cancel); + do { + bool loop = false; + + io_uring_drop_tctx_refs(current); + if (!tctx_inflight(tctx, !cancel_all)) + break; + + /* read completions before cancelations */ + inflight = tctx_inflight(tctx, false); + if (!inflight) + break; + + if (!sqd) { + xa_for_each(&tctx->xa, index, node) { + /* sqpoll task will cancel all its requests */ + if (node->ctx->sq_data) + continue; + loop |= io_uring_try_cancel_requests(node->ctx, + current->io_uring, + cancel_all, + false); + } + } else { + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) + loop |= io_uring_try_cancel_requests(ctx, + current->io_uring, + cancel_all, + true); + } + + if (loop) { + cond_resched(); + continue; + } + + prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE); + io_run_task_work(); + io_uring_drop_tctx_refs(current); + xa_for_each(&tctx->xa, index, node) { + if (io_local_work_pending(node->ctx)) { + WARN_ON_ONCE(node->ctx->submitter_task && + node->ctx->submitter_task != current); + goto end_wait; + } + } + /* + * If we've seen completions, retry without waiting. This + * avoids a race where a completion comes in before we did + * prepare_to_wait(). + */ + if (inflight == tctx_inflight(tctx, !cancel_all)) + schedule(); +end_wait: + finish_wait(&tctx->wait, &wait); + } while (1); + + io_uring_clean_tctx(tctx); + if (cancel_all) { + /* + * We shouldn't run task_works after cancel, so just leave + * ->in_cancel set for normal exit. + */ + atomic_dec(&tctx->in_cancel); + /* for exec all current's requests should be gone, kill tctx */ + __io_uring_free(current); + } +} diff --git a/io_uring/cancel.h b/io_uring/cancel.h index bbfea2cd00ea..1b201a094303 100644 --- a/io_uring/cancel.h +++ b/io_uring/cancel.h @@ -6,10 +6,8 @@ struct io_cancel_data { struct io_ring_ctx *ctx; - union { - u64 data; - struct file *file; - }; + u64 data; + struct file *file; u8 opcode; u32 flags; int seq; @@ -23,6 +21,20 @@ int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd, int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg); bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd); +bool io_match_task_safe(struct io_kiocb *head, struct io_uring_task *tctx, + bool cancel_all); + +bool io_cancel_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, + struct hlist_head *list, bool cancel_all, + bool (*cancel)(struct io_kiocb *)); +int io_cancel_remove(struct io_ring_ctx *ctx, struct io_cancel_data *cd, + unsigned int issue_flags, struct hlist_head *list, + bool (*cancel)(struct io_kiocb *)); +__cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, + struct io_uring_task *tctx, + bool cancel_all, bool is_sqpoll_thread); +__cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); +__cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data); static inline bool io_cancel_match_sequence(struct io_kiocb *req, int sequence) { diff --git a/io_uring/cmd_net.c b/io_uring/cmd_net.c new file mode 100644 index 000000000000..7cd411fc4f33 --- /dev/null +++ b/io_uring/cmd_net.c @@ -0,0 +1,192 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <asm/ioctls.h> +#include <linux/io_uring/net.h> +#include <linux/errqueue.h> +#include <net/sock.h> + +#include "uring_cmd.h" +#include "io_uring.h" + +static int io_uring_cmd_get_sock_ioctl(struct socket *sock, int op) +{ + struct sock *sk = sock->sk; + struct proto *prot = READ_ONCE(sk->sk_prot); + int ret, arg = 0; + + if (!prot || !prot->ioctl) + return -EOPNOTSUPP; + + ret = prot->ioctl(sk, op, &arg); + if (ret) + return ret; + return arg; +} + +static inline int io_uring_cmd_getsockopt(struct socket *sock, + struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + const struct io_uring_sqe *sqe = cmd->sqe; + bool compat = !!(issue_flags & IO_URING_F_COMPAT); + int optlen, optname, level, err; + void __user *optval; + + level = READ_ONCE(sqe->level); + if (level != SOL_SOCKET) + return -EOPNOTSUPP; + + optval = u64_to_user_ptr(READ_ONCE(sqe->optval)); + optname = READ_ONCE(sqe->optname); + optlen = READ_ONCE(sqe->optlen); + + err = do_sock_getsockopt(sock, compat, level, optname, + USER_SOCKPTR(optval), + KERNEL_SOCKPTR(&optlen)); + if (err) + return err; + + /* On success, return optlen */ + return optlen; +} + +static inline int io_uring_cmd_setsockopt(struct socket *sock, + struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + const struct io_uring_sqe *sqe = cmd->sqe; + bool compat = !!(issue_flags & IO_URING_F_COMPAT); + int optname, optlen, level; + void __user *optval; + sockptr_t optval_s; + + optval = u64_to_user_ptr(READ_ONCE(sqe->optval)); + optname = READ_ONCE(sqe->optname); + optlen = READ_ONCE(sqe->optlen); + level = READ_ONCE(sqe->level); + optval_s = USER_SOCKPTR(optval); + + return do_sock_setsockopt(sock, compat, level, optname, optval_s, + optlen); +} + +static bool io_process_timestamp_skb(struct io_uring_cmd *cmd, struct sock *sk, + struct sk_buff *skb, unsigned issue_flags) +{ + struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); + struct io_uring_cqe cqe[2]; + struct io_timespec *iots; + struct timespec64 ts; + u32 tstype, tskey; + int ret; + + BUILD_BUG_ON(sizeof(struct io_uring_cqe) != sizeof(struct io_timespec)); + + ret = skb_get_tx_timestamp(skb, sk, &ts); + if (ret < 0) + return false; + + tskey = serr->ee.ee_data; + tstype = serr->ee.ee_info; + + cqe->user_data = 0; + cqe->res = tskey; + cqe->flags = IORING_CQE_F_MORE | ctx_cqe32_flags(cmd_to_io_kiocb(cmd)->ctx); + cqe->flags |= tstype << IORING_TIMESTAMP_TYPE_SHIFT; + if (ret == SOF_TIMESTAMPING_TX_HARDWARE) + cqe->flags |= IORING_CQE_F_TSTAMP_HW; + + iots = (struct io_timespec *)&cqe[1]; + iots->tv_sec = ts.tv_sec; + iots->tv_nsec = ts.tv_nsec; + return io_uring_cmd_post_mshot_cqe32(cmd, issue_flags, cqe); +} + +static int io_uring_cmd_timestamp(struct socket *sock, + struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + struct sock *sk = sock->sk; + struct sk_buff_head *q = &sk->sk_error_queue; + struct sk_buff *skb, *tmp; + struct sk_buff_head list; + int ret; + + if (!(issue_flags & IO_URING_F_CQE32)) + return -EINVAL; + ret = io_cmd_poll_multishot(cmd, issue_flags, EPOLLERR); + if (unlikely(ret)) + return ret; + + if (skb_queue_empty_lockless(q)) + return -EAGAIN; + __skb_queue_head_init(&list); + + scoped_guard(spinlock_irq, &q->lock) { + skb_queue_walk_safe(q, skb, tmp) { + /* don't support skbs with payload */ + if (!skb_has_tx_timestamp(skb, sk) || skb->len) + continue; + __skb_unlink(skb, q); + __skb_queue_tail(&list, skb); + } + } + + while (1) { + skb = skb_peek(&list); + if (!skb) + break; + if (!io_process_timestamp_skb(cmd, sk, skb, issue_flags)) + break; + __skb_dequeue(&list); + consume_skb(skb); + } + + if (!unlikely(skb_queue_empty(&list))) { + scoped_guard(spinlock_irqsave, &q->lock) + skb_queue_splice(&list, q); + } + return -EAGAIN; +} + +static int io_uring_cmd_getsockname(struct socket *sock, + struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + const struct io_uring_sqe *sqe = cmd->sqe; + struct sockaddr __user *uaddr; + unsigned int peer; + int __user *ulen; + + if (sqe->ioprio || sqe->__pad1 || sqe->len || sqe->rw_flags) + return -EINVAL; + + uaddr = u64_to_user_ptr(READ_ONCE(sqe->addr)); + ulen = u64_to_user_ptr(READ_ONCE(sqe->addr3)); + peer = READ_ONCE(sqe->optlen); + if (peer > 1) + return -EINVAL; + return do_getsockname(sock, peer, uaddr, ulen); +} + +int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + struct socket *sock = cmd->file->private_data; + + switch (cmd->cmd_op) { + case SOCKET_URING_OP_SIOCINQ: + return io_uring_cmd_get_sock_ioctl(sock, SIOCINQ); + case SOCKET_URING_OP_SIOCOUTQ: + return io_uring_cmd_get_sock_ioctl(sock, SIOCOUTQ); + case SOCKET_URING_OP_GETSOCKOPT: + return io_uring_cmd_getsockopt(sock, cmd, issue_flags); + case SOCKET_URING_OP_SETSOCKOPT: + return io_uring_cmd_setsockopt(sock, cmd, issue_flags); + case SOCKET_URING_OP_TX_TIMESTAMP: + return io_uring_cmd_timestamp(sock, cmd, issue_flags); + case SOCKET_URING_OP_GETSOCKNAME: + return io_uring_cmd_getsockname(sock, cmd, issue_flags); + default: + return -EOPNOTSUPP; + } +} +EXPORT_SYMBOL_GPL(io_uring_cmd_sock); diff --git a/io_uring/epoll.c b/io_uring/epoll.c index 89bff2068a19..8d4610246ba0 100644 --- a/io_uring/epoll.c +++ b/io_uring/epoll.c @@ -12,7 +12,6 @@ #include "io_uring.h" #include "epoll.h" -#if defined(CONFIG_EPOLL) struct io_epoll { struct file *file; int epfd; @@ -21,6 +20,12 @@ struct io_epoll { struct epoll_event event; }; +struct io_epoll_wait { + struct file *file; + int maxevents; + struct epoll_event __user *events; +}; + int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_epoll *epoll = io_kiocb_to_cmd(req, struct io_epoll); @@ -56,6 +61,32 @@ int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; +} + +int io_epoll_wait_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_epoll_wait *iew = io_kiocb_to_cmd(req, struct io_epoll_wait); + + if (sqe->off || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) + return -EINVAL; + + iew->maxevents = READ_ONCE(sqe->len); + iew->events = u64_to_user_ptr(READ_ONCE(sqe->addr)); + return 0; +} + +int io_epoll_wait(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_epoll_wait *iew = io_kiocb_to_cmd(req, struct io_epoll_wait); + int ret; + + ret = epoll_sendevents(req->file, iew->events, iew->maxevents); + if (ret == 0) + return -EAGAIN; + if (ret < 0) + req_set_fail(req); + + io_req_set_res(req, ret, 0); + return IOU_COMPLETE; } -#endif diff --git a/io_uring/epoll.h b/io_uring/epoll.h index 870cce11ba98..4111997c360b 100644 --- a/io_uring/epoll.h +++ b/io_uring/epoll.h @@ -3,4 +3,6 @@ #if defined(CONFIG_EPOLL) int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags); +int io_epoll_wait_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_epoll_wait(struct io_kiocb *req, unsigned int issue_flags); #endif diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c index 100d5da94cb9..d656cc2a0b9b 100644 --- a/io_uring/eventfd.c +++ b/io_uring/eventfd.c @@ -43,17 +43,11 @@ static void io_eventfd_do_signal(struct rcu_head *rcu) { struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); + atomic_andnot(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops); eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); io_eventfd_put(ev_fd); } -static void io_eventfd_release(struct io_ev_fd *ev_fd, bool put_ref) -{ - if (put_ref) - io_eventfd_put(ev_fd); - rcu_read_unlock(); -} - /* * Returns true if the caller should put the ev_fd reference, false if not. */ @@ -72,63 +66,38 @@ static bool __io_eventfd_signal(struct io_ev_fd *ev_fd) /* * Trigger if eventfd_async isn't set, or if it's set and the caller is - * an async worker. If ev_fd isn't valid, obviously return false. + * an async worker. */ static bool io_eventfd_trigger(struct io_ev_fd *ev_fd) { - if (ev_fd) - return !ev_fd->eventfd_async || io_wq_current_is_worker(); - return false; + return !ev_fd->eventfd_async || io_wq_current_is_worker(); } -/* - * On success, returns with an ev_fd reference grabbed and the RCU read - * lock held. - */ -static struct io_ev_fd *io_eventfd_grab(struct io_ring_ctx *ctx) +void io_eventfd_signal(struct io_ring_ctx *ctx, bool cqe_event) { + bool skip = false; struct io_ev_fd *ev_fd; + struct io_rings *rings; - if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) - return NULL; - - rcu_read_lock(); + guard(rcu)(); - /* - * rcu_dereference ctx->io_ev_fd once and use it for both for checking - * and eventfd_signal - */ + rings = rcu_dereference(ctx->rings_rcu); + if (!rings) + return; + if (READ_ONCE(rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) + return; ev_fd = rcu_dereference(ctx->io_ev_fd); - /* * Check again if ev_fd exists in case an io_eventfd_unregister call * completed between the NULL check of ctx->io_ev_fd at the start of * the function and rcu_read_lock. */ - if (io_eventfd_trigger(ev_fd) && refcount_inc_not_zero(&ev_fd->refs)) - return ev_fd; - - rcu_read_unlock(); - return NULL; -} - -void io_eventfd_signal(struct io_ring_ctx *ctx) -{ - struct io_ev_fd *ev_fd; - - ev_fd = io_eventfd_grab(ctx); - if (ev_fd) - io_eventfd_release(ev_fd, __io_eventfd_signal(ev_fd)); -} - -void io_eventfd_flush_signal(struct io_ring_ctx *ctx) -{ - struct io_ev_fd *ev_fd; - - ev_fd = io_eventfd_grab(ctx); - if (ev_fd) { - bool skip, put_ref = true; + if (!ev_fd) + return; + if (!io_eventfd_trigger(ev_fd) || !refcount_inc_not_zero(&ev_fd->refs)) + return; + if (cqe_event) { /* * Eventfd should only get triggered when at least one event * has been posted. Some applications rely on the eventfd @@ -142,12 +111,10 @@ void io_eventfd_flush_signal(struct io_ring_ctx *ctx) skip = ctx->cached_cq_tail == ev_fd->last_cq_tail; ev_fd->last_cq_tail = ctx->cached_cq_tail; spin_unlock(&ctx->completion_lock); - - if (!skip) - put_ref = __io_eventfd_signal(ev_fd); - - io_eventfd_release(ev_fd, put_ref); } + + if (skip || __io_eventfd_signal(ev_fd)) + io_eventfd_put(ev_fd); } int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, @@ -165,7 +132,7 @@ int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, if (copy_from_user(&fd, fds, sizeof(*fds))) return -EFAULT; - ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL); + ev_fd = kmalloc_obj(*ev_fd); if (!ev_fd) return -ENOMEM; @@ -182,7 +149,7 @@ int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, spin_unlock(&ctx->completion_lock); ev_fd->eventfd_async = eventfd_async; - ctx->has_evfd = true; + ctx->int_flags |= IO_RING_F_HAS_EVFD; refcount_set(&ev_fd->refs, 1); atomic_set(&ev_fd->ops, 0); rcu_assign_pointer(ctx->io_ev_fd, ev_fd); @@ -196,7 +163,7 @@ int io_eventfd_unregister(struct io_ring_ctx *ctx) ev_fd = rcu_dereference_protected(ctx->io_ev_fd, lockdep_is_held(&ctx->uring_lock)); if (ev_fd) { - ctx->has_evfd = false; + ctx->int_flags &= ~IO_RING_F_HAS_EVFD; rcu_assign_pointer(ctx->io_ev_fd, NULL); io_eventfd_put(ev_fd); return 0; diff --git a/io_uring/eventfd.h b/io_uring/eventfd.h index d394f49c6321..400eda4a4165 100644 --- a/io_uring/eventfd.h +++ b/io_uring/eventfd.h @@ -1,8 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 */ struct io_ring_ctx; int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, unsigned int eventfd_async); int io_eventfd_unregister(struct io_ring_ctx *ctx); -void io_eventfd_flush_signal(struct io_ring_ctx *ctx); -void io_eventfd_signal(struct io_ring_ctx *ctx); +void io_eventfd_signal(struct io_ring_ctx *ctx, bool cqe_event); diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index f60d0a9d505e..001fb542dc11 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -5,46 +5,17 @@ #include <linux/file.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include <linux/nospec.h> #include <linux/io_uring.h> #include <uapi/linux/io_uring.h> -#include "io_uring.h" +#include "filetable.h" #include "sqpoll.h" #include "fdinfo.h" #include "cancel.h" #include "rsrc.h" - -#ifdef CONFIG_PROC_FS -static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id, - const struct cred *cred) -{ - struct user_namespace *uns = seq_user_ns(m); - struct group_info *gi; - kernel_cap_t cap; - int g; - - seq_printf(m, "%5d\n", id); - seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid)); - seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid)); - seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid)); - seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid)); - seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid)); - seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid)); - seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid)); - seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid)); - seq_puts(m, "\n\tGroups:\t"); - gi = cred->group_info; - for (g = 0; g < gi->ngroups; g++) { - seq_put_decimal_ull(m, g ? " " : "", - from_kgid_munged(uns, gi->gid[g])); - } - seq_puts(m, "\n\tCapEff:\t"); - cap = cred->cap_effective; - seq_put_hex_ll(m, NULL, cap.val, 16); - seq_putc(m, '\n'); - return 0; -} +#include "opdef.h" #ifdef CONFIG_NET_RX_BUSY_POLL static __cold void common_tracking_show_fdinfo(struct io_ring_ctx *ctx, @@ -86,31 +57,21 @@ static inline void napi_show_fdinfo(struct io_ring_ctx *ctx, } #endif -/* - * Caller holds a reference to the file already, we don't need to do - * anything else to get an extra reference. - */ -__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) +static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) { - struct io_ring_ctx *ctx = file->private_data; struct io_overflow_cqe *ocqe; struct io_rings *r = ctx->rings; - struct rusage sq_usage; unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1; unsigned int sq_head = READ_ONCE(r->sq.head); unsigned int sq_tail = READ_ONCE(r->sq.tail); unsigned int cq_head = READ_ONCE(r->cq.head); unsigned int cq_tail = READ_ONCE(r->cq.tail); - unsigned int cq_shift = 0; unsigned int sq_shift = 0; - unsigned int sq_entries, cq_entries; + unsigned int cq_entries, sq_entries; int sq_pid = -1, sq_cpu = -1; u64 sq_total_time = 0, sq_work_time = 0; - bool has_lock; unsigned int i; - if (ctx->flags & IORING_SETUP_CQE32) - cq_shift = 1; if (ctx->flags & IORING_SETUP_SQE128) sq_shift = 1; @@ -123,32 +84,59 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) seq_printf(m, "SqMask:\t0x%x\n", sq_mask); seq_printf(m, "SqHead:\t%u\n", sq_head); seq_printf(m, "SqTail:\t%u\n", sq_tail); - seq_printf(m, "CachedSqHead:\t%u\n", ctx->cached_sq_head); + seq_printf(m, "CachedSqHead:\t%u\n", data_race(ctx->cached_sq_head)); seq_printf(m, "CqMask:\t0x%x\n", cq_mask); seq_printf(m, "CqHead:\t%u\n", cq_head); seq_printf(m, "CqTail:\t%u\n", cq_tail); - seq_printf(m, "CachedCqTail:\t%u\n", ctx->cached_cq_tail); + seq_printf(m, "CachedCqTail:\t%u\n", data_race(ctx->cached_cq_tail)); seq_printf(m, "SQEs:\t%u\n", sq_tail - sq_head); sq_entries = min(sq_tail - sq_head, ctx->sq_entries); for (i = 0; i < sq_entries; i++) { unsigned int entry = i + sq_head; struct io_uring_sqe *sqe; unsigned int sq_idx; + bool sqe128 = false; + u8 opcode; if (ctx->flags & IORING_SETUP_NO_SQARRAY) - break; - sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]); + sq_idx = entry & sq_mask; + else + sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]); if (sq_idx > sq_mask) continue; + sqe = &ctx->sq_sqes[sq_idx << sq_shift]; + opcode = READ_ONCE(sqe->opcode); + if (opcode >= IORING_OP_LAST) + continue; + opcode = array_index_nospec(opcode, IORING_OP_LAST); + if (sq_shift) { + sqe128 = true; + } else if (io_issue_defs[opcode].is_128) { + if (!(ctx->flags & IORING_SETUP_SQE_MIXED)) { + seq_printf(m, + "%5u: invalid sqe, 128B entry on non-mixed sq\n", + sq_idx); + break; + } + if (sq_idx == sq_mask) { + seq_printf(m, + "%5u: corrupted sqe, wrapping 128B entry\n", + sq_idx); + break; + } + sq_head++; + i++; + sqe128 = true; + } seq_printf(m, "%5u: opcode:%s, fd:%d, flags:%x, off:%llu, " "addr:0x%llx, rw_flags:0x%x, buf_index:%d " "user_data:%llu", - sq_idx, io_uring_get_opcode(sqe->opcode), sqe->fd, + sq_idx, io_uring_get_opcode(opcode), sqe->fd, sqe->flags, (unsigned long long) sqe->off, (unsigned long long) sqe->addr, sqe->rw_flags, sqe->buf_index, sqe->user_data); - if (sq_shift) { + if (sqe128) { u64 *sqeb = (void *) (sqe + 1); int size = sizeof(struct io_uring_sqe) / sizeof(u64); int j; @@ -160,44 +148,56 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) } } seq_printf(m, "\n"); + cond_resched(); } seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head); cq_entries = min(cq_tail - cq_head, ctx->cq_entries); for (i = 0; i < cq_entries; i++) { - unsigned int entry = i + cq_head; - struct io_uring_cqe *cqe = &r->cqes[(entry & cq_mask) << cq_shift]; + struct io_uring_cqe *cqe; + bool cqe32 = false; - seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x", - entry & cq_mask, cqe->user_data, cqe->res, + cqe = &r->cqes[(cq_head & cq_mask)]; + if (cqe->flags & IORING_CQE_F_32 || ctx->flags & IORING_SETUP_CQE32) + cqe32 = true; + seq_printf(m, "%5u: user_data:%llu, res:%d, flags:%x", + cq_head & cq_mask, cqe->user_data, cqe->res, cqe->flags); - if (cq_shift) - seq_printf(m, ", extra1:%llu, extra2:%llu\n", + if (cqe32) + seq_printf(m, ", extra1:%llu, extra2:%llu", cqe->big_cqe[0], cqe->big_cqe[1]); seq_printf(m, "\n"); + cq_head++; + if (cqe32) { + cq_head++; + i++; + } + cond_resched(); } - /* - * Avoid ABBA deadlock between the seq lock and the io_uring mutex, - * since fdinfo case grabs it in the opposite direction of normal use - * cases. If we fail to get the lock, we just don't iterate any - * structures that could be going away outside the io_uring mutex. - */ - has_lock = mutex_trylock(&ctx->uring_lock); - - if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) { + if (ctx->flags & IORING_SETUP_SQPOLL) { struct io_sq_data *sq = ctx->sq_data; + struct task_struct *tsk; + rcu_read_lock(); + tsk = rcu_dereference(sq->thread); /* * sq->thread might be NULL if we raced with the sqpoll * thread termination. */ - if (sq->thread) { - sq_pid = sq->task_pid; + if (tsk) { + u64 usec; + + get_task_struct(tsk); + rcu_read_unlock(); + usec = io_sq_cpu_usec(tsk); + sq_pid = task_pid_nr_ns(tsk, + proc_pid_ns(file_inode(m->file)->i_sb)); + put_task_struct(tsk); sq_cpu = sq->sq_cpu; - getrusage(sq->thread, RUSAGE_SELF, &sq_usage); - sq_total_time = (sq_usage.ru_stime.tv_sec * 1000000 - + sq_usage.ru_stime.tv_usec); + sq_total_time = usec; sq_work_time = sq->work_time; + } else { + rcu_read_unlock(); } } @@ -206,7 +206,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) seq_printf(m, "SqTotalTime:\t%llu\n", sq_total_time); seq_printf(m, "SqWorkTime:\t%llu\n", sq_work_time); seq_printf(m, "UserFiles:\t%u\n", ctx->file_table.data.nr); - for (i = 0; has_lock && i < ctx->file_table.data.nr; i++) { + for (i = 0; i < ctx->file_table.data.nr; i++) { struct file *f = NULL; if (ctx->file_table.data.nodes[i]) @@ -218,7 +218,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) } } seq_printf(m, "UserBufs:\t%u\n", ctx->buf_table.nr); - for (i = 0; has_lock && i < ctx->buf_table.nr; i++) { + for (i = 0; i < ctx->buf_table.nr; i++) { struct io_mapped_ubuf *buf = NULL; if (ctx->buf_table.nodes[i]) @@ -228,17 +228,9 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) else seq_printf(m, "%5u: <none>\n", i); } - if (has_lock && !xa_empty(&ctx->personalities)) { - unsigned long index; - const struct cred *cred; - - seq_printf(m, "Personalities:\n"); - xa_for_each(&ctx->personalities, index, cred) - io_uring_show_cred(m, index, cred); - } seq_puts(m, "PollList:\n"); - for (i = 0; has_lock && i < (1U << ctx->cancel_table.hash_bits); i++) { + for (i = 0; i < (1U << ctx->cancel_table.hash_bits); i++) { struct io_hash_bucket *hb = &ctx->cancel_table.hbs[i]; struct io_kiocb *req; @@ -247,9 +239,6 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) task_work_pending(req->tctx->task)); } - if (has_lock) - mutex_unlock(&ctx->uring_lock); - seq_puts(m, "CqOverflowList:\n"); spin_lock(&ctx->completion_lock); list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) { @@ -262,4 +251,22 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) spin_unlock(&ctx->completion_lock); napi_show_fdinfo(ctx, m); } -#endif + +/* + * Caller holds a reference to the file already, we don't need to do + * anything else to get an extra reference. + */ +__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) +{ + struct io_ring_ctx *ctx = file->private_data; + + /* + * Avoid ABBA deadlock between the seq lock and the io_uring mutex, + * since fdinfo case grabs it in the opposite direction of normal use + * cases. + */ + if (mutex_trylock(&ctx->uring_lock)) { + __io_uring_show_fdinfo(ctx, m); + mutex_unlock(&ctx->uring_lock); + } +} diff --git a/io_uring/filetable.c b/io_uring/filetable.c index dd8eeec97acf..cb1838c9fc37 100644 --- a/io_uring/filetable.c +++ b/io_uring/filetable.c @@ -22,6 +22,10 @@ static int io_file_bitmap_get(struct io_ring_ctx *ctx) if (!table->bitmap) return -ENFILE; + if (table->alloc_hint < ctx->file_alloc_start || + table->alloc_hint >= ctx->file_alloc_end) + table->alloc_hint = ctx->file_alloc_start; + do { ret = find_next_zero_bit(table->bitmap, nr, table->alloc_hint); if (ret != nr) @@ -57,7 +61,7 @@ void io_free_file_tables(struct io_ring_ctx *ctx, struct io_file_table *table) static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file, u32 slot_index) - __must_hold(&req->ctx->uring_lock) + __must_hold(&ctx->uring_lock) { struct io_rsrc_node *node; @@ -68,7 +72,7 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file, if (slot_index >= ctx->file_table.data.nr) return -EINVAL; - node = io_rsrc_node_alloc(IORING_RSRC_FILE); + node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE); if (!node) return -ENOMEM; diff --git a/io_uring/filetable.h b/io_uring/filetable.h index 7717ea9efd0e..c348233a3411 100644 --- a/io_uring/filetable.h +++ b/io_uring/filetable.h @@ -2,7 +2,6 @@ #ifndef IOU_FILE_TABLE_H #define IOU_FILE_TABLE_H -#include <linux/file.h> #include <linux/io_uring_types.h> #include "rsrc.h" diff --git a/io_uring/fs.c b/io_uring/fs.c index eccea851dd5a..d0580c754bf8 100644 --- a/io_uring/fs.c +++ b/io_uring/fs.c @@ -19,8 +19,8 @@ struct io_rename { struct file *file; int old_dfd; int new_dfd; - struct filename *oldpath; - struct filename *newpath; + struct delayed_filename oldpath; + struct delayed_filename newpath; int flags; }; @@ -28,22 +28,22 @@ struct io_unlink { struct file *file; int dfd; int flags; - struct filename *filename; + struct delayed_filename filename; }; struct io_mkdir { struct file *file; int dfd; umode_t mode; - struct filename *filename; + struct delayed_filename filename; }; struct io_link { struct file *file; int old_dfd; int new_dfd; - struct filename *oldpath; - struct filename *newpath; + struct delayed_filename oldpath; + struct delayed_filename newpath; int flags; }; @@ -51,6 +51,7 @@ int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_rename *ren = io_kiocb_to_cmd(req, struct io_rename); const char __user *oldf, *newf; + int err; if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; @@ -63,14 +64,14 @@ int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) ren->new_dfd = READ_ONCE(sqe->len); ren->flags = READ_ONCE(sqe->rename_flags); - ren->oldpath = getname(oldf); - if (IS_ERR(ren->oldpath)) - return PTR_ERR(ren->oldpath); + err = delayed_getname(&ren->oldpath, oldf); + if (unlikely(err)) + return err; - ren->newpath = getname(newf); - if (IS_ERR(ren->newpath)) { - putname(ren->oldpath); - return PTR_ERR(ren->newpath); + err = delayed_getname(&ren->newpath, newf); + if (unlikely(err)) { + dismiss_delayed_filename(&ren->oldpath); + return err; } req->flags |= REQ_F_NEED_CLEANUP; @@ -81,30 +82,33 @@ int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) int io_renameat(struct io_kiocb *req, unsigned int issue_flags) { struct io_rename *ren = io_kiocb_to_cmd(req, struct io_rename); + CLASS(filename_complete_delayed, old)(&ren->oldpath); + CLASS(filename_complete_delayed, new)(&ren->newpath); int ret; WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); - ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd, - ren->newpath, ren->flags); + ret = filename_renameat2(ren->old_dfd, old, + ren->new_dfd, new, ren->flags); req->flags &= ~REQ_F_NEED_CLEANUP; io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } void io_renameat_cleanup(struct io_kiocb *req) { struct io_rename *ren = io_kiocb_to_cmd(req, struct io_rename); - putname(ren->oldpath); - putname(ren->newpath); + dismiss_delayed_filename(&ren->oldpath); + dismiss_delayed_filename(&ren->newpath); } int io_unlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_unlink *un = io_kiocb_to_cmd(req, struct io_unlink); const char __user *fname; + int err; if (sqe->off || sqe->len || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; @@ -118,9 +122,9 @@ int io_unlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EINVAL; fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); - un->filename = getname(fname); - if (IS_ERR(un->filename)) - return PTR_ERR(un->filename); + err = delayed_getname(&un->filename, fname); + if (unlikely(err)) + return err; req->flags |= REQ_F_NEED_CLEANUP; req->flags |= REQ_F_FORCE_ASYNC; @@ -130,31 +134,33 @@ int io_unlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) { struct io_unlink *un = io_kiocb_to_cmd(req, struct io_unlink); + CLASS(filename_complete_delayed, name)(&un->filename); int ret; WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); if (un->flags & AT_REMOVEDIR) - ret = do_rmdir(un->dfd, un->filename); + ret = filename_rmdir(un->dfd, name); else - ret = do_unlinkat(un->dfd, un->filename); + ret = filename_unlinkat(un->dfd, name); req->flags &= ~REQ_F_NEED_CLEANUP; io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } void io_unlinkat_cleanup(struct io_kiocb *req) { struct io_unlink *ul = io_kiocb_to_cmd(req, struct io_unlink); - putname(ul->filename); + dismiss_delayed_filename(&ul->filename); } int io_mkdirat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_mkdir *mkd = io_kiocb_to_cmd(req, struct io_mkdir); const char __user *fname; + int err; if (sqe->off || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; @@ -165,9 +171,9 @@ int io_mkdirat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) mkd->mode = READ_ONCE(sqe->len); fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); - mkd->filename = getname(fname); - if (IS_ERR(mkd->filename)) - return PTR_ERR(mkd->filename); + err = delayed_getname(&mkd->filename, fname); + if (unlikely(err)) + return err; req->flags |= REQ_F_NEED_CLEANUP; req->flags |= REQ_F_FORCE_ASYNC; @@ -177,28 +183,30 @@ int io_mkdirat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags) { struct io_mkdir *mkd = io_kiocb_to_cmd(req, struct io_mkdir); + CLASS(filename_complete_delayed, name)(&mkd->filename); int ret; WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); - ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode); + ret = filename_mkdirat(mkd->dfd, name, mkd->mode); req->flags &= ~REQ_F_NEED_CLEANUP; io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } void io_mkdirat_cleanup(struct io_kiocb *req) { struct io_mkdir *md = io_kiocb_to_cmd(req, struct io_mkdir); - putname(md->filename); + dismiss_delayed_filename(&md->filename); } int io_symlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_link *sl = io_kiocb_to_cmd(req, struct io_link); const char __user *oldpath, *newpath; + int err; if (sqe->len || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; @@ -209,14 +217,14 @@ int io_symlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) oldpath = u64_to_user_ptr(READ_ONCE(sqe->addr)); newpath = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - sl->oldpath = getname(oldpath); - if (IS_ERR(sl->oldpath)) - return PTR_ERR(sl->oldpath); + err = delayed_getname(&sl->oldpath, oldpath); + if (unlikely(err)) + return err; - sl->newpath = getname(newpath); - if (IS_ERR(sl->newpath)) { - putname(sl->oldpath); - return PTR_ERR(sl->newpath); + err = delayed_getname(&sl->newpath, newpath); + if (unlikely(err)) { + dismiss_delayed_filename(&sl->oldpath); + return err; } req->flags |= REQ_F_NEED_CLEANUP; @@ -227,21 +235,24 @@ int io_symlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags) { struct io_link *sl = io_kiocb_to_cmd(req, struct io_link); + CLASS(filename_complete_delayed, old)(&sl->oldpath); + CLASS(filename_complete_delayed, new)(&sl->newpath); int ret; WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); - ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath); + ret = filename_symlinkat(old, sl->new_dfd, new); req->flags &= ~REQ_F_NEED_CLEANUP; io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_link *lnk = io_kiocb_to_cmd(req, struct io_link); const char __user *oldf, *newf; + int err; if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; @@ -254,14 +265,14 @@ int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) newf = u64_to_user_ptr(READ_ONCE(sqe->addr2)); lnk->flags = READ_ONCE(sqe->hardlink_flags); - lnk->oldpath = getname_uflags(oldf, lnk->flags); - if (IS_ERR(lnk->oldpath)) - return PTR_ERR(lnk->oldpath); + err = delayed_getname_uflags(&lnk->oldpath, oldf, lnk->flags); + if (unlikely(err)) + return err; - lnk->newpath = getname(newf); - if (IS_ERR(lnk->newpath)) { - putname(lnk->oldpath); - return PTR_ERR(lnk->newpath); + err = delayed_getname(&lnk->newpath, newf); + if (unlikely(err)) { + dismiss_delayed_filename(&lnk->oldpath); + return err; } req->flags |= REQ_F_NEED_CLEANUP; @@ -272,22 +283,23 @@ int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) int io_linkat(struct io_kiocb *req, unsigned int issue_flags) { struct io_link *lnk = io_kiocb_to_cmd(req, struct io_link); + CLASS(filename_complete_delayed, old)(&lnk->oldpath); + CLASS(filename_complete_delayed, new)(&lnk->newpath); int ret; WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); - ret = do_linkat(lnk->old_dfd, lnk->oldpath, lnk->new_dfd, - lnk->newpath, lnk->flags); + ret = filename_linkat(lnk->old_dfd, old, lnk->new_dfd, new, lnk->flags); req->flags &= ~REQ_F_NEED_CLEANUP; io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } void io_link_cleanup(struct io_kiocb *req) { struct io_link *sl = io_kiocb_to_cmd(req, struct io_link); - putname(sl->oldpath); - putname(sl->newpath); + dismiss_delayed_filename(&sl->oldpath); + dismiss_delayed_filename(&sl->newpath); } diff --git a/io_uring/futex.c b/io_uring/futex.c index 43e2143255f5..9cc1788ef4c6 100644 --- a/io_uring/futex.c +++ b/io_uring/futex.c @@ -14,13 +14,9 @@ struct io_futex { struct file *file; - union { - u32 __user *uaddr; - struct futex_waitv __user *uwaitv; - }; + void __user *uaddr; unsigned long futex_val; unsigned long futex_mask; - unsigned long futexv_owned; u32 futex_flags; unsigned int futex_nr; bool futexv_unqueued; @@ -31,6 +27,11 @@ struct io_futex_data { struct io_kiocb *req; }; +struct io_futexv_data { + unsigned long owned; + struct futex_vector futexv[]; +}; + #define IO_FUTEX_ALLOC_CACHE_MAX 32 bool io_futex_cache_init(struct io_ring_ctx *ctx) @@ -44,53 +45,51 @@ void io_futex_cache_free(struct io_ring_ctx *ctx) io_alloc_cache_free(&ctx->futex_cache, kfree); } -static void __io_futex_complete(struct io_kiocb *req, struct io_tw_state *ts) +static void __io_futex_complete(struct io_tw_req tw_req, io_tw_token_t tw) { - req->async_data = NULL; - hlist_del_init(&req->hash_node); - io_req_task_complete(req, ts); + hlist_del_init(&tw_req.req->hash_node); + io_req_task_complete(tw_req, tw); } -static void io_futex_complete(struct io_kiocb *req, struct io_tw_state *ts) +static void io_futex_complete(struct io_tw_req tw_req, io_tw_token_t tw) { - struct io_futex_data *ifd = req->async_data; + struct io_kiocb *req = tw_req.req; struct io_ring_ctx *ctx = req->ctx; - io_tw_lock(ctx, ts); - if (!io_alloc_cache_put(&ctx->futex_cache, ifd)) - kfree(ifd); - __io_futex_complete(req, ts); + io_tw_lock(ctx, tw); + io_cache_free(&ctx->futex_cache, req->async_data); + io_req_async_data_clear(req, 0); + __io_futex_complete(tw_req, tw); } -static void io_futexv_complete(struct io_kiocb *req, struct io_tw_state *ts) +static void io_futexv_complete(struct io_tw_req tw_req, io_tw_token_t tw) { + struct io_kiocb *req = tw_req.req; struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); - struct futex_vector *futexv = req->async_data; + struct io_futexv_data *ifd = req->async_data; - io_tw_lock(req->ctx, ts); + io_tw_lock(req->ctx, tw); if (!iof->futexv_unqueued) { int res; - res = futex_unqueue_multiple(futexv, iof->futex_nr); + res = futex_unqueue_multiple(ifd->futexv, iof->futex_nr); if (res != -1) io_req_set_res(req, res, 0); } - kfree(req->async_data); - req->flags &= ~REQ_F_ASYNC_DATA; - __io_futex_complete(req, ts); + io_req_async_data_free(req); + __io_futex_complete(tw_req, tw); } -static bool io_futexv_claim(struct io_futex *iof) +static bool io_futexv_claim(struct io_futexv_data *ifd) { - if (test_bit(0, &iof->futexv_owned) || - test_and_set_bit_lock(0, &iof->futexv_owned)) + if (test_bit(0, &ifd->owned) || test_and_set_bit_lock(0, &ifd->owned)) return false; return true; } -static bool __io_futex_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) +static bool __io_futex_cancel(struct io_kiocb *req) { /* futex wake already done or in progress */ if (req->opcode == IORING_OP_FUTEX_WAIT) { @@ -100,9 +99,9 @@ static bool __io_futex_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) return false; req->io_task_work.func = io_futex_complete; } else { - struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); + struct io_futexv_data *ifd = req->async_data; - if (!io_futexv_claim(iof)) + if (!io_futexv_claim(ifd)) return false; req->io_task_work.func = io_futexv_complete; } @@ -116,49 +115,13 @@ static bool __io_futex_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) int io_futex_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, unsigned int issue_flags) { - struct hlist_node *tmp; - struct io_kiocb *req; - int nr = 0; - - if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_FD_FIXED)) - return -ENOENT; - - io_ring_submit_lock(ctx, issue_flags); - hlist_for_each_entry_safe(req, tmp, &ctx->futex_list, hash_node) { - if (req->cqe.user_data != cd->data && - !(cd->flags & IORING_ASYNC_CANCEL_ANY)) - continue; - if (__io_futex_cancel(ctx, req)) - nr++; - if (!(cd->flags & IORING_ASYNC_CANCEL_ALL)) - break; - } - io_ring_submit_unlock(ctx, issue_flags); - - if (nr) - return nr; - - return -ENOENT; + return io_cancel_remove(ctx, cd, issue_flags, &ctx->futex_list, __io_futex_cancel); } bool io_futex_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all) { - struct hlist_node *tmp; - struct io_kiocb *req; - bool found = false; - - lockdep_assert_held(&ctx->uring_lock); - - hlist_for_each_entry_safe(req, tmp, &ctx->futex_list, hash_node) { - if (!io_match_task_safe(req, tctx, cancel_all)) - continue; - hlist_del_init(&req->hash_node); - __io_futex_cancel(ctx, req); - found = true; - } - - return found; + return io_cancel_remove_all(ctx, tctx, &ctx->futex_list, cancel_all, __io_futex_cancel); } int io_futex_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -186,16 +149,20 @@ int io_futex_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) !futex_validate_input(iof->futex_flags, iof->futex_mask)) return -EINVAL; + /* Mark as inflight, so file exit cancelation will find it */ + io_req_track_inflight(req); return 0; } static void io_futex_wakev_fn(struct wake_q_head *wake_q, struct futex_q *q) { struct io_kiocb *req = q->wake_data; - struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); + struct io_futexv_data *ifd = req->async_data; - if (!io_futexv_claim(iof)) + if (!io_futexv_claim(ifd)) { + __futex_wake_mark(q); return; + } if (unlikely(!__futex_wake_mark(q))) return; @@ -207,7 +174,7 @@ static void io_futex_wakev_fn(struct wake_q_head *wake_q, struct futex_q *q) int io_futexv_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); - struct futex_vector *futexv; + struct io_futexv_data *ifd; int ret; /* No flags or mask supported for waitv */ @@ -220,21 +187,23 @@ int io_futexv_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (!iof->futex_nr || iof->futex_nr > FUTEX_WAITV_MAX) return -EINVAL; - futexv = kcalloc(iof->futex_nr, sizeof(*futexv), GFP_KERNEL); - if (!futexv) + ifd = kzalloc_flex(struct io_futexv_data, futexv, iof->futex_nr, + GFP_KERNEL_ACCOUNT); + if (!ifd) return -ENOMEM; - ret = futex_parse_waitv(futexv, iof->uwaitv, iof->futex_nr, + ret = futex_parse_waitv(ifd->futexv, iof->uaddr, iof->futex_nr, io_futex_wakev_fn, req); if (ret) { - kfree(futexv); + kfree(ifd); return ret; } - iof->futexv_owned = 0; + /* Mark as inflight, so file exit cancelation will find it */ + io_req_track_inflight(req); iof->futexv_unqueued = 0; req->flags |= REQ_F_ASYNC_DATA; - req->async_data = futexv; + req->async_data = ifd; return 0; } @@ -254,13 +223,13 @@ static void io_futex_wake_fn(struct wake_q_head *wake_q, struct futex_q *q) int io_futexv_wait(struct io_kiocb *req, unsigned int issue_flags) { struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); - struct futex_vector *futexv = req->async_data; + struct io_futexv_data *ifd = req->async_data; struct io_ring_ctx *ctx = req->ctx; int ret, woken = -1; io_ring_submit_lock(ctx, issue_flags); - ret = futex_wait_multiple_setup(futexv, iof->futex_nr, &woken); + ret = futex_wait_multiple_setup(ifd->futexv, iof->futex_nr, &woken); /* * Error case, ret is < 0. Mark the request as failed. @@ -269,10 +238,8 @@ int io_futexv_wait(struct io_kiocb *req, unsigned int issue_flags) io_ring_submit_unlock(ctx, issue_flags); req_set_fail(req); io_req_set_res(req, ret, 0); - kfree(futexv); - req->async_data = NULL; - req->flags &= ~REQ_F_ASYNC_DATA; - return IOU_OK; + io_req_async_data_free(req); + return IOU_COMPLETE; } /* @@ -311,7 +278,6 @@ int io_futex_wait(struct io_kiocb *req, unsigned int issue_flags) struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); struct io_ring_ctx *ctx = req->ctx; struct io_futex_data *ifd = NULL; - struct futex_hash_bucket *hb; int ret; if (!iof->futex_mask) { @@ -326,6 +292,7 @@ int io_futex_wait(struct io_kiocb *req, unsigned int issue_flags) goto done_unlock; } + req->flags |= REQ_F_ASYNC_DATA; req->async_data = ifd; ifd->q = futex_q_init; ifd->q.bitset = iof->futex_mask; @@ -333,12 +300,11 @@ int io_futex_wait(struct io_kiocb *req, unsigned int issue_flags) ifd->req = req; ret = futex_wait_setup(iof->uaddr, iof->futex_val, iof->futex_flags, - &ifd->q, &hb); + &ifd->q, NULL, NULL); if (!ret) { hlist_add_head(&req->hash_node, &ctx->futex_list); io_ring_submit_unlock(ctx, issue_flags); - futex_queue(&ifd->q, hb, NULL); return IOU_ISSUE_SKIP_COMPLETE; } @@ -348,8 +314,8 @@ done: if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - kfree(ifd); - return IOU_OK; + io_req_async_data_free(req); + return IOU_COMPLETE; } int io_futex_wake(struct io_kiocb *req, unsigned int issue_flags) @@ -366,5 +332,5 @@ int io_futex_wake(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 91019b4d0308..8cc7b47d3089 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -17,6 +17,7 @@ #include <linux/task_work.h> #include <linux/audit.h> #include <linux/mmu_context.h> +#include <linux/sched/sysctl.h> #include <uapi/linux/io_uring.h> #include "io-wq.h" @@ -30,11 +31,11 @@ enum { IO_WORKER_F_UP = 0, /* up and active */ IO_WORKER_F_RUNNING = 1, /* account as running */ IO_WORKER_F_FREE = 2, /* worker on free list */ - IO_WORKER_F_BOUND = 3, /* is doing bounded work */ }; enum { IO_WQ_BIT_EXIT = 0, /* wq exiting */ + IO_WQ_BIT_EXIT_ON_IDLE = 1, /* allow all workers to exit on idle */ }; enum { @@ -46,12 +47,12 @@ enum { */ struct io_worker { refcount_t ref; - int create_index; unsigned long flags; struct hlist_nulls_node nulls_node; struct list_head all_list; struct task_struct *task; struct io_wq *wq; + struct io_wq_acct *acct; struct io_wq_work *cur_work; raw_spinlock_t lock; @@ -77,10 +78,27 @@ struct io_worker { #define IO_WQ_NR_HASH_BUCKETS (1u << IO_WQ_HASH_ORDER) struct io_wq_acct { + /** + * Protects access to the worker lists. + */ + raw_spinlock_t workers_lock; + unsigned nr_workers; unsigned max_workers; - int index; atomic_t nr_running; + + /** + * The list of free workers. Protected by #workers_lock + * (write) and RCU (read). + */ + struct hlist_nulls_head free_list; + + /** + * The list of all workers. Protected by #workers_lock + * (write) and RCU (read). + */ + struct list_head all_list; + raw_spinlock_t lock; struct io_wq_work_list work_list; unsigned long flags; @@ -98,9 +116,6 @@ enum { struct io_wq { unsigned long state; - free_work_fn *free_work; - io_wq_work_fn *do_work; - struct io_wq_hash *hash; atomic_t worker_refs; @@ -112,12 +127,6 @@ struct io_wq { struct io_wq_acct acct[IO_WQ_ACCT_NR]; - /* lock protects access to elements below */ - raw_spinlock_t lock; - - struct hlist_nulls_head free_list; - struct list_head all_list; - struct wait_queue_entry wait; struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS]; @@ -135,7 +144,7 @@ struct io_cb_cancel_data { bool cancel_all; }; -static bool create_io_worker(struct io_wq *wq, int index); +static bool create_io_worker(struct io_wq *wq, struct io_wq_acct *acct); static void io_wq_dec_running(struct io_worker *worker); static bool io_acct_cancel_pending_work(struct io_wq *wq, struct io_wq_acct *acct, @@ -143,6 +152,16 @@ static bool io_acct_cancel_pending_work(struct io_wq *wq, static void create_worker_cb(struct callback_head *cb); static void io_wq_cancel_tw_create(struct io_wq *wq); +static inline unsigned int __io_get_work_hash(unsigned int work_flags) +{ + return work_flags >> IO_WQ_HASH_SHIFT; +} + +static inline unsigned int io_get_work_hash(struct io_wq_work *work) +{ + return __io_get_work_hash(atomic_read(&work->flags)); +} + static bool io_worker_get(struct io_worker *worker) { return refcount_inc_not_zero(&worker->ref); @@ -160,14 +179,14 @@ static inline struct io_wq_acct *io_get_acct(struct io_wq *wq, bool bound) } static inline struct io_wq_acct *io_work_get_acct(struct io_wq *wq, - struct io_wq_work *work) + unsigned int work_flags) { - return io_get_acct(wq, !(atomic_read(&work->flags) & IO_WQ_WORK_UNBOUND)); + return io_get_acct(wq, !(work_flags & IO_WQ_WORK_UNBOUND)); } static inline struct io_wq_acct *io_wq_get_acct(struct io_worker *worker) { - return io_get_acct(worker->wq, test_bit(IO_WORKER_F_BOUND, &worker->flags)); + return worker->acct; } static void io_worker_ref_put(struct io_wq *wq) @@ -192,9 +211,9 @@ static void io_worker_cancel_cb(struct io_worker *worker) struct io_wq *wq = worker->wq; atomic_dec(&acct->nr_running); - raw_spin_lock(&wq->lock); + raw_spin_lock(&acct->workers_lock); acct->nr_workers--; - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); io_worker_ref_put(wq); clear_bit_unlock(0, &worker->create_state); io_worker_release(worker); @@ -213,6 +232,7 @@ static bool io_task_worker_match(struct callback_head *cb, void *data) static void io_worker_exit(struct io_worker *worker) { struct io_wq *wq = worker->wq; + struct io_wq_acct *acct = io_wq_get_acct(worker); while (1) { struct callback_head *cb = task_work_cancel_match(wq->task, @@ -226,11 +246,11 @@ static void io_worker_exit(struct io_worker *worker) io_worker_release(worker); wait_for_completion(&worker->ref_done); - raw_spin_lock(&wq->lock); + raw_spin_lock(&acct->workers_lock); if (test_bit(IO_WORKER_F_FREE, &worker->flags)) hlist_nulls_del_rcu(&worker->nulls_node); list_del_rcu(&worker->all_list); - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); io_wq_dec_running(worker); /* * this worker is a goner, clear ->worker_private to avoid any @@ -269,8 +289,7 @@ static inline bool io_acct_run_queue(struct io_wq_acct *acct) * Check head of free list for an available worker. If one isn't available, * caller must create one. */ -static bool io_wq_activate_free_worker(struct io_wq *wq, - struct io_wq_acct *acct) +static bool io_acct_activate_free_worker(struct io_wq_acct *acct) __must_hold(RCU) { struct hlist_nulls_node *n; @@ -281,13 +300,9 @@ static bool io_wq_activate_free_worker(struct io_wq *wq, * activate. If a given worker is on the free_list but in the process * of exiting, keep trying. */ - hlist_nulls_for_each_entry_rcu(worker, n, &wq->free_list, nulls_node) { + hlist_nulls_for_each_entry_rcu(worker, n, &acct->free_list, nulls_node) { if (!io_worker_get(worker)) continue; - if (io_wq_get_acct(worker) != acct) { - io_worker_release(worker); - continue; - } /* * If the worker is already running, it's either already * starting work or finishing work. In either case, if it does @@ -314,16 +329,16 @@ static bool io_wq_create_worker(struct io_wq *wq, struct io_wq_acct *acct) if (unlikely(!acct->max_workers)) pr_warn_once("io-wq is not configured for unbound workers"); - raw_spin_lock(&wq->lock); + raw_spin_lock(&acct->workers_lock); if (acct->nr_workers >= acct->max_workers) { - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); return true; } acct->nr_workers++; - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); atomic_inc(&acct->nr_running); atomic_inc(&wq->worker_refs); - return create_io_worker(wq, acct->index); + return create_io_worker(wq, acct); } static void io_wq_inc_running(struct io_worker *worker) @@ -339,21 +354,29 @@ static void create_worker_cb(struct callback_head *cb) struct io_wq *wq; struct io_wq_acct *acct; - bool do_create = false; + bool activated_free_worker, do_create = false; worker = container_of(cb, struct io_worker, create_work); wq = worker->wq; - acct = &wq->acct[worker->create_index]; - raw_spin_lock(&wq->lock); + acct = worker->acct; + + rcu_read_lock(); + activated_free_worker = io_acct_activate_free_worker(acct); + rcu_read_unlock(); + if (activated_free_worker) + goto no_need_create; + + raw_spin_lock(&acct->workers_lock); if (acct->nr_workers < acct->max_workers) { acct->nr_workers++; do_create = true; } - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); if (do_create) { - create_io_worker(wq, worker->create_index); + create_io_worker(wq, acct); } else { +no_need_create: atomic_dec(&acct->nr_running); io_worker_ref_put(wq); } @@ -384,7 +407,6 @@ static bool io_queue_worker_create(struct io_worker *worker, atomic_inc(&wq->worker_refs); init_task_work(&worker->create_work, func); - worker->create_index = acct->index; if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL)) { /* * EXIT may have been set after checking it above, check after @@ -407,6 +429,30 @@ fail: return false; } +/* Defer if current and next work are both hashed to the same chain */ +static bool io_wq_hash_defer(struct io_wq_work *work, struct io_wq_acct *acct) +{ + unsigned int hash, work_flags; + struct io_wq_work *next; + + lockdep_assert_held(&acct->lock); + + work_flags = atomic_read(&work->flags); + if (!__io_wq_is_hashed(work_flags)) + return false; + + /* should not happen, io_acct_run_queue() said we had work */ + if (wq_list_empty(&acct->work_list)) + return true; + + hash = __io_get_work_hash(work_flags); + next = container_of(acct->work_list.first, struct io_wq_work, list); + work_flags = atomic_read(&next->flags); + if (!__io_wq_is_hashed(work_flags)) + return false; + return hash == __io_get_work_hash(work_flags); +} + static void io_wq_dec_running(struct io_worker *worker) { struct io_wq_acct *acct = io_wq_get_acct(worker); @@ -417,8 +463,14 @@ static void io_wq_dec_running(struct io_worker *worker) if (!atomic_dec_and_test(&acct->nr_running)) return; + if (!worker->cur_work) + return; if (!io_acct_run_queue(acct)) return; + if (io_wq_hash_defer(worker->cur_work, acct)) { + raw_spin_unlock(&acct->lock); + return; + } raw_spin_unlock(&acct->lock); atomic_inc(&acct->nr_running); @@ -430,33 +482,28 @@ static void io_wq_dec_running(struct io_worker *worker) * Worker will start processing some work. Move it to the busy list, if * it's currently on the freelist */ -static void __io_worker_busy(struct io_wq *wq, struct io_worker *worker) +static void __io_worker_busy(struct io_wq_acct *acct, struct io_worker *worker) { if (test_bit(IO_WORKER_F_FREE, &worker->flags)) { clear_bit(IO_WORKER_F_FREE, &worker->flags); - raw_spin_lock(&wq->lock); + raw_spin_lock(&acct->workers_lock); hlist_nulls_del_init_rcu(&worker->nulls_node); - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); } } /* * No work, worker going to sleep. Move to freelist. */ -static void __io_worker_idle(struct io_wq *wq, struct io_worker *worker) - __must_hold(wq->lock) +static void __io_worker_idle(struct io_wq_acct *acct, struct io_worker *worker) + __must_hold(acct->workers_lock) { if (!test_bit(IO_WORKER_F_FREE, &worker->flags)) { set_bit(IO_WORKER_F_FREE, &worker->flags); - hlist_nulls_add_head_rcu(&worker->nulls_node, &wq->free_list); + hlist_nulls_add_head_rcu(&worker->nulls_node, &acct->free_list); } } -static inline unsigned int io_get_work_hash(struct io_wq_work *work) -{ - return atomic_read(&work->flags) >> IO_WQ_HASH_SHIFT; -} - static bool io_wait_on_hash(struct io_wq *wq, unsigned int hash) { bool ret = false; @@ -475,26 +522,27 @@ static bool io_wait_on_hash(struct io_wq *wq, unsigned int hash) } static struct io_wq_work *io_get_next_work(struct io_wq_acct *acct, - struct io_worker *worker) + struct io_wq *wq) __must_hold(acct->lock) { struct io_wq_work_node *node, *prev; struct io_wq_work *work, *tail; unsigned int stall_hash = -1U; - struct io_wq *wq = worker->wq; wq_list_for_each(node, prev, &acct->work_list) { + unsigned int work_flags; unsigned int hash; work = container_of(node, struct io_wq_work, list); /* not hashed, can run anytime */ - if (!io_wq_is_hashed(work)) { + work_flags = atomic_read(&work->flags); + if (!__io_wq_is_hashed(work_flags)) { wq_list_del(&acct->work_list, node, prev); return work; } - hash = io_get_work_hash(work); + hash = __io_get_work_hash(work_flags); /* all items with this hash lie in [work, tail] */ tail = wq->hash_tail[hash]; @@ -552,9 +600,9 @@ static void io_worker_handle_work(struct io_wq_acct *acct, __releases(&acct->lock) { struct io_wq *wq = worker->wq; - bool do_kill = test_bit(IO_WQ_BIT_EXIT, &wq->state); do { + bool do_kill = test_bit(IO_WQ_BIT_EXIT, &wq->state); struct io_wq_work *work; /* @@ -564,7 +612,7 @@ static void io_worker_handle_work(struct io_wq_acct *acct, * can't make progress, any work completion or insertion will * clear the stalled flag. */ - work = io_get_next_work(acct, worker); + work = io_get_next_work(acct, wq); if (work) { /* * Make sure cancelation can find this, even before @@ -583,7 +631,7 @@ static void io_worker_handle_work(struct io_wq_acct *acct, if (!work) break; - __io_worker_busy(wq, worker); + __io_worker_busy(acct, worker); io_assign_current_work(worker, work); __set_current_state(TASK_RUNNING); @@ -591,17 +639,20 @@ static void io_worker_handle_work(struct io_wq_acct *acct, /* handle a whole dependent link */ do { struct io_wq_work *next_hashed, *linked; - unsigned int hash = io_get_work_hash(work); + unsigned int work_flags = atomic_read(&work->flags); + unsigned int hash = __io_wq_is_hashed(work_flags) + ? __io_get_work_hash(work_flags) + : -1U; next_hashed = wq_next_work(work); if (do_kill && - (atomic_read(&work->flags) & IO_WQ_WORK_UNBOUND)) + (work_flags & IO_WQ_WORK_UNBOUND)) atomic_or(IO_WQ_WORK_CANCEL, &work->flags); - wq->do_work(work); + io_wq_submit_work(work); io_assign_current_work(worker, NULL); - linked = wq->free_work(work); + linked = io_wq_free_work(work); work = next_hashed; if (!work && linked && !io_wq_is_hashed(linked)) { work = linked; @@ -654,20 +705,24 @@ static int io_wq_worker(void *data) while (io_acct_run_queue(acct)) io_worker_handle_work(acct, worker); - raw_spin_lock(&wq->lock); + raw_spin_lock(&acct->workers_lock); /* * Last sleep timed out. Exit if we're not the last worker, - * or if someone modified our affinity. + * or if someone modified our affinity. If wq is marked + * idle-exit, drop the worker as well. This is used to avoid + * keeping io-wq workers around for tasks that no longer have + * any active io_uring instances. */ - if (last_timeout && (exit_mask || acct->nr_workers > 1)) { + if ((last_timeout && (exit_mask || acct->nr_workers > 1)) || + test_bit(IO_WQ_BIT_EXIT_ON_IDLE, &wq->state)) { acct->nr_workers--; - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); __set_current_state(TASK_RUNNING); break; } last_timeout = false; - __io_worker_idle(wq, worker); - raw_spin_unlock(&wq->lock); + __io_worker_idle(acct, worker); + raw_spin_unlock(&acct->workers_lock); if (io_run_task_work()) continue; ret = schedule_timeout(WORKER_IDLE_TIMEOUT); @@ -728,18 +783,18 @@ void io_wq_worker_sleeping(struct task_struct *tsk) io_wq_dec_running(worker); } -static void io_init_new_worker(struct io_wq *wq, struct io_worker *worker, +static void io_init_new_worker(struct io_wq *wq, struct io_wq_acct *acct, struct io_worker *worker, struct task_struct *tsk) { tsk->worker_private = worker; worker->task = tsk; set_cpus_allowed_ptr(tsk, wq->cpu_mask); - raw_spin_lock(&wq->lock); - hlist_nulls_add_head_rcu(&worker->nulls_node, &wq->free_list); - list_add_tail_rcu(&worker->all_list, &wq->all_list); + raw_spin_lock(&acct->workers_lock); + hlist_nulls_add_head_rcu(&worker->nulls_node, &acct->free_list); + list_add_tail_rcu(&worker->all_list, &acct->all_list); set_bit(IO_WORKER_F_FREE, &worker->flags); - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); wake_up_new_task(tsk); } @@ -756,11 +811,12 @@ static inline bool io_should_retry_thread(struct io_worker *worker, long err) */ if (fatal_signal_pending(current)) return false; - if (worker->init_retries++ >= WORKER_INIT_LIMIT) - return false; + worker->init_retries++; switch (err) { case -EAGAIN: + return worker->init_retries <= WORKER_INIT_LIMIT; + /* Analogous to a fork() syscall, always retry on a restartable error */ case -ERESTARTSYS: case -ERESTARTNOINTR: case -ERESTARTNOHAND: @@ -787,20 +843,20 @@ static void create_worker_cont(struct callback_head *cb) struct io_worker *worker; struct task_struct *tsk; struct io_wq *wq; + struct io_wq_acct *acct; worker = container_of(cb, struct io_worker, create_work); clear_bit_unlock(0, &worker->create_state); wq = worker->wq; + acct = io_wq_get_acct(worker); tsk = create_io_thread(io_wq_worker, worker, NUMA_NO_NODE); if (!IS_ERR(tsk)) { - io_init_new_worker(wq, worker, tsk); + io_init_new_worker(wq, acct, worker, tsk); io_worker_release(worker); return; } else if (!io_should_retry_thread(worker, PTR_ERR(tsk))) { - struct io_wq_acct *acct = io_wq_get_acct(worker); - atomic_dec(&acct->nr_running); - raw_spin_lock(&wq->lock); + raw_spin_lock(&acct->workers_lock); acct->nr_workers--; if (!acct->nr_workers) { struct io_cb_cancel_data match = { @@ -808,11 +864,11 @@ static void create_worker_cont(struct callback_head *cb) .cancel_all = true, }; - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); while (io_acct_cancel_pending_work(wq, acct, &match)) ; } else { - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); } io_worker_ref_put(wq); kfree(worker); @@ -834,36 +890,33 @@ static void io_workqueue_create(struct work_struct *work) kfree(worker); } -static bool create_io_worker(struct io_wq *wq, int index) +static bool create_io_worker(struct io_wq *wq, struct io_wq_acct *acct) { - struct io_wq_acct *acct = &wq->acct[index]; struct io_worker *worker; struct task_struct *tsk; __set_current_state(TASK_RUNNING); - worker = kzalloc(sizeof(*worker), GFP_KERNEL); + worker = kzalloc_obj(*worker); if (!worker) { fail: atomic_dec(&acct->nr_running); - raw_spin_lock(&wq->lock); + raw_spin_lock(&acct->workers_lock); acct->nr_workers--; - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); io_worker_ref_put(wq); return false; } refcount_set(&worker->ref, 1); worker->wq = wq; + worker->acct = acct; raw_spin_lock_init(&worker->lock); init_completion(&worker->ref_done); - if (index == IO_WQ_ACCT_BOUND) - set_bit(IO_WORKER_F_BOUND, &worker->flags); - tsk = create_io_thread(io_wq_worker, worker, NUMA_NO_NODE); if (!IS_ERR(tsk)) { - io_init_new_worker(wq, worker, tsk); + io_init_new_worker(wq, acct, worker, tsk); } else if (!io_should_retry_thread(worker, PTR_ERR(tsk))) { kfree(worker); goto fail; @@ -879,14 +932,14 @@ fail: * Iterate the passed in list and call the specific function for each * worker that isn't exiting */ -static bool io_wq_for_each_worker(struct io_wq *wq, - bool (*func)(struct io_worker *, void *), - void *data) +static bool io_acct_for_each_worker(struct io_wq_acct *acct, + bool (*func)(struct io_worker *, void *), + void *data) { struct io_worker *worker; bool ret = false; - list_for_each_entry_rcu(worker, &wq->all_list, all_list) { + list_for_each_entry_rcu(worker, &acct->all_list, all_list) { if (io_worker_get(worker)) { /* no task if node is/was offline */ if (worker->task) @@ -900,6 +953,15 @@ static bool io_wq_for_each_worker(struct io_wq *wq, return ret; } +static void io_wq_for_each_worker(struct io_wq *wq, + bool (*func)(struct io_worker *, void *), + void *data) +{ + for (int i = 0; i < IO_WQ_ACCT_NR; i++) + if (io_acct_for_each_worker(&wq->acct[i], func, data)) + break; +} + static bool io_wq_worker_wake(struct io_worker *worker, void *data) { __set_notify_signal(worker->task); @@ -907,28 +969,46 @@ static bool io_wq_worker_wake(struct io_worker *worker, void *data) return false; } +void io_wq_set_exit_on_idle(struct io_wq *wq, bool enable) +{ + if (!wq->task) + return; + + if (!enable) { + clear_bit(IO_WQ_BIT_EXIT_ON_IDLE, &wq->state); + return; + } + + if (test_and_set_bit(IO_WQ_BIT_EXIT_ON_IDLE, &wq->state)) + return; + + rcu_read_lock(); + io_wq_for_each_worker(wq, io_wq_worker_wake, NULL); + rcu_read_unlock(); +} + static void io_run_cancel(struct io_wq_work *work, struct io_wq *wq) { do { atomic_or(IO_WQ_WORK_CANCEL, &work->flags); - wq->do_work(work); - work = wq->free_work(work); + io_wq_submit_work(work); + work = io_wq_free_work(work); } while (work); } -static void io_wq_insert_work(struct io_wq *wq, struct io_wq_work *work) +static void io_wq_insert_work(struct io_wq *wq, struct io_wq_acct *acct, + struct io_wq_work *work, unsigned int work_flags) { - struct io_wq_acct *acct = io_work_get_acct(wq, work); unsigned int hash; struct io_wq_work *tail; - if (!io_wq_is_hashed(work)) { + if (!__io_wq_is_hashed(work_flags)) { append: wq_list_add_tail(&work->list, &acct->work_list); return; } - hash = io_get_work_hash(work); + hash = __io_get_work_hash(work_flags); tail = wq->hash_tail[hash]; wq->hash_tail[hash] = work; if (!tail) @@ -944,8 +1024,8 @@ static bool io_wq_work_match_item(struct io_wq_work *work, void *data) void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) { - struct io_wq_acct *acct = io_work_get_acct(wq, work); unsigned int work_flags = atomic_read(&work->flags); + struct io_wq_acct *acct = io_work_get_acct(wq, work_flags); struct io_cb_cancel_data match = { .fn = io_wq_work_match_item, .data = work, @@ -964,12 +1044,12 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) } raw_spin_lock(&acct->lock); - io_wq_insert_work(wq, work); + io_wq_insert_work(wq, acct, work, work_flags); clear_bit(IO_ACCT_STALLED_BIT, &acct->flags); raw_spin_unlock(&acct->lock); rcu_read_lock(); - do_create = !io_wq_activate_free_worker(wq, acct); + do_create = !io_acct_activate_free_worker(acct); rcu_read_unlock(); if (do_create && ((work_flags & IO_WQ_WORK_CONCURRENT) || @@ -980,12 +1060,12 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) if (likely(did_create)) return; - raw_spin_lock(&wq->lock); + raw_spin_lock(&acct->workers_lock); if (acct->nr_workers) { - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); return; } - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); /* fatal condition, failed to create the first worker */ io_acct_cancel_pending_work(wq, acct, &match); @@ -1034,17 +1114,18 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data) } static inline void io_wq_remove_pending(struct io_wq *wq, + struct io_wq_acct *acct, struct io_wq_work *work, struct io_wq_work_node *prev) { - struct io_wq_acct *acct = io_work_get_acct(wq, work); unsigned int hash = io_get_work_hash(work); struct io_wq_work *prev_work = NULL; if (io_wq_is_hashed(work) && work == wq->hash_tail[hash]) { if (prev) prev_work = container_of(prev, struct io_wq_work, list); - if (prev_work && io_get_work_hash(prev_work) == hash) + if (prev_work && io_wq_is_hashed(prev_work) && + io_get_work_hash(prev_work) == hash) wq->hash_tail[hash] = prev_work; else wq->hash_tail[hash] = NULL; @@ -1064,7 +1145,7 @@ static bool io_acct_cancel_pending_work(struct io_wq *wq, work = container_of(node, struct io_wq_work, list); if (!match->fn(work, match->data)) continue; - io_wq_remove_pending(wq, work, prev); + io_wq_remove_pending(wq, acct, work, prev); raw_spin_unlock(&acct->lock); io_run_cancel(work, wq); match->nr_pending++; @@ -1092,11 +1173,22 @@ retry: } } +static void io_acct_cancel_running_work(struct io_wq_acct *acct, + struct io_cb_cancel_data *match) +{ + raw_spin_lock(&acct->workers_lock); + io_acct_for_each_worker(acct, io_wq_worker_cancel, match); + raw_spin_unlock(&acct->workers_lock); +} + static void io_wq_cancel_running_work(struct io_wq *wq, struct io_cb_cancel_data *match) { rcu_read_lock(); - io_wq_for_each_worker(wq, io_wq_worker_cancel, match); + + for (int i = 0; i < IO_WQ_ACCT_NR; i++) + io_acct_cancel_running_work(&wq->acct[i], match); + rcu_read_unlock(); } @@ -1119,16 +1211,14 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, * as an indication that we attempt to signal cancellation. The * completion will run normally in this case. * - * Do both of these while holding the wq->lock, to ensure that + * Do both of these while holding the acct->workers_lock, to ensure that * we'll find a work item regardless of state. */ io_wq_cancel_pending_work(wq, &match); if (match.nr_pending && !match.cancel_all) return IO_WQ_CANCEL_OK; - raw_spin_lock(&wq->lock); io_wq_cancel_running_work(wq, &match); - raw_spin_unlock(&wq->lock); if (match.nr_running && !match.cancel_all) return IO_WQ_CANCEL_RUNNING; @@ -1152,7 +1242,7 @@ static int io_wq_hash_wake(struct wait_queue_entry *wait, unsigned mode, struct io_wq_acct *acct = &wq->acct[i]; if (test_and_clear_bit(IO_ACCT_STALLED_BIT, &acct->flags)) - io_wq_activate_free_worker(wq, acct); + io_acct_activate_free_worker(acct); } rcu_read_unlock(); return 1; @@ -1163,19 +1253,15 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) int ret, i; struct io_wq *wq; - if (WARN_ON_ONCE(!data->free_work || !data->do_work)) - return ERR_PTR(-EINVAL); if (WARN_ON_ONCE(!bounded)) return ERR_PTR(-EINVAL); - wq = kzalloc(sizeof(struct io_wq), GFP_KERNEL); + wq = kzalloc_obj(struct io_wq); if (!wq) return ERR_PTR(-ENOMEM); refcount_inc(&data->hash->refs); wq->hash = data->hash; - wq->free_work = data->free_work; - wq->do_work = data->do_work; ret = -ENOMEM; @@ -1190,22 +1276,24 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) for (i = 0; i < IO_WQ_ACCT_NR; i++) { struct io_wq_acct *acct = &wq->acct[i]; - acct->index = i; atomic_set(&acct->nr_running, 0); + + raw_spin_lock_init(&acct->workers_lock); + INIT_HLIST_NULLS_HEAD(&acct->free_list, 0); + INIT_LIST_HEAD(&acct->all_list); + INIT_WQ_LIST(&acct->work_list); raw_spin_lock_init(&acct->lock); } - raw_spin_lock_init(&wq->lock); - INIT_HLIST_NULLS_HEAD(&wq->free_list, 0); - INIT_LIST_HEAD(&wq->all_list); - wq->task = get_task_struct(data->task); atomic_set(&wq->worker_refs, 1); init_completion(&wq->worker_done); ret = cpuhp_state_add_instance_nocalls(io_wq_online, &wq->cpuhp_node); - if (ret) + if (ret) { + put_task_struct(wq->task); goto err; + } return wq; err: @@ -1250,6 +1338,8 @@ static void io_wq_cancel_tw_create(struct io_wq *wq) static void io_wq_exit_workers(struct io_wq *wq) { + unsigned long timeout, warn_timeout; + if (!wq->task) return; @@ -1259,7 +1349,26 @@ static void io_wq_exit_workers(struct io_wq *wq) io_wq_for_each_worker(wq, io_wq_worker_wake, NULL); rcu_read_unlock(); io_worker_ref_put(wq); - wait_for_completion(&wq->worker_done); + + /* + * Shut up hung task complaint, see for example + * + * https://lore.kernel.org/all/696fc9e7.a70a0220.111c58.0006.GAE@google.com/ + * + * where completely overloading the system with tons of long running + * io-wq items can easily trigger the hung task timeout. Only sleep + * uninterruptibly for half that time, and warn if we exceeded end + * up waiting more than IO_URING_EXIT_WAIT_MAX. + */ + timeout = sysctl_hung_task_timeout_secs * HZ / 2; + if (!timeout) + timeout = MAX_SCHEDULE_TIMEOUT; + warn_timeout = jiffies + IO_URING_EXIT_WAIT_MAX; + do { + if (wait_for_completion_timeout(&wq->worker_done, timeout)) + break; + WARN_ON_ONCE(time_after(jiffies, warn_timeout)); + } while (1); spin_lock_irq(&wq->hash->wait.lock); list_del_init(&wq->wait.entry); @@ -1385,14 +1494,14 @@ int io_wq_max_workers(struct io_wq *wq, int *new_count) rcu_read_lock(); - raw_spin_lock(&wq->lock); for (i = 0; i < IO_WQ_ACCT_NR; i++) { acct = &wq->acct[i]; + raw_spin_lock(&acct->workers_lock); prev[i] = max_t(int, acct->max_workers, prev[i]); if (new_count[i]) acct->max_workers = new_count[i]; + raw_spin_unlock(&acct->workers_lock); } - raw_spin_unlock(&wq->lock); rcu_read_unlock(); for (i = 0; i < IO_WQ_ACCT_NR; i++) diff --git a/io_uring/io-wq.h b/io_uring/io-wq.h index b3b004a7b625..42f00a47a9c9 100644 --- a/io_uring/io-wq.h +++ b/io_uring/io-wq.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef INTERNAL_IO_WQ_H #define INTERNAL_IO_WQ_H @@ -21,9 +22,6 @@ enum io_wq_cancel { IO_WQ_CANCEL_NOTFOUND, /* work not found */ }; -typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *); -typedef void (io_wq_work_fn)(struct io_wq_work *); - struct io_wq_hash { refcount_t refs; unsigned long map; @@ -39,13 +37,12 @@ static inline void io_wq_put_hash(struct io_wq_hash *hash) struct io_wq_data { struct io_wq_hash *hash; struct task_struct *task; - io_wq_work_fn *do_work; - free_work_fn *free_work; }; struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data); void io_wq_exit_start(struct io_wq *wq); void io_wq_put_and_exit(struct io_wq *wq); +void io_wq_set_exit_on_idle(struct io_wq *wq, bool enable); void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work); void io_wq_hash_work(struct io_wq_work *work, void *val); @@ -54,9 +51,14 @@ int io_wq_cpu_affinity(struct io_uring_task *tctx, cpumask_var_t mask); int io_wq_max_workers(struct io_wq *wq, int *new_count); bool io_wq_worker_stopped(void); +static inline bool __io_wq_is_hashed(unsigned int work_flags) +{ + return work_flags & IO_WQ_WORK_HASHED; +} + static inline bool io_wq_is_hashed(struct io_wq_work *work) { - return atomic_read(&work->flags) & IO_WQ_WORK_HASHED; + return __io_wq_is_hashed(atomic_read(&work->flags)); } typedef bool (work_cancel_fn)(struct io_wq_work *, void *); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index f7acae5f7e1d..103b6c88f252 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -29,7 +29,7 @@ * * Also see the examples in the liburing library: * - * git://git.kernel.dk/liburing + * git://git.kernel.org/pub/scm/linux/kernel/git/axboe/liburing.git * * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens * from data shared between the kernel and application. This is done both @@ -40,37 +40,25 @@ * Copyright (c) 2018-2019 Christoph Hellwig */ #include <linux/kernel.h> -#include <linux/init.h> #include <linux/errno.h> #include <linux/syscalls.h> -#include <net/compat.h> #include <linux/refcount.h> -#include <linux/uio.h> #include <linux/bits.h> #include <linux/sched/signal.h> #include <linux/fs.h> -#include <linux/file.h> #include <linux/mm.h> -#include <linux/mman.h> #include <linux/percpu.h> #include <linux/slab.h> -#include <linux/bvec.h> -#include <linux/net.h> -#include <net/sock.h> #include <linux/anon_inodes.h> -#include <linux/sched/mm.h> #include <linux/uaccess.h> #include <linux/nospec.h> -#include <linux/fsnotify.h> -#include <linux/fadvise.h> #include <linux/task_work.h> #include <linux/io_uring.h> #include <linux/io_uring/cmd.h> #include <linux/audit.h> #include <linux/security.h> #include <linux/jump_label.h> -#include <asm/shmparam.h> #define CREATE_TRACE_POINTS #include <trace/events/io_uring.h> @@ -79,6 +67,7 @@ #include "io-wq.h" +#include "filetable.h" #include "io_uring.h" #include "opdef.h" #include "refs.h" @@ -97,58 +86,42 @@ #include "uring_cmd.h" #include "msg_ring.h" #include "memmap.h" +#include "zcrx.h" +#include "bpf-ops.h" #include "timeout.h" #include "poll.h" #include "rw.h" #include "alloc_cache.h" #include "eventfd.h" +#include "wait.h" +#include "bpf_filter.h" +#include "loop.h" #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \ IOSQE_IO_HARDLINK | IOSQE_ASYNC) -#define SQE_VALID_FLAGS (SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \ - IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS) +#define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \ - REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \ - REQ_F_ASYNC_DATA) + REQ_F_INFLIGHT | REQ_F_CREDS | REQ_F_ASYNC_DATA) -#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\ - REQ_F_REISSUE | IO_REQ_CLEAN_FLAGS) +#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | IO_REQ_LINK_FLAGS | \ + REQ_F_REISSUE | REQ_F_POLLED | \ + IO_REQ_CLEAN_FLAGS) #define IO_TCTX_REFS_CACHE_NR (1U << 10) #define IO_COMPL_BATCH 32 #define IO_REQ_ALLOC_BATCH 8 -#define IO_LOCAL_TW_DEFAULT_MAX 20 - -struct io_defer_entry { - struct list_head list; - struct io_kiocb *req; - u32 seq; -}; /* requests with any of those set should undergo io_disarm_next() */ #define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) -#define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) - -/* - * No waiters. It's larger than any valid value of the tw counter - * so that tests against ->cq_wait_nr would fail and skip wake_up(). - */ -#define IO_CQ_WAKE_INIT (-1U) -/* Forced wake up if there is a waiter regardless of ->cq_wait_nr */ -#define IO_CQ_WAKE_FORCE (IO_CQ_WAKE_INIT >> 1) -static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, - struct io_uring_task *tctx, - bool cancel_all, - bool is_sqpoll_thread); +static void io_queue_sqe(struct io_kiocb *req, unsigned int extra_flags); +static void __io_req_caches_free(struct io_ring_ctx *ctx); -static void io_queue_sqe(struct io_kiocb *req); - -static __read_mostly DEFINE_STATIC_KEY_FALSE(io_key_has_sqarray); +static __read_mostly DEFINE_STATIC_KEY_DEFERRED_FALSE(io_key_has_sqarray, HZ); struct kmem_cache *req_cachep; static struct workqueue_struct *iou_wq __ro_after_init; @@ -177,52 +150,24 @@ static const struct ctl_table kernel_io_uring_disabled_table[] = { }; #endif -static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) +static void io_poison_cached_req(struct io_kiocb *req) { - return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); + req->ctx = IO_URING_PTR_POISON; + req->tctx = IO_URING_PTR_POISON; + req->file = IO_URING_PTR_POISON; + req->creds = IO_URING_PTR_POISON; + req->io_task_work.func = IO_URING_PTR_POISON; + req->apoll = IO_URING_PTR_POISON; } -static inline unsigned int __io_cqring_events_user(struct io_ring_ctx *ctx) +static void io_poison_req(struct io_kiocb *req) { - return READ_ONCE(ctx->rings->cq.tail) - READ_ONCE(ctx->rings->cq.head); -} - -static bool io_match_linked(struct io_kiocb *head) -{ - struct io_kiocb *req; - - io_for_each_link(req, head) { - if (req->flags & REQ_F_INFLIGHT) - return true; - } - return false; -} - -/* - * As io_match_task() but protected against racing with linked timeouts. - * User must not hold timeout_lock. - */ -bool io_match_task_safe(struct io_kiocb *head, struct io_uring_task *tctx, - bool cancel_all) -{ - bool matched; - - if (tctx && head->tctx != tctx) - return false; - if (cancel_all) - return true; - - if (head->flags & REQ_F_LINK_TIMEOUT) { - struct io_ring_ctx *ctx = head->ctx; - - /* protect against races with linked timeouts */ - raw_spin_lock_irq(&ctx->timeout_lock); - matched = io_match_linked(head); - raw_spin_unlock_irq(&ctx->timeout_lock); - } else { - matched = io_match_linked(head); - } - return matched; + io_poison_cached_req(req); + req->async_data = IO_URING_PTR_POISON; + req->kbuf = IO_URING_PTR_POISON; + req->comp_list.next = IO_URING_PTR_POISON; + req->file_node = IO_URING_PTR_POISON; + req->link = IO_URING_PTR_POISON; } static inline void req_fail_link_node(struct io_kiocb *req, int res) @@ -233,6 +178,8 @@ static inline void req_fail_link_node(struct io_kiocb *req, int res) static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx) { + if (IS_ENABLED(CONFIG_KASAN)) + io_poison_cached_req(req); wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); } @@ -243,23 +190,6 @@ static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref) complete(&ctx->ref_comp); } -static __cold void io_fallback_req_func(struct work_struct *work) -{ - struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, - fallback_work.work); - struct llist_node *node = llist_del_all(&ctx->fallback_llist); - struct io_kiocb *req, *tmp; - struct io_tw_state ts = {}; - - percpu_ref_get(&ctx->refs); - mutex_lock(&ctx->uring_lock); - llist_for_each_entry_safe(req, tmp, node, io_task_work.node) - req->io_task_work.func(req, &ts); - io_submit_flush_completions(ctx); - mutex_unlock(&ctx->uring_lock); - percpu_ref_put(&ctx->refs); -} - static int io_alloc_hash_table(struct io_hash_table *table, unsigned bits) { unsigned int hash_buckets; @@ -267,8 +197,8 @@ static int io_alloc_hash_table(struct io_hash_table *table, unsigned bits) do { hash_buckets = 1U << bits; - table->hbs = kvmalloc_array(hash_buckets, sizeof(table->hbs[0]), - GFP_KERNEL_ACCOUNT); + table->hbs = kvmalloc_objs(table->hbs[0], hash_buckets, + GFP_KERNEL_ACCOUNT); if (table->hbs) break; if (bits == 1) @@ -282,13 +212,23 @@ static int io_alloc_hash_table(struct io_hash_table *table, unsigned bits) return 0; } +static void io_free_alloc_caches(struct io_ring_ctx *ctx) +{ + io_alloc_cache_free(&ctx->apoll_cache, kfree); + io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); + io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); + io_alloc_cache_free(&ctx->cmd_cache, io_cmd_cache_free); + io_futex_cache_free(ctx); + io_rsrc_cache_free(ctx); +} + static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) { struct io_ring_ctx *ctx; int hash_bits; bool ret; - ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + ctx = kzalloc_obj(*ctx); if (!ctx) return NULL; @@ -313,7 +253,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) init_waitqueue_head(&ctx->sqo_sq_wait); INIT_LIST_HEAD(&ctx->sqd_list); INIT_LIST_HEAD(&ctx->cq_overflow_list); - INIT_LIST_HEAD(&ctx->io_buffers_cache); ret = io_alloc_cache_init(&ctx->apoll_cache, IO_POLL_ALLOC_CACHE_MAX, sizeof(struct async_poll), 0); ret |= io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX, @@ -322,12 +261,11 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) ret |= io_alloc_cache_init(&ctx->rw_cache, IO_ALLOC_CACHE_MAX, sizeof(struct io_async_rw), offsetof(struct io_async_rw, clear)); - ret |= io_alloc_cache_init(&ctx->uring_cache, IO_ALLOC_CACHE_MAX, - sizeof(struct io_uring_cmd_data), 0); - spin_lock_init(&ctx->msg_lock); - ret |= io_alloc_cache_init(&ctx->msg_cache, IO_ALLOC_CACHE_MAX, - sizeof(struct io_kiocb), 0); + ret |= io_alloc_cache_init(&ctx->cmd_cache, IO_ALLOC_CACHE_MAX, + sizeof(struct io_async_cmd), + sizeof(struct io_async_cmd)); ret |= io_futex_cache_init(ctx); + ret |= io_rsrc_cache_init(ctx); if (ret) goto free_ref; init_completion(&ctx->ref_comp); @@ -337,15 +275,16 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) init_waitqueue_head(&ctx->poll_wq); spin_lock_init(&ctx->completion_lock); raw_spin_lock_init(&ctx->timeout_lock); - INIT_WQ_LIST(&ctx->iopoll_list); - INIT_LIST_HEAD(&ctx->io_buffers_comp); + INIT_LIST_HEAD(&ctx->iopoll_list); INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); INIT_LIST_HEAD(&ctx->ltimeout_list); init_llist_head(&ctx->work_llist); INIT_LIST_HEAD(&ctx->tctx_list); + mutex_init(&ctx->tctx_lock); ctx->submit_state.free_list.next = NULL; INIT_HLIST_HEAD(&ctx->waitid_list); + xa_init_flags(&ctx->zcrx_ctxs, XA_FLAGS_ALLOC); #ifdef CONFIG_FUTEX INIT_HLIST_HEAD(&ctx->futex_list); #endif @@ -360,44 +299,17 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) free_ref: percpu_ref_exit(&ctx->refs); err: - io_alloc_cache_free(&ctx->apoll_cache, kfree); - io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); - io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); - io_alloc_cache_free(&ctx->uring_cache, kfree); - io_alloc_cache_free(&ctx->msg_cache, kfree); - io_futex_cache_free(ctx); + io_free_alloc_caches(ctx); kvfree(ctx->cancel_table.hbs); xa_destroy(&ctx->io_bl_xa); kfree(ctx); return NULL; } -static void io_account_cq_overflow(struct io_ring_ctx *ctx) -{ - struct io_rings *r = ctx->rings; - - WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1); - ctx->cq_extra--; -} - -static bool req_need_defer(struct io_kiocb *req, u32 seq) -{ - if (unlikely(req->flags & REQ_F_IO_DRAIN)) { - struct io_ring_ctx *ctx = req->ctx; - - return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail; - } - - return false; -} - static void io_clean_op(struct io_kiocb *req) { - if (req->flags & REQ_F_BUFFER_SELECTED) { - spin_lock(&req->ctx->completion_lock); - io_kbuf_drop(req); - spin_unlock(&req->ctx->completion_lock); - } + if (unlikely(req->flags & REQ_F_BUFFER_SELECTED)) + io_kbuf_drop_legacy(req); if (req->flags & REQ_F_NEED_CLEANUP) { const struct io_cold_def *def = &io_cold_defs[req->opcode]; @@ -405,11 +317,6 @@ static void io_clean_op(struct io_kiocb *req) if (def->cleanup) def->cleanup(req); } - if ((req->flags & REQ_F_POLLED) && req->apoll) { - kfree(req->apoll->double_poll); - kfree(req->apoll); - req->apoll = NULL; - } if (req->flags & REQ_F_INFLIGHT) atomic_dec(&req->tctx->inflight_tracked); if (req->flags & REQ_F_CREDS) @@ -421,7 +328,12 @@ static void io_clean_op(struct io_kiocb *req) req->flags &= ~IO_REQ_CLEAN_FLAGS; } -static inline void io_req_track_inflight(struct io_kiocb *req) +/* + * Mark the request as inflight, so that file cancelation will find it. + * Can be used if the file is an io_uring instance, or if the request itself + * relies on ->mm being alive for the duration of the request. + */ +inline void io_req_track_inflight(struct io_kiocb *req) { if (!(req->flags & REQ_F_INFLIGHT)) { req->flags |= REQ_F_INFLIGHT; @@ -443,28 +355,9 @@ static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req) return req->link; } -static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) -{ - if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT))) - return NULL; - return __io_prep_linked_timeout(req); -} - -static noinline void __io_arm_ltimeout(struct io_kiocb *req) -{ - io_queue_linked_timeout(__io_prep_linked_timeout(req)); -} - -static inline void io_arm_ltimeout(struct io_kiocb *req) -{ - if (unlikely(req->flags & REQ_F_ARM_LTIMEOUT)) - __io_arm_ltimeout(req); -} - static void io_prep_async_work(struct io_kiocb *req) { const struct io_issue_def *def = &io_issue_defs[req->opcode]; - struct io_ring_ctx *ctx = req->ctx; if (!(req->flags & REQ_F_CREDS)) { req->flags |= REQ_F_CREDS; @@ -486,7 +379,7 @@ static void io_prep_async_work(struct io_kiocb *req) if (should_hash && (req->file->f_flags & O_DIRECT) && (req->file->f_op->fop_flags & FOP_DIO_PARALLEL_WRITE)) should_hash = false; - if (should_hash || (ctx->flags & IORING_SETUP_IOPOLL)) + if (should_hash || (req->flags & REQ_F_IOPOLL)) io_wq_hash_work(&req->work, file_inode(req->file)); } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) { if (def->unbound_nonreg_file) @@ -513,7 +406,6 @@ static void io_prep_async_link(struct io_kiocb *req) static void io_queue_iowq(struct io_kiocb *req) { - struct io_kiocb *link = io_prep_linked_timeout(req); struct io_uring_task *tctx = req->tctx; BUG_ON(!tctx); @@ -538,13 +430,11 @@ static void io_queue_iowq(struct io_kiocb *req) trace_io_uring_queue_async_work(req, io_wq_is_hashed(&req->work)); io_wq_enqueue(tctx->io_wq, &req->work); - if (link) - io_queue_linked_timeout(link); } -static void io_req_queue_iowq_tw(struct io_kiocb *req, struct io_tw_state *ts) +static void io_req_queue_iowq_tw(struct io_tw_req tw_req, io_tw_token_t tw) { - io_queue_iowq(req); + io_queue_iowq(tw_req.req); } void io_req_queue_iowq(struct io_kiocb *req) @@ -553,37 +443,52 @@ void io_req_queue_iowq(struct io_kiocb *req) io_req_task_work_add(req); } +unsigned io_linked_nr(struct io_kiocb *req) +{ + struct io_kiocb *tmp; + unsigned nr = 0; + + io_for_each_link(tmp, req) + nr++; + return nr; +} + static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx) { - spin_lock(&ctx->completion_lock); + bool drain_seen = false, first = true; + + lockdep_assert_held(&ctx->uring_lock); + __io_req_caches_free(ctx); + while (!list_empty(&ctx->defer_list)) { struct io_defer_entry *de = list_first_entry(&ctx->defer_list, struct io_defer_entry, list); - if (req_need_defer(de->req, de->seq)) - break; + drain_seen |= de->req->flags & REQ_F_IO_DRAIN; + if ((drain_seen || first) && ctx->nr_req_allocated != ctx->nr_drained) + return; + list_del_init(&de->list); + ctx->nr_drained -= io_linked_nr(de->req); io_req_task_queue(de->req); kfree(de); + first = false; } - spin_unlock(&ctx->completion_lock); } void __io_commit_cqring_flush(struct io_ring_ctx *ctx) { - if (ctx->poll_activated) + if (ctx->int_flags & IO_RING_F_POLL_ACTIVATED) io_poll_wq_wake(ctx); - if (ctx->off_timeout_used) + if (ctx->int_flags & IO_RING_F_OFF_TIMEOUT_USED) io_flush_timeouts(ctx); - if (ctx->drain_active) - io_queue_deferred(ctx); - if (ctx->has_evfd) - io_eventfd_flush_signal(ctx); + if (ctx->int_flags & IO_RING_F_HAS_EVFD) + io_eventfd_signal(ctx, true); } static inline void __io_cq_lock(struct io_ring_ctx *ctx) { - if (!ctx->lockless_cq) + if (!(ctx->int_flags & IO_RING_F_LOCKLESS_CQ)) spin_lock(&ctx->completion_lock); } @@ -596,11 +501,11 @@ static inline void io_cq_lock(struct io_ring_ctx *ctx) static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx) { io_commit_cqring(ctx); - if (!ctx->task_complete) { - if (!ctx->lockless_cq) + if (!(ctx->int_flags & IO_RING_F_TASK_COMPLETE)) { + if (!(ctx->int_flags & IO_RING_F_LOCKLESS_CQ)) spin_unlock(&ctx->completion_lock); /* IOPOLL rings only need to wake up if it's also SQPOLL */ - if (!ctx->syscall_iopoll) + if (!(ctx->int_flags & IO_RING_F_SYSCALL_IOPOLL)) io_cqring_wake(ctx); } io_commit_cqring_flush(ctx); @@ -617,27 +522,31 @@ static void io_cq_unlock_post(struct io_ring_ctx *ctx) static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool dying) { - size_t cqe_size = sizeof(struct io_uring_cqe); - lockdep_assert_held(&ctx->uring_lock); /* don't abort if we're dying, entries must get freed */ if (!dying && __io_cqring_events(ctx) == ctx->cq_entries) return; - if (ctx->flags & IORING_SETUP_CQE32) - cqe_size <<= 1; - io_cq_lock(ctx); while (!list_empty(&ctx->cq_overflow_list)) { + size_t cqe_size = sizeof(struct io_uring_cqe); struct io_uring_cqe *cqe; struct io_overflow_cqe *ocqe; + bool is_cqe32 = false; ocqe = list_first_entry(&ctx->cq_overflow_list, struct io_overflow_cqe, list); + if (ocqe->cqe.flags & IORING_CQE_F_32 || + ctx->flags & IORING_SETUP_CQE32) { + is_cqe32 = true; + cqe_size <<= 1; + } + if (ctx->flags & IORING_SETUP_CQE32) + is_cqe32 = false; if (!dying) { - if (!io_get_cqe_overflow(ctx, &cqe, true)) + if (!io_get_cqe_overflow(ctx, &cqe, true, is_cqe32)) break; memcpy(cqe, &ocqe->cqe, cqe_size); } @@ -652,6 +561,7 @@ static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool dying) * to care for a non-real case. */ if (need_resched()) { + ctx->cqe_sentinel = ctx->cqe_cached; io_cq_unlock_post(ctx); mutex_unlock(&ctx->uring_lock); cond_resched(); @@ -673,13 +583,18 @@ static void io_cqring_overflow_kill(struct io_ring_ctx *ctx) __io_cqring_overflow_flush(ctx, true); } -static void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx) +void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx) { mutex_lock(&ctx->uring_lock); __io_cqring_overflow_flush(ctx, false); mutex_unlock(&ctx->uring_lock); } +void io_cqring_overflow_flush_locked(struct io_ring_ctx *ctx) +{ + __io_cqring_overflow_flush(ctx, false); +} + /* must to be called somewhat shortly after putting a request */ static inline void io_put_task(struct io_kiocb *req) { @@ -704,7 +619,7 @@ void io_task_refs_refill(struct io_uring_task *tctx) tctx->cached_refs += refill; } -static __cold void io_uring_drop_tctx_refs(struct task_struct *task) +__cold void io_uring_drop_tctx_refs(struct task_struct *task) { struct io_uring_task *tctx = task->io_uring; unsigned int refs = tctx->cached_refs; @@ -716,27 +631,20 @@ static __cold void io_uring_drop_tctx_refs(struct task_struct *task) } } -static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, - s32 res, u32 cflags, u64 extra1, u64 extra2) +static __cold bool io_cqring_add_overflow(struct io_ring_ctx *ctx, + struct io_overflow_cqe *ocqe) { - struct io_overflow_cqe *ocqe; - size_t ocq_size = sizeof(struct io_overflow_cqe); - bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32); - lockdep_assert_held(&ctx->completion_lock); - if (is_cqe32) - ocq_size += sizeof(struct io_uring_cqe); - - ocqe = kmalloc(ocq_size, GFP_ATOMIC | __GFP_ACCOUNT); - trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe); if (!ocqe) { + struct io_rings *r = ctx->rings; + /* * If we're in ring overflow flush mode, or in task cancel mode, * or cannot allocate an overflow entry, then we need to drop it * on the floor. */ - io_account_cq_overflow(ctx); + WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1); set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq); return false; } @@ -745,23 +653,69 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags); } - ocqe->cqe.user_data = user_data; - ocqe->cqe.res = res; - ocqe->cqe.flags = cflags; - if (is_cqe32) { - ocqe->cqe.big_cqe[0] = extra1; - ocqe->cqe.big_cqe[1] = extra2; - } list_add_tail(&ocqe->list, &ctx->cq_overflow_list); return true; } -static void io_req_cqe_overflow(struct io_kiocb *req) +static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx, + struct io_cqe *cqe, + struct io_big_cqe *big_cqe, gfp_t gfp) +{ + struct io_overflow_cqe *ocqe; + size_t ocq_size = sizeof(struct io_overflow_cqe); + bool is_cqe32 = false; + + if (cqe->flags & IORING_CQE_F_32 || ctx->flags & IORING_SETUP_CQE32) { + is_cqe32 = true; + ocq_size += sizeof(struct io_uring_cqe); + } + + ocqe = kzalloc(ocq_size, gfp | __GFP_ACCOUNT); + trace_io_uring_cqe_overflow(ctx, cqe->user_data, cqe->res, cqe->flags, ocqe); + if (ocqe) { + ocqe->cqe.user_data = cqe->user_data; + ocqe->cqe.res = cqe->res; + ocqe->cqe.flags = cqe->flags; + if (is_cqe32 && big_cqe) { + ocqe->cqe.big_cqe[0] = big_cqe->extra1; + ocqe->cqe.big_cqe[1] = big_cqe->extra2; + } + } + if (big_cqe) + big_cqe->extra1 = big_cqe->extra2 = 0; + return ocqe; +} + +/* + * Compute queued CQEs for free-space calculation, clamped to cq_entries. + */ +static unsigned int io_cqring_queued(struct io_ring_ctx *ctx) { - io_cqring_event_overflow(req->ctx, req->cqe.user_data, - req->cqe.res, req->cqe.flags, - req->big_cqe.extra1, req->big_cqe.extra2); - memset(&req->big_cqe, 0, sizeof(req->big_cqe)); + struct io_rings *rings = io_get_rings(ctx); + int diff; + + diff = (int)(ctx->cached_cq_tail - READ_ONCE(rings->cq.head)); + if (diff >= 0) + return min((unsigned int)diff, ctx->cq_entries); + return 0; +} + +/* + * Fill an empty dummy CQE, in case alignment is off for posting a 32b CQE + * because the ring is a single 16b entry away from wrapping. + */ +static bool io_fill_nop_cqe(struct io_ring_ctx *ctx, unsigned int off) +{ + if (io_cqring_queued(ctx) < ctx->cq_entries) { + struct io_uring_cqe *cqe = &ctx->rings->cqes[off]; + + cqe->user_data = 0; + cqe->res = 0; + cqe->flags = IORING_CQE_F_SKIP; + ctx->cached_cq_tail++; + return true; + } + return false; } /* @@ -769,11 +723,11 @@ static void io_req_cqe_overflow(struct io_kiocb *req) * control dependency is enough as we're using WRITE_ONCE to * fill the cq entry */ -bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow) +bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32) { struct io_rings *rings = ctx->rings; unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); - unsigned int free, queued, len; + unsigned int free, len; /* * Posting into the CQ when there are pending overflowed CQEs may break @@ -783,12 +737,20 @@ bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow) if (!overflow && (ctx->check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))) return false; - /* userspace may cheat modifying the tail, be safe and do min */ - queued = min(__io_cqring_events(ctx), ctx->cq_entries); - free = ctx->cq_entries - queued; + /* + * Post dummy CQE if a 32b CQE is needed and there's only room for a + * 16b CQE before the ring wraps. + */ + if (cqe32 && off + 1 == ctx->cq_entries) { + if (!io_fill_nop_cqe(ctx, off)) + return false; + off = 0; + } + + free = ctx->cq_entries - io_cqring_queued(ctx); /* we need a contiguous range, limit based on the current array offset */ len = min(free, ctx->cq_entries - off); - if (!len) + if (len < (cqe32 + 1)) return false; if (ctx->flags & IORING_SETUP_CQE32) { @@ -801,24 +763,33 @@ bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow) return true; } +static bool io_fill_cqe_aux32(struct io_ring_ctx *ctx, + struct io_uring_cqe src_cqe[2]) +{ + struct io_uring_cqe *cqe; + + if (WARN_ON_ONCE(!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)))) + return false; + if (unlikely(!io_get_cqe(ctx, &cqe, true))) + return false; + + memcpy(cqe, src_cqe, 2 * sizeof(*cqe)); + trace_io_uring_complete(ctx, NULL, cqe); + return true; +} + static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { + bool cqe32 = cflags & IORING_CQE_F_32; struct io_uring_cqe *cqe; - ctx->cq_extra++; - - /* - * If we can't get a cq entry, userspace overflowed the - * submission (by quite a lot). Increment the overflow count in - * the ring. - */ - if (likely(io_get_cqe(ctx, &cqe))) { + if (likely(io_get_cqe(ctx, &cqe, cqe32))) { WRITE_ONCE(cqe->user_data, user_data); WRITE_ONCE(cqe->res, res); WRITE_ONCE(cqe->flags, cflags); - if (ctx->flags & IORING_SETUP_CQE32) { + if (cqe32) { WRITE_ONCE(cqe->big_cqe[0], 0); WRITE_ONCE(cqe->big_cqe[1], 0); } @@ -829,16 +800,30 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, return false; } -static bool __io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, - u32 cflags) +static inline struct io_cqe io_init_cqe(u64 user_data, s32 res, u32 cflags) { - bool filled; + return (struct io_cqe) { .user_data = user_data, .res = res, .flags = cflags }; +} - filled = io_fill_cqe_aux(ctx, user_data, res, cflags); - if (!filled) - filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); +static __cold void io_cqe_overflow(struct io_ring_ctx *ctx, struct io_cqe *cqe, + struct io_big_cqe *big_cqe) +{ + struct io_overflow_cqe *ocqe; - return filled; + ocqe = io_alloc_ocqe(ctx, cqe, big_cqe, GFP_KERNEL); + spin_lock(&ctx->completion_lock); + io_cqring_add_overflow(ctx, ocqe); + spin_unlock(&ctx->completion_lock); +} + +static __cold bool io_cqe_overflow_locked(struct io_ring_ctx *ctx, + struct io_cqe *cqe, + struct io_big_cqe *big_cqe) +{ + struct io_overflow_cqe *ocqe; + + ocqe = io_alloc_ocqe(ctx, cqe, big_cqe, GFP_NOWAIT); + return io_cqring_add_overflow(ctx, ocqe); } bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) @@ -846,30 +831,70 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags bool filled; io_cq_lock(ctx); - filled = __io_post_aux_cqe(ctx, user_data, res, cflags); + filled = io_fill_cqe_aux(ctx, user_data, res, cflags); + if (unlikely(!filled)) { + struct io_cqe cqe = io_init_cqe(user_data, res, cflags); + + filled = io_cqe_overflow_locked(ctx, &cqe, NULL); + } io_cq_unlock_post(ctx); return filled; } /* - * Must be called from inline task_work so we now a flush will happen later, + * Must be called from inline task_work so we know a flush will happen later, * and obviously with ctx->uring_lock held (tw always has that). */ void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { + lockdep_assert_held(&ctx->uring_lock); + lockdep_assert(ctx->int_flags & IO_RING_F_LOCKLESS_CQ); + if (!io_fill_cqe_aux(ctx, user_data, res, cflags)) { + struct io_cqe cqe = io_init_cqe(user_data, res, cflags); + + io_cqe_overflow(ctx, &cqe, NULL); + } + ctx->submit_state.cq_flush = true; +} + +/* + * A helper for multishot requests posting additional CQEs. + * Should only be used from a task_work including IO_URING_F_MULTISHOT. + */ +bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags) +{ + struct io_ring_ctx *ctx = req->ctx; + bool posted; + + /* + * If multishot has already posted deferred completions, ensure that + * those are flushed first before posting this one. If not, CQEs + * could get reordered. + */ + if (!wq_list_empty(&ctx->submit_state.compl_reqs)) + __io_submit_flush_completions(ctx); + + lockdep_assert(!io_wq_current_is_worker()); + lockdep_assert_held(&ctx->uring_lock); + + if (!(ctx->int_flags & IO_RING_F_LOCKLESS_CQ)) { spin_lock(&ctx->completion_lock); - io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); + posted = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags); spin_unlock(&ctx->completion_lock); + } else { + posted = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags); } + ctx->submit_state.cq_flush = true; + return posted; } /* * A helper for multishot requests posting additional CQEs. * Should only be used from a task_work including IO_URING_F_MULTISHOT. */ -bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags) +bool io_req_post_cqe32(struct io_kiocb *req, struct io_uring_cqe cqe[2]) { struct io_ring_ctx *ctx = req->ctx; bool posted; @@ -877,16 +902,23 @@ bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags) lockdep_assert(!io_wq_current_is_worker()); lockdep_assert_held(&ctx->uring_lock); - __io_cq_lock(ctx); - posted = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags); + cqe[0].user_data = req->cqe.user_data; + if (!(ctx->int_flags & IO_RING_F_LOCKLESS_CQ)) { + spin_lock(&ctx->completion_lock); + posted = io_fill_cqe_aux32(ctx, cqe); + spin_unlock(&ctx->completion_lock); + } else { + posted = io_fill_cqe_aux32(ctx, cqe); + } + ctx->submit_state.cq_flush = true; - __io_cq_unlock_post(ctx); return posted; } static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; + bool completed = true; /* * All execution paths but io-wq use the deferred completions by @@ -899,19 +931,21 @@ static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) * Handle special CQ sync cases via task_work. DEFER_TASKRUN requires * the submitter task context, IOPOLL protects with uring_lock. */ - if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL)) { + if ((ctx->int_flags & IO_RING_F_LOCKLESS_CQ) || (req->flags & REQ_F_REISSUE)) { +defer_complete: req->io_task_work.func = io_req_task_complete; io_req_task_work_add(req); return; } io_cq_lock(ctx); - if (!(req->flags & REQ_F_CQE_SKIP)) { - if (!io_fill_cqe_req(ctx, req)) - io_req_cqe_overflow(req); - } + if (!(req->flags & REQ_F_CQE_SKIP)) + completed = io_fill_cqe_req(ctx, req); io_cq_unlock_post(ctx); + if (!completed) + goto defer_complete; + /* * We don't free the request here because we know it's called from * io-wq only, which holds a reference, so it cannot be the last put. @@ -927,29 +961,13 @@ void io_req_defer_failed(struct io_kiocb *req, s32 res) lockdep_assert_held(&req->ctx->uring_lock); req_set_fail(req); - io_req_set_res(req, res, io_put_kbuf(req, res, IO_URING_F_UNLOCKED)); + io_req_set_res(req, res, io_put_kbuf(req, res, NULL)); if (def->fail) def->fail(req); io_req_complete_defer(req); } /* - * Don't initialise the fields below on every allocation, but do that in - * advance and keep them valid across allocations. - */ -static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx) -{ - req->ctx = ctx; - req->buf_node = NULL; - req->file_node = NULL; - req->link = NULL; - req->async_data = NULL; - /* not necessary, but safer to zero */ - memset(&req->cqe, 0, sizeof(req->cqe)); - memset(&req->big_cqe, 0, sizeof(req->big_cqe)); -} - -/* * A request might get retired back into the request caches even before opcode * handlers and io_issue_sqe() are done with it, e.g. inline completion path. * Because of that, io_alloc_req() should be called only under ->uring_lock @@ -958,7 +976,7 @@ static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx) __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) { - gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; + gfp_t gfp = GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO; void *reqs[IO_REQ_ALLOC_BATCH]; int ret; @@ -976,10 +994,11 @@ __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) } percpu_ref_get_many(&ctx->refs, ret); + ctx->nr_req_allocated += ret; + while (ret--) { struct io_kiocb *req = reqs[ret]; - io_preinit_req(req, ctx); io_req_add_to_cache(req, ctx); } return true; @@ -1021,356 +1040,26 @@ static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) return nxt; } -static void ctx_flush_and_put(struct io_ring_ctx *ctx, struct io_tw_state *ts) +static void io_req_task_cancel(struct io_tw_req tw_req, io_tw_token_t tw) { - if (!ctx) - return; - if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) - atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); + struct io_kiocb *req = tw_req.req; - io_submit_flush_completions(ctx); - mutex_unlock(&ctx->uring_lock); - percpu_ref_put(&ctx->refs); -} - -/* - * Run queued task_work, returning the number of entries processed in *count. - * If more entries than max_entries are available, stop processing once this - * is reached and return the rest of the list. - */ -struct llist_node *io_handle_tw_list(struct llist_node *node, - unsigned int *count, - unsigned int max_entries) -{ - struct io_ring_ctx *ctx = NULL; - struct io_tw_state ts = { }; - - do { - struct llist_node *next = node->next; - struct io_kiocb *req = container_of(node, struct io_kiocb, - io_task_work.node); - - if (req->ctx != ctx) { - ctx_flush_and_put(ctx, &ts); - ctx = req->ctx; - mutex_lock(&ctx->uring_lock); - percpu_ref_get(&ctx->refs); - } - INDIRECT_CALL_2(req->io_task_work.func, - io_poll_task_func, io_req_rw_complete, - req, &ts); - node = next; - (*count)++; - if (unlikely(need_resched())) { - ctx_flush_and_put(ctx, &ts); - ctx = NULL; - cond_resched(); - } - } while (node && *count < max_entries); - - ctx_flush_and_put(ctx, &ts); - return node; -} - -static __cold void __io_fallback_tw(struct llist_node *node, bool sync) -{ - struct io_ring_ctx *last_ctx = NULL; - struct io_kiocb *req; - - while (node) { - req = container_of(node, struct io_kiocb, io_task_work.node); - node = node->next; - if (sync && last_ctx != req->ctx) { - if (last_ctx) { - flush_delayed_work(&last_ctx->fallback_work); - percpu_ref_put(&last_ctx->refs); - } - last_ctx = req->ctx; - percpu_ref_get(&last_ctx->refs); - } - if (llist_add(&req->io_task_work.node, - &req->ctx->fallback_llist)) - schedule_delayed_work(&req->ctx->fallback_work, 1); - } - - if (last_ctx) { - flush_delayed_work(&last_ctx->fallback_work); - percpu_ref_put(&last_ctx->refs); - } -} - -static void io_fallback_tw(struct io_uring_task *tctx, bool sync) -{ - struct llist_node *node = llist_del_all(&tctx->task_list); - - __io_fallback_tw(node, sync); -} - -struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, - unsigned int max_entries, - unsigned int *count) -{ - struct llist_node *node; - - if (unlikely(current->flags & PF_EXITING)) { - io_fallback_tw(tctx, true); - return NULL; - } - - node = llist_del_all(&tctx->task_list); - if (node) { - node = llist_reverse_order(node); - node = io_handle_tw_list(node, count, max_entries); - } - - /* relaxed read is enough as only the task itself sets ->in_cancel */ - if (unlikely(atomic_read(&tctx->in_cancel))) - io_uring_drop_tctx_refs(current); - - trace_io_uring_task_work_run(tctx, *count); - return node; -} - -void tctx_task_work(struct callback_head *cb) -{ - struct io_uring_task *tctx; - struct llist_node *ret; - unsigned int count = 0; - - tctx = container_of(cb, struct io_uring_task, task_work); - ret = tctx_task_work_run(tctx, UINT_MAX, &count); - /* can't happen */ - WARN_ON_ONCE(ret); -} - -static inline void io_req_local_work_add(struct io_kiocb *req, - struct io_ring_ctx *ctx, - unsigned flags) -{ - unsigned nr_wait, nr_tw, nr_tw_prev; - struct llist_node *head; - - /* See comment above IO_CQ_WAKE_INIT */ - BUILD_BUG_ON(IO_CQ_WAKE_FORCE <= IORING_MAX_CQ_ENTRIES); - - /* - * We don't know how many reuqests is there in the link and whether - * they can even be queued lazily, fall back to non-lazy. - */ - if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) - flags &= ~IOU_F_TWQ_LAZY_WAKE; - - guard(rcu)(); - - head = READ_ONCE(ctx->work_llist.first); - do { - nr_tw_prev = 0; - if (head) { - struct io_kiocb *first_req = container_of(head, - struct io_kiocb, - io_task_work.node); - /* - * Might be executed at any moment, rely on - * SLAB_TYPESAFE_BY_RCU to keep it alive. - */ - nr_tw_prev = READ_ONCE(first_req->nr_tw); - } - - /* - * Theoretically, it can overflow, but that's fine as one of - * previous adds should've tried to wake the task. - */ - nr_tw = nr_tw_prev + 1; - if (!(flags & IOU_F_TWQ_LAZY_WAKE)) - nr_tw = IO_CQ_WAKE_FORCE; - - req->nr_tw = nr_tw; - req->io_task_work.node.next = head; - } while (!try_cmpxchg(&ctx->work_llist.first, &head, - &req->io_task_work.node)); - - /* - * cmpxchg implies a full barrier, which pairs with the barrier - * in set_current_state() on the io_cqring_wait() side. It's used - * to ensure that either we see updated ->cq_wait_nr, or waiters - * going to sleep will observe the work added to the list, which - * is similar to the wait/wawke task state sync. - */ - - if (!head) { - if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) - atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); - if (ctx->has_evfd) - io_eventfd_signal(ctx); - } - - nr_wait = atomic_read(&ctx->cq_wait_nr); - /* not enough or no one is waiting */ - if (nr_tw < nr_wait) - return; - /* the previous add has already woken it up */ - if (nr_tw_prev >= nr_wait) - return; - wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE); + io_tw_lock(req->ctx, tw); + io_req_defer_failed(req, req->cqe.res); } -static void io_req_normal_work_add(struct io_kiocb *req) +void io_req_task_submit(struct io_tw_req tw_req, io_tw_token_t tw) { - struct io_uring_task *tctx = req->tctx; + struct io_kiocb *req = tw_req.req; struct io_ring_ctx *ctx = req->ctx; - /* task_work already pending, we're done */ - if (!llist_add(&req->io_task_work.node, &tctx->task_list)) - return; - - if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) - atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); - - /* SQPOLL doesn't need the task_work added, it'll run it itself */ - if (ctx->flags & IORING_SETUP_SQPOLL) { - __set_notify_signal(tctx->task); - return; - } - - if (likely(!task_work_add(tctx->task, &tctx->task_work, ctx->notify_method))) - return; - - io_fallback_tw(tctx, false); -} - -void __io_req_task_work_add(struct io_kiocb *req, unsigned flags) -{ - if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN) - io_req_local_work_add(req, req->ctx, flags); - else - io_req_normal_work_add(req); -} - -void io_req_task_work_add_remote(struct io_kiocb *req, struct io_ring_ctx *ctx, - unsigned flags) -{ - if (WARN_ON_ONCE(!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))) - return; - io_req_local_work_add(req, ctx, flags); -} - -static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) -{ - struct llist_node *node = llist_del_all(&ctx->work_llist); - - __io_fallback_tw(node, false); - node = llist_del_all(&ctx->retry_llist); - __io_fallback_tw(node, false); -} - -static bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events, - int min_events) -{ - if (!io_local_work_pending(ctx)) - return false; - if (events < min_events) - return true; - if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) - atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); - return false; -} - -static int __io_run_local_work_loop(struct llist_node **node, - struct io_tw_state *ts, - int events) -{ - int ret = 0; - - while (*node) { - struct llist_node *next = (*node)->next; - struct io_kiocb *req = container_of(*node, struct io_kiocb, - io_task_work.node); - INDIRECT_CALL_2(req->io_task_work.func, - io_poll_task_func, io_req_rw_complete, - req, ts); - *node = next; - if (++ret >= events) - break; - } - - return ret; -} - -static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts, - int min_events, int max_events) -{ - struct llist_node *node; - unsigned int loops = 0; - int ret = 0; - - if (WARN_ON_ONCE(ctx->submitter_task != current)) - return -EEXIST; - if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) - atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); -again: - min_events -= ret; - ret = __io_run_local_work_loop(&ctx->retry_llist.first, ts, max_events); - if (ctx->retry_llist.first) - goto retry_done; - - /* - * llists are in reverse order, flip it back the right way before - * running the pending items. - */ - node = llist_reverse_order(llist_del_all(&ctx->work_llist)); - ret += __io_run_local_work_loop(&node, ts, max_events - ret); - ctx->retry_llist.first = node; - loops++; - - if (io_run_local_work_continue(ctx, ret, min_events)) - goto again; -retry_done: - io_submit_flush_completions(ctx); - if (io_run_local_work_continue(ctx, ret, min_events)) - goto again; - - trace_io_uring_local_work_run(ctx, ret, loops); - return ret; -} - -static inline int io_run_local_work_locked(struct io_ring_ctx *ctx, - int min_events) -{ - struct io_tw_state ts = {}; - - if (!io_local_work_pending(ctx)) - return 0; - return __io_run_local_work(ctx, &ts, min_events, - max(IO_LOCAL_TW_DEFAULT_MAX, min_events)); -} - -static int io_run_local_work(struct io_ring_ctx *ctx, int min_events, - int max_events) -{ - struct io_tw_state ts = {}; - int ret; - - mutex_lock(&ctx->uring_lock); - ret = __io_run_local_work(ctx, &ts, min_events, max_events); - mutex_unlock(&ctx->uring_lock); - return ret; -} - -static void io_req_task_cancel(struct io_kiocb *req, struct io_tw_state *ts) -{ - io_tw_lock(req->ctx, ts); - io_req_defer_failed(req, req->cqe.res); -} - -void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts) -{ - io_tw_lock(req->ctx, ts); - if (unlikely(io_should_terminate_tw())) + io_tw_lock(ctx, tw); + if (unlikely(tw.cancel)) io_req_defer_failed(req, -EFAULT); else if (req->flags & REQ_F_FORCE_ASYNC) io_queue_iowq(req); else - io_queue_sqe(req); + io_queue_sqe(req, 0); } void io_req_task_queue_fail(struct io_kiocb *req, int ret) @@ -1394,6 +1083,18 @@ void io_queue_next(struct io_kiocb *req) io_req_task_queue(nxt); } +static inline void io_req_put_rsrc_nodes(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + + if (req->file_node) { + io_put_rsrc_node(ctx, req->file_node); + req->file_node = NULL; + } + if (req->flags & REQ_F_BUF_NODE) + io_put_rsrc_node(ctx, req->buf_node); +} + static void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node) __must_hold(&ctx->uring_lock) @@ -1419,8 +1120,7 @@ static void io_free_batch_list(struct io_ring_ctx *ctx, if (apoll->double_poll) kfree(apoll->double_poll); - if (!io_alloc_cache_put(&ctx->apoll_cache, apoll)) - kfree(apoll); + io_cache_free(&ctx->apoll_cache, apoll); req->flags &= ~REQ_F_POLLED; } if (req->flags & IO_REQ_LINK_FLAGS) @@ -1455,13 +1155,10 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) */ if (!(req->flags & (REQ_F_CQE_SKIP | REQ_F_REISSUE)) && unlikely(!io_fill_cqe_req(ctx, req))) { - if (ctx->lockless_cq) { - spin_lock(&ctx->completion_lock); - io_req_cqe_overflow(req); - spin_unlock(&ctx->completion_lock); - } else { - io_req_cqe_overflow(req); - } + if (ctx->int_flags & IO_RING_F_LOCKLESS_CQ) + io_cqe_overflow(ctx, &req->cqe, &req->big_cqe); + else + io_cqe_overflow_locked(ctx, &req->cqe, &req->big_cqe); } } __io_cq_unlock_post(ctx); @@ -1470,27 +1167,24 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) io_free_batch_list(ctx, state->compl_reqs.first); INIT_WQ_LIST(&state->compl_reqs); } - ctx->submit_state.cq_flush = false; -} -static unsigned io_cqring_events(struct io_ring_ctx *ctx) -{ - /* See comment at the top of this file */ - smp_rmb(); - return __io_cqring_events(ctx); + if (unlikely(ctx->int_flags & IO_RING_F_DRAIN_ACTIVE)) + io_queue_deferred(ctx); + + ctx->submit_state.cq_flush = false; } /* * We can't just wait for polled events to come to us, we have to actively * find and complete them. */ -static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) +__cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) { if (!(ctx->flags & IORING_SETUP_IOPOLL)) return; mutex_lock(&ctx->uring_lock); - while (!wq_list_empty(&ctx->iopoll_list)) { + while (!list_empty(&ctx->iopoll_list)) { /* let it sleep and repeat later if can't complete a request */ if (io_do_iopoll(ctx, true) == 0) break; @@ -1506,13 +1200,17 @@ static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) } } mutex_unlock(&ctx->uring_lock); + + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) + io_move_task_work_from_local(ctx); } -static int io_iopoll_check(struct io_ring_ctx *ctx, long min) +static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned int min_events) { - unsigned int nr_events = 0; unsigned long check_cq; + min_events = min(min_events, ctx->cq_entries); + lockdep_assert_held(&ctx->uring_lock); if (!io_allowed_run_tw(ctx)) @@ -1550,24 +1248,19 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) * forever, while the workqueue is stuck trying to acquire the * very same mutex. */ - if (wq_list_empty(&ctx->iopoll_list) || - io_task_work_pending(ctx)) { - u32 tail = ctx->cached_cq_tail; + if (list_empty(&ctx->iopoll_list) || io_task_work_pending(ctx)) { + (void) io_run_local_work_locked(ctx, min_events); - (void) io_run_local_work_locked(ctx, min); - - if (task_work_pending(current) || - wq_list_empty(&ctx->iopoll_list)) { + if (task_work_pending(current) || list_empty(&ctx->iopoll_list)) { mutex_unlock(&ctx->uring_lock); io_run_task_work(); mutex_lock(&ctx->uring_lock); } /* some requests don't go through iopoll_list */ - if (tail != ctx->cached_cq_tail || - wq_list_empty(&ctx->iopoll_list)) + if (list_empty(&ctx->iopoll_list)) break; } - ret = io_do_iopoll(ctx, !min); + ret = io_do_iopoll(ctx, !min_events); if (unlikely(ret < 0)) return ret; @@ -1575,16 +1268,14 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) return -EINTR; if (need_resched()) break; - - nr_events += ret; - } while (nr_events < min); + } while (io_cqring_events(ctx) < min_events); return 0; } -void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts) +void io_req_task_complete(struct io_tw_req tw_req, io_tw_token_t tw) { - io_req_complete_defer(req); + io_req_complete_defer(tw_req.req); } /* @@ -1607,25 +1298,17 @@ static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags) * how we do polling eventually, not spinning if we're on potentially * different devices. */ - if (wq_list_empty(&ctx->iopoll_list)) { + if (list_empty(&ctx->iopoll_list)) { ctx->poll_multi_queue = false; } else if (!ctx->poll_multi_queue) { struct io_kiocb *list_req; - list_req = container_of(ctx->iopoll_list.first, struct io_kiocb, - comp_list); + list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb, iopoll_node); if (list_req->file != req->file) ctx->poll_multi_queue = true; } - /* - * For fast devices, IO may have already completed. If it has, add - * it to the front so we find it first. - */ - if (READ_ONCE(req->iopoll_completed)) - wq_list_add_head(&req->comp_list, &ctx->iopoll_list); - else - wq_list_add_tail(&req->comp_list, &ctx->iopoll_list); + list_add_tail(&req->iopoll_node, &ctx->iopoll_list); if (unlikely(needs_lock)) { /* @@ -1646,6 +1329,8 @@ io_req_flags_t io_file_get_flags(struct file *file) { io_req_flags_t res = 0; + BUILD_BUG_ON(REQ_F_ISREG_BIT != REQ_F_SUPPORT_NOWAIT_BIT + 1); + if (S_ISREG(file_inode(file)->i_mode)) res |= REQ_F_ISREG; if ((file->f_flags & O_NONBLOCK) || (file->f_mode & FMODE_NOWAIT)) @@ -1653,56 +1338,28 @@ io_req_flags_t io_file_get_flags(struct file *file) return res; } -static u32 io_get_sequence(struct io_kiocb *req) -{ - u32 seq = req->ctx->cached_sq_head; - struct io_kiocb *cur; - - /* need original cached_sq_head, but it was increased for each req */ - io_for_each_link(cur, req) - seq--; - return seq; -} - static __cold void io_drain_req(struct io_kiocb *req) __must_hold(&ctx->uring_lock) { struct io_ring_ctx *ctx = req->ctx; + bool drain = req->flags & IOSQE_IO_DRAIN; struct io_defer_entry *de; - int ret; - u32 seq = io_get_sequence(req); - - /* Still need defer if there is pending req in defer list. */ - spin_lock(&ctx->completion_lock); - if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) { - spin_unlock(&ctx->completion_lock); -queue: - ctx->drain_active = false; - io_req_task_queue(req); - return; - } - spin_unlock(&ctx->completion_lock); - io_prep_async_link(req); - de = kmalloc(sizeof(*de), GFP_KERNEL); + de = kmalloc_obj(*de, GFP_KERNEL_ACCOUNT); if (!de) { - ret = -ENOMEM; - io_req_defer_failed(req, ret); + io_req_defer_failed(req, -ENOMEM); return; } - spin_lock(&ctx->completion_lock); - if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) { - spin_unlock(&ctx->completion_lock); - kfree(de); - goto queue; - } - + io_prep_async_link(req); trace_io_uring_defer(req); de->req = req; - de->seq = seq; + + ctx->nr_drained += io_linked_nr(req); list_add_tail(&de->list, &ctx->defer_list); - spin_unlock(&ctx->completion_lock); + io_queue_deferred(ctx); + if (!drain && list_empty(&ctx->defer_list)) + ctx->int_flags &= ~IO_RING_F_DRAIN_ACTIVE; } static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def, @@ -1719,17 +1376,22 @@ static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def, return !!req->file; } -static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) +#define REQ_ISSUE_SLOW_FLAGS (REQ_F_CREDS | REQ_F_ARM_LTIMEOUT) + +static inline int __io_issue_sqe(struct io_kiocb *req, + unsigned int issue_flags, + const struct io_issue_def *def) { - const struct io_issue_def *def = &io_issue_defs[req->opcode]; const struct cred *creds = NULL; + struct io_kiocb *link = NULL; int ret; - if (unlikely(!io_assign_file(req, def, issue_flags))) - return -EBADF; - - if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred())) - creds = override_creds(req->creds); + if (unlikely(req->flags & REQ_ISSUE_SLOW_FLAGS)) { + if ((req->flags & REQ_F_CREDS) && req->creds != current_cred()) + creds = override_creds(req->creds); + if (req->flags & REQ_F_ARM_LTIMEOUT) + link = __io_prep_linked_timeout(req); + } if (!def->audit_skip) audit_uring_entry(req->opcode); @@ -1739,10 +1401,27 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) if (!def->audit_skip) audit_uring_exit(!ret, ret); - if (creds) - revert_creds(creds); + if (unlikely(creds || link)) { + if (creds) + revert_creds(creds); + if (link) + io_queue_linked_timeout(link); + } + + return ret; +} + +static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) +{ + const struct io_issue_def *def = &io_issue_defs[req->opcode]; + int ret; + + if (unlikely(!io_assign_file(req, def, issue_flags))) + return -EBADF; - if (ret == IOU_OK) { + ret = __io_issue_sqe(req, issue_flags, def); + + if (ret == IOU_COMPLETE) { if (issue_flags & IO_URING_F_COMPLETE_DEFER) io_req_complete_defer(req); else @@ -1753,20 +1432,30 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) if (ret == IOU_ISSUE_SKIP_COMPLETE) { ret = 0; - io_arm_ltimeout(req); - /* If the op doesn't have a file, we're not polling for it */ - if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue) + if (req->flags & REQ_F_IOPOLL) io_iopoll_req_issued(req, issue_flags); } return ret; } -int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts) +int io_poll_issue(struct io_kiocb *req, io_tw_token_t tw) { - io_tw_lock(req->ctx, ts); - return io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_MULTISHOT| - IO_URING_F_COMPLETE_DEFER); + const unsigned int issue_flags = IO_URING_F_NONBLOCK | + IO_URING_F_MULTISHOT | + IO_URING_F_COMPLETE_DEFER; + int ret; + + io_tw_lock(req->ctx, tw); + + WARN_ON_ONCE(!req->file); + if (WARN_ON_ONCE(req->flags & REQ_F_IOPOLL)) + return -EFAULT; + + ret = __io_issue_sqe(req, issue_flags, &io_issue_defs[req->opcode]); + + WARN_ON_ONCE(ret == IOU_ISSUE_SKIP_COMPLETE); + return ret; } struct io_wq_work *io_wq_free_work(struct io_wq_work *work) @@ -1774,9 +1463,14 @@ struct io_wq_work *io_wq_free_work(struct io_wq_work *work) struct io_kiocb *req = container_of(work, struct io_kiocb, work); struct io_kiocb *nxt = NULL; - if (req_ref_put_and_test(req)) { - if (req->flags & IO_REQ_LINK_FLAGS) + if (req_ref_put_and_test_atomic(req)) { + if (req->flags & IO_REQ_LINK_FLAGS) { + struct io_ring_ctx *ctx = req->ctx; + + mutex_lock(&ctx->uring_lock); nxt = io_req_find_next(req); + mutex_unlock(&ctx->uring_lock); + } io_free_req(req); } return nxt ? &nxt->work : NULL; @@ -1790,14 +1484,12 @@ void io_wq_submit_work(struct io_wq_work *work) bool needs_poll = false; int ret = 0, err = -ECANCELED; - /* one will be dropped by ->io_wq_free_work() after returning to io-wq */ + /* one will be dropped by io_wq_free_work() after returning to io-wq */ if (!(req->flags & REQ_F_REFCOUNT)) __io_req_set_refcount(req, 2); else req_ref_get(req); - io_arm_ltimeout(req); - /* either cancelled or io-wq is dying, so don't touch tctx->iowq */ if (atomic_read(&work->flags) & IO_WQ_WORK_CANCEL) { fail: @@ -1818,7 +1510,7 @@ fail: * Don't allow any multishot execution from io-wq. It's more restrictive * than necessary and also cleaner. */ - if (req->flags & REQ_F_APOLL_MULTISHOT) { + if (req->flags & (REQ_F_MULTISHOT|REQ_F_APOLL_MULTISHOT)) { err = -EBADFD; if (!io_file_can_poll(req)) goto fail; @@ -1829,7 +1521,7 @@ fail: goto fail; return; } else { - req->flags &= ~REQ_F_APOLL_MULTISHOT; + req->flags &= ~(REQ_F_APOLL_MULTISHOT|REQ_F_MULTISHOT); } } @@ -1860,7 +1552,7 @@ fail: * wait for request slots on the block side. */ if (!needs_poll) { - if (!(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (!(req->flags & REQ_F_IOPOLL)) break; if (io_wq_worker_stopped()) break; @@ -1890,7 +1582,8 @@ inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, io_ring_submit_lock(ctx, issue_flags); node = io_rsrc_node_lookup(&ctx->file_table.data, fd); if (node) { - io_req_assign_rsrc_node(&req->file_node, node); + node->refs++; + req->file_node = node; req->flags |= io_slot_flags(node); file = io_slot_file(node); } @@ -1910,48 +1603,61 @@ struct file *io_file_get_normal(struct io_kiocb *req, int fd) return file; } -static void io_queue_async(struct io_kiocb *req, int ret) - __must_hold(&req->ctx->uring_lock) +static int io_req_sqe_copy(struct io_kiocb *req, unsigned int issue_flags) { - struct io_kiocb *linked_timeout; + const struct io_cold_def *def = &io_cold_defs[req->opcode]; + + if (req->flags & REQ_F_SQE_COPIED) + return 0; + req->flags |= REQ_F_SQE_COPIED; + if (!def->sqe_copy) + return 0; + if (WARN_ON_ONCE(!(issue_flags & IO_URING_F_INLINE))) + return -EFAULT; + def->sqe_copy(req); + return 0; +} +static void io_queue_async(struct io_kiocb *req, unsigned int issue_flags, int ret) + __must_hold(&req->ctx->uring_lock) +{ if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) { +fail: io_req_defer_failed(req, ret); return; } - linked_timeout = io_prep_linked_timeout(req); + ret = io_req_sqe_copy(req, issue_flags); + if (unlikely(ret)) + goto fail; switch (io_arm_poll_handler(req, 0)) { case IO_APOLL_READY: - io_kbuf_recycle(req, 0); io_req_task_queue(req); break; case IO_APOLL_ABORTED: - io_kbuf_recycle(req, 0); io_queue_iowq(req); break; case IO_APOLL_OK: break; } - - if (linked_timeout) - io_queue_linked_timeout(linked_timeout); } -static inline void io_queue_sqe(struct io_kiocb *req) +static inline void io_queue_sqe(struct io_kiocb *req, unsigned int extra_flags) __must_hold(&req->ctx->uring_lock) { + unsigned int issue_flags = IO_URING_F_NONBLOCK | + IO_URING_F_COMPLETE_DEFER | extra_flags; int ret; - ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); + ret = io_issue_sqe(req, issue_flags); /* * We async punt it if the file wasn't marked NOWAIT, or if the file * doesn't support non-blocking read/write attempts */ if (unlikely(ret)) - io_queue_async(req, ret); + io_queue_async(req, issue_flags, ret); } static void io_queue_sqe_fallback(struct io_kiocb *req) @@ -1966,7 +1672,9 @@ static void io_queue_sqe_fallback(struct io_kiocb *req) req->flags |= REQ_F_LINK; io_req_defer_failed(req, req->cqe.res); } else { - if (unlikely(req->ctx->drain_active)) + /* can't fail with IO_URING_F_INLINE */ + io_req_sqe_copy(req, IO_URING_F_INLINE); + if (unlikely(req->ctx->int_flags & IO_RING_F_DRAIN_ACTIVE)) io_drain_req(req); else io_queue_iowq(req); @@ -1982,6 +1690,8 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx, struct io_kiocb *req, unsigned int sqe_flags) { + if (!(ctx->int_flags & IO_RING_F_OP_RESTRICTED)) + return true; if (!test_bit(req->opcode, ctx->restrictions.sqe_op)) return false; @@ -1996,12 +1706,11 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx, return true; } -static void io_init_req_drain(struct io_kiocb *req) +static void io_init_drain(struct io_ring_ctx *ctx) { - struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *head = ctx->submit_state.link.head; - ctx->drain_active = true; + ctx->int_flags |= IO_RING_F_DRAIN_ACTIVE; if (head) { /* * If we need to drain a request in the middle of a link, drain @@ -2011,7 +1720,7 @@ static void io_init_req_drain(struct io_kiocb *req) * link. */ head->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC; - ctx->drain_next = true; + ctx->int_flags |= IO_RING_F_DRAIN_NEXT; } } @@ -2023,16 +1732,15 @@ static __cold int io_init_fail_req(struct io_kiocb *req, int err) } static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, - const struct io_uring_sqe *sqe) + const struct io_uring_sqe *sqe, unsigned int *left) __must_hold(&ctx->uring_lock) { const struct io_issue_def *def; unsigned int sqe_flags; int personality; - u8 opcode; - /* req is partially pre-initialised, see io_preinit_req() */ - req->opcode = opcode = READ_ONCE(sqe->opcode); + req->ctx = ctx; + req->opcode = READ_ONCE(sqe->opcode); /* same numerical values with corresponding REQ_F_*, safe to copy */ sqe_flags = READ_ONCE(sqe->flags); req->flags = (__force io_req_flags_t) sqe_flags; @@ -2040,14 +1748,33 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->file = NULL; req->tctx = current->io_uring; req->cancel_seq_set = false; + req->async_data = NULL; - if (unlikely(opcode >= IORING_OP_LAST)) { + if (unlikely(req->opcode >= IORING_OP_LAST)) { req->opcode = 0; return io_init_fail_req(req, -EINVAL); } - opcode = array_index_nospec(opcode, IORING_OP_LAST); + req->opcode = array_index_nospec(req->opcode, IORING_OP_LAST); + + def = &io_issue_defs[req->opcode]; + if (def->is_128 && !(ctx->flags & IORING_SETUP_SQE128)) { + /* + * A 128b op on a non-128b SQ requires mixed SQE support as + * well as 2 contiguous entries. + */ + if (!(ctx->flags & IORING_SETUP_SQE_MIXED) || *left < 2 || + (unsigned)(sqe - ctx->sq_sqes) >= ctx->sq_entries - 1) + return io_init_fail_req(req, -EINVAL); + /* + * A 128b operation on a mixed SQ uses two entries, so we have + * to increment the head and cached refs, and decrement what's + * left. + */ + current->io_uring->cached_refs++; + ctx->cached_sq_head++; + (*left)--; + } - def = &io_issue_defs[opcode]; if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) { /* enforce forwards compatibility on users */ if (sqe_flags & ~SQE_VALID_FLAGS) @@ -2058,23 +1785,23 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->buf_index = READ_ONCE(sqe->buf_group); } if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS) - ctx->drain_disabled = true; + ctx->int_flags |= IO_RING_F_DRAIN_DISABLED; if (sqe_flags & IOSQE_IO_DRAIN) { - if (ctx->drain_disabled) + if (ctx->int_flags & IO_RING_F_DRAIN_DISABLED) return io_init_fail_req(req, -EOPNOTSUPP); - io_init_req_drain(req); + io_init_drain(ctx); } } - if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) { - if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags)) + if (unlikely(ctx->int_flags & (IO_RING_F_OP_RESTRICTED | IO_RING_F_DRAIN_ACTIVE | IO_RING_F_DRAIN_NEXT))) { + if (!io_check_restriction(ctx, req, sqe_flags)) return io_init_fail_req(req, -EACCES); /* knock it to the slow queue path, will be drained there */ - if (ctx->drain_active) + if (ctx->int_flags & IO_RING_F_DRAIN_ACTIVE) req->flags |= REQ_F_FORCE_ASYNC; /* if there is no link, we're at "next" request and need to drain */ - if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) { - ctx->drain_next = false; - ctx->drain_active = true; + if (unlikely(ctx->int_flags & IO_RING_F_DRAIN_NEXT) && !ctx->submit_state.link.head) { + ctx->int_flags &= ~IO_RING_F_DRAIN_NEXT; + ctx->int_flags |= IO_RING_F_DRAIN_ACTIVE; req->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC; } } @@ -2157,16 +1884,22 @@ static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe, } static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, - const struct io_uring_sqe *sqe) + const struct io_uring_sqe *sqe, unsigned int *left) __must_hold(&ctx->uring_lock) { struct io_submit_link *link = &ctx->submit_state.link; int ret; - ret = io_init_req(ctx, req, sqe); + ret = io_init_req(ctx, req, sqe, left); if (unlikely(ret)) return io_submit_fail_init(sqe, req, ret); + if (unlikely(ctx->bpf_filters)) { + ret = io_uring_run_bpf_filters(ctx->bpf_filters, req); + if (ret) + return io_submit_fail_init(sqe, req, ret); + } + trace_io_uring_submit_req(req); /* @@ -2178,6 +1911,7 @@ static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, */ if (unlikely(link->head)) { trace_io_uring_link(req, link->last); + io_req_sqe_copy(req, IO_URING_F_INLINE); link->last->link = req; link->last = req; @@ -2201,7 +1935,7 @@ fallback: return 0; } - io_queue_sqe(req); + io_queue_sqe(req, IO_URING_F_INLINE); return 0; } @@ -2237,12 +1971,16 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) { struct io_rings *rings = ctx->rings; - /* - * Ensure any loads from the SQEs are done at this point, - * since once we write the new head, the application could - * write new data to them. - */ - smp_store_release(&rings->sq.head, ctx->cached_sq_head); + if (ctx->flags & IORING_SETUP_SQ_REWIND) { + ctx->cached_sq_head = 0; + } else { + /* + * Ensure any loads from the SQEs are done at this point, + * since once we write the new head, the application could + * write new data to them. + */ + smp_store_release(&rings->sq.head, ctx->cached_sq_head); + } } /* @@ -2258,14 +1996,10 @@ static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe) unsigned mask = ctx->sq_entries - 1; unsigned head = ctx->cached_sq_head++ & mask; - if (static_branch_unlikely(&io_key_has_sqarray) && + if (static_branch_unlikely(&io_key_has_sqarray.key) && (!(ctx->flags & IORING_SETUP_NO_SQARRAY))) { head = READ_ONCE(ctx->sq_array[head]); if (unlikely(head >= ctx->sq_entries)) { - /* drop invalid entries */ - spin_lock(&ctx->completion_lock); - ctx->cq_extra--; - spin_unlock(&ctx->completion_lock); WRITE_ONCE(ctx->rings->sq_dropped, READ_ONCE(ctx->rings->sq_dropped) + 1); return false; @@ -2292,14 +2026,20 @@ static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe) int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) __must_hold(&ctx->uring_lock) { - unsigned int entries = io_sqring_entries(ctx); + unsigned int entries; unsigned int left; int ret; + if (ctx->flags & IORING_SETUP_SQ_REWIND) + entries = ctx->sq_entries; + else + entries = __io_sqring_entries(ctx); + + entries = min(nr, entries); if (unlikely(!entries)) return 0; - /* make sure SQ entry isn't read before tail */ - ret = left = min(nr, entries); + + ret = left = entries; io_get_task_refs(left); io_submit_state_start(&ctx->submit_state, left); @@ -2318,7 +2058,7 @@ int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) * Continue submitting even for sqe failure if the * ring was setup with IORING_SETUP_SUBMIT_ALL */ - if (unlikely(io_submit_sqe(ctx, req, sqe)) && + if (unlikely(io_submit_sqe(ctx, req, sqe, &left)) && !(ctx->flags & IORING_SETUP_SUBMIT_ALL)) { left--; break; @@ -2339,379 +2079,105 @@ int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) return ret; } -static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, - int wake_flags, void *key) -{ - struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, wq); - - /* - * Cannot safely flush overflowed CQEs from here, ensure we wake up - * the task, and the next invocation will do it. - */ - if (io_should_wake(iowq) || io_has_work(iowq->ctx)) - return autoremove_wake_function(curr, mode, wake_flags, key); - return -1; -} - -int io_run_task_work_sig(struct io_ring_ctx *ctx) -{ - if (io_local_work_pending(ctx)) { - __set_current_state(TASK_RUNNING); - if (io_run_local_work(ctx, INT_MAX, IO_LOCAL_TW_DEFAULT_MAX) > 0) - return 0; - } - if (io_run_task_work() > 0) - return 0; - if (task_sigpending(current)) - return -EINTR; - return 0; -} - -static bool current_pending_io(void) -{ - struct io_uring_task *tctx = current->io_uring; - - if (!tctx) - return false; - return percpu_counter_read_positive(&tctx->inflight); -} - -static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer) +static void io_rings_free(struct io_ring_ctx *ctx) { - struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t); - - WRITE_ONCE(iowq->hit_timeout, 1); - iowq->min_timeout = 0; - wake_up_process(iowq->wq.private); - return HRTIMER_NORESTART; -} - -/* - * Doing min_timeout portion. If we saw any timeouts, events, or have work, - * wake up. If not, and we have a normal timeout, switch to that and keep - * sleeping. - */ -static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer) -{ - struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t); - struct io_ring_ctx *ctx = iowq->ctx; - - /* no general timeout, or shorter (or equal), we are done */ - if (iowq->timeout == KTIME_MAX || - ktime_compare(iowq->min_timeout, iowq->timeout) >= 0) - goto out_wake; - /* work we may need to run, wake function will see if we need to wake */ - if (io_has_work(ctx)) - goto out_wake; - /* got events since we started waiting, min timeout is done */ - if (iowq->cq_min_tail != READ_ONCE(ctx->rings->cq.tail)) - goto out_wake; - /* if we have any events and min timeout expired, we're done */ - if (io_cqring_events(ctx)) - goto out_wake; - - /* - * If using deferred task_work running and application is waiting on - * more than one request, ensure we reset it now where we are switching - * to normal sleeps. Any request completion post min_wait should wake - * the task and return. - */ - if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { - atomic_set(&ctx->cq_wait_nr, 1); - smp_mb(); - if (!llist_empty(&ctx->work_llist)) - goto out_wake; - } - - iowq->t.function = io_cqring_timer_wakeup; - hrtimer_set_expires(timer, iowq->timeout); - return HRTIMER_RESTART; -out_wake: - return io_cqring_timer_wakeup(timer); + io_free_region(ctx->user, &ctx->sq_region); + io_free_region(ctx->user, &ctx->ring_region); + ctx->rings = NULL; + RCU_INIT_POINTER(ctx->rings_rcu, NULL); + ctx->sq_sqes = NULL; } -static int io_cqring_schedule_timeout(struct io_wait_queue *iowq, - clockid_t clock_id, ktime_t start_time) +static int rings_size(unsigned int flags, unsigned int sq_entries, + unsigned int cq_entries, struct io_rings_layout *rl) { - ktime_t timeout; + struct io_rings *rings; + size_t sqe_size; + size_t off; - if (iowq->min_timeout) { - timeout = ktime_add_ns(iowq->min_timeout, start_time); - hrtimer_setup_on_stack(&iowq->t, io_cqring_min_timer_wakeup, clock_id, - HRTIMER_MODE_ABS); - } else { - timeout = iowq->timeout; - hrtimer_setup_on_stack(&iowq->t, io_cqring_timer_wakeup, clock_id, - HRTIMER_MODE_ABS); + if (flags & IORING_SETUP_CQE_MIXED) { + if (cq_entries < 2) + return -EOVERFLOW; } - - hrtimer_set_expires_range_ns(&iowq->t, timeout, 0); - hrtimer_start_expires(&iowq->t, HRTIMER_MODE_ABS); - - if (!READ_ONCE(iowq->hit_timeout)) - schedule(); - - hrtimer_cancel(&iowq->t); - destroy_hrtimer_on_stack(&iowq->t); - __set_current_state(TASK_RUNNING); - - return READ_ONCE(iowq->hit_timeout) ? -ETIME : 0; -} - -static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx, - struct io_wait_queue *iowq, - ktime_t start_time) -{ - int ret = 0; - - /* - * Mark us as being in io_wait if we have pending requests, so cpufreq - * can take into account that the task is waiting for IO - turns out - * to be important for low QD IO. - */ - if (current_pending_io()) - current->in_iowait = 1; - if (iowq->timeout != KTIME_MAX || iowq->min_timeout) - ret = io_cqring_schedule_timeout(iowq, ctx->clockid, start_time); - else - schedule(); - current->in_iowait = 0; - return ret; -} - -/* If this returns > 0, the caller should retry */ -static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, - struct io_wait_queue *iowq, - ktime_t start_time) -{ - if (unlikely(READ_ONCE(ctx->check_cq))) - return 1; - if (unlikely(io_local_work_pending(ctx))) - return 1; - if (unlikely(task_work_pending(current))) - return 1; - if (unlikely(task_sigpending(current))) - return -EINTR; - if (unlikely(io_should_wake(iowq))) - return 0; - - return __io_cqring_wait_schedule(ctx, iowq, start_time); -} - -struct ext_arg { - size_t argsz; - struct timespec64 ts; - const sigset_t __user *sig; - ktime_t min_time; - bool ts_set; -}; - -/* - * Wait until events become available, if we don't already have some. The - * application must reap them itself, as they reside on the shared cq ring. - */ -static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, - struct ext_arg *ext_arg) -{ - struct io_wait_queue iowq; - struct io_rings *rings = ctx->rings; - ktime_t start_time; - int ret; - - if (!io_allowed_run_tw(ctx)) - return -EEXIST; - if (io_local_work_pending(ctx)) - io_run_local_work(ctx, min_events, - max(IO_LOCAL_TW_DEFAULT_MAX, min_events)); - io_run_task_work(); - - if (unlikely(test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))) - io_cqring_do_overflow_flush(ctx); - if (__io_cqring_events_user(ctx) >= min_events) - return 0; - - init_waitqueue_func_entry(&iowq.wq, io_wake_function); - iowq.wq.private = current; - INIT_LIST_HEAD(&iowq.wq.entry); - iowq.ctx = ctx; - iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events; - iowq.cq_min_tail = READ_ONCE(ctx->rings->cq.tail); - iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); - iowq.hit_timeout = 0; - iowq.min_timeout = ext_arg->min_time; - iowq.timeout = KTIME_MAX; - start_time = io_get_time(ctx); - - if (ext_arg->ts_set) { - iowq.timeout = timespec64_to_ktime(ext_arg->ts); - if (!(flags & IORING_ENTER_ABS_TIMER)) - iowq.timeout = ktime_add(iowq.timeout, start_time); - } - - if (ext_arg->sig) { -#ifdef CONFIG_COMPAT - if (in_compat_syscall()) - ret = set_compat_user_sigmask((const compat_sigset_t __user *)ext_arg->sig, - ext_arg->argsz); - else -#endif - ret = set_user_sigmask(ext_arg->sig, ext_arg->argsz); - - if (ret) - return ret; + if (flags & IORING_SETUP_SQE_MIXED) { + if (sq_entries < 2) + return -EOVERFLOW; } - io_napi_busy_loop(ctx, &iowq); - - trace_io_uring_cqring_wait(ctx, min_events); - do { - unsigned long check_cq; - int nr_wait; + rl->sq_array_offset = SIZE_MAX; - /* if min timeout has been hit, don't reset wait count */ - if (!iowq.hit_timeout) - nr_wait = (int) iowq.cq_tail - - READ_ONCE(ctx->rings->cq.tail); - else - nr_wait = 1; - - if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { - atomic_set(&ctx->cq_wait_nr, nr_wait); - set_current_state(TASK_INTERRUPTIBLE); - } else { - prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, - TASK_INTERRUPTIBLE); - } + sqe_size = sizeof(struct io_uring_sqe); + if (flags & IORING_SETUP_SQE128) + sqe_size *= 2; - ret = io_cqring_wait_schedule(ctx, &iowq, start_time); - __set_current_state(TASK_RUNNING); - atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT); - - /* - * Run task_work after scheduling and before io_should_wake(). - * If we got woken because of task_work being processed, run it - * now rather than let the caller do another wait loop. - */ - if (io_local_work_pending(ctx)) - io_run_local_work(ctx, nr_wait, nr_wait); - io_run_task_work(); - - /* - * Non-local task_work will be run on exit to userspace, but - * if we're using DEFER_TASKRUN, then we could have waited - * with a timeout for a number of requests. If the timeout - * hits, we could have some requests ready to process. Ensure - * this break is _after_ we have run task_work, to avoid - * deferring running potentially pending requests until the - * next time we wait for events. - */ - if (ret < 0) - break; - - check_cq = READ_ONCE(ctx->check_cq); - if (unlikely(check_cq)) { - /* let the caller flush overflows, retry */ - if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) - io_cqring_do_overflow_flush(ctx); - if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)) { - ret = -EBADR; - break; - } - } - - if (io_should_wake(&iowq)) { - ret = 0; - break; - } - cond_resched(); - } while (1); - - if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) - finish_wait(&ctx->cq_wait, &iowq.wq); - restore_saved_sigmask_unless(ret == -EINTR); - - return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0; -} - -static void io_rings_free(struct io_ring_ctx *ctx) -{ - io_free_region(ctx, &ctx->sq_region); - io_free_region(ctx, &ctx->ring_region); - ctx->rings = NULL; - ctx->sq_sqes = NULL; -} - -unsigned long rings_size(unsigned int flags, unsigned int sq_entries, - unsigned int cq_entries, size_t *sq_offset) -{ - struct io_rings *rings; - size_t off, sq_array_size; + rl->sq_size = array_size(sqe_size, sq_entries); + if (rl->sq_size == SIZE_MAX) + return -EOVERFLOW; off = struct_size(rings, cqes, cq_entries); + if (flags & IORING_SETUP_CQE32) + off = size_mul(off, 2); if (off == SIZE_MAX) - return SIZE_MAX; - if (flags & IORING_SETUP_CQE32) { - if (check_shl_overflow(off, 1, &off)) - return SIZE_MAX; - } + return -EOVERFLOW; #ifdef CONFIG_SMP off = ALIGN(off, SMP_CACHE_BYTES); if (off == 0) - return SIZE_MAX; + return -EOVERFLOW; #endif - if (flags & IORING_SETUP_NO_SQARRAY) { - *sq_offset = SIZE_MAX; - return off; - } - - *sq_offset = off; + if (!(flags & IORING_SETUP_NO_SQARRAY)) { + size_t sq_array_size; - sq_array_size = array_size(sizeof(u32), sq_entries); - if (sq_array_size == SIZE_MAX) - return SIZE_MAX; + rl->sq_array_offset = off; - if (check_add_overflow(off, sq_array_size, &off)) - return SIZE_MAX; + sq_array_size = array_size(sizeof(u32), sq_entries); + off = size_add(off, sq_array_size); + if (off == SIZE_MAX) + return -EOVERFLOW; + } - return off; + rl->rings_size = off; + return 0; } -static void io_req_caches_free(struct io_ring_ctx *ctx) +static __cold void __io_req_caches_free(struct io_ring_ctx *ctx) { struct io_kiocb *req; int nr = 0; - mutex_lock(&ctx->uring_lock); - while (!io_req_cache_empty(ctx)) { req = io_extract_req(ctx); + io_poison_req(req); kmem_cache_free(req_cachep, req); nr++; } - if (nr) + if (nr) { + ctx->nr_req_allocated -= nr; percpu_ref_put_many(&ctx->refs, nr); - mutex_unlock(&ctx->uring_lock); + } +} + +static __cold void io_req_caches_free(struct io_ring_ctx *ctx) +{ + guard(mutex)(&ctx->uring_lock); + __io_req_caches_free(ctx); } static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) { + io_unregister_bpf_ops(ctx); io_sq_thread_finish(ctx); mutex_lock(&ctx->uring_lock); io_sqe_buffers_unregister(ctx); io_sqe_files_unregister(ctx); + io_unregister_zcrx(ctx); io_cqring_overflow_kill(ctx); io_eventfd_unregister(ctx); - io_alloc_cache_free(&ctx->apoll_cache, kfree); - io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); - io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); - io_alloc_cache_free(&ctx->uring_cache, kfree); - io_alloc_cache_free(&ctx->msg_cache, kfree); - io_futex_cache_free(ctx); + io_free_alloc_caches(ctx); io_destroy_buffers(ctx); - io_free_region(ctx, &ctx->param_region); + io_free_region(ctx->user, &ctx->param_region); mutex_unlock(&ctx->uring_lock); if (ctx->sq_creds) put_cred(ctx->sq_creds); @@ -2727,11 +2193,22 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) io_rings_free(ctx); if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) - static_branch_dec(&io_key_has_sqarray); + static_branch_slow_dec_deferred(&io_key_has_sqarray); percpu_ref_exit(&ctx->refs); free_uid(ctx->user); io_req_caches_free(ctx); + + if (ctx->restrictions.bpf_filters) { + WARN_ON_ONCE(ctx->bpf_filters != + ctx->restrictions.bpf_filters->filters); + } else { + WARN_ON_ONCE(ctx->bpf_filters); + } + io_put_bpf_filters(&ctx->restrictions); + + WARN_ON_ONCE(ctx->nr_req_allocated); + if (ctx->hash_map) io_wq_put_hash(ctx->hash_map); io_napi_free(ctx); @@ -2746,7 +2223,7 @@ static __cold void io_activate_pollwq_cb(struct callback_head *cb) poll_wq_task_work); mutex_lock(&ctx->uring_lock); - ctx->poll_activated = true; + ctx->int_flags |= IO_RING_F_POLL_ACTIVATED; mutex_unlock(&ctx->uring_lock); /* @@ -2761,9 +2238,9 @@ __cold void io_activate_pollwq(struct io_ring_ctx *ctx) { spin_lock(&ctx->completion_lock); /* already activated or in progress */ - if (ctx->poll_activated || ctx->poll_wq_task_work.func) + if ((ctx->int_flags & IO_RING_F_POLL_ACTIVATED) || ctx->poll_wq_task_work.func) goto out; - if (WARN_ON_ONCE(!ctx->task_complete)) + if (WARN_ON_ONCE(!(ctx->int_flags & IO_RING_F_TASK_COMPLETE))) goto out; if (!ctx->submitter_task) goto out; @@ -2784,7 +2261,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) struct io_ring_ctx *ctx = file->private_data; __poll_t mask = 0; - if (unlikely(!ctx->poll_activated)) + if (unlikely(!(data_race(ctx->int_flags) & IO_RING_F_POLL_ACTIVATED))) io_activate_pollwq(ctx); /* * provides mb() which pairs with barrier from wq_has_sleeper @@ -2792,7 +2269,9 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) */ poll_wait(file, &ctx->poll_wq, wait); - if (!io_sqring_full(ctx)) + rcu_read_lock(); + + if (!__io_sqring_full(ctx)) mask |= EPOLLOUT | EPOLLWRNORM; /* @@ -2812,6 +2291,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) if (__io_cqring_events_user(ctx) || io_has_work(ctx)) mask |= EPOLLIN | EPOLLRDNORM; + rcu_read_unlock(); return mask; } @@ -2838,22 +2318,19 @@ static __cold void io_tctx_exit_cb(struct callback_head *cb) complete(&work->completion); } -static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data) -{ - struct io_kiocb *req = container_of(work, struct io_kiocb, work); - - return req->ctx == data; -} - static __cold void io_ring_exit_work(struct work_struct *work) { struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work); - unsigned long timeout = jiffies + HZ * 60 * 5; + unsigned long timeout = jiffies + IO_URING_EXIT_WAIT_MAX; unsigned long interval = HZ / 20; struct io_tctx_exit exit; struct io_tctx_node *node; int ret; + mutex_lock(&ctx->uring_lock); + io_terminate_zcrx(ctx); + mutex_unlock(&ctx->uring_lock); + /* * If we're doing polled IO and end up having requests being * submitted async (out-of-line), then completions can come in while @@ -2867,19 +2344,19 @@ static __cold void io_ring_exit_work(struct work_struct *work) mutex_unlock(&ctx->uring_lock); } - if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) - io_move_task_work_from_local(ctx); - /* The SQPOLL thread never reaches this path */ - while (io_uring_try_cancel_requests(ctx, NULL, true, false)) + do { + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) + io_move_task_work_from_local(ctx); cond_resched(); + } while (io_uring_try_cancel_requests(ctx, NULL, true, false)); if (ctx->sq_data) { struct io_sq_data *sqd = ctx->sq_data; struct task_struct *tsk; io_sq_thread_park(sqd); - tsk = sqd->thread; + tsk = sqpoll_task_locked(sqd); if (tsk && tsk->io_uring && tsk->io_uring->io_wq) io_wq_cancel_cb(tsk->io_uring->io_wq, io_cancel_ctx_cb, ctx, true); @@ -2910,6 +2387,7 @@ static __cold void io_ring_exit_work(struct work_struct *work) exit.ctx = ctx; mutex_lock(&ctx->uring_lock); + mutex_lock(&ctx->tctx_lock); while (!list_empty(&ctx->tctx_list)) { WARN_ON_ONCE(time_after(jiffies, timeout)); @@ -2921,6 +2399,7 @@ static __cold void io_ring_exit_work(struct work_struct *work) if (WARN_ON_ONCE(ret)) continue; + mutex_unlock(&ctx->tctx_lock); mutex_unlock(&ctx->uring_lock); /* * See comment above for @@ -2929,7 +2408,9 @@ static __cold void io_ring_exit_work(struct work_struct *work) */ wait_for_completion_interruptible(&exit.completion); mutex_lock(&ctx->uring_lock); + mutex_lock(&ctx->tctx_lock); } + mutex_unlock(&ctx->tctx_lock); mutex_unlock(&ctx->uring_lock); spin_lock(&ctx->completion_lock); spin_unlock(&ctx->completion_lock); @@ -2944,7 +2425,7 @@ static __cold void io_ring_exit_work(struct work_struct *work) static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) { unsigned long index; - struct creds *creds; + struct cred *creds; mutex_lock(&ctx->uring_lock); percpu_ref_kill(&ctx->refs); @@ -2956,10 +2437,10 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) INIT_WORK(&ctx->exit_work, io_ring_exit_work); /* - * Use system_unbound_wq to avoid spawning tons of event kworkers + * Use system_dfl_wq to avoid spawning tons of event kworkers * if we're exiting a ton of rings at the same time. It just adds * noise and overhead, there's no discernable change in runtime - * over using system_wq. + * over using system_percpu_wq. */ queue_work(iou_wq, &ctx->exit_work); } @@ -2973,231 +2454,6 @@ static int io_uring_release(struct inode *inode, struct file *file) return 0; } -struct io_task_cancel { - struct io_uring_task *tctx; - bool all; -}; - -static bool io_cancel_task_cb(struct io_wq_work *work, void *data) -{ - struct io_kiocb *req = container_of(work, struct io_kiocb, work); - struct io_task_cancel *cancel = data; - - return io_match_task_safe(req, cancel->tctx, cancel->all); -} - -static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx, - struct io_uring_task *tctx, - bool cancel_all) -{ - struct io_defer_entry *de; - LIST_HEAD(list); - - spin_lock(&ctx->completion_lock); - list_for_each_entry_reverse(de, &ctx->defer_list, list) { - if (io_match_task_safe(de->req, tctx, cancel_all)) { - list_cut_position(&list, &ctx->defer_list, &de->list); - break; - } - } - spin_unlock(&ctx->completion_lock); - if (list_empty(&list)) - return false; - - while (!list_empty(&list)) { - de = list_first_entry(&list, struct io_defer_entry, list); - list_del_init(&de->list); - io_req_task_queue_fail(de->req, -ECANCELED); - kfree(de); - } - return true; -} - -static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) -{ - struct io_tctx_node *node; - enum io_wq_cancel cret; - bool ret = false; - - mutex_lock(&ctx->uring_lock); - list_for_each_entry(node, &ctx->tctx_list, ctx_node) { - struct io_uring_task *tctx = node->task->io_uring; - - /* - * io_wq will stay alive while we hold uring_lock, because it's - * killed after ctx nodes, which requires to take the lock. - */ - if (!tctx || !tctx->io_wq) - continue; - cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true); - ret |= (cret != IO_WQ_CANCEL_NOTFOUND); - } - mutex_unlock(&ctx->uring_lock); - - return ret; -} - -static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, - struct io_uring_task *tctx, - bool cancel_all, - bool is_sqpoll_thread) -{ - struct io_task_cancel cancel = { .tctx = tctx, .all = cancel_all, }; - enum io_wq_cancel cret; - bool ret = false; - - /* set it so io_req_local_work_add() would wake us up */ - if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { - atomic_set(&ctx->cq_wait_nr, 1); - smp_mb(); - } - - /* failed during ring init, it couldn't have issued any requests */ - if (!ctx->rings) - return false; - - if (!tctx) { - ret |= io_uring_try_cancel_iowq(ctx); - } else if (tctx->io_wq) { - /* - * Cancels requests of all rings, not only @ctx, but - * it's fine as the task is in exit/exec. - */ - cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb, - &cancel, true); - ret |= (cret != IO_WQ_CANCEL_NOTFOUND); - } - - /* SQPOLL thread does its own polling */ - if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) || - is_sqpoll_thread) { - while (!wq_list_empty(&ctx->iopoll_list)) { - io_iopoll_try_reap_events(ctx); - ret = true; - cond_resched(); - } - } - - if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && - io_allowed_defer_tw_run(ctx)) - ret |= io_run_local_work(ctx, INT_MAX, INT_MAX) > 0; - ret |= io_cancel_defer_files(ctx, tctx, cancel_all); - mutex_lock(&ctx->uring_lock); - ret |= io_poll_remove_all(ctx, tctx, cancel_all); - ret |= io_waitid_remove_all(ctx, tctx, cancel_all); - ret |= io_futex_remove_all(ctx, tctx, cancel_all); - ret |= io_uring_try_cancel_uring_cmd(ctx, tctx, cancel_all); - mutex_unlock(&ctx->uring_lock); - ret |= io_kill_timeouts(ctx, tctx, cancel_all); - if (tctx) - ret |= io_run_task_work() > 0; - else - ret |= flush_delayed_work(&ctx->fallback_work); - return ret; -} - -static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked) -{ - if (tracked) - return atomic_read(&tctx->inflight_tracked); - return percpu_counter_sum(&tctx->inflight); -} - -/* - * Find any io_uring ctx that this task has registered or done IO on, and cancel - * requests. @sqd should be not-null IFF it's an SQPOLL thread cancellation. - */ -__cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) -{ - struct io_uring_task *tctx = current->io_uring; - struct io_ring_ctx *ctx; - struct io_tctx_node *node; - unsigned long index; - s64 inflight; - DEFINE_WAIT(wait); - - WARN_ON_ONCE(sqd && sqd->thread != current); - - if (!current->io_uring) - return; - if (tctx->io_wq) - io_wq_exit_start(tctx->io_wq); - - atomic_inc(&tctx->in_cancel); - do { - bool loop = false; - - io_uring_drop_tctx_refs(current); - if (!tctx_inflight(tctx, !cancel_all)) - break; - - /* read completions before cancelations */ - inflight = tctx_inflight(tctx, false); - if (!inflight) - break; - - if (!sqd) { - xa_for_each(&tctx->xa, index, node) { - /* sqpoll task will cancel all its requests */ - if (node->ctx->sq_data) - continue; - loop |= io_uring_try_cancel_requests(node->ctx, - current->io_uring, - cancel_all, - false); - } - } else { - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) - loop |= io_uring_try_cancel_requests(ctx, - current->io_uring, - cancel_all, - true); - } - - if (loop) { - cond_resched(); - continue; - } - - prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE); - io_run_task_work(); - io_uring_drop_tctx_refs(current); - xa_for_each(&tctx->xa, index, node) { - if (io_local_work_pending(node->ctx)) { - WARN_ON_ONCE(node->ctx->submitter_task && - node->ctx->submitter_task != current); - goto end_wait; - } - } - /* - * If we've seen completions, retry without waiting. This - * avoids a race where a completion comes in before we did - * prepare_to_wait(). - */ - if (inflight == tctx_inflight(tctx, !cancel_all)) - schedule(); -end_wait: - finish_wait(&tctx->wait, &wait); - } while (1); - - io_uring_clean_tctx(tctx); - if (cancel_all) { - /* - * We shouldn't run task_works after cancel, so just leave - * ->in_cancel set for normal exit. - */ - atomic_dec(&tctx->in_cancel); - /* for exec all current's requests should be gone, kill tctx */ - __io_uring_free(current); - } -} - -void __io_uring_cancel(bool cancel_all) -{ - io_uring_unreg_ringfd(); - io_uring_cancel_generic(cancel_all, NULL); -} - static struct io_uring_reg_wait *io_get_ext_arg_reg(struct io_ring_ctx *ctx, const struct io_uring_getevents_arg __user *uarg) { @@ -3239,6 +2495,8 @@ static int io_get_ext_arg(struct io_ring_ctx *ctx, unsigned flags, const struct io_uring_getevents_arg __user *uarg = argp; struct io_uring_getevents_arg arg; + ext_arg->iowait = !(flags & IORING_ENTER_NO_IOWAIT); + /* * If EXT_ARG isn't set, then we have no timespec and the argp pointer * is just a pointer to the sigset_t. @@ -3304,48 +2562,69 @@ uaccess_end: #endif } -SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, - u32, min_complete, u32, flags, const void __user *, argp, - size_t, argsz) +/* + * Given an 'fd' value, return the ctx associated with if. If 'registered' is + * true, then the registered index is used. Otherwise, the normal fd table. + * Caller must call fput() on the returned file if it isn't a registered file, + * unless it's an ERR_PTR. + */ +struct file *io_uring_ctx_get_file(unsigned int fd, bool registered) { - struct io_ring_ctx *ctx; struct file *file; - long ret; - if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | - IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG | - IORING_ENTER_REGISTERED_RING | - IORING_ENTER_ABS_TIMER | - IORING_ENTER_EXT_ARG_REG))) - return -EINVAL; - - /* - * Ring fd has been registered via IORING_REGISTER_RING_FDS, we - * need only dereference our task private array to find it. - */ - if (flags & IORING_ENTER_REGISTERED_RING) { + if (registered) { + /* + * Ring fd has been registered via IORING_REGISTER_RING_FDS, we + * need only dereference our task private array to find it. + */ struct io_uring_task *tctx = current->io_uring; if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX)) - return -EINVAL; + return ERR_PTR(-EINVAL); fd = array_index_nospec(fd, IO_RINGFD_REG_MAX); file = tctx->registered_rings[fd]; - if (unlikely(!file)) - return -EBADF; } else { file = fget(fd); - if (unlikely(!file)) - return -EBADF; - ret = -EOPNOTSUPP; - if (unlikely(!io_is_uring_fops(file))) - goto out; } + if (unlikely(!file)) + return ERR_PTR(-EBADF); + if (io_is_uring_fops(file)) + return file; + if (!registered) + fput(file); + return ERR_PTR(-EOPNOTSUPP); +} + + +SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, + u32, min_complete, u32, flags, const void __user *, argp, + size_t, argsz) +{ + struct io_ring_ctx *ctx; + struct file *file; + long ret; + + if (unlikely(flags & ~IORING_ENTER_FLAGS)) + return -EINVAL; + + file = io_uring_ctx_get_file(fd, flags & IORING_ENTER_REGISTERED_RING); + if (IS_ERR(file)) + return PTR_ERR(file); ctx = file->private_data; ret = -EBADFD; - if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED)) + /* + * Keep IORING_SETUP_R_DISABLED check before submitter_task load + * in io_uring_add_tctx_node() -> __io_uring_add_tctx_node_from_submit() + */ + if (unlikely(smp_load_acquire(&ctx->flags) & IORING_SETUP_R_DISABLED)) goto out; + if (io_has_loop_ops(ctx)) { + ret = io_run_loop(ctx); + goto out; + } + /* * For SQ polling, the thread will do all submissions and completions. * Just return the requested submit count, and wake the thread if @@ -3375,7 +2654,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, goto out; } if (flags & IORING_ENTER_GETEVENTS) { - if (ctx->syscall_iopoll) + if (ctx->int_flags & IO_RING_F_SYSCALL_IOPOLL) goto iopoll_locked; /* * Ignore errors, we'll soon call io_cqring_wait() and @@ -3390,7 +2669,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, if (flags & IORING_ENTER_GETEVENTS) { int ret2; - if (ctx->syscall_iopoll) { + if (ctx->int_flags & IO_RING_F_SYSCALL_IOPOLL) { /* * We disallow the app entering submit/complete with * polling, but we still need to lock the ring to @@ -3400,22 +2679,16 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, mutex_lock(&ctx->uring_lock); iopoll_locked: ret2 = io_validate_ext_arg(ctx, flags, argp, argsz); - if (likely(!ret2)) { - min_complete = min(min_complete, - ctx->cq_entries); + if (likely(!ret2)) ret2 = io_iopoll_check(ctx, min_complete); - } mutex_unlock(&ctx->uring_lock); } else { struct ext_arg ext_arg = { .argsz = argsz }; ret2 = io_get_ext_arg(ctx, flags, argp, &ext_arg); - if (likely(!ret2)) { - min_complete = min(min_complete, - ctx->cq_entries); + if (likely(!ret2)) ret2 = io_cqring_wait(ctx, min_complete, flags, &ext_arg); - } } if (!ret) { @@ -3456,24 +2729,20 @@ bool io_is_uring_fops(struct file *file) } static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, - struct io_uring_params *p) + struct io_ctx_config *config) { + struct io_uring_params *p = &config->p; + struct io_rings_layout *rl = &config->layout; struct io_uring_region_desc rd; struct io_rings *rings; - size_t size, sq_array_offset; int ret; /* make sure these are sane, as we already accounted them */ ctx->sq_entries = p->sq_entries; ctx->cq_entries = p->cq_entries; - size = rings_size(ctx->flags, p->sq_entries, p->cq_entries, - &sq_array_offset); - if (size == SIZE_MAX) - return -EOVERFLOW; - memset(&rd, 0, sizeof(rd)); - rd.size = PAGE_ALIGN(size); + rd.size = PAGE_ALIGN(rl->rings_size); if (ctx->flags & IORING_SETUP_NO_MMAP) { rd.user_addr = p->cq_off.user_addr; rd.flags |= IORING_MEM_REGION_TYPE_USER; @@ -3482,25 +2751,12 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, if (ret) return ret; ctx->rings = rings = io_region_get_ptr(&ctx->ring_region); - + rcu_assign_pointer(ctx->rings_rcu, rings); if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) - ctx->sq_array = (u32 *)((char *)rings + sq_array_offset); - rings->sq_ring_mask = p->sq_entries - 1; - rings->cq_ring_mask = p->cq_entries - 1; - rings->sq_ring_entries = p->sq_entries; - rings->cq_ring_entries = p->cq_entries; - - if (p->flags & IORING_SETUP_SQE128) - size = array_size(2 * sizeof(struct io_uring_sqe), p->sq_entries); - else - size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); - if (size == SIZE_MAX) { - io_rings_free(ctx); - return -EOVERFLOW; - } + ctx->sq_array = (u32 *)((char *)rings + rl->sq_array_offset); memset(&rd, 0, sizeof(rd)); - rd.size = PAGE_ALIGN(size); + rd.size = PAGE_ALIGN(rl->sq_size); if (ctx->flags & IORING_SETUP_NO_MMAP) { rd.user_addr = p->sq_off.user_addr; rd.flags |= IORING_MEM_REGION_TYPE_USER; @@ -3511,6 +2767,12 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, return ret; } ctx->sq_sqes = io_region_get_ptr(&ctx->sq_region); + + memset(rings, 0, sizeof(*rings)); + WRITE_ONCE(rings->sq_ring_mask, ctx->sq_entries - 1); + WRITE_ONCE(rings->cq_ring_mask, ctx->cq_entries - 1); + WRITE_ONCE(rings->sq_ring_entries, ctx->sq_entries); + WRITE_ONCE(rings->cq_ring_entries, ctx->cq_entries); return 0; } @@ -3537,8 +2799,72 @@ static struct file *io_uring_get_file(struct io_ring_ctx *ctx) O_RDWR | O_CLOEXEC, NULL); } -int io_uring_fill_params(unsigned entries, struct io_uring_params *p) +static int io_uring_sanitise_params(struct io_uring_params *p) { + unsigned flags = p->flags; + + if (flags & ~IORING_SETUP_FLAGS) + return -EINVAL; + + if (flags & IORING_SETUP_SQ_REWIND) { + if ((flags & IORING_SETUP_SQPOLL) || + !(flags & IORING_SETUP_NO_SQARRAY)) + return -EINVAL; + } + + /* There is no way to mmap rings without a real fd */ + if ((flags & IORING_SETUP_REGISTERED_FD_ONLY) && + !(flags & IORING_SETUP_NO_MMAP)) + return -EINVAL; + + if (flags & IORING_SETUP_SQPOLL) { + /* IPI related flags don't make sense with SQPOLL */ + if (flags & (IORING_SETUP_COOP_TASKRUN | + IORING_SETUP_TASKRUN_FLAG | + IORING_SETUP_DEFER_TASKRUN)) + return -EINVAL; + } + + if (flags & IORING_SETUP_TASKRUN_FLAG) { + if (!(flags & (IORING_SETUP_COOP_TASKRUN | + IORING_SETUP_DEFER_TASKRUN))) + return -EINVAL; + } + + /* HYBRID_IOPOLL only valid with IOPOLL */ + if ((flags & IORING_SETUP_HYBRID_IOPOLL) && !(flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + + /* + * For DEFER_TASKRUN we require the completion task to be the same as + * the submission task. This implies that there is only one submitter. + */ + if ((flags & IORING_SETUP_DEFER_TASKRUN) && + !(flags & IORING_SETUP_SINGLE_ISSUER)) + return -EINVAL; + + /* + * Nonsensical to ask for CQE32 and mixed CQE support, it's not + * supported to post 16b CQEs on a ring setup with CQE32. + */ + if ((flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)) == + (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)) + return -EINVAL; + /* + * Nonsensical to ask for SQE128 and mixed SQE support, it's not + * supported to post 64b SQEs on a ring setup with SQE128. + */ + if ((flags & (IORING_SETUP_SQE128|IORING_SETUP_SQE_MIXED)) == + (IORING_SETUP_SQE128|IORING_SETUP_SQE_MIXED)) + return -EINVAL; + + return 0; +} + +static int io_uring_fill_params(struct io_uring_params *p) +{ + unsigned entries = p->sq_entries; + if (!entries) return -EINVAL; if (entries > IORING_MAX_ENTRIES) { @@ -3547,10 +2873,6 @@ int io_uring_fill_params(unsigned entries, struct io_uring_params *p) entries = IORING_MAX_ENTRIES; } - if ((p->flags & IORING_SETUP_REGISTERED_FD_ONLY) - && !(p->flags & IORING_SETUP_NO_MMAP)) - return -EINVAL; - /* * Use twice as many entries for the CQ ring. It's possible for the * application to drive a higher depth than the size of the SQ ring, @@ -3580,6 +2902,27 @@ int io_uring_fill_params(unsigned entries, struct io_uring_params *p) p->cq_entries = 2 * p->sq_entries; } + return 0; +} + +int io_prepare_config(struct io_ctx_config *config) +{ + struct io_uring_params *p = &config->p; + int ret; + + ret = io_uring_sanitise_params(p); + if (ret) + return ret; + + ret = io_uring_fill_params(p); + if (ret) + return ret; + + ret = rings_size(p->flags, p->sq_entries, p->cq_entries, + &config->layout); + if (ret) + return ret; + p->sq_off.head = offsetof(struct io_rings, sq.head); p->sq_off.tail = offsetof(struct io_rings, sq.tail); p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask); @@ -3600,20 +2943,48 @@ int io_uring_fill_params(unsigned entries, struct io_uring_params *p) p->cq_off.resv1 = 0; if (!(p->flags & IORING_SETUP_NO_MMAP)) p->cq_off.user_addr = 0; + if (!(p->flags & IORING_SETUP_NO_SQARRAY)) + p->sq_off.array = config->layout.sq_array_offset; return 0; } -static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, - struct io_uring_params __user *params) +void io_restriction_clone(struct io_restriction *dst, struct io_restriction *src) +{ + memcpy(&dst->register_op, &src->register_op, sizeof(dst->register_op)); + memcpy(&dst->sqe_op, &src->sqe_op, sizeof(dst->sqe_op)); + dst->sqe_flags_allowed = src->sqe_flags_allowed; + dst->sqe_flags_required = src->sqe_flags_required; + dst->op_registered = src->op_registered; + dst->reg_registered = src->reg_registered; + + io_bpf_filter_clone(dst, src); +} + +static void io_ctx_restriction_clone(struct io_ring_ctx *ctx, + struct io_restriction *src) { + struct io_restriction *dst = &ctx->restrictions; + + io_restriction_clone(dst, src); + if (dst->bpf_filters) + WRITE_ONCE(ctx->bpf_filters, dst->bpf_filters->filters); + if (dst->op_registered) + ctx->int_flags |= IO_RING_F_OP_RESTRICTED; + if (dst->reg_registered) + ctx->int_flags |= IO_RING_F_REG_RESTRICTED; +} + +static __cold int io_uring_create(struct io_ctx_config *config) +{ + struct io_uring_params *p = &config->p; struct io_ring_ctx *ctx; struct io_uring_task *tctx; struct file *file; int ret; - ret = io_uring_fill_params(entries, p); - if (unlikely(ret)) + ret = io_prepare_config(config); + if (ret) return ret; ctx = io_ring_ctx_alloc(p); @@ -3624,22 +2995,22 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, ctx->clock_offset = 0; if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) - static_branch_inc(&io_key_has_sqarray); + static_branch_deferred_inc(&io_key_has_sqarray); if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && - !(ctx->flags & IORING_SETUP_IOPOLL) && - !(ctx->flags & IORING_SETUP_SQPOLL)) - ctx->task_complete = true; + !(ctx->flags & IORING_SETUP_IOPOLL)) + ctx->int_flags |= IO_RING_F_TASK_COMPLETE; - if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL)) - ctx->lockless_cq = true; + if ((ctx->int_flags & IO_RING_F_TASK_COMPLETE) || + (ctx->flags & IORING_SETUP_IOPOLL)) + ctx->int_flags |= IO_RING_F_LOCKLESS_CQ; /* * lazy poll_wq activation relies on ->task_complete for synchronisation * purposes, see io_activate_pollwq() */ - if (!ctx->task_complete) - ctx->poll_activated = true; + if (!(ctx->int_flags & IO_RING_F_TASK_COMPLETE)) + ctx->int_flags |= IO_RING_F_POLL_ACTIVATED; /* * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user @@ -3649,9 +3020,10 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, */ if (ctx->flags & IORING_SETUP_IOPOLL && !(ctx->flags & IORING_SETUP_SQPOLL)) - ctx->syscall_iopoll = 1; + ctx->int_flags |= IO_RING_F_SYSCALL_IOPOLL; - ctx->compat = in_compat_syscall(); + if (in_compat_syscall()) + ctx->int_flags |= IO_RING_F_COMPAT; if (!ns_capable_noaudit(&init_user_ns, CAP_IPC_LOCK)) ctx->user = get_uid(current_user()); @@ -3659,37 +3031,17 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, * For SQPOLL, we just need a wakeup, always. For !SQPOLL, if * COOP_TASKRUN is set, then IPIs are never needed by the app. */ - ret = -EINVAL; - if (ctx->flags & IORING_SETUP_SQPOLL) { - /* IPI related flags don't make sense with SQPOLL */ - if (ctx->flags & (IORING_SETUP_COOP_TASKRUN | - IORING_SETUP_TASKRUN_FLAG | - IORING_SETUP_DEFER_TASKRUN)) - goto err; + if (ctx->flags & (IORING_SETUP_SQPOLL|IORING_SETUP_COOP_TASKRUN)) ctx->notify_method = TWA_SIGNAL_NO_IPI; - } else if (ctx->flags & IORING_SETUP_COOP_TASKRUN) { - ctx->notify_method = TWA_SIGNAL_NO_IPI; - } else { - if (ctx->flags & IORING_SETUP_TASKRUN_FLAG && - !(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) - goto err; + else ctx->notify_method = TWA_SIGNAL; - } - - /* HYBRID_IOPOLL only valid with IOPOLL */ - if ((ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_HYBRID_IOPOLL)) == - IORING_SETUP_HYBRID_IOPOLL) - goto err; /* - * For DEFER_TASKRUN we require the completion task to be the same as the - * submission task. This implies that there is only one submitter, so enforce - * that. + * If the current task has restrictions enabled, then copy them to + * our newly created ring and mark it as registered. */ - if (ctx->flags & IORING_SETUP_DEFER_TASKRUN && - !(ctx->flags & IORING_SETUP_SINGLE_ISSUER)) { - goto err; - } + if (current->io_uring_restrict) + io_ctx_restriction_clone(ctx, current->io_uring_restrict); /* * This is just grabbed for accounting purposes. When a process exits, @@ -3700,35 +3052,24 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, mmgrab(current->mm); ctx->mm_account = current->mm; - ret = io_allocate_scq_urings(ctx, p); + ret = io_allocate_scq_urings(ctx, config); if (ret) goto err; - if (!(p->flags & IORING_SETUP_NO_SQARRAY)) - p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings; - ret = io_sq_offload_create(ctx, p); if (ret) goto err; - p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | - IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | - IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | - IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED | - IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS | - IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP | - IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING | - IORING_FEAT_RECVSEND_BUNDLE | IORING_FEAT_MIN_TIMEOUT | - IORING_FEAT_RW_ATTR; - - if (copy_to_user(params, p, sizeof(*p))) { + p->features = IORING_FEAT_FLAGS; + + if (copy_to_user(config->uptr, p, sizeof(*p))) { ret = -EFAULT; goto err; } if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !(ctx->flags & IORING_SETUP_R_DISABLED)) - WRITE_ONCE(ctx->submitter_task, get_task_struct(current)); + ctx->submitter_task = get_task_struct(current); file = io_uring_get_file(ctx); if (IS_ERR(file)) { @@ -3769,53 +3110,51 @@ err_fput: */ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) { - struct io_uring_params p; - int i; + struct io_ctx_config config; - if (copy_from_user(&p, params, sizeof(p))) + memset(&config, 0, sizeof(config)); + + if (copy_from_user(&config.p, params, sizeof(config.p))) return -EFAULT; - for (i = 0; i < ARRAY_SIZE(p.resv); i++) { - if (p.resv[i]) - return -EINVAL; - } - if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL | - IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE | - IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ | - IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL | - IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG | - IORING_SETUP_SQE128 | IORING_SETUP_CQE32 | - IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN | - IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY | - IORING_SETUP_NO_SQARRAY | IORING_SETUP_HYBRID_IOPOLL)) + if (!mem_is_zero(&config.p.resv, sizeof(config.p.resv))) return -EINVAL; - return io_uring_create(entries, &p, params); + config.p.sq_entries = entries; + config.uptr = params; + return io_uring_create(&config); } -static inline bool io_uring_allowed(void) +static inline int io_uring_allowed(void) { int disabled = READ_ONCE(sysctl_io_uring_disabled); kgid_t io_uring_group; if (disabled == 2) - return false; + return -EPERM; if (disabled == 0 || capable(CAP_SYS_ADMIN)) - return true; + goto allowed_lsm; io_uring_group = make_kgid(&init_user_ns, sysctl_io_uring_group); if (!gid_valid(io_uring_group)) - return false; + return -EPERM; + + if (!in_group_p(io_uring_group)) + return -EPERM; - return in_group_p(io_uring_group); +allowed_lsm: + return security_uring_allowed(); } SYSCALL_DEFINE2(io_uring_setup, u32, entries, struct io_uring_params __user *, params) { - if (!io_uring_allowed()) - return -EPERM; + int ret; + + ret = io_uring_allowed(); + if (ret) + return ret; return io_uring_setup(entries, params); } @@ -3877,6 +3216,8 @@ static int __init io_uring_init(void) BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in); BUILD_BUG_SQE_ELEM(44, __u32, file_index); BUILD_BUG_SQE_ELEM(44, __u16, addr_len); + BUILD_BUG_SQE_ELEM(44, __u8, write_stream); + BUILD_BUG_SQE_ELEM(45, __u8, __pad4[0]); BUILD_BUG_SQE_ELEM(46, __u16, __pad3[0]); BUILD_BUG_SQE_ELEM(48, __u64, addr3); BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd); @@ -3908,6 +3249,9 @@ static int __init io_uring_init(void) io_uring_optable_init(); + /* imu->dir is u8 */ + BUILD_BUG_ON((IO_IMU_DEST | IO_IMU_SOURCE) > U8_MAX); + /* * Allow user copy in the per-command field, which starts after the * file in io_kiocb and until the opcode field. The openat2 handling @@ -3918,10 +3262,9 @@ static int __init io_uring_init(void) req_cachep = kmem_cache_create("io_kiocb", sizeof(struct io_kiocb), &kmem_args, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU); - io_buf_cachep = KMEM_CACHE(io_buffer, - SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); iou_wq = alloc_workqueue("iou_exit", WQ_UNBOUND, 64); + BUG_ON(!iou_wq); #ifdef CONFIG_SYSCTL register_sysctl_init("kernel", kernel_io_uring_disabled_table); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index ab619e63ef39..e612a66ee80e 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -1,40 +1,128 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef IOU_CORE_H #define IOU_CORE_H #include <linux/errno.h> #include <linux/lockdep.h> #include <linux/resume_user_mode.h> -#include <linux/kasan.h> #include <linux/poll.h> #include <linux/io_uring_types.h> #include <uapi/linux/eventpoll.h> #include "alloc_cache.h" #include "io-wq.h" #include "slist.h" -#include "filetable.h" +#include "tw.h" #include "opdef.h" #ifndef CREATE_TRACE_POINTS #include <trace/events/io_uring.h> #endif +struct io_rings_layout { + /* size of CQ + headers + SQ offset array */ + size_t rings_size; + size_t sq_size; + + size_t sq_array_offset; +}; + +struct io_ctx_config { + struct io_uring_params p; + struct io_rings_layout layout; + struct io_uring_params __user *uptr; +}; + +#define IORING_FEAT_FLAGS (IORING_FEAT_SINGLE_MMAP |\ + IORING_FEAT_NODROP |\ + IORING_FEAT_SUBMIT_STABLE |\ + IORING_FEAT_RW_CUR_POS |\ + IORING_FEAT_CUR_PERSONALITY |\ + IORING_FEAT_FAST_POLL |\ + IORING_FEAT_POLL_32BITS |\ + IORING_FEAT_SQPOLL_NONFIXED |\ + IORING_FEAT_EXT_ARG |\ + IORING_FEAT_NATIVE_WORKERS |\ + IORING_FEAT_RSRC_TAGS |\ + IORING_FEAT_CQE_SKIP |\ + IORING_FEAT_LINKED_FILE |\ + IORING_FEAT_REG_REG_RING |\ + IORING_FEAT_RECVSEND_BUNDLE |\ + IORING_FEAT_MIN_TIMEOUT |\ + IORING_FEAT_RW_ATTR |\ + IORING_FEAT_NO_IOWAIT) + +#define IORING_SETUP_FLAGS (IORING_SETUP_IOPOLL |\ + IORING_SETUP_SQPOLL |\ + IORING_SETUP_SQ_AFF |\ + IORING_SETUP_CQSIZE |\ + IORING_SETUP_CLAMP |\ + IORING_SETUP_ATTACH_WQ |\ + IORING_SETUP_R_DISABLED |\ + IORING_SETUP_SUBMIT_ALL |\ + IORING_SETUP_COOP_TASKRUN |\ + IORING_SETUP_TASKRUN_FLAG |\ + IORING_SETUP_SQE128 |\ + IORING_SETUP_CQE32 |\ + IORING_SETUP_SINGLE_ISSUER |\ + IORING_SETUP_DEFER_TASKRUN |\ + IORING_SETUP_NO_MMAP |\ + IORING_SETUP_REGISTERED_FD_ONLY |\ + IORING_SETUP_NO_SQARRAY |\ + IORING_SETUP_HYBRID_IOPOLL |\ + IORING_SETUP_CQE_MIXED |\ + IORING_SETUP_SQE_MIXED |\ + IORING_SETUP_SQ_REWIND) + +#define IORING_ENTER_FLAGS (IORING_ENTER_GETEVENTS |\ + IORING_ENTER_SQ_WAKEUP |\ + IORING_ENTER_SQ_WAIT |\ + IORING_ENTER_EXT_ARG |\ + IORING_ENTER_REGISTERED_RING |\ + IORING_ENTER_ABS_TIMER |\ + IORING_ENTER_EXT_ARG_REG |\ + IORING_ENTER_NO_IOWAIT) + + +#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE |\ + IOSQE_IO_DRAIN |\ + IOSQE_IO_LINK |\ + IOSQE_IO_HARDLINK |\ + IOSQE_ASYNC |\ + IOSQE_BUFFER_SELECT |\ + IOSQE_CQE_SKIP_SUCCESS) + +#define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) + +/* + * Complaint timeout for io_uring cancelation exits, and for io-wq exit + * worker waiting. + */ +#define IO_URING_EXIT_WAIT_MAX (HZ * 60 * 5) + enum { - IOU_OK = 0, + IOU_COMPLETE = 0, + IOU_ISSUE_SKIP_COMPLETE = -EIOCBQUEUED, /* + * The request has more work to do and should be retried. io_uring will + * attempt to wait on the file for eligible opcodes, but otherwise + * it'll be handed to iowq for blocking execution. It works for normal + * requests as well as for the multi shot mode. + */ + IOU_RETRY = -EAGAIN, + + /* * Requeue the task_work to restart operations on this request. The * actual value isn't important, should just be not an otherwise * valid error code, yet less than -MAX_ERRNO and valid internally. */ IOU_REQUEUE = -3072, +}; - /* - * Intended only when both IO_URING_F_MULTISHOT is passed - * to indicate to the poll runner that multishot should be - * removed and the result is set on req->cqe.res. - */ - IOU_STOP_MULTISHOT = -ECANCELED, +struct io_defer_entry { + struct list_head list; + struct io_kiocb *req; }; struct io_wait_queue { @@ -54,59 +142,65 @@ struct io_wait_queue { #endif }; +static inline struct io_rings *io_get_rings(struct io_ring_ctx *ctx) +{ + return rcu_dereference_check(ctx->rings_rcu, + lockdep_is_held(&ctx->uring_lock) || + lockdep_is_held(&ctx->completion_lock)); +} + static inline bool io_should_wake(struct io_wait_queue *iowq) { struct io_ring_ctx *ctx = iowq->ctx; - int dist = READ_ONCE(ctx->rings->cq.tail) - (int) iowq->cq_tail; + struct io_rings *rings; + int dist; + + guard(rcu)(); + rings = io_get_rings(ctx); /* * Wake up if we have enough events, or if a timeout occurred since we * started waiting. For timeouts, we always want to return to userspace, * regardless of event count. */ + dist = READ_ONCE(rings->cq.tail) - (int) iowq->cq_tail; return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts; } #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) -unsigned long rings_size(unsigned int flags, unsigned int sq_entries, - unsigned int cq_entries, size_t *sq_offset); -int io_uring_fill_params(unsigned entries, struct io_uring_params *p); -bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow); -int io_run_task_work_sig(struct io_ring_ctx *ctx); +int io_prepare_config(struct io_ctx_config *config); + +bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32); void io_req_defer_failed(struct io_kiocb *req, s32 res); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags); +bool io_req_post_cqe32(struct io_kiocb *req, struct io_uring_cqe src_cqe[2]); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); +unsigned io_linked_nr(struct io_kiocb *req); +void io_req_track_inflight(struct io_kiocb *req); struct file *io_file_get_normal(struct io_kiocb *req, int fd); struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned issue_flags); +struct file *io_uring_ctx_get_file(unsigned int fd, bool registered); -void __io_req_task_work_add(struct io_kiocb *req, unsigned flags); -void io_req_task_work_add_remote(struct io_kiocb *req, struct io_ring_ctx *ctx, - unsigned flags); -bool io_alloc_async_data(struct io_kiocb *req); void io_req_task_queue(struct io_kiocb *req); -void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts); +void io_req_task_complete(struct io_tw_req tw_req, io_tw_token_t tw); void io_req_task_queue_fail(struct io_kiocb *req, int ret); -void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts); -struct llist_node *io_handle_tw_list(struct llist_node *node, unsigned int *count, unsigned int max_entries); -struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, unsigned int max_entries, unsigned int *count); -void tctx_task_work(struct callback_head *cb); -__cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); -int io_uring_alloc_task_context(struct task_struct *task, - struct io_ring_ctx *ctx); +void io_req_task_submit(struct io_tw_req tw_req, io_tw_token_t tw); +__cold void io_uring_drop_tctx_refs(struct task_struct *task); int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file, int start, int end); void io_req_queue_iowq(struct io_kiocb *req); -int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts); +int io_poll_issue(struct io_kiocb *req, io_tw_token_t tw); int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr); int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin); +__cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx); void __io_submit_flush_completions(struct io_ring_ctx *ctx); struct io_wq_work *io_wq_free_work(struct io_wq_work *work); @@ -117,10 +211,8 @@ void io_queue_next(struct io_kiocb *req); void io_task_refs_refill(struct io_uring_task *tctx); bool __io_alloc_req_refill(struct io_ring_ctx *ctx); -bool io_match_task_safe(struct io_kiocb *head, struct io_uring_task *tctx, - bool cancel_all); - void io_activate_pollwq(struct io_ring_ctx *ctx); +void io_restriction_clone(struct io_restriction *dst, struct io_restriction *src); static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) { @@ -132,7 +224,7 @@ static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) if (ctx->flags & IORING_SETUP_IOPOLL) { lockdep_assert_held(&ctx->uring_lock); - } else if (!ctx->task_complete) { + } else if (!(ctx->int_flags & IO_RING_F_TASK_COMPLETE)) { lockdep_assert_held(&ctx->completion_lock); } else if (ctx->submitter_task) { /* @@ -147,9 +239,9 @@ static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) #endif } -static inline void io_req_task_work_add(struct io_kiocb *req) +static inline bool io_is_compat(struct io_ring_ctx *ctx) { - __io_req_task_work_add(req, 0); + return IS_ENABLED(CONFIG_COMPAT) && unlikely(ctx->int_flags & IO_RING_F_COMPAT); } static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) @@ -164,43 +256,57 @@ static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) static inline bool io_get_cqe_overflow(struct io_ring_ctx *ctx, struct io_uring_cqe **ret, - bool overflow) + bool overflow, bool cqe32) { io_lockdep_assert_cq_locked(ctx); - if (unlikely(ctx->cqe_cached >= ctx->cqe_sentinel)) { - if (unlikely(!io_cqe_cache_refill(ctx, overflow))) + if (unlikely(ctx->cqe_sentinel - ctx->cqe_cached < (cqe32 + 1))) { + if (unlikely(!io_cqe_cache_refill(ctx, overflow, cqe32))) return false; } *ret = ctx->cqe_cached; ctx->cached_cq_tail++; ctx->cqe_cached++; - if (ctx->flags & IORING_SETUP_CQE32) + if (ctx->flags & IORING_SETUP_CQE32) { + ctx->cqe_cached++; + } else if (cqe32 && ctx->flags & IORING_SETUP_CQE_MIXED) { ctx->cqe_cached++; + ctx->cached_cq_tail++; + } + WARN_ON_ONCE(ctx->cqe_cached > ctx->cqe_sentinel); return true; } -static inline bool io_get_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **ret) +static inline bool io_get_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **ret, + bool cqe32) { - return io_get_cqe_overflow(ctx, ret, false); + return io_get_cqe_overflow(ctx, ret, false, cqe32); +} + +static inline bool io_defer_get_uncommited_cqe(struct io_ring_ctx *ctx, + struct io_uring_cqe **cqe_ret) +{ + io_lockdep_assert_cq_locked(ctx); + + ctx->submit_state.cq_flush = true; + return io_get_cqe(ctx, cqe_ret, ctx->flags & IORING_SETUP_CQE_MIXED); } static __always_inline bool io_fill_cqe_req(struct io_ring_ctx *ctx, struct io_kiocb *req) { + bool is_cqe32 = req->cqe.flags & IORING_CQE_F_32; struct io_uring_cqe *cqe; /* - * If we can't get a cq entry, userspace overflowed the - * submission (by quite a lot). Increment the overflow count in - * the ring. + * If we can't get a cq entry, userspace overflowed the submission + * (by quite a lot). */ - if (unlikely(!io_get_cqe(ctx, &cqe))) + if (unlikely(!io_get_cqe(ctx, &cqe, is_cqe32))) return false; - memcpy(cqe, &req->cqe, sizeof(*cqe)); - if (ctx->flags & IORING_SETUP_CQE32) { + if (ctx->flags & IORING_SETUP_CQE32 || is_cqe32) { memcpy(cqe->big_cqe, &req->big_cqe, sizeof(*cqe)); memset(&req->big_cqe, 0, sizeof(req->big_cqe)); } @@ -225,6 +331,22 @@ static inline void io_req_set_res(struct io_kiocb *req, s32 res, u32 cflags) req->cqe.flags = cflags; } +static inline u32 ctx_cqe32_flags(struct io_ring_ctx *ctx) +{ + if (ctx->flags & IORING_SETUP_CQE_MIXED) + return IORING_CQE_F_32; + return 0; +} + +static inline void io_req_set_res32(struct io_kiocb *req, s32 res, u32 cflags, + __u64 extra1, __u64 extra2) +{ + req->cqe.res = res; + req->cqe.flags = cflags | ctx_cqe32_flags(req->ctx); + req->big_cqe.extra1 = extra1; + req->big_cqe.extra2 = extra2; +} + static inline void *io_uring_alloc_async_data(struct io_alloc_cache *cache, struct io_kiocb *req) { @@ -246,6 +368,19 @@ static inline bool req_has_async_data(struct io_kiocb *req) return req->flags & REQ_F_ASYNC_DATA; } +static inline void io_req_async_data_clear(struct io_kiocb *req, + io_req_flags_t extra_flags) +{ + req->flags &= ~(REQ_F_ASYNC_DATA|extra_flags); + req->async_data = NULL; +} + +static inline void io_req_async_data_free(struct io_kiocb *req) +{ + kfree(req->async_data); + io_req_async_data_clear(req, 0); +} + static inline void io_put_file(struct io_kiocb *req) { if (!(req->flags & REQ_F_FIXED_FILE) && req->file) @@ -280,11 +415,22 @@ static inline void io_commit_cqring(struct io_ring_ctx *ctx) smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail); } +static inline void __io_wq_wake(struct wait_queue_head *wq) +{ + /* + * + * Pass in EPOLLIN|EPOLL_URING_WAKE as the poll wakeup key. The latter + * set in the mask so that if we recurse back into our own poll + * waitqueue handlers, we know we have a dependency between eventfd or + * epoll and should terminate multishot poll at that point. + */ + if (wq_has_sleeper(wq)) + __wake_up(wq, TASK_NORMAL, 0, poll_to_key(EPOLL_URING_WAKE | EPOLLIN)); +} + static inline void io_poll_wq_wake(struct io_ring_ctx *ctx) { - if (wq_has_sleeper(&ctx->poll_wq)) - __wake_up(&ctx->poll_wq, TASK_NORMAL, 0, - poll_to_key(EPOLL_URING_WAKE | EPOLLIN)); + __io_wq_wake(&ctx->poll_wq); } static inline void io_cqring_wake(struct io_ring_ctx *ctx) @@ -293,20 +439,14 @@ static inline void io_cqring_wake(struct io_ring_ctx *ctx) * Trigger waitqueue handler on all waiters on our waitqueue. This * won't necessarily wake up all the tasks, io_should_wake() will make * that decision. - * - * Pass in EPOLLIN|EPOLL_URING_WAKE as the poll wakeup key. The latter - * set in the mask so that if we recurse back into our own poll - * waitqueue handlers, we know we have a dependency between eventfd or - * epoll and should terminate multishot poll at that point. */ - if (wq_has_sleeper(&ctx->cq_wait)) - __wake_up(&ctx->cq_wait, TASK_NORMAL, 0, - poll_to_key(EPOLL_URING_WAKE | EPOLLIN)); + + __io_wq_wake(&ctx->cq_wait); } -static inline bool io_sqring_full(struct io_ring_ctx *ctx) +static inline bool __io_sqring_full(struct io_ring_ctx *ctx) { - struct io_rings *r = ctx->rings; + struct io_rings *r = io_get_rings(ctx); /* * SQPOLL must use the actual sqring head, as using the cached_sq_head @@ -318,9 +458,15 @@ static inline bool io_sqring_full(struct io_ring_ctx *ctx) return READ_ONCE(r->sq.tail) - READ_ONCE(r->sq.head) == ctx->sq_entries; } -static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) +static inline bool io_sqring_full(struct io_ring_ctx *ctx) +{ + guard(rcu)(); + return __io_sqring_full(ctx); +} + +static inline unsigned int __io_sqring_entries(struct io_ring_ctx *ctx) { - struct io_rings *rings = ctx->rings; + struct io_rings *rings = io_get_rings(ctx); unsigned int entries; /* make sure SQ entry isn't read before tail */ @@ -328,57 +474,10 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) return min(entries, ctx->sq_entries); } -static inline int io_run_task_work(void) -{ - bool ret = false; - - /* - * Always check-and-clear the task_work notification signal. With how - * signaling works for task_work, we can find it set with nothing to - * run. We need to clear it for that case, like get_signal() does. - */ - if (test_thread_flag(TIF_NOTIFY_SIGNAL)) - clear_notify_signal(); - /* - * PF_IO_WORKER never returns to userspace, so check here if we have - * notify work that needs processing. - */ - if (current->flags & PF_IO_WORKER) { - if (test_thread_flag(TIF_NOTIFY_RESUME)) { - __set_current_state(TASK_RUNNING); - resume_user_mode_work(NULL); - } - if (current->io_uring) { - unsigned int count = 0; - - __set_current_state(TASK_RUNNING); - tctx_task_work_run(current->io_uring, UINT_MAX, &count); - if (count) - ret = true; - } - } - if (task_work_pending(current)) { - __set_current_state(TASK_RUNNING); - task_work_run(); - ret = true; - } - - return ret; -} - -static inline bool io_local_work_pending(struct io_ring_ctx *ctx) -{ - return !llist_empty(&ctx->work_llist) || !llist_empty(&ctx->retry_llist); -} - -static inline bool io_task_work_pending(struct io_ring_ctx *ctx) -{ - return task_work_pending(current) || io_local_work_pending(ctx); -} - -static inline void io_tw_lock(struct io_ring_ctx *ctx, struct io_tw_state *ts) +static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) { - lockdep_assert_held(&ctx->uring_lock); + guard(rcu)(); + return __io_sqring_entries(ctx); } /* @@ -396,10 +495,12 @@ static inline void io_req_complete_defer(struct io_kiocb *req) wq_list_add_tail(&req->comp_list, &state->compl_reqs); } +#define SHOULD_FLUSH_MASK (IO_RING_F_OFF_TIMEOUT_USED | \ + IO_RING_F_HAS_EVFD | IO_RING_F_POLL_ACTIVATED) + static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx) { - if (unlikely(ctx->off_timeout_used || ctx->drain_active || - ctx->has_evfd || ctx->poll_activated)) + if (unlikely(data_race(ctx->int_flags) & SHOULD_FLUSH_MASK)) __io_commit_cqring_flush(ctx); } @@ -418,7 +519,6 @@ static inline bool io_req_cache_empty(struct io_ring_ctx *ctx) } extern struct kmem_cache *req_cachep; -extern struct kmem_cache *io_buf_cachep; static inline struct io_kiocb *io_extract_req(struct io_ring_ctx *ctx) { @@ -439,30 +539,6 @@ static inline bool io_alloc_req(struct io_ring_ctx *ctx, struct io_kiocb **req) return true; } -static inline bool io_allowed_defer_tw_run(struct io_ring_ctx *ctx) -{ - return likely(ctx->submitter_task == current); -} - -static inline bool io_allowed_run_tw(struct io_ring_ctx *ctx) -{ - return likely(!(ctx->flags & IORING_SETUP_DEFER_TASKRUN) || - ctx->submitter_task == current); -} - -/* - * Terminate the request if either of these conditions are true: - * - * 1) It's being executed by the original task, but that task is marked - * with PF_EXITING as it's exiting. - * 2) PF_KTHREAD is set, in which case the invoker of the task_work is - * our fallback task_work. - */ -static inline bool io_should_terminate_tw(void) -{ - return current->flags & (PF_KTHREAD | PF_EXITING); -} - static inline void io_req_queue_tw_complete(struct io_kiocb *req, s32 res) { io_req_set_res(req, res, 0); @@ -470,17 +546,6 @@ static inline void io_req_queue_tw_complete(struct io_kiocb *req, s32 res) io_req_task_work_add(req); } -/* - * IORING_SETUP_SQE128 contexts allocate twice the normal SQE size for each - * slot. - */ -static inline size_t uring_sqe_size(struct io_ring_ctx *ctx) -{ - if (ctx->flags & IORING_SETUP_SQE128) - return 2 * sizeof(struct io_uring_sqe); - return sizeof(struct io_uring_sqe); -} - static inline bool io_file_can_poll(struct io_kiocb *req) { if (req->flags & REQ_F_CAN_POLL) @@ -492,6 +557,12 @@ static inline bool io_file_can_poll(struct io_kiocb *req) return false; } +static inline bool io_is_uring_cmd(const struct io_kiocb *req) +{ + return req->opcode == IORING_OP_URING_CMD || + req->opcode == IORING_OP_URING_CMD128; +} + static inline ktime_t io_get_time(struct io_ring_ctx *ctx) { if (ctx->clockid == CLOCK_MONOTONIC) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 8e72de7712ac..63061aa1cab9 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -20,7 +20,8 @@ /* BIDs are addressed by a 16-bit field in a CQE */ #define MAX_BIDS_PER_BGID (1 << 16) -struct kmem_cache *io_buf_cachep; +/* Mapped buffer ring, return io_uring_buf from head */ +#define io_ring_head_to_buf(br, head, mask) &(br)->bufs[(head) & (mask)] struct io_provide_buf { struct file *file; @@ -31,6 +32,49 @@ struct io_provide_buf { __u16 bid; }; +static bool io_kbuf_inc_commit(struct io_buffer_list *bl, int len) +{ + /* No data consumed, return false early to avoid consuming the buffer */ + if (!len) + return false; + + while (len) { + struct io_uring_buf *buf; + u32 buf_len, this_len; + + buf = io_ring_head_to_buf(bl->buf_ring, bl->head, bl->mask); + buf_len = READ_ONCE(buf->len); + this_len = min_t(u32, len, buf_len); + buf_len -= this_len; + /* Stop looping for invalid buffer length of 0 */ + if (buf_len > bl->min_left_sub_one || !this_len) { + WRITE_ONCE(buf->addr, READ_ONCE(buf->addr) + this_len); + WRITE_ONCE(buf->len, buf_len); + return false; + } + WRITE_ONCE(buf->len, 0); + bl->head++; + len -= this_len; + } + return true; +} + +bool io_kbuf_commit(struct io_kiocb *req, + struct io_buffer_list *bl, int len, int nr) +{ + if (unlikely(!(req->flags & REQ_F_BUFFERS_COMMIT))) + return true; + + req->flags &= ~REQ_F_BUFFERS_COMMIT; + + if (unlikely(len < 0)) + return true; + if (bl->flags & IOBL_INC) + return io_kbuf_inc_commit(bl, len); + bl->head += nr; + return true; +} + static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, unsigned int bgid) { @@ -52,6 +96,15 @@ static int io_buffer_add_list(struct io_ring_ctx *ctx, return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL)); } +void io_kbuf_drop_legacy(struct io_kiocb *req) +{ + if (WARN_ON_ONCE(!(req->flags & REQ_F_BUFFER_SELECTED))) + return; + req->flags &= ~REQ_F_BUFFER_SELECTED; + kfree(req->kbuf); + req->kbuf = NULL; +} + bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; @@ -62,39 +115,21 @@ bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags) buf = req->kbuf; bl = io_buffer_get_list(ctx, buf->bgid); - list_add(&buf->list, &bl->buf_list); - req->flags &= ~REQ_F_BUFFER_SELECTED; - req->buf_index = buf->bgid; - - io_ring_submit_unlock(ctx, issue_flags); - return true; -} - -void __io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags) -{ /* - * We can add this buffer back to two lists: - * - * 1) The io_buffers_cache list. This one is protected by the - * ctx->uring_lock. If we already hold this lock, add back to this - * list as we can grab it from issue as well. - * 2) The io_buffers_comp list. This one is protected by the - * ctx->completion_lock. - * - * We migrate buffers from the comp_list to the issue cache list - * when we need one. + * If the buffer list was upgraded to a ring-based one, or removed, + * while the request was in-flight in io-wq, drop it. */ - if (issue_flags & IO_URING_F_UNLOCKED) { - struct io_ring_ctx *ctx = req->ctx; - - spin_lock(&ctx->completion_lock); - __io_put_kbuf_list(req, len, &ctx->io_buffers_comp); - spin_unlock(&ctx->completion_lock); + if (bl && !(bl->flags & IOBL_BUF_RING)) { + list_add(&buf->list, &bl->buf_list); + bl->nbufs++; } else { - lockdep_assert_held(&req->ctx->uring_lock); - - __io_put_kbuf_list(req, len, &req->ctx->io_buffers_cache); + kfree(buf); } + req->flags &= ~REQ_F_BUFFER_SELECTED; + req->kbuf = NULL; + + io_ring_submit_unlock(ctx, issue_flags); + return true; } static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, @@ -105,6 +140,7 @@ static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list); list_del(&kbuf->list); + bl->nbufs--; if (*len == 0 || *len > kbuf->len) *len = kbuf->len; if (list_empty(&bl->buf_list)) @@ -132,65 +168,79 @@ static int io_provided_buffers_select(struct io_kiocb *req, size_t *len, return 1; } -static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, - struct io_buffer_list *bl, - unsigned int issue_flags) +static bool io_should_commit(struct io_kiocb *req, unsigned int issue_flags) +{ + /* + * If we came in unlocked, we have no choice but to consume the + * buffer here, otherwise nothing ensures that the buffer won't + * get used by others. This does mean it'll be pinned until the + * IO completes, coming in unlocked means we're being called from + * io-wq context and there may be further retries in async hybrid + * mode. For the locked case, the caller must call commit when + * the transfer completes (or if we get -EAGAIN and must poll of + * retry). + */ + if (issue_flags & IO_URING_F_UNLOCKED) + return true; + + /* uring_cmd commits kbuf upfront, no need to auto-commit */ + if (!io_file_can_poll(req) && !io_is_uring_cmd(req)) + return true; + return false; +} + +static struct io_br_sel io_ring_buffer_select(struct io_kiocb *req, size_t *len, + struct io_buffer_list *bl, + unsigned int issue_flags) { struct io_uring_buf_ring *br = bl->buf_ring; __u16 tail, head = bl->head; + struct io_br_sel sel = { }; struct io_uring_buf *buf; - void __user *ret; + u32 buf_len; tail = smp_load_acquire(&br->tail); if (unlikely(tail == head)) - return NULL; + return sel; if (head + 1 == tail) req->flags |= REQ_F_BL_EMPTY; buf = io_ring_head_to_buf(br, head, bl->mask); - if (*len == 0 || *len > buf->len) - *len = buf->len; + buf_len = READ_ONCE(buf->len); + if (*len == 0 || *len > buf_len) + *len = buf_len; req->flags |= REQ_F_BUFFER_RING | REQ_F_BUFFERS_COMMIT; - req->buf_list = bl; - req->buf_index = buf->bid; - ret = u64_to_user_ptr(buf->addr); - - if (issue_flags & IO_URING_F_UNLOCKED || !io_file_can_poll(req)) { - /* - * If we came in unlocked, we have no choice but to consume the - * buffer here, otherwise nothing ensures that the buffer won't - * get used by others. This does mean it'll be pinned until the - * IO completes, coming in unlocked means we're being called from - * io-wq context and there may be further retries in async hybrid - * mode. For the locked case, the caller must call commit when - * the transfer completes (or if we get -EAGAIN and must poll of - * retry). - */ - io_kbuf_commit(req, bl, *len, 1); - req->buf_list = NULL; + req->buf_index = READ_ONCE(buf->bid); + sel.buf_list = bl; + sel.addr = u64_to_user_ptr(READ_ONCE(buf->addr)); + + if (io_should_commit(req, issue_flags)) { + if (!io_kbuf_commit(req, sel.buf_list, *len, 1)) + req->flags |= REQ_F_BUF_MORE; + sel.buf_list = NULL; } - return ret; + return sel; } -void __user *io_buffer_select(struct io_kiocb *req, size_t *len, - unsigned int issue_flags) +struct io_br_sel io_buffer_select(struct io_kiocb *req, size_t *len, + unsigned buf_group, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; + struct io_br_sel sel = { }; struct io_buffer_list *bl; - void __user *ret = NULL; - io_ring_submit_lock(req->ctx, issue_flags); + io_ring_submit_lock(ctx, issue_flags); - bl = io_buffer_get_list(ctx, req->buf_index); + bl = io_buffer_get_list(ctx, buf_group); if (likely(bl)) { if (bl->flags & IOBL_BUF_RING) - ret = io_ring_buffer_select(req, len, bl, issue_flags); + sel = io_ring_buffer_select(req, len, bl, issue_flags); else - ret = io_provided_buffer_select(req, len, bl); + sel.addr = io_provided_buffer_select(req, len, bl); } - io_ring_submit_unlock(req->ctx, issue_flags); - return ret; + io_ring_submit_unlock(ctx, issue_flags); + return sel; } /* cap it at a reasonable 256, will be one page even for 4K */ @@ -214,25 +264,14 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg, buf = io_ring_head_to_buf(br, head, bl->mask); if (arg->max_len) { u32 len = READ_ONCE(buf->len); + size_t needed; if (unlikely(!len)) return -ENOBUFS; - /* - * Limit incremental buffers to 1 segment. No point trying - * to peek ahead and map more than we need, when the buffers - * themselves should be large when setup with - * IOU_PBUF_RING_INC. - */ - if (bl->flags & IOBL_INC) { - nr_avail = 1; - } else { - size_t needed; - - needed = (arg->max_len + len - 1) / len; - needed = min_not_zero(needed, (size_t) PEEK_MAX_IMPORT); - if (nr_avail > needed) - nr_avail = needed; - } + needed = (arg->max_len + len - 1) / len; + needed = min_not_zero(needed, (size_t) PEEK_MAX_IMPORT); + if (nr_avail > needed) + nr_avail = needed; } /* @@ -240,7 +279,7 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg, * a speculative peek operation. */ if (arg->mode & KBUF_MODE_EXPAND && nr_avail > nr_iovs && arg->max_len) { - iov = kmalloc_array(nr_avail, sizeof(struct iovec), GFP_KERNEL); + iov = kmalloc_objs(struct iovec, nr_avail); if (unlikely(!iov)) return -ENOMEM; if (arg->mode & KBUF_MODE_FREE) @@ -255,18 +294,22 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg, if (!arg->max_len) arg->max_len = INT_MAX; - req->buf_index = buf->bid; + req->buf_index = READ_ONCE(buf->bid); do { - u32 len = buf->len; + u32 len = READ_ONCE(buf->len); /* truncate end piece, if needed, for non partial buffers */ if (len > arg->max_len) { len = arg->max_len; - if (!(bl->flags & IOBL_INC)) - buf->len = len; + if (!(bl->flags & IOBL_INC)) { + arg->partial_map = 1; + if (iov != arg->iovs) + break; + WRITE_ONCE(buf->len, len); + } } - iov->iov_base = u64_to_user_ptr(buf->addr); + iov->iov_base = u64_to_user_ptr(READ_ONCE(buf->addr)); iov->iov_len = len; iov++; @@ -282,24 +325,22 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg, req->flags |= REQ_F_BL_EMPTY; req->flags |= REQ_F_BUFFER_RING; - req->buf_list = bl; return iov - arg->iovs; } int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg, - unsigned int issue_flags) + struct io_br_sel *sel, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; - struct io_buffer_list *bl; int ret = -ENOENT; io_ring_submit_lock(ctx, issue_flags); - bl = io_buffer_get_list(ctx, req->buf_index); - if (unlikely(!bl)) + sel->buf_list = io_buffer_get_list(ctx, arg->buf_group); + if (unlikely(!sel->buf_list)) goto out_unlock; - if (bl->flags & IOBL_BUF_RING) { - ret = io_ring_buffers_peek(req, arg, bl); + if (sel->buf_list->flags & IOBL_BUF_RING) { + ret = io_ring_buffers_peek(req, arg, sel->buf_list); /* * Don't recycle these buffers if we need to go through poll. * Nobody else can use them anyway, and holding on to provided @@ -309,17 +350,22 @@ int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg, */ if (ret > 0) { req->flags |= REQ_F_BUFFERS_COMMIT | REQ_F_BL_NO_RECYCLE; - io_kbuf_commit(req, bl, arg->out_len, ret); + if (!io_kbuf_commit(req, sel->buf_list, arg->out_len, ret)) + req->flags |= REQ_F_BUF_MORE; } } else { - ret = io_provided_buffers_select(req, &arg->out_len, bl, arg->iovs); + ret = io_provided_buffers_select(req, &arg->out_len, sel->buf_list, arg->iovs); } out_unlock: - io_ring_submit_unlock(ctx, issue_flags); + if (issue_flags & IO_URING_F_UNLOCKED) { + sel->buf_list = NULL; + mutex_unlock(&ctx->uring_lock); + } return ret; } -int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg) +int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg, + struct io_br_sel *sel) { struct io_ring_ctx *ctx = req->ctx; struct io_buffer_list *bl; @@ -327,7 +373,7 @@ int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg) lockdep_assert_held(&ctx->uring_lock); - bl = io_buffer_get_list(ctx, req->buf_index); + bl = io_buffer_get_list(ctx, arg->buf_group); if (unlikely(!bl)) return -ENOENT; @@ -335,58 +381,80 @@ int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg) ret = io_ring_buffers_peek(req, arg, bl); if (ret > 0) req->flags |= REQ_F_BUFFERS_COMMIT; + sel->buf_list = bl; return ret; } /* don't support multiple buffer selections for legacy */ + sel->buf_list = NULL; return io_provided_buffers_select(req, &arg->max_len, bl, arg->iovs); } -static int __io_remove_buffers(struct io_ring_ctx *ctx, - struct io_buffer_list *bl, unsigned nbufs) +static inline bool __io_put_kbuf_ring(struct io_kiocb *req, + struct io_buffer_list *bl, int len, int nr) { - unsigned i = 0; + bool ret = true; - /* shouldn't happen */ - if (!nbufs) - return 0; + if (bl) + ret = io_kbuf_commit(req, bl, len, nr); + if (ret && (req->flags & REQ_F_BUF_MORE)) + ret = false; - if (bl->flags & IOBL_BUF_RING) { - i = bl->buf_ring->tail - bl->head; - io_free_region(ctx, &bl->region); - /* make sure it's seen as empty */ - INIT_LIST_HEAD(&bl->buf_list); - bl->flags &= ~IOBL_BUF_RING; - return i; + req->flags &= ~(REQ_F_BUFFER_RING | REQ_F_BUF_MORE); + return ret; +} + +unsigned int __io_put_kbufs(struct io_kiocb *req, struct io_buffer_list *bl, + int len, int nbufs) +{ + unsigned int ret; + + ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); + + if (unlikely(!(req->flags & REQ_F_BUFFER_RING))) { + io_kbuf_drop_legacy(req); + return ret; } + if (!__io_put_kbuf_ring(req, bl, len, nbufs)) + ret |= IORING_CQE_F_BUF_MORE; + return ret; +} + +static int io_remove_buffers_legacy(struct io_ring_ctx *ctx, + struct io_buffer_list *bl, + unsigned long nbufs) +{ + unsigned long i = 0; + struct io_buffer *nxt; + /* protects io_buffers_cache */ lockdep_assert_held(&ctx->uring_lock); + WARN_ON_ONCE(bl->flags & IOBL_BUF_RING); - while (!list_empty(&bl->buf_list)) { - struct io_buffer *nxt; - + for (i = 0; i < nbufs && !list_empty(&bl->buf_list); i++) { nxt = list_first_entry(&bl->buf_list, struct io_buffer, list); - list_move(&nxt->list, &ctx->io_buffers_cache); - if (++i == nbufs) - return i; + list_del(&nxt->list); + bl->nbufs--; + kfree(nxt); cond_resched(); } - return i; } static void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl) { - __io_remove_buffers(ctx, bl, -1U); + if (bl->flags & IOBL_BUF_RING) + io_free_region(ctx->user, &bl->region); + else + io_remove_buffers_legacy(ctx, bl, -1U); + kfree(bl); } void io_destroy_buffers(struct io_ring_ctx *ctx) { struct io_buffer_list *bl; - struct list_head *item, *tmp; - struct io_buffer *buf; while (1) { unsigned long index = 0; @@ -400,19 +468,6 @@ void io_destroy_buffers(struct io_ring_ctx *ctx) break; io_put_bl(ctx, bl); } - - /* - * Move deferred locked entries to cache before pruning - */ - spin_lock(&ctx->completion_lock); - if (!list_empty(&ctx->io_buffers_comp)) - list_splice_init(&ctx->io_buffers_comp, &ctx->io_buffers_cache); - spin_unlock(&ctx->completion_lock); - - list_for_each_safe(item, tmp, &ctx->io_buffers_cache) { - buf = list_entry(item, struct io_buffer, list); - kmem_cache_free(io_buf_cachep, buf); - } } static void io_destroy_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl) @@ -441,30 +496,6 @@ int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf); - struct io_ring_ctx *ctx = req->ctx; - struct io_buffer_list *bl; - int ret = 0; - - io_ring_submit_lock(ctx, issue_flags); - - ret = -ENOENT; - bl = io_buffer_get_list(ctx, p->bgid); - if (bl) { - ret = -EINVAL; - /* can't use provide/remove buffers command on mapped buffers */ - if (!(bl->flags & IOBL_BUF_RING)) - ret = __io_remove_buffers(ctx, bl, p->nbufs); - } - io_ring_submit_unlock(ctx, issue_flags); - if (ret < 0) - req_set_fail(req); - io_req_set_res(req, ret, 0); - return IOU_OK; -} - int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { unsigned long size, tmp_check; @@ -480,14 +511,14 @@ int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe p->nbufs = tmp; p->addr = READ_ONCE(sqe->addr); p->len = READ_ONCE(sqe->len); + if (!p->len) + return -EINVAL; if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs, &size)) return -EOVERFLOW; if (check_add_overflow((unsigned long)p->addr, size, &tmp_check)) return -EOVERFLOW; - - size = (unsigned long)p->len * p->nbufs; if (!access_ok(u64_to_user_ptr(p->addr), size)) return -EFAULT; @@ -501,67 +532,29 @@ int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe return 0; } -#define IO_BUFFER_ALLOC_BATCH 64 - -static int io_refill_buffer_cache(struct io_ring_ctx *ctx) -{ - struct io_buffer *bufs[IO_BUFFER_ALLOC_BATCH]; - int allocated; - - /* - * Completions that don't happen inline (eg not under uring_lock) will - * add to ->io_buffers_comp. If we don't have any free buffers, check - * the completion list and splice those entries first. - */ - if (!list_empty_careful(&ctx->io_buffers_comp)) { - spin_lock(&ctx->completion_lock); - if (!list_empty(&ctx->io_buffers_comp)) { - list_splice_init(&ctx->io_buffers_comp, - &ctx->io_buffers_cache); - spin_unlock(&ctx->completion_lock); - return 0; - } - spin_unlock(&ctx->completion_lock); - } - - /* - * No free buffers and no completion entries either. Allocate a new - * batch of buffer entries and add those to our freelist. - */ - - allocated = kmem_cache_alloc_bulk(io_buf_cachep, GFP_KERNEL_ACCOUNT, - ARRAY_SIZE(bufs), (void **) bufs); - if (unlikely(!allocated)) { - /* - * Bulk alloc is all-or-nothing. If we fail to get a batch, - * retry single alloc to be on the safe side. - */ - bufs[0] = kmem_cache_alloc(io_buf_cachep, GFP_KERNEL); - if (!bufs[0]) - return -ENOMEM; - allocated = 1; - } - - while (allocated) - list_add_tail(&bufs[--allocated]->list, &ctx->io_buffers_cache); - - return 0; -} - static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf, struct io_buffer_list *bl) { struct io_buffer *buf; u64 addr = pbuf->addr; - int i, bid = pbuf->bid; + int ret = -ENOMEM, i, bid = pbuf->bid; for (i = 0; i < pbuf->nbufs; i++) { - if (list_empty(&ctx->io_buffers_cache) && - io_refill_buffer_cache(ctx)) + /* + * Nonsensical to have more than sizeof(bid) buffers in a + * buffer list, as the application then has no way of knowing + * which duplicate bid refers to what buffer. + */ + if (bl->nbufs == USHRT_MAX) { + ret = -EOVERFLOW; + break; + } + buf = kmalloc_obj(*buf, GFP_KERNEL_ACCOUNT); + if (!buf) break; - buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer, - list); - list_move_tail(&buf->list, &bl->buf_list); + + list_add_tail(&buf->list, &bl->buf_list); + bl->nbufs++; buf->addr = addr; buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT); buf->bid = bid; @@ -571,52 +564,59 @@ static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf, cond_resched(); } - return i ? 0 : -ENOMEM; + return i ? 0 : ret; } -int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) +static int __io_manage_buffers_legacy(struct io_kiocb *req, + struct io_buffer_list *bl) { struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf); - struct io_ring_ctx *ctx = req->ctx; - struct io_buffer_list *bl; - int ret = 0; + int ret; - io_ring_submit_lock(ctx, issue_flags); + if (!bl) { + if (req->opcode != IORING_OP_PROVIDE_BUFFERS) + return -ENOENT; + bl = kzalloc_obj(*bl, GFP_KERNEL_ACCOUNT); + if (!bl) + return -ENOMEM; - bl = io_buffer_get_list(ctx, p->bgid); - if (unlikely(!bl)) { - bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT); - if (!bl) { - ret = -ENOMEM; - goto err; - } INIT_LIST_HEAD(&bl->buf_list); - ret = io_buffer_add_list(ctx, bl, p->bgid); + ret = io_buffer_add_list(req->ctx, bl, p->bgid); if (ret) { kfree(bl); - goto err; + return ret; } } - /* can't add buffers via this command for a mapped buffer ring */ - if (bl->flags & IOBL_BUF_RING) { - ret = -EINVAL; - goto err; - } + /* can't use provide/remove buffers command on mapped buffers */ + if (bl->flags & IOBL_BUF_RING) + return -EINVAL; + if (req->opcode == IORING_OP_PROVIDE_BUFFERS) + return io_add_buffers(req->ctx, p, bl); + return io_remove_buffers_legacy(req->ctx, bl, p->nbufs); +} - ret = io_add_buffers(ctx, p, bl); -err: +int io_manage_buffers_legacy(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf); + struct io_ring_ctx *ctx = req->ctx; + struct io_buffer_list *bl; + int ret; + + io_ring_submit_lock(ctx, issue_flags); + bl = io_buffer_get_list(ctx, p->bgid); + ret = __io_manage_buffers_legacy(req, bl); io_ring_submit_unlock(ctx, issue_flags); if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) { struct io_uring_buf_reg reg; - struct io_buffer_list *bl, *free_bl = NULL; + struct io_buffer_list *bl; struct io_uring_region_desc rd; struct io_uring_buf_ring *br; unsigned long mmap_offset; @@ -627,8 +627,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (copy_from_user(®, arg, sizeof(reg))) return -EFAULT; - - if (reg.resv[0] || reg.resv[1] || reg.resv[2]) + if (!mem_is_zero(reg.resv, sizeof(reg.resv))) return -EINVAL; if (reg.flags & ~(IOU_PBUF_RING_MMAP | IOU_PBUF_RING_INC)) return -EINVAL; @@ -638,6 +637,10 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (reg.ring_entries >= 65536) return -EINVAL; + /* minimum left byte count is a property of incremental buffers */ + if (!(reg.flags & IOU_PBUF_RING_INC) && reg.min_left) + return -EINVAL; + bl = io_buffer_get_list(ctx, reg.bgid); if (bl) { /* if mapped buffer ring OR classic exists, don't allow */ @@ -646,7 +649,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) io_destroy_bl(ctx, bl); } - free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL); + bl = kzalloc_obj(*bl, GFP_KERNEL_ACCOUNT); if (!bl) return -ENOMEM; @@ -659,7 +662,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) rd.user_addr = reg.ring_addr; rd.flags |= IORING_MEM_REGION_TYPE_USER; } - ret = io_create_region_mmap_safe(ctx, &bl->region, &rd, mmap_offset); + ret = io_create_region(ctx, &bl->region, &rd, mmap_offset); if (ret) goto fail; br = io_region_get_ptr(&bl->region); @@ -681,17 +684,19 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) } #endif - bl->nr_entries = reg.ring_entries; bl->mask = reg.ring_entries - 1; bl->flags |= IOBL_BUF_RING; bl->buf_ring = br; + if (reg.min_left) + bl->min_left_sub_one = reg.min_left - 1; if (reg.flags & IOU_PBUF_RING_INC) bl->flags |= IOBL_INC; - io_buffer_add_list(ctx, bl, reg.bgid); - return 0; + ret = io_buffer_add_list(ctx, bl, reg.bgid); + if (!ret) + return 0; fail: - io_free_region(ctx, &bl->region); - kfree(free_bl); + io_free_region(ctx->user, &bl->region); + kfree(bl); return ret; } @@ -704,9 +709,7 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (copy_from_user(®, arg, sizeof(reg))) return -EFAULT; - if (reg.resv[0] || reg.resv[1] || reg.resv[2]) - return -EINVAL; - if (reg.flags) + if (!mem_is_zero(reg.resv, sizeof(reg.resv)) || reg.flags) return -EINVAL; bl = io_buffer_get_list(ctx, reg.bgid); @@ -726,14 +729,11 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg) { struct io_uring_buf_status buf_status; struct io_buffer_list *bl; - int i; if (copy_from_user(&buf_status, arg, sizeof(buf_status))) return -EFAULT; - - for (i = 0; i < ARRAY_SIZE(buf_status.resv); i++) - if (buf_status.resv[i]) - return -EINVAL; + if (!mem_is_zero(buf_status.resv, sizeof(buf_status.resv))) + return -EINVAL; bl = io_buffer_get_list(ctx, buf_status.buf_group); if (!bl) diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index bd80c44c5af1..401773e1ef80 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -14,23 +14,31 @@ enum { struct io_buffer_list { /* - * If ->buf_nr_pages is set, then buf_pages/buf_ring are used. If not, - * then these are classic provided buffers and ->buf_list is used. + * If the IOBL_BUF_RING flag is set, then buf_ring is used. If not, then + * these are classic provided buffers and ->buf_list is used. */ union { struct list_head buf_list; struct io_uring_buf_ring *buf_ring; }; + /* count of classic/legacy buffers in buffer list */ + int nbufs; + __u16 bgid; /* below is for ring provided buffers */ - __u16 buf_nr_pages; - __u16 nr_entries; __u16 head; __u16 mask; __u16 flags; + /* + * minimum required amount to be left to reuse an incrementally + * consumed buffer. If less than this is left at consumption time, + * buffer is done and head is incremented to the next buffer. + */ + __u32 min_left_sub_one; + struct io_mapped_region region; }; @@ -55,43 +63,41 @@ struct buf_sel_arg { size_t max_len; unsigned short nr_iovs; unsigned short mode; + unsigned short buf_group; + unsigned short partial_map; }; -void __user *io_buffer_select(struct io_kiocb *req, size_t *len, - unsigned int issue_flags); +struct io_br_sel io_buffer_select(struct io_kiocb *req, size_t *len, + unsigned buf_group, unsigned int issue_flags); int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg, - unsigned int issue_flags); -int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg); + struct io_br_sel *sel, unsigned int issue_flags); +int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg, + struct io_br_sel *sel); void io_destroy_buffers(struct io_ring_ctx *ctx); int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); -int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags); - int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); -int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags); +int io_manage_buffers_legacy(struct io_kiocb *req, unsigned int issue_flags); int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg); -void __io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags); - bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); +void io_kbuf_drop_legacy(struct io_kiocb *req); + +unsigned int __io_put_kbufs(struct io_kiocb *req, struct io_buffer_list *bl, + int len, int nbufs); +bool io_kbuf_commit(struct io_kiocb *req, + struct io_buffer_list *bl, int len, int nr); struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx, unsigned int bgid); -static inline bool io_kbuf_recycle_ring(struct io_kiocb *req) +static inline bool io_kbuf_recycle_ring(struct io_kiocb *req, + struct io_buffer_list *bl) { - /* - * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear - * the flag and hence ensure that bl->head doesn't get incremented. - * If the tail has already been incremented, hang on to it. - * The exception is partial io, that case we should increment bl->head - * to monopolize the buffer. - */ - if (req->buf_list) { - req->buf_index = req->buf_list->bgid; + if (bl) { req->flags &= ~(REQ_F_BUFFER_RING|REQ_F_BUFFERS_COMMIT); return true; } @@ -105,111 +111,31 @@ static inline bool io_do_buffer_select(struct io_kiocb *req) return !(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)); } -static inline bool io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) +static inline bool io_kbuf_recycle(struct io_kiocb *req, struct io_buffer_list *bl, + unsigned issue_flags) { if (req->flags & REQ_F_BL_NO_RECYCLE) return false; + if (req->flags & REQ_F_BUFFER_RING) + return io_kbuf_recycle_ring(req, bl); if (req->flags & REQ_F_BUFFER_SELECTED) return io_kbuf_recycle_legacy(req, issue_flags); - if (req->flags & REQ_F_BUFFER_RING) - return io_kbuf_recycle_ring(req); return false; } -/* Mapped buffer ring, return io_uring_buf from head */ -#define io_ring_head_to_buf(br, head, mask) &(br)->bufs[(head) & (mask)] - -static inline bool io_kbuf_commit(struct io_kiocb *req, - struct io_buffer_list *bl, int len, int nr) -{ - if (unlikely(!(req->flags & REQ_F_BUFFERS_COMMIT))) - return true; - - req->flags &= ~REQ_F_BUFFERS_COMMIT; - - if (unlikely(len < 0)) - return true; - - if (bl->flags & IOBL_INC) { - struct io_uring_buf *buf; - - buf = io_ring_head_to_buf(bl->buf_ring, bl->head, bl->mask); - if (WARN_ON_ONCE(len > buf->len)) - len = buf->len; - buf->len -= len; - if (buf->len) { - buf->addr += len; - return false; - } - } - - bl->head += nr; - return true; -} - -static inline bool __io_put_kbuf_ring(struct io_kiocb *req, int len, int nr) -{ - struct io_buffer_list *bl = req->buf_list; - bool ret = true; - - if (bl) { - ret = io_kbuf_commit(req, bl, len, nr); - req->buf_index = bl->bgid; - } - req->flags &= ~REQ_F_BUFFER_RING; - return ret; -} - -static inline void __io_put_kbuf_list(struct io_kiocb *req, int len, - struct list_head *list) -{ - if (req->flags & REQ_F_BUFFER_RING) { - __io_put_kbuf_ring(req, len, 1); - } else { - req->buf_index = req->kbuf->bgid; - list_add(&req->kbuf->list, list); - req->flags &= ~REQ_F_BUFFER_SELECTED; - } -} - -static inline void io_kbuf_drop(struct io_kiocb *req) -{ - lockdep_assert_held(&req->ctx->completion_lock); - - if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) - return; - - /* len == 0 is fine here, non-ring will always drop all of it */ - __io_put_kbuf_list(req, 0, &req->ctx->io_buffers_comp); -} - -static inline unsigned int __io_put_kbufs(struct io_kiocb *req, int len, - int nbufs, unsigned issue_flags) +static inline unsigned int io_put_kbuf(struct io_kiocb *req, int len, + struct io_buffer_list *bl) { - unsigned int ret; - if (!(req->flags & (REQ_F_BUFFER_RING | REQ_F_BUFFER_SELECTED))) return 0; - - ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); - if (req->flags & REQ_F_BUFFER_RING) { - if (!__io_put_kbuf_ring(req, len, nbufs)) - ret |= IORING_CQE_F_BUF_MORE; - } else { - __io_put_kbuf(req, len, issue_flags); - } - return ret; -} - -static inline unsigned int io_put_kbuf(struct io_kiocb *req, int len, - unsigned issue_flags) -{ - return __io_put_kbufs(req, len, 1, issue_flags); + return __io_put_kbufs(req, bl, len, 1); } static inline unsigned int io_put_kbufs(struct io_kiocb *req, int len, - int nbufs, unsigned issue_flags) + struct io_buffer_list *bl, int nbufs) { - return __io_put_kbufs(req, len, nbufs, issue_flags); + if (!(req->flags & (REQ_F_BUFFER_RING | REQ_F_BUFFER_SELECTED))) + return 0; + return __io_put_kbufs(req, bl, len, nbufs); } #endif diff --git a/io_uring/loop.c b/io_uring/loop.c new file mode 100644 index 000000000000..31843cc3e451 --- /dev/null +++ b/io_uring/loop.c @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include "io_uring.h" +#include "wait.h" +#include "loop.h" + +static inline int io_loop_nr_cqes(const struct io_ring_ctx *ctx, + const struct iou_loop_params *lp) +{ + return lp->cq_wait_idx - READ_ONCE(ctx->rings->cq.tail); +} + +static inline void io_loop_wait_start(struct io_ring_ctx *ctx, unsigned nr_wait) +{ + atomic_set(&ctx->cq_wait_nr, nr_wait); + set_current_state(TASK_INTERRUPTIBLE); +} + +static inline void io_loop_wait_finish(struct io_ring_ctx *ctx) +{ + __set_current_state(TASK_RUNNING); + atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT); +} + +static void io_loop_wait(struct io_ring_ctx *ctx, struct iou_loop_params *lp, + unsigned nr_wait) +{ + io_loop_wait_start(ctx, nr_wait); + + if (unlikely(io_local_work_pending(ctx) || + io_loop_nr_cqes(ctx, lp) <= 0) || + READ_ONCE(ctx->check_cq)) { + io_loop_wait_finish(ctx); + return; + } + + mutex_unlock(&ctx->uring_lock); + schedule(); + io_loop_wait_finish(ctx); + mutex_lock(&ctx->uring_lock); +} + +static int __io_run_loop(struct io_ring_ctx *ctx) +{ + struct iou_loop_params lp = {}; + + while (true) { + int nr_wait, step_res; + + if (unlikely(!ctx->loop_step)) + return -EFAULT; + + step_res = ctx->loop_step(ctx, &lp); + if (step_res == IOU_LOOP_STOP) + break; + if (step_res != IOU_LOOP_CONTINUE) + return -EINVAL; + + nr_wait = io_loop_nr_cqes(ctx, &lp); + if (nr_wait > 0) + io_loop_wait(ctx, &lp, nr_wait); + else + nr_wait = 0; + + if (task_work_pending(current)) { + mutex_unlock(&ctx->uring_lock); + io_run_task_work(); + mutex_lock(&ctx->uring_lock); + } + if (unlikely(task_sigpending(current))) + return -EINTR; + io_run_local_work_locked(ctx, nr_wait); + + if (READ_ONCE(ctx->check_cq) & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) + io_cqring_overflow_flush_locked(ctx); + } + + return 0; +} + +int io_run_loop(struct io_ring_ctx *ctx) +{ + int ret; + + if (!io_allowed_run_tw(ctx)) + return -EEXIST; + + mutex_lock(&ctx->uring_lock); + ret = __io_run_loop(ctx); + mutex_unlock(&ctx->uring_lock); + return ret; +} diff --git a/io_uring/loop.h b/io_uring/loop.h new file mode 100644 index 000000000000..d7718b9ce61e --- /dev/null +++ b/io_uring/loop.h @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef IOU_LOOP_H +#define IOU_LOOP_H + +#include <linux/io_uring_types.h> + +struct iou_loop_params { + /* + * The CQE index to wait for. Only serves as a hint and can still be + * woken up earlier. + */ + __u32 cq_wait_idx; +}; + +enum { + IOU_LOOP_CONTINUE = 0, + IOU_LOOP_STOP, +}; + +static inline bool io_has_loop_ops(struct io_ring_ctx *ctx) +{ + return data_race(ctx->loop_step); +} + +int io_run_loop(struct io_ring_ctx *ctx); + +#endif diff --git a/io_uring/memmap.c b/io_uring/memmap.c index 361134544427..4f9b439319c4 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -13,27 +13,28 @@ #include "memmap.h" #include "kbuf.h" #include "rsrc.h" +#include "zcrx.h" -static void *io_mem_alloc_compound(struct page **pages, int nr_pages, - size_t size, gfp_t gfp) +static bool io_mem_alloc_compound(struct page **pages, int nr_pages, + size_t size, gfp_t gfp) { struct page *page; int i, order; order = get_order(size); if (order > MAX_PAGE_ORDER) - return ERR_PTR(-ENOMEM); + return false; else if (order) gfp |= __GFP_COMP; page = alloc_pages(gfp, order); if (!page) - return ERR_PTR(-ENOMEM); + return false; for (i = 0; i < nr_pages; i++) pages[i] = page + i; - return page_address(page); + return true; } struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages) @@ -55,7 +56,7 @@ struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages) if (WARN_ON_ONCE(nr_pages > INT_MAX)) return ERR_PTR(-EOVERFLOW); - pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); + pages = kvmalloc_objs(struct page *, nr_pages, GFP_KERNEL_ACCOUNT); if (!pages) return ERR_PTR(-ENOMEM); @@ -87,7 +88,7 @@ enum { IO_REGION_F_SINGLE_REF = 4, }; -void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr) +void io_free_region(struct user_struct *user, struct io_mapped_region *mr) { if (mr->pages) { long nr_refs = mr->nr_pages; @@ -104,8 +105,8 @@ void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr) } if ((mr->flags & IO_REGION_F_VMAP) && mr->ptr) vunmap(mr->ptr); - if (mr->nr_pages && ctx->user) - __io_unaccount_mem(ctx->user, mr->nr_pages); + if (mr->nr_pages && user) + __io_unaccount_mem(user, mr->nr_pages); memset(mr, 0, sizeof(*mr)); } @@ -116,7 +117,7 @@ static int io_region_init_ptr(struct io_mapped_region *mr) void *ptr; if (io_check_coalesce_buffer(mr->pages, mr->nr_pages, &ifd)) { - if (ifd.nr_folios == 1) { + if (ifd.nr_folios == 1 && !PageHighMem(mr->pages[0])) { mr->ptr = page_address(mr->pages[0]); return 0; } @@ -130,11 +131,10 @@ static int io_region_init_ptr(struct io_mapped_region *mr) return 0; } -static int io_region_pin_pages(struct io_ring_ctx *ctx, - struct io_mapped_region *mr, - struct io_uring_region_desc *reg) +static int io_region_pin_pages(struct io_mapped_region *mr, + struct io_uring_region_desc *reg) { - unsigned long size = mr->nr_pages << PAGE_SHIFT; + size_t size = io_region_size(mr); struct page **pages; int nr_pages; @@ -149,23 +149,20 @@ static int io_region_pin_pages(struct io_ring_ctx *ctx, return 0; } -static int io_region_allocate_pages(struct io_ring_ctx *ctx, - struct io_mapped_region *mr, +static int io_region_allocate_pages(struct io_mapped_region *mr, struct io_uring_region_desc *reg, unsigned long mmap_offset) { gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN; - unsigned long size = mr->nr_pages << PAGE_SHIFT; + size_t size = io_region_size(mr); unsigned long nr_allocated; struct page **pages; - void *p; - pages = kvmalloc_array(mr->nr_pages, sizeof(*pages), gfp); + pages = kvmalloc_objs(*pages, mr->nr_pages, gfp); if (!pages) return -ENOMEM; - p = io_mem_alloc_compound(pages, mr->nr_pages, size, gfp); - if (!IS_ERR(p)) { + if (io_mem_alloc_compound(pages, mr->nr_pages, size, gfp)) { mr->flags |= IO_REGION_F_SINGLE_REF; goto done; } @@ -218,9 +215,9 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, mr->nr_pages = nr_pages; if (reg->flags & IORING_MEM_REGION_TYPE_USER) - ret = io_region_pin_pages(ctx, mr, reg); + ret = io_region_pin_pages(mr, reg); else - ret = io_region_allocate_pages(ctx, mr, reg, mmap_offset); + ret = io_region_allocate_pages(mr, reg, mmap_offset); if (ret) goto out_free; @@ -229,36 +226,16 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, goto out_free; return 0; out_free: - io_free_region(ctx, mr); + io_free_region(ctx->user, mr); return ret; } -int io_create_region_mmap_safe(struct io_ring_ctx *ctx, struct io_mapped_region *mr, - struct io_uring_region_desc *reg, - unsigned long mmap_offset) -{ - struct io_mapped_region tmp_mr; - int ret; - - memcpy(&tmp_mr, mr, sizeof(tmp_mr)); - ret = io_create_region(ctx, &tmp_mr, reg, mmap_offset); - if (ret) - return ret; - - /* - * Once published mmap can find it without holding only the ->mmap_lock - * and not ->uring_lock. - */ - guard(mutex)(&ctx->mmap_lock); - memcpy(mr, &tmp_mr, sizeof(tmp_mr)); - return 0; -} - static struct io_mapped_region *io_mmap_get_region(struct io_ring_ctx *ctx, loff_t pgoff) { loff_t offset = pgoff << PAGE_SHIFT; - unsigned int bgid; + unsigned int id; + switch (offset & IORING_OFF_MMAP_MASK) { case IORING_OFF_SQ_RING: @@ -267,10 +244,13 @@ static struct io_mapped_region *io_mmap_get_region(struct io_ring_ctx *ctx, case IORING_OFF_SQES: return &ctx->sq_region; case IORING_OFF_PBUF_RING: - bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; - return io_pbuf_get_region(ctx, bgid); + id = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; + return io_pbuf_get_region(ctx, id); case IORING_MAP_OFF_PARAM_REGION: return &ctx->param_region; + case IORING_MAP_OFF_ZCRX_REGION: + id = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_ZCRX_SHIFT; + return io_zcrx_get_region(ctx, id); } return NULL; } @@ -288,8 +268,7 @@ static void *io_region_validate_mmap(struct io_ring_ctx *ctx, return io_region_get_ptr(mr); } -static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff, - size_t sz) +static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff) { struct io_ring_ctx *ctx = file->private_data; struct io_mapped_region *region; @@ -324,7 +303,7 @@ __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) guard(mutex)(&ctx->mmap_lock); - ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz); + ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff); if (IS_ERR(ptr)) return PTR_ERR(ptr); @@ -356,7 +335,7 @@ unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr, guard(mutex)(&ctx->mmap_lock); - ptr = io_uring_validate_mmap_request(filp, pgoff, len); + ptr = io_uring_validate_mmap_request(filp, pgoff); if (IS_ERR(ptr)) return -ENOMEM; @@ -382,14 +361,58 @@ unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr, #else addr = 0UL; #endif - return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags); + return mm_get_unmapped_area(filp, addr, len, pgoff, flags); } #else /* !CONFIG_MMU */ +/* + * Drop the pages that were initially referenced and added in + * io_uring_mmap(). We cannot have had a mremap() as that isn't supported, + * hence the vma should be identical to the one we initially referenced and + * mapped, and partial unmaps and splitting isn't possible on a file backed + * mapping. + */ +static void io_uring_nommu_vm_close(struct vm_area_struct *vma) +{ + unsigned long index; + + for (index = vma->vm_start; index < vma->vm_end; index += PAGE_SIZE) + put_page(virt_to_page((void *) index)); +} + +static const struct vm_operations_struct io_uring_nommu_vm_ops = { + .close = io_uring_nommu_vm_close, +}; + int io_uring_mmap(struct file *file, struct vm_area_struct *vma) { - return is_nommu_shared_mapping(vma->vm_flags) ? 0 : -EINVAL; + struct io_ring_ctx *ctx = file->private_data; + struct io_mapped_region *region; + unsigned long i; + + if (!is_nommu_shared_mapping(vma->vm_flags)) + return -EINVAL; + + guard(mutex)(&ctx->mmap_lock); + region = io_mmap_get_region(ctx, vma->vm_pgoff); + if (!region || !io_region_is_set(region)) + return -EINVAL; + + if ((vma->vm_end - vma->vm_start) != + (unsigned long) region->nr_pages << PAGE_SHIFT) + return -EINVAL; + + /* + * Pin the pages so io_free_region()'s release_pages() does not + * drop the last reference while this VMA exists. delete_vma() + * in mm/nommu.c calls vma_close() which runs ->close above. + */ + for (i = 0; i < region->nr_pages; i++) + get_page(region->pages[i]); + + vma->vm_ops = &io_uring_nommu_vm_ops; + return 0; } unsigned int io_uring_nommu_mmap_capabilities(struct file *file) @@ -406,7 +429,7 @@ unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr, guard(mutex)(&ctx->mmap_lock); - ptr = io_uring_validate_mmap_request(file, pgoff, len); + ptr = io_uring_validate_mmap_request(file, pgoff); if (IS_ERR(ptr)) return PTR_ERR(ptr); diff --git a/io_uring/memmap.h b/io_uring/memmap.h index c898dcba2b4e..f4cfbb6b9a1f 100644 --- a/io_uring/memmap.h +++ b/io_uring/memmap.h @@ -1,9 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef IO_URING_MEMMAP_H #define IO_URING_MEMMAP_H #define IORING_MAP_OFF_PARAM_REGION 0x20000000ULL +#define IORING_MAP_OFF_ZCRX_REGION 0x30000000ULL -struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); +#define IORING_OFF_ZCRX_SHIFT 16 + +struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages); #ifndef CONFIG_MMU unsigned int io_uring_nommu_mmap_capabilities(struct file *file); @@ -13,16 +17,11 @@ unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr, unsigned long flags); int io_uring_mmap(struct file *file, struct vm_area_struct *vma); -void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr); +void io_free_region(struct user_struct *user, struct io_mapped_region *mr); int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, struct io_uring_region_desc *reg, unsigned long mmap_offset); -int io_create_region_mmap_safe(struct io_ring_ctx *ctx, - struct io_mapped_region *mr, - struct io_uring_region_desc *reg, - unsigned long mmap_offset); - static inline void *io_region_get_ptr(struct io_mapped_region *mr) { return mr->ptr; @@ -33,4 +32,21 @@ static inline bool io_region_is_set(struct io_mapped_region *mr) return !!mr->nr_pages; } +static inline void io_region_publish(struct io_ring_ctx *ctx, + struct io_mapped_region *src_region, + struct io_mapped_region *dst_region) +{ + /* + * Once published mmap can find it without holding only the ->mmap_lock + * and not ->uring_lock. + */ + guard(mutex)(&ctx->mmap_lock); + *dst_region = *src_region; +} + +static inline size_t io_region_size(struct io_mapped_region *mr) +{ + return (size_t) mr->nr_pages << PAGE_SHIFT; +} + #endif diff --git a/io_uring/mock_file.c b/io_uring/mock_file.c new file mode 100644 index 000000000000..b318ed697998 --- /dev/null +++ b/io_uring/mock_file.c @@ -0,0 +1,351 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/device.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/miscdevice.h> +#include <linux/module.h> +#include <linux/anon_inodes.h> +#include <linux/ktime.h> +#include <linux/hrtimer.h> +#include <linux/poll.h> + +#include <linux/io_uring/cmd.h> +#include <linux/io_uring_types.h> +#include <uapi/linux/io_uring/mock_file.h> + +struct io_mock_iocb { + struct kiocb *iocb; + struct hrtimer timer; + int res; +}; + +struct io_mock_file { + size_t size; + u64 rw_delay_ns; + bool pollable; + struct wait_queue_head poll_wq; +}; + +#define IO_VALID_COPY_CMD_FLAGS IORING_MOCK_COPY_FROM + +static int io_copy_regbuf(struct iov_iter *reg_iter, void __user *ubuf) +{ + size_t ret, copied = 0; + size_t buflen = PAGE_SIZE; + void *tmp_buf; + + tmp_buf = kzalloc(buflen, GFP_KERNEL); + if (!tmp_buf) + return -ENOMEM; + + while (iov_iter_count(reg_iter)) { + size_t len = min(iov_iter_count(reg_iter), buflen); + + if (iov_iter_rw(reg_iter) == ITER_SOURCE) { + ret = copy_from_iter(tmp_buf, len, reg_iter); + if (ret <= 0) + break; + if (copy_to_user(ubuf, tmp_buf, ret)) + break; + } else { + if (copy_from_user(tmp_buf, ubuf, len)) + break; + ret = copy_to_iter(tmp_buf, len, reg_iter); + if (ret <= 0) + break; + } + ubuf += ret; + copied += ret; + } + + kfree(tmp_buf); + return copied; +} + +static int io_cmd_copy_regbuf(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + const struct io_uring_sqe *sqe = cmd->sqe; + const struct iovec __user *iovec; + unsigned flags, iovec_len; + struct iov_iter iter; + void __user *ubuf; + int dir, ret; + + ubuf = u64_to_user_ptr(READ_ONCE(sqe->addr3)); + iovec = u64_to_user_ptr(READ_ONCE(sqe->addr)); + iovec_len = READ_ONCE(sqe->len); + flags = READ_ONCE(sqe->file_index); + + if (unlikely(sqe->ioprio || sqe->__pad1)) + return -EINVAL; + if (flags & ~IO_VALID_COPY_CMD_FLAGS) + return -EINVAL; + + dir = (flags & IORING_MOCK_COPY_FROM) ? ITER_SOURCE : ITER_DEST; + ret = io_uring_cmd_import_fixed_vec(cmd, iovec, iovec_len, dir, &iter, + issue_flags); + if (ret) + return ret; + ret = io_copy_regbuf(&iter, ubuf); + return ret ? ret : -EFAULT; +} + +static int io_mock_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + switch (cmd->cmd_op) { + case IORING_MOCK_CMD_COPY_REGBUF: + return io_cmd_copy_regbuf(cmd, issue_flags); + } + return -ENOTSUPP; +} + +static enum hrtimer_restart io_mock_rw_timer_expired(struct hrtimer *timer) +{ + struct io_mock_iocb *mio = container_of(timer, struct io_mock_iocb, timer); + struct kiocb *iocb = mio->iocb; + + WRITE_ONCE(iocb->private, NULL); + iocb->ki_complete(iocb, mio->res); + kfree(mio); + return HRTIMER_NORESTART; +} + +static ssize_t io_mock_delay_rw(struct kiocb *iocb, size_t len) +{ + struct io_mock_file *mf = iocb->ki_filp->private_data; + struct io_mock_iocb *mio; + + mio = kzalloc_obj(*mio); + if (!mio) + return -ENOMEM; + + mio->iocb = iocb; + mio->res = len; + hrtimer_setup(&mio->timer, io_mock_rw_timer_expired, + CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_start(&mio->timer, ns_to_ktime(mf->rw_delay_ns), + HRTIMER_MODE_REL); + return -EIOCBQUEUED; +} + +static ssize_t io_mock_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct io_mock_file *mf = iocb->ki_filp->private_data; + size_t len = iov_iter_count(to); + size_t nr_zeroed; + + if (iocb->ki_pos + len > mf->size) + return -EINVAL; + nr_zeroed = iov_iter_zero(len, to); + if (!mf->rw_delay_ns || nr_zeroed != len) + return nr_zeroed; + + return io_mock_delay_rw(iocb, len); +} + +static ssize_t io_mock_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct io_mock_file *mf = iocb->ki_filp->private_data; + size_t len = iov_iter_count(from); + + if (iocb->ki_pos + len > mf->size) + return -EINVAL; + if (!mf->rw_delay_ns) { + iov_iter_advance(from, len); + return len; + } + + return io_mock_delay_rw(iocb, len); +} + +static loff_t io_mock_llseek(struct file *file, loff_t offset, int whence) +{ + struct io_mock_file *mf = file->private_data; + + return fixed_size_llseek(file, offset, whence, mf->size); +} + +static __poll_t io_mock_poll(struct file *file, struct poll_table_struct *pt) +{ + struct io_mock_file *mf = file->private_data; + __poll_t mask = 0; + + poll_wait(file, &mf->poll_wq, pt); + + mask |= EPOLLOUT | EPOLLWRNORM; + mask |= EPOLLIN | EPOLLRDNORM; + return mask; +} + +static int io_mock_release(struct inode *inode, struct file *file) +{ + struct io_mock_file *mf = file->private_data; + + kfree(mf); + return 0; +} + +static const struct file_operations io_mock_fops = { + .owner = THIS_MODULE, + .release = io_mock_release, + .uring_cmd = io_mock_cmd, + .read_iter = io_mock_read_iter, + .write_iter = io_mock_write_iter, + .llseek = io_mock_llseek, +}; + +static const struct file_operations io_mock_poll_fops = { + .owner = THIS_MODULE, + .release = io_mock_release, + .uring_cmd = io_mock_cmd, + .read_iter = io_mock_read_iter, + .write_iter = io_mock_write_iter, + .llseek = io_mock_llseek, + .poll = io_mock_poll, +}; + +#define IO_VALID_CREATE_FLAGS (IORING_MOCK_CREATE_F_SUPPORT_NOWAIT | \ + IORING_MOCK_CREATE_F_POLL) + +static int io_create_mock_file(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + const struct file_operations *fops = &io_mock_fops; + const struct io_uring_sqe *sqe = cmd->sqe; + struct io_uring_mock_create mc, __user *uarg; + struct file *file; + struct io_mock_file *mf __free(kfree) = NULL; + size_t uarg_size; + + /* + * It's a testing only driver that allows exercising edge cases + * that wouldn't be possible to hit otherwise. + */ + add_taint(TAINT_TEST, LOCKDEP_STILL_OK); + + uarg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + uarg_size = READ_ONCE(sqe->len); + + if (sqe->ioprio || sqe->__pad1 || sqe->addr3 || sqe->file_index) + return -EINVAL; + if (uarg_size != sizeof(mc)) + return -EINVAL; + + memset(&mc, 0, sizeof(mc)); + if (copy_from_user(&mc, uarg, uarg_size)) + return -EFAULT; + if (!mem_is_zero(mc.__resv, sizeof(mc.__resv))) + return -EINVAL; + if (mc.flags & ~IO_VALID_CREATE_FLAGS) + return -EINVAL; + if (mc.file_size > SZ_1G) + return -EINVAL; + if (mc.rw_delay_ns > NSEC_PER_SEC) + return -EINVAL; + + mf = kzalloc_obj(*mf, GFP_KERNEL_ACCOUNT); + if (!mf) + return -ENOMEM; + + init_waitqueue_head(&mf->poll_wq); + mf->size = mc.file_size; + mf->rw_delay_ns = mc.rw_delay_ns; + if (mc.flags & IORING_MOCK_CREATE_F_POLL) { + fops = &io_mock_poll_fops; + mf->pollable = true; + } + + FD_PREPARE(fdf, O_RDWR | O_CLOEXEC, + anon_inode_create_getfile("[io_uring_mock]", fops, mf, + O_RDWR | O_CLOEXEC, NULL)); + if (fdf.err) + return fdf.err; + + retain_and_null_ptr(mf); + file = fd_prepare_file(fdf); + file->f_mode |= FMODE_READ | FMODE_CAN_READ | FMODE_WRITE | + FMODE_CAN_WRITE | FMODE_LSEEK; + if (mc.flags & IORING_MOCK_CREATE_F_SUPPORT_NOWAIT) + file->f_mode |= FMODE_NOWAIT; + + mc.out_fd = fd_prepare_fd(fdf); + if (copy_to_user(uarg, &mc, uarg_size)) + return -EFAULT; + + fd_publish(fdf); + return 0; +} + +static int io_probe_mock(struct io_uring_cmd *cmd) +{ + const struct io_uring_sqe *sqe = cmd->sqe; + struct io_uring_mock_probe mp, __user *uarg; + size_t uarg_size; + + uarg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + uarg_size = READ_ONCE(sqe->len); + + if (sqe->ioprio || sqe->__pad1 || sqe->addr3 || sqe->file_index || + uarg_size != sizeof(mp)) + return -EINVAL; + + memset(&mp, 0, sizeof(mp)); + if (copy_from_user(&mp, uarg, uarg_size)) + return -EFAULT; + if (!mem_is_zero(&mp, sizeof(mp))) + return -EINVAL; + + mp.features = IORING_MOCK_FEAT_END; + + if (copy_to_user(uarg, &mp, uarg_size)) + return -EFAULT; + return 0; +} + +static int iou_mock_mgr_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch (cmd->cmd_op) { + case IORING_MOCK_MGR_CMD_PROBE: + return io_probe_mock(cmd); + case IORING_MOCK_MGR_CMD_CREATE: + return io_create_mock_file(cmd, issue_flags); + } + return -EOPNOTSUPP; +} + +static const struct file_operations iou_mock_dev_fops = { + .owner = THIS_MODULE, + .uring_cmd = iou_mock_mgr_cmd, +}; + +static struct miscdevice iou_mock_miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "io_uring_mock", + .fops = &iou_mock_dev_fops, +}; + +static int __init io_mock_init(void) +{ + int ret; + + ret = misc_register(&iou_mock_miscdev); + if (ret < 0) { + pr_err("Could not initialize io_uring mock device\n"); + return ret; + } + return 0; +} + +static void __exit io_mock_exit(void) +{ + misc_deregister(&iou_mock_miscdev); +} + +module_init(io_mock_init) +module_exit(io_mock_exit) + +MODULE_AUTHOR("Pavel Begunkov <asml.silence@gmail.com>"); +MODULE_DESCRIPTION("io_uring mock file"); +MODULE_LICENSE("GPL"); diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 7e6f68e911f1..3ff9098573db 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -11,7 +11,6 @@ #include "io_uring.h" #include "rsrc.h" #include "filetable.h" -#include "alloc_cache.h" #include "msg_ring.h" /* All valid masks for MSG_RING */ @@ -38,8 +37,8 @@ static void io_double_unlock_ctx(struct io_ring_ctx *octx) mutex_unlock(&octx->uring_lock); } -static int io_double_lock_ctx(struct io_ring_ctx *octx, - unsigned int issue_flags) +static int io_lock_external_ctx(struct io_ring_ctx *octx, + unsigned int issue_flags) { /* * To ensure proper ordering between the two ctxs, we can only @@ -68,52 +67,30 @@ void io_msg_ring_cleanup(struct io_kiocb *req) static inline bool io_msg_need_remote(struct io_ring_ctx *target_ctx) { - return target_ctx->task_complete; + return target_ctx->int_flags & IO_RING_F_TASK_COMPLETE; } -static void io_msg_tw_complete(struct io_kiocb *req, struct io_tw_state *ts) +static void io_msg_tw_complete(struct io_tw_req tw_req, io_tw_token_t tw) { + struct io_kiocb *req = tw_req.req; struct io_ring_ctx *ctx = req->ctx; io_add_aux_cqe(ctx, req->cqe.user_data, req->cqe.res, req->cqe.flags); - if (spin_trylock(&ctx->msg_lock)) { - if (io_alloc_cache_put(&ctx->msg_cache, req)) - req = NULL; - spin_unlock(&ctx->msg_lock); - } - if (req) - kmem_cache_free(req_cachep, req); + kfree_rcu(req, rcu_head); percpu_ref_put(&ctx->refs); } -static int io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req, +static void io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req, int res, u32 cflags, u64 user_data) { - if (!READ_ONCE(ctx->submitter_task)) { - kmem_cache_free(req_cachep, req); - return -EOWNERDEAD; - } + req->opcode = IORING_OP_NOP; req->cqe.user_data = user_data; io_req_set_res(req, res, cflags); percpu_ref_get(&ctx->refs); req->ctx = ctx; req->tctx = NULL; req->io_task_work.func = io_msg_tw_complete; - io_req_task_work_add_remote(req, ctx, IOU_F_TWQ_LAZY_WAKE); - return 0; -} - -static struct io_kiocb *io_msg_get_kiocb(struct io_ring_ctx *ctx) -{ - struct io_kiocb *req = NULL; - - if (spin_trylock(&ctx->msg_lock)) { - req = io_alloc_cache_get(&ctx->msg_cache); - spin_unlock(&ctx->msg_lock); - if (req) - return req; - } - return kmem_cache_alloc(req_cachep, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO); + io_req_task_work_add_remote(req, IOU_F_TWQ_LAZY_WAKE); } static int io_msg_data_remote(struct io_ring_ctx *target_ctx, @@ -122,15 +99,15 @@ static int io_msg_data_remote(struct io_ring_ctx *target_ctx, struct io_kiocb *target; u32 flags = 0; - target = io_msg_get_kiocb(target_ctx); + target = kmem_cache_alloc(req_cachep, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO) ; if (unlikely(!target)) return -ENOMEM; if (msg->flags & IORING_MSG_RING_FLAGS_PASS) flags = msg->cqe_flags; - return io_msg_remote_post(target_ctx, target, msg->len, flags, - msg->user_data); + io_msg_remote_post(target_ctx, target, msg->len, flags, msg->user_data); + return 0; } static int __io_msg_ring_data(struct io_ring_ctx *target_ctx, @@ -143,7 +120,11 @@ static int __io_msg_ring_data(struct io_ring_ctx *target_ctx, return -EINVAL; if (!(msg->flags & IORING_MSG_RING_FLAGS_PASS) && msg->dst_fd) return -EINVAL; - if (target_ctx->flags & IORING_SETUP_R_DISABLED) + /* + * Keep IORING_SETUP_R_DISABLED check before submitter_task load + * in io_msg_data_remote() -> io_req_task_work_add_remote() + */ + if (smp_load_acquire(&target_ctx->flags) & IORING_SETUP_R_DISABLED) return -EBADFD; if (io_msg_need_remote(target_ctx)) @@ -154,7 +135,7 @@ static int __io_msg_ring_data(struct io_ring_ctx *target_ctx, ret = -EOVERFLOW; if (target_ctx->flags & IORING_SETUP_IOPOLL) { - if (unlikely(io_double_lock_ctx(target_ctx, issue_flags))) + if (unlikely(io_lock_external_ctx(target_ctx, issue_flags))) return -EAGAIN; } if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags)) @@ -199,7 +180,7 @@ static int io_msg_install_complete(struct io_kiocb *req, unsigned int issue_flag struct file *src_file = msg->src_file; int ret; - if (unlikely(io_double_lock_ctx(target_ctx, issue_flags))) + if (unlikely(io_lock_external_ctx(target_ctx, issue_flags))) return -EAGAIN; ret = __io_fixed_fd_install(target_ctx, src_file, msg->dst_fd); @@ -241,10 +222,7 @@ static int io_msg_fd_remote(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->file->private_data; struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); - struct task_struct *task = READ_ONCE(ctx->submitter_task); - - if (unlikely(!task)) - return -EOWNERDEAD; + struct task_struct *task = ctx->submitter_task; init_task_work(&msg->tw, io_msg_tw_fd_complete); if (task_work_add(task, &msg->tw, TWA_SIGNAL)) @@ -263,7 +241,11 @@ static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags) return -EINVAL; if (target_ctx == ctx) return -EINVAL; - if (target_ctx->flags & IORING_SETUP_R_DISABLED) + /* + * Keep IORING_SETUP_R_DISABLED check before submitter_task load + * in io_msg_fd_remote() + */ + if (smp_load_acquire(&target_ctx->flags) & IORING_SETUP_R_DISABLED) return -EBADFD; if (!msg->src_file) { int ret = io_msg_grab_file(req, issue_flags); @@ -327,7 +309,7 @@ done: req_set_fail(req); } io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_uring_sync_msg_ring(struct io_uring_sqe *sqe) diff --git a/io_uring/napi.c b/io_uring/napi.c index b1ade3fda30f..bfc771445912 100644 --- a/io_uring/napi.c +++ b/io_uring/napi.c @@ -38,13 +38,14 @@ static inline ktime_t net_to_ktime(unsigned long t) return ns_to_ktime(t << 10); } -int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id) +int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id, + unsigned int mode) { struct hlist_head *hash_list; struct io_napi_entry *e; /* Non-NAPI IDs can be rejected. */ - if (napi_id < MIN_NAPI_ID) + if (!napi_id_valid(napi_id)) return -EINVAL; hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))]; @@ -69,6 +70,11 @@ int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id) * kfree() */ spin_lock(&ctx->napi_lock); + if (unlikely(READ_ONCE(ctx->napi_track_mode) != mode)) { + spin_unlock(&ctx->napi_lock); + kfree(e); + return -EINVAL; + } if (unlikely(io_napi_hash_find(hash_list, napi_id))) { spin_unlock(&ctx->napi_lock); kfree(e); @@ -87,7 +93,7 @@ static int __io_napi_del_id(struct io_ring_ctx *ctx, unsigned int napi_id) struct io_napi_entry *e; /* Non-NAPI IDs can be rejected. */ - if (napi_id < MIN_NAPI_ID) + if (!napi_id_valid(napi_id)) return -EINVAL; hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))]; @@ -196,9 +202,14 @@ __io_napi_do_busy_loop(struct io_ring_ctx *ctx, bool (*loop_end)(void *, unsigned long), void *loop_end_arg) { - if (READ_ONCE(ctx->napi_track_mode) == IO_URING_NAPI_TRACKING_STATIC) + switch (READ_ONCE(ctx->napi_track_mode)) { + case IO_URING_NAPI_TRACKING_STATIC: return static_tracking_do_busy_loop(ctx, loop_end, loop_end_arg); - return dynamic_tracking_do_busy_loop(ctx, loop_end, loop_end_arg); + case IO_URING_NAPI_TRACKING_DYNAMIC: + return dynamic_tracking_do_busy_loop(ctx, loop_end, loop_end_arg); + default: + return false; + } } static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx, @@ -273,11 +284,13 @@ static int io_napi_register_napi(struct io_ring_ctx *ctx, default: return -EINVAL; } - /* clean the napi list for new settings */ + WRITE_ONCE(ctx->napi_track_mode, IO_URING_NAPI_TRACKING_INACTIVE); io_napi_free(ctx); - WRITE_ONCE(ctx->napi_track_mode, napi->op_param); + /* cap NAPI at 10 msec of spin time */ + napi->busy_poll_to = min(10000, napi->busy_poll_to); WRITE_ONCE(ctx->napi_busy_poll_dt, napi->busy_poll_to * NSEC_PER_USEC); WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi->prefer_busy_poll); + WRITE_ONCE(ctx->napi_track_mode, napi->op_param); return 0; } @@ -313,7 +326,8 @@ int io_register_napi(struct io_ring_ctx *ctx, void __user *arg) case IO_URING_NAPI_STATIC_ADD_ID: if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC) return -EINVAL; - return __io_napi_add_id(ctx, napi.op_param); + return __io_napi_add_id(ctx, napi.op_param, + IO_URING_NAPI_TRACKING_STATIC); case IO_URING_NAPI_STATIC_DEL_ID: if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC) return -EINVAL; @@ -341,9 +355,10 @@ int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg) if (arg && copy_to_user(arg, &curr, sizeof(curr))) return -EFAULT; + WRITE_ONCE(ctx->napi_track_mode, IO_URING_NAPI_TRACKING_INACTIVE); WRITE_ONCE(ctx->napi_busy_poll_dt, 0); WRITE_ONCE(ctx->napi_prefer_busy_poll, false); - WRITE_ONCE(ctx->napi_track_mode, IO_URING_NAPI_TRACKING_INACTIVE); + io_napi_free(ctx); return 0; } diff --git a/io_uring/napi.h b/io_uring/napi.h index fa742f42e09b..e0aecccc5065 100644 --- a/io_uring/napi.h +++ b/io_uring/napi.h @@ -15,7 +15,8 @@ void io_napi_free(struct io_ring_ctx *ctx); int io_register_napi(struct io_ring_ctx *ctx, void __user *arg); int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg); -int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id); +int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id, + unsigned int mode); void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq); int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx); @@ -43,13 +44,14 @@ static inline void io_napi_add(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; struct socket *sock; + unsigned int mode = IO_URING_NAPI_TRACKING_DYNAMIC; - if (READ_ONCE(ctx->napi_track_mode) != IO_URING_NAPI_TRACKING_DYNAMIC) + if (READ_ONCE(ctx->napi_track_mode) != mode) return; sock = sock_from_file(req->file); if (sock && sock->sk) - __io_napi_add_id(ctx, READ_ONCE(sock->sk->sk_napi_id)); + __io_napi_add_id(ctx, READ_ONCE(sock->sk->sk_napi_id), mode); } #else diff --git a/io_uring/net.c b/io_uring/net.c index a288c75bb92c..8df15b639358 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -4,20 +4,22 @@ #include <linux/file.h> #include <linux/slab.h> #include <linux/net.h> +#include <linux/un.h> #include <linux/compat.h> #include <net/compat.h> #include <linux/io_uring.h> #include <uapi/linux/io_uring.h> +#include "filetable.h" #include "io_uring.h" #include "kbuf.h" #include "alloc_cache.h" #include "net.h" #include "notif.h" #include "rsrc.h" +#include "zcrx.h" -#if defined(CONFIG_NET) struct io_shutdown { struct file *file; int how; @@ -75,19 +77,50 @@ struct io_sr_msg { u16 flags; /* initialised and used only by !msg send variants */ u16 buf_group; - u16 buf_index; + /* per-invocation mshot limit */ + unsigned mshot_len; + /* overall mshot byte limit */ + unsigned mshot_total_len; void __user *msg_control; /* used only for send zerocopy */ struct io_kiocb *notif; }; /* + * The UAPI flags are the lower 8 bits, as that's all sqe->ioprio will hold + * anyway. Use the upper 8 bits for internal uses. + */ +enum sr_retry_flags { + IORING_RECV_RETRY = (1U << 15), + IORING_RECV_PARTIAL_MAP = (1U << 14), + IORING_RECV_MSHOT_CAP = (1U << 13), + IORING_RECV_MSHOT_LIM = (1U << 12), + IORING_RECV_MSHOT_DONE = (1U << 11), + + IORING_RECV_RETRY_CLEAR = IORING_RECV_RETRY | IORING_RECV_PARTIAL_MAP, + IORING_RECV_NO_RETRY = IORING_RECV_RETRY | IORING_RECV_PARTIAL_MAP | + IORING_RECV_MSHOT_CAP | IORING_RECV_MSHOT_DONE, +}; + +/* * Number of times we'll try and do receives if there's more data. If we * exceed this limit, then add us to the back of the queue and retry from * there. This helps fairness between flooding clients. */ #define MULTISHOT_MAX_RETRY 32 +struct io_recvzc { + struct file *file; + u16 flags; + u32 len; + struct io_zcrx_ifq *ifq; +}; + +static int io_sg_from_iter_iovec(struct sk_buff *skb, + struct iov_iter *from, size_t length); +static int io_sg_from_iter(struct sk_buff *skb, + struct iov_iter *from, size_t length); + int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown); @@ -115,7 +148,7 @@ int io_shutdown(struct io_kiocb *req, unsigned int issue_flags) ret = __sys_shutdown_sock(sock, shutdown->how); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } static bool io_net_retry(struct socket *sock, int flags) @@ -127,11 +160,8 @@ static bool io_net_retry(struct socket *sock, int flags) static void io_netmsg_iovec_free(struct io_async_msghdr *kmsg) { - if (kmsg->free_iov) { - kfree(kmsg->free_iov); - kmsg->free_iov_nr = 0; - kmsg->free_iov = NULL; - } + if (kmsg->vec.iovec) + io_vec_free(&kmsg->vec); } static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags) @@ -145,11 +175,12 @@ static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags) } /* Let normal cleanup path reap it if we fail adding to the cache */ - io_alloc_cache_kasan(&hdr->free_iov, &hdr->free_iov_nr); - if (io_alloc_cache_put(&req->ctx->netmsg_cache, hdr)) { - req->async_data = NULL; - req->flags &= ~(REQ_F_ASYNC_DATA|REQ_F_NEED_CLEANUP); - } + io_alloc_cache_vec_kasan(&hdr->vec); + if (hdr->vec.nr > IO_VEC_CACHE_SOFT_CAP) + io_vec_free(&hdr->vec); + + if (io_alloc_cache_put(&req->ctx->netmsg_cache, hdr)) + io_req_async_data_clear(req, REQ_F_NEED_CLEANUP); } static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req) @@ -162,24 +193,11 @@ static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req) return NULL; /* If the async data was cached, we might have an iov cached inside. */ - if (hdr->free_iov) + if (hdr->vec.iovec) req->flags |= REQ_F_NEED_CLEANUP; return hdr; } -/* assign new iovec to kmsg, if we need to */ -static void io_net_vec_assign(struct io_kiocb *req, struct io_async_msghdr *kmsg, - struct iovec *iov) -{ - if (iov) { - req->flags |= REQ_F_NEED_CLEANUP; - kmsg->free_iov_nr = kmsg->msg.msg_iter.nr_segs; - if (kmsg->free_iov) - kfree(kmsg->free_iov); - kmsg->free_iov = iov; - } -} - static inline void io_mshot_prep_retry(struct io_kiocb *req, struct io_async_msghdr *kmsg) { @@ -187,156 +205,139 @@ static inline void io_mshot_prep_retry(struct io_kiocb *req, req->flags &= ~REQ_F_BL_EMPTY; sr->done_io = 0; - sr->len = 0; /* get from the provided buffer */ - req->buf_index = sr->buf_group; + sr->flags &= ~IORING_RECV_RETRY_CLEAR; + sr->len = sr->mshot_len; } -#ifdef CONFIG_COMPAT -static int io_compat_msg_copy_hdr(struct io_kiocb *req, - struct io_async_msghdr *iomsg, - struct compat_msghdr *msg, int ddir) +static int io_net_import_vec(struct io_kiocb *req, struct io_async_msghdr *iomsg, + const struct iovec __user *uiov, unsigned uvec_seg, + int ddir) { - struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct compat_iovec __user *uiov; struct iovec *iov; int ret, nr_segs; - if (iomsg->free_iov) { - nr_segs = iomsg->free_iov_nr; - iov = iomsg->free_iov; + if (iomsg->vec.iovec) { + nr_segs = iomsg->vec.nr; + iov = iomsg->vec.iovec; } else { - iov = &iomsg->fast_iov; nr_segs = 1; + iov = &iomsg->fast_iov; + } + + ret = __import_iovec(ddir, uiov, uvec_seg, nr_segs, &iov, + &iomsg->msg.msg_iter, io_is_compat(req->ctx)); + if (unlikely(ret < 0)) + return ret; + + if (iov) { + req->flags |= REQ_F_NEED_CLEANUP; + io_vec_reset_iovec(&iomsg->vec, iov, iomsg->msg.msg_iter.nr_segs); } + return 0; +} + +static int io_compat_msg_copy_hdr(struct io_kiocb *req, + struct io_async_msghdr *iomsg, + struct compat_msghdr *msg, int ddir, + struct sockaddr __user **save_addr) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); + struct compat_iovec __user *uiov; + int ret; if (copy_from_user(msg, sr->umsg_compat, sizeof(*msg))) return -EFAULT; + ret = __get_compat_msghdr(&iomsg->msg, msg, save_addr); + if (ret) + return ret; + uiov = compat_ptr(msg->msg_iov); if (req->flags & REQ_F_BUFFER_SELECT) { - compat_ssize_t clen; - if (msg->msg_iovlen == 0) { - sr->len = iov->iov_len = 0; - iov->iov_base = NULL; + sr->len = 0; } else if (msg->msg_iovlen > 1) { return -EINVAL; } else { - if (!access_ok(uiov, sizeof(*uiov))) - return -EFAULT; - if (__get_user(clen, &uiov->iov_len)) + struct compat_iovec tmp_iov; + + if (copy_from_user(&tmp_iov, uiov, sizeof(tmp_iov))) return -EFAULT; - if (clen < 0) - return -EINVAL; - sr->len = clen; + sr->len = tmp_iov.iov_len; } - - return 0; } - - ret = __import_iovec(ddir, (struct iovec __user *)uiov, msg->msg_iovlen, - nr_segs, &iov, &iomsg->msg.msg_iter, true); - if (unlikely(ret < 0)) - return ret; - - io_net_vec_assign(req, iomsg, iov); return 0; } -#endif -static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg, - struct user_msghdr *msg, int ddir) +static int io_copy_msghdr_from_user(struct user_msghdr *msg, + struct user_msghdr __user *umsg) { - struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct user_msghdr __user *umsg = sr->umsg; - struct iovec *iov; - int ret, nr_segs; - - if (iomsg->free_iov) { - nr_segs = iomsg->free_iov_nr; - iov = iomsg->free_iov; - } else { - iov = &iomsg->fast_iov; - nr_segs = 1; - } - if (!user_access_begin(umsg, sizeof(*umsg))) return -EFAULT; - - ret = -EFAULT; unsafe_get_user(msg->msg_name, &umsg->msg_name, ua_end); unsafe_get_user(msg->msg_namelen, &umsg->msg_namelen, ua_end); unsafe_get_user(msg->msg_iov, &umsg->msg_iov, ua_end); unsafe_get_user(msg->msg_iovlen, &umsg->msg_iovlen, ua_end); unsafe_get_user(msg->msg_control, &umsg->msg_control, ua_end); unsafe_get_user(msg->msg_controllen, &umsg->msg_controllen, ua_end); - msg->msg_flags = 0; - - if (req->flags & REQ_F_BUFFER_SELECT) { - if (msg->msg_iovlen == 0) { - sr->len = iov->iov_len = 0; - iov->iov_base = NULL; - } else if (msg->msg_iovlen > 1) { - ret = -EINVAL; - goto ua_end; - } else { - struct iovec __user *uiov = msg->msg_iov; - - /* we only need the length for provided buffers */ - if (!access_ok(&uiov->iov_len, sizeof(uiov->iov_len))) - goto ua_end; - unsafe_get_user(iov->iov_len, &uiov->iov_len, ua_end); - sr->len = iov->iov_len; - } - ret = 0; -ua_end: - user_access_end(); - return ret; - } - user_access_end(); - ret = __import_iovec(ddir, msg->msg_iov, msg->msg_iovlen, nr_segs, - &iov, &iomsg->msg.msg_iter, false); - if (unlikely(ret < 0)) - return ret; - - io_net_vec_assign(req, iomsg, iov); return 0; +ua_end: + user_access_end(); + return -EFAULT; } -static int io_sendmsg_copy_hdr(struct io_kiocb *req, - struct io_async_msghdr *iomsg) +static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg, + struct user_msghdr *msg, int ddir, + struct sockaddr __user **save_addr) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct user_msghdr msg; + struct user_msghdr __user *umsg = sr->umsg; int ret; iomsg->msg.msg_name = &iomsg->addr; iomsg->msg.msg_iter.nr_segs = 0; -#ifdef CONFIG_COMPAT - if (unlikely(req->ctx->compat)) { + if (io_is_compat(req->ctx)) { struct compat_msghdr cmsg; - ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_SOURCE); - if (unlikely(ret)) + ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ddir, save_addr); + if (ret) return ret; - ret = __get_compat_msghdr(&iomsg->msg, &cmsg, NULL); - sr->msg_control = iomsg->msg.msg_control_user; - return ret; + memset(msg, 0, sizeof(*msg)); + msg->msg_namelen = cmsg.msg_namelen; + msg->msg_controllen = cmsg.msg_controllen; + msg->msg_iov = compat_ptr(cmsg.msg_iov); + msg->msg_iovlen = cmsg.msg_iovlen; + return 0; } -#endif - ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_SOURCE); + ret = io_copy_msghdr_from_user(msg, umsg); if (unlikely(ret)) return ret; - ret = __copy_msghdr(&iomsg->msg, &msg, NULL); + msg->msg_flags = 0; - /* save msg_control as sys_sendmsg() overwrites it */ - sr->msg_control = iomsg->msg.msg_control_user; - return ret; + ret = __copy_msghdr(&iomsg->msg, msg, save_addr); + if (ret) + return ret; + + if (req->flags & REQ_F_BUFFER_SELECT) { + if (msg->msg_iovlen == 0) { + sr->len = 0; + } else if (msg->msg_iovlen > 1) { + return -EINVAL; + } else { + struct iovec __user *uiov = msg->msg_iov; + struct iovec tmp_iov; + + if (copy_from_user(&tmp_iov, uiov, sizeof(tmp_iov))) + return -EFAULT; + sr->len = tmp_iov.iov_len; + } + } + return 0; } void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req) @@ -374,67 +375,83 @@ static int io_send_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe) kmsg->msg.msg_name = &kmsg->addr; kmsg->msg.msg_namelen = addr_len; } - if (!io_do_buffer_select(req)) { - ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, - &kmsg->msg.msg_iter); - if (unlikely(ret < 0)) - return ret; + if (sr->flags & IORING_RECVSEND_FIXED_BUF) { + if (!(sr->flags & IORING_SEND_VECTORIZED)) { + req->flags |= REQ_F_IMPORT_BUFFER; + return 0; + } + + kmsg->msg.msg_iter.nr_segs = sr->len; + return io_prep_reg_iovec(req, &kmsg->vec, sr->buf, sr->len); } - return 0; + if (req->flags & REQ_F_BUFFER_SELECT) + return 0; + + if (sr->flags & IORING_SEND_VECTORIZED) + return io_net_import_vec(req, kmsg, sr->buf, sr->len, ITER_SOURCE); + + return import_ubuf(ITER_SOURCE, sr->buf, sr->len, &kmsg->msg.msg_iter); } static int io_sendmsg_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_async_msghdr *kmsg = req->async_data; + struct user_msghdr msg; int ret; + sr->flags |= IORING_SEND_VECTORIZED; sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + ret = io_msg_copy_hdr(req, kmsg, &msg, ITER_SOURCE, NULL); + if (unlikely(ret)) + return ret; + /* save msg_control as sys_sendmsg() overwrites it */ + sr->msg_control = kmsg->msg.msg_control_user; - ret = io_sendmsg_copy_hdr(req, kmsg); - if (!ret) - req->flags |= REQ_F_NEED_CLEANUP; - return ret; + if (sr->flags & IORING_RECVSEND_FIXED_BUF) { + kmsg->msg.msg_iter.nr_segs = msg.msg_iovlen; + return io_prep_reg_iovec(req, &kmsg->vec, msg.msg_iov, + msg.msg_iovlen); + } + if (req->flags & REQ_F_BUFFER_SELECT) + return 0; + return io_net_import_vec(req, kmsg, msg.msg_iov, msg.msg_iovlen, ITER_SOURCE); } -#define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_BUNDLE) +#define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_BUNDLE | IORING_SEND_VECTORIZED) int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); sr->done_io = 0; - - if (req->opcode != IORING_OP_SEND) { - if (sqe->addr2 || sqe->file_index) - return -EINVAL; - } - sr->len = READ_ONCE(sqe->len); + if (unlikely(sr->len < 0)) + return -EINVAL; sr->flags = READ_ONCE(sqe->ioprio); if (sr->flags & ~SENDMSG_FLAGS) return -EINVAL; sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; if (sr->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; + if (req->flags & REQ_F_BUFFER_SELECT) + sr->buf_group = req->buf_index; if (sr->flags & IORING_RECVSEND_BUNDLE) { if (req->opcode == IORING_OP_SENDMSG) return -EINVAL; - if (!(req->flags & REQ_F_BUFFER_SELECT)) - return -EINVAL; sr->msg_flags |= MSG_WAITALL; - sr->buf_group = req->buf_index; - req->buf_list = NULL; + req->flags |= REQ_F_MULTISHOT; } -#ifdef CONFIG_COMPAT - if (req->ctx->compat) + if (io_is_compat(req->ctx)) sr->msg_flags |= MSG_CMSG_COMPAT; -#endif + if (unlikely(!io_msg_alloc_async(req))) return -ENOMEM; if (req->opcode != IORING_OP_SENDMSG) return io_send_setup(req, sqe); + if (unlikely(sqe->addr2 || sqe->file_index)) + return -EINVAL; return io_sendmsg_setup(req, sqe); } @@ -463,7 +480,7 @@ static int io_bundle_nbufs(struct io_async_msghdr *kmsg, int ret) if (iter_is_ubuf(&kmsg->msg.msg_iter)) return 1; - iov = kmsg->free_iov; + iov = kmsg->vec.iovec; if (!iov) iov = &kmsg->fast_iov; @@ -483,37 +500,50 @@ static int io_bundle_nbufs(struct io_async_msghdr *kmsg, int ret) return nbufs; } -static inline bool io_send_finish(struct io_kiocb *req, int *ret, +static int io_net_kbuf_recyle(struct io_kiocb *req, struct io_buffer_list *bl, + struct io_async_msghdr *kmsg, int len) +{ + req->flags |= REQ_F_BL_NO_RECYCLE; + if (req->flags & REQ_F_BUFFERS_COMMIT) + io_kbuf_commit(req, bl, len, io_bundle_nbufs(kmsg, len)); + return IOU_RETRY; +} + +static inline bool io_send_finish(struct io_kiocb *req, struct io_async_msghdr *kmsg, - unsigned issue_flags) + struct io_br_sel *sel) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - bool bundle_finished = *ret <= 0; + bool bundle_finished = sel->val <= 0; unsigned int cflags; if (!(sr->flags & IORING_RECVSEND_BUNDLE)) { - cflags = io_put_kbuf(req, *ret, issue_flags); + cflags = io_put_kbuf(req, sel->val, sel->buf_list); goto finish; } - cflags = io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, *ret), issue_flags); + cflags = io_put_kbufs(req, sel->val, sel->buf_list, io_bundle_nbufs(kmsg, sel->val)); - if (bundle_finished || req->flags & REQ_F_BL_EMPTY) + /* + * Don't start new bundles if the buffer list is empty, or if the + * current operation needed to go through polling to complete. + */ + if (bundle_finished || req->flags & (REQ_F_BL_EMPTY | REQ_F_POLLED)) goto finish; /* * Fill CQE for this receive and see if we should keep trying to * receive from this socket. */ - if (io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) { + if (io_req_post_cqe(req, sel->val, cflags | IORING_CQE_F_MORE)) { io_mshot_prep_retry(req, kmsg); return false; } /* Otherwise stop bundle and use the current result. */ finish: - io_req_set_res(req, *ret, cflags); - *ret = IOU_OK; + io_req_set_res(req, sel->val, cflags); + sel->val = IOU_COMPLETE; return true; } @@ -551,7 +581,6 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) kmsg->msg.msg_controllen = 0; kmsg->msg.msg_control = NULL; sr->done_io += ret; - req->flags |= REQ_F_BL_NO_RECYCLE; return -EAGAIN; } if (ret == -ERESTARTSYS) @@ -564,24 +593,24 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) else if (sr->done_io) ret = sr->done_io; io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags, - struct io_async_msghdr *kmsg) + struct io_br_sel *sel, struct io_async_msghdr *kmsg) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - - int ret; struct buf_sel_arg arg = { .iovs = &kmsg->fast_iov, .max_len = min_not_zero(sr->len, INT_MAX), .nr_iovs = 1, + .buf_group = sr->buf_group, }; + int ret; - if (kmsg->free_iov) { - arg.nr_iovs = kmsg->free_iov_nr; - arg.iovs = kmsg->free_iov; + if (kmsg->vec.iovec) { + arg.nr_iovs = kmsg->vec.nr; + arg.iovs = kmsg->vec.iovec; arg.mode = KBUF_MODE_FREE; } @@ -590,13 +619,13 @@ static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags, else arg.mode |= KBUF_MODE_EXPAND; - ret = io_buffers_select(req, &arg, issue_flags); + ret = io_buffers_select(req, &arg, sel, issue_flags); if (unlikely(ret < 0)) return ret; - if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) { - kmsg->free_iov_nr = ret; - kmsg->free_iov = arg.iovs; + if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) { + kmsg->vec.nr = ret; + kmsg->vec.iovec = arg.iovs; req->flags |= REQ_F_NEED_CLEANUP; } sr->len = arg.out_len; @@ -619,6 +648,7 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_async_msghdr *kmsg = req->async_data; + struct io_br_sel sel = { }; struct socket *sock; unsigned flags; int min_ret = 0; @@ -637,8 +667,9 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) flags |= MSG_DONTWAIT; retry_bundle: + sel.buf_list = NULL; if (io_do_buffer_select(req)) { - ret = io_send_select_buffer(req, issue_flags, kmsg); + ret = io_send_select_buffer(req, issue_flags, &sel, kmsg); if (ret) return ret; } @@ -662,8 +693,7 @@ retry_bundle: sr->len -= ret; sr->buf += ret; sr->done_io += ret; - req->flags |= REQ_F_BL_NO_RECYCLE; - return -EAGAIN; + return io_net_kbuf_recyle(req, sel.buf_list, kmsg, ret); } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -674,11 +704,12 @@ retry_bundle: else if (sr->done_io) ret = sr->done_io; - if (!io_send_finish(req, &ret, kmsg, issue_flags)) + sel.val = ret; + if (!io_send_finish(req, kmsg, &sel)) goto retry_bundle; io_req_msg_cleanup(req, issue_flags); - return ret; + return sel.val; } static int io_recvmsg_mshot_prep(struct io_kiocb *req, @@ -711,34 +742,16 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req, struct user_msghdr msg; int ret; - iomsg->msg.msg_name = &iomsg->addr; - iomsg->msg.msg_iter.nr_segs = 0; - -#ifdef CONFIG_COMPAT - if (unlikely(req->ctx->compat)) { - struct compat_msghdr cmsg; - - ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_DEST); - if (unlikely(ret)) - return ret; + ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST, &iomsg->uaddr); + if (unlikely(ret)) + return ret; - ret = __get_compat_msghdr(&iomsg->msg, &cmsg, &iomsg->uaddr); + if (!(req->flags & REQ_F_BUFFER_SELECT)) { + ret = io_net_import_vec(req, iomsg, msg.msg_iov, msg.msg_iovlen, + ITER_DEST); if (unlikely(ret)) return ret; - - return io_recvmsg_mshot_prep(req, iomsg, cmsg.msg_namelen, - cmsg.msg_controllen); } -#endif - - ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST); - if (unlikely(ret)) - return ret; - - ret = __copy_msghdr(&iomsg->msg, &msg, &iomsg->uaddr); - if (unlikely(ret)) - return ret; - return io_recvmsg_mshot_prep(req, iomsg, msg.msg_namelen, msg.msg_controllen); } @@ -747,7 +760,6 @@ static int io_recvmsg_prep_setup(struct io_kiocb *req) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_async_msghdr *kmsg; - int ret; kmsg = io_msg_alloc_async(req); if (unlikely(!kmsg)) @@ -763,19 +775,13 @@ static int io_recvmsg_prep_setup(struct io_kiocb *req) kmsg->msg.msg_iocb = NULL; kmsg->msg.msg_ubuf = NULL; - if (!io_do_buffer_select(req)) { - ret = import_ubuf(ITER_DEST, sr->buf, sr->len, - &kmsg->msg.msg_iter); - if (unlikely(ret)) - return ret; - } - return 0; + if (req->flags & REQ_F_BUFFER_SELECT) + return 0; + return import_ubuf(ITER_DEST, sr->buf, sr->len, + &kmsg->msg.msg_iter); } - ret = io_recvmsg_copy_hdr(req, kmsg); - if (!ret) - req->flags |= REQ_F_NEED_CLEANUP; - return ret; + return io_recvmsg_copy_hdr(req, kmsg); } #define RECVMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT | \ @@ -787,11 +793,13 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sr->done_io = 0; - if (unlikely(sqe->file_index || sqe->addr2)) + if (unlikely(sqe->addr2)) return -EINVAL; sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); + if (unlikely(sr->len < 0)) + return -EINVAL; sr->flags = READ_ONCE(sqe->ioprio); if (sr->flags & ~RECVMSG_FLAGS) return -EINVAL; @@ -800,49 +808,52 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->flags |= REQ_F_NOWAIT; if (sr->msg_flags & MSG_ERRQUEUE) req->flags |= REQ_F_CLEAR_POLLIN; - if (req->flags & REQ_F_BUFFER_SELECT) { - /* - * Store the buffer group for this multishot receive separately, - * as if we end up doing an io-wq based issue that selects a - * buffer, it has to be committed immediately and that will - * clear ->buf_list. This means we lose the link to the buffer - * list, and the eventual buffer put on completion then cannot - * restore it. - */ + if (req->flags & REQ_F_BUFFER_SELECT) sr->buf_group = req->buf_index; - req->buf_list = NULL; - } + sr->mshot_total_len = sr->mshot_len = 0; if (sr->flags & IORING_RECV_MULTISHOT) { if (!(req->flags & REQ_F_BUFFER_SELECT)) return -EINVAL; if (sr->msg_flags & MSG_WAITALL) return -EINVAL; - if (req->opcode == IORING_OP_RECV && sr->len) + if (req->opcode == IORING_OP_RECV) { + sr->mshot_len = sr->len; + sr->mshot_total_len = READ_ONCE(sqe->optlen); + if (sr->mshot_total_len) + sr->flags |= IORING_RECV_MSHOT_LIM; + } else if (sqe->optlen) { return -EINVAL; + } req->flags |= REQ_F_APOLL_MULTISHOT; + } else if (sqe->optlen) { + return -EINVAL; } + if (sr->flags & IORING_RECVSEND_BUNDLE) { if (req->opcode == IORING_OP_RECVMSG) return -EINVAL; } -#ifdef CONFIG_COMPAT - if (req->ctx->compat) + if (io_is_compat(req->ctx)) sr->msg_flags |= MSG_CMSG_COMPAT; -#endif + sr->nr_multishot_loops = 0; return io_recvmsg_prep_setup(req); } +/* bits to clear in old and inherit in new cflags on bundle retry */ +#define CQE_F_MASK (IORING_CQE_F_SOCK_NONEMPTY|IORING_CQE_F_MORE) + /* * Finishes io_recv and io_recvmsg. * * Returns true if it is actually finished, or false if it should run * again (for multishot). */ -static inline bool io_recv_finish(struct io_kiocb *req, int *ret, +static inline bool io_recv_finish(struct io_kiocb *req, struct io_async_msghdr *kmsg, - bool mshot_finished, unsigned issue_flags) + struct io_br_sel *sel, bool mshot_finished, + unsigned issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); unsigned int cflags = 0; @@ -850,14 +861,45 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, if (kmsg->msg.msg_inq > 0) cflags |= IORING_CQE_F_SOCK_NONEMPTY; + if (sel->val > 0 && sr->flags & IORING_RECV_MSHOT_LIM) { + /* + * If sr->len hits zero, the limit has been reached. Mark + * mshot as finished, and flag MSHOT_DONE as well to prevent + * a potential bundle from being retried. + */ + sr->mshot_total_len -= min_t(int, sel->val, sr->mshot_total_len); + if (!sr->mshot_total_len) { + sr->flags |= IORING_RECV_MSHOT_DONE; + mshot_finished = true; + } + } + if (sr->flags & IORING_RECVSEND_BUNDLE) { - cflags |= io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, *ret), - issue_flags); + size_t this_ret = sel->val - sr->done_io; + + cflags |= io_put_kbufs(req, this_ret, sel->buf_list, io_bundle_nbufs(kmsg, this_ret)); + if (sr->flags & IORING_RECV_RETRY) + cflags = req->cqe.flags | (cflags & CQE_F_MASK); + if (sr->mshot_len && sel->val >= sr->mshot_len) + sr->flags |= IORING_RECV_MSHOT_CAP; /* bundle with no more immediate buffers, we're done */ if (req->flags & REQ_F_BL_EMPTY) goto finish; + /* + * If more is available AND it was a full transfer, retry and + * append to this one + */ + if (!(sr->flags & IORING_RECV_NO_RETRY) && + kmsg->msg.msg_inq > 1 && this_ret > 0 && + !iov_iter_count(&kmsg->msg.msg_iter)) { + req->cqe.flags = cflags & ~CQE_F_MASK; + sr->len = kmsg->msg.msg_inq; + sr->done_io += this_ret; + sr->flags |= IORING_RECV_RETRY; + return false; + } } else { - cflags |= io_put_kbuf(req, *ret, issue_flags); + cflags |= io_put_kbuf(req, sel->val, sel->buf_list); } /* @@ -865,33 +907,28 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, * receive from this socket. */ if ((req->flags & REQ_F_APOLL_MULTISHOT) && !mshot_finished && - io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) { - int mshot_retry_ret = IOU_ISSUE_SKIP_COMPLETE; - + io_req_post_cqe(req, sel->val, cflags | IORING_CQE_F_MORE)) { + sel->val = IOU_RETRY; io_mshot_prep_retry(req, kmsg); /* Known not-empty or unknown state, retry */ if (cflags & IORING_CQE_F_SOCK_NONEMPTY || kmsg->msg.msg_inq < 0) { - if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY) + if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY && + !(sr->flags & IORING_RECV_MSHOT_CAP)) { return false; + } /* mshot retries exceeded, force a requeue */ sr->nr_multishot_loops = 0; - mshot_retry_ret = IOU_REQUEUE; + sr->flags &= ~IORING_RECV_MSHOT_CAP; + if (issue_flags & IO_URING_F_MULTISHOT) + sel->val = IOU_REQUEUE; } - if (issue_flags & IO_URING_F_MULTISHOT) - *ret = mshot_retry_ret; - else - *ret = -EAGAIN; return true; } /* Finish the request / stop multishot. */ finish: - io_req_set_res(req, *ret, cflags); - - if (issue_flags & IO_URING_F_MULTISHOT) - *ret = IOU_STOP_MULTISHOT; - else - *ret = IOU_OK; + io_req_set_res(req, sel->val, cflags); + sel->val = IOU_COMPLETE; io_req_msg_cleanup(req, issue_flags); return true; } @@ -984,6 +1021,7 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_async_msghdr *kmsg = req->async_data; + struct io_br_sel sel = { }; struct socket *sock; unsigned flags; int ret, min_ret = 0; @@ -1003,23 +1041,23 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) flags |= MSG_DONTWAIT; retry_multishot: + sel.buf_list = NULL; if (io_do_buffer_select(req)) { - void __user *buf; size_t len = sr->len; - buf = io_buffer_select(req, &len, issue_flags); - if (!buf) + sel = io_buffer_select(req, &len, sr->buf_group, issue_flags); + if (!sel.addr) return -ENOBUFS; if (req->flags & REQ_F_APOLL_MULTISHOT) { - ret = io_recvmsg_prep_multishot(kmsg, sr, &buf, &len); + ret = io_recvmsg_prep_multishot(kmsg, sr, &sel.addr, &len); if (ret) { - io_kbuf_recycle(req, issue_flags); + io_kbuf_recycle(req, sel.buf_list, issue_flags); return ret; } } - iov_iter_ubuf(&kmsg->msg.msg_iter, ITER_DEST, buf, len); + iov_iter_ubuf(&kmsg->msg.msg_iter, ITER_DEST, sel.addr, len); } kmsg->msg.msg_get_inq = 1; @@ -1038,16 +1076,12 @@ retry_multishot: if (ret < min_ret) { if (ret == -EAGAIN && force_nonblock) { - if (issue_flags & IO_URING_F_MULTISHOT) { - io_kbuf_recycle(req, issue_flags); - return IOU_ISSUE_SKIP_COMPLETE; - } - return -EAGAIN; + io_kbuf_recycle(req, sel.buf_list, issue_flags); + return IOU_RETRY; } if (ret > 0 && io_net_retry(sock, flags)) { sr->done_io += ret; - req->flags |= REQ_F_BL_NO_RECYCLE; - return -EAGAIN; + return io_net_kbuf_recyle(req, sel.buf_list, kmsg, ret); } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -1061,16 +1095,17 @@ retry_multishot: else if (sr->done_io) ret = sr->done_io; else - io_kbuf_recycle(req, issue_flags); + io_kbuf_recycle(req, sel.buf_list, issue_flags); - if (!io_recv_finish(req, &ret, kmsg, mshot_finished, issue_flags)) + sel.val = ret; + if (!io_recv_finish(req, kmsg, &sel, mshot_finished, issue_flags)) goto retry_multishot; - return ret; + return sel.val; } static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg, - size_t *len, unsigned int issue_flags) + struct io_br_sel *sel, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); int ret; @@ -1086,21 +1121,35 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg .iovs = &kmsg->fast_iov, .nr_iovs = 1, .mode = KBUF_MODE_EXPAND, + .buf_group = sr->buf_group, }; - if (kmsg->free_iov) { - arg.nr_iovs = kmsg->free_iov_nr; - arg.iovs = kmsg->free_iov; + if (kmsg->vec.iovec) { + arg.nr_iovs = kmsg->vec.nr; + arg.iovs = kmsg->vec.iovec; arg.mode |= KBUF_MODE_FREE; } - if (kmsg->msg.msg_inq > 0) - arg.max_len = min_not_zero(sr->len, kmsg->msg.msg_inq); + if (sel->val) + arg.max_len = sel->val; + else if (kmsg->msg.msg_inq > 1) + arg.max_len = min_not_zero(sel->val, (ssize_t) kmsg->msg.msg_inq); - ret = io_buffers_peek(req, &arg); + /* if mshot limited, ensure we don't go over */ + if (sr->flags & IORING_RECV_MSHOT_LIM) + arg.max_len = min_not_zero(arg.max_len, sr->mshot_total_len); + ret = io_buffers_peek(req, &arg, sel); if (unlikely(ret < 0)) return ret; + if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) { + kmsg->vec.nr = ret; + kmsg->vec.iovec = arg.iovs; + req->flags |= REQ_F_NEED_CLEANUP; + } + if (arg.partial_map) + sr->flags |= IORING_RECV_PARTIAL_MAP; + /* special case 1 vec, can be a fast path */ if (ret == 1) { sr->buf = arg.iovs[0].iov_base; @@ -1109,20 +1158,14 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg } iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, arg.iovs, ret, arg.out_len); - if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) { - kmsg->free_iov_nr = ret; - kmsg->free_iov = arg.iovs; - req->flags |= REQ_F_NEED_CLEANUP; - } } else { - void __user *buf; + size_t len = sel->val; - *len = sr->len; - buf = io_buffer_select(req, len, issue_flags); - if (!buf) + *sel = io_buffer_select(req, &len, sr->buf_group, issue_flags); + if (!sel->addr) return -ENOBUFS; - sr->buf = buf; - sr->len = *len; + sr->buf = sel->addr; + sr->len = len; map_ubuf: ret = import_ubuf(ITER_DEST, sr->buf, sr->len, &kmsg->msg.msg_iter); @@ -1137,11 +1180,11 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_async_msghdr *kmsg = req->async_data; + struct io_br_sel sel; struct socket *sock; unsigned flags; int ret, min_ret = 0; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - size_t len = sr->len; bool mshot_finished; if (!(req->flags & REQ_F_POLLED) && @@ -1157,9 +1200,11 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) flags |= MSG_DONTWAIT; retry_multishot: + sel.buf_list = NULL; if (io_do_buffer_select(req)) { - ret = io_recv_buf_select(req, kmsg, &len, issue_flags); - if (unlikely(ret)) { + sel.val = sr->len; + ret = io_recv_buf_select(req, kmsg, &sel, issue_flags); + if (unlikely(ret < 0)) { kmsg->msg.msg_inq = -1; goto out_free; } @@ -1175,19 +1220,14 @@ retry_multishot: ret = sock_recvmsg(sock, &kmsg->msg, flags); if (ret < min_ret) { if (ret == -EAGAIN && force_nonblock) { - if (issue_flags & IO_URING_F_MULTISHOT) { - io_kbuf_recycle(req, issue_flags); - return IOU_ISSUE_SKIP_COMPLETE; - } - - return -EAGAIN; + io_kbuf_recycle(req, sel.buf_list, issue_flags); + return IOU_RETRY; } if (ret > 0 && io_net_retry(sock, flags)) { sr->len -= ret; sr->buf += ret; sr->done_io += ret; - req->flags |= REQ_F_BL_NO_RECYCLE; - return -EAGAIN; + return io_net_kbuf_recyle(req, sel.buf_list, kmsg, ret); } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -1203,12 +1243,76 @@ out_free: else if (sr->done_io) ret = sr->done_io; else - io_kbuf_recycle(req, issue_flags); + io_kbuf_recycle(req, sel.buf_list, issue_flags); - if (!io_recv_finish(req, &ret, kmsg, mshot_finished, issue_flags)) + sel.val = ret; + if (!io_recv_finish(req, kmsg, &sel, mshot_finished, issue_flags)) goto retry_multishot; - return ret; + return sel.val; +} + +int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc); + unsigned ifq_idx; + + if (unlikely(sqe->addr2 || sqe->addr || sqe->addr3)) + return -EINVAL; + + ifq_idx = READ_ONCE(sqe->zcrx_ifq_idx); + zc->ifq = xa_load(&req->ctx->zcrx_ctxs, ifq_idx); + if (!zc->ifq) + return -EINVAL; + + zc->len = READ_ONCE(sqe->len); + zc->flags = READ_ONCE(sqe->ioprio); + if (READ_ONCE(sqe->msg_flags)) + return -EINVAL; + if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT)) + return -EINVAL; + /* multishot required */ + if (!(zc->flags & IORING_RECV_MULTISHOT)) + return -EINVAL; + /* All data completions are posted as aux CQEs. */ + req->flags |= REQ_F_APOLL_MULTISHOT; + + return 0; +} + +int io_recvzc(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc); + struct socket *sock; + unsigned int len; + int ret; + + if (!(req->flags & REQ_F_POLLED) && + (zc->flags & IORING_RECVSEND_POLL_FIRST)) + return -EAGAIN; + + sock = sock_from_file(req->file); + if (unlikely(!sock)) + return -ENOTSOCK; + + len = zc->len; + ret = io_zcrx_recv(req, zc->ifq, sock, 0, issue_flags, &zc->len); + if (len && zc->len == 0) { + io_req_set_res(req, 0, 0); + + return IOU_COMPLETE; + } + if (unlikely(ret <= 0) && ret != -EAGAIN) { + if (ret == -ERESTARTSYS) + ret = -EINTR; + if (ret == IOU_REQUEUE) + return IOU_REQUEUE; + + req_set_fail(req); + io_req_set_res(req, ret, 0); + return IOU_COMPLETE; + } + return IOU_RETRY; } void io_send_zc_cleanup(struct io_kiocb *req) @@ -1225,18 +1329,21 @@ void io_send_zc_cleanup(struct io_kiocb *req) } #define IO_ZC_FLAGS_COMMON (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_FIXED_BUF) -#define IO_ZC_FLAGS_VALID (IO_ZC_FLAGS_COMMON | IORING_SEND_ZC_REPORT_USAGE) +#define IO_ZC_FLAGS_VALID (IO_ZC_FLAGS_COMMON | IORING_SEND_ZC_REPORT_USAGE | \ + IORING_SEND_VECTORIZED) int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_ring_ctx *ctx = req->ctx; + struct io_async_msghdr *iomsg; struct io_kiocb *notif; + u64 user_data; + int ret; zc->done_io = 0; - req->flags |= REQ_F_POLL_NO_LAZY; - if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3))) + if (unlikely(READ_ONCE(sqe->__pad2[0]))) return -EINVAL; /* we don't support IOSQE_CQE_SKIP_SUCCESS just yet */ if (req->flags & REQ_F_CQE_SKIP) @@ -1245,10 +1352,14 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) notif = zc->notif = io_alloc_notif(ctx); if (!notif) return -ENOMEM; - notif->cqe.user_data = req->cqe.user_data; + user_data = READ_ONCE(sqe->addr3); + if (!user_data) + user_data = req->cqe.user_data; + + notif->cqe.user_data = user_data; notif->cqe.res = 0; notif->cqe.flags = IORING_CQE_F_NOTIF; - req->flags |= REQ_F_NEED_CLEANUP; + req->flags |= REQ_F_NEED_CLEANUP | REQ_F_POLL_NO_LAZY; zc->flags = READ_ONCE(sqe->ioprio); if (unlikely(zc->flags & ~IO_ZC_FLAGS_COMMON)) { @@ -1263,28 +1374,35 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) } } - if (req->opcode != IORING_OP_SEND_ZC) { - if (unlikely(sqe->addr2 || sqe->file_index)) - return -EINVAL; - if (unlikely(zc->flags & IORING_RECVSEND_FIXED_BUF)) - return -EINVAL; - } - zc->len = READ_ONCE(sqe->len); zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL | MSG_ZEROCOPY; - zc->buf_index = READ_ONCE(sqe->buf_index); + req->buf_index = READ_ONCE(sqe->buf_index); if (zc->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; -#ifdef CONFIG_COMPAT - if (req->ctx->compat) + if (io_is_compat(ctx)) zc->msg_flags |= MSG_CMSG_COMPAT; -#endif - if (unlikely(!io_msg_alloc_async(req))) + + iomsg = io_msg_alloc_async(req); + if (unlikely(!iomsg)) return -ENOMEM; - if (req->opcode != IORING_OP_SENDMSG_ZC) - return io_send_setup(req, sqe); - return io_sendmsg_setup(req, sqe); + + if (req->opcode == IORING_OP_SEND_ZC) { + ret = io_send_setup(req, sqe); + } else { + if (unlikely(sqe->addr2 || sqe->file_index)) + return -EINVAL; + ret = io_sendmsg_setup(req, sqe); + } + if (unlikely(ret)) + return ret; + + if (!(zc->flags & IORING_RECVSEND_FIXED_BUF)) { + iomsg->msg.sg_from_iter = io_sg_from_iter_iovec; + return io_notif_account_mem(zc->notif, iomsg->msg.msg_iter.count); + } + iomsg->msg.sg_from_iter = io_sg_from_iter; + return 0; } static int io_sg_from_iter_iovec(struct sk_buff *skb, @@ -1337,50 +1455,39 @@ static int io_sg_from_iter(struct sk_buff *skb, return ret; } -static int io_send_zc_import(struct io_kiocb *req, unsigned int issue_flags) +static int io_send_zc_import(struct io_kiocb *req, + struct io_async_msghdr *kmsg, + unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct io_async_msghdr *kmsg = req->async_data; + struct io_kiocb *notif = sr->notif; int ret; - if (sr->flags & IORING_RECVSEND_FIXED_BUF) { - struct io_ring_ctx *ctx = req->ctx; - struct io_rsrc_node *node; - - ret = -EFAULT; - io_ring_submit_lock(ctx, issue_flags); - node = io_rsrc_node_lookup(&ctx->buf_table, sr->buf_index); - if (node) { - io_req_assign_buf_node(sr->notif, node); - ret = 0; - } - io_ring_submit_unlock(ctx, issue_flags); + WARN_ON_ONCE(!(sr->flags & IORING_RECVSEND_FIXED_BUF)); - if (unlikely(ret)) - return ret; + notif->buf_index = req->buf_index; - ret = io_import_fixed(ITER_SOURCE, &kmsg->msg.msg_iter, - node->buf, (u64)(uintptr_t)sr->buf, - sr->len); - if (unlikely(ret)) - return ret; - kmsg->msg.sg_from_iter = io_sg_from_iter; + if (!(sr->flags & IORING_SEND_VECTORIZED)) { + ret = io_import_reg_buf(notif, &kmsg->msg.msg_iter, + (u64)(uintptr_t)sr->buf, sr->len, + ITER_SOURCE, issue_flags); } else { - ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, &kmsg->msg.msg_iter); - if (unlikely(ret)) - return ret; - ret = io_notif_account_mem(sr->notif, sr->len); - if (unlikely(ret)) - return ret; - kmsg->msg.sg_from_iter = io_sg_from_iter_iovec; + unsigned uvec_segs = kmsg->msg.msg_iter.nr_segs; + + ret = io_import_reg_vec(ITER_SOURCE, &kmsg->msg.msg_iter, + notif, &kmsg->vec, uvec_segs, + issue_flags); } - return ret; + if (unlikely(ret)) + return ret; + req->flags &= ~REQ_F_IMPORT_BUFFER; + return 0; } -int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) +int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) { - struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_async_msghdr *kmsg = req->async_data; struct socket *sock; unsigned msg_flags; @@ -1391,97 +1498,39 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) return -ENOTSOCK; if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags)) return -EOPNOTSUPP; - if (!(req->flags & REQ_F_POLLED) && - (zc->flags & IORING_RECVSEND_POLL_FIRST)) + (sr->flags & IORING_RECVSEND_POLL_FIRST)) return -EAGAIN; - if (!zc->done_io) { - ret = io_send_zc_import(req, issue_flags); + if (req->flags & REQ_F_IMPORT_BUFFER) { + ret = io_send_zc_import(req, kmsg, issue_flags); if (unlikely(ret)) return ret; } - msg_flags = zc->msg_flags; + msg_flags = sr->msg_flags; if (issue_flags & IO_URING_F_NONBLOCK) msg_flags |= MSG_DONTWAIT; if (msg_flags & MSG_WAITALL) min_ret = iov_iter_count(&kmsg->msg.msg_iter); - msg_flags &= ~MSG_INTERNAL_SENDMSG_FLAGS; - - kmsg->msg.msg_flags = msg_flags; - kmsg->msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg; - ret = sock_sendmsg(sock, &kmsg->msg); - - if (unlikely(ret < min_ret)) { - if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) - return -EAGAIN; - - if (ret > 0 && io_net_retry(sock, kmsg->msg.msg_flags)) { - zc->len -= ret; - zc->buf += ret; - zc->done_io += ret; - req->flags |= REQ_F_BL_NO_RECYCLE; - return -EAGAIN; - } - if (ret == -ERESTARTSYS) - ret = -EINTR; - req_set_fail(req); - } - if (ret >= 0) - ret += zc->done_io; - else if (zc->done_io) - ret = zc->done_io; + kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg; - /* - * If we're in io-wq we can't rely on tw ordering guarantees, defer - * flushing notif to io_send_zc_cleanup() - */ - if (!(issue_flags & IO_URING_F_UNLOCKED)) { - io_notif_flush(zc->notif); - io_req_msg_cleanup(req, 0); + if (req->opcode == IORING_OP_SEND_ZC) { + msg_flags &= ~MSG_INTERNAL_SENDMSG_FLAGS; + kmsg->msg.msg_flags = msg_flags; + ret = sock_sendmsg(sock, &kmsg->msg); + } else { + kmsg->msg.msg_control_user = sr->msg_control; + ret = __sys_sendmsg_sock(sock, &kmsg->msg, msg_flags); } - io_req_set_res(req, ret, IORING_CQE_F_MORE); - return IOU_OK; -} - -int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct io_async_msghdr *kmsg = req->async_data; - struct socket *sock; - unsigned flags; - int ret, min_ret = 0; - - sock = sock_from_file(req->file); - if (unlikely(!sock)) - return -ENOTSOCK; - if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags)) - return -EOPNOTSUPP; - - if (!(req->flags & REQ_F_POLLED) && - (sr->flags & IORING_RECVSEND_POLL_FIRST)) - return -EAGAIN; - - flags = sr->msg_flags; - if (issue_flags & IO_URING_F_NONBLOCK) - flags |= MSG_DONTWAIT; - if (flags & MSG_WAITALL) - min_ret = iov_iter_count(&kmsg->msg.msg_iter); - - kmsg->msg.msg_control_user = sr->msg_control; - kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg; - kmsg->msg.sg_from_iter = io_sg_from_iter_iovec; - ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); if (unlikely(ret < min_ret)) { if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) return -EAGAIN; - if (ret > 0 && io_net_retry(sock, flags)) { + if (ret > 0 && io_net_retry(sock, sr->msg_flags)) { sr->done_io += ret; - req->flags |= REQ_F_BL_NO_RECYCLE; return -EAGAIN; } if (ret == -ERESTARTSYS) @@ -1500,10 +1549,11 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) */ if (!(issue_flags & IO_URING_F_UNLOCKED)) { io_notif_flush(sr->notif); + sr->notif = NULL; io_req_msg_cleanup(req, 0); } io_req_set_res(req, ret, IORING_CQE_F_MORE); - return IOU_OK; + return IOU_COMPLETE; } void io_sendrecv_fail(struct io_kiocb *req) @@ -1586,19 +1636,11 @@ retry: put_unused_fd(fd); ret = PTR_ERR(file); if (ret == -EAGAIN && force_nonblock && - !(accept->iou_flags & IORING_ACCEPT_DONTWAIT)) { - /* - * if it's multishot and polled, we don't need to - * return EAGAIN to arm the poll infra since it - * has already been done - */ - if (issue_flags & IO_URING_F_MULTISHOT) - return IOU_ISSUE_SKIP_COMPLETE; - return ret; - } + !(accept->iou_flags & IORING_ACCEPT_DONTWAIT)) + return IOU_RETRY; + if (ret == -ERESTARTSYS) ret = -EINTR; - req_set_fail(req); } else if (!fixed) { fd_install(fd, file); ret = fd; @@ -1611,23 +1653,26 @@ retry: if (!arg.is_empty) cflags |= IORING_CQE_F_SOCK_NONEMPTY; - if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { - io_req_set_res(req, ret, cflags); - return IOU_OK; - } - - if (ret < 0) - return ret; - if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) { + if (ret >= 0 && (req->flags & REQ_F_APOLL_MULTISHOT) && + io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) { if (cflags & IORING_CQE_F_SOCK_NONEMPTY || arg.is_empty == -1) goto retry; - if (issue_flags & IO_URING_F_MULTISHOT) - return IOU_ISSUE_SKIP_COMPLETE; - return -EAGAIN; + return IOU_RETRY; } io_req_set_res(req, ret, cflags); - return IOU_STOP_MULTISHOT; + if (ret < 0) + req_set_fail(req); + return IOU_COMPLETE; +} + +void io_socket_bpf_populate(struct io_uring_bpf_ctx *bctx, struct io_kiocb *req) +{ + struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket); + + bctx->socket.family = sock->domain; + bctx->socket.type = sock->type; + bctx->socket.protocol = sock->protocol; } int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -1681,7 +1726,7 @@ int io_socket(struct io_kiocb *req, unsigned int issue_flags) sock->file_slot); } io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -1711,9 +1756,11 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags) int ret; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - if (unlikely(req->flags & REQ_F_FAIL)) { - ret = -ECONNRESET; - goto out; + if (connect->in_progress) { + struct poll_table_struct pt = { ._key = EPOLLERR }; + + if (vfs_poll(req->file, &pt) & EPOLLERR) + goto get_sock_err; } file_flags = force_nonblock ? O_NONBLOCK : 0; @@ -1738,8 +1785,10 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags) * which means the previous result is good. For both of these, * grab the sock_error() and use that for the completion. */ - if (ret == -EBADFD || ret == -EISCONN) + if (ret == -EBADFD || ret == -EISCONN) { +get_sock_err: ret = sock_error(sock_from_file(req->file)->sk); + } } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -1748,7 +1797,24 @@ out: req_set_fail(req); io_req_msg_cleanup(req, issue_flags); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; +} + +/* + * Check if bind request would potentially end up with filename_create(), + * which in turn end up in mnt_want_write() which will grab the fs + * percpu start write sem. This can trigger a lockdep warning. + */ +static int io_bind_file_create(const struct io_async_msghdr *io, int addr_len) +{ + const struct sockaddr_un *sun; + + if (io->addr.ss_family != AF_UNIX) + return 0; + if (addr_len <= offsetof(struct sockaddr_un, sun_path)) + return 0; + sun = (const struct sockaddr_un *) &io->addr; + return sun->sun_path[0] != '\0'; } int io_bind_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -1756,6 +1822,7 @@ int io_bind_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) struct io_bind *bind = io_kiocb_to_cmd(req, struct io_bind); struct sockaddr __user *uaddr; struct io_async_msghdr *io; + int ret; if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; @@ -1766,7 +1833,12 @@ int io_bind_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) io = io_msg_alloc_async(req); if (unlikely(!io)) return -ENOMEM; - return move_addr_to_kernel(uaddr, bind->addr_len, &io->addr); + ret = move_addr_to_kernel(uaddr, bind->addr_len, &io->addr); + if (unlikely(ret)) + return ret; + if (io_bind_file_create(io, bind->addr_len)) + req->flags |= REQ_F_FORCE_ASYNC; + return 0; } int io_bind(struct io_kiocb *req, unsigned int issue_flags) @@ -1819,8 +1891,6 @@ void io_netmsg_cache_free(const void *entry) { struct io_async_msghdr *kmsg = (struct io_async_msghdr *) entry; - if (kmsg->free_iov) - io_netmsg_iovec_free(kmsg); + io_vec_free(&kmsg->vec); kfree(kmsg); } -#endif diff --git a/io_uring/net.h b/io_uring/net.h index b804c2b36e60..d4d1ddce50e3 100644 --- a/io_uring/net.h +++ b/io_uring/net.h @@ -2,12 +2,13 @@ #include <linux/net.h> #include <linux/uio.h> +#include <linux/io_uring_types.h> +#include <uapi/linux/io_uring/bpf_filter.h> struct io_async_msghdr { #if defined(CONFIG_NET) - struct iovec *free_iov; - /* points to an allocated iov, if NULL we use fast_iov instead */ - int free_iov_nr; + struct iou_vec vec; + struct_group(clear, int namelen; struct iovec fast_iov; @@ -44,11 +45,11 @@ int io_accept(struct io_kiocb *req, unsigned int issue_flags); int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_socket(struct io_kiocb *req, unsigned int issue_flags); +void io_socket_bpf_populate(struct io_uring_bpf_ctx *bctx, struct io_kiocb *req); int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_connect(struct io_kiocb *req, unsigned int issue_flags); -int io_send_zc(struct io_kiocb *req, unsigned int issue_flags); int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags); int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); void io_send_zc_cleanup(struct io_kiocb *req); @@ -64,4 +65,8 @@ void io_netmsg_cache_free(const void *entry); static inline void io_netmsg_cache_free(const void *entry) { } +static inline void io_socket_bpf_populate(struct io_uring_bpf_ctx *bctx, + struct io_kiocb *req) +{ +} #endif diff --git a/io_uring/nop.c b/io_uring/nop.c index 5e5196df650a..f5c9969e7f64 100644 --- a/io_uring/nop.c +++ b/io_uring/nop.c @@ -16,12 +16,14 @@ struct io_nop { struct file *file; int result; int fd; - int buffer; unsigned int flags; + __u64 extra1; + __u64 extra2; }; #define NOP_FLAGS (IORING_NOP_INJECT_RESULT | IORING_NOP_FIXED_FILE | \ - IORING_NOP_FIXED_BUFFER | IORING_NOP_FILE) + IORING_NOP_FIXED_BUFFER | IORING_NOP_FILE | \ + IORING_NOP_TW | IORING_NOP_CQE32) int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -40,9 +42,15 @@ int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) else nop->fd = -1; if (nop->flags & IORING_NOP_FIXED_BUFFER) - nop->buffer = READ_ONCE(sqe->buf_index); - else - nop->buffer = -1; + req->buf_index = READ_ONCE(sqe->buf_index); + if (nop->flags & IORING_NOP_CQE32) { + struct io_ring_ctx *ctx = req->ctx; + + if (!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED))) + return -EINVAL; + nop->extra1 = READ_ONCE(sqe->off); + nop->extra2 = READ_ONCE(sqe->addr); + } return 0; } @@ -64,21 +72,20 @@ int io_nop(struct io_kiocb *req, unsigned int issue_flags) } } if (nop->flags & IORING_NOP_FIXED_BUFFER) { - struct io_ring_ctx *ctx = req->ctx; - struct io_rsrc_node *node; - - ret = -EFAULT; - io_ring_submit_lock(ctx, issue_flags); - node = io_rsrc_node_lookup(&ctx->buf_table, nop->buffer); - if (node) { - io_req_assign_buf_node(req, node); - ret = 0; - } - io_ring_submit_unlock(ctx, issue_flags); + if (!io_find_buf_node(req, issue_flags)) + ret = -EFAULT; } done: if (ret < 0) req_set_fail(req); - io_req_set_res(req, nop->result, 0); - return IOU_OK; + if (nop->flags & IORING_NOP_CQE32) + io_req_set_res32(req, ret, 0, nop->extra1, nop->extra2); + else + io_req_set_res(req, ret, 0); + if (nop->flags & IORING_NOP_TW) { + req->io_task_work.func = io_req_task_complete; + io_req_task_work_add(req); + return IOU_ISSUE_SKIP_COMPLETE; + } + return IOU_COMPLETE; } diff --git a/io_uring/notif.c b/io_uring/notif.c index ee3a33510b3c..efce8ae12eaa 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/errno.h> #include <linux/file.h> @@ -11,13 +12,19 @@ static const struct ubuf_info_ops io_ubuf_ops; -static void io_notif_tw_complete(struct io_kiocb *notif, struct io_tw_state *ts) +static void io_notif_tw_complete(struct io_tw_req tw_req, io_tw_token_t tw) { + struct io_kiocb *notif = tw_req.req; struct io_notif_data *nd = io_notif_to_data(notif); + struct io_ring_ctx *ctx = notif->ctx; + + lockdep_assert_held(&ctx->uring_lock); do { notif = cmd_to_io_kiocb(nd); + if (WARN_ON_ONCE(ctx != notif->ctx)) + return; lockdep_assert(refcount_read(&nd->uarg.refcnt) == 0); if (unlikely(nd->zc_report) && (nd->zc_copied || !nd->zc_used)) @@ -29,7 +36,7 @@ static void io_notif_tw_complete(struct io_kiocb *notif, struct io_tw_state *ts) } nd = nd->next; - io_req_task_complete(notif, ts); + io_req_task_complete((struct io_tw_req){notif}, tw); } while (nd); } @@ -85,9 +92,9 @@ static int io_link_skb(struct sk_buff *skb, struct ubuf_info *uarg) return -EEXIST; prev_nd = container_of(prev_uarg, struct io_notif_data, uarg); - prev_notif = cmd_to_io_kiocb(nd); + prev_notif = cmd_to_io_kiocb(prev_nd); - /* make sure all noifications can be finished in the same task_work */ + /* make sure all notifications can be finished in the same task_work */ if (unlikely(notif->ctx != prev_notif->ctx || notif->tctx != prev_notif->tctx)) return -EEXIST; @@ -112,6 +119,7 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) if (unlikely(!io_alloc_req(ctx, ¬if))) return NULL; + notif->ctx = ctx; notif->opcode = IORING_OP_NOP; notif->flags = 0; notif->file = NULL; diff --git a/io_uring/opdef.c b/io_uring/opdef.c index e8baef4e5146..c3ef52b70811 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -37,6 +37,7 @@ #include "waitid.h" #include "futex.h" #include "truncate.h" +#include "zcrx.h" static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags) { @@ -66,7 +67,6 @@ const struct io_issue_def io_issue_defs[] = { .audit_skip = 1, .ioprio = 1, .iopoll = 1, - .iopoll_queue = 1, .vectored = 1, .async_size = sizeof(struct io_async_rw), .prep = io_prep_readv, @@ -81,7 +81,6 @@ const struct io_issue_def io_issue_defs[] = { .audit_skip = 1, .ioprio = 1, .iopoll = 1, - .iopoll_queue = 1, .vectored = 1, .async_size = sizeof(struct io_async_rw), .prep = io_prep_writev, @@ -101,10 +100,9 @@ const struct io_issue_def io_issue_defs[] = { .audit_skip = 1, .ioprio = 1, .iopoll = 1, - .iopoll_queue = 1, .async_size = sizeof(struct io_async_rw), .prep = io_prep_read_fixed, - .issue = io_read, + .issue = io_read_fixed, }, [IORING_OP_WRITE_FIXED] = { .needs_file = 1, @@ -115,10 +113,9 @@ const struct io_issue_def io_issue_defs[] = { .audit_skip = 1, .ioprio = 1, .iopoll = 1, - .iopoll_queue = 1, .async_size = sizeof(struct io_async_rw), .prep = io_prep_write_fixed, - .issue = io_write, + .issue = io_write_fixed, }, [IORING_OP_POLL_ADD] = { .needs_file = 1, @@ -215,12 +212,15 @@ const struct io_issue_def io_issue_defs[] = { }, [IORING_OP_FALLOCATE] = { .needs_file = 1, + .hash_reg_file = 1, .prep = io_fallocate_prep, .issue = io_fallocate, }, [IORING_OP_OPENAT] = { + .filter_pdu_size = sizeof_field(struct io_uring_bpf_ctx, open), .prep = io_openat_prep, .issue = io_openat, + .filter_populate = io_openat_bpf_populate, }, [IORING_OP_CLOSE] = { .prep = io_close_prep, @@ -246,7 +246,6 @@ const struct io_issue_def io_issue_defs[] = { .audit_skip = 1, .ioprio = 1, .iopoll = 1, - .iopoll_queue = 1, .async_size = sizeof(struct io_async_rw), .prep = io_prep_read, .issue = io_read, @@ -260,7 +259,6 @@ const struct io_issue_def io_issue_defs[] = { .audit_skip = 1, .ioprio = 1, .iopoll = 1, - .iopoll_queue = 1, .async_size = sizeof(struct io_async_rw), .prep = io_prep_write, .issue = io_write, @@ -307,8 +305,10 @@ const struct io_issue_def io_issue_defs[] = { #endif }, [IORING_OP_OPENAT2] = { + .filter_pdu_size = sizeof_field(struct io_uring_bpf_ctx, open), .prep = io_openat2_prep, .issue = io_openat2, + .filter_populate = io_openat_bpf_populate, }, [IORING_OP_EPOLL_CTL] = { .unbound_nonreg_file = 1, @@ -332,13 +332,13 @@ const struct io_issue_def io_issue_defs[] = { .audit_skip = 1, .iopoll = 1, .prep = io_provide_buffers_prep, - .issue = io_provide_buffers, + .issue = io_manage_buffers_legacy, }, [IORING_OP_REMOVE_BUFFERS] = { .audit_skip = 1, .iopoll = 1, .prep = io_remove_buffers_prep, - .issue = io_remove_buffers, + .issue = io_manage_buffers_legacy, }, [IORING_OP_TEE] = { .needs_file = 1, @@ -404,18 +404,20 @@ const struct io_issue_def io_issue_defs[] = { [IORING_OP_SOCKET] = { .audit_skip = 1, #if defined(CONFIG_NET) + .filter_pdu_size = sizeof_field(struct io_uring_bpf_ctx, socket), .prep = io_socket_prep, .issue = io_socket, + .filter_populate = io_socket_bpf_populate, #else .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_URING_CMD] = { + .buffer_select = 1, .needs_file = 1, .plug = 1, .iopoll = 1, - .iopoll_queue = 1, - .async_size = sizeof(struct io_uring_cmd_data), + .async_size = sizeof(struct io_async_cmd), .prep = io_uring_cmd_prep, .issue = io_uring_cmd, }, @@ -428,7 +430,7 @@ const struct io_issue_def io_issue_defs[] = { #if defined(CONFIG_NET) .async_size = sizeof(struct io_async_msghdr), .prep = io_send_zc_prep, - .issue = io_send_zc, + .issue = io_sendmsg_zc, #else .prep = io_eopnotsupp_prep, #endif @@ -516,6 +518,77 @@ const struct io_issue_def io_issue_defs[] = { .prep = io_eopnotsupp_prep, #endif }, + [IORING_OP_RECV_ZC] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollin = 1, + .ioprio = 1, +#if defined(CONFIG_NET) + .prep = io_recvzc_prep, + .issue = io_recvzc, +#else + .prep = io_eopnotsupp_prep, +#endif + }, + [IORING_OP_EPOLL_WAIT] = { + .needs_file = 1, + .audit_skip = 1, + .pollin = 1, +#if defined(CONFIG_EPOLL) + .prep = io_epoll_wait_prep, + .issue = io_epoll_wait, +#else + .prep = io_eopnotsupp_prep, +#endif + }, + [IORING_OP_READV_FIXED] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollin = 1, + .plug = 1, + .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, + .vectored = 1, + .async_size = sizeof(struct io_async_rw), + .prep = io_prep_readv_fixed, + .issue = io_read, + }, + [IORING_OP_WRITEV_FIXED] = { + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + .pollout = 1, + .plug = 1, + .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, + .vectored = 1, + .async_size = sizeof(struct io_async_rw), + .prep = io_prep_writev_fixed, + .issue = io_write, + }, + [IORING_OP_PIPE] = { + .prep = io_pipe_prep, + .issue = io_pipe, + }, + [IORING_OP_NOP128] = { + .audit_skip = 1, + .iopoll = 1, + .is_128 = 1, + .prep = io_nop_prep, + .issue = io_nop, + }, + [IORING_OP_URING_CMD128] = { + .buffer_select = 1, + .needs_file = 1, + .plug = 1, + .iopoll = 1, + .is_128 = 1, + .async_size = sizeof(struct io_async_cmd), + .prep = io_uring_cmd_prep, + .issue = io_uring_cmd, + }, }; const struct io_cold_def io_cold_defs[] = { @@ -702,6 +775,8 @@ const struct io_cold_def io_cold_defs[] = { }, [IORING_OP_URING_CMD] = { .name = "URING_CMD", + .sqe_copy = io_uring_cmd_sqe_copy, + .cleanup = io_uring_cmd_cleanup, }, [IORING_OP_SEND_ZC] = { .name = "SEND_ZC", @@ -745,6 +820,33 @@ const struct io_cold_def io_cold_defs[] = { [IORING_OP_LISTEN] = { .name = "LISTEN", }, + [IORING_OP_RECV_ZC] = { + .name = "RECV_ZC", + }, + [IORING_OP_EPOLL_WAIT] = { + .name = "EPOLL_WAIT", + }, + [IORING_OP_READV_FIXED] = { + .name = "READV_FIXED", + .cleanup = io_readv_writev_cleanup, + .fail = io_rw_fail, + }, + [IORING_OP_WRITEV_FIXED] = { + .name = "WRITEV_FIXED", + .cleanup = io_readv_writev_cleanup, + .fail = io_rw_fail, + }, + [IORING_OP_PIPE] = { + .name = "PIPE", + }, + [IORING_OP_NOP128] = { + .name = "NOP128", + }, + [IORING_OP_URING_CMD128] = { + .name = "URING_CMD128", + .sqe_copy = io_uring_cmd_sqe_copy, + .cleanup = io_uring_cmd_cleanup, + }, }; const char *io_uring_get_opcode(u8 opcode) diff --git a/io_uring/opdef.h b/io_uring/opdef.h index 14456436ff74..667f981e63b0 100644 --- a/io_uring/opdef.h +++ b/io_uring/opdef.h @@ -2,11 +2,19 @@ #ifndef IOU_OP_DEF_H #define IOU_OP_DEF_H +struct io_uring_bpf_ctx; + struct io_issue_def { /* needs req->file assigned */ unsigned needs_file : 1; /* should block plug */ unsigned plug : 1; + /* supports ioprio */ + unsigned ioprio : 1; + /* supports iopoll */ + unsigned iopoll : 1; + /* op supports buffer selection */ + unsigned buffer_select : 1; /* hash wq insertion if file is a regular file */ unsigned hash_reg_file : 1; /* unbound wq insertion if file is a non-regular file */ @@ -15,29 +23,28 @@ struct io_issue_def { unsigned pollin : 1; unsigned pollout : 1; unsigned poll_exclusive : 1; - /* op supports buffer selection */ - unsigned buffer_select : 1; /* skip auditing */ unsigned audit_skip : 1; - /* supports ioprio */ - unsigned ioprio : 1; - /* supports iopoll */ - unsigned iopoll : 1; - /* have to be put into the iopoll list */ - unsigned iopoll_queue : 1; /* vectored opcode, set if 1) vectored, and 2) handler needs to know */ unsigned vectored : 1; + /* set to 1 if this opcode uses 128b sqes in a mixed sq */ + unsigned is_128 : 1; /* size of async data needed, if any */ unsigned short async_size; + /* bpf filter pdu size, if any */ + unsigned short filter_pdu_size; + int (*issue)(struct io_kiocb *, unsigned int); int (*prep)(struct io_kiocb *, const struct io_uring_sqe *); + void (*filter_populate)(struct io_uring_bpf_ctx *, struct io_kiocb *); }; struct io_cold_def { const char *name; + void (*sqe_copy)(struct io_kiocb *); void (*cleanup)(struct io_kiocb *); void (*fail)(struct io_kiocb *); }; diff --git a/io_uring/openclose.c b/io_uring/openclose.c index e3357dfa14ca..c71242915dad 100644 --- a/io_uring/openclose.c +++ b/io_uring/openclose.c @@ -6,12 +6,15 @@ #include <linux/fdtable.h> #include <linux/fsnotify.h> #include <linux/namei.h> +#include <linux/pipe_fs_i.h> +#include <linux/watch_queue.h> #include <linux/io_uring.h> #include <uapi/linux/io_uring.h> #include "../fs/internal.h" +#include "filetable.h" #include "io_uring.h" #include "rsrc.h" #include "openclose.h" @@ -20,7 +23,7 @@ struct io_open { struct file *file; int dfd; u32 file_slot; - struct filename *filename; + struct delayed_filename filename; struct open_how how; unsigned long nofile; }; @@ -64,24 +67,30 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe open->dfd = READ_ONCE(sqe->fd); fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); - open->filename = getname(fname); - if (IS_ERR(open->filename)) { - ret = PTR_ERR(open->filename); - open->filename = NULL; + ret = delayed_getname(&open->filename, fname); + if (unlikely(ret)) return ret; - } + req->flags |= REQ_F_NEED_CLEANUP; open->file_slot = READ_ONCE(sqe->file_index); if (open->file_slot && (open->how.flags & O_CLOEXEC)) return -EINVAL; open->nofile = rlimit(RLIMIT_NOFILE); - req->flags |= REQ_F_NEED_CLEANUP; if (io_openat_force_async(open)) req->flags |= REQ_F_FORCE_ASYNC; return 0; } +void io_openat_bpf_populate(struct io_uring_bpf_ctx *bctx, struct io_kiocb *req) +{ + struct io_open *open = io_kiocb_to_cmd(req, struct io_open); + + bctx->open.flags = open->how.flags; + bctx->open.mode = open->how.mode; + bctx->open.resolve = open->how.resolve; +} + int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_open *open = io_kiocb_to_cmd(req, struct io_open); @@ -118,6 +127,7 @@ int io_openat2(struct io_kiocb *req, unsigned int issue_flags) struct file *file; bool resolve_nonblock, nonblock_set; bool fixed = !!open->file_slot; + CLASS(filename_complete_delayed, name)(&open->filename); int ret; ret = build_open_flags(&open->how, &op); @@ -137,7 +147,7 @@ int io_openat2(struct io_kiocb *req, unsigned int issue_flags) goto err; } - file = do_filp_open(open->dfd, open->filename, &op); + file = do_file_open(open->dfd, name, &op); if (IS_ERR(file)) { /* * We could hang on to this 'fd' on retrying, but seems like @@ -149,9 +159,13 @@ int io_openat2(struct io_kiocb *req, unsigned int issue_flags) ret = PTR_ERR(file); /* only retry if RESOLVE_CACHED wasn't already set by application */ - if (ret == -EAGAIN && - (!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK))) - return -EAGAIN; + if (ret == -EAGAIN && !resolve_nonblock && + (issue_flags & IO_URING_F_NONBLOCK)) { + ret = putname_to_delayed(&open->filename, + no_free_ptr(name)); + if (likely(!ret)) + return -EAGAIN; + } goto err; } @@ -164,12 +178,11 @@ int io_openat2(struct io_kiocb *req, unsigned int issue_flags) ret = io_fixed_fd_install(req, issue_flags, file, open->file_slot); err: - putname(open->filename); req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_openat(struct io_kiocb *req, unsigned int issue_flags) @@ -181,8 +194,7 @@ void io_open_cleanup(struct io_kiocb *req) { struct io_open *open = io_kiocb_to_cmd(req, struct io_open); - if (open->filename) - putname(open->filename); + dismiss_delayed_filename(&open->filename); } int __io_close_fixed(struct io_ring_ctx *ctx, unsigned int issue_flags, @@ -257,7 +269,7 @@ err: if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_install_fixed_fd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -300,5 +312,137 @@ int io_install_fixed_fd(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; +} + +struct io_pipe { + struct file *file; + int __user *fds; + int flags; + int file_slot; + unsigned long nofile; +}; + +int io_pipe_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe); + + if (sqe->fd || sqe->off || sqe->addr3) + return -EINVAL; + + p->fds = u64_to_user_ptr(READ_ONCE(sqe->addr)); + p->flags = READ_ONCE(sqe->pipe_flags); + if (p->flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT | O_NOTIFICATION_PIPE)) + return -EINVAL; + + p->file_slot = READ_ONCE(sqe->file_index); + p->nofile = rlimit(RLIMIT_NOFILE); + return 0; +} + +static int io_pipe_fixed(struct io_kiocb *req, struct file **files, + unsigned int issue_flags) +{ + struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe); + struct io_ring_ctx *ctx = req->ctx; + bool alloc_slot; + int ret, fds[2] = { -1, -1 }; + int slot = p->file_slot; + + if (p->flags & O_CLOEXEC) + return -EINVAL; + + alloc_slot = slot == IORING_FILE_INDEX_ALLOC; + + io_ring_submit_lock(ctx, issue_flags); + + ret = __io_fixed_fd_install(ctx, files[0], slot); + if (ret < 0) + goto err; + fds[0] = alloc_slot ? ret : slot - 1; + files[0] = NULL; + + /* + * If a specific slot is given, next one will be used for + * the write side. + */ + if (!alloc_slot) + slot++; + + ret = __io_fixed_fd_install(ctx, files[1], slot); + if (ret < 0) + goto err; + fds[1] = alloc_slot ? ret : slot - 1; + files[1] = NULL; + + io_ring_submit_unlock(ctx, issue_flags); + + if (!copy_to_user(p->fds, fds, sizeof(fds))) + return 0; + + ret = -EFAULT; + io_ring_submit_lock(ctx, issue_flags); +err: + if (fds[0] != -1) + io_fixed_fd_remove(ctx, fds[0]); + if (fds[1] != -1) + io_fixed_fd_remove(ctx, fds[1]); + io_ring_submit_unlock(ctx, issue_flags); + return ret; +} + +static int io_pipe_fd(struct io_kiocb *req, struct file **files) +{ + struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe); + int ret, fds[2] = { -1, -1 }; + + ret = __get_unused_fd_flags(p->flags, p->nofile); + if (ret < 0) + goto err; + fds[0] = ret; + + ret = __get_unused_fd_flags(p->flags, p->nofile); + if (ret < 0) + goto err; + fds[1] = ret; + + if (!copy_to_user(p->fds, fds, sizeof(fds))) { + fd_install(fds[0], files[0]); + fd_install(fds[1], files[1]); + return 0; + } + ret = -EFAULT; +err: + if (fds[0] != -1) + put_unused_fd(fds[0]); + if (fds[1] != -1) + put_unused_fd(fds[1]); + return ret; +} + +int io_pipe(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe); + struct file *files[2]; + int ret; + + ret = create_pipe_files(files, p->flags); + if (ret) + return ret; + + if (!!p->file_slot) + ret = io_pipe_fixed(req, files, issue_flags); + else + ret = io_pipe_fd(req, files); + + io_req_set_res(req, ret, 0); + if (!ret) + return IOU_COMPLETE; + + req_set_fail(req); + if (files[0]) + fput(files[0]); + if (files[1]) + fput(files[1]); + return ret; } diff --git a/io_uring/openclose.h b/io_uring/openclose.h index 8a93c98ad0ad..566739920658 100644 --- a/io_uring/openclose.h +++ b/io_uring/openclose.h @@ -1,11 +1,14 @@ // SPDX-License-Identifier: GPL-2.0 +#include "bpf_filter.h" + int __io_close_fixed(struct io_ring_ctx *ctx, unsigned int issue_flags, unsigned int offset); int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_openat(struct io_kiocb *req, unsigned int issue_flags); void io_open_cleanup(struct io_kiocb *req); +void io_openat_bpf_populate(struct io_uring_bpf_ctx *bctx, struct io_kiocb *req); int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_openat2(struct io_kiocb *req, unsigned int issue_flags); @@ -13,5 +16,8 @@ int io_openat2(struct io_kiocb *req, unsigned int issue_flags); int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_close(struct io_kiocb *req, unsigned int issue_flags); +int io_pipe_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_pipe(struct io_kiocb *req, unsigned int issue_flags); + int io_install_fixed_fd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_install_fixed_fd(struct io_kiocb *req, unsigned int issue_flags); diff --git a/io_uring/poll.c b/io_uring/poll.c index bb1c0cd4f809..0204affdc308 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -93,7 +93,7 @@ static bool io_poll_get_ownership_slowpath(struct io_kiocb *req) */ static inline bool io_poll_get_ownership(struct io_kiocb *req) { - if (unlikely(atomic_read(&req->poll_refs) >= IO_POLL_REF_BIAS)) + if (unlikely((unsigned int)atomic_read(&req->poll_refs) >= IO_POLL_REF_BIAS)) return io_poll_get_ownership_slowpath(req); return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK); } @@ -138,14 +138,32 @@ static void io_init_poll_iocb(struct io_poll *poll, __poll_t events) init_waitqueue_func_entry(&poll->wait, io_poll_wake); } +static void io_poll_remove_waitq(struct io_poll *poll) +{ + /* + * If the waitqueue is being freed early but someone is already holds + * ownership over it, we have to tear down the request as best we can. + * That means immediately removing the request from its waitqueue and + * preventing all further accesses to the waitqueue via the request. + */ + list_del_init(&poll->wait.entry); + + /* + * Careful: this *must* be the last step, since as soon as req->head is + * NULL'ed out, the request can be completed and freed, since + * io_poll_remove_entry() will no longer need to take the waitqueue + * lock. + */ + smp_store_release(&poll->head, NULL); +} + static inline void io_poll_remove_entry(struct io_poll *poll) { struct wait_queue_head *head = smp_load_acquire(&poll->head); if (head) { spin_lock_irq(&head->lock); - list_del_init(&poll->wait.entry); - poll->head = NULL; + io_poll_remove_waitq(poll); spin_unlock_irq(&head->lock); } } @@ -220,11 +238,11 @@ static inline void io_poll_execute(struct io_kiocb *req, int res) * req->cqe.res. IOU_POLL_REMOVE_POLL_USE_RES indicates to remove multishot * poll and that the result is stored in req->cqe. */ -static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) +static int io_poll_check_events(struct io_kiocb *req, io_tw_token_t tw) { int v; - if (unlikely(io_should_terminate_tw())) + if (unlikely(tw.cancel)) return -ECANCELED; do { @@ -254,12 +272,15 @@ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) atomic_andnot(IO_POLL_RETRY_FLAG, &req->poll_refs); v &= ~IO_POLL_RETRY_FLAG; } + v &= IO_POLL_REF_MASK; } /* the mask was stashed in __io_poll_execute */ if (!req->cqe.res) { - struct poll_table_struct pt = { ._key = req->apoll_events }; - req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events; + __poll_t events = req->apoll_events; + struct poll_table_struct pt = { ._key = events }; + + req->cqe.res = vfs_poll(req->file, &pt) & events; /* * We got woken with a mask, but someone else got to * it first. The above vfs_poll() doesn't add us back @@ -268,13 +289,11 @@ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) */ if (unlikely(!req->cqe.res)) { /* Multishot armed need not reissue */ - if (!(req->apoll_events & EPOLLONESHOT)) + if (!(events & EPOLLONESHOT)) continue; return IOU_POLL_REISSUE; } } - if (unlikely(req->cqe.res & EPOLLERR)) - req_set_fail(req); if (req->apoll_events & EPOLLONESHOT) return IOU_POLL_DONE; @@ -288,12 +307,18 @@ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) return IOU_POLL_REMOVE_POLL_USE_RES; } } else { - int ret = io_poll_issue(req, ts); - if (ret == IOU_STOP_MULTISHOT) + int ret; + + /* multiple refs and HUP, ensure we loop once more */ + if ((req->cqe.res & (POLLHUP | POLLRDHUP)) && v != 1) + v--; + + ret = io_poll_issue(req, tw); + if (ret == IOU_COMPLETE) return IOU_POLL_REMOVE_POLL_USE_RES; else if (ret == IOU_REQUEUE) return IOU_POLL_REQUEUE; - if (ret < 0) + if (ret != IOU_RETRY && ret < 0) return ret; } @@ -304,23 +329,21 @@ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) * Release all references, retry if someone tried to restart * task_work while we were executing it. */ - v &= IO_POLL_REF_MASK; } while (atomic_sub_return(v, &req->poll_refs) & IO_POLL_REF_MASK); io_napi_add(req); return IOU_POLL_NO_ACTION; } -void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts) +void io_poll_task_func(struct io_tw_req tw_req, io_tw_token_t tw) { + struct io_kiocb *req = tw_req.req; int ret; - ret = io_poll_check_events(req, ts); + ret = io_poll_check_events(req, tw); if (ret == IOU_POLL_NO_ACTION) { - io_kbuf_recycle(req, 0); return; } else if (ret == IOU_POLL_REQUEUE) { - io_kbuf_recycle(req, 0); __io_poll_execute(req, 0); return; } @@ -335,7 +358,7 @@ void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts) poll = io_kiocb_to_cmd(req, struct io_poll); req->cqe.res = mangle_poll(req->cqe.res & poll->events); } else if (ret == IOU_POLL_REISSUE) { - io_req_task_submit(req, ts); + io_req_task_submit(tw_req, tw); return; } else if (ret != IOU_POLL_REMOVE_POLL_USE_RES) { req->cqe.res = ret; @@ -343,14 +366,14 @@ void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts) } io_req_set_res(req, req->cqe.res, 0); - io_req_task_complete(req, ts); + io_req_task_complete(tw_req, tw); } else { - io_tw_lock(req->ctx, ts); + io_tw_lock(req->ctx, tw); if (ret == IOU_POLL_REMOVE_POLL_USE_RES) - io_req_task_complete(req, ts); + io_req_task_complete(tw_req, tw); else if (ret == IOU_POLL_DONE || ret == IOU_POLL_REISSUE) - io_req_task_submit(req, ts); + io_req_task_submit(tw_req, tw); else io_req_defer_failed(req, ret); } @@ -370,23 +393,7 @@ static __cold int io_pollfree_wake(struct io_kiocb *req, struct io_poll *poll) io_poll_mark_cancelled(req); /* we have to kick tw in case it's not already */ io_poll_execute(req, 0); - - /* - * If the waitqueue is being freed early but someone is already - * holds ownership over it, we have to tear down the request as - * best we can. That means immediately removing the request from - * its waitqueue and preventing all further accesses to the - * waitqueue via the request. - */ - list_del_init(&poll->wait.entry); - - /* - * Careful: this *must* be the last step, since as soon - * as req->head is NULL'ed out, the request can be - * completed and freed, since aio_poll_complete_work() - * will no longer need to take the waitqueue lock. - */ - smp_store_release(&poll->head, NULL); + io_poll_remove_waitq(poll); return 1; } @@ -410,13 +417,14 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, * disable multishot as there is a circular dependency between * CQ posting and triggering the event. */ - if (mask & EPOLL_URING_WAKE) + if (mask & EPOLL_URING_WAKE) { poll->events |= EPOLLONESHOT; + req->apoll_events |= EPOLLONESHOT; + } /* optional, saves extra locking for removal in tw handler */ if (mask && poll->events & EPOLLONESHOT) { - list_del_init(&poll->wait.entry); - poll->head = NULL; + io_poll_remove_waitq(poll); if (wqe_is_double(wait)) req->flags &= ~REQ_F_DOUBLE_POLL; else @@ -479,7 +487,7 @@ static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt, return; } - poll = kmalloc(sizeof(*poll), GFP_ATOMIC); + poll = kmalloc_obj(*poll, GFP_ATOMIC); if (!poll) { pt->error = -ENOMEM; return; @@ -656,7 +664,7 @@ static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req, if (!(issue_flags & IO_URING_F_UNLOCKED)) apoll = io_cache_alloc(&ctx->apoll_cache, GFP_ATOMIC); else - apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); + apoll = kmalloc_obj(*apoll, GFP_ATOMIC); if (!apoll) return NULL; apoll->poll.retries = APOLL_MAX_RETRY; @@ -668,33 +676,18 @@ static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req, return apoll; } -int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) +int io_arm_apoll(struct io_kiocb *req, unsigned issue_flags, __poll_t mask) { - const struct io_issue_def *def = &io_issue_defs[req->opcode]; struct async_poll *apoll; struct io_poll_table ipt; - __poll_t mask = POLLPRI | POLLERR | EPOLLET; int ret; - if (!def->pollin && !def->pollout) - return IO_APOLL_ABORTED; + mask |= EPOLLET; if (!io_file_can_poll(req)) return IO_APOLL_ABORTED; if (!(req->flags & REQ_F_APOLL_MULTISHOT)) mask |= EPOLLONESHOT; - if (def->pollin) { - mask |= EPOLLIN | EPOLLRDNORM; - - /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */ - if (req->flags & REQ_F_CLEAR_POLLIN) - mask &= ~EPOLLIN; - } else { - mask |= EPOLLOUT | EPOLLWRNORM; - } - if (def->poll_exclusive) - mask |= EPOLLEXCLUSIVE; - apoll = io_req_alloc_apoll(req, issue_flags); if (!apoll) return IO_APOLL_ABORTED; @@ -702,8 +695,6 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) req->flags |= REQ_F_POLLED; ipt.pt._qproc = io_async_queue_proc; - io_kbuf_recycle(req, issue_flags); - ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, issue_flags); if (ret) return ret > 0 ? IO_APOLL_READY : IO_APOLL_ABORTED; @@ -711,6 +702,31 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) return IO_APOLL_OK; } +int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) +{ + const struct io_issue_def *def = &io_issue_defs[req->opcode]; + __poll_t mask = POLLPRI | POLLERR; + + if (!def->pollin && !def->pollout) + return IO_APOLL_ABORTED; + if (!io_file_can_poll(req)) + return IO_APOLL_ABORTED; + + if (def->pollin) { + mask |= EPOLLIN | EPOLLRDNORM; + + /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */ + if (req->flags & REQ_F_CLEAR_POLLIN) + mask &= ~EPOLLIN; + } else { + mask |= EPOLLOUT | EPOLLWRNORM; + } + if (def->poll_exclusive) + mask |= EPOLLEXCLUSIVE; + + return io_arm_apoll(req, issue_flags, mask); +} + /* * Returns true if we found and killed one or more poll requests */ @@ -892,7 +908,7 @@ int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) ret = __io_arm_poll_handler(req, poll, &ipt, poll->events, issue_flags); if (ret > 0) { io_req_set_res(req, ipt.result_mask, 0); - return IOU_OK; + return IOU_COMPLETE; } return ret ?: IOU_ISSUE_SKIP_COMPLETE; } @@ -931,12 +947,17 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) ret2 = io_poll_add(preq, issue_flags & ~IO_URING_F_UNLOCKED); /* successfully updated, don't complete poll request */ - if (!ret2 || ret2 == -EIOCBQUEUED) + if (ret2 == IOU_ISSUE_SKIP_COMPLETE) goto out; + /* request completed as part of the update, complete it */ + else if (ret2 == IOU_COMPLETE) + goto complete; } - req_set_fail(preq); io_req_set_res(preq, -ECANCELED, 0); +complete: + if (preq->cqe.res < 0) + req_set_fail(preq); preq->io_task_work.func = io_req_task_complete; io_req_task_work_add(preq); out: @@ -947,5 +968,5 @@ out: } /* complete update request, we're done with it */ io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } diff --git a/io_uring/poll.h b/io_uring/poll.h index 04ede93113dc..5647c5138932 100644 --- a/io_uring/poll.h +++ b/io_uring/poll.h @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/io_uring_types.h> + #define IO_POLL_ALLOC_CACHE_MAX 32 enum { @@ -39,8 +41,9 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags); struct io_cancel_data; int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, unsigned issue_flags); +int io_arm_apoll(struct io_kiocb *req, unsigned issue_flags, __poll_t mask); int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags); bool io_poll_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all); -void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts); +void io_poll_task_func(struct io_tw_req tw_req, io_tw_token_t tw); diff --git a/io_uring/query.c b/io_uring/query.c new file mode 100644 index 000000000000..c1704d088374 --- /dev/null +++ b/io_uring/query.c @@ -0,0 +1,136 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "linux/io_uring/query.h" + +#include "query.h" +#include "io_uring.h" +#include "zcrx.h" + +union io_query_data { + struct io_uring_query_opcode opcodes; + struct io_uring_query_zcrx zcrx; + struct io_uring_query_scq scq; +}; + +#define IO_MAX_QUERY_SIZE sizeof(union io_query_data) +#define IO_MAX_QUERY_ENTRIES 1000 + +static ssize_t io_query_ops(union io_query_data *data) +{ + struct io_uring_query_opcode *e = &data->opcodes; + + e->nr_request_opcodes = IORING_OP_LAST; + e->nr_register_opcodes = IORING_REGISTER_LAST; + e->feature_flags = IORING_FEAT_FLAGS; + e->ring_setup_flags = IORING_SETUP_FLAGS; + e->enter_flags = IORING_ENTER_FLAGS; + e->sqe_flags = SQE_VALID_FLAGS; + e->nr_query_opcodes = __IO_URING_QUERY_MAX; + e->__pad = 0; + return sizeof(*e); +} + +static ssize_t io_query_zcrx(union io_query_data *data) +{ + struct io_uring_query_zcrx *e = &data->zcrx; + + e->register_flags = ZCRX_SUPPORTED_REG_FLAGS; + e->area_flags = IORING_ZCRX_AREA_DMABUF; + e->nr_ctrl_opcodes = __ZCRX_CTRL_LAST; + e->rq_hdr_size = sizeof(struct io_uring); + e->rq_hdr_alignment = L1_CACHE_BYTES; + e->features = ZCRX_FEATURES; + e->__resv2 = 0; + return sizeof(*e); +} + +static ssize_t io_query_scq(union io_query_data *data) +{ + struct io_uring_query_scq *e = &data->scq; + + e->hdr_size = sizeof(struct io_rings); + e->hdr_alignment = SMP_CACHE_BYTES; + return sizeof(*e); +} + +static int io_handle_query_entry(union io_query_data *data, void __user *uhdr, + u64 *next_entry) +{ + struct io_uring_query_hdr hdr; + size_t usize, res_size = 0; + ssize_t ret = -EINVAL; + void __user *udata; + + if (copy_from_user(&hdr, uhdr, sizeof(hdr))) + return -EFAULT; + usize = hdr.size; + hdr.size = min(hdr.size, IO_MAX_QUERY_SIZE); + udata = u64_to_user_ptr(hdr.query_data); + + if (hdr.query_op >= __IO_URING_QUERY_MAX) { + ret = -EOPNOTSUPP; + goto out; + } + if (!mem_is_zero(hdr.__resv, sizeof(hdr.__resv)) || hdr.result || !hdr.size) + goto out; + if (copy_from_user(data, udata, hdr.size)) + return -EFAULT; + + switch (hdr.query_op) { + case IO_URING_QUERY_OPCODES: + ret = io_query_ops(data); + break; + case IO_URING_QUERY_ZCRX: + ret = io_query_zcrx(data); + break; + case IO_URING_QUERY_SCQ: + ret = io_query_scq(data); + break; + } + + if (ret >= 0) { + if (WARN_ON_ONCE(ret > IO_MAX_QUERY_SIZE)) + return -EFAULT; + res_size = ret; + ret = 0; + } +out: + hdr.result = ret; + hdr.size = min_t(size_t, usize, res_size); + + if (copy_struct_to_user(udata, usize, data, hdr.size, NULL)) + return -EFAULT; + if (copy_to_user(uhdr, &hdr, sizeof(hdr))) + return -EFAULT; + *next_entry = hdr.next_entry; + return 0; +} + +int io_query(void __user *arg, unsigned nr_args) +{ + union io_query_data entry_buffer; + void __user *uhdr = arg; + int ret, nr = 0; + + memset(&entry_buffer, 0, sizeof(entry_buffer)); + + if (nr_args) + return -EINVAL; + + while (uhdr) { + u64 next_hdr; + + ret = io_handle_query_entry(&entry_buffer, uhdr, &next_hdr); + if (ret) + return ret; + uhdr = u64_to_user_ptr(next_hdr); + + /* Have some limit to avoid a potential cycle */ + if (++nr >= IO_MAX_QUERY_ENTRIES) + return -ERANGE; + if (fatal_signal_pending(current)) + return -EINTR; + cond_resched(); + } + return 0; +} diff --git a/io_uring/query.h b/io_uring/query.h new file mode 100644 index 000000000000..b35eb52f0ea8 --- /dev/null +++ b/io_uring/query.h @@ -0,0 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef IORING_QUERY_H +#define IORING_QUERY_H + +#include <linux/io_uring_types.h> + +int io_query(void __user *arg, unsigned nr_args); + +#endif diff --git a/io_uring/refs.h b/io_uring/refs.h index 63982ead9f7d..0fe16b67c308 100644 --- a/io_uring/refs.h +++ b/io_uring/refs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef IOU_REQ_REF_H #define IOU_REQ_REF_H @@ -17,6 +18,13 @@ static inline bool req_ref_inc_not_zero(struct io_kiocb *req) return atomic_inc_not_zero(&req->refs); } +static inline bool req_ref_put_and_test_atomic(struct io_kiocb *req) +{ + WARN_ON_ONCE(!(data_race(req->flags) & REQ_F_REFCOUNT)); + WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); + return atomic_dec_and_test(&req->refs); +} + static inline bool req_ref_put_and_test(struct io_kiocb *req) { if (likely(!(req->flags & REQ_F_REFCOUNT))) diff --git a/io_uring/register.c b/io_uring/register.c index 9a4d2fbce4ae..dce5e2f9cf77 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -18,6 +18,7 @@ #include <linux/io_uring.h> #include <linux/io_uring_types.h> +#include "filetable.h" #include "io_uring.h" #include "opdef.h" #include "tctx.h" @@ -30,6 +31,9 @@ #include "eventfd.h" #include "msg_ring.h" #include "memmap.h" +#include "zcrx.h" +#include "query.h" +#include "bpf_filter.h" #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ IORING_REGISTER_LAST + IORING_OP_LAST) @@ -45,13 +49,9 @@ static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg, nr_args = IORING_OP_LAST; size = struct_size(p, ops, nr_args); - p = kzalloc(size, GFP_KERNEL); - if (!p) - return -ENOMEM; - - ret = -EFAULT; - if (copy_from_user(p, arg, size)) - goto out; + p = memdup_user(arg, size); + if (IS_ERR(p)) + return PTR_ERR(p); ret = -EINVAL; if (memchr_inv(p, 0, size)) goto out; @@ -104,6 +104,10 @@ static int io_register_personality(struct io_ring_ctx *ctx) return id; } +/* + * Returns number of restrictions parsed and added on success, or < 0 for + * an error. + */ static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args, struct io_restriction *restrictions) { @@ -130,25 +134,31 @@ static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args, if (res[i].register_op >= IORING_REGISTER_LAST) goto err; __set_bit(res[i].register_op, restrictions->register_op); + restrictions->reg_registered = true; break; case IORING_RESTRICTION_SQE_OP: if (res[i].sqe_op >= IORING_OP_LAST) goto err; __set_bit(res[i].sqe_op, restrictions->sqe_op); + restrictions->op_registered = true; break; case IORING_RESTRICTION_SQE_FLAGS_ALLOWED: restrictions->sqe_flags_allowed = res[i].sqe_flags; + restrictions->op_registered = true; break; case IORING_RESTRICTION_SQE_FLAGS_REQUIRED: restrictions->sqe_flags_required = res[i].sqe_flags; + restrictions->op_registered = true; break; default: goto err; } } - - ret = 0; - + ret = nr_args; + if (!nr_args) { + restrictions->op_registered = true; + restrictions->reg_registered = true; + } err: kfree(res); return ret; @@ -164,16 +174,104 @@ static __cold int io_register_restrictions(struct io_ring_ctx *ctx, return -EBADFD; /* We allow only a single restrictions registration */ - if (ctx->restrictions.registered) + if (ctx->restrictions.op_registered || ctx->restrictions.reg_registered) return -EBUSY; ret = io_parse_restrictions(arg, nr_args, &ctx->restrictions); - /* Reset all restrictions if an error happened */ - if (ret != 0) + /* + * Reset all restrictions if an error happened, but retain any COW'ed + * settings. + */ + if (ret < 0) { + struct io_bpf_filters *bpf = ctx->restrictions.bpf_filters; + bool cowed = ctx->restrictions.bpf_filters_cow; + memset(&ctx->restrictions, 0, sizeof(ctx->restrictions)); - else - ctx->restrictions.registered = true; - return ret; + ctx->restrictions.bpf_filters = bpf; + ctx->restrictions.bpf_filters_cow = cowed; + return ret; + } + if (ctx->restrictions.op_registered) + ctx->int_flags |= IO_RING_F_OP_RESTRICTED; + if (ctx->restrictions.reg_registered) + ctx->int_flags |= IO_RING_F_REG_RESTRICTED; + return 0; +} + +static int io_register_restrictions_task(void __user *arg, unsigned int nr_args) +{ + struct io_uring_task_restriction __user *ures = arg; + struct io_uring_task_restriction tres; + struct io_restriction *res; + int ret; + + /* Disallow if task already has registered restrictions */ + if (current->io_uring_restrict) + return -EPERM; + /* + * Similar to seccomp, disallow setting a filter if task_no_new_privs + * is false and we're not CAP_SYS_ADMIN. + */ + if (!task_no_new_privs(current) && + !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN)) + return -EACCES; + if (nr_args != 1) + return -EINVAL; + + if (copy_from_user(&tres, arg, sizeof(tres))) + return -EFAULT; + + if (tres.flags) + return -EINVAL; + if (!mem_is_zero(tres.resv, sizeof(tres.resv))) + return -EINVAL; + + res = kzalloc_obj(*res, GFP_KERNEL_ACCOUNT); + if (!res) + return -ENOMEM; + + ret = io_parse_restrictions(ures->restrictions, tres.nr_res, res); + if (ret < 0) { + kfree(res); + return ret; + } + current->io_uring_restrict = res; + return 0; +} + +static int io_register_bpf_filter_task(void __user *arg, unsigned int nr_args) +{ + struct io_restriction *res; + int ret; + + /* + * Similar to seccomp, disallow setting a filter if task_no_new_privs + * is false and we're not CAP_SYS_ADMIN. + */ + if (!task_no_new_privs(current) && + !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN)) + return -EACCES; + + if (nr_args != 1) + return -EINVAL; + + /* If no task restrictions exist, setup a new set */ + res = current->io_uring_restrict; + if (!res) { + res = kzalloc_obj(*res, GFP_KERNEL_ACCOUNT); + if (!res) + return -ENOMEM; + } + + ret = io_register_bpf_filter(res, arg); + if (ret) { + if (res != current->io_uring_restrict) + kfree(res); + return ret; + } + if (!current->io_uring_restrict) + current->io_uring_restrict = res; + return 0; } static int io_register_enable_rings(struct io_ring_ctx *ctx) @@ -181,8 +279,8 @@ static int io_register_enable_rings(struct io_ring_ctx *ctx) if (!(ctx->flags & IORING_SETUP_R_DISABLED)) return -EBADFD; - if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) { - WRITE_ONCE(ctx->submitter_task, get_task_struct(current)); + if (ctx->flags & IORING_SETUP_SINGLE_ISSUER) { + ctx->submitter_task = get_task_struct(current); /* * Lazy activation attempts would fail if it was polled before * submitter_task is set. @@ -191,10 +289,8 @@ static int io_register_enable_rings(struct io_ring_ctx *ctx) io_activate_pollwq(ctx); } - if (ctx->restrictions.registered) - ctx->restricted = 1; - - ctx->flags &= ~IORING_SETUP_R_DISABLED; + /* Keep submitter_task store before clearing IORING_SETUP_R_DISABLED */ + smp_store_release(&ctx->flags, ctx->flags & ~IORING_SETUP_R_DISABLED); if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait)) wake_up(&ctx->sq_data->wait); return 0; @@ -272,6 +368,8 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, if (ctx->flags & IORING_SETUP_SQPOLL) { sqd = ctx->sq_data; if (sqd) { + struct task_struct *tsk; + /* * Observe the correct sqd->lock -> ctx->uring_lock * ordering. Fine to drop uring_lock here, we hold @@ -281,8 +379,9 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, mutex_unlock(&ctx->uring_lock); mutex_lock(&sqd->lock); mutex_lock(&ctx->uring_lock); - if (sqd->thread) - tctx = sqd->thread->io_uring; + tsk = sqpoll_task_locked(sqd); + if (tsk) + tctx = tsk->io_uring; } } else { tctx = current->io_uring; @@ -293,7 +392,7 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, for (i = 0; i < ARRAY_SIZE(new_count); i++) if (new_count[i]) ctx->iowq_limits[i] = new_count[i]; - ctx->iowq_limits_set = true; + ctx->int_flags |= IO_RING_F_IOWQ_LIMITS_SET; if (tctx && tctx->io_wq) { ret = io_wq_max_workers(tctx->io_wq, new_count); @@ -318,6 +417,7 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, return 0; /* now propagate the restriction to all registered users */ + mutex_lock(&ctx->tctx_lock); list_for_each_entry(node, &ctx->tctx_list, ctx_node) { tctx = node->task->io_uring; if (WARN_ON_ONCE(!tctx->io_wq)) @@ -328,6 +428,7 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, /* ignore errors, it always returns zero anyway */ (void)io_wq_max_workers(tctx->io_wq, new_count); } + mutex_unlock(&ctx->tctx_lock); return 0; err: if (sqd) { @@ -377,11 +478,10 @@ struct io_ring_ctx_rings { }; static void io_register_free_rings(struct io_ring_ctx *ctx, - struct io_uring_params *p, struct io_ring_ctx_rings *r) { - io_free_region(ctx, &r->sq_region); - io_free_region(ctx, &r->ring_region); + io_free_region(ctx->user, &r->sq_region); + io_free_region(ctx->user, &r->ring_region); } #define swap_old(ctx, o, n, field) \ @@ -392,59 +492,46 @@ static void io_register_free_rings(struct io_ring_ctx *ctx, #define RESIZE_FLAGS (IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP) #define COPY_FLAGS (IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \ - IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP) + IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP | \ + IORING_SETUP_CQE_MIXED | IORING_SETUP_SQE_MIXED) static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) { + struct io_ctx_config config; struct io_uring_region_desc rd; struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL; - size_t size, sq_array_offset; unsigned i, tail, old_head; - struct io_uring_params p; + struct io_uring_params *p = &config.p; + struct io_rings_layout *rl = &config.layout; int ret; - /* for single issuer, must be owner resizing */ - if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && - current != ctx->submitter_task) - return -EEXIST; + memset(&config, 0, sizeof(config)); + /* limited to DEFER_TASKRUN for now */ if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) return -EINVAL; - if (copy_from_user(&p, arg, sizeof(p))) + if (copy_from_user(p, arg, sizeof(*p))) return -EFAULT; - if (p.flags & ~RESIZE_FLAGS) + if (p->flags & ~RESIZE_FLAGS) return -EINVAL; /* properties that are always inherited */ - p.flags |= (ctx->flags & COPY_FLAGS); + p->flags |= (ctx->flags & COPY_FLAGS); - ret = io_uring_fill_params(p.sq_entries, &p); + ret = io_prepare_config(&config); if (unlikely(ret)) return ret; - /* nothing to do, but copy params back */ - if (p.sq_entries == ctx->sq_entries && p.cq_entries == ctx->cq_entries) { - if (copy_to_user(arg, &p, sizeof(p))) - return -EFAULT; - return 0; - } - - size = rings_size(p.flags, p.sq_entries, p.cq_entries, - &sq_array_offset); - if (size == SIZE_MAX) - return -EOVERFLOW; - memset(&rd, 0, sizeof(rd)); - rd.size = PAGE_ALIGN(size); - if (p.flags & IORING_SETUP_NO_MMAP) { - rd.user_addr = p.cq_off.user_addr; + rd.size = PAGE_ALIGN(rl->rings_size); + if (p->flags & IORING_SETUP_NO_MMAP) { + rd.user_addr = p->cq_off.user_addr; rd.flags |= IORING_MEM_REGION_TYPE_USER; } - ret = io_create_region_mmap_safe(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING); - if (ret) { - io_register_free_rings(ctx, &p, &n); + ret = io_create_region(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING); + if (ret) return ret; - } + n.rings = io_region_get_ptr(&n.ring_region); /* @@ -455,34 +542,25 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) * intent... Use read/write once helpers from here on to indicate the * shared nature of it. */ - WRITE_ONCE(n.rings->sq_ring_mask, p.sq_entries - 1); - WRITE_ONCE(n.rings->cq_ring_mask, p.cq_entries - 1); - WRITE_ONCE(n.rings->sq_ring_entries, p.sq_entries); - WRITE_ONCE(n.rings->cq_ring_entries, p.cq_entries); + WRITE_ONCE(n.rings->sq_ring_mask, p->sq_entries - 1); + WRITE_ONCE(n.rings->cq_ring_mask, p->cq_entries - 1); + WRITE_ONCE(n.rings->sq_ring_entries, p->sq_entries); + WRITE_ONCE(n.rings->cq_ring_entries, p->cq_entries); - if (copy_to_user(arg, &p, sizeof(p))) { - io_register_free_rings(ctx, &p, &n); + if (copy_to_user(arg, p, sizeof(*p))) { + io_register_free_rings(ctx, &n); return -EFAULT; } - if (p.flags & IORING_SETUP_SQE128) - size = array_size(2 * sizeof(struct io_uring_sqe), p.sq_entries); - else - size = array_size(sizeof(struct io_uring_sqe), p.sq_entries); - if (size == SIZE_MAX) { - io_register_free_rings(ctx, &p, &n); - return -EOVERFLOW; - } - memset(&rd, 0, sizeof(rd)); - rd.size = PAGE_ALIGN(size); - if (p.flags & IORING_SETUP_NO_MMAP) { - rd.user_addr = p.sq_off.user_addr; + rd.size = PAGE_ALIGN(rl->sq_size); + if (p->flags & IORING_SETUP_NO_MMAP) { + rd.user_addr = p->sq_off.user_addr; rd.flags |= IORING_MEM_REGION_TYPE_USER; } - ret = io_create_region_mmap_safe(ctx, &n.sq_region, &rd, IORING_OFF_SQES); + ret = io_create_region(ctx, &n.sq_region, &rd, IORING_OFF_SQES); if (ret) { - io_register_free_rings(ctx, &p, &n); + io_register_free_rings(ctx, &n); return ret; } n.sq_sqes = io_region_get_ptr(&n.sq_region); @@ -518,20 +596,30 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) */ tail = READ_ONCE(o.rings->sq.tail); old_head = READ_ONCE(o.rings->sq.head); - if (tail - old_head > p.sq_entries) + if (tail - old_head > p->sq_entries) goto overflow; for (i = old_head; i < tail; i++) { - unsigned src_head = i & (ctx->sq_entries - 1); - unsigned dst_head = i & (p.sq_entries - 1); - - n.sq_sqes[dst_head] = o.sq_sqes[src_head]; + unsigned index, dst_mask, src_mask; + size_t sq_size; + + index = i; + sq_size = sizeof(struct io_uring_sqe); + src_mask = ctx->sq_entries - 1; + dst_mask = p->sq_entries - 1; + if (ctx->flags & IORING_SETUP_SQE128) { + index <<= 1; + sq_size <<= 1; + src_mask = (ctx->sq_entries << 1) - 1; + dst_mask = (p->sq_entries << 1) - 1; + } + memcpy(&n.sq_sqes[index & dst_mask], &o.sq_sqes[index & src_mask], sq_size); } WRITE_ONCE(n.rings->sq.head, old_head); WRITE_ONCE(n.rings->sq.tail, tail); tail = READ_ONCE(o.rings->cq.tail); old_head = READ_ONCE(o.rings->cq.head); - if (tail - old_head > p.cq_entries) { + if (tail - old_head > p->cq_entries) { overflow: /* restore old rings, and return -EOVERFLOW via cleanup path */ ctx->rings = o.rings; @@ -541,10 +629,20 @@ overflow: goto out; } for (i = old_head; i < tail; i++) { - unsigned src_head = i & (ctx->cq_entries - 1); - unsigned dst_head = i & (p.cq_entries - 1); - - n.rings->cqes[dst_head] = o.rings->cqes[src_head]; + unsigned index, dst_mask, src_mask; + size_t cq_size; + + index = i; + cq_size = sizeof(struct io_uring_cqe); + src_mask = ctx->cq_entries - 1; + dst_mask = p->cq_entries - 1; + if (ctx->flags & IORING_SETUP_CQE32) { + index <<= 1; + cq_size <<= 1; + src_mask = (ctx->cq_entries << 1) - 1; + dst_mask = (p->cq_entries << 1) - 1; + } + memcpy(&n.rings->cqes[index & dst_mask], &o.rings->cqes[index & src_mask], cq_size); } WRITE_ONCE(n.rings->cq.head, old_head); WRITE_ONCE(n.rings->cq.tail, tail); @@ -558,12 +656,20 @@ overflow: /* all done, store old pointers and assign new ones */ if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) - ctx->sq_array = (u32 *)((char *)n.rings + sq_array_offset); + ctx->sq_array = (u32 *)((char *)n.rings + rl->sq_array_offset); - ctx->sq_entries = p.sq_entries; - ctx->cq_entries = p.cq_entries; + ctx->sq_entries = p->sq_entries; + ctx->cq_entries = p->cq_entries; + /* + * Just mark any flag we may have missed and that the application + * should act on unconditionally. Worst case it'll be an extra + * syscall. + */ + atomic_or(IORING_SQ_TASKRUN | IORING_SQ_NEED_WAKEUP, &n.rings->sq_flags); ctx->rings = n.rings; + rcu_assign_pointer(ctx->rings_rcu, n.rings); + ctx->sq_sqes = n.sq_sqes; swap_old(ctx, o, n, ring_region); swap_old(ctx, o, n, sq_region); @@ -572,7 +678,10 @@ overflow: out: spin_unlock(&ctx->completion_lock); mutex_unlock(&ctx->mmap_lock); - io_register_free_rings(ctx, &p, to_free); + /* Wait for concurrent io_ctx_mark_taskrun() */ + if (to_free == &o) + synchronize_rcu_expedited(); + io_register_free_rings(ctx, to_free); if (ctx->sq_data) io_sq_thread_unpark(ctx->sq_data); @@ -586,6 +695,7 @@ static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg) struct io_uring_mem_region_reg reg; struct io_uring_region_desc __user *rd_uptr; struct io_uring_region_desc rd; + struct io_mapped_region region = {}; int ret; if (io_region_is_set(&ctx->param_region)) @@ -609,19 +719,20 @@ static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg) !(ctx->flags & IORING_SETUP_R_DISABLED)) return -EINVAL; - ret = io_create_region_mmap_safe(ctx, &ctx->param_region, &rd, - IORING_MAP_OFF_PARAM_REGION); + ret = io_create_region(ctx, ®ion, &rd, IORING_MAP_OFF_PARAM_REGION); if (ret) return ret; if (copy_to_user(rd_uptr, &rd, sizeof(rd))) { - io_free_region(ctx, &ctx->param_region); + io_free_region(ctx->user, ®ion); return -EFAULT; } if (reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) { - ctx->cq_wait_arg = io_region_get_ptr(&ctx->param_region); + ctx->cq_wait_arg = io_region_get_ptr(®ion); ctx->cq_wait_size = rd.size; } + + io_region_publish(ctx, ®ion, &ctx->param_region); return 0; } @@ -642,7 +753,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, if (ctx->submitter_task && ctx->submitter_task != current) return -EEXIST; - if (ctx->restricted) { + if ((ctx->int_flags & IO_RING_F_REG_RESTRICTED) && !(ctx->flags & IORING_SETUP_R_DISABLED)) { opcode = array_index_nospec(opcode, IORING_REGISTER_LAST); if (!test_bit(opcode, ctx->restrictions.register_op)) return -EACCES; @@ -813,6 +924,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_register_clone_buffers(ctx, arg); break; + case IORING_REGISTER_ZCRX_IFQ: + ret = -EINVAL; + if (!arg || nr_args != 1) + break; + ret = io_register_zcrx(ctx, arg); + break; case IORING_REGISTER_RESIZE_RINGS: ret = -EINVAL; if (!arg || nr_args != 1) @@ -825,6 +942,22 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_register_mem_region(ctx, arg); break; + case IORING_REGISTER_QUERY: + ret = io_query(arg, nr_args); + break; + case IORING_REGISTER_ZCRX_CTRL: + ret = io_zcrx_ctrl(ctx, arg, nr_args); + break; + case IORING_REGISTER_BPF_FILTER: + ret = -EINVAL; + + if (nr_args != 1) + break; + ret = io_register_bpf_filter(&ctx->restrictions, arg); + if (!ret) + WRITE_ONCE(ctx->bpf_filters, + ctx->restrictions.bpf_filters->filters); + break; default: ret = -EINVAL; break; @@ -833,38 +966,21 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, return ret; } -/* - * Given an 'fd' value, return the ctx associated with if. If 'registered' is - * true, then the registered index is used. Otherwise, the normal fd table. - * Caller must call fput() on the returned file, unless it's an ERR_PTR. - */ -struct file *io_uring_register_get_file(unsigned int fd, bool registered) +static int io_uring_register_send_msg_ring(void __user *arg, unsigned int nr_args) { - struct file *file; + struct io_uring_sqe sqe; - if (registered) { - /* - * Ring fd has been registered via IORING_REGISTER_RING_FDS, we - * need only dereference our task private array to find it. - */ - struct io_uring_task *tctx = current->io_uring; - - if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX)) - return ERR_PTR(-EINVAL); - fd = array_index_nospec(fd, IO_RINGFD_REG_MAX); - file = tctx->registered_rings[fd]; - if (file) - get_file(file); - } else { - file = fget(fd); - } + if (!arg || nr_args != 1) + return -EINVAL; + if (copy_from_user(&sqe, arg, sizeof(sqe))) + return -EFAULT; + /* no flags supported */ + if (sqe.flags) + return -EINVAL; + if (sqe.opcode != IORING_OP_MSG_RING) + return -EINVAL; - if (unlikely(!file)) - return ERR_PTR(-EBADF); - if (io_is_uring_fops(file)) - return file; - fput(file); - return ERR_PTR(-EOPNOTSUPP); + return io_uring_sync_msg_ring(&sqe); } /* @@ -875,21 +991,15 @@ static int io_uring_register_blind(unsigned int opcode, void __user *arg, unsigned int nr_args) { switch (opcode) { - case IORING_REGISTER_SEND_MSG_RING: { - struct io_uring_sqe sqe; - - if (!arg || nr_args != 1) - return -EINVAL; - if (copy_from_user(&sqe, arg, sizeof(sqe))) - return -EFAULT; - /* no flags supported */ - if (sqe.flags) - return -EINVAL; - if (sqe.opcode == IORING_OP_MSG_RING) - return io_uring_sync_msg_ring(&sqe); - } + case IORING_REGISTER_SEND_MSG_RING: + return io_uring_register_send_msg_ring(arg, nr_args); + case IORING_REGISTER_QUERY: + return io_query(arg, nr_args); + case IORING_REGISTER_RESTRICTIONS: + return io_register_restrictions_task(arg, nr_args); + case IORING_REGISTER_BPF_FILTER: + return io_register_bpf_filter_task(arg, nr_args); } - return -EINVAL; } @@ -910,7 +1020,7 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, if (fd == -1) return io_uring_register_blind(opcode, arg, nr_args); - file = io_uring_register_get_file(fd, use_registered_ring); + file = io_uring_ctx_get_file(fd, use_registered_ring); if (IS_ERR(file)) return PTR_ERR(file); ctx = file->private_data; @@ -922,6 +1032,7 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, ctx->buf_table.nr, ret); mutex_unlock(&ctx->uring_lock); - fput(file); + if (!use_registered_ring) + fput(file); return ret; } diff --git a/io_uring/register.h b/io_uring/register.h index a5f39d5ef9e0..c9da997d503c 100644 --- a/io_uring/register.h +++ b/io_uring/register.h @@ -4,6 +4,5 @@ int io_eventfd_unregister(struct io_ring_ctx *ctx); int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id); -struct file *io_uring_register_get_file(unsigned int fd, bool registered); #endif diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index af39b69eb4fd..650303626be6 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -9,9 +9,11 @@ #include <linux/hugetlb.h> #include <linux/compat.h> #include <linux/io_uring.h> +#include <linux/io_uring/cmd.h> #include <uapi/linux/io_uring.h> +#include "filetable.h" #include "io_uring.h" #include "openclose.h" #include "rsrc.h" @@ -32,6 +34,8 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, #define IORING_MAX_FIXED_FILES (1U << 20) #define IORING_MAX_REG_BUFFERS (1U << 14) +#define IO_CACHED_BVECS_SEGS 32 + int __io_account_mem(struct user_struct *user, unsigned long nr_pages) { unsigned long page_limit, cur_pages, new_pages; @@ -52,85 +56,135 @@ int __io_account_mem(struct user_struct *user, unsigned long nr_pages) return 0; } -static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) +void io_unaccount_mem(struct user_struct *user, struct mm_struct *mm_account, + unsigned long nr_pages) { - if (ctx->user) - __io_unaccount_mem(ctx->user, nr_pages); + if (user) + __io_unaccount_mem(user, nr_pages); - if (ctx->mm_account) - atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm); + if (mm_account) + atomic64_sub(nr_pages, &mm_account->pinned_vm); } -static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) +int io_account_mem(struct user_struct *user, struct mm_struct *mm_account, + unsigned long nr_pages) { int ret; - if (ctx->user) { - ret = __io_account_mem(ctx->user, nr_pages); + if (user) { + ret = __io_account_mem(user, nr_pages); if (ret) return ret; } - if (ctx->mm_account) - atomic64_add(nr_pages, &ctx->mm_account->pinned_vm); + if (mm_account) + atomic64_add(nr_pages, &mm_account->pinned_vm); return 0; } -static int io_buffer_validate(struct iovec *iov) +int io_validate_user_buf_range(u64 uaddr, u64 ulen) { - unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1); - - /* - * Don't impose further limits on the size and buffer - * constraints here, we'll -EINVAL later when IO is - * submitted if they are wrong. - */ - if (!iov->iov_base) - return iov->iov_len ? -EFAULT : 0; - if (!iov->iov_len) - return -EFAULT; + unsigned long tmp, base = (unsigned long)uaddr; + unsigned long acct_len = (unsigned long)PAGE_ALIGN(ulen); /* arbitrary limit, but we need something */ - if (iov->iov_len > SZ_1G) + if (ulen > SZ_1G || !ulen) return -EFAULT; - - if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp)) + if (check_add_overflow(base, acct_len, &tmp)) return -EOVERFLOW; - return 0; } -static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node) +static void io_release_ubuf(void *priv) { + struct io_mapped_ubuf *imu = priv; unsigned int i; - if (node->buf) { - struct io_mapped_ubuf *imu = node->buf; + for (i = 0; i < imu->nr_bvecs; i++) { + struct folio *folio = page_folio(imu->bvec[i].bv_page); + + unpin_user_folio(folio, 1); + } +} + +static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx, + int nr_bvecs) +{ + if (nr_bvecs <= IO_CACHED_BVECS_SEGS) + return io_cache_alloc(&ctx->imu_cache, GFP_KERNEL); + return kvmalloc_flex(struct io_mapped_ubuf, bvec, nr_bvecs); +} + +static void io_free_imu(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu) +{ + if (imu->nr_bvecs <= IO_CACHED_BVECS_SEGS) + io_cache_free(&ctx->imu_cache, imu); + else + kvfree(imu); +} +static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu) +{ + if (unlikely(refcount_read(&imu->refs) > 1)) { if (!refcount_dec_and_test(&imu->refs)) return; - for (i = 0; i < imu->nr_bvecs; i++) - unpin_user_page(imu->bvec[i].bv_page); - if (imu->acct_pages) - io_unaccount_mem(ctx, imu->acct_pages); - kvfree(imu); } + + if (imu->acct_pages) + io_unaccount_mem(ctx->user, ctx->mm_account, imu->acct_pages); + imu->release(imu->priv); + io_free_imu(ctx, imu); } -struct io_rsrc_node *io_rsrc_node_alloc(int type) +struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type) { struct io_rsrc_node *node; - node = kzalloc(sizeof(*node), GFP_KERNEL); + node = io_cache_alloc(&ctx->node_cache, GFP_KERNEL); if (node) { node->type = type; node->refs = 1; + node->tag = 0; + node->file_ptr = 0; } return node; } -__cold void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data) +bool io_rsrc_cache_init(struct io_ring_ctx *ctx) +{ + const int imu_cache_size = struct_size_t(struct io_mapped_ubuf, bvec, + IO_CACHED_BVECS_SEGS); + const int node_size = sizeof(struct io_rsrc_node); + bool ret; + + ret = io_alloc_cache_init(&ctx->node_cache, IO_ALLOC_CACHE_MAX, + node_size, 0); + ret |= io_alloc_cache_init(&ctx->imu_cache, IO_ALLOC_CACHE_MAX, + imu_cache_size, 0); + return ret; +} + +void io_rsrc_cache_free(struct io_ring_ctx *ctx) +{ + io_alloc_cache_free(&ctx->node_cache, kfree); + io_alloc_cache_free(&ctx->imu_cache, kvfree); +} + +static void io_clear_table_tags(struct io_rsrc_data *data) +{ + int i; + + for (i = 0; i < data->nr; i++) { + struct io_rsrc_node *node = data->nodes[i]; + + if (node) + node->tag = 0; + } +} + +__cold void io_rsrc_data_free(struct io_ring_ctx *ctx, + struct io_rsrc_data *data) { if (!data->nr) return; @@ -145,8 +199,8 @@ __cold void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data __cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr) { - data->nodes = kvmalloc_array(nr, sizeof(struct io_rsrc_node *), - GFP_KERNEL_ACCOUNT | __GFP_ZERO); + data->nodes = kvmalloc_objs(struct io_rsrc_node *, nr, + GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (data->nodes) { data->nr = nr; return 0; @@ -184,6 +238,9 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, continue; i = up->offset + done; + if (i >= ctx->file_table.data.nr) + break; + i = array_index_nospec(i, ctx->file_table.data.nr); if (io_reset_rsrc_node(ctx, &ctx->file_table.data, i)) io_file_bitmap_clear(&ctx->file_table, i); @@ -203,7 +260,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, err = -EBADF; break; } - node = io_rsrc_node_alloc(IORING_RSRC_FILE); + node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE); if (!node) { err = -ENOMEM; fput(file); @@ -241,7 +298,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, u64 tag = 0; uvec = u64_to_user_ptr(user_data); - iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat); + iov = iovec_from_user(uvec, 1, 1, &fast_iov, io_is_compat(ctx)); if (IS_ERR(iov)) { err = PTR_ERR(iov); break; @@ -250,9 +307,6 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, err = -EFAULT; break; } - err = io_buffer_validate(iov); - if (err) - break; node = io_sqe_buffer_register(ctx, iov, &last_hpage); if (IS_ERR(node)) { err = PTR_ERR(node); @@ -268,7 +322,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, i = array_index_nospec(up->offset + done, ctx->buf_table.nr); io_reset_rsrc_node(ctx, &ctx->buf_table, i); ctx->buf_table.nodes[i] = node; - if (ctx->compat) + if (io_is_compat(ctx)) user_data += sizeof(struct compat_iovec); else user_data += sizeof(struct iovec); @@ -387,7 +441,7 @@ static int io_files_update_with_index_alloc(struct io_kiocb *req, return -ENXIO; for (done = 0; done < up->nr_args; done++) { - if (copy_from_user(&fd, &fds[done], sizeof(fd))) { + if (get_user(fd, &fds[done])) { ret = -EFAULT; break; } @@ -401,7 +455,7 @@ static int io_files_update_with_index_alloc(struct io_kiocb *req, IORING_FILE_INDEX_ALLOC); if (ret < 0) break; - if (copy_to_user(&fds[done], &ret, sizeof(ret))) { + if (put_user(ret, &fds[done])) { __io_close_fixed(req->ctx, issue_flags, ret); ret = -EFAULT; break; @@ -439,7 +493,7 @@ int io_files_update(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) @@ -449,19 +503,17 @@ void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) switch (node->type) { case IORING_RSRC_FILE: - if (io_slot_file(node)) - fput(io_slot_file(node)); + fput(io_slot_file(node)); break; case IORING_RSRC_BUFFER: - if (node->buf) - io_buffer_unmap(ctx, node); + io_buffer_unmap(ctx, node->buf); break; default: WARN_ON_ONCE(1); break; } - kfree(node); + io_cache_free(&ctx->node_cache, node); } int io_sqe_files_unregister(struct io_ring_ctx *ctx) @@ -523,7 +575,7 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, goto fail; } ret = -ENOMEM; - node = io_rsrc_node_alloc(IORING_RSRC_FILE); + node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE); if (!node) { fput(file); goto fail; @@ -539,6 +591,7 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, io_file_table_set_alloc_range(ctx, 0, ctx->file_table.data.nr); return 0; fail: + io_clear_table_tags(&ctx->file_table.data); io_sqe_files_unregister(ctx); return ret; } @@ -618,7 +671,7 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, if (!imu->acct_pages) return 0; - ret = io_account_mem(ctx, imu->acct_pages); + ret = io_account_mem(ctx->user, ctx->mm_account, imu->acct_pages); if (ret) imu->acct_pages = 0; return ret; @@ -628,38 +681,34 @@ static bool io_coalesce_buffer(struct page ***pages, int *nr_pages, struct io_imu_folio_data *data) { struct page **page_array = *pages, **new_array = NULL; - int nr_pages_left = *nr_pages, i, j; - int nr_folios = data->nr_folios; + unsigned nr_pages_left = *nr_pages; + unsigned nr_folios = data->nr_folios; + unsigned i, j; /* Store head pages only*/ - new_array = kvmalloc_array(nr_folios, sizeof(struct page *), - GFP_KERNEL); + new_array = kvmalloc_objs(struct page *, nr_folios); if (!new_array) return false; - new_array[0] = compound_head(page_array[0]); - /* - * The pages are bound to the folio, it doesn't - * actually unpin them but drops all but one reference, - * which is usually put down by io_buffer_unmap(). - * Note, needs a better helper. - */ - if (data->nr_pages_head > 1) - unpin_user_pages(&page_array[1], data->nr_pages_head - 1); - - j = data->nr_pages_head; - nr_pages_left -= data->nr_pages_head; - for (i = 1; i < nr_folios; i++) { - unsigned int nr_unpin; - - new_array[i] = page_array[j]; - nr_unpin = min_t(unsigned int, nr_pages_left - 1, - data->nr_pages_mid - 1); - if (nr_unpin) - unpin_user_pages(&page_array[j+1], nr_unpin); - j += data->nr_pages_mid; - nr_pages_left -= data->nr_pages_mid; + for (i = 0, j = 0; i < nr_folios; i++) { + struct page *p = compound_head(page_array[j]); + struct folio *folio = page_folio(p); + unsigned int nr; + + WARN_ON_ONCE(i > 0 && p != page_array[j]); + + nr = i ? data->nr_pages_mid : data->nr_pages_head; + nr = min(nr, nr_pages_left); + /* Drop all but one ref, the entire folio will remain pinned. */ + if (nr > 1) + unpin_user_folio(folio, nr - 1); + j += nr; + nr_pages_left -= nr; + new_array[i] = p; } + + WARN_ON_ONCE(j != *nr_pages); + kvfree(page_array); *pages = new_array; *nr_pages = nr_folios; @@ -675,6 +724,7 @@ bool io_check_coalesce_buffer(struct page **page_array, int nr_pages, data->nr_pages_mid = folio_nr_pages(folio); data->folio_shift = folio_shift(folio); + data->first_folio_page_idx = folio_page_idx(folio, page_array[0]); /* * Check if pages are contiguous inside a folio, and all folios have @@ -725,13 +775,21 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, struct io_imu_folio_data data; bool coalesced = false; - if (!iov->iov_base) + if (!iov->iov_base) { + if (iov->iov_len) + return ERR_PTR(-EFAULT); + /* remove the buffer without installing a new one */ return NULL; + } + + ret = io_validate_user_buf_range((unsigned long)iov->iov_base, + iov->iov_len); + if (ret) + return ERR_PTR(ret); - node = io_rsrc_node_alloc(IORING_RSRC_BUFFER); + node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); if (!node) return ERR_PTR(-ENOMEM); - node->buf = NULL; ret = -ENOMEM; pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len, @@ -748,26 +806,32 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, coalesced = io_coalesce_buffer(&pages, &nr_pages, &data); } - imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); + imu = io_alloc_imu(ctx, nr_pages); if (!imu) goto done; + imu->nr_bvecs = nr_pages; ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage); - if (ret) { - unpin_user_pages(pages, nr_pages); + if (ret) goto done; - } size = iov->iov_len; /* store original address for later verification */ imu->ubuf = (unsigned long) iov->iov_base; imu->len = iov->iov_len; - imu->nr_bvecs = nr_pages; imu->folio_shift = PAGE_SHIFT; + imu->release = io_release_ubuf; + imu->priv = imu; + imu->flags = 0; + imu->dir = IO_IMU_DEST | IO_IMU_SOURCE; if (coalesced) imu->folio_shift = data.folio_shift; refcount_set(&imu->refs, 1); - off = (unsigned long) iov->iov_base & ((1UL << imu->folio_shift) - 1); + + off = (unsigned long)iov->iov_base & ~PAGE_MASK; + if (coalesced) + off += data.first_folio_page_idx << PAGE_SHIFT; + node->buf = imu; ret = 0; @@ -781,9 +845,13 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, } done: if (ret) { - kvfree(imu); - if (node) - io_put_rsrc_node(ctx, node); + if (imu) + io_free_imu(ctx, imu); + if (pages) { + for (i = 0; i < nr_pages; i++) + unpin_user_folio(page_folio(pages[i]), 1); + } + io_cache_free(&ctx->node_cache, node); node = ERR_PTR(ret); } kvfree(pages); @@ -818,15 +886,12 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, if (arg) { uvec = (struct iovec __user *) arg; - iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat); + iov = iovec_from_user(uvec, 1, 1, &fast_iov, io_is_compat(ctx)); if (IS_ERR(iov)) { ret = PTR_ERR(iov); break; } - ret = io_buffer_validate(iov); - if (ret) - break; - if (ctx->compat) + if (io_is_compat(ctx)) arg += sizeof(struct compat_iovec); else arg += sizeof(struct iovec); @@ -855,68 +920,220 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, } ctx->buf_table = data; - if (ret) + if (ret) { + io_clear_table_tags(&ctx->buf_table); io_sqe_buffers_unregister(ctx); + } return ret; } -int io_import_fixed(int ddir, struct iov_iter *iter, - struct io_mapped_ubuf *imu, - u64 buf_addr, size_t len) +int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq, + void (*release)(void *), unsigned int index, + unsigned int issue_flags) +{ + struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx; + struct io_rsrc_data *data = &ctx->buf_table; + struct req_iterator rq_iter; + struct io_mapped_ubuf *imu; + struct io_rsrc_node *node; + struct bio_vec bv; + unsigned int nr_bvecs = 0; + int ret = 0; + + io_ring_submit_lock(ctx, issue_flags); + if (index >= data->nr) { + ret = -EINVAL; + goto unlock; + } + index = array_index_nospec(index, data->nr); + + if (data->nodes[index]) { + ret = -EBUSY; + goto unlock; + } + + node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); + if (!node) { + ret = -ENOMEM; + goto unlock; + } + + /* + * blk_rq_nr_phys_segments() may overestimate the number of bvecs + * but avoids needing to iterate over the bvecs + */ + imu = io_alloc_imu(ctx, blk_rq_nr_phys_segments(rq)); + if (!imu) { + io_cache_free(&ctx->node_cache, node); + ret = -ENOMEM; + goto unlock; + } + + imu->ubuf = 0; + imu->len = blk_rq_bytes(rq); + imu->acct_pages = 0; + imu->folio_shift = PAGE_SHIFT; + refcount_set(&imu->refs, 1); + imu->release = release; + imu->priv = rq; + imu->flags = IO_REGBUF_F_KBUF; + imu->dir = 1 << rq_data_dir(rq); + + rq_for_each_bvec(bv, rq, rq_iter) + imu->bvec[nr_bvecs++] = bv; + imu->nr_bvecs = nr_bvecs; + + node->buf = imu; + data->nodes[index] = node; +unlock: + io_ring_submit_unlock(ctx, issue_flags); + return ret; +} +EXPORT_SYMBOL_GPL(io_buffer_register_bvec); + +int io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index, + unsigned int issue_flags) +{ + struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx; + struct io_rsrc_data *data = &ctx->buf_table; + struct io_rsrc_node *node; + int ret = 0; + + io_ring_submit_lock(ctx, issue_flags); + if (index >= data->nr) { + ret = -EINVAL; + goto unlock; + } + index = array_index_nospec(index, data->nr); + + node = data->nodes[index]; + if (!node) { + ret = -EINVAL; + goto unlock; + } + if (!(node->buf->flags & IO_REGBUF_F_KBUF)) { + ret = -EBUSY; + goto unlock; + } + + io_put_rsrc_node(ctx, node); + data->nodes[index] = NULL; +unlock: + io_ring_submit_unlock(ctx, issue_flags); + return ret; +} +EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec); + +static int validate_fixed_range(u64 buf_addr, size_t len, + const struct io_mapped_ubuf *imu) { u64 buf_end; - size_t offset; - if (WARN_ON_ONCE(!imu)) - return -EFAULT; if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end))) return -EFAULT; /* not inside the mapped region */ if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len))) return -EFAULT; + if (unlikely(len > MAX_RW_COUNT)) + return -EFAULT; + return 0; +} + +static int io_import_kbuf(int ddir, struct iov_iter *iter, + struct io_mapped_ubuf *imu, size_t len, size_t offset) +{ + size_t count = len + offset; + + iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, count); + iov_iter_advance(iter, offset); + return 0; +} + +static int io_import_fixed(int ddir, struct iov_iter *iter, + struct io_mapped_ubuf *imu, + u64 buf_addr, size_t len) +{ + const struct bio_vec *bvec; + size_t folio_mask; + unsigned nr_segs; + size_t offset; + int ret; + + ret = validate_fixed_range(buf_addr, len, imu); + if (unlikely(ret)) + return ret; + if (!(imu->dir & (1 << ddir))) + return -EFAULT; + if (unlikely(!len)) { + iov_iter_bvec(iter, ddir, NULL, 0, 0); + return 0; + } - /* - * Might not be a start of buffer, set size appropriately - * and advance us to the beginning. - */ offset = buf_addr - imu->ubuf; - iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, len); - if (offset) { - /* - * Don't use iov_iter_advance() here, as it's really slow for - * using the latter parts of a big fixed buffer - it iterates - * over each segment manually. We can cheat a bit here, because - * we know that: - * - * 1) it's a BVEC iter, we set it up - * 2) all bvecs are the same in size, except potentially the - * first and last bvec - * - * So just find our index, and adjust the iterator afterwards. - * If the offset is within the first bvec (or the whole first - * bvec, just use iov_iter_advance(). This makes it easier - * since we can just skip the first segment, which may not - * be folio_size aligned. - */ - const struct bio_vec *bvec = imu->bvec; + if (imu->flags & IO_REGBUF_F_KBUF) + return io_import_kbuf(ddir, iter, imu, len, offset); - if (offset < bvec->bv_len) { - iter->iov_offset = offset; - } else { - unsigned long seg_skip; + /* + * Don't use iov_iter_advance() here, as it's really slow for + * using the latter parts of a big fixed buffer - it iterates + * over each segment manually. We can cheat a bit here for user + * registered nodes, because we know that: + * + * 1) it's a BVEC iter, we set it up + * 2) all bvecs are the same in size, except potentially the + * first and last bvec + */ + folio_mask = (1UL << imu->folio_shift) - 1; + bvec = imu->bvec; + if (offset >= bvec->bv_len) { + unsigned long seg_skip; + + /* skip first vec */ + offset -= bvec->bv_len; + seg_skip = 1 + (offset >> imu->folio_shift); + bvec += seg_skip; + offset &= folio_mask; + } + nr_segs = (offset + len + bvec->bv_offset + folio_mask) >> imu->folio_shift; + iov_iter_bvec(iter, ddir, bvec, nr_segs, len); + iter->iov_offset = offset; + return 0; +} + +inline struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req, + unsigned issue_flags) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_rsrc_node *node; - /* skip first vec */ - offset -= bvec->bv_len; - seg_skip = 1 + (offset >> imu->folio_shift); + if (req->flags & REQ_F_BUF_NODE) + return req->buf_node; + req->flags |= REQ_F_BUF_NODE; - iter->bvec += seg_skip; - iter->nr_segs -= seg_skip; - iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1); - } + io_ring_submit_lock(ctx, issue_flags); + node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index); + if (node) { + node->refs++; + req->buf_node = node; + io_ring_submit_unlock(ctx, issue_flags); + return node; } + req->flags &= ~REQ_F_BUF_NODE; + io_ring_submit_unlock(ctx, issue_flags); + return NULL; +} - return 0; +int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter, + u64 buf_addr, size_t len, int ddir, + unsigned issue_flags) +{ + struct io_rsrc_node *node; + + node = io_find_buf_node(req, issue_flags); + if (!node) + return -EFAULT; + return io_import_fixed(ddir, iter, node->buf, buf_addr, len); } /* Lock two rings at once. The rings must be different! */ @@ -954,44 +1171,35 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx return -EBUSY; nbufs = src_ctx->buf_table.nr; + if (!nbufs) + return -ENXIO; if (!arg->nr) arg->nr = nbufs; else if (arg->nr > nbufs) return -EINVAL; else if (arg->nr > IORING_MAX_REG_BUFFERS) return -EINVAL; + if (check_add_overflow(arg->nr, arg->src_off, &off) || off > nbufs) + return -EOVERFLOW; if (check_add_overflow(arg->nr, arg->dst_off, &nbufs)) return -EOVERFLOW; + if (nbufs > IORING_MAX_REG_BUFFERS) + return -EINVAL; ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.nr)); if (ret) return ret; - /* Fill entries in data from dst that won't overlap with src */ + /* Copy original dst nodes from before the cloned range */ for (i = 0; i < min(arg->dst_off, ctx->buf_table.nr); i++) { - struct io_rsrc_node *src_node = ctx->buf_table.nodes[i]; + struct io_rsrc_node *node = ctx->buf_table.nodes[i]; - if (src_node) { - data.nodes[i] = src_node; - src_node->refs++; + if (node) { + data.nodes[i] = node; + node->refs++; } } - ret = -ENXIO; - nbufs = src_ctx->buf_table.nr; - if (!nbufs) - goto out_free; - ret = -EINVAL; - if (!arg->nr) - arg->nr = nbufs; - else if (arg->nr > nbufs) - goto out_free; - ret = -EOVERFLOW; - if (check_add_overflow(arg->nr, arg->src_off, &off)) - goto out_free; - if (off > nbufs) - goto out_free; - off = arg->dst_off; i = arg->src_off; nr = arg->nr; @@ -1002,10 +1210,10 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx if (!src_node) { dst_node = NULL; } else { - dst_node = io_rsrc_node_alloc(IORING_RSRC_BUFFER); + dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); if (!dst_node) { - ret = -ENOMEM; - goto out_free; + io_rsrc_data_free(ctx, &data); + return -ENOMEM; } refcount_inc(&src_node->buf->refs); @@ -1015,6 +1223,16 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx i++; } + /* Copy original dst nodes from after the cloned range */ + for (i = nbufs; i < ctx->buf_table.nr; i++) { + struct io_rsrc_node *node = ctx->buf_table.nodes[i]; + + if (node) { + data.nodes[i] = node; + node->refs++; + } + } + /* * If asked for replace, put the old table. data->nodes[] holds both * old and new nodes at this point. @@ -1031,10 +1249,6 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx WARN_ON_ONCE(ctx->buf_table.nr); ctx->buf_table = data; return 0; - -out_free: - io_rsrc_data_free(ctx, &data); - return ret; } /* @@ -1062,7 +1276,7 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg) return -EINVAL; registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0; - file = io_uring_register_get_file(buf.src_fd, registered_src); + file = io_uring_ctx_get_file(buf.src_fd, registered_src); if (IS_ERR(file)) return PTR_ERR(file); @@ -1070,13 +1284,280 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg) if (src_ctx != ctx) { mutex_unlock(&ctx->uring_lock); lock_two_rings(ctx, src_ctx); + + if (src_ctx->submitter_task && + src_ctx->submitter_task != current) { + ret = -EEXIST; + goto out; + } } ret = io_clone_buffers(ctx, src_ctx, &buf); +out: if (src_ctx != ctx) mutex_unlock(&src_ctx->uring_lock); - fput(file); + if (!registered_src) + fput(file); return ret; } + +void io_vec_free(struct iou_vec *iv) +{ + if (!iv->iovec) + return; + kfree(iv->iovec); + iv->iovec = NULL; + iv->nr = 0; +} + +int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries) +{ + gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_NOWARN; + struct iovec *iov; + + iov = kmalloc_objs(iov[0], nr_entries, gfp); + if (!iov) + return -ENOMEM; + + io_vec_free(iv); + iv->iovec = iov; + iv->nr = nr_entries; + return 0; +} + +static int io_vec_fill_bvec(int ddir, struct iov_iter *iter, + struct io_mapped_ubuf *imu, + struct iovec *iovec, unsigned nr_iovs, + struct iou_vec *vec) +{ + unsigned long folio_size = 1 << imu->folio_shift; + unsigned long folio_mask = folio_size - 1; + struct bio_vec *res_bvec = vec->bvec; + size_t total_len = 0; + unsigned bvec_idx = 0; + unsigned iov_idx; + + for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) { + size_t iov_len = iovec[iov_idx].iov_len; + u64 buf_addr = (u64)(uintptr_t)iovec[iov_idx].iov_base; + struct bio_vec *src_bvec; + size_t offset; + int ret; + + ret = validate_fixed_range(buf_addr, iov_len, imu); + if (unlikely(ret)) + return ret; + + if (unlikely(!iov_len)) + return -EFAULT; + if (unlikely(check_add_overflow(total_len, iov_len, &total_len))) + return -EOVERFLOW; + + offset = buf_addr - imu->ubuf; + /* + * Only the first bvec can have non zero bv_offset, account it + * here and work with full folios below. + */ + offset += imu->bvec[0].bv_offset; + + src_bvec = imu->bvec + (offset >> imu->folio_shift); + offset &= folio_mask; + + for (; iov_len; offset = 0, bvec_idx++, src_bvec++) { + size_t seg_size = min_t(size_t, iov_len, + folio_size - offset); + + bvec_set_page(&res_bvec[bvec_idx], + src_bvec->bv_page, seg_size, offset); + iov_len -= seg_size; + } + } + if (total_len > MAX_RW_COUNT) + return -EINVAL; + + iov_iter_bvec(iter, ddir, res_bvec, bvec_idx, total_len); + return 0; +} + +static int io_estimate_bvec_size(struct iovec *iov, unsigned nr_iovs, + struct io_mapped_ubuf *imu) +{ + unsigned shift = imu->folio_shift; + size_t max_segs = 0; + unsigned i; + + for (i = 0; i < nr_iovs; i++) { + max_segs += (iov[i].iov_len >> shift) + 2; + if (max_segs > INT_MAX) + return -EOVERFLOW; + } + return max_segs; +} + +static int io_vec_fill_kern_bvec(int ddir, struct iov_iter *iter, + struct io_mapped_ubuf *imu, + struct iovec *iovec, unsigned nr_iovs, + struct iou_vec *vec) +{ + const struct bio_vec *src_bvec = imu->bvec; + struct bio_vec *res_bvec = vec->bvec; + unsigned res_idx = 0; + size_t total_len = 0; + unsigned iov_idx; + + for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) { + size_t offset = (size_t)(uintptr_t)iovec[iov_idx].iov_base; + size_t iov_len = iovec[iov_idx].iov_len; + struct bvec_iter bi = { + .bi_size = offset + iov_len, + }; + struct bio_vec bv; + + bvec_iter_advance(src_bvec, &bi, offset); + for_each_mp_bvec(bv, src_bvec, bi, bi) + res_bvec[res_idx++] = bv; + total_len += iov_len; + } + iov_iter_bvec(iter, ddir, res_bvec, res_idx, total_len); + return 0; +} + +static int iov_kern_bvec_size(const struct iovec *iov, + const struct io_mapped_ubuf *imu, + unsigned int *nr_seg) +{ + size_t offset = (size_t)(uintptr_t)iov->iov_base; + const struct bio_vec *bvec = imu->bvec; + int start = 0, i = 0; + size_t off = 0; + int ret; + + ret = validate_fixed_range(offset, iov->iov_len, imu); + if (unlikely(ret)) + return ret; + + for (i = 0; off < offset + iov->iov_len && i < imu->nr_bvecs; + off += bvec[i].bv_len, i++) { + if (offset >= off && offset < off + bvec[i].bv_len) + start = i; + } + *nr_seg = i - start; + return 0; +} + +static int io_kern_bvec_size(struct iovec *iov, unsigned nr_iovs, + struct io_mapped_ubuf *imu, unsigned *nr_segs) +{ + unsigned max_segs = 0; + size_t total_len = 0; + unsigned i; + int ret; + + *nr_segs = 0; + for (i = 0; i < nr_iovs; i++) { + if (unlikely(!iov[i].iov_len)) + return -EFAULT; + if (unlikely(check_add_overflow(total_len, iov[i].iov_len, + &total_len))) + return -EOVERFLOW; + ret = iov_kern_bvec_size(&iov[i], imu, &max_segs); + if (unlikely(ret)) + return ret; + *nr_segs += max_segs; + } + if (total_len > MAX_RW_COUNT) + return -EINVAL; + return 0; +} + +int io_import_reg_vec(int ddir, struct iov_iter *iter, + struct io_kiocb *req, struct iou_vec *vec, + unsigned nr_iovs, unsigned issue_flags) +{ + struct io_rsrc_node *node; + struct io_mapped_ubuf *imu; + unsigned iovec_off; + struct iovec *iov; + unsigned nr_segs; + + node = io_find_buf_node(req, issue_flags); + if (!node) + return -EFAULT; + imu = node->buf; + if (!(imu->dir & (1 << ddir))) + return -EFAULT; + + iovec_off = vec->nr - nr_iovs; + iov = vec->iovec + iovec_off; + + if (imu->flags & IO_REGBUF_F_KBUF) { + int ret = io_kern_bvec_size(iov, nr_iovs, imu, &nr_segs); + + if (unlikely(ret)) + return ret; + } else { + int ret = io_estimate_bvec_size(iov, nr_iovs, imu); + + if (ret < 0) + return ret; + nr_segs = ret; + } + + if (sizeof(struct bio_vec) > sizeof(struct iovec)) { + size_t bvec_bytes; + + bvec_bytes = nr_segs * sizeof(struct bio_vec); + nr_segs = (bvec_bytes + sizeof(*iov) - 1) / sizeof(*iov); + nr_segs += nr_iovs; + } + + if (nr_segs > vec->nr) { + struct iou_vec tmp_vec = {}; + int ret; + + ret = io_vec_realloc(&tmp_vec, nr_segs); + if (ret) + return ret; + + iovec_off = tmp_vec.nr - nr_iovs; + memcpy(tmp_vec.iovec + iovec_off, iov, sizeof(*iov) * nr_iovs); + io_vec_free(vec); + + *vec = tmp_vec; + iov = vec->iovec + iovec_off; + req->flags |= REQ_F_NEED_CLEANUP; + } + + if (imu->flags & IO_REGBUF_F_KBUF) + return io_vec_fill_kern_bvec(ddir, iter, imu, iov, nr_iovs, vec); + + return io_vec_fill_bvec(ddir, iter, imu, iov, nr_iovs, vec); +} + +int io_prep_reg_iovec(struct io_kiocb *req, struct iou_vec *iv, + const struct iovec __user *uvec, size_t uvec_segs) +{ + struct iovec *iov; + int iovec_off, ret; + void *res; + + if (uvec_segs > iv->nr) { + ret = io_vec_realloc(iv, uvec_segs); + if (ret) + return ret; + req->flags |= REQ_F_NEED_CLEANUP; + } + + /* pad iovec to the right */ + iovec_off = iv->nr - uvec_segs; + iov = iv->iovec + iovec_off; + res = iovec_from_user(uvec, uvec_segs, uvec_segs, iov, + io_is_compat(req->ctx)); + if (IS_ERR(res)) + return PTR_ERR(res); + + req->flags |= REQ_F_IMPORT_BUFFER; + return 0; +} diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 89ea0135a1a0..44e3386f7c1c 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -2,8 +2,11 @@ #ifndef IOU_RSRC_H #define IOU_RSRC_H +#include <linux/io_uring_types.h> #include <linux/lockdep.h> +#define IO_VEC_CACHE_SOFT_CAP 256 + enum { IORING_RSRC_FILE = 0, IORING_RSRC_BUFFER = 1, @@ -20,6 +23,15 @@ struct io_rsrc_node { }; }; +enum { + IO_IMU_DEST = 1 << ITER_DEST, + IO_IMU_SOURCE = 1 << ITER_SOURCE, +}; + +enum { + IO_REGBUF_F_KBUF = 1, +}; + struct io_mapped_ubuf { u64 ubuf; unsigned int len; @@ -27,6 +39,10 @@ struct io_mapped_ubuf { unsigned int folio_shift; refcount_t refs; unsigned long acct_pages; + void (*release)(void *); + void *priv; + u8 flags; + u8 dir; struct bio_vec bvec[] __counted_by(nr_bvecs); }; @@ -37,16 +53,26 @@ struct io_imu_folio_data { unsigned int nr_pages_mid; unsigned int folio_shift; unsigned int nr_folios; + unsigned long first_folio_page_idx; }; -struct io_rsrc_node *io_rsrc_node_alloc(int type); +bool io_rsrc_cache_init(struct io_ring_ctx *ctx); +void io_rsrc_cache_free(struct io_ring_ctx *ctx); +struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type); void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node); void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data); int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr); -int io_import_fixed(int ddir, struct iov_iter *iter, - struct io_mapped_ubuf *imu, - u64 buf_addr, size_t len); +struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req, + unsigned issue_flags); +int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter, + u64 buf_addr, size_t len, int ddir, + unsigned issue_flags); +int io_import_reg_vec(int ddir, struct iov_iter *iter, + struct io_kiocb *req, struct iou_vec *vec, + unsigned nr_iovs, unsigned issue_flags); +int io_prep_reg_iovec(struct io_kiocb *req, struct iou_vec *iv, + const struct iovec __user *uvec, size_t uvec_segs); int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg); int io_sqe_buffers_unregister(struct io_ring_ctx *ctx); @@ -62,12 +88,13 @@ int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, unsigned size, unsigned type); int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, unsigned int size, unsigned int type); +int io_validate_user_buf_range(u64 uaddr, u64 ulen); bool io_check_coalesce_buffer(struct page **page_array, int nr_pages, struct io_imu_folio_data *data); static inline struct io_rsrc_node *io_rsrc_node_lookup(struct io_rsrc_data *data, - int index) + unsigned int index) { if (index < data->nr) return data->nodes[array_index_nospec(index, data->nr)]; @@ -77,15 +104,20 @@ static inline struct io_rsrc_node *io_rsrc_node_lookup(struct io_rsrc_data *data static inline void io_put_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) { lockdep_assert_held(&ctx->uring_lock); - if (node && !--node->refs) + if (!--node->refs) io_free_rsrc_node(ctx, node); } static inline bool io_reset_rsrc_node(struct io_ring_ctx *ctx, - struct io_rsrc_data *data, int index) + struct io_rsrc_data *data, + unsigned int index) { - struct io_rsrc_node *node = data->nodes[index]; + struct io_rsrc_node *node; + if (index >= data->nr) + return false; + index = array_index_nospec(index, data->nr); + node = data->nodes[index]; if (!node) return false; io_put_rsrc_node(ctx, node); @@ -93,36 +125,14 @@ static inline bool io_reset_rsrc_node(struct io_ring_ctx *ctx, return true; } -static inline void io_req_put_rsrc_nodes(struct io_kiocb *req) -{ - if (req->file_node) { - io_put_rsrc_node(req->ctx, req->file_node); - req->file_node = NULL; - } - if (req->flags & REQ_F_BUF_NODE) { - io_put_rsrc_node(req->ctx, req->buf_node); - req->buf_node = NULL; - } -} - -static inline void io_req_assign_rsrc_node(struct io_rsrc_node **dst_node, - struct io_rsrc_node *node) -{ - node->refs++; - *dst_node = node; -} - -static inline void io_req_assign_buf_node(struct io_kiocb *req, - struct io_rsrc_node *node) -{ - io_req_assign_rsrc_node(&req->buf_node, node); - req->flags |= REQ_F_BUF_NODE; -} - int io_files_update(struct io_kiocb *req, unsigned int issue_flags); int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int __io_account_mem(struct user_struct *user, unsigned long nr_pages); +int io_account_mem(struct user_struct *user, struct mm_struct *mm_account, + unsigned long nr_pages); +void io_unaccount_mem(struct user_struct *user, struct mm_struct *mm_account, + unsigned long nr_pages); static inline void __io_unaccount_mem(struct user_struct *user, unsigned long nr_pages) @@ -130,4 +140,21 @@ static inline void __io_unaccount_mem(struct user_struct *user, atomic_long_sub(nr_pages, &user->locked_vm); } +void io_vec_free(struct iou_vec *iv); +int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries); + +static inline void io_vec_reset_iovec(struct iou_vec *iv, + struct iovec *iovec, unsigned nr) +{ + io_vec_free(iv); + iv->iovec = iovec; + iv->nr = nr; +} + +static inline void io_alloc_cache_vec_kasan(struct iou_vec *iv) +{ + if (IS_ENABLED(CONFIG_KASAN)) + io_vec_free(iv); +} + #endif diff --git a/io_uring/rw.c b/io_uring/rw.c index e5528cebcd06..0c4834645279 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -15,6 +15,7 @@ #include <uapi/linux/io_uring.h> +#include "filetable.h" #include "io_uring.h" #include "opdef.h" #include "kbuf.h" @@ -49,24 +50,16 @@ static bool io_file_supports_nowait(struct io_kiocb *req, __poll_t mask) return false; } -#ifdef CONFIG_COMPAT static int io_iov_compat_buffer_select_prep(struct io_rw *rw) { - struct compat_iovec __user *uiov; - compat_ssize_t clen; + struct compat_iovec __user *uiov = u64_to_user_ptr(rw->addr); + struct compat_iovec iov; - uiov = u64_to_user_ptr(rw->addr); - if (!access_ok(uiov, sizeof(*uiov))) - return -EFAULT; - if (__get_user(clen, &uiov->iov_len)) + if (copy_from_user(&iov, uiov, sizeof(iov))) return -EFAULT; - if (clen < 0) - return -EINVAL; - - rw->len = clen; + rw->len = iov.iov_len; return 0; } -#endif static int io_iov_buffer_select_prep(struct io_kiocb *req) { @@ -77,10 +70,8 @@ static int io_iov_buffer_select_prep(struct io_kiocb *req) if (rw->len != 1) return -EINVAL; -#ifdef CONFIG_COMPAT - if (req->ctx->compat) + if (io_is_compat(req->ctx)) return io_iov_compat_buffer_select_prep(rw); -#endif uiov = u64_to_user_ptr(rw->addr); if (copy_from_user(&iov, uiov, sizeof(*uiov))) @@ -89,59 +80,63 @@ static int io_iov_buffer_select_prep(struct io_kiocb *req) return 0; } -static int __io_import_iovec(int ddir, struct io_kiocb *req, - struct io_async_rw *io, - unsigned int issue_flags) +static int io_import_vec(int ddir, struct io_kiocb *req, + struct io_async_rw *io, + const struct iovec __user *uvec, + size_t uvec_segs) { - const struct io_issue_def *def = &io_issue_defs[req->opcode]; - struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + int ret, nr_segs; struct iovec *iov; - void __user *buf; - int nr_segs, ret; - size_t sqe_len; - - buf = u64_to_user_ptr(rw->addr); - sqe_len = rw->len; - - if (!def->vectored || req->flags & REQ_F_BUFFER_SELECT) { - if (io_do_buffer_select(req)) { - buf = io_buffer_select(req, &sqe_len, issue_flags); - if (!buf) - return -ENOBUFS; - rw->addr = (unsigned long) buf; - rw->len = sqe_len; - } - return import_ubuf(ddir, buf, sqe_len, &io->iter); - } - - if (io->free_iovec) { - nr_segs = io->free_iov_nr; - iov = io->free_iovec; + if (io->vec.iovec) { + nr_segs = io->vec.nr; + iov = io->vec.iovec; } else { - iov = &io->fast_iov; nr_segs = 1; + iov = &io->fast_iov; } - ret = __import_iovec(ddir, buf, sqe_len, nr_segs, &iov, &io->iter, - req->ctx->compat); + + ret = __import_iovec(ddir, uvec, uvec_segs, nr_segs, &iov, &io->iter, + io_is_compat(req->ctx)); if (unlikely(ret < 0)) return ret; if (iov) { req->flags |= REQ_F_NEED_CLEANUP; - io->free_iov_nr = io->iter.nr_segs; - kfree(io->free_iovec); - io->free_iovec = iov; + io_vec_reset_iovec(&io->vec, iov, io->iter.nr_segs); } return 0; } -static inline int io_import_iovec(int rw, struct io_kiocb *req, - struct io_async_rw *io, - unsigned int issue_flags) +static int __io_import_rw_buffer(int ddir, struct io_kiocb *req, + struct io_async_rw *io, struct io_br_sel *sel, + unsigned int issue_flags) +{ + const struct io_issue_def *def = &io_issue_defs[req->opcode]; + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + size_t sqe_len = rw->len; + + sel->addr = u64_to_user_ptr(rw->addr); + if (def->vectored && !(req->flags & REQ_F_BUFFER_SELECT)) + return io_import_vec(ddir, req, io, sel->addr, sqe_len); + + if (io_do_buffer_select(req)) { + *sel = io_buffer_select(req, &sqe_len, io->buf_group, issue_flags); + if (!sel->addr) + return -ENOBUFS; + rw->addr = (unsigned long) sel->addr; + rw->len = sqe_len; + } + return import_ubuf(ddir, sel->addr, sqe_len, &io->iter); +} + +static inline int io_import_rw_buffer(int rw, struct io_kiocb *req, + struct io_async_rw *io, + struct io_br_sel *sel, + unsigned int issue_flags) { int ret; - ret = __io_import_iovec(rw, req, io, issue_flags); + ret = __io_import_rw_buffer(rw, req, io, sel, issue_flags); if (unlikely(ret < 0)) return ret; @@ -149,18 +144,22 @@ static inline int io_import_iovec(int rw, struct io_kiocb *req, return 0; } -static void io_rw_recycle(struct io_kiocb *req, unsigned int issue_flags) +static bool io_rw_recycle(struct io_kiocb *req, unsigned int issue_flags) { struct io_async_rw *rw = req->async_data; if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) - return; + return false; + + io_alloc_cache_vec_kasan(&rw->vec); + if (rw->vec.nr > IO_VEC_CACHE_SOFT_CAP) + io_vec_free(&rw->vec); - io_alloc_cache_kasan(&rw->free_iovec, &rw->free_iov_nr); if (io_alloc_cache_put(&req->ctx->rw_cache, rw)) { - req->async_data = NULL; - req->flags &= ~REQ_F_ASYNC_DATA; + io_req_async_data_clear(req, 0); + return true; } + return false; } static void io_req_rw_cleanup(struct io_kiocb *req, unsigned int issue_flags) @@ -190,11 +189,15 @@ static void io_req_rw_cleanup(struct io_kiocb *req, unsigned int issue_flags) * This is really a bug in the core code that does this, any issue * path should assume that a successful (or -EIOCBQUEUED) return can * mean that the underlying data can be gone at any time. But that - * should be fixed seperately, and then this check could be killed. + * should be fixed separately, and then this check could be killed. */ if (!(req->flags & (REQ_F_REISSUE | REQ_F_REFCOUNT))) { req->flags &= ~REQ_F_NEED_CLEANUP; - io_rw_recycle(req, issue_flags); + if (!io_rw_recycle(req, issue_flags)) { + struct io_async_rw *rw = req->async_data; + + io_vec_free(&rw->vec); + } } } @@ -206,26 +209,12 @@ static int io_rw_alloc_async(struct io_kiocb *req) rw = io_uring_alloc_async_data(&ctx->rw_cache, req); if (!rw) return -ENOMEM; - if (rw->free_iovec) + if (rw->vec.iovec) req->flags |= REQ_F_NEED_CLEANUP; rw->bytes_done = 0; return 0; } -static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import) -{ - struct io_async_rw *rw; - - if (io_rw_alloc_async(req)) - return -ENOMEM; - - if (!do_import || io_do_buffer_select(req)) - return 0; - - rw = req->async_data; - return io_import_iovec(ddir, req, rw, 0); -} - static inline void io_meta_save_state(struct io_async_rw *io) { io->meta_state.seed = io->meta.seed; @@ -241,7 +230,7 @@ static inline void io_meta_restore(struct io_async_rw *io, struct kiocb *kiocb) } static int io_prep_rw_pi(struct io_kiocb *req, struct io_rw *rw, int ddir, - u64 attr_ptr, u64 attr_type_mask) + u64 attr_ptr) { struct io_uring_attr_pi pi_attr; struct io_async_rw *io; @@ -267,17 +256,23 @@ static int io_prep_rw_pi(struct io_kiocb *req, struct io_rw *rw, int ddir, return ret; } -static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, - int ddir, bool do_import) +static int __io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, + int ddir) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + struct io_async_rw *io; unsigned ioprio; u64 attr_type_mask; int ret; + if (io_rw_alloc_async(req)) + return -ENOMEM; + io = req->async_data; + rw->kiocb.ki_pos = READ_ONCE(sqe->off); /* used for fixed read/write too - just read unconditionally */ req->buf_index = READ_ONCE(sqe->buf_index); + io->buf_group = req->buf_index; ioprio = READ_ONCE(sqe->ioprio); if (ioprio) { @@ -289,8 +284,8 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, } else { rw->kiocb.ki_ioprio = get_current_ioprio(); } - rw->kiocb.dio_complete = NULL; rw->kiocb.ki_flags = 0; + rw->kiocb.ki_write_stream = READ_ONCE(sqe->write_stream); if (req->ctx->flags & IORING_SETUP_IOPOLL) rw->kiocb.ki_complete = io_complete_rw_iopoll; @@ -299,11 +294,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, rw->addr = READ_ONCE(sqe->addr); rw->len = READ_ONCE(sqe->len); - rw->flags = READ_ONCE(sqe->rw_flags); - ret = io_prep_rw_setup(req, ddir, do_import); - - if (unlikely(ret)) - return ret; + rw->flags = (__force rwf_t) READ_ONCE(sqe->rw_flags); attr_type_mask = READ_ONCE(sqe->attr_type_mask); if (attr_type_mask) { @@ -314,36 +305,57 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, return -EINVAL; attr_ptr = READ_ONCE(sqe->attr_ptr); - ret = io_prep_rw_pi(req, rw, ddir, attr_ptr, attr_type_mask); + return io_prep_rw_pi(req, rw, ddir, attr_ptr); } - return ret; + return 0; +} + +static int io_rw_do_import(struct io_kiocb *req, int ddir) +{ + struct io_br_sel sel = { }; + + if (io_do_buffer_select(req)) + return 0; + + return io_import_rw_buffer(ddir, req, req->async_data, &sel, 0); +} + +static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, + int ddir) +{ + int ret; + + ret = __io_prep_rw(req, sqe, ddir); + if (unlikely(ret)) + return ret; + + return io_rw_do_import(req, ddir); } int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - return io_prep_rw(req, sqe, ITER_DEST, true); + return io_prep_rw(req, sqe, ITER_DEST); } int io_prep_write(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - return io_prep_rw(req, sqe, ITER_SOURCE, true); + return io_prep_rw(req, sqe, ITER_SOURCE); } static int io_prep_rwv(struct io_kiocb *req, const struct io_uring_sqe *sqe, int ddir) { - const bool do_import = !(req->flags & REQ_F_BUFFER_SELECT); int ret; - ret = io_prep_rw(req, sqe, ddir, do_import); + ret = io_prep_rw(req, sqe, ddir); if (unlikely(ret)) return ret; - if (do_import) + if (!(req->flags & REQ_F_BUFFER_SELECT)) return 0; /* * Have to do this validation here, as this is in io_read() rw->len - * might have chanaged due to buffer selection + * might have changed due to buffer selection */ return io_iov_buffer_select_prep(req); } @@ -358,38 +370,77 @@ int io_prep_writev(struct io_kiocb *req, const struct io_uring_sqe *sqe) return io_prep_rwv(req, sqe, ITER_SOURCE); } -static int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe, +static int io_init_rw_fixed(struct io_kiocb *req, unsigned int issue_flags, int ddir) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); - struct io_ring_ctx *ctx = req->ctx; - struct io_rsrc_node *node; - struct io_async_rw *io; + struct io_async_rw *io = req->async_data; int ret; - ret = io_prep_rw(req, sqe, ddir, false); - if (unlikely(ret)) - return ret; - - node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index); - if (!node) - return -EFAULT; - io_req_assign_buf_node(req, node); + if (io->bytes_done) + return 0; - io = req->async_data; - ret = io_import_fixed(ddir, &io->iter, node->buf, rw->addr, rw->len); + ret = io_import_reg_buf(req, &io->iter, rw->addr, rw->len, ddir, + issue_flags); iov_iter_save_state(&io->iter, &io->iter_state); return ret; } int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - return io_prep_rw_fixed(req, sqe, ITER_DEST); + return __io_prep_rw(req, sqe, ITER_DEST); } int io_prep_write_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - return io_prep_rw_fixed(req, sqe, ITER_SOURCE); + return __io_prep_rw(req, sqe, ITER_SOURCE); +} + +static int io_rw_import_reg_vec(struct io_kiocb *req, + struct io_async_rw *io, + int ddir, unsigned int issue_flags) +{ + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + unsigned uvec_segs = rw->len; + int ret; + + ret = io_import_reg_vec(ddir, &io->iter, req, &io->vec, + uvec_segs, issue_flags); + if (unlikely(ret)) + return ret; + iov_iter_save_state(&io->iter, &io->iter_state); + req->flags &= ~REQ_F_IMPORT_BUFFER; + return 0; +} + +static int io_rw_prep_reg_vec(struct io_kiocb *req) +{ + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + struct io_async_rw *io = req->async_data; + const struct iovec __user *uvec; + + uvec = u64_to_user_ptr(rw->addr); + return io_prep_reg_iovec(req, &io->vec, uvec, rw->len); +} + +int io_prep_readv_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + int ret; + + ret = __io_prep_rw(req, sqe, ITER_DEST); + if (unlikely(ret)) + return ret; + return io_rw_prep_reg_vec(req); +} + +int io_prep_writev_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + int ret; + + ret = __io_prep_rw(req, sqe, ITER_SOURCE); + if (unlikely(ret)) + return ret; + return io_rw_prep_reg_vec(req); } /* @@ -405,7 +456,7 @@ int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (!(req->flags & REQ_F_BUFFER_SELECT)) return -EINVAL; - ret = io_prep_rw(req, sqe, ITER_DEST, false); + ret = __io_prep_rw(req, sqe, ITER_DEST); if (unlikely(ret)) return ret; @@ -418,7 +469,10 @@ int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) void io_readv_writev_cleanup(struct io_kiocb *req) { + struct io_async_rw *rw = req->async_data; + lockdep_assert_held(&req->ctx->uring_lock); + io_vec_free(&rw->vec); io_rw_recycle(req, 0); } @@ -450,7 +504,7 @@ static bool io_rw_should_reissue(struct io_kiocb *req) if (!S_ISBLK(mode) && !S_ISREG(mode)) return false; if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() && - !(ctx->flags & IORING_SETUP_IOPOLL))) + !(req->flags & REQ_F_IOPOLL))) return false; /* * If ref is dying, we might be running poll reap from the exit work. @@ -497,7 +551,7 @@ static void __io_complete_rw_common(struct io_kiocb *req, long res) { if (res == req->cqe.res) return; - if (res == -EAGAIN && io_rw_should_reissue(req)) { + if ((res == -EOPNOTSUPP || res == -EAGAIN) && io_rw_should_reissue(req)) { req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE; } else { req_set_fail(req); @@ -519,24 +573,17 @@ static inline int io_fixup_rw_res(struct io_kiocb *req, long res) return res; } -void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts) +void io_req_rw_complete(struct io_tw_req tw_req, io_tw_token_t tw) { - struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); - struct kiocb *kiocb = &rw->kiocb; - - if ((kiocb->ki_flags & IOCB_DIO_CALLER_COMP) && kiocb->dio_complete) { - long res = kiocb->dio_complete(rw->kiocb.private); - - io_req_set_res(req, io_fixup_rw_res(req, res), 0); - } + struct io_kiocb *req = tw_req.req; io_req_io_end(req); if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) - req->cqe.flags |= io_put_kbuf(req, req->cqe.res, 0); + req->cqe.flags |= io_put_kbuf(req, max(req->cqe.res, 0), NULL); io_req_rw_cleanup(req, 0); - io_req_task_complete(req, ts); + io_req_task_complete(tw_req, tw); } static void io_complete_rw(struct kiocb *kiocb, long res) @@ -544,10 +591,8 @@ static void io_complete_rw(struct kiocb *kiocb, long res) struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb); struct io_kiocb *req = cmd_to_io_kiocb(rw); - if (!kiocb->dio_complete || !(kiocb->ki_flags & IOCB_DIO_CALLER_COMP)) { - __io_complete_rw_common(req, res); - io_req_set_res(req, io_fixup_rw_res(req, res), 0); - } + __io_complete_rw_common(req, res); + io_req_set_res(req, io_fixup_rw_res(req, res), 0); req->io_task_work.func = io_req_rw_complete; __io_req_task_work_add(req, IOU_F_TWQ_LAZY_WAKE); } @@ -595,30 +640,34 @@ static inline void io_rw_done(struct io_kiocb *req, ssize_t ret) } } - if (req->ctx->flags & IORING_SETUP_IOPOLL) + if (req->flags & REQ_F_IOPOLL) io_complete_rw_iopoll(&rw->kiocb, ret); else io_complete_rw(&rw->kiocb, ret); } static int kiocb_done(struct io_kiocb *req, ssize_t ret, - unsigned int issue_flags) + struct io_br_sel *sel, unsigned int issue_flags) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); unsigned final_ret = io_fixup_rw_res(req, ret); if (ret >= 0 && req->flags & REQ_F_CUR_POS) req->file->f_pos = rw->kiocb.ki_pos; - if (ret >= 0 && !(req->ctx->flags & IORING_SETUP_IOPOLL)) { + if (ret >= 0 && !(req->flags & REQ_F_IOPOLL)) { + u32 cflags = 0; + __io_complete_rw_common(req, ret); /* * Safe to call io_end from here as we're inline * from the submission path. */ io_req_io_end(req); - io_req_set_res(req, final_ret, io_put_kbuf(req, ret, issue_flags)); + if (sel) + cflags = io_put_kbuf(req, ret, sel->buf_list); + io_req_set_res(req, final_ret, cflags); io_req_rw_cleanup(req, issue_flags); - return IOU_OK; + return IOU_COMPLETE; } else { io_rw_done(req, ret); } @@ -637,6 +686,7 @@ static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) */ static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter) { + struct io_kiocb *req = cmd_to_io_kiocb(rw); struct kiocb *kiocb = &rw->kiocb; struct file *file = kiocb->ki_filp; ssize_t ret = 0; @@ -652,6 +702,9 @@ static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter) if ((kiocb->ki_flags & IOCB_NOWAIT) && !(kiocb->ki_filp->f_flags & O_NONBLOCK)) return -EAGAIN; + if ((req->flags & REQ_F_BUF_NODE) && + (req->buf_node->buf->flags & IO_REGBUF_F_KBUF)) + return -EFAULT; ppos = io_kiocb_ppos(kiocb); @@ -810,7 +863,6 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type) ret = kiocb_set_rw_flags(kiocb, rw->flags, rw_type); if (unlikely(ret)) return ret; - kiocb->ki_flags |= IOCB_ALLOC_CACHE; /* * If the file is marked O_NONBLOCK, still allow retry for it if it @@ -824,6 +876,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type) if (ctx->flags & IORING_SETUP_IOPOLL) { if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll) return -EOPNOTSUPP; + req->flags |= REQ_F_IOPOLL; kiocb->private = NULL; kiocb->ki_flags |= IOCB_HIPRI; req->iopoll_completed = 0; @@ -840,11 +893,14 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type) if (req->flags & REQ_F_HAS_METADATA) { struct io_async_rw *io = req->async_data; + if (!(file->f_mode & FMODE_HAS_METADATA)) + return -EINVAL; + /* * We have a union of meta fields with wpq used for buffered-io * in io_async_rw, so fail it here. */ - if (!(req->file->f_flags & O_DIRECT)) + if (!(file->f_flags & O_DIRECT)) return -EOPNOTSUPP; kiocb->ki_flags |= IOCB_HAS_METADATA; kiocb->private = &io->meta; @@ -853,7 +909,8 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type) return 0; } -static int __io_read(struct io_kiocb *req, unsigned int issue_flags) +static int __io_read(struct io_kiocb *req, struct io_br_sel *sel, + unsigned int issue_flags) { bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); @@ -862,8 +919,12 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) ssize_t ret; loff_t *ppos; - if (io_do_buffer_select(req)) { - ret = io_import_iovec(ITER_DEST, req, io, issue_flags); + if (req->flags & REQ_F_IMPORT_BUFFER) { + ret = io_rw_import_reg_vec(req, io, ITER_DEST, issue_flags); + if (unlikely(ret)) + return ret; + } else if (io_do_buffer_select(req)) { + ret = io_import_rw_buffer(ITER_DEST, req, io, sel, issue_flags); if (unlikely(ret < 0)) return ret; } @@ -901,13 +962,13 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) if (ret == -EAGAIN) { /* If we can poll, just do that. */ if (io_file_can_poll(req)) - return -EAGAIN; + return ret; /* IOPOLL retry should happen for io-wq threads */ - if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL)) - goto done; + if (!force_nonblock && !(req->flags & REQ_F_IOPOLL)) + return ret; /* no retry on NONBLOCK nor RWF_NOWAIT */ if (req->flags & REQ_F_NOWAIT) - goto done; + return ret; ret = 0; } else if (ret == -EIOCBQUEUED) { return IOU_ISSUE_SKIP_COMPLETE; @@ -915,7 +976,7 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) (req->flags & REQ_F_NOWAIT) || !need_complete_io(req) || (issue_flags & IO_URING_F_MULTISHOT)) { /* read all, failed, already did sync or don't want to retry */ - goto done; + return ret; } /* @@ -958,25 +1019,28 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) kiocb->ki_flags &= ~IOCB_WAITQ; iov_iter_restore(&io->iter, &io->iter_state); } while (ret > 0); -done: - /* it's faster to check here then delegate to kfree */ + return ret; } int io_read(struct io_kiocb *req, unsigned int issue_flags) { + struct io_br_sel sel = { }; int ret; - ret = __io_read(req, issue_flags); + ret = __io_read(req, &sel, issue_flags); if (ret >= 0) - return kiocb_done(req, ret, issue_flags); + return kiocb_done(req, ret, &sel, issue_flags); + if (req->flags & REQ_F_BUFFERS_COMMIT) + io_kbuf_recycle(req, sel.buf_list, issue_flags); return ret; } int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + struct io_br_sel sel = { }; unsigned int cflags = 0; int ret; @@ -988,7 +1052,7 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) /* make it sync, multishot doesn't support async execution */ rw->kiocb.ki_complete = NULL; - ret = __io_read(req, issue_flags); + ret = __io_read(req, &sel, issue_flags); /* * If we get -EAGAIN, recycle our buffer and just let normal poll @@ -999,17 +1063,15 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) * Reset rw->len to 0 again to avoid clamping future mshot * reads, in case the buffer size varies. */ - if (io_kbuf_recycle(req, issue_flags)) + if (io_kbuf_recycle(req, sel.buf_list, issue_flags)) rw->len = 0; - if (issue_flags & IO_URING_F_MULTISHOT) - return IOU_ISSUE_SKIP_COMPLETE; - return -EAGAIN; + return IOU_RETRY; } else if (ret <= 0) { - io_kbuf_recycle(req, issue_flags); + io_kbuf_recycle(req, sel.buf_list, issue_flags); if (ret < 0) req_set_fail(req); } else if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { - cflags = io_put_kbuf(req, ret, issue_flags); + cflags = io_put_kbuf(req, ret, sel.buf_list); } else { /* * Any successful return value will keep the multishot read @@ -1017,20 +1079,19 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) * we fail to post a CQE, or multishot is no longer set, then * jump to the termination path. This request is then done. */ - cflags = io_put_kbuf(req, ret, issue_flags); + cflags = io_put_kbuf(req, ret, sel.buf_list); rw->len = 0; /* similarly to above, reset len to 0 */ if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) { - if (issue_flags & IO_URING_F_MULTISHOT) { + if (issue_flags & IO_URING_F_MULTISHOT) /* * Force retry, as we might have more data to * be read and otherwise it won't get retried * until (if ever) another poll is triggered. */ io_poll_multishot_retry(req); - return IOU_ISSUE_SKIP_COMPLETE; - } - return -EAGAIN; + + return IOU_RETRY; } } @@ -1040,9 +1101,7 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) */ io_req_set_res(req, ret, cflags); io_req_rw_cleanup(req, issue_flags); - if (issue_flags & IO_URING_F_MULTISHOT) - return IOU_STOP_MULTISHOT; - return IOU_OK; + return IOU_COMPLETE; } static bool io_kiocb_start_write(struct io_kiocb *req, struct kiocb *kiocb) @@ -1073,6 +1132,12 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) ssize_t ret, ret2; loff_t *ppos; + if (req->flags & REQ_F_IMPORT_BUFFER) { + ret = io_rw_import_reg_vec(req, io, ITER_SOURCE, issue_flags); + if (unlikely(ret)) + return ret; + } + ret = io_rw_init_file(req, FMODE_WRITE, WRITE); if (unlikely(ret)) return ret; @@ -1123,7 +1188,7 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) goto done; if (!force_nonblock || ret2 != -EAGAIN) { /* IOPOLL retry should happen for io-wq threads */ - if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL)) + if (ret2 == -EAGAIN && (req->flags & REQ_F_IOPOLL)) goto ret_eagain; if (ret2 != req->cqe.res && ret2 >= 0 && need_complete_io(req)) { @@ -1143,7 +1208,7 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) return -EAGAIN; } done: - return kiocb_done(req, ret2, issue_flags); + return kiocb_done(req, ret2, NULL, issue_flags); } else { ret_eagain: iov_iter_restore(&io->iter, &io->iter_state); @@ -1154,6 +1219,28 @@ ret_eagain: } } +int io_read_fixed(struct io_kiocb *req, unsigned int issue_flags) +{ + int ret; + + ret = io_init_rw_fixed(req, issue_flags, ITER_DEST); + if (unlikely(ret)) + return ret; + + return io_read(req, issue_flags); +} + +int io_write_fixed(struct io_kiocb *req, unsigned int issue_flags) +{ + int ret; + + ret = io_init_rw_fixed(req, issue_flags, ITER_SOURCE); + if (unlikely(ret)) + return ret; + + return io_write(req, issue_flags); +} + void io_rw_fail(struct io_kiocb *req) { int res; @@ -1167,7 +1254,7 @@ static int io_uring_classic_poll(struct io_kiocb *req, struct io_comp_batch *iob { struct file *file = req->file; - if (req->opcode == IORING_OP_URING_CMD) { + if (io_is_uring_cmd(req)) { struct io_uring_cmd *ioucmd; ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); @@ -1217,12 +1304,13 @@ static int io_uring_hybrid_poll(struct io_kiocb *req, struct io_comp_batch *iob, unsigned int poll_flags) { struct io_ring_ctx *ctx = req->ctx; - u64 runtime, sleep_time; + u64 runtime, sleep_time, iopoll_start; int ret; + iopoll_start = READ_ONCE(req->iopoll_start); sleep_time = io_hybrid_iopoll_delay(ctx, req); ret = io_uring_classic_poll(req, iob, poll_flags); - runtime = ktime_get_ns() - req->iopoll_start - sleep_time; + runtime = ktime_get_ns() - iopoll_start - sleep_time; /* * Use minimum sleep time if we're polling devices with different @@ -1236,20 +1324,25 @@ static int io_uring_hybrid_poll(struct io_kiocb *req, int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) { - struct io_wq_work_node *pos, *start, *prev; unsigned int poll_flags = 0; DEFINE_IO_COMP_BATCH(iob); + struct io_kiocb *req, *tmp; int nr_events = 0; /* + * Store the polling io_ring_ctx so drivers can detect if they're + * completing a request in the same ring context that's polling. + */ + iob.poll_ctx = ctx; + + /* * Only spin for completions if we don't have multiple devices hanging * off our complete list. */ if (ctx->poll_multi_queue || force_nonspin) poll_flags |= BLK_POLL_ONESHOT; - wq_list_for_each(pos, start, &ctx->iopoll_list) { - struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list); + list_for_each_entry(req, &ctx->iopoll_list, iopoll_node) { int ret; /* @@ -1278,31 +1371,20 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) if (!rq_list_empty(&iob.req_list)) iob.complete(&iob); - else if (!pos) - return 0; - - prev = start; - wq_list_for_each_resume(pos, prev) { - struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list); + list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, iopoll_node) { /* order with io_complete_rw_iopoll(), e.g. ->result updates */ if (!smp_load_acquire(&req->iopoll_completed)) - break; + continue; + list_del(&req->iopoll_node); + wq_list_add_tail(&req->comp_list, &ctx->submit_state.compl_reqs); nr_events++; - req->cqe.flags = io_put_kbuf(req, req->cqe.res, 0); - if (req->opcode != IORING_OP_URING_CMD) + req->cqe.flags = io_put_kbuf(req, max(req->cqe.res, 0), NULL); + if (!io_is_uring_cmd(req)) io_req_rw_cleanup(req, 0); } - if (unlikely(!nr_events)) - return 0; - - pos = start ? start->next : ctx->iopoll_list.first; - wq_list_cut(&ctx->iopoll_list, prev, start); - - if (WARN_ON_ONCE(!wq_list_empty(&ctx->submit_state.compl_reqs))) - return 0; - ctx->submit_state.compl_reqs.first = pos; - __io_submit_flush_completions(ctx); + if (nr_events) + __io_submit_flush_completions(ctx); return nr_events; } @@ -1310,7 +1392,6 @@ void io_rw_cache_free(const void *entry) { struct io_async_rw *rw = (struct io_async_rw *) entry; - if (rw->free_iovec) - kfree(rw->free_iovec); + io_vec_free(&rw->vec); kfree(rw); } diff --git a/io_uring/rw.h b/io_uring/rw.h index eaa59bd64870..9bd7fbf70ea9 100644 --- a/io_uring/rw.h +++ b/io_uring/rw.h @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/io_uring_types.h> #include <linux/pagemap.h> struct io_meta_state { @@ -8,13 +9,15 @@ struct io_meta_state { }; struct io_async_rw { + struct iou_vec vec; size_t bytes_done; - struct iovec *free_iovec; + struct_group(clear, struct iov_iter iter; struct iov_iter_state iter_state; struct iovec fast_iov; - int free_iov_nr; + unsigned buf_group; + /* * wpq is for buffered io, while meta fields are used with * direct io @@ -31,15 +34,19 @@ struct io_async_rw { int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_prep_write_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_prep_readv_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_prep_writev_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_prep_readv(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_prep_writev(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_prep_write(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_read(struct io_kiocb *req, unsigned int issue_flags); int io_write(struct io_kiocb *req, unsigned int issue_flags); +int io_read_fixed(struct io_kiocb *req, unsigned int issue_flags); +int io_write_fixed(struct io_kiocb *req, unsigned int issue_flags); void io_readv_writev_cleanup(struct io_kiocb *req); void io_rw_fail(struct io_kiocb *req); -void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts); +void io_req_rw_complete(struct io_tw_req tw_req, io_tw_token_t tw); int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags); void io_rw_cache_free(const void *entry); diff --git a/io_uring/slist.h b/io_uring/slist.h index 0eb194817242..45c21a8ad685 100644 --- a/io_uring/slist.h +++ b/io_uring/slist.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef INTERNAL_IO_SLIST_H #define INTERNAL_IO_SLIST_H @@ -9,9 +10,6 @@ #define wq_list_for_each(pos, prv, head) \ for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next) -#define wq_list_for_each_resume(pos, prv) \ - for (; pos; prv = pos, pos = (pos)->next) - #define wq_list_empty(list) (READ_ONCE((list)->first) == NULL) #define INIT_WQ_LIST(list) do { \ @@ -43,15 +41,6 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node, } } -static inline void wq_list_add_head(struct io_wq_work_node *node, - struct io_wq_work_list *list) -{ - node->next = list->first; - if (!node->next) - list->last = node; - WRITE_ONCE(list->first, node); -} - static inline void wq_list_cut(struct io_wq_work_list *list, struct io_wq_work_node *last, struct io_wq_work_node *prev) @@ -67,24 +56,6 @@ static inline void wq_list_cut(struct io_wq_work_list *list, last->next = NULL; } -static inline void __wq_list_splice(struct io_wq_work_list *list, - struct io_wq_work_node *to) -{ - list->last->next = to->next; - to->next = list->first; - INIT_WQ_LIST(list); -} - -static inline bool wq_list_splice(struct io_wq_work_list *list, - struct io_wq_work_node *to) -{ - if (!wq_list_empty(list)) { - __wq_list_splice(list, to); - return true; - } - return false; -} - static inline void wq_stack_add_head(struct io_wq_work_node *node, struct io_wq_work_node *stack) { diff --git a/io_uring/splice.c b/io_uring/splice.c index 5b84f1630611..e81ebbb91925 100644 --- a/io_uring/splice.c +++ b/io_uring/splice.c @@ -11,6 +11,7 @@ #include <uapi/linux/io_uring.h> +#include "filetable.h" #include "io_uring.h" #include "splice.h" @@ -51,7 +52,8 @@ void io_splice_cleanup(struct io_kiocb *req) { struct io_splice *sp = io_kiocb_to_cmd(req, struct io_splice); - io_put_rsrc_node(req->ctx, sp->rsrc_node); + if (sp->rsrc_node) + io_put_rsrc_node(req->ctx, sp->rsrc_node); } static struct file *io_splice_get_file(struct io_kiocb *req, @@ -102,7 +104,7 @@ done: if (ret != sp->len) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -143,5 +145,5 @@ done: if (ret != sp->len) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c index d037cc68e9d3..46c12afec73e 100644 --- a/io_uring/sqpoll.c +++ b/io_uring/sqpoll.c @@ -11,16 +11,19 @@ #include <linux/audit.h> #include <linux/security.h> #include <linux/cpuset.h> +#include <linux/sched/cputime.h> #include <linux/io_uring.h> #include <uapi/linux/io_uring.h> #include "io_uring.h" +#include "tctx.h" #include "napi.h" +#include "cancel.h" #include "sqpoll.h" #define IORING_SQPOLL_CAP_ENTRIES_VALUE 8 -#define IORING_TW_CAP_ENTRIES_VALUE 8 +#define IORING_TW_CAP_ENTRIES_VALUE 32 enum { IO_SQ_THREAD_SHOULD_STOP = 0, @@ -30,7 +33,7 @@ enum { void io_sq_thread_unpark(struct io_sq_data *sqd) __releases(&sqd->lock) { - WARN_ON_ONCE(sqd->thread == current); + WARN_ON_ONCE(sqpoll_task_locked(sqd) == current); /* * Do the dance but not conditional clear_bit() because it'd race with @@ -46,24 +49,32 @@ void io_sq_thread_unpark(struct io_sq_data *sqd) void io_sq_thread_park(struct io_sq_data *sqd) __acquires(&sqd->lock) { - WARN_ON_ONCE(data_race(sqd->thread) == current); + struct task_struct *tsk; atomic_inc(&sqd->park_pending); set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); mutex_lock(&sqd->lock); - if (sqd->thread) - wake_up_process(sqd->thread); + + tsk = sqpoll_task_locked(sqd); + if (tsk) { + WARN_ON_ONCE(tsk == current); + wake_up_process(tsk); + } } void io_sq_thread_stop(struct io_sq_data *sqd) { - WARN_ON_ONCE(sqd->thread == current); + struct task_struct *tsk; + WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)); set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); mutex_lock(&sqd->lock); - if (sqd->thread) - wake_up_process(sqd->thread); + tsk = sqpoll_task_locked(sqd); + if (tsk) { + WARN_ON_ONCE(tsk == current); + wake_up_process(tsk); + } mutex_unlock(&sqd->lock); wait_for_completion(&sqd->exited); } @@ -142,7 +153,7 @@ static struct io_sq_data *io_get_sq_data(struct io_uring_params *p, return sqd; } - sqd = kzalloc(sizeof(*sqd), GFP_KERNEL); + sqd = kzalloc_obj(*sqd); if (!sqd) return ERR_PTR(-ENOMEM); @@ -160,7 +171,38 @@ static inline bool io_sqd_events_pending(struct io_sq_data *sqd) return READ_ONCE(sqd->state); } -static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) +struct io_sq_time { + bool started; + u64 usec; +}; + +u64 io_sq_cpu_usec(struct task_struct *tsk) +{ + u64 utime, stime; + + task_cputime_adjusted(tsk, &utime, &stime); + do_div(stime, 1000); + return stime; +} + +static void io_sq_update_worktime(struct io_sq_data *sqd, struct io_sq_time *ist) +{ + if (!ist->started) + return; + ist->started = false; + sqd->work_time += io_sq_cpu_usec(current) - ist->usec; +} + +static void io_sq_start_worktime(struct io_sq_time *ist) +{ + if (ist->started) + return; + ist->started = true; + ist->usec = io_sq_cpu_usec(current); +} + +static int __io_sq_thread(struct io_ring_ctx *ctx, struct io_sq_data *sqd, + bool cap_entries, struct io_sq_time *ist) { unsigned int to_submit; int ret = 0; @@ -170,14 +212,16 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE) to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE; - if (to_submit || !wq_list_empty(&ctx->iopoll_list)) { + if (to_submit || !list_empty(&ctx->iopoll_list)) { const struct cred *creds = NULL; + io_sq_start_worktime(ist); + if (ctx->sq_creds != current_cred()) creds = override_creds(ctx->sq_creds); mutex_lock(&ctx->uring_lock); - if (!wq_list_empty(&ctx->iopoll_list)) + if (!list_empty(&ctx->iopoll_list)) io_do_iopoll(ctx, true); /* @@ -246,23 +290,11 @@ static bool io_sq_tw_pending(struct llist_node *retry_list) return retry_list || !llist_empty(&tctx->task_list); } -static void io_sq_update_worktime(struct io_sq_data *sqd, struct rusage *start) -{ - struct rusage end; - - getrusage(current, RUSAGE_SELF, &end); - end.ru_stime.tv_sec -= start->ru_stime.tv_sec; - end.ru_stime.tv_usec -= start->ru_stime.tv_usec; - - sqd->work_time += end.ru_stime.tv_usec + end.ru_stime.tv_sec * 1000000; -} - static int io_sq_thread(void *data) { struct llist_node *retry_list = NULL; struct io_sq_data *sqd = data; struct io_ring_ctx *ctx; - struct rusage start; unsigned long timeout = 0; char buf[TASK_COMM_LEN] = {}; DEFINE_WAIT(wait); @@ -270,7 +302,8 @@ static int io_sq_thread(void *data) /* offload context creation failed, just exit */ if (!current->io_uring) { mutex_lock(&sqd->lock); - sqd->thread = NULL; + rcu_assign_pointer(sqd->thread, NULL); + put_task_struct(current); mutex_unlock(&sqd->lock); goto err_out; } @@ -299,6 +332,7 @@ static int io_sq_thread(void *data) mutex_lock(&sqd->lock); while (1) { bool cap_entries, sqt_spin = false; + struct io_sq_time ist = { }; if (io_sqd_events_pending(sqd) || signal_pending(current)) { if (io_sqd_handle_event(sqd)) @@ -307,25 +341,27 @@ static int io_sq_thread(void *data) } cap_entries = !list_is_singular(&sqd->ctx_list); - getrusage(current, RUSAGE_SELF, &start); list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { - int ret = __io_sq_thread(ctx, cap_entries); + int ret = __io_sq_thread(ctx, sqd, cap_entries, &ist); - if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list))) + if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list))) sqt_spin = true; } if (io_sq_tw(&retry_list, IORING_TW_CAP_ENTRIES_VALUE)) sqt_spin = true; - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) - if (io_napi(ctx)) + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { + if (io_napi(ctx)) { + io_sq_start_worktime(&ist); io_napi_sqpoll_busy_poll(ctx); + } + } + + io_sq_update_worktime(sqd, &ist); if (sqt_spin || !time_after(jiffies, timeout)) { - if (sqt_spin) { - io_sq_update_worktime(sqd, &start); + if (sqt_spin) timeout = jiffies + sqd->sq_thread_idle; - } if (unlikely(need_resched())) { mutex_unlock(&sqd->lock); cond_resched(); @@ -343,7 +379,7 @@ static int io_sq_thread(void *data) atomic_or(IORING_SQ_NEED_WAKEUP, &ctx->rings->sq_flags); if ((ctx->flags & IORING_SETUP_IOPOLL) && - !wq_list_empty(&ctx->iopoll_list)) { + !list_empty(&ctx->iopoll_list)) { needs_sched = false; break; } @@ -379,7 +415,8 @@ static int io_sq_thread(void *data) io_sq_tw(&retry_list, UINT_MAX); io_uring_cancel_generic(true, sqd); - sqd->thread = NULL; + rcu_assign_pointer(sqd->thread, NULL); + put_task_struct(current); list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) atomic_or(IORING_SQ_NEED_WAKEUP, &ctx->rings->sq_flags); io_run_task_work(); @@ -409,7 +446,6 @@ void io_sqpoll_wait_sq(struct io_ring_ctx *ctx) __cold int io_sq_offload_create(struct io_ring_ctx *ctx, struct io_uring_params *p) { - struct task_struct *task_to_put = NULL; int ret; /* Retain compatibility with failing for an invalid attach attempt */ @@ -422,6 +458,7 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx, return -EINVAL; } if (ctx->flags & IORING_SETUP_SQPOLL) { + struct io_uring_task *tctx; struct task_struct *tsk; struct io_sq_data *sqd; bool attached; @@ -484,9 +521,17 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx, goto err_sqpoll; } - sqd->thread = tsk; - task_to_put = get_task_struct(tsk); - ret = io_uring_alloc_task_context(tsk, ctx); + mutex_lock(&sqd->lock); + rcu_assign_pointer(sqd->thread, tsk); + mutex_unlock(&sqd->lock); + + ret = 0; + get_task_struct(tsk); + tctx = io_uring_alloc_task_context(tsk, ctx); + if (!IS_ERR(tctx)) + tsk->io_uring = tctx; + else + ret = PTR_ERR(tctx); wake_up_new_task(tsk); if (ret) goto err; @@ -495,16 +540,11 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx, ret = -EINVAL; goto err; } - - if (task_to_put) - put_task_struct(task_to_put); return 0; err_sqpoll: complete(&ctx->sq_data->exited); err: io_sq_thread_finish(ctx); - if (task_to_put) - put_task_struct(task_to_put); return ret; } @@ -515,10 +555,13 @@ __cold int io_sqpoll_wq_cpu_affinity(struct io_ring_ctx *ctx, int ret = -EINVAL; if (sqd) { + struct task_struct *tsk; + io_sq_thread_park(sqd); /* Don't set affinity for a dying thread */ - if (sqd->thread) - ret = io_wq_cpu_affinity(sqd->thread->io_uring, mask); + tsk = sqpoll_task_locked(sqd); + if (tsk) + ret = io_wq_cpu_affinity(tsk->io_uring, mask); io_sq_thread_unpark(sqd); } diff --git a/io_uring/sqpoll.h b/io_uring/sqpoll.h index 4171666b1cf4..fd2f6f29b516 100644 --- a/io_uring/sqpoll.h +++ b/io_uring/sqpoll.h @@ -8,7 +8,7 @@ struct io_sq_data { /* ctx's that are using this sqd */ struct list_head ctx_list; - struct task_struct *thread; + struct task_struct __rcu *thread; struct wait_queue_head wait; unsigned sq_thread_idle; @@ -29,3 +29,10 @@ void io_sq_thread_unpark(struct io_sq_data *sqd); void io_put_sq_data(struct io_sq_data *sqd); void io_sqpoll_wait_sq(struct io_ring_ctx *ctx); int io_sqpoll_wq_cpu_affinity(struct io_ring_ctx *ctx, cpumask_var_t mask); +u64 io_sq_cpu_usec(struct task_struct *tsk); + +static inline struct task_struct *sqpoll_task_locked(struct io_sq_data *sqd) +{ + return rcu_dereference_protected(sqd->thread, + lockdep_is_held(&sqd->lock)); +} diff --git a/io_uring/statx.c b/io_uring/statx.c index 6bc4651700a2..7bcae4a6c4a3 100644 --- a/io_uring/statx.c +++ b/io_uring/statx.c @@ -16,7 +16,7 @@ struct io_statx { int dfd; unsigned int mask; unsigned int flags; - struct filename *filename; + struct delayed_filename filename; struct statx __user *buffer; }; @@ -24,6 +24,7 @@ int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_statx *sx = io_kiocb_to_cmd(req, struct io_statx); const char __user *path; + int ret; if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; @@ -36,14 +37,10 @@ int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sx->buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2)); sx->flags = READ_ONCE(sqe->statx_flags); - sx->filename = getname_uflags(path, sx->flags); - - if (IS_ERR(sx->filename)) { - int ret = PTR_ERR(sx->filename); + ret = delayed_getname_uflags(&sx->filename, path, sx->flags); - sx->filename = NULL; + if (unlikely(ret)) return ret; - } req->flags |= REQ_F_NEED_CLEANUP; req->flags |= REQ_F_FORCE_ASYNC; @@ -53,19 +50,19 @@ int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) int io_statx(struct io_kiocb *req, unsigned int issue_flags) { struct io_statx *sx = io_kiocb_to_cmd(req, struct io_statx); + CLASS(filename_complete_delayed, name)(&sx->filename); int ret; WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); - ret = do_statx(sx->dfd, sx->filename, sx->flags, sx->mask, sx->buffer); + ret = do_statx(sx->dfd, name, sx->flags, sx->mask, sx->buffer); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } void io_statx_cleanup(struct io_kiocb *req) { struct io_statx *sx = io_kiocb_to_cmd(req, struct io_statx); - if (sx->filename) - putname(sx->filename); + dismiss_delayed_filename(&sx->filename); } diff --git a/io_uring/sync.c b/io_uring/sync.c index 255f68c37e55..ab7fa1cd7dd6 100644 --- a/io_uring/sync.c +++ b/io_uring/sync.c @@ -47,7 +47,7 @@ int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags) ret = sync_file_range(req->file, sync->off, sync->len, sync->flags); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -62,6 +62,8 @@ int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EINVAL; sync->off = READ_ONCE(sqe->off); + if (sync->off < 0) + return -EINVAL; sync->len = READ_ONCE(sqe->len); req->flags |= REQ_F_FORCE_ASYNC; return 0; @@ -79,7 +81,7 @@ int io_fsync(struct io_kiocb *req, unsigned int issue_flags) ret = vfs_fsync_range(req->file, sync->off, end > 0 ? end : LLONG_MAX, sync->flags & IORING_FSYNC_DATASYNC); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_fallocate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -108,5 +110,5 @@ int io_fallocate(struct io_kiocb *req, unsigned int issue_flags) if (ret >= 0) fsnotify_modify(req->file); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } diff --git a/io_uring/tctx.c b/io_uring/tctx.c index adc6e42c14df..42b219b34aa8 100644 --- a/io_uring/tctx.c +++ b/io_uring/tctx.c @@ -11,6 +11,7 @@ #include "io_uring.h" #include "tctx.h" +#include "bpf_filter.h" static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx, struct task_struct *task) @@ -22,7 +23,7 @@ static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx, mutex_lock(&ctx->uring_lock); hash = ctx->hash_map; if (!hash) { - hash = kzalloc(sizeof(*hash), GFP_KERNEL); + hash = kzalloc_obj(*hash); if (!hash) { mutex_unlock(&ctx->uring_lock); return ERR_PTR(-ENOMEM); @@ -35,8 +36,6 @@ static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx, data.hash = hash; data.task = task; - data.free_work = io_wq_free_work; - data.do_work = io_wq_submit_work; /* Do QD, or 4 * CPUS, whatever is smallest */ concurrency = min(ctx->sq_entries, 4 * num_online_cpus()); @@ -56,32 +55,39 @@ void __io_uring_free(struct task_struct *tsk) * node is stored in the xarray. Until that gets sorted out, attempt * an iteration here and warn if any entries are found. */ - xa_for_each(&tctx->xa, index, node) { - WARN_ON_ONCE(1); - break; - } - WARN_ON_ONCE(tctx->io_wq); - WARN_ON_ONCE(tctx->cached_refs); + if (tctx) { + xa_for_each(&tctx->xa, index, node) { + WARN_ON_ONCE(1); + break; + } + WARN_ON_ONCE(tctx->io_wq); + WARN_ON_ONCE(tctx->cached_refs); - percpu_counter_destroy(&tctx->inflight); - kfree(tctx); - tsk->io_uring = NULL; + percpu_counter_destroy(&tctx->inflight); + kfree(tctx); + tsk->io_uring = NULL; + } + if (tsk->io_uring_restrict) { + io_put_bpf_filters(tsk->io_uring_restrict); + kfree(tsk->io_uring_restrict); + tsk->io_uring_restrict = NULL; + } } -__cold int io_uring_alloc_task_context(struct task_struct *task, - struct io_ring_ctx *ctx) +__cold struct io_uring_task *io_uring_alloc_task_context(struct task_struct *task, + struct io_ring_ctx *ctx) { struct io_uring_task *tctx; int ret; - tctx = kzalloc(sizeof(*tctx), GFP_KERNEL); + tctx = kzalloc_obj(*tctx); if (unlikely(!tctx)) - return -ENOMEM; + return ERR_PTR(-ENOMEM); ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL); if (unlikely(ret)) { kfree(tctx); - return ret; + return ERR_PTR(ret); } tctx->io_wq = io_init_wq_offload(ctx, task); @@ -89,7 +95,7 @@ __cold int io_uring_alloc_task_context(struct task_struct *task, ret = PTR_ERR(tctx->io_wq); percpu_counter_destroy(&tctx->inflight); kfree(tctx); - return ret; + return ERR_PTR(ret); } tctx->task = task; @@ -97,52 +103,90 @@ __cold int io_uring_alloc_task_context(struct task_struct *task, init_waitqueue_head(&tctx->wait); atomic_set(&tctx->in_cancel, 0); atomic_set(&tctx->inflight_tracked, 0); - task->io_uring = tctx; init_llist_head(&tctx->task_list); init_task_work(&tctx->task_work, tctx_task_work); + return tctx; +} + +static int io_tctx_install_node(struct io_ring_ctx *ctx, + struct io_uring_task *tctx) +{ + struct io_tctx_node *node; + int ret; + + if (xa_load(&tctx->xa, (unsigned long)ctx)) + return 0; + + node = kmalloc_obj(*node); + if (!node) + return -ENOMEM; + node->ctx = ctx; + node->task = current; + + ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx, + node, GFP_KERNEL)); + if (ret) { + kfree(node); + return ret; + } + + mutex_lock(&ctx->tctx_lock); + list_add(&node->ctx_node, &ctx->tctx_list); + mutex_unlock(&ctx->tctx_lock); return 0; } int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) { struct io_uring_task *tctx = current->io_uring; - struct io_tctx_node *node; + bool new_tctx = false; int ret; if (unlikely(!tctx)) { - ret = io_uring_alloc_task_context(current, ctx); - if (unlikely(ret)) - return ret; + tctx = io_uring_alloc_task_context(current, ctx); + if (IS_ERR(tctx)) + return PTR_ERR(tctx); + new_tctx = true; + + if (data_race(ctx->int_flags) & IO_RING_F_IOWQ_LIMITS_SET) { + unsigned int limits[2]; - tctx = current->io_uring; - if (ctx->iowq_limits_set) { - unsigned int limits[2] = { ctx->iowq_limits[0], - ctx->iowq_limits[1], }; + mutex_lock(&ctx->uring_lock); + limits[0] = ctx->iowq_limits[0]; + limits[1] = ctx->iowq_limits[1]; + mutex_unlock(&ctx->uring_lock); ret = io_wq_max_workers(tctx->io_wq, limits); if (ret) - return ret; + goto err_free; } } - if (!xa_load(&tctx->xa, (unsigned long)ctx)) { - node = kmalloc(sizeof(*node), GFP_KERNEL); - if (!node) - return -ENOMEM; - node->ctx = ctx; - node->task = current; - - ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx, - node, GFP_KERNEL)); - if (ret) { - kfree(node); - return ret; - } - mutex_lock(&ctx->uring_lock); - list_add(&node->ctx_node, &ctx->tctx_list); - mutex_unlock(&ctx->uring_lock); + /* + * Re-activate io-wq keepalive on any new io_uring usage. The wq may have + * been marked for idle-exit when the task temporarily had no active + * io_uring instances. + */ + if (tctx->io_wq) + io_wq_set_exit_on_idle(tctx->io_wq, false); + + if (new_tctx) + current->io_uring = tctx; + + ret = io_tctx_install_node(ctx, tctx); + if (!ret) + return 0; +err_free: + if (new_tctx) { + current->io_uring = NULL; + if (tctx->io_wq) { + io_wq_exit_start(tctx->io_wq); + io_wq_put_and_exit(tctx->io_wq); + } + percpu_counter_destroy(&tctx->inflight); + kfree(tctx); } - return 0; + return ret; } int __io_uring_add_tctx_node_from_submit(struct io_ring_ctx *ctx) @@ -178,13 +222,16 @@ __cold void io_uring_del_tctx_node(unsigned long index) WARN_ON_ONCE(current != node->task); WARN_ON_ONCE(list_empty(&node->ctx_node)); - mutex_lock(&node->ctx->uring_lock); + mutex_lock(&node->ctx->tctx_lock); list_del(&node->ctx_node); - mutex_unlock(&node->ctx->uring_lock); + mutex_unlock(&node->ctx->tctx_lock); if (tctx->last == node->ctx) tctx->last = NULL; kfree(node); + + if (xa_empty(&tctx->xa) && tctx->io_wq) + io_wq_set_exit_on_idle(tctx->io_wq, true); } __cold void io_uring_clean_tctx(struct io_uring_task *tctx) @@ -223,14 +270,14 @@ void io_uring_unreg_ringfd(void) int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file, int start, int end) { - int offset; + int offset, idx; for (offset = start; offset < end; offset++) { - offset = array_index_nospec(offset, IO_RINGFD_REG_MAX); - if (tctx->registered_rings[offset]) + idx = array_index_nospec(offset, IO_RINGFD_REG_MAX); + if (tctx->registered_rings[idx]) continue; - tctx->registered_rings[offset] = file; - return offset; + tctx->registered_rings[idx] = file; + return idx; } return -EBUSY; } @@ -353,3 +400,19 @@ int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg, return i ? i : ret; } + +int __io_uring_fork(struct task_struct *tsk) +{ + struct io_restriction *res, *src = tsk->io_uring_restrict; + + /* Don't leave it dangling on error */ + tsk->io_uring_restrict = NULL; + + res = kzalloc_obj(*res, GFP_KERNEL_ACCOUNT); + if (!res) + return -ENOMEM; + + tsk->io_uring_restrict = res; + io_restriction_clone(res, src); + return 0; +} diff --git a/io_uring/tctx.h b/io_uring/tctx.h index 608e96de70a2..2310d2a0c46d 100644 --- a/io_uring/tctx.h +++ b/io_uring/tctx.h @@ -6,8 +6,8 @@ struct io_tctx_node { struct io_ring_ctx *ctx; }; -int io_uring_alloc_task_context(struct task_struct *task, - struct io_ring_ctx *ctx); +struct io_uring_task *io_uring_alloc_task_context(struct task_struct *task, + struct io_ring_ctx *ctx); void io_uring_del_tctx_node(unsigned long index); int __io_uring_add_tctx_node(struct io_ring_ctx *ctx); int __io_uring_add_tctx_node_from_submit(struct io_ring_ctx *ctx); diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 48fc8cf70784..c4dd26cf342d 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -3,6 +3,7 @@ #include <linux/errno.h> #include <linux/file.h> #include <linux/io_uring.h> +#include <linux/time_namespace.h> #include <trace/events/io_uring.h> @@ -30,11 +31,52 @@ struct io_timeout_rem { u64 addr; /* timeout update */ - struct timespec64 ts; + ktime_t time; u32 flags; bool ltimeout; }; +static clockid_t io_flags_to_clock(unsigned flags) +{ + switch (flags & IORING_TIMEOUT_CLOCK_MASK) { + case IORING_TIMEOUT_BOOTTIME: + return CLOCK_BOOTTIME; + case IORING_TIMEOUT_REALTIME: + return CLOCK_REALTIME; + default: + /* can't happen, vetted at prep time */ + WARN_ON_ONCE(1); + fallthrough; + case 0: + return CLOCK_MONOTONIC; + } +} + +static int io_parse_user_time(ktime_t *time, u64 arg, unsigned flags) +{ + struct timespec64 ts; + + if (flags & IORING_TIMEOUT_IMMEDIATE_ARG) { + *time = ns_to_ktime(arg); + if (*time < 0) + return -EINVAL; + goto out; + } + + if (get_timespec64(&ts, u64_to_user_ptr(arg))) + return -EFAULT; + if (ts.tv_sec < 0 || ts.tv_nsec < 0) + return -EINVAL; + *time = timespec64_to_ktime(ts); +out: + if (flags & IORING_TIMEOUT_ABS) + *time = timens_ktime_to_host(io_flags_to_clock(flags), *time); + return 0; +} + +static struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req, + struct io_kiocb *link); + static inline bool io_is_timeout_noseq(struct io_kiocb *req) { struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); @@ -65,8 +107,9 @@ static inline bool io_timeout_finish(struct io_timeout *timeout, static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer); -static void io_timeout_complete(struct io_kiocb *req, struct io_tw_state *ts) +static void io_timeout_complete(struct io_tw_req tw_req, io_tw_token_t tw) { + struct io_kiocb *req = tw_req.req; struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_timeout_data *data = req->async_data; struct io_ring_ctx *ctx = req->ctx; @@ -76,13 +119,13 @@ static void io_timeout_complete(struct io_kiocb *req, struct io_tw_state *ts) /* re-arm timer */ raw_spin_lock_irq(&ctx->timeout_lock); list_add(&timeout->list, ctx->timeout_list.prev); - hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); + hrtimer_start(&data->timer, data->time, data->mode); raw_spin_unlock_irq(&ctx->timeout_lock); return; } } - io_req_task_complete(req, ts); + io_req_task_complete(tw_req, tw); } static __cold bool io_flush_killed_timeouts(struct list_head *list, int err) @@ -126,7 +169,7 @@ __cold void io_flush_timeouts(struct io_ring_ctx *ctx) u32 seq; raw_spin_lock_irq(&ctx->timeout_lock); - seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); + seq = READ_ONCE(ctx->cached_cq_tail) - atomic_read(&ctx->cq_timeouts); list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { struct io_kiocb *req = cmd_to_io_kiocb(timeout); @@ -154,9 +197,11 @@ __cold void io_flush_timeouts(struct io_ring_ctx *ctx) io_flush_killed_timeouts(&list, 0); } -static void io_req_tw_fail_links(struct io_kiocb *link, struct io_tw_state *ts) +static void io_req_tw_fail_links(struct io_tw_req tw_req, io_tw_token_t tw) { - io_tw_lock(link->ctx, ts); + struct io_kiocb *link = tw_req.req; + + io_tw_lock(link->ctx, tw); while (link) { struct io_kiocb *nxt = link->link; long res = -ECANCELED; @@ -165,7 +210,7 @@ static void io_req_tw_fail_links(struct io_kiocb *link, struct io_tw_state *ts) res = link->cqe.res; link->link = NULL; io_req_set_res(link, res, 0); - io_req_task_complete(link, ts); + io_req_task_complete((struct io_tw_req){link}, tw); link = nxt; } } @@ -218,7 +263,9 @@ void io_disarm_next(struct io_kiocb *req) struct io_ring_ctx *ctx = req->ctx; raw_spin_lock_irq(&ctx->timeout_lock); - link = io_disarm_linked_timeout(req); + if (req->link && req->link->opcode == IORING_OP_LINK_TIMEOUT) + link = __io_disarm_linked_timeout(req, req->link); + raw_spin_unlock_irq(&ctx->timeout_lock); if (link) io_req_queue_tw_complete(link, -ECANCELED); @@ -228,8 +275,8 @@ void io_disarm_next(struct io_kiocb *req) io_fail_links(req); } -struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req, - struct io_kiocb *link) +static struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req, + struct io_kiocb *link) __must_hold(&req->ctx->completion_lock) __must_hold(&req->ctx->timeout_lock) { @@ -237,6 +284,10 @@ struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req, struct io_timeout *timeout = io_kiocb_to_cmd(link, struct io_timeout); io_remove_next_linked(req); + + /* If this is NULL, then timer already claimed it and will complete it */ + if (!timeout->head) + return NULL; timeout->head = NULL; if (hrtimer_try_to_cancel(&io->timer) != -1) { list_del(&timeout->list); @@ -257,8 +308,8 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) raw_spin_lock_irqsave(&ctx->timeout_lock, flags); list_del_init(&timeout->list); - atomic_set(&req->ctx->cq_timeouts, - atomic_read(&req->ctx->cq_timeouts) + 1); + atomic_set(&ctx->cq_timeouts, + atomic_read(&ctx->cq_timeouts) + 1); raw_spin_unlock_irqrestore(&ctx->timeout_lock, flags); if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS)) @@ -312,14 +363,23 @@ int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) return 0; } -static void io_req_task_link_timeout(struct io_kiocb *req, struct io_tw_state *ts) +static void io_req_task_link_timeout(struct io_tw_req tw_req, io_tw_token_t tw) { + struct io_kiocb *req = tw_req.req; struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_kiocb *prev = timeout->prev; int ret; if (prev) { - if (!io_should_terminate_tw()) { + /* + * splice the linked timeout out of prev's chain if the regular + * completion path didn't already do it. + */ + if (prev->link == req) + prev->link = req->link; + req->link = NULL; + + if (!tw.cancel) { struct io_cancel_data cd = { .ctx = req->ctx, .data = prev->cqe.user_data, @@ -330,11 +390,11 @@ static void io_req_task_link_timeout(struct io_kiocb *req, struct io_tw_state *t ret = -ECANCELED; } io_req_set_res(req, ret ?: -ETIME, 0); - io_req_task_complete(req, ts); + io_req_task_complete(tw_req, tw); io_put_req(prev); } else { io_req_set_res(req, -ETIME, 0); - io_req_task_complete(req, ts); + io_req_task_complete(tw_req, tw); } } @@ -353,12 +413,14 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) /* * We don't expect the list to be empty, that will only happen if we - * race with the completion of the linked work. + * race with the completion of the linked work. Splice of prev is + * done in io_req_task_link_timeout(), if needed. */ if (prev) { - io_remove_next_linked(prev); - if (!req_ref_inc_not_zero(prev)) + if (!req_ref_inc_not_zero(prev)) { + io_remove_next_linked(prev); prev = NULL; + } } list_del(&timeout->list); timeout->prev = prev; @@ -371,22 +433,11 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) static clockid_t io_timeout_get_clock(struct io_timeout_data *data) { - switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) { - case IORING_TIMEOUT_BOOTTIME: - return CLOCK_BOOTTIME; - case IORING_TIMEOUT_REALTIME: - return CLOCK_REALTIME; - default: - /* can't happen, vetted at prep time */ - WARN_ON_ONCE(1); - fallthrough; - case 0: - return CLOCK_MONOTONIC; - } + return io_flags_to_clock(data->flags); } static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, - struct timespec64 *ts, enum hrtimer_mode mode) + ktime_t ts, enum hrtimer_mode mode) __must_hold(&ctx->timeout_lock) { struct io_timeout_data *io; @@ -407,14 +458,13 @@ static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, io = req->async_data; if (hrtimer_try_to_cancel(&io->timer) == -1) return -EALREADY; - hrtimer_init(&io->timer, io_timeout_get_clock(io), mode); - io->timer.function = io_link_timeout_fn; - hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode); + hrtimer_setup(&io->timer, io_link_timeout_fn, io_timeout_get_clock(io), mode); + hrtimer_start(&io->timer, ts, mode); return 0; } static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, - struct timespec64 *ts, enum hrtimer_mode mode) + ktime_t time, enum hrtimer_mode mode) __must_hold(&ctx->timeout_lock) { struct io_cancel_data cd = { .ctx = ctx, .data = user_data, }; @@ -427,21 +477,23 @@ static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, timeout->off = 0; /* noseq */ data = req->async_data; - data->ts = *ts; + data->time = time; list_add_tail(&timeout->list, &ctx->timeout_list); - hrtimer_init(&data->timer, io_timeout_get_clock(data), mode); - data->timer.function = io_timeout_fn; - hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), mode); + hrtimer_setup(&data->timer, io_timeout_fn, io_timeout_get_clock(data), mode); + hrtimer_start(&data->timer, data->time, mode); return 0; } int io_timeout_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_timeout_rem *tr = io_kiocb_to_cmd(req, struct io_timeout_rem); + int ret; if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; + if (sqe->addr3 || sqe->__pad2[0]) + return -EINVAL; if (sqe->buf_index || sqe->len || sqe->splice_fd_in) return -EINVAL; @@ -453,12 +505,13 @@ int io_timeout_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EINVAL; if (tr->flags & IORING_LINK_TIMEOUT_UPDATE) tr->ltimeout = true; - if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS)) - return -EINVAL; - if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2))) - return -EFAULT; - if (tr->ts.tv_sec < 0 || tr->ts.tv_nsec < 0) + if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK | + IORING_TIMEOUT_ABS | + IORING_TIMEOUT_IMMEDIATE_ARG)) return -EINVAL; + ret = io_parse_user_time(&tr->time, READ_ONCE(sqe->addr2), tr->flags); + if (ret) + return ret; } else if (tr->flags) { /* timeout removal doesn't support flags */ return -EINVAL; @@ -493,16 +546,16 @@ int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) raw_spin_lock_irq(&ctx->timeout_lock); if (tr->ltimeout) - ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode); + ret = io_linked_timeout_update(ctx, tr->addr, tr->time, mode); else - ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode); + ret = io_timeout_update(ctx, tr->addr, tr->time, mode); raw_spin_unlock_irq(&ctx->timeout_lock); } if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } static int __io_timeout_prep(struct io_kiocb *req, @@ -513,7 +566,10 @@ static int __io_timeout_prep(struct io_kiocb *req, struct io_timeout_data *data; unsigned flags; u32 off = READ_ONCE(sqe->off); + int ret; + if (sqe->addr3 || sqe->__pad2[0]) + return -EINVAL; if (sqe->buf_index || sqe->len != 1 || sqe->splice_fd_in) return -EINVAL; if (off && is_timeout_link) @@ -521,7 +577,8 @@ static int __io_timeout_prep(struct io_kiocb *req, flags = READ_ONCE(sqe->timeout_flags); if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK | IORING_TIMEOUT_ETIME_SUCCESS | - IORING_TIMEOUT_MULTISHOT)) + IORING_TIMEOUT_MULTISHOT | + IORING_TIMEOUT_IMMEDIATE_ARG)) return -EINVAL; /* more than one clock specified is invalid, obviously */ if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1) @@ -532,8 +589,8 @@ static int __io_timeout_prep(struct io_kiocb *req, INIT_LIST_HEAD(&timeout->list); timeout->off = off; - if (unlikely(off && !req->ctx->off_timeout_used)) - req->ctx->off_timeout_used = true; + if (unlikely(off && !(req->ctx->int_flags & IO_RING_F_OFF_TIMEOUT_USED))) + req->ctx->int_flags |= IO_RING_F_OFF_TIMEOUT_USED; /* * for multishot reqs w/ fixed nr of repeats, repeats tracks the * remaining nr @@ -550,14 +607,11 @@ static int __io_timeout_prep(struct io_kiocb *req, data->req = req; data->flags = flags; - if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr))) - return -EFAULT; - - if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0) - return -EINVAL; + ret = io_parse_user_time(&data->time, READ_ONCE(sqe->addr), flags); + if (ret) + return ret; data->mode = io_translate_timeout_mode(flags); - hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode); if (is_timeout_link) { struct io_submit_link *link = &req->ctx->submit_state.link; @@ -568,6 +622,10 @@ static int __io_timeout_prep(struct io_kiocb *req, return -EINVAL; timeout->head = link->last; link->last->flags |= REQ_F_ARM_LTIMEOUT; + hrtimer_setup(&data->timer, io_link_timeout_fn, io_timeout_get_clock(data), + data->mode); + } else { + hrtimer_setup(&data->timer, io_timeout_fn, io_timeout_get_clock(data), data->mode); } return 0; } @@ -627,8 +685,7 @@ int io_timeout(struct io_kiocb *req, unsigned int issue_flags) } add: list_add(&timeout->list, entry); - data->timer.function = io_timeout_fn; - hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); + hrtimer_start(&data->timer, data->time, data->mode); raw_spin_unlock_irq(&ctx->timeout_lock); return IOU_ISSUE_SKIP_COMPLETE; } @@ -646,9 +703,7 @@ void io_queue_linked_timeout(struct io_kiocb *req) if (timeout->head) { struct io_timeout_data *data = req->async_data; - data->timer.function = io_link_timeout_fn; - hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), - data->mode); + hrtimer_start(&data->timer, data->time, data->mode); list_add_tail(&timeout->list, &ctx->ltimeout_list); } raw_spin_unlock_irq(&ctx->timeout_lock); diff --git a/io_uring/timeout.h b/io_uring/timeout.h index e91b32448dcf..1620f94dd45a 100644 --- a/io_uring/timeout.h +++ b/io_uring/timeout.h @@ -3,24 +3,11 @@ struct io_timeout_data { struct io_kiocb *req; struct hrtimer timer; - struct timespec64 ts; + ktime_t time; enum hrtimer_mode mode; u32 flags; }; -struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req, - struct io_kiocb *link); - -static inline struct io_kiocb *io_disarm_linked_timeout(struct io_kiocb *req) -{ - struct io_kiocb *link = req->link; - - if (link && link->opcode == IORING_OP_LINK_TIMEOUT) - return __io_disarm_linked_timeout(req, link); - - return NULL; -} - __cold void io_flush_timeouts(struct io_ring_ctx *ctx); struct io_cancel_data; int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd); diff --git a/io_uring/truncate.c b/io_uring/truncate.c index 62ee73d34d72..c88d8bd8d20e 100644 --- a/io_uring/truncate.c +++ b/io_uring/truncate.c @@ -41,8 +41,8 @@ int io_ftruncate(struct io_kiocb *req, unsigned int issue_flags) WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); - ret = do_ftruncate(req->file, ft->len, 1); + ret = do_ftruncate(req->file, ft->len, 0); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } diff --git a/io_uring/tw.c b/io_uring/tw.c new file mode 100644 index 000000000000..023d5e6bc491 --- /dev/null +++ b/io_uring/tw.c @@ -0,0 +1,383 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Task work handling for io_uring + */ +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/sched/signal.h> +#include <linux/io_uring.h> +#include <linux/indirect_call_wrapper.h> + +#include "io_uring.h" +#include "tctx.h" +#include "poll.h" +#include "rw.h" +#include "eventfd.h" +#include "wait.h" + +void io_fallback_req_func(struct work_struct *work) +{ + struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, + fallback_work.work); + struct llist_node *node = llist_del_all(&ctx->fallback_llist); + struct io_kiocb *req, *tmp; + struct io_tw_state ts = {}; + + percpu_ref_get(&ctx->refs); + mutex_lock(&ctx->uring_lock); + ts.cancel = io_should_terminate_tw(ctx); + llist_for_each_entry_safe(req, tmp, node, io_task_work.node) + req->io_task_work.func((struct io_tw_req){req}, ts); + io_submit_flush_completions(ctx); + mutex_unlock(&ctx->uring_lock); + percpu_ref_put(&ctx->refs); +} + +static void ctx_flush_and_put(struct io_ring_ctx *ctx, io_tw_token_t tw) +{ + if (!ctx) + return; + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) + atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); + + io_submit_flush_completions(ctx); + mutex_unlock(&ctx->uring_lock); + percpu_ref_put(&ctx->refs); +} + +/* + * Run queued task_work, returning the number of entries processed in *count. + * If more entries than max_entries are available, stop processing once this + * is reached and return the rest of the list. + */ +struct llist_node *io_handle_tw_list(struct llist_node *node, + unsigned int *count, + unsigned int max_entries) +{ + struct io_ring_ctx *ctx = NULL; + struct io_tw_state ts = { }; + + do { + struct llist_node *next = node->next; + struct io_kiocb *req = container_of(node, struct io_kiocb, + io_task_work.node); + + if (req->ctx != ctx) { + ctx_flush_and_put(ctx, ts); + ctx = req->ctx; + mutex_lock(&ctx->uring_lock); + percpu_ref_get(&ctx->refs); + ts.cancel = io_should_terminate_tw(ctx); + } + INDIRECT_CALL_2(req->io_task_work.func, + io_poll_task_func, io_req_rw_complete, + (struct io_tw_req){req}, ts); + node = next; + (*count)++; + if (unlikely(need_resched())) { + ctx_flush_and_put(ctx, ts); + ctx = NULL; + cond_resched(); + } + } while (node && *count < max_entries); + + ctx_flush_and_put(ctx, ts); + return node; +} + +static __cold void __io_fallback_tw(struct llist_node *node, bool sync) +{ + struct io_ring_ctx *last_ctx = NULL; + struct io_kiocb *req; + + while (node) { + req = container_of(node, struct io_kiocb, io_task_work.node); + node = node->next; + if (last_ctx != req->ctx) { + if (last_ctx) { + if (sync) + flush_delayed_work(&last_ctx->fallback_work); + percpu_ref_put(&last_ctx->refs); + } + last_ctx = req->ctx; + percpu_ref_get(&last_ctx->refs); + } + if (llist_add(&req->io_task_work.node, &last_ctx->fallback_llist)) + schedule_delayed_work(&last_ctx->fallback_work, 1); + } + + if (last_ctx) { + if (sync) + flush_delayed_work(&last_ctx->fallback_work); + percpu_ref_put(&last_ctx->refs); + } +} + +static void io_fallback_tw(struct io_uring_task *tctx, bool sync) +{ + struct llist_node *node = llist_del_all(&tctx->task_list); + + __io_fallback_tw(node, sync); +} + +struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, + unsigned int max_entries, + unsigned int *count) +{ + struct llist_node *node; + + node = llist_del_all(&tctx->task_list); + if (node) { + node = llist_reverse_order(node); + node = io_handle_tw_list(node, count, max_entries); + } + + /* relaxed read is enough as only the task itself sets ->in_cancel */ + if (unlikely(atomic_read(&tctx->in_cancel))) + io_uring_drop_tctx_refs(current); + + trace_io_uring_task_work_run(tctx, *count); + return node; +} + +void tctx_task_work(struct callback_head *cb) +{ + struct io_uring_task *tctx; + struct llist_node *ret; + unsigned int count = 0; + + tctx = container_of(cb, struct io_uring_task, task_work); + ret = tctx_task_work_run(tctx, UINT_MAX, &count); + /* can't happen */ + WARN_ON_ONCE(ret); +} + +/* + * Sets IORING_SQ_TASKRUN in the sq_flags shared with userspace, using the + * RCU protected rings pointer to be safe against concurrent ring resizing. + */ +static void io_ctx_mark_taskrun(struct io_ring_ctx *ctx) +{ + lockdep_assert_in_rcu_read_lock(); + + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) { + struct io_rings *rings = rcu_dereference(ctx->rings_rcu); + + atomic_or(IORING_SQ_TASKRUN, &rings->sq_flags); + } +} + +void io_req_local_work_add(struct io_kiocb *req, unsigned flags) +{ + struct io_ring_ctx *ctx = req->ctx; + unsigned nr_wait, nr_tw, nr_tw_prev; + struct llist_node *head; + + /* See comment above IO_CQ_WAKE_INIT */ + BUILD_BUG_ON(IO_CQ_WAKE_FORCE <= IORING_MAX_CQ_ENTRIES); + + /* + * We don't know how many requests there are in the link and whether + * they can even be queued lazily, fall back to non-lazy. + */ + if (req->flags & IO_REQ_LINK_FLAGS) + flags &= ~IOU_F_TWQ_LAZY_WAKE; + + guard(rcu)(); + + head = READ_ONCE(ctx->work_llist.first); + do { + nr_tw_prev = 0; + if (head) { + struct io_kiocb *first_req = container_of(head, + struct io_kiocb, + io_task_work.node); + /* + * Might be executed at any moment, rely on + * SLAB_TYPESAFE_BY_RCU to keep it alive. + */ + nr_tw_prev = READ_ONCE(first_req->nr_tw); + } + + /* + * Theoretically, it can overflow, but that's fine as one of + * previous adds should've tried to wake the task. + */ + nr_tw = nr_tw_prev + 1; + if (!(flags & IOU_F_TWQ_LAZY_WAKE)) + nr_tw = IO_CQ_WAKE_FORCE; + + req->nr_tw = nr_tw; + req->io_task_work.node.next = head; + } while (!try_cmpxchg(&ctx->work_llist.first, &head, + &req->io_task_work.node)); + + /* + * cmpxchg implies a full barrier, which pairs with the barrier + * in set_current_state() on the io_cqring_wait() side. It's used + * to ensure that either we see updated ->cq_wait_nr, or waiters + * going to sleep will observe the work added to the list, which + * is similar to the wait/wawke task state sync. + */ + + if (!head) { + io_ctx_mark_taskrun(ctx); + if (data_race(ctx->int_flags) & IO_RING_F_HAS_EVFD) + io_eventfd_signal(ctx, false); + } + + nr_wait = atomic_read(&ctx->cq_wait_nr); + /* not enough or no one is waiting */ + if (nr_tw < nr_wait) + return; + /* the previous add has already woken it up */ + if (nr_tw_prev >= nr_wait) + return; + wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE); +} + +void io_req_normal_work_add(struct io_kiocb *req) +{ + struct io_uring_task *tctx = req->tctx; + struct io_ring_ctx *ctx = req->ctx; + + /* task_work already pending, we're done */ + if (!llist_add(&req->io_task_work.node, &tctx->task_list)) + return; + + /* + * Doesn't need to use ->rings_rcu, as resizing isn't supported for + * !DEFER_TASKRUN. + */ + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) + atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); + + /* SQPOLL doesn't need the task_work added, it'll run it itself */ + if (ctx->flags & IORING_SETUP_SQPOLL) { + __set_notify_signal(tctx->task); + return; + } + + if (likely(!task_work_add(tctx->task, &tctx->task_work, ctx->notify_method))) + return; + + io_fallback_tw(tctx, false); +} + +void io_req_task_work_add_remote(struct io_kiocb *req, unsigned flags) +{ + if (WARN_ON_ONCE(!(req->ctx->flags & IORING_SETUP_DEFER_TASKRUN))) + return; + __io_req_task_work_add(req, flags); +} + +void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) +{ + struct llist_node *node; + + /* + * Running the work items may utilize ->retry_llist as a means + * for capping the number of task_work entries run at the same + * time. But that list can potentially race with moving the work + * from here, if the task is exiting. As any normal task_work + * running holds ->uring_lock already, just guard this slow path + * with ->uring_lock to avoid racing on ->retry_llist. + */ + guard(mutex)(&ctx->uring_lock); + node = llist_del_all(&ctx->work_llist); + __io_fallback_tw(node, false); + node = llist_del_all(&ctx->retry_llist); + __io_fallback_tw(node, false); +} + +static bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events, + int min_events) +{ + if (!io_local_work_pending(ctx)) + return false; + if (events < min_events) + return true; + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) + atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); + return false; +} + +static int __io_run_local_work_loop(struct llist_node **node, + io_tw_token_t tw, + int events) +{ + int ret = 0; + + while (*node) { + struct llist_node *next = (*node)->next; + struct io_kiocb *req = container_of(*node, struct io_kiocb, + io_task_work.node); + INDIRECT_CALL_2(req->io_task_work.func, + io_poll_task_func, io_req_rw_complete, + (struct io_tw_req){req}, tw); + *node = next; + if (++ret >= events) + break; + } + + return ret; +} + +static int __io_run_local_work(struct io_ring_ctx *ctx, io_tw_token_t tw, + int min_events, int max_events) +{ + struct llist_node *node; + unsigned int loops = 0; + int ret = 0; + + if (WARN_ON_ONCE(ctx->submitter_task != current)) + return -EEXIST; + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) + atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); +again: + tw.cancel = io_should_terminate_tw(ctx); + min_events -= ret; + ret = __io_run_local_work_loop(&ctx->retry_llist.first, tw, max_events); + if (ctx->retry_llist.first) + goto retry_done; + + /* + * llists are in reverse order, flip it back the right way before + * running the pending items. + */ + node = llist_reverse_order(llist_del_all(&ctx->work_llist)); + ret += __io_run_local_work_loop(&node, tw, max_events - ret); + ctx->retry_llist.first = node; + loops++; + + if (io_run_local_work_continue(ctx, ret, min_events)) + goto again; +retry_done: + io_submit_flush_completions(ctx); + if (io_run_local_work_continue(ctx, ret, min_events)) + goto again; + + trace_io_uring_local_work_run(ctx, ret, loops); + return ret; +} + +int io_run_local_work_locked(struct io_ring_ctx *ctx, int min_events) +{ + struct io_tw_state ts = {}; + + if (!io_local_work_pending(ctx)) + return 0; + return __io_run_local_work(ctx, ts, min_events, + max(IO_LOCAL_TW_DEFAULT_MAX, min_events)); +} + +int io_run_local_work(struct io_ring_ctx *ctx, int min_events, int max_events) +{ + struct io_tw_state ts = {}; + int ret; + + mutex_lock(&ctx->uring_lock); + ret = __io_run_local_work(ctx, ts, min_events, max_events); + mutex_unlock(&ctx->uring_lock); + return ret; +} diff --git a/io_uring/tw.h b/io_uring/tw.h new file mode 100644 index 000000000000..415e330fabde --- /dev/null +++ b/io_uring/tw.h @@ -0,0 +1,116 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef IOU_TW_H +#define IOU_TW_H + +#include <linux/sched.h> +#include <linux/percpu-refcount.h> +#include <linux/io_uring_types.h> + +#define IO_LOCAL_TW_DEFAULT_MAX 20 + +/* + * Terminate the request if either of these conditions are true: + * + * 1) It's being executed by the original task, but that task is marked + * with PF_EXITING as it's exiting. + * 2) PF_KTHREAD is set, in which case the invoker of the task_work is + * our fallback task_work. + * 3) The ring has been closed and is going away. + */ +static inline bool io_should_terminate_tw(struct io_ring_ctx *ctx) +{ + return (current->flags & (PF_EXITING | PF_KTHREAD)) || percpu_ref_is_dying(&ctx->refs); +} + +void io_req_task_work_add_remote(struct io_kiocb *req, unsigned flags); +struct llist_node *io_handle_tw_list(struct llist_node *node, unsigned int *count, unsigned int max_entries); +void tctx_task_work(struct callback_head *cb); +int io_run_local_work(struct io_ring_ctx *ctx, int min_events, int max_events); +int io_run_task_work_sig(struct io_ring_ctx *ctx); + +__cold void io_fallback_req_func(struct work_struct *work); +__cold void io_move_task_work_from_local(struct io_ring_ctx *ctx); +int io_run_local_work_locked(struct io_ring_ctx *ctx, int min_events); + +void io_req_local_work_add(struct io_kiocb *req, unsigned flags); +void io_req_normal_work_add(struct io_kiocb *req); +struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, unsigned int max_entries, unsigned int *count); + +static inline void __io_req_task_work_add(struct io_kiocb *req, unsigned flags) +{ + if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN) + io_req_local_work_add(req, flags); + else + io_req_normal_work_add(req); +} + +static inline void io_req_task_work_add(struct io_kiocb *req) +{ + __io_req_task_work_add(req, 0); +} + +static inline int io_run_task_work(void) +{ + bool ret = false; + + /* + * Always check-and-clear the task_work notification signal. With how + * signaling works for task_work, we can find it set with nothing to + * run. We need to clear it for that case, like get_signal() does. + */ + if (test_thread_flag(TIF_NOTIFY_SIGNAL)) + clear_notify_signal(); + /* + * PF_IO_WORKER never returns to userspace, so check here if we have + * notify work that needs processing. + */ + if (current->flags & PF_IO_WORKER) { + if (test_thread_flag(TIF_NOTIFY_RESUME)) { + __set_current_state(TASK_RUNNING); + resume_user_mode_work(NULL); + } + if (current->io_uring) { + unsigned int count = 0; + + __set_current_state(TASK_RUNNING); + tctx_task_work_run(current->io_uring, UINT_MAX, &count); + if (count) + ret = true; + } + } + if (task_work_pending(current)) { + __set_current_state(TASK_RUNNING); + task_work_run(); + ret = true; + } + + return ret; +} + +static inline bool io_local_work_pending(struct io_ring_ctx *ctx) +{ + return !llist_empty(&ctx->work_llist) || !llist_empty(&ctx->retry_llist); +} + +static inline bool io_task_work_pending(struct io_ring_ctx *ctx) +{ + return task_work_pending(current) || io_local_work_pending(ctx); +} + +static inline void io_tw_lock(struct io_ring_ctx *ctx, io_tw_token_t tw) +{ + lockdep_assert_held(&ctx->uring_lock); +} + +static inline bool io_allowed_defer_tw_run(struct io_ring_ctx *ctx) +{ + return likely(ctx->submitter_task == current); +} + +static inline bool io_allowed_run_tw(struct io_ring_ctx *ctx) +{ + return likely(!(ctx->flags & IORING_SETUP_DEFER_TASKRUN) || + ctx->submitter_task == current); +} + +#endif diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index e6701b7aa147..7b25dcd9d05f 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -3,38 +3,49 @@ #include <linux/errno.h> #include <linux/file.h> #include <linux/io_uring/cmd.h> -#include <linux/io_uring/net.h> #include <linux/security.h> #include <linux/nospec.h> -#include <net/sock.h> #include <uapi/linux/io_uring.h> -#include <asm/ioctls.h> #include "io_uring.h" #include "alloc_cache.h" #include "rsrc.h" +#include "kbuf.h" #include "uring_cmd.h" +#include "poll.h" + +void io_cmd_cache_free(const void *entry) +{ + struct io_async_cmd *ac = (struct io_async_cmd *)entry; + + io_vec_free(&ac->vec); + kfree(ac); +} static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); - struct io_uring_cmd_data *cache = req->async_data; - - if (cache->op_data) { - kfree(cache->op_data); - cache->op_data = NULL; - } + struct io_async_cmd *ac = req->async_data; if (issue_flags & IO_URING_F_UNLOCKED) return; - if (io_alloc_cache_put(&req->ctx->uring_cache, cache)) { + + io_alloc_cache_vec_kasan(&ac->vec); + if (ac->vec.nr > IO_VEC_CACHE_SOFT_CAP) + io_vec_free(&ac->vec); + + if (io_alloc_cache_put(&req->ctx->cmd_cache, ac)) { ioucmd->sqe = NULL; - req->async_data = NULL; - req->flags &= ~REQ_F_ASYNC_DATA; + io_req_async_data_clear(req, REQ_F_NEED_CLEANUP); } } +void io_uring_cmd_cleanup(struct io_kiocb *req) +{ + io_req_uring_cleanup(req, 0); +} + bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all) { @@ -93,6 +104,15 @@ void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, struct io_kiocb *req = cmd_to_io_kiocb(cmd); struct io_ring_ctx *ctx = req->ctx; + /* + * Doing cancelations on IOPOLL requests are not supported. Both + * because they can't get canceled in the block stack, but also + * because iopoll completion data overlaps with the hash_node used + * for tracking. + */ + if (req->flags & REQ_F_IOPOLL) + return; + if (!(cmd->flags & IORING_URING_CMD_CANCELABLE)) { cmd->flags |= IORING_URING_CMD_CANCELABLE; io_ring_submit_lock(ctx, issue_flags); @@ -102,26 +122,16 @@ void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, } EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable); -static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts) -{ - struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); - unsigned int flags = IO_URING_F_COMPLETE_DEFER; - - if (io_should_terminate_tw()) - flags |= IO_URING_F_TASK_DEAD; - - /* task_work executor checks the deffered list completion */ - ioucmd->task_work_cb(ioucmd, flags); -} - void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *, unsigned), + io_req_tw_func_t task_work_cb, unsigned flags) { struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); - ioucmd->task_work_cb = task_work_cb; - req->io_task_work.func = io_uring_cmd_work; + if (WARN_ON_ONCE(req->flags & REQ_F_APOLL_MULTISHOT)) + return; + + req->io_task_work.func = task_work_cb; __io_req_task_work_add(req, flags); } EXPORT_SYMBOL_GPL(__io_uring_cmd_do_in_task); @@ -137,21 +147,27 @@ static inline void io_req_set_cqe32_extra(struct io_kiocb *req, * Called by consumers of io_uring_cmd, if they originally returned * -EIOCBQUEUED upon receiving the command. */ -void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, u64 res2, - unsigned issue_flags) +void __io_uring_cmd_done(struct io_uring_cmd *ioucmd, s32 ret, u64 res2, + unsigned issue_flags, bool is_cqe32) { struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); + if (WARN_ON_ONCE(req->flags & REQ_F_APOLL_MULTISHOT)) + return; + io_uring_cmd_del_cancelable(ioucmd, issue_flags); if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - if (req->ctx->flags & IORING_SETUP_CQE32) + if (is_cqe32) { + if (req->ctx->flags & IORING_SETUP_CQE_MIXED) + req->cqe.flags |= IORING_CQE_F_32; io_req_set_cqe32_extra(req, res2, 0); + } io_req_uring_cleanup(req, issue_flags); - if (req->ctx->flags & IORING_SETUP_IOPOLL) { + if (req->flags & REQ_F_IOPOLL) { /* order with io_iopoll_req_issued() checking ->iopoll_complete */ smp_store_release(&req->iopoll_completed, 1); } else if (issue_flags & IO_URING_F_COMPLETE_DEFER) { @@ -163,34 +179,12 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, u64 res2, io_req_task_work_add(req); } } -EXPORT_SYMBOL_GPL(io_uring_cmd_done); - -static int io_uring_cmd_prep_setup(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); - struct io_uring_cmd_data *cache; - - cache = io_uring_alloc_async_data(&req->ctx->uring_cache, req); - if (!cache) - return -ENOMEM; - cache->op_data = NULL; - - /* - * Unconditionally cache the SQE for now - this is only needed for - * requests that go async, but prep handlers must ensure that any - * sqe data is stable beyond prep. Since uring_cmd is special in - * that it doesn't read in per-op data, play it safe and ensure that - * any SQE data is stable beyond prep. This can later get relaxed. - */ - memcpy(cache->sqes, sqe, uring_sqe_size(req->ctx)); - ioucmd->sqe = cache->sqes; - return 0; -} +EXPORT_SYMBOL_GPL(__io_uring_cmd_done); int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); + struct io_async_cmd *ac; if (sqe->__pad1) return -EINVAL; @@ -200,23 +194,46 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EINVAL; if (ioucmd->flags & IORING_URING_CMD_FIXED) { - struct io_ring_ctx *ctx = req->ctx; - struct io_rsrc_node *node; - u16 index = READ_ONCE(sqe->buf_index); - - node = io_rsrc_node_lookup(&ctx->buf_table, index); - if (unlikely(!node)) - return -EFAULT; - /* - * Pi node upfront, prior to io_uring_cmd_import_fixed() - * being called. This prevents destruction of the mapped buffer - * we'll need at actual import time. - */ - io_req_assign_buf_node(req, node); + if (ioucmd->flags & IORING_URING_CMD_MULTISHOT) + return -EINVAL; + req->buf_index = READ_ONCE(sqe->buf_index); } + + if (!!(ioucmd->flags & IORING_URING_CMD_MULTISHOT) != + !!(req->flags & REQ_F_BUFFER_SELECT)) + return -EINVAL; + ioucmd->cmd_op = READ_ONCE(sqe->cmd_op); - return io_uring_cmd_prep_setup(req, sqe); + ac = io_uring_alloc_async_data(&req->ctx->cmd_cache, req); + if (!ac) + return -ENOMEM; + ioucmd->sqe = sqe; + return 0; +} + +/* + * IORING_SETUP_SQE128 contexts allocate twice the normal SQE size for each + * slot. + */ +static inline size_t uring_sqe_size(struct io_kiocb *req) +{ + if (req->ctx->flags & IORING_SETUP_SQE128 || + req->opcode == IORING_OP_URING_CMD128) + return 2 * sizeof(struct io_uring_sqe); + return sizeof(struct io_uring_sqe); +} + +void io_uring_cmd_sqe_copy(struct io_kiocb *req) +{ + struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); + struct io_async_cmd *ac = req->async_data; + + /* Should not happen, as REQ_F_SQE_COPIED covers this */ + if (WARN_ON_ONCE(ioucmd->sqe == ac->sqes)) + return; + memcpy(ac->sqes, ioucmd->sqe, uring_sqe_size(req)); + ioucmd->sqe = ac->sqes; } int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) @@ -233,43 +250,78 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) if (ret) return ret; - if (ctx->flags & IORING_SETUP_SQE128) + if (ctx->flags & IORING_SETUP_SQE128 || + req->opcode == IORING_OP_URING_CMD128) issue_flags |= IO_URING_F_SQE128; - if (ctx->flags & IORING_SETUP_CQE32) + if (ctx->flags & (IORING_SETUP_CQE32 | IORING_SETUP_CQE_MIXED)) issue_flags |= IO_URING_F_CQE32; - if (ctx->compat) + if (io_is_compat(ctx)) issue_flags |= IO_URING_F_COMPAT; - if (ctx->flags & IORING_SETUP_IOPOLL) { - if (!file->f_op->uring_cmd_iopoll) - return -EOPNOTSUPP; + if (ctx->flags & IORING_SETUP_IOPOLL && file->f_op->uring_cmd_iopoll) { + req->flags |= REQ_F_IOPOLL; issue_flags |= IO_URING_F_IOPOLL; req->iopoll_completed = 0; + if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL) { + /* make sure every req only blocks once */ + req->flags &= ~REQ_F_IOPOLL_STATE; + req->iopoll_start = ktime_get_ns(); + } } ret = file->f_op->uring_cmd(ioucmd, issue_flags); - if (ret == -EAGAIN || ret == -EIOCBQUEUED) + if (ioucmd->flags & IORING_URING_CMD_MULTISHOT) { + if (ret >= 0) + return IOU_ISSUE_SKIP_COMPLETE; + } + if (ret == -EAGAIN) { + ioucmd->flags |= IORING_URING_CMD_REISSUE; + return ret; + } + if (ret == -EIOCBQUEUED) return ret; if (ret < 0) req_set_fail(req); io_req_uring_cleanup(req, issue_flags); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, - struct iov_iter *iter, void *ioucmd) + struct iov_iter *iter, + struct io_uring_cmd *ioucmd, + unsigned int issue_flags) { struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); - struct io_rsrc_node *node = req->buf_node; - /* Must have had rsrc_node assigned at prep time */ - if (node) - return io_import_fixed(rw, iter, node->buf, ubuf, len); + if (WARN_ON_ONCE(!(ioucmd->flags & IORING_URING_CMD_FIXED))) + return -EINVAL; - return -EFAULT; + return io_import_reg_buf(req, iter, ubuf, len, rw, issue_flags); } EXPORT_SYMBOL_GPL(io_uring_cmd_import_fixed); +int io_uring_cmd_import_fixed_vec(struct io_uring_cmd *ioucmd, + const struct iovec __user *uvec, + size_t uvec_segs, + int ddir, struct iov_iter *iter, + unsigned issue_flags) +{ + struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); + struct io_async_cmd *ac = req->async_data; + int ret; + + if (WARN_ON_ONCE(!(ioucmd->flags & IORING_URING_CMD_FIXED))) + return -EINVAL; + + ret = io_prep_reg_iovec(req, &ac->vec, uvec, uvec_segs); + if (ret) + return ret; + + return io_import_reg_vec(ddir, iter, req, &ac->vec, uvec_segs, + issue_flags); +} +EXPORT_SYMBOL_GPL(io_uring_cmd_import_fixed_vec); + void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd) { struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); @@ -277,80 +329,80 @@ void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd) io_req_queue_iowq(req); } -static inline int io_uring_cmd_getsockopt(struct socket *sock, - struct io_uring_cmd *cmd, - unsigned int issue_flags) +int io_cmd_poll_multishot(struct io_uring_cmd *cmd, + unsigned int issue_flags, __poll_t mask) { - bool compat = !!(issue_flags & IO_URING_F_COMPAT); - int optlen, optname, level, err; - void __user *optval; + struct io_kiocb *req = cmd_to_io_kiocb(cmd); + int ret; - level = READ_ONCE(cmd->sqe->level); - if (level != SOL_SOCKET) - return -EOPNOTSUPP; + if (likely(req->flags & REQ_F_APOLL_MULTISHOT)) + return 0; - optval = u64_to_user_ptr(READ_ONCE(cmd->sqe->optval)); - optname = READ_ONCE(cmd->sqe->optname); - optlen = READ_ONCE(cmd->sqe->optlen); + req->flags |= REQ_F_APOLL_MULTISHOT; + mask &= ~EPOLLONESHOT; - err = do_sock_getsockopt(sock, compat, level, optname, - USER_SOCKPTR(optval), - KERNEL_SOCKPTR(&optlen)); - if (err) - return err; + ret = io_arm_apoll(req, issue_flags, mask); + return ret == IO_APOLL_OK ? -EIOCBQUEUED : -ECANCELED; +} - /* On success, return optlen */ - return optlen; +bool io_uring_cmd_post_mshot_cqe32(struct io_uring_cmd *cmd, + unsigned int issue_flags, + struct io_uring_cqe cqe[2]) +{ + struct io_kiocb *req = cmd_to_io_kiocb(cmd); + + if (WARN_ON_ONCE(!(issue_flags & IO_URING_F_MULTISHOT))) + return false; + return io_req_post_cqe32(req, cqe); } -static inline int io_uring_cmd_setsockopt(struct socket *sock, - struct io_uring_cmd *cmd, - unsigned int issue_flags) +/* + * Work with io_uring_mshot_cmd_post_cqe() together for committing the + * provided buffer upfront + */ +struct io_br_sel io_uring_cmd_buffer_select(struct io_uring_cmd *ioucmd, + unsigned buf_group, size_t *len, + unsigned int issue_flags) { - bool compat = !!(issue_flags & IO_URING_F_COMPAT); - int optname, optlen, level; - void __user *optval; - sockptr_t optval_s; - - optval = u64_to_user_ptr(READ_ONCE(cmd->sqe->optval)); - optname = READ_ONCE(cmd->sqe->optname); - optlen = READ_ONCE(cmd->sqe->optlen); - level = READ_ONCE(cmd->sqe->level); - optval_s = USER_SOCKPTR(optval); - - return do_sock_setsockopt(sock, compat, level, optname, optval_s, - optlen); + struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); + + if (!(ioucmd->flags & IORING_URING_CMD_MULTISHOT)) + return (struct io_br_sel) { .val = -EINVAL }; + + if (WARN_ON_ONCE(!io_do_buffer_select(req))) + return (struct io_br_sel) { .val = -EINVAL }; + + return io_buffer_select(req, len, buf_group, issue_flags); } +EXPORT_SYMBOL_GPL(io_uring_cmd_buffer_select); -#if defined(CONFIG_NET) -int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags) +/* + * Return true if this multishot uring_cmd needs to be completed, otherwise + * the event CQE is posted successfully. + * + * This function must use `struct io_br_sel` returned from + * io_uring_cmd_buffer_select() for committing the buffer in the same + * uring_cmd submission context. + */ +bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd, + struct io_br_sel *sel, unsigned int issue_flags) { - struct socket *sock = cmd->file->private_data; - struct sock *sk = sock->sk; - struct proto *prot = READ_ONCE(sk->sk_prot); - int ret, arg = 0; + struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); + unsigned int cflags = 0; - if (!prot || !prot->ioctl) - return -EOPNOTSUPP; + if (!(ioucmd->flags & IORING_URING_CMD_MULTISHOT)) + return true; - switch (cmd->cmd_op) { - case SOCKET_URING_OP_SIOCINQ: - ret = prot->ioctl(sk, SIOCINQ, &arg); - if (ret) - return ret; - return arg; - case SOCKET_URING_OP_SIOCOUTQ: - ret = prot->ioctl(sk, SIOCOUTQ, &arg); - if (ret) - return ret; - return arg; - case SOCKET_URING_OP_GETSOCKOPT: - return io_uring_cmd_getsockopt(sock, cmd, issue_flags); - case SOCKET_URING_OP_SETSOCKOPT: - return io_uring_cmd_setsockopt(sock, cmd, issue_flags); - default: - return -EOPNOTSUPP; + if (sel->val > 0) { + cflags = io_put_kbuf(req, sel->val, sel->buf_list); + if (io_req_post_cqe(req, sel->val, cflags | IORING_CQE_F_MORE)) + return false; } + + io_kbuf_recycle(req, sel->buf_list, issue_flags); + if (sel->val < 0) + req_set_fail(req); + io_req_set_res(req, sel->val, cflags); + return true; } -EXPORT_SYMBOL_GPL(io_uring_cmd_sock); -#endif +EXPORT_SYMBOL_GPL(io_uring_mshot_cmd_post_cqe); diff --git a/io_uring/uring_cmd.h b/io_uring/uring_cmd.h index f6837ee0955b..041aef8a8aa3 100644 --- a/io_uring/uring_cmd.h +++ b/io_uring/uring_cmd.h @@ -1,7 +1,26 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/io_uring/cmd.h> +#include <linux/io_uring_types.h> + +struct io_async_cmd { + struct iou_vec vec; + struct io_uring_sqe sqes[2]; +}; + int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags); int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +void io_uring_cmd_sqe_copy(struct io_kiocb *req); +void io_uring_cmd_cleanup(struct io_kiocb *req); bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all); + +bool io_uring_cmd_post_mshot_cqe32(struct io_uring_cmd *cmd, + unsigned int issue_flags, + struct io_uring_cqe cqe[2]); + +void io_cmd_cache_free(const void *entry); + +int io_cmd_poll_multishot(struct io_uring_cmd *cmd, + unsigned int issue_flags, __poll_t mask); diff --git a/io_uring/wait.c b/io_uring/wait.c new file mode 100644 index 000000000000..ec01e78a216d --- /dev/null +++ b/io_uring/wait.c @@ -0,0 +1,324 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Waiting for completion events + */ +#include <linux/kernel.h> +#include <linux/sched/signal.h> +#include <linux/io_uring.h> +#include <linux/time_namespace.h> + +#include <trace/events/io_uring.h> + +#include <uapi/linux/io_uring.h> + +#include "io_uring.h" +#include "napi.h" +#include "wait.h" + +static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, + int wake_flags, void *key) +{ + struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, wq); + + /* + * Cannot safely flush overflowed CQEs from here, ensure we wake up + * the task, and the next invocation will do it. + */ + if (io_should_wake(iowq) || io_has_work(iowq->ctx)) + return autoremove_wake_function(curr, mode, wake_flags, key); + return -1; +} + +int io_run_task_work_sig(struct io_ring_ctx *ctx) +{ + if (io_local_work_pending(ctx)) { + __set_current_state(TASK_RUNNING); + if (io_run_local_work(ctx, INT_MAX, IO_LOCAL_TW_DEFAULT_MAX) > 0) + return 0; + } + if (io_run_task_work() > 0) + return 0; + if (task_sigpending(current)) + return -EINTR; + return 0; +} + +static bool current_pending_io(void) +{ + struct io_uring_task *tctx = current->io_uring; + + if (!tctx) + return false; + return percpu_counter_read_positive(&tctx->inflight); +} + +static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer) +{ + struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t); + + WRITE_ONCE(iowq->hit_timeout, 1); + iowq->min_timeout = 0; + wake_up_process(iowq->wq.private); + return HRTIMER_NORESTART; +} + +/* + * Doing min_timeout portion. If we saw any timeouts, events, or have work, + * wake up. If not, and we have a normal timeout, switch to that and keep + * sleeping. + */ +static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer) +{ + struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t); + struct io_ring_ctx *ctx = iowq->ctx; + + /* no general timeout, or shorter (or equal), we are done */ + if (iowq->timeout == KTIME_MAX || + ktime_compare(iowq->min_timeout, iowq->timeout) >= 0) + goto out_wake; + /* work we may need to run, wake function will see if we need to wake */ + if (io_has_work(ctx)) + goto out_wake; + /* got events since we started waiting, min timeout is done */ + scoped_guard(rcu) { + struct io_rings *rings = io_get_rings(ctx); + + if (iowq->cq_min_tail != READ_ONCE(rings->cq.tail)) + goto out_wake; + /* if we have any events and min timeout expired, we're done */ + if (io_cqring_events(ctx)) + goto out_wake; + } + /* + * If using deferred task_work running and application is waiting on + * more than one request, ensure we reset it now where we are switching + * to normal sleeps. Any request completion post min_wait should wake + * the task and return. + */ + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { + atomic_set(&ctx->cq_wait_nr, 1); + smp_mb(); + if (!llist_empty(&ctx->work_llist)) + goto out_wake; + } + + /* any generated CQE posted past this time should wake us up */ + iowq->cq_tail = iowq->cq_min_tail; + + hrtimer_update_function(&iowq->t, io_cqring_timer_wakeup); + hrtimer_set_expires(timer, iowq->timeout); + return HRTIMER_RESTART; +out_wake: + return io_cqring_timer_wakeup(timer); +} + +static int io_cqring_schedule_timeout(struct io_wait_queue *iowq, + clockid_t clock_id, ktime_t start_time) +{ + ktime_t timeout; + + if (iowq->min_timeout) { + timeout = ktime_add_ns(iowq->min_timeout, start_time); + hrtimer_setup_on_stack(&iowq->t, io_cqring_min_timer_wakeup, clock_id, + HRTIMER_MODE_ABS); + } else { + timeout = iowq->timeout; + hrtimer_setup_on_stack(&iowq->t, io_cqring_timer_wakeup, clock_id, + HRTIMER_MODE_ABS); + } + + hrtimer_set_expires_range_ns(&iowq->t, timeout, 0); + hrtimer_start_expires(&iowq->t, HRTIMER_MODE_ABS); + + if (!READ_ONCE(iowq->hit_timeout)) + schedule(); + + hrtimer_cancel(&iowq->t); + destroy_hrtimer_on_stack(&iowq->t); + __set_current_state(TASK_RUNNING); + + return READ_ONCE(iowq->hit_timeout) ? -ETIME : 0; +} + +static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx, + struct io_wait_queue *iowq, + struct ext_arg *ext_arg, + ktime_t start_time) +{ + int ret = 0; + + /* + * Mark us as being in io_wait if we have pending requests, so cpufreq + * can take into account that the task is waiting for IO - turns out + * to be important for low QD IO. + */ + if (ext_arg->iowait && current_pending_io()) + current->in_iowait = 1; + if (iowq->timeout != KTIME_MAX || iowq->min_timeout) + ret = io_cqring_schedule_timeout(iowq, ctx->clockid, start_time); + else + schedule(); + current->in_iowait = 0; + return ret; +} + +/* If this returns > 0, the caller should retry */ +static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, + struct io_wait_queue *iowq, + struct ext_arg *ext_arg, + ktime_t start_time) +{ + if (unlikely(READ_ONCE(ctx->check_cq))) + return 1; + if (unlikely(io_local_work_pending(ctx))) + return 1; + if (unlikely(task_work_pending(current))) + return 1; + if (unlikely(task_sigpending(current))) + return -EINTR; + if (unlikely(io_should_wake(iowq))) + return 0; + + return __io_cqring_wait_schedule(ctx, iowq, ext_arg, start_time); +} + +/* + * Wait until events become available, if we don't already have some. The + * application must reap them itself, as they reside on the shared cq ring. + */ +int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, + struct ext_arg *ext_arg) +{ + struct io_wait_queue iowq; + struct io_rings *rings; + ktime_t start_time; + int ret, nr_wait; + + min_events = min_t(int, min_events, ctx->cq_entries); + + if (!io_allowed_run_tw(ctx)) + return -EEXIST; + if (io_local_work_pending(ctx)) + io_run_local_work(ctx, min_events, + max(IO_LOCAL_TW_DEFAULT_MAX, min_events)); + io_run_task_work(); + + if (unlikely(test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))) + io_cqring_do_overflow_flush(ctx); + + rcu_read_lock(); + rings = io_get_rings(ctx); + if (__io_cqring_events_user(ctx) >= min_events) { + rcu_read_unlock(); + return 0; + } + + init_waitqueue_func_entry(&iowq.wq, io_wake_function); + iowq.wq.private = current; + INIT_LIST_HEAD(&iowq.wq.entry); + iowq.ctx = ctx; + iowq.cq_tail = READ_ONCE(rings->cq.head) + min_events; + iowq.cq_min_tail = READ_ONCE(rings->cq.tail); + nr_wait = (int) iowq.cq_tail - READ_ONCE(rings->cq.tail); + rcu_read_unlock(); + rings = NULL; + iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); + iowq.hit_timeout = 0; + iowq.min_timeout = ext_arg->min_time; + iowq.timeout = KTIME_MAX; + start_time = io_get_time(ctx); + + if (ext_arg->ts_set) { + iowq.timeout = timespec64_to_ktime(ext_arg->ts); + if (flags & IORING_ENTER_ABS_TIMER) + iowq.timeout = timens_ktime_to_host(ctx->clockid, + iowq.timeout); + else + iowq.timeout = ktime_add(iowq.timeout, start_time); + } + + if (ext_arg->sig) { +#ifdef CONFIG_COMPAT + if (in_compat_syscall()) + ret = set_compat_user_sigmask((const compat_sigset_t __user *)ext_arg->sig, + ext_arg->argsz); + else +#endif + ret = set_user_sigmask(ext_arg->sig, ext_arg->argsz); + + if (ret) + return ret; + } + + io_napi_busy_loop(ctx, &iowq); + + trace_io_uring_cqring_wait(ctx, min_events); + do { + unsigned long check_cq; + + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { + atomic_set(&ctx->cq_wait_nr, nr_wait); + set_current_state(TASK_INTERRUPTIBLE); + } else { + prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, + TASK_INTERRUPTIBLE); + } + + ret = io_cqring_wait_schedule(ctx, &iowq, ext_arg, start_time); + __set_current_state(TASK_RUNNING); + atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT); + + /* + * Run task_work after scheduling and before io_should_wake(). + * If we got woken because of task_work being processed, run it + * now rather than let the caller do another wait loop. + */ + if (io_local_work_pending(ctx)) + io_run_local_work(ctx, nr_wait, nr_wait); + io_run_task_work(); + + /* + * Non-local task_work will be run on exit to userspace, but + * if we're using DEFER_TASKRUN, then we could have waited + * with a timeout for a number of requests. If the timeout + * hits, we could have some requests ready to process. Ensure + * this break is _after_ we have run task_work, to avoid + * deferring running potentially pending requests until the + * next time we wait for events. + */ + if (ret < 0) + break; + + check_cq = READ_ONCE(ctx->check_cq); + if (unlikely(check_cq)) { + /* let the caller flush overflows, retry */ + if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) + io_cqring_do_overflow_flush(ctx); + if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)) { + ret = -EBADR; + break; + } + } + + if (io_should_wake(&iowq)) { + ret = 0; + break; + } + cond_resched(); + + /* if min timeout has been hit, don't reset wait count */ + if (!iowq.hit_timeout) + scoped_guard(rcu) + nr_wait = (int) iowq.cq_tail - + READ_ONCE(io_get_rings(ctx)->cq.tail); + else + nr_wait = 1; + } while (1); + + if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) + finish_wait(&ctx->cq_wait, &iowq.wq); + restore_saved_sigmask_unless(ret == -EINTR); + + guard(rcu)(); + return READ_ONCE(io_get_rings(ctx)->cq.head) == READ_ONCE(io_get_rings(ctx)->cq.tail) ? ret : 0; +} diff --git a/io_uring/wait.h b/io_uring/wait.h new file mode 100644 index 000000000000..a4274b137f81 --- /dev/null +++ b/io_uring/wait.h @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef IOU_WAIT_H +#define IOU_WAIT_H + +#include <linux/io_uring_types.h> + +/* + * No waiters. It's larger than any valid value of the tw counter + * so that tests against ->cq_wait_nr would fail and skip wake_up(). + */ +#define IO_CQ_WAKE_INIT (-1U) +/* Forced wake up if there is a waiter regardless of ->cq_wait_nr */ +#define IO_CQ_WAKE_FORCE (IO_CQ_WAKE_INIT >> 1) + +struct ext_arg { + size_t argsz; + struct timespec64 ts; + const sigset_t __user *sig; + ktime_t min_time; + bool ts_set; + bool iowait; +}; + +int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, + struct ext_arg *ext_arg); +int io_run_task_work_sig(struct io_ring_ctx *ctx); +void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx); +void io_cqring_overflow_flush_locked(struct io_ring_ctx *ctx); + +static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) +{ + struct io_rings *rings = io_get_rings(ctx); + return ctx->cached_cq_tail - READ_ONCE(rings->cq.head); +} + +static inline unsigned int __io_cqring_events_user(struct io_ring_ctx *ctx) +{ + struct io_rings *rings = io_get_rings(ctx); + + return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head); +} + +/* + * Reads the tail/head of the CQ ring while providing an acquire ordering, + * see comment at top of io_uring.c. + */ +static inline unsigned io_cqring_events(struct io_ring_ctx *ctx) +{ + smp_rmb(); + return __io_cqring_events(ctx); +} + +#endif diff --git a/io_uring/waitid.c b/io_uring/waitid.c index 15a7daf3ff4f..32f68fd7fcdd 100644 --- a/io_uring/waitid.c +++ b/io_uring/waitid.c @@ -16,7 +16,7 @@ #include "waitid.h" #include "../kernel/exit.h" -static void io_waitid_cb(struct io_kiocb *req, struct io_tw_state *ts); +static void io_waitid_cb(struct io_tw_req tw_req, io_tw_token_t tw); #define IO_WAITID_CANCEL_FLAG BIT(31) #define IO_WAITID_REF_MASK GENMASK(30, 0) @@ -37,12 +37,9 @@ static void io_waitid_free(struct io_kiocb *req) struct io_waitid_async *iwa = req->async_data; put_pid(iwa->wo.wo_pid); - kfree(req->async_data); - req->async_data = NULL; - req->flags &= ~REQ_F_ASYNC_DATA; + io_req_async_data_free(req); } -#ifdef CONFIG_COMPAT static bool io_waitid_compat_copy_si(struct io_waitid *iw, int signo) { struct compat_siginfo __user *infop; @@ -67,7 +64,6 @@ Efault: ret = false; goto done; } -#endif static bool io_waitid_copy_si(struct io_kiocb *req, int signo) { @@ -77,10 +73,8 @@ static bool io_waitid_copy_si(struct io_kiocb *req, int signo) if (!iw->infop) return true; -#ifdef CONFIG_COMPAT - if (req->ctx->compat) + if (io_is_compat(req->ctx)) return io_waitid_compat_copy_si(iw, signo); -#endif if (!user_write_access_begin(iw->infop, sizeof(*iw->infop))) return false; @@ -115,6 +109,22 @@ static int io_waitid_finish(struct io_kiocb *req, int ret) return ret; } +static void io_waitid_remove_wq(struct io_kiocb *req) +{ + struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); + struct wait_queue_head *head; + + head = smp_load_acquire(&iw->head); + if (head) { + struct io_waitid_async *iwa = req->async_data; + + smp_store_release(&iw->head, NULL); + spin_lock_irq(&head->lock); + list_del_init(&iwa->wo.child_wait.entry); + spin_unlock_irq(&head->lock); + } +} + static void io_waitid_complete(struct io_kiocb *req, int ret) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); @@ -125,6 +135,7 @@ static void io_waitid_complete(struct io_kiocb *req, int ret) lockdep_assert_held(&req->ctx->uring_lock); hlist_del_init(&req->hash_node); + io_waitid_remove_wq(req); ret = io_waitid_finish(req, ret); if (ret < 0) @@ -132,10 +143,11 @@ static void io_waitid_complete(struct io_kiocb *req, int ret) io_req_set_res(req, ret, 0); } -static bool __io_waitid_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) +static bool __io_waitid_cancel(struct io_kiocb *req) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); - struct io_waitid_async *iwa = req->async_data; + + lockdep_assert_held(&req->ctx->uring_lock); /* * Mark us canceled regardless of ownership. This will prevent a @@ -147,9 +159,6 @@ static bool __io_waitid_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) if (atomic_fetch_inc(&iw->refs) & IO_WAITID_REF_MASK) return false; - spin_lock_irq(&iw->head->lock); - list_del_init(&iwa->wo.child_wait.entry); - spin_unlock_irq(&iw->head->lock); io_waitid_complete(req, -ECANCELED); io_req_queue_tw_complete(req, -ECANCELED); return true; @@ -158,76 +167,41 @@ static bool __io_waitid_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) int io_waitid_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, unsigned int issue_flags) { - struct hlist_node *tmp; - struct io_kiocb *req; - int nr = 0; - - if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_FD_FIXED)) - return -ENOENT; - - io_ring_submit_lock(ctx, issue_flags); - hlist_for_each_entry_safe(req, tmp, &ctx->waitid_list, hash_node) { - if (req->cqe.user_data != cd->data && - !(cd->flags & IORING_ASYNC_CANCEL_ANY)) - continue; - if (__io_waitid_cancel(ctx, req)) - nr++; - if (!(cd->flags & IORING_ASYNC_CANCEL_ALL)) - break; - } - io_ring_submit_unlock(ctx, issue_flags); - - if (nr) - return nr; - - return -ENOENT; + return io_cancel_remove(ctx, cd, issue_flags, &ctx->waitid_list, __io_waitid_cancel); } bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all) { - struct hlist_node *tmp; - struct io_kiocb *req; - bool found = false; - - lockdep_assert_held(&ctx->uring_lock); - - hlist_for_each_entry_safe(req, tmp, &ctx->waitid_list, hash_node) { - if (!io_match_task_safe(req, tctx, cancel_all)) - continue; - hlist_del_init(&req->hash_node); - __io_waitid_cancel(ctx, req); - found = true; - } - - return found; + return io_cancel_remove_all(ctx, tctx, &ctx->waitid_list, cancel_all, __io_waitid_cancel); } static inline bool io_waitid_drop_issue_ref(struct io_kiocb *req) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); - struct io_waitid_async *iwa = req->async_data; if (!atomic_sub_return(1, &iw->refs)) return false; + io_waitid_remove_wq(req); + /* * Wakeup triggered, racing with us. It was prevented from * completing because of that, queue up the tw to do that. */ req->io_task_work.func = io_waitid_cb; io_req_task_work_add(req); - remove_wait_queue(iw->head, &iwa->wo.child_wait); return true; } -static void io_waitid_cb(struct io_kiocb *req, struct io_tw_state *ts) +static void io_waitid_cb(struct io_tw_req tw_req, io_tw_token_t tw) { + struct io_kiocb *req = tw_req.req; struct io_waitid_async *iwa = req->async_data; struct io_ring_ctx *ctx = req->ctx; int ret; - io_tw_lock(ctx, ts); + io_tw_lock(ctx, tw); ret = __do_wait(&iwa->wo); @@ -251,13 +225,12 @@ static void io_waitid_cb(struct io_kiocb *req, struct io_tw_state *ts) io_waitid_drop_issue_ref(req); return; } - - remove_wait_queue(iw->head, &iwa->wo.child_wait); + /* fall through to complete, will kill waitqueue */ } } io_waitid_complete(req, ret); - io_req_task_complete(req, ts); + io_req_task_complete(tw_req, tw); } static int io_waitid_wait(struct wait_queue_entry *wait, unsigned mode, @@ -272,13 +245,15 @@ static int io_waitid_wait(struct wait_queue_entry *wait, unsigned mode, if (!pid_child_should_wake(wo, p)) return 0; + list_del_init(&wait->entry); + smp_store_release(&iw->head, NULL); + /* cancel is in progress */ if (atomic_fetch_inc(&iw->refs) & IO_WAITID_REF_MASK) return 1; req->io_task_work.func = io_waitid_cb; io_req_task_work_add(req); - list_del_init(&wait->entry); return 1; } @@ -291,14 +266,16 @@ int io_waitid_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EINVAL; iwa = io_uring_alloc_async_data(NULL, req); - if (!unlikely(iwa)) + if (unlikely(!iwa)) return -ENOMEM; iwa->req = req; iw->which = READ_ONCE(sqe->len); iw->upid = READ_ONCE(sqe->fd); iw->options = READ_ONCE(sqe->file_index); + iw->head = NULL; iw->infop = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + memset(&iw->info, 0, sizeof(iw->info)); return 0; } @@ -328,11 +305,16 @@ int io_waitid(struct io_kiocb *req, unsigned int issue_flags) * callback. */ io_ring_submit_lock(ctx, issue_flags); + + /* + * iw->head is valid under the ring lock, and as long as the request + * is on the waitid_list where cancelations may find it. + */ + iw->head = ¤t->signal->wait_chldexit; hlist_add_head(&req->hash_node, &ctx->waitid_list); init_waitqueue_func_entry(&iwa->wo.child_wait, io_waitid_wait); iwa->wo.child_wait.private = req->tctx->task; - iw->head = ¤t->signal->wait_chldexit; add_wait_queue(iw->head, &iwa->wo.child_wait); ret = __do_wait(&iwa->wo); @@ -355,7 +337,7 @@ int io_waitid(struct io_kiocb *req, unsigned int issue_flags) } hlist_del_init(&req->hash_node); - remove_wait_queue(iw->head, &iwa->wo.child_wait); + io_waitid_remove_wq(req); ret = io_waitid_finish(req, ret); io_ring_submit_unlock(ctx, issue_flags); @@ -363,5 +345,5 @@ done: if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } diff --git a/io_uring/xattr.c b/io_uring/xattr.c index de5064fcae8a..5303df3f247f 100644 --- a/io_uring/xattr.c +++ b/io_uring/xattr.c @@ -19,16 +19,14 @@ struct io_xattr { struct file *file; struct kernel_xattr_ctx ctx; - struct filename *filename; + struct delayed_filename filename; }; void io_xattr_cleanup(struct io_kiocb *req) { struct io_xattr *ix = io_kiocb_to_cmd(req, struct io_xattr); - if (ix->filename) - putname(ix->filename); - + dismiss_delayed_filename(&ix->filename); kfree(ix->ctx.kname); kvfree(ix->ctx.kvalue); } @@ -48,7 +46,7 @@ static int __io_getxattr_prep(struct io_kiocb *req, const char __user *name; int ret; - ix->filename = NULL; + INIT_DELAYED_FILENAME(&ix->filename); ix->ctx.kvalue = NULL; name = u64_to_user_ptr(READ_ONCE(sqe->addr)); ix->ctx.value = u64_to_user_ptr(READ_ONCE(sqe->addr2)); @@ -58,7 +56,7 @@ static int __io_getxattr_prep(struct io_kiocb *req, if (ix->ctx.flags) return -EINVAL; - ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL); + ix->ctx.kname = kmalloc_obj(*ix->ctx.kname); if (!ix->ctx.kname) return -ENOMEM; @@ -93,11 +91,7 @@ int io_getxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) path = u64_to_user_ptr(READ_ONCE(sqe->addr3)); - ix->filename = getname(path); - if (IS_ERR(ix->filename)) - return PTR_ERR(ix->filename); - - return 0; + return delayed_getname(&ix->filename, path); } int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags) @@ -109,20 +103,20 @@ int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags) ret = file_getxattr(req->file, &ix->ctx); io_xattr_finish(req, ret); - return IOU_OK; + return IOU_COMPLETE; } int io_getxattr(struct io_kiocb *req, unsigned int issue_flags) { struct io_xattr *ix = io_kiocb_to_cmd(req, struct io_xattr); + CLASS(filename_complete_delayed, name)(&ix->filename); int ret; WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); - ret = filename_getxattr(AT_FDCWD, ix->filename, LOOKUP_FOLLOW, &ix->ctx); - ix->filename = NULL; + ret = filename_getxattr(AT_FDCWD, name, LOOKUP_FOLLOW, &ix->ctx); io_xattr_finish(req, ret); - return IOU_OK; + return IOU_COMPLETE; } static int __io_setxattr_prep(struct io_kiocb *req, @@ -132,14 +126,14 @@ static int __io_setxattr_prep(struct io_kiocb *req, const char __user *name; int ret; - ix->filename = NULL; + INIT_DELAYED_FILENAME(&ix->filename); name = u64_to_user_ptr(READ_ONCE(sqe->addr)); ix->ctx.cvalue = u64_to_user_ptr(READ_ONCE(sqe->addr2)); ix->ctx.kvalue = NULL; ix->ctx.size = READ_ONCE(sqe->len); ix->ctx.flags = READ_ONCE(sqe->xattr_flags); - ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL); + ix->ctx.kname = kmalloc_obj(*ix->ctx.kname); if (!ix->ctx.kname) return -ENOMEM; @@ -169,11 +163,7 @@ int io_setxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) path = u64_to_user_ptr(READ_ONCE(sqe->addr3)); - ix->filename = getname(path); - if (IS_ERR(ix->filename)) - return PTR_ERR(ix->filename); - - return 0; + return delayed_getname(&ix->filename, path); } int io_fsetxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -190,18 +180,18 @@ int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags) ret = file_setxattr(req->file, &ix->ctx); io_xattr_finish(req, ret); - return IOU_OK; + return IOU_COMPLETE; } int io_setxattr(struct io_kiocb *req, unsigned int issue_flags) { struct io_xattr *ix = io_kiocb_to_cmd(req, struct io_xattr); + CLASS(filename_complete_delayed, name)(&ix->filename); int ret; WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); - ret = filename_setxattr(AT_FDCWD, ix->filename, LOOKUP_FOLLOW, &ix->ctx); - ix->filename = NULL; + ret = filename_setxattr(AT_FDCWD, name, LOOKUP_FOLLOW, &ix->ctx); io_xattr_finish(req, ret); - return IOU_OK; + return IOU_COMPLETE; } diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c new file mode 100644 index 000000000000..19837e0b5e91 --- /dev/null +++ b/io_uring/zcrx.c @@ -0,0 +1,1615 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/dma-map-ops.h> +#include <linux/mm.h> +#include <linux/nospec.h> +#include <linux/io_uring.h> +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> +#include <linux/skbuff_ref.h> +#include <linux/anon_inodes.h> + +#include <net/page_pool/helpers.h> +#include <net/page_pool/memory_provider.h> +#include <net/netlink.h> +#include <net/netdev_queues.h> +#include <net/netdev_rx_queue.h> +#include <net/tcp.h> +#include <net/rps.h> + +#include <trace/events/page_pool.h> + +#include <uapi/linux/io_uring.h> + +#include "io_uring.h" +#include "kbuf.h" +#include "memmap.h" +#include "zcrx.h" +#include "rsrc.h" + +#define IO_ZCRX_AREA_SUPPORTED_FLAGS (IORING_ZCRX_AREA_DMABUF) + +#define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING) + +static inline struct io_zcrx_ifq *io_pp_to_ifq(struct page_pool *pp) +{ + return pp->mp_priv; +} + +static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov) +{ + struct net_iov_area *owner = net_iov_owner(niov); + + return container_of(owner, struct io_zcrx_area, nia); +} + +static inline struct page *io_zcrx_iov_page(const struct net_iov *niov) +{ + struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); + unsigned niov_pages_shift; + + lockdep_assert(!area->mem.is_dmabuf); + + niov_pages_shift = area->ifq->niov_shift - PAGE_SHIFT; + return area->mem.pages[net_iov_idx(niov) << niov_pages_shift]; +} + +static int io_area_max_shift(struct io_zcrx_mem *mem) +{ + struct sg_table *sgt = mem->sgt; + struct scatterlist *sg; + unsigned shift = -1U; + unsigned i; + + for_each_sgtable_dma_sg(sgt, sg, i) + shift = min(shift, __ffs(sg_dma_len(sg))); + return shift; +} + +static int io_populate_area_dma(struct io_zcrx_ifq *ifq, + struct io_zcrx_area *area) +{ + unsigned niov_size = 1U << ifq->niov_shift; + struct sg_table *sgt = area->mem.sgt; + struct scatterlist *sg; + unsigned i, niov_idx = 0; + + for_each_sgtable_dma_sg(sgt, sg, i) { + dma_addr_t dma = sg_dma_address(sg); + unsigned long sg_len = sg_dma_len(sg); + + if (WARN_ON_ONCE(sg_len % niov_size)) + return -EINVAL; + + while (sg_len && niov_idx < area->nia.num_niovs) { + struct net_iov *niov = &area->nia.niovs[niov_idx]; + + if (net_mp_niov_set_dma_addr(niov, dma)) + return -EFAULT; + sg_len -= niov_size; + dma += niov_size; + niov_idx++; + } + } + + if (WARN_ON_ONCE(niov_idx != area->nia.num_niovs)) + return -EFAULT; + return 0; +} + +static void io_release_dmabuf(struct io_zcrx_mem *mem) +{ + if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER)) + return; + + if (mem->sgt) + dma_buf_unmap_attachment_unlocked(mem->attach, mem->sgt, + DMA_FROM_DEVICE); + if (mem->attach) + dma_buf_detach(mem->dmabuf, mem->attach); + if (mem->dmabuf) + dma_buf_put(mem->dmabuf); + + mem->sgt = NULL; + mem->attach = NULL; + mem->dmabuf = NULL; +} + +static int io_import_dmabuf(struct io_zcrx_ifq *ifq, + struct io_zcrx_mem *mem, + struct io_uring_zcrx_area_reg *area_reg) +{ + unsigned long off = (unsigned long)area_reg->addr; + unsigned long len = (unsigned long)area_reg->len; + unsigned long total_size = 0; + struct scatterlist *sg; + int dmabuf_fd = area_reg->dmabuf_fd; + int i, ret; + + if (!ifq->dev) + return -EINVAL; + if (off) + return -EINVAL; + if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER)) + return -EINVAL; + + mem->is_dmabuf = true; + mem->dmabuf = dma_buf_get(dmabuf_fd); + if (IS_ERR(mem->dmabuf)) { + ret = PTR_ERR(mem->dmabuf); + mem->dmabuf = NULL; + goto err; + } + + mem->attach = dma_buf_attach(mem->dmabuf, ifq->dev); + if (IS_ERR(mem->attach)) { + ret = PTR_ERR(mem->attach); + mem->attach = NULL; + goto err; + } + + mem->sgt = dma_buf_map_attachment_unlocked(mem->attach, DMA_FROM_DEVICE); + if (IS_ERR(mem->sgt)) { + ret = PTR_ERR(mem->sgt); + mem->sgt = NULL; + goto err; + } + + for_each_sgtable_dma_sg(mem->sgt, sg, i) + total_size += sg_dma_len(sg); + + if (total_size != len) { + ret = -EINVAL; + goto err; + } + + mem->size = len; + return 0; +err: + io_release_dmabuf(mem); + return ret; +} + +static unsigned long io_count_account_pages(struct page **pages, unsigned nr_pages) +{ + struct folio *last_folio = NULL; + unsigned long res = 0; + int i; + + for (i = 0; i < nr_pages; i++) { + struct folio *folio = page_folio(pages[i]); + + if (folio == last_folio) + continue; + last_folio = folio; + res += folio_nr_pages(folio); + } + return res; +} + +static int io_import_umem(struct io_zcrx_ifq *ifq, + struct io_zcrx_mem *mem, + struct io_uring_zcrx_area_reg *area_reg) +{ + struct page **pages; + int nr_pages, ret; + bool mapped = false; + + if (area_reg->dmabuf_fd) + return -EINVAL; + if (!area_reg->addr) + return -EFAULT; + pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len, + &nr_pages); + if (IS_ERR(pages)) + return PTR_ERR(pages); + + ret = sg_alloc_table_from_pages(&mem->page_sg_table, pages, nr_pages, + 0, (unsigned long)nr_pages << PAGE_SHIFT, + GFP_KERNEL_ACCOUNT); + if (ret) + goto out_err; + + if (ifq->dev) { + ret = dma_map_sgtable(ifq->dev, &mem->page_sg_table, + DMA_FROM_DEVICE, IO_DMA_ATTR); + if (ret < 0) + goto out_err; + mapped = true; + } + + mem->account_pages = io_count_account_pages(pages, nr_pages); + ret = io_account_mem(ifq->user, ifq->mm_account, mem->account_pages); + if (ret < 0) { + mem->account_pages = 0; + goto out_err; + } + + mem->sgt = &mem->page_sg_table; + mem->pages = pages; + mem->nr_folios = nr_pages; + mem->size = area_reg->len; + return ret; +out_err: + if (mapped) + dma_unmap_sgtable(ifq->dev, &mem->page_sg_table, + DMA_FROM_DEVICE, IO_DMA_ATTR); + sg_free_table(&mem->page_sg_table); + unpin_user_pages(pages, nr_pages); + kvfree(pages); + return ret; +} + +static void io_release_area_mem(struct io_zcrx_mem *mem) +{ + if (mem->is_dmabuf) { + io_release_dmabuf(mem); + return; + } + if (mem->pages) { + unpin_user_pages(mem->pages, mem->nr_folios); + sg_free_table(mem->sgt); + mem->sgt = NULL; + kvfree(mem->pages); + } +} + +static int io_import_area(struct io_zcrx_ifq *ifq, + struct io_zcrx_mem *mem, + struct io_uring_zcrx_area_reg *area_reg) +{ + int ret; + + if (area_reg->flags & ~IO_ZCRX_AREA_SUPPORTED_FLAGS) + return -EINVAL; + if (area_reg->rq_area_token) + return -EINVAL; + if (area_reg->__resv2[0] || area_reg->__resv2[1]) + return -EINVAL; + + ret = io_validate_user_buf_range(area_reg->addr, area_reg->len); + if (ret) + return ret; + if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK) + return -EINVAL; + + if (area_reg->flags & IORING_ZCRX_AREA_DMABUF) + return io_import_dmabuf(ifq, mem, area_reg); + return io_import_umem(ifq, mem, area_reg); +} + +static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, + struct io_zcrx_area *area) +{ + int i; + + guard(mutex)(&ifq->pp_lock); + if (!area->is_mapped) + return; + area->is_mapped = false; + + if (area->nia.niovs) { + for (i = 0; i < area->nia.num_niovs; i++) + net_mp_niov_set_dma_addr(&area->nia.niovs[i], 0); + } + + if (area->mem.is_dmabuf) { + io_release_dmabuf(&area->mem); + } else { + dma_unmap_sgtable(ifq->dev, &area->mem.page_sg_table, + DMA_FROM_DEVICE, IO_DMA_ATTR); + } +} + +static void zcrx_sync_for_device(struct page_pool *pp, struct io_zcrx_ifq *zcrx, + netmem_ref *netmems, unsigned nr) +{ +#if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC) + struct device *dev = pp->p.dev; + unsigned i, niov_size; + dma_addr_t dma_addr; + + if (!dma_dev_need_sync(dev)) + return; + niov_size = 1U << zcrx->niov_shift; + + for (i = 0; i < nr; i++) { + dma_addr = page_pool_get_dma_addr_netmem(netmems[i]); + __dma_sync_single_for_device(dev, dma_addr + pp->p.offset, + niov_size, pp->p.dma_dir); + } +#endif +} + +#define IO_RQ_MAX_ENTRIES 32768 + +#define IO_SKBS_PER_CALL_LIMIT 20 + +struct io_zcrx_args { + struct io_kiocb *req; + struct io_zcrx_ifq *ifq; + struct socket *sock; + unsigned nr_skbs; +}; + +static const struct memory_provider_ops io_uring_pp_zc_ops; + +static inline atomic_t *io_get_user_counter(struct net_iov *niov) +{ + struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); + + return &area->user_refs[net_iov_idx(niov)]; +} + +static bool io_zcrx_put_niov_uref(struct net_iov *niov) +{ + atomic_t *uref = io_get_user_counter(niov); + int old; + + old = atomic_read(uref); + do { + if (unlikely(old == 0)) + return false; + } while (!atomic_try_cmpxchg(uref, &old, old - 1)); + + return true; +} + +static void io_zcrx_get_niov_uref(struct net_iov *niov) +{ + atomic_inc(io_get_user_counter(niov)); +} + +static void io_fill_zcrx_offsets(struct io_uring_zcrx_offsets *offsets) +{ + offsets->head = offsetof(struct io_uring, head); + offsets->tail = offsetof(struct io_uring, tail); + offsets->rqes = ALIGN(sizeof(struct io_uring), L1_CACHE_BYTES); +} + +static int io_allocate_rbuf_ring(struct io_ring_ctx *ctx, + struct io_zcrx_ifq *ifq, + struct io_uring_zcrx_ifq_reg *reg, + struct io_uring_region_desc *rd, + u32 id) +{ + u64 mmap_offset; + size_t off, size; + void *ptr; + int ret; + + io_fill_zcrx_offsets(®->offsets); + off = reg->offsets.rqes; + size = off + sizeof(struct io_uring_zcrx_rqe) * reg->rq_entries; + if (size > rd->size) + return -EINVAL; + + mmap_offset = IORING_MAP_OFF_ZCRX_REGION; + mmap_offset += (u64)id << IORING_OFF_ZCRX_SHIFT; + + ret = io_create_region(ctx, &ifq->rq_region, rd, mmap_offset); + if (ret < 0) + return ret; + + ptr = io_region_get_ptr(&ifq->rq_region); + ifq->rq.ring = (struct io_uring *)ptr; + ifq->rq.rqes = (struct io_uring_zcrx_rqe *)(ptr + off); + + memset(ifq->rq.ring, 0, sizeof(*ifq->rq.ring)); + return 0; +} + +static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq) +{ + io_free_region(ifq->user, &ifq->rq_region); + ifq->rq.ring = NULL; + ifq->rq.rqes = NULL; +} + +static void io_zcrx_free_area(struct io_zcrx_ifq *ifq, + struct io_zcrx_area *area) +{ + io_zcrx_unmap_area(ifq, area); + io_release_area_mem(&area->mem); + + if (area->mem.account_pages) + io_unaccount_mem(ifq->user, ifq->mm_account, + area->mem.account_pages); + + kvfree(area->freelist); + kvfree(area->nia.niovs); + kvfree(area->user_refs); + kfree(area); +} + +static int io_zcrx_append_area(struct io_zcrx_ifq *ifq, + struct io_zcrx_area *area) +{ + bool kern_readable = !area->mem.is_dmabuf; + + if (WARN_ON_ONCE(ifq->area)) + return -EINVAL; + if (WARN_ON_ONCE(ifq->kern_readable != kern_readable)) + return -EINVAL; + + ifq->area = area; + return 0; +} + +static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, + struct io_uring_zcrx_area_reg *area_reg, + struct io_uring_zcrx_ifq_reg *reg) +{ + int buf_size_shift = PAGE_SHIFT; + struct io_zcrx_area *area; + unsigned nr_iovs; + int i, ret; + + if (reg->rx_buf_len) { + if (!is_power_of_2(reg->rx_buf_len) || + reg->rx_buf_len < PAGE_SIZE) + return -EINVAL; + buf_size_shift = ilog2(reg->rx_buf_len); + } + if (!ifq->dev && buf_size_shift != PAGE_SHIFT) + return -EOPNOTSUPP; + + ret = -ENOMEM; + area = kzalloc_obj(*area); + if (!area) + goto err; + area->ifq = ifq; + + ret = io_import_area(ifq, &area->mem, area_reg); + if (ret) + goto err; + if (ifq->dev) + area->is_mapped = true; + + if (ifq->dev && buf_size_shift > io_area_max_shift(&area->mem)) { + ret = -ERANGE; + goto err; + } + + ifq->niov_shift = buf_size_shift; + nr_iovs = area->mem.size >> ifq->niov_shift; + area->nia.num_niovs = nr_iovs; + + ret = -ENOMEM; + area->nia.niovs = kvmalloc_objs(area->nia.niovs[0], nr_iovs, + GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!area->nia.niovs) + goto err; + + area->freelist = kvmalloc_array(nr_iovs, sizeof(area->freelist[0]), + GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!area->freelist) + goto err; + + area->user_refs = kvmalloc_objs(area->user_refs[0], nr_iovs, + GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!area->user_refs) + goto err; + + for (i = 0; i < nr_iovs; i++) { + struct net_iov *niov = &area->nia.niovs[i]; + + net_iov_init(niov, &area->nia, NET_IOV_IOURING); + area->freelist[i] = i; + atomic_set(&area->user_refs[i], 0); + } + + if (ifq->dev) { + ret = io_populate_area_dma(ifq, area); + if (ret) + goto err; + } + + area->free_count = nr_iovs; + /* we're only supporting one area per ifq for now */ + area->area_id = 0; + area_reg->rq_area_token = (u64)area->area_id << IORING_ZCRX_AREA_SHIFT; + spin_lock_init(&area->freelist_lock); + + ret = io_zcrx_append_area(ifq, area); + if (!ret) + return 0; +err: + if (area) + io_zcrx_free_area(ifq, area); + return ret; +} + +static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx) +{ + struct io_zcrx_ifq *ifq; + + ifq = kzalloc_obj(*ifq); + if (!ifq) + return NULL; + + ifq->if_rxq = -1; + spin_lock_init(&ifq->rq.lock); + mutex_init(&ifq->pp_lock); + refcount_set(&ifq->refs, 1); + refcount_set(&ifq->user_refs, 1); + return ifq; +} + +static void io_zcrx_drop_netdev(struct io_zcrx_ifq *ifq) +{ + guard(mutex)(&ifq->pp_lock); + + if (!ifq->netdev) + return; + netdev_put(ifq->netdev, &ifq->netdev_tracker); + ifq->netdev = NULL; +} + +static void io_close_queue(struct io_zcrx_ifq *ifq) +{ + struct net_device *netdev; + netdevice_tracker netdev_tracker; + struct pp_memory_provider_params p = { + .mp_ops = &io_uring_pp_zc_ops, + .mp_priv = ifq, + }; + + scoped_guard(mutex, &ifq->pp_lock) { + netdev = ifq->netdev; + netdev_tracker = ifq->netdev_tracker; + ifq->netdev = NULL; + } + + if (netdev) { + if (ifq->if_rxq != -1) { + netdev_lock(netdev); + netif_mp_close_rxq(netdev, ifq->if_rxq, &p); + netdev_unlock(netdev); + } + netdev_put(netdev, &netdev_tracker); + } + ifq->if_rxq = -1; +} + +static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq) +{ + io_close_queue(ifq); + + if (ifq->area) + io_zcrx_free_area(ifq, ifq->area); + if (ifq->mm_account) + mmdrop(ifq->mm_account); + if (ifq->dev) + put_device(ifq->dev); + + io_free_rbuf_ring(ifq); + free_uid(ifq->user); + mutex_destroy(&ifq->pp_lock); + kfree(ifq); +} + +static void io_put_zcrx_ifq(struct io_zcrx_ifq *ifq) +{ + if (refcount_dec_and_test(&ifq->refs)) + io_zcrx_ifq_free(ifq); +} + +static void io_zcrx_return_niov_freelist(struct net_iov *niov) +{ + struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); + + guard(spinlock_bh)(&area->freelist_lock); + if (WARN_ON_ONCE(area->free_count >= area->nia.num_niovs)) + return; + area->freelist[area->free_count++] = net_iov_idx(niov); +} + +static struct net_iov *zcrx_get_free_niov(struct io_zcrx_area *area) +{ + unsigned niov_idx; + + lockdep_assert_held(&area->freelist_lock); + + if (unlikely(!area->free_count)) + return NULL; + + niov_idx = area->freelist[--area->free_count]; + return &area->nia.niovs[niov_idx]; +} + +static void io_zcrx_return_niov(struct net_iov *niov) +{ + netmem_ref netmem = net_iov_to_netmem(niov); + + if (!niov->desc.pp) { + /* copy fallback allocated niovs */ + io_zcrx_return_niov_freelist(niov); + return; + } + page_pool_put_unrefed_netmem(niov->desc.pp, netmem, -1, false); +} + +static void io_zcrx_scrub(struct io_zcrx_ifq *ifq) +{ + struct io_zcrx_area *area = ifq->area; + int i; + + if (!area) + return; + + /* Reclaim back all buffers given to the user space. */ + for (i = 0; i < area->nia.num_niovs; i++) { + struct net_iov *niov = &area->nia.niovs[i]; + int nr; + + if (!atomic_read(io_get_user_counter(niov))) + continue; + nr = atomic_xchg(io_get_user_counter(niov), 0); + if (nr && !page_pool_unref_netmem(net_iov_to_netmem(niov), nr)) + io_zcrx_return_niov(niov); + } +} + +static void zcrx_unregister_user(struct io_zcrx_ifq *ifq) +{ + if (refcount_dec_and_test(&ifq->user_refs)) { + io_close_queue(ifq); + io_zcrx_scrub(ifq); + } +} + +static void zcrx_unregister(struct io_zcrx_ifq *ifq) +{ + zcrx_unregister_user(ifq); + io_put_zcrx_ifq(ifq); +} + +struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx, + unsigned int id) +{ + struct io_zcrx_ifq *ifq = xa_load(&ctx->zcrx_ctxs, id); + + lockdep_assert_held(&ctx->mmap_lock); + + return ifq ? &ifq->rq_region : NULL; +} + +static int zcrx_box_release(struct inode *inode, struct file *file) +{ + struct io_zcrx_ifq *ifq = file->private_data; + + if (WARN_ON_ONCE(!ifq)) + return -EFAULT; + zcrx_unregister(ifq); + return 0; +} + +static const struct file_operations zcrx_box_fops = { + .owner = THIS_MODULE, + .release = zcrx_box_release, +}; + +static int zcrx_export(struct io_ring_ctx *ctx, struct io_zcrx_ifq *ifq, + struct zcrx_ctrl *ctrl, void __user *arg) +{ + struct zcrx_ctrl_export *ce = &ctrl->zc_export; + struct file *file; + int fd = -1; + + if (!mem_is_zero(ce, sizeof(*ce))) + return -EINVAL; + fd = get_unused_fd_flags(O_CLOEXEC); + if (fd < 0) + return fd; + + ce->zcrx_fd = fd; + if (copy_to_user(arg, ctrl, sizeof(*ctrl))) { + put_unused_fd(fd); + return -EFAULT; + } + + refcount_inc(&ifq->refs); + refcount_inc(&ifq->user_refs); + + file = anon_inode_create_getfile("[zcrx]", &zcrx_box_fops, + ifq, O_CLOEXEC, NULL); + if (IS_ERR(file)) { + put_unused_fd(fd); + zcrx_unregister(ifq); + return PTR_ERR(file); + } + + fd_install(fd, file); + return 0; +} + +static int import_zcrx(struct io_ring_ctx *ctx, + struct io_uring_zcrx_ifq_reg __user *arg, + struct io_uring_zcrx_ifq_reg *reg) +{ + struct io_zcrx_ifq *ifq; + struct file *file; + int fd, ret; + u32 id; + + if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) + return -EINVAL; + if (!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED))) + return -EINVAL; + if (reg->if_rxq || reg->rq_entries || reg->area_ptr || reg->region_ptr) + return -EINVAL; + if (reg->flags & ~ZCRX_REG_IMPORT) + return -EINVAL; + + fd = reg->if_idx; + CLASS(fd, f)(fd); + if (fd_empty(f)) + return -EBADF; + + file = fd_file(f); + if (file->f_op != &zcrx_box_fops || !file->private_data) + return -EBADF; + + ifq = file->private_data; + refcount_inc(&ifq->refs); + refcount_inc(&ifq->user_refs); + + scoped_guard(mutex, &ctx->mmap_lock) { + ret = xa_alloc(&ctx->zcrx_ctxs, &id, NULL, xa_limit_31b, GFP_KERNEL); + if (ret) + goto err; + } + + reg->zcrx_id = id; + io_fill_zcrx_offsets(®->offsets); + if (copy_to_user(arg, reg, sizeof(*reg))) { + ret = -EFAULT; + goto err_xa_erase; + } + + scoped_guard(mutex, &ctx->mmap_lock) { + ret = -ENOMEM; + if (xa_store(&ctx->zcrx_ctxs, id, ifq, GFP_KERNEL)) + goto err_xa_erase; + } + + return 0; +err_xa_erase: + scoped_guard(mutex, &ctx->mmap_lock) + xa_erase(&ctx->zcrx_ctxs, id); +err: + zcrx_unregister(ifq); + return ret; +} + +static int zcrx_register_netdev(struct io_zcrx_ifq *ifq, + struct io_uring_zcrx_ifq_reg *reg, + struct io_uring_zcrx_area_reg *area) +{ + struct pp_memory_provider_params mp_param = {}; + unsigned if_rxq = reg->if_rxq; + int ret; + + ifq->netdev = netdev_get_by_index_lock(current->nsproxy->net_ns, + reg->if_idx); + if (!ifq->netdev) + return -ENODEV; + + netdev_hold(ifq->netdev, &ifq->netdev_tracker, GFP_KERNEL); + + ifq->dev = netdev_queue_get_dma_dev(ifq->netdev, if_rxq, NETDEV_QUEUE_TYPE_RX); + if (!ifq->dev) { + ret = -EOPNOTSUPP; + goto netdev_put_unlock; + } + get_device(ifq->dev); + + ret = io_zcrx_create_area(ifq, area, reg); + if (ret) + goto netdev_put_unlock; + + if (reg->rx_buf_len) + mp_param.rx_page_size = 1U << ifq->niov_shift; + mp_param.mp_ops = &io_uring_pp_zc_ops; + mp_param.mp_priv = ifq; + ret = netif_mp_open_rxq(ifq->netdev, if_rxq, &mp_param, NULL); + if (ret) + goto netdev_put_unlock; + + ifq->if_rxq = if_rxq; + ret = 0; +netdev_put_unlock: + netdev_unlock(ifq->netdev); + return ret; +} + +int io_register_zcrx(struct io_ring_ctx *ctx, + struct io_uring_zcrx_ifq_reg __user *arg) +{ + struct io_uring_zcrx_area_reg area; + struct io_uring_zcrx_ifq_reg reg; + struct io_uring_region_desc rd; + struct io_zcrx_ifq *ifq; + int ret; + u32 id; + + /* + * 1. Interface queue allocation. + * 2. It can observe data destined for sockets of other tasks. + */ + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + /* mandatory io_uring features for zc rx */ + if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) + return -EINVAL; + if (!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED))) + return -EINVAL; + if (copy_from_user(®, arg, sizeof(reg))) + return -EFAULT; + if (!mem_is_zero(®.__resv, sizeof(reg.__resv)) || reg.zcrx_id) + return -EINVAL; + if (reg.flags & ~ZCRX_SUPPORTED_REG_FLAGS) + return -EINVAL; + if (reg.flags & ZCRX_REG_IMPORT) + return import_zcrx(ctx, arg, ®); + if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd))) + return -EFAULT; + if (reg.if_rxq == -1 || !reg.rq_entries) + return -EINVAL; + if ((reg.if_rxq || reg.if_idx) && (reg.flags & ZCRX_REG_NODEV)) + return -EINVAL; + if (reg.rq_entries > IO_RQ_MAX_ENTRIES) { + if (!(ctx->flags & IORING_SETUP_CLAMP)) + return -EINVAL; + reg.rq_entries = IO_RQ_MAX_ENTRIES; + } + reg.rq_entries = roundup_pow_of_two(reg.rq_entries); + + if (copy_from_user(&area, u64_to_user_ptr(reg.area_ptr), sizeof(area))) + return -EFAULT; + + ifq = io_zcrx_ifq_alloc(ctx); + if (!ifq) + return -ENOMEM; + + if (ctx->user) { + get_uid(ctx->user); + ifq->user = ctx->user; + } + if (ctx->mm_account) { + mmgrab(ctx->mm_account); + ifq->mm_account = ctx->mm_account; + } + ifq->rq.nr_entries = reg.rq_entries; + + scoped_guard(mutex, &ctx->mmap_lock) { + /* preallocate id */ + ret = xa_alloc(&ctx->zcrx_ctxs, &id, NULL, xa_limit_31b, GFP_KERNEL); + if (ret) + goto ifq_free; + } + + ret = io_allocate_rbuf_ring(ctx, ifq, ®, &rd, id); + if (ret) + goto err; + + ifq->kern_readable = !(area.flags & IORING_ZCRX_AREA_DMABUF); + + if (!(reg.flags & ZCRX_REG_NODEV)) { + ret = zcrx_register_netdev(ifq, ®, &area); + if (ret) + goto err; + } else { + ret = io_zcrx_create_area(ifq, &area, ®); + if (ret) + goto err; + } + + reg.zcrx_id = id; + + scoped_guard(mutex, &ctx->mmap_lock) { + /* publish ifq */ + ret = -ENOMEM; + if (xa_store(&ctx->zcrx_ctxs, id, ifq, GFP_KERNEL)) + goto err; + } + + reg.rx_buf_len = 1U << ifq->niov_shift; + + if (copy_to_user(arg, ®, sizeof(reg)) || + copy_to_user(u64_to_user_ptr(reg.region_ptr), &rd, sizeof(rd)) || + copy_to_user(u64_to_user_ptr(reg.area_ptr), &area, sizeof(area))) { + ret = -EFAULT; + goto err; + } + return 0; +err: + scoped_guard(mutex, &ctx->mmap_lock) + xa_erase(&ctx->zcrx_ctxs, id); +ifq_free: + zcrx_unregister(ifq); + return ret; +} + +static inline bool is_zcrx_entry_marked(struct io_ring_ctx *ctx, unsigned long id) +{ + return xa_get_mark(&ctx->zcrx_ctxs, id, XA_MARK_1); +} + +static inline void set_zcrx_entry_mark(struct io_ring_ctx *ctx, unsigned long id) +{ + xa_set_mark(&ctx->zcrx_ctxs, id, XA_MARK_1); +} + +void io_terminate_zcrx(struct io_ring_ctx *ctx) +{ + struct io_zcrx_ifq *ifq; + unsigned long id = 0; + + lockdep_assert_held(&ctx->uring_lock); + + while (1) { + scoped_guard(mutex, &ctx->mmap_lock) + ifq = xa_find(&ctx->zcrx_ctxs, &id, ULONG_MAX, XA_PRESENT); + if (!ifq) + break; + if (WARN_ON_ONCE(is_zcrx_entry_marked(ctx, id))) + break; + set_zcrx_entry_mark(ctx, id); + id++; + zcrx_unregister_user(ifq); + } +} + +void io_unregister_zcrx(struct io_ring_ctx *ctx) +{ + struct io_zcrx_ifq *ifq; + + lockdep_assert_held(&ctx->uring_lock); + + while (1) { + scoped_guard(mutex, &ctx->mmap_lock) { + unsigned long id = 0; + + ifq = xa_find(&ctx->zcrx_ctxs, &id, ULONG_MAX, XA_PRESENT); + if (ifq) { + if (WARN_ON_ONCE(!is_zcrx_entry_marked(ctx, id))) { + ifq = NULL; + break; + } + xa_erase(&ctx->zcrx_ctxs, id); + } + } + if (!ifq) + break; + io_put_zcrx_ifq(ifq); + } + + xa_destroy(&ctx->zcrx_ctxs); +} + +static inline u32 zcrx_rq_entries(struct zcrx_rq *rq) +{ + u32 entries; + + entries = smp_load_acquire(&rq->ring->tail) - rq->cached_head; + return min(entries, rq->nr_entries); +} + +static struct io_uring_zcrx_rqe *zcrx_next_rqe(struct zcrx_rq *rq, unsigned mask) +{ + unsigned int idx = rq->cached_head++ & mask; + + return &rq->rqes[idx]; +} + +static inline bool io_parse_rqe(struct io_uring_zcrx_rqe *rqe, + struct io_zcrx_ifq *ifq, + struct net_iov **ret_niov) +{ + __u64 off = READ_ONCE(rqe->off); + unsigned niov_idx, area_idx; + struct io_zcrx_area *area; + + area_idx = off >> IORING_ZCRX_AREA_SHIFT; + niov_idx = (off & ~IORING_ZCRX_AREA_MASK) >> ifq->niov_shift; + + if (unlikely(rqe->__pad || area_idx)) + return false; + area = ifq->area; + + if (unlikely(niov_idx >= area->nia.num_niovs)) + return false; + niov_idx = array_index_nospec(niov_idx, area->nia.num_niovs); + + *ret_niov = &area->nia.niovs[niov_idx]; + return true; +} + +static unsigned io_zcrx_ring_refill(struct page_pool *pp, + struct io_zcrx_ifq *ifq, + netmem_ref *netmems, unsigned to_alloc) +{ + struct zcrx_rq *rq = &ifq->rq; + unsigned int mask = rq->nr_entries - 1; + unsigned int entries; + unsigned allocated = 0; + + guard(spinlock_bh)(&rq->lock); + + entries = zcrx_rq_entries(rq); + entries = min_t(unsigned, entries, to_alloc); + if (unlikely(!entries)) + return 0; + + do { + struct io_uring_zcrx_rqe *rqe = zcrx_next_rqe(rq, mask); + struct net_iov *niov; + netmem_ref netmem; + + if (!io_parse_rqe(rqe, ifq, &niov)) + continue; + if (!io_zcrx_put_niov_uref(niov)) + continue; + + netmem = net_iov_to_netmem(niov); + if (!page_pool_unref_and_test(netmem)) + continue; + + if (unlikely(niov->desc.pp != pp)) { + io_zcrx_return_niov(niov); + continue; + } + + netmems[allocated] = netmem; + allocated++; + } while (--entries); + + smp_store_release(&rq->ring->head, rq->cached_head); + return allocated; +} + +static unsigned io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq, + netmem_ref *netmems, unsigned to_alloc) +{ + struct io_zcrx_area *area = ifq->area; + unsigned allocated = 0; + + guard(spinlock_bh)(&area->freelist_lock); + + for (allocated = 0; allocated < to_alloc; allocated++) { + struct net_iov *niov = zcrx_get_free_niov(area); + + if (!niov) + break; + net_mp_niov_set_page_pool(pp, niov); + netmems[allocated] = net_iov_to_netmem(niov); + } + return allocated; +} + +static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp) +{ + struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp); + netmem_ref *netmems = pp->alloc.cache; + unsigned to_alloc = PP_ALLOC_CACHE_REFILL; + unsigned allocated; + + /* pp should already be ensuring that */ + if (WARN_ON_ONCE(pp->alloc.count)) + return 0; + + allocated = io_zcrx_ring_refill(pp, ifq, netmems, to_alloc); + if (likely(allocated)) + goto out_return; + + allocated = io_zcrx_refill_slow(pp, ifq, netmems, to_alloc); + if (!allocated) + return 0; +out_return: + zcrx_sync_for_device(pp, ifq, netmems, allocated); + allocated--; + pp->alloc.count += allocated; + return netmems[allocated]; +} + +static bool io_pp_zc_release_netmem(struct page_pool *pp, netmem_ref netmem) +{ + struct net_iov *niov; + + if (WARN_ON_ONCE(!netmem_is_net_iov(netmem))) + return false; + + niov = netmem_to_net_iov(netmem); + net_mp_niov_clear_page_pool(niov); + io_zcrx_return_niov_freelist(niov); + return false; +} + +static int io_pp_zc_init(struct page_pool *pp) +{ + struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp); + + if (WARN_ON_ONCE(!ifq)) + return -EINVAL; + if (WARN_ON_ONCE(ifq->dev != pp->p.dev)) + return -EINVAL; + if (WARN_ON_ONCE(!pp->dma_map)) + return -EOPNOTSUPP; + if (pp->p.order + PAGE_SHIFT != ifq->niov_shift) + return -EINVAL; + if (pp->p.dma_dir != DMA_FROM_DEVICE) + return -EOPNOTSUPP; + + refcount_inc(&ifq->refs); + return 0; +} + +static void io_pp_zc_destroy(struct page_pool *pp) +{ + io_put_zcrx_ifq(io_pp_to_ifq(pp)); +} + +static int io_pp_nl_fill(void *mp_priv, struct sk_buff *rsp, + struct netdev_rx_queue *rxq) +{ + struct nlattr *nest; + int type; + + type = rxq ? NETDEV_A_QUEUE_IO_URING : NETDEV_A_PAGE_POOL_IO_URING; + nest = nla_nest_start(rsp, type); + if (!nest) + return -EMSGSIZE; + nla_nest_end(rsp, nest); + + return 0; +} + +static void io_pp_uninstall(void *mp_priv, struct netdev_rx_queue *rxq) +{ + struct pp_memory_provider_params *p = &rxq->mp_params; + struct io_zcrx_ifq *ifq = mp_priv; + + io_zcrx_drop_netdev(ifq); + if (ifq->area) + io_zcrx_unmap_area(ifq, ifq->area); + + p->mp_ops = NULL; + p->mp_priv = NULL; +} + +static const struct memory_provider_ops io_uring_pp_zc_ops = { + .alloc_netmems = io_pp_zc_alloc_netmems, + .release_netmem = io_pp_zc_release_netmem, + .init = io_pp_zc_init, + .destroy = io_pp_zc_destroy, + .nl_fill = io_pp_nl_fill, + .uninstall = io_pp_uninstall, +}; + +static unsigned zcrx_parse_rq(netmem_ref *netmem_array, unsigned nr, + struct io_zcrx_ifq *zcrx, struct zcrx_rq *rq) +{ + unsigned int mask = rq->nr_entries - 1; + unsigned int i; + + nr = min(nr, zcrx_rq_entries(rq)); + for (i = 0; i < nr; i++) { + struct io_uring_zcrx_rqe *rqe = zcrx_next_rqe(rq, mask); + struct net_iov *niov; + + if (!io_parse_rqe(rqe, zcrx, &niov)) + break; + netmem_array[i] = net_iov_to_netmem(niov); + } + + smp_store_release(&rq->ring->head, rq->cached_head); + return i; +} + +#define ZCRX_FLUSH_BATCH 32 + +static void zcrx_return_buffers(netmem_ref *netmems, unsigned nr) +{ + unsigned i; + + for (i = 0; i < nr; i++) { + netmem_ref netmem = netmems[i]; + struct net_iov *niov = netmem_to_net_iov(netmem); + + if (!io_zcrx_put_niov_uref(niov)) + continue; + if (!page_pool_unref_and_test(netmem)) + continue; + io_zcrx_return_niov(niov); + } +} + +static int zcrx_flush_rq(struct io_ring_ctx *ctx, struct io_zcrx_ifq *zcrx, + struct zcrx_ctrl *ctrl) +{ + struct zcrx_ctrl_flush_rq *frq = &ctrl->zc_flush; + netmem_ref netmems[ZCRX_FLUSH_BATCH]; + unsigned total = 0; + unsigned nr; + + if (!mem_is_zero(&frq->__resv, sizeof(frq->__resv))) + return -EINVAL; + + do { + struct zcrx_rq *rq = &zcrx->rq; + + scoped_guard(spinlock_bh, &rq->lock) { + nr = zcrx_parse_rq(netmems, ZCRX_FLUSH_BATCH, zcrx, rq); + zcrx_return_buffers(netmems, nr); + } + + total += nr; + + if (fatal_signal_pending(current)) + break; + cond_resched(); + } while (nr == ZCRX_FLUSH_BATCH && total < zcrx->rq.nr_entries); + + return 0; +} + +int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) +{ + struct zcrx_ctrl ctrl; + struct io_zcrx_ifq *zcrx; + + BUILD_BUG_ON(sizeof(ctrl.zc_export) != sizeof(ctrl.zc_flush)); + + if (nr_args) + return -EINVAL; + if (copy_from_user(&ctrl, arg, sizeof(ctrl))) + return -EFAULT; + if (!mem_is_zero(&ctrl.__resv, sizeof(ctrl.__resv))) + return -EFAULT; + + zcrx = xa_load(&ctx->zcrx_ctxs, ctrl.zcrx_id); + if (!zcrx) + return -ENXIO; + + switch (ctrl.op) { + case ZCRX_CTRL_FLUSH_RQ: + return zcrx_flush_rq(ctx, zcrx, &ctrl); + case ZCRX_CTRL_EXPORT: + return zcrx_export(ctx, zcrx, &ctrl, arg); + } + + return -EOPNOTSUPP; +} + +static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov, + struct io_zcrx_ifq *ifq, int off, int len) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_uring_zcrx_cqe *rcqe; + struct io_zcrx_area *area; + struct io_uring_cqe *cqe; + u64 offset; + + if (!io_defer_get_uncommited_cqe(ctx, &cqe)) + return false; + + cqe->user_data = req->cqe.user_data; + cqe->res = len; + cqe->flags = IORING_CQE_F_MORE; + if (ctx->flags & IORING_SETUP_CQE_MIXED) + cqe->flags |= IORING_CQE_F_32; + + area = io_zcrx_iov_to_area(niov); + offset = off + (net_iov_idx(niov) << ifq->niov_shift); + rcqe = (struct io_uring_zcrx_cqe *)(cqe + 1); + rcqe->off = offset + ((u64)area->area_id << IORING_ZCRX_AREA_SHIFT); + rcqe->__pad = 0; + return true; +} + +static struct net_iov *io_alloc_fallback_niov(struct io_zcrx_ifq *ifq) +{ + struct io_zcrx_area *area = ifq->area; + struct net_iov *niov = NULL; + + if (!ifq->kern_readable) + return NULL; + + scoped_guard(spinlock_bh, &area->freelist_lock) + niov = zcrx_get_free_niov(area); + + if (niov) + page_pool_fragment_netmem(net_iov_to_netmem(niov), 1); + return niov; +} + +struct io_copy_cache { + struct page *page; + unsigned long offset; + size_t size; +}; + +static ssize_t io_copy_page(struct io_copy_cache *cc, struct page *src_page, + unsigned int src_offset, size_t len) +{ + size_t copied = 0; + + len = min(len, cc->size); + + while (len) { + void *src_addr, *dst_addr; + struct page *dst_page = cc->page; + unsigned dst_offset = cc->offset; + size_t n = len; + + if (folio_test_partial_kmap(page_folio(dst_page)) || + folio_test_partial_kmap(page_folio(src_page))) { + dst_page += dst_offset / PAGE_SIZE; + dst_offset = offset_in_page(dst_offset); + src_page += src_offset / PAGE_SIZE; + src_offset = offset_in_page(src_offset); + n = min(PAGE_SIZE - src_offset, PAGE_SIZE - dst_offset); + n = min(n, len); + } + + dst_addr = kmap_local_page(dst_page) + dst_offset; + src_addr = kmap_local_page(src_page) + src_offset; + + memcpy(dst_addr, src_addr, n); + + kunmap_local(src_addr); + kunmap_local(dst_addr); + + cc->size -= n; + cc->offset += n; + src_offset += n; + len -= n; + copied += n; + } + return copied; +} + +static ssize_t io_zcrx_copy_chunk(struct io_kiocb *req, struct io_zcrx_ifq *ifq, + struct page *src_page, unsigned int src_offset, + size_t len) +{ + size_t copied = 0; + int ret = 0; + + while (len) { + struct io_copy_cache cc; + struct net_iov *niov; + size_t n; + + niov = io_alloc_fallback_niov(ifq); + if (!niov) { + ret = -ENOMEM; + break; + } + + cc.page = io_zcrx_iov_page(niov); + cc.offset = 0; + cc.size = PAGE_SIZE; + + n = io_copy_page(&cc, src_page, src_offset, len); + + if (!io_zcrx_queue_cqe(req, niov, ifq, 0, n)) { + io_zcrx_return_niov(niov); + ret = -ENOSPC; + break; + } + + io_zcrx_get_niov_uref(niov); + src_offset += n; + len -= n; + copied += n; + } + + return copied ? copied : ret; +} + +static int io_zcrx_copy_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq, + const skb_frag_t *frag, int off, int len) +{ + struct page *page = skb_frag_page(frag); + + return io_zcrx_copy_chunk(req, ifq, page, off + skb_frag_off(frag), len); +} + +static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq, + const skb_frag_t *frag, int off, int len) +{ + struct net_iov *niov; + struct page_pool *pp; + + if (unlikely(!skb_frag_is_net_iov(frag))) + return io_zcrx_copy_frag(req, ifq, frag, off, len); + + niov = netmem_to_net_iov(frag->netmem); + pp = niov->desc.pp; + + if (!pp || pp->mp_ops != &io_uring_pp_zc_ops || io_pp_to_ifq(pp) != ifq) + return -EFAULT; + + if (!io_zcrx_queue_cqe(req, niov, ifq, off + skb_frag_off(frag), len)) + return -ENOSPC; + + /* + * Prevent it from being recycled while user is accessing it. + * It has to be done before grabbing a user reference. + */ + page_pool_ref_netmem(net_iov_to_netmem(niov)); + io_zcrx_get_niov_uref(niov); + return len; +} + +static int +io_zcrx_recv_skb(read_descriptor_t *desc, struct sk_buff *skb, + unsigned int offset, size_t len) +{ + struct io_zcrx_args *args = desc->arg.data; + struct io_zcrx_ifq *ifq = args->ifq; + struct io_kiocb *req = args->req; + struct sk_buff *frag_iter; + unsigned start, start_off = offset; + int i, copy, end, off; + int ret = 0; + + len = min_t(size_t, len, desc->count); + /* + * __tcp_read_sock() always calls io_zcrx_recv_skb one last time, even + * if desc->count is already 0. This is caused by the if (offset + 1 != + * skb->len) check. Return early in this case to break out of + * __tcp_read_sock(). + */ + if (!len) + return 0; + if (unlikely(args->nr_skbs++ > IO_SKBS_PER_CALL_LIMIT)) + return -EAGAIN; + + if (unlikely(offset < skb_headlen(skb))) { + ssize_t copied; + size_t to_copy; + + to_copy = min_t(size_t, skb_headlen(skb) - offset, len); + copied = io_zcrx_copy_chunk(req, ifq, virt_to_page(skb->data), + offset_in_page(skb->data) + offset, + to_copy); + if (copied < 0) { + ret = copied; + goto out; + } + offset += copied; + len -= copied; + if (!len) + goto out; + if (offset != skb_headlen(skb)) + goto out; + } + + start = skb_headlen(skb); + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + const skb_frag_t *frag; + + if (WARN_ON(start > offset + len)) + return -EFAULT; + + frag = &skb_shinfo(skb)->frags[i]; + end = start + skb_frag_size(frag); + + if (offset < end) { + copy = end - offset; + if (copy > len) + copy = len; + + off = offset - start; + ret = io_zcrx_recv_frag(req, ifq, frag, off, copy); + if (ret < 0) + goto out; + + offset += ret; + len -= ret; + if (len == 0 || ret != copy) + goto out; + } + start = end; + } + + skb_walk_frags(skb, frag_iter) { + if (WARN_ON(start > offset + len)) + return -EFAULT; + + end = start + frag_iter->len; + if (offset < end) { + size_t count; + + copy = end - offset; + if (copy > len) + copy = len; + + off = offset - start; + count = desc->count; + ret = io_zcrx_recv_skb(desc, frag_iter, off, copy); + desc->count = count; + if (ret < 0) + goto out; + + offset += ret; + len -= ret; + if (len == 0 || ret != copy) + goto out; + } + start = end; + } + +out: + if (offset == start_off) + return ret; + desc->count -= (offset - start_off); + return offset - start_off; +} + +static int io_zcrx_tcp_recvmsg(struct io_kiocb *req, struct io_zcrx_ifq *ifq, + struct sock *sk, int flags, + unsigned issue_flags, unsigned int *outlen) +{ + unsigned int len = *outlen; + struct io_zcrx_args args = { + .req = req, + .ifq = ifq, + .sock = sk->sk_socket, + }; + read_descriptor_t rd_desc = { + .count = len ? len : UINT_MAX, + .arg.data = &args, + }; + int ret; + + lock_sock(sk); + ret = tcp_read_sock(sk, &rd_desc, io_zcrx_recv_skb); + if (len && ret > 0) + *outlen = len - ret; + if (ret <= 0) { + if (ret < 0 || sock_flag(sk, SOCK_DONE)) + goto out; + if (sk->sk_err) + ret = sock_error(sk); + else if (sk->sk_shutdown & RCV_SHUTDOWN) + goto out; + else if (sk->sk_state == TCP_CLOSE) + ret = -ENOTCONN; + else + ret = -EAGAIN; + } else if (unlikely(args.nr_skbs > IO_SKBS_PER_CALL_LIMIT) && + (issue_flags & IO_URING_F_MULTISHOT)) { + ret = IOU_REQUEUE; + } else if (sock_flag(sk, SOCK_DONE)) { + /* Make it to retry until it finally gets 0. */ + if (issue_flags & IO_URING_F_MULTISHOT) + ret = IOU_REQUEUE; + else + ret = -EAGAIN; + } +out: + release_sock(sk); + return ret; +} + +int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq, + struct socket *sock, unsigned int flags, + unsigned issue_flags, unsigned int *len) +{ + struct sock *sk = sock->sk; + const struct proto *prot = READ_ONCE(sk->sk_prot); + + if (prot->recvmsg != tcp_recvmsg) + return -EPROTONOSUPPORT; + + sock_rps_record_flow(sk); + return io_zcrx_tcp_recvmsg(req, ifq, sk, flags, issue_flags, len); +} diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h new file mode 100644 index 000000000000..75e0a4e6ef6e --- /dev/null +++ b/io_uring/zcrx.h @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef IOU_ZC_RX_H +#define IOU_ZC_RX_H + +#include <linux/io_uring_types.h> +#include <linux/dma-buf.h> +#include <linux/socket.h> +#include <net/page_pool/types.h> +#include <net/net_trackers.h> + +#define ZCRX_SUPPORTED_REG_FLAGS (ZCRX_REG_IMPORT | ZCRX_REG_NODEV) +#define ZCRX_FEATURES (ZCRX_FEATURE_RX_PAGE_SIZE) + +struct io_zcrx_mem { + unsigned long size; + bool is_dmabuf; + + struct page **pages; + unsigned long nr_folios; + struct sg_table page_sg_table; + unsigned long account_pages; + struct sg_table *sgt; + + struct dma_buf_attachment *attach; + struct dma_buf *dmabuf; +}; + +struct io_zcrx_area { + struct net_iov_area nia; + struct io_zcrx_ifq *ifq; + atomic_t *user_refs; + + bool is_mapped; + u16 area_id; + + /* freelist */ + spinlock_t freelist_lock ____cacheline_aligned_in_smp; + u32 free_count; + u32 *freelist; + + struct io_zcrx_mem mem; +}; + +struct zcrx_rq { + spinlock_t lock; + struct io_uring *ring; + struct io_uring_zcrx_rqe *rqes; + u32 cached_head; + u32 nr_entries; +}; + +struct io_zcrx_ifq { + struct io_zcrx_area *area; + unsigned niov_shift; + struct user_struct *user; + struct mm_struct *mm_account; + bool kern_readable; + + struct zcrx_rq rq ____cacheline_aligned_in_smp; + + u32 if_rxq; + struct device *dev; + struct net_device *netdev; + netdevice_tracker netdev_tracker; + refcount_t refs; + /* counts userspace facing users like io_uring */ + refcount_t user_refs; + + /* + * Page pool and net configuration lock, can be taken deeper in the + * net stack. + */ + struct mutex pp_lock; + struct io_mapped_region rq_region; +}; + +#if defined(CONFIG_IO_URING_ZCRX) +int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_arg); +int io_register_zcrx(struct io_ring_ctx *ctx, + struct io_uring_zcrx_ifq_reg __user *arg); +void io_unregister_zcrx(struct io_ring_ctx *ctx); +void io_terminate_zcrx(struct io_ring_ctx *ctx); +int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq, + struct socket *sock, unsigned int flags, + unsigned issue_flags, unsigned int *len); +struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx, + unsigned int id); +#else +static inline int io_register_zcrx(struct io_ring_ctx *ctx, + struct io_uring_zcrx_ifq_reg __user *arg) +{ + return -EOPNOTSUPP; +} +static inline void io_unregister_zcrx(struct io_ring_ctx *ctx) +{ +} +static inline void io_terminate_zcrx(struct io_ring_ctx *ctx) +{ +} +static inline int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq, + struct socket *sock, unsigned int flags, + unsigned issue_flags, unsigned int *len) +{ + return -EOPNOTSUPP; +} +static inline struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx, + unsigned int id) +{ + return NULL; +} +static inline int io_zcrx_ctrl(struct io_ring_ctx *ctx, + void __user *arg, unsigned nr_arg) +{ + return -EOPNOTSUPP; +} +#endif + +int io_recvzc(struct io_kiocb *req, unsigned int issue_flags); +int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); + +#endif |
