From ea011ee10231f5fa6cbb415007048ca0bb948baf Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 2 Dec 2022 17:47:22 +0000 Subject: io_uring: protect cq_timeouts with timeout_lock Read cq_timeouts in io_flush_timeouts() only after taking the timeout_lock, as it's protected by it. There are many places where we also grab ->completion_lock, but for instance io_timeout_fn() doesn't and still modifies cq_timeouts. Cc: stable@vger.kernel.org Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/9c79544dd6cf5c4018cb1bab99cf481a93ea46ef.1670002973.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/timeout.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 5b4bc93fd6e0..4c6a5666541c 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -72,10 +72,12 @@ static bool io_kill_timeout(struct io_kiocb *req, int status) __cold void io_flush_timeouts(struct io_ring_ctx *ctx) __must_hold(&ctx->completion_lock) { - u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); + u32 seq; struct io_timeout *timeout, *tmp; spin_lock_irq(&ctx->timeout_lock); + seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); + list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { struct io_kiocb *req = cmd_to_io_kiocb(timeout); u32 events_needed, events_got; -- cgit v1.2.3 From 6971253f078766543c716db708ba2c787826690d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 2 Dec 2022 17:47:23 +0000 Subject: io_uring: revise completion_lock locking io_kill_timeouts() doesn't post any events but queues everything to task_work. Locking there is needed for protecting linked requests traversing, we should grab completion_lock directly instead of using io_cq_[un]lock helpers. Same goes for __io_req_find_next_prep(). Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/88e75d481a65dc295cb59722bb1cf76402d1c06b.1670002973.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 16 ++++++++++++++-- io_uring/io_uring.h | 11 ----------- io_uring/timeout.c | 8 ++++++-- 3 files changed, 20 insertions(+), 15 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index b521186efa5c..698c54f951ea 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -597,6 +597,18 @@ static inline void __io_cq_unlock(struct io_ring_ctx *ctx) spin_unlock(&ctx->completion_lock); } +static inline void io_cq_lock(struct io_ring_ctx *ctx) + __acquires(ctx->completion_lock) +{ + spin_lock(&ctx->completion_lock); +} + +static inline void io_cq_unlock(struct io_ring_ctx *ctx) + __releases(ctx->completion_lock) +{ + spin_unlock(&ctx->completion_lock); +} + /* keep it inlined for io_submit_flush_completions() */ static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx) __releases(ctx->completion_lock) @@ -1074,9 +1086,9 @@ static void __io_req_find_next_prep(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - io_cq_lock(ctx); + spin_lock(&ctx->completion_lock); io_disarm_next(req); - io_cq_unlock_post(ctx); + spin_unlock(&ctx->completion_lock); } static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 1b2f0b2cc888..c117e029c8dc 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -87,17 +87,6 @@ static inline void io_req_task_work_add(struct io_kiocb *req) #define io_for_each_link(pos, head) \ for (pos = (head); pos; pos = pos->link) -static inline void io_cq_lock(struct io_ring_ctx *ctx) - __acquires(ctx->completion_lock) -{ - spin_lock(&ctx->completion_lock); -} - -static inline void io_cq_unlock(struct io_ring_ctx *ctx) -{ - spin_unlock(&ctx->completion_lock); -} - void io_cq_unlock_post(struct io_ring_ctx *ctx); static inline struct io_uring_cqe *io_get_cqe_overflow(struct io_ring_ctx *ctx, diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 4c6a5666541c..eae005b2d1d2 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -624,7 +624,11 @@ __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, struct io_timeout *timeout, *tmp; int canceled = 0; - io_cq_lock(ctx); + /* + * completion_lock is needed for io_match_task(). Take it before + * timeout_lockfirst to keep locking ordering. + */ + spin_lock(&ctx->completion_lock); spin_lock_irq(&ctx->timeout_lock); list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { struct io_kiocb *req = cmd_to_io_kiocb(timeout); @@ -634,6 +638,6 @@ __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, canceled++; } spin_unlock_irq(&ctx->timeout_lock); - io_cq_unlock_post(ctx); + spin_unlock(&ctx->completion_lock); return canceled != 0; } -- cgit v1.2.3 From e5f30f6fb29a0b8fa7ca784e44571a610b949b04 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 2 Dec 2022 17:47:24 +0000 Subject: io_uring: ease timeout flush locking requirements We don't need completion_lock for timeout flushing, don't take it. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1e3dc657975ac445b80e7bdc40050db783a5935a.1670002973.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 9 ++++----- io_uring/timeout.c | 2 -- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 698c54f951ea..fc64072c53eb 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -572,12 +572,11 @@ static void io_eventfd_flush_signal(struct io_ring_ctx *ctx) void __io_commit_cqring_flush(struct io_ring_ctx *ctx) { - if (ctx->off_timeout_used || ctx->drain_active) { + if (ctx->off_timeout_used) + io_flush_timeouts(ctx); + if (ctx->drain_active) { spin_lock(&ctx->completion_lock); - if (ctx->off_timeout_used) - io_flush_timeouts(ctx); - if (ctx->drain_active) - io_queue_deferred(ctx); + io_queue_deferred(ctx); spin_unlock(&ctx->completion_lock); } if (ctx->has_evfd) diff --git a/io_uring/timeout.c b/io_uring/timeout.c index eae005b2d1d2..826a51bca3e4 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -50,7 +50,6 @@ static inline void io_put_req(struct io_kiocb *req) } static bool io_kill_timeout(struct io_kiocb *req, int status) - __must_hold(&req->ctx->completion_lock) __must_hold(&req->ctx->timeout_lock) { struct io_timeout_data *io = req->async_data; @@ -70,7 +69,6 @@ static bool io_kill_timeout(struct io_kiocb *req, int status) } __cold void io_flush_timeouts(struct io_ring_ctx *ctx) - __must_hold(&ctx->completion_lock) { u32 seq; struct io_timeout *timeout, *tmp; -- cgit v1.2.3 From a8cf95f93610eb8282f8b6d0117ba78b74588d6b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 2 Dec 2022 17:47:25 +0000 Subject: io_uring: fix overflow handling regression Because the single task locking series got reordered ahead of the timeout and completion lock changes, two hunks inadvertently ended up using __io_fill_cqe_req() rather than io_fill_cqe_req(). This meant that we dropped overflow handling in those two spots. Reinstate the correct CQE filling helper. Fixes: f66f73421f0a ("io_uring: skip spinlocking for ->task_complete") Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 +- io_uring/rw.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index fc64072c53eb..4601e48a173d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -927,7 +927,7 @@ static void __io_req_complete_post(struct io_kiocb *req) io_cq_lock(ctx); if (!(req->flags & REQ_F_CQE_SKIP)) - __io_fill_cqe_req(ctx, req); + io_fill_cqe_req(ctx, req); /* * If we're the last reference to this request, add to our locked diff --git a/io_uring/rw.c b/io_uring/rw.c index b9cac5706e8d..8227af2e1c0f 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -1062,7 +1062,7 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) continue; req->cqe.flags = io_put_kbuf(req, 0); - __io_fill_cqe_req(req->ctx, req); + io_fill_cqe_req(req->ctx, req); } if (unlikely(!nr_events)) -- cgit v1.2.3 From 44a84da45272b3f4beb90025a64cfbde18f1aef0 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 15 Dec 2022 10:41:38 -0800 Subject: io_uring: use call_rcu_hurry if signaling an eventfd io_uring uses call_rcu in the case it needs to signal an eventfd as a result of an eventfd signal, since recursing eventfd signals are not allowed. This should be calling the new call_rcu_hurry API to not delay the signal. Signed-off-by: Dylan Yudaken Cc: Joel Fernandes (Google) Cc: Paul E. McKenney Acked-by: Paul E. McKenney Reviewed-by: Joel Fernandes (Google) Link: https://lore.kernel.org/r/20221215184138.795576-1-dylany@meta.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 4601e48a173d..16a323a9ff70 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -538,7 +538,7 @@ static void io_eventfd_signal(struct io_ring_ctx *ctx) } else { atomic_inc(&ev_fd->refs); if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops)) - call_rcu(&ev_fd->rcu, io_eventfd_ops); + call_rcu_hurry(&ev_fd->rcu, io_eventfd_ops); else atomic_dec(&ev_fd->refs); } -- cgit v1.2.3 From 6434ec0186b80c734aa7a2acf95f75f5c6dd943b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 17 Dec 2022 13:40:17 -0700 Subject: io_uring: don't use TIF_NOTIFY_SIGNAL to test for availability of task_work Use task_work_pending() as a better test for whether we have task_work or not, TIF_NOTIFY_SIGNAL is only valid if the any of the task_work items had been queued with TWA_SIGNAL as the notification mechanism. Hence task_work_pending() is a more reliable check. Signed-off-by: Jens Axboe --- io_uring/io_uring.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index c117e029c8dc..e9f0d41ebb99 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -266,8 +266,7 @@ static inline int io_run_task_work(void) static inline bool io_task_work_pending(struct io_ring_ctx *ctx) { - return test_thread_flag(TIF_NOTIFY_SIGNAL) || - !wq_list_empty(&ctx->work_llist); + return task_work_pending(current) || !wq_list_empty(&ctx->work_llist); } static inline int io_run_task_work_ctx(struct io_ring_ctx *ctx) -- cgit v1.2.3 From 35d90f95cfa773b7e3b1f57ba15ce06a470f354c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 17 Dec 2022 13:42:24 -0700 Subject: io_uring: include task_work run after scheduling in wait for events It's quite possible that we got woken up because task_work was queued, and we need to process this task_work to generate the events waited for. If we return to the wait loop without running task_work, we'll end up adding the task to the waitqueue again, only to call io_cqring_wait_schedule() again which will run the task_work. This is less efficient than it could be, as it requires adding to the cq_wait queue again. It also triggers the wakeup path for completions as cq_wait is now non-empty with the task itself, and it'll require another lock grab and deletion to remove ourselves from the waitqueue. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 16a323a9ff70..ff2bbac1a10f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2481,7 +2481,14 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, } if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS)) return -ETIME; - return 1; + + /* + * Run task_work after scheduling. If we got woken because of + * task_work being processed, run it now rather than let the caller + * do another wait loop. + */ + ret = io_run_task_work_sig(ctx); + return ret < 0 ? ret : 1; } /* @@ -2546,6 +2553,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, TASK_INTERRUPTIBLE); ret = io_cqring_wait_schedule(ctx, &iowq, timeout); + if (__io_cqring_events_user(ctx) >= min_events) + break; cond_resched(); } while (ret > 0); -- cgit v1.2.3 From 990a4de57e44f4f4cfc33c90d2ec5d285b7c8342 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 19 Dec 2022 07:28:26 -0700 Subject: io_uring/net: ensure compat import handlers clear free_iov If we're not allocating the vectors because the count is below UIO_FASTIOV, we still do need to properly clear ->free_iov to prevent an erronous free of on-stack data. Reported-by: Jiri Slaby Fixes: 4c17a496a7a0 ("io_uring/net: fix cleanup double free free_iov init") Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- io_uring/net.c | 1 + 1 file changed, 1 insertion(+) diff --git a/io_uring/net.c b/io_uring/net.c index 5229976cb582..f76b688f476e 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -494,6 +494,7 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, if (req->flags & REQ_F_BUFFER_SELECT) { compat_ssize_t clen; + iomsg->free_iov = NULL; if (msg.msg_iovlen == 0) { sr->len = 0; } else if (msg.msg_iovlen > 1) { -- cgit v1.2.3 From 6c3e8955d4bd9811a6e1761eea412a14fb51a2e6 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 19 Dec 2022 15:11:40 +0000 Subject: io_uring/net: fix cleanup after recycle Don't access io_async_msghdr io_netmsg_recycle(), it may be reallocated. Cc: stable@vger.kernel.org Fixes: 9bb66906f23e5 ("io_uring: support multishot in recvmsg") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/9e326f4ad4046ddadf15bf34bf3fa58c6372f6b5.1671461985.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/net.c b/io_uring/net.c index f76b688f476e..fbc34a7c2743 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -820,10 +820,10 @@ retry_multishot: goto retry_multishot; if (mshot_finished) { - io_netmsg_recycle(req, issue_flags); /* fast path, check for non-NULL to avoid function call */ if (kmsg->free_iov) kfree(kmsg->free_iov); + io_netmsg_recycle(req, issue_flags); req->flags &= ~REQ_F_NEED_CLEANUP; } -- cgit v1.2.3 From 5ad70eb27d2b87ec722fedd23638354be37ea0b0 Mon Sep 17 00:00:00 2001 From: Ammar Faizi Date: Mon, 19 Dec 2022 23:45:21 +0700 Subject: MAINTAINERS: io_uring: Add include/trace/events/io_uring.h This header file was introduced in commit c826bd7a743f ("io_uring: add set of tracing events"). It didn't get added to the io_uring maintainers section. Add this header file to the io_uring maintainers section. Signed-off-by: Ammar Faizi Link: https://lore.kernel.org/r/20221219164521.2481728-1-ammar.faizi@intel.com Signed-off-by: Jens Axboe --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index bb77a3ed9d54..aa19c15f82a9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10825,6 +10825,7 @@ T: git git://git.kernel.dk/liburing F: io_uring/ F: include/linux/io_uring.h F: include/linux/io_uring_types.h +F: include/trace/events/io_uring.h F: include/uapi/linux/io_uring.h F: tools/io_uring/ -- cgit v1.2.3