Merge tag 'for-6.13/io_uring-20241118' of git://git.kernel.dk/linux

Pull io_uring updates from Jens Axboe: - Cleanups of the eventfd handling code, making it fully private. - Support for sending a sync message to another ring, without having a ring available to send a normal async message. - Get rid of the separate unlocked hash table, unify everything around the single locked one. - Add support for ring resizing. It can be hard to appropriately size the CQ ring upfront, if the application doesn't know how busy it will be. This results in applications sizing rings for the most busy case, which can be wasteful. With ring resizing, they can start small and grow the ring, if needed. - Add support for fixed wait regions, rather than needing to copy the same wait data tons of times for each wait operation. - Rewrite the resource node handling, which before was serialized per ring. This caused issues with particularly fixed files, where one file waiting on IO could hold up putting and freeing of other unrelated files. Now each node is handled separately. New code is much simpler too, and was a net 250 line reduction in code. - Add support for just doing partial buffer clones, rather than always cloning the entire buffer table. - Series adding static NAPI support, where a specific NAPI instance is used rather than having a list of them available that need lookup. - Add support for mapped regions, and also convert the fixed wait support mentioned above to that concept. This avoids doing special mappings for various planned features, and folds the existing registered wait into that too. - Add support for hybrid IO polling, which is a variant of strict IOPOLL but with an initial sleep delay to avoid spinning too early and wasting resources on devices that aren't necessarily in the < 5 usec category wrt latencies. - Various cleanups and little fixes. * tag 'for-6.13/io_uring-20241118' of git://git.kernel.dk/linux: (79 commits) io_uring/region: fix error codes after failed vmap io_uring: restore back registered wait arguments io_uring: add memory region registration io_uring: introduce concept of memory regions io_uring: temporarily disable registered waits io_uring: disable ENTER_EXT_ARG_REG for IOPOLL io_uring: fortify io_pin_pages with a warning switch io_msg_ring() to CLASS(fd) io_uring: fix invalid hybrid polling ctx leaks io_uring/uring_cmd: fix buffer index retrieval io_uring/rsrc: add & apply io_req_assign_buf_node() io_uring/rsrc: remove '->ctx_ptr' of 'struct io_rsrc_node' io_uring/rsrc: pass 'struct io_ring_ctx' reference to rsrc helpers io_uring: avoid normal tw intermediate fallback io_uring/napi: add static napi tracking strategy io_uring/napi: clean up __io_napi_do_busy_loop io_uring/napi: Use lock guards io_uring/napi: improve __io_napi_add io_uring/napi: fix io_napi_entry RCU accesses io_uring/napi: protect concurrent io_napi_entry timeout accesses ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2024-11-18 17:02:57 -0800
committer: Linus Torvalds <torvalds@linux-foundation.org> 2024-11-18 17:02:57 -0800
commit: 8350142a4b4cedebfa76cd4cc6e5a7ba6a330629 (patch)
tree: 14310648caff08366a28cd6d2b8dab44688a9995 /io_uring/eventfd.c
parent: 77a0cfafa9af9c0d5b43534eb90d530c189edca1 (diff)
parent: a652958888fb1ada3e4f6b548576c2d2c1b60d66 (diff)
download: lwn-8350142a4b4cedebfa76cd4cc6e5a7ba6a330629.tar.gz
lwn-8350142a4b4cedebfa76cd4cc6e5a7ba6a330629.zip
1 files changed, 92 insertions, 45 deletions
diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c
index e37fddd5d9ce..fab936d31ba8 100644
--- a/io_uring/eventfd.c
+++ b/io_uring/eventfd.c
@@ -13,10 +13,12 @@
 
 struct io_ev_fd {
 	struct eventfd_ctx	*cq_ev_fd;
-	unsigned int		eventfd_async: 1;
-	struct rcu_head		rcu;
+	unsigned int		eventfd_async;
+	/* protected by ->completion_lock */
+	unsigned		last_cq_tail;
 	refcount_t		refs;
 	atomic_t		ops;
+	struct rcu_head		rcu;
 };
 
 enum {
@@ -41,14 +43,58 @@ static void io_eventfd_do_signal(struct rcu_head *rcu)
 		io_eventfd_free(rcu);
 }
 
-void io_eventfd_signal(struct io_ring_ctx *ctx)
+static void io_eventfd_put(struct io_ev_fd *ev_fd)
 {
-	struct io_ev_fd *ev_fd = NULL;
+	if (refcount_dec_and_test(&ev_fd->refs))
+		call_rcu(&ev_fd->rcu, io_eventfd_free);
+}
+
+static void io_eventfd_release(struct io_ev_fd *ev_fd, bool put_ref)
+{
+	if (put_ref)
+		io_eventfd_put(ev_fd);
+	rcu_read_unlock();
+}
+
+/*
+ * Returns true if the caller should put the ev_fd reference, false if not.
+ */
+static bool __io_eventfd_signal(struct io_ev_fd *ev_fd)
+{
+	if (eventfd_signal_allowed()) {
+		eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
+		return true;
+	}
+	if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops)) {
+		call_rcu_hurry(&ev_fd->rcu, io_eventfd_do_signal);
+		return false;
+	}
+	return true;
+}
+
+/*
+ * Trigger if eventfd_async isn't set, or if it's set and the caller is
+ * an async worker. If ev_fd isn't valid, obviously return false.
+ */
+static bool io_eventfd_trigger(struct io_ev_fd *ev_fd)
+{
+	if (ev_fd)
+		return !ev_fd->eventfd_async || io_wq_current_is_worker();
+	return false;
+}
+
+/*
+ * On success, returns with an ev_fd reference grabbed and the RCU read
+ * lock held.
+ */
+static struct io_ev_fd *io_eventfd_grab(struct io_ring_ctx *ctx)
+{
+	struct io_ev_fd *ev_fd;
 
 	if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
-		return;
+		return NULL;
 
-	guard(rcu)();
+	rcu_read_lock();
 
 	/*
 	 * rcu_dereference ctx->io_ev_fd once and use it for both for checking
@@ -57,51 +103,53 @@ void io_eventfd_signal(struct io_ring_ctx *ctx)
 	ev_fd = rcu_dereference(ctx->io_ev_fd);
 
 	/*
-	 * Check again if ev_fd exists incase an io_eventfd_unregister call
+	 * Check again if ev_fd exists in case an io_eventfd_unregister call
 	 * completed between the NULL check of ctx->io_ev_fd at the start of
 	 * the function and rcu_read_lock.
 	 */
-	if (unlikely(!ev_fd))
-		return;
-	if (!refcount_inc_not_zero(&ev_fd->refs))
-		return;
-	if (ev_fd->eventfd_async && !io_wq_current_is_worker())
-		goto out;
-
-	if (likely(eventfd_signal_allowed())) {
-		eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
-	} else {
-		if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops)) {
-			call_rcu_hurry(&ev_fd->rcu, io_eventfd_do_signal);
-			return;
-		}
-	}
-out:
-	if (refcount_dec_and_test(&ev_fd->refs))
-		call_rcu(&ev_fd->rcu, io_eventfd_free);
+	if (io_eventfd_trigger(ev_fd) && refcount_inc_not_zero(&ev_fd->refs))
+		return ev_fd;
+
+	rcu_read_unlock();
+	return NULL;
 }
 
-void io_eventfd_flush_signal(struct io_ring_ctx *ctx)
+void io_eventfd_signal(struct io_ring_ctx *ctx)
 {
-	bool skip;
+	struct io_ev_fd *ev_fd;
 
-	spin_lock(&ctx->completion_lock);
+	ev_fd = io_eventfd_grab(ctx);
+	if (ev_fd)
+		io_eventfd_release(ev_fd, __io_eventfd_signal(ev_fd));
+}
 
-	/*
-	 * Eventfd should only get triggered when at least one event has been
-	 * posted. Some applications rely on the eventfd notification count
-	 * only changing IFF a new CQE has been added to the CQ ring. There's
-	 * no depedency on 1:1 relationship between how many times this
-	 * function is called (and hence the eventfd count) and number of CQEs
-	 * posted to the CQ ring.
-	 */
-	skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail;
-	ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
-	spin_unlock(&ctx->completion_lock);
-	if (skip)
-		return;
+void io_eventfd_flush_signal(struct io_ring_ctx *ctx)
+{
+	struct io_ev_fd *ev_fd;
 
-	io_eventfd_signal(ctx);
+	ev_fd = io_eventfd_grab(ctx);
+	if (ev_fd) {
+		bool skip, put_ref = true;
+
+		/*
+		 * Eventfd should only get triggered when at least one event
+		 * has been posted. Some applications rely on the eventfd
+		 * notification count only changing IFF a new CQE has been
+		 * added to the CQ ring. There's no dependency on 1:1
+		 * relationship between how many times this function is called
+		 * (and hence the eventfd count) and number of CQEs posted to
+		 * the CQ ring.
+		 */
+		spin_lock(&ctx->completion_lock);
+		skip = ctx->cached_cq_tail == ev_fd->last_cq_tail;
+		ev_fd->last_cq_tail = ctx->cached_cq_tail;
+		spin_unlock(&ctx->completion_lock);
+
+		if (!skip)
+			put_ref = __io_eventfd_signal(ev_fd);
+
+		io_eventfd_release(ev_fd, put_ref);
+	}
 }
 
 int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
@@ -132,7 +180,7 @@ int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
 	}
 
 	spin_lock(&ctx->completion_lock);
-	ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
+	ev_fd->last_cq_tail = ctx->cached_cq_tail;
 	spin_unlock(&ctx->completion_lock);
 
 	ev_fd->eventfd_async = eventfd_async;
@@ -152,8 +200,7 @@ int io_eventfd_unregister(struct io_ring_ctx *ctx)
 	if (ev_fd) {
 		ctx->has_evfd = false;
 		rcu_assign_pointer(ctx->io_ev_fd, NULL);
-		if (refcount_dec_and_test(&ev_fd->refs))
-			call_rcu(&ev_fd->rcu, io_eventfd_free);
+		io_eventfd_put(ev_fd);
 		return 0;
 	}
author	Linus Torvalds <torvalds@linux-foundation.org>	2024-11-18 17:02:57 -0800
committer	Linus Torvalds <torvalds@linux-foundation.org>	2024-11-18 17:02:57 -0800
commit	8350142a4b4cedebfa76cd4cc6e5a7ba6a330629 (patch)
tree	14310648caff08366a28cd6d2b8dab44688a9995 /io_uring/eventfd.c
parent	77a0cfafa9af9c0d5b43534eb90d530c189edca1 (diff)
parent	a652958888fb1ada3e4f6b548576c2d2c1b60d66 (diff)
download	lwn-8350142a4b4cedebfa76cd4cc6e5a7ba6a330629.tar.gz lwn-8350142a4b4cedebfa76cd4cc6e5a7ba6a330629.zip