summaryrefslogtreecommitdiff
path: root/io_uring/napi.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-11-18 17:02:57 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2024-11-18 17:02:57 -0800
commit8350142a4b4cedebfa76cd4cc6e5a7ba6a330629 (patch)
tree14310648caff08366a28cd6d2b8dab44688a9995 /io_uring/napi.c
parent77a0cfafa9af9c0d5b43534eb90d530c189edca1 (diff)
parenta652958888fb1ada3e4f6b548576c2d2c1b60d66 (diff)
downloadlwn-8350142a4b4cedebfa76cd4cc6e5a7ba6a330629.tar.gz
lwn-8350142a4b4cedebfa76cd4cc6e5a7ba6a330629.zip
Merge tag 'for-6.13/io_uring-20241118' of git://git.kernel.dk/linux
Pull io_uring updates from Jens Axboe: - Cleanups of the eventfd handling code, making it fully private. - Support for sending a sync message to another ring, without having a ring available to send a normal async message. - Get rid of the separate unlocked hash table, unify everything around the single locked one. - Add support for ring resizing. It can be hard to appropriately size the CQ ring upfront, if the application doesn't know how busy it will be. This results in applications sizing rings for the most busy case, which can be wasteful. With ring resizing, they can start small and grow the ring, if needed. - Add support for fixed wait regions, rather than needing to copy the same wait data tons of times for each wait operation. - Rewrite the resource node handling, which before was serialized per ring. This caused issues with particularly fixed files, where one file waiting on IO could hold up putting and freeing of other unrelated files. Now each node is handled separately. New code is much simpler too, and was a net 250 line reduction in code. - Add support for just doing partial buffer clones, rather than always cloning the entire buffer table. - Series adding static NAPI support, where a specific NAPI instance is used rather than having a list of them available that need lookup. - Add support for mapped regions, and also convert the fixed wait support mentioned above to that concept. This avoids doing special mappings for various planned features, and folds the existing registered wait into that too. - Add support for hybrid IO polling, which is a variant of strict IOPOLL but with an initial sleep delay to avoid spinning too early and wasting resources on devices that aren't necessarily in the < 5 usec category wrt latencies. - Various cleanups and little fixes. * tag 'for-6.13/io_uring-20241118' of git://git.kernel.dk/linux: (79 commits) io_uring/region: fix error codes after failed vmap io_uring: restore back registered wait arguments io_uring: add memory region registration io_uring: introduce concept of memory regions io_uring: temporarily disable registered waits io_uring: disable ENTER_EXT_ARG_REG for IOPOLL io_uring: fortify io_pin_pages with a warning switch io_msg_ring() to CLASS(fd) io_uring: fix invalid hybrid polling ctx leaks io_uring/uring_cmd: fix buffer index retrieval io_uring/rsrc: add & apply io_req_assign_buf_node() io_uring/rsrc: remove '->ctx_ptr' of 'struct io_rsrc_node' io_uring/rsrc: pass 'struct io_ring_ctx' reference to rsrc helpers io_uring: avoid normal tw intermediate fallback io_uring/napi: add static napi tracking strategy io_uring/napi: clean up __io_napi_do_busy_loop io_uring/napi: Use lock guards io_uring/napi: improve __io_napi_add io_uring/napi: fix io_napi_entry RCU accesses io_uring/napi: protect concurrent io_napi_entry timeout accesses ...
Diffstat (limited to 'io_uring/napi.c')
-rw-r--r--io_uring/napi.c184
1 files changed, 131 insertions, 53 deletions
diff --git a/io_uring/napi.c b/io_uring/napi.c
index d0cf694d0172..b1ade3fda30f 100644
--- a/io_uring/napi.c
+++ b/io_uring/napi.c
@@ -38,67 +38,88 @@ static inline ktime_t net_to_ktime(unsigned long t)
return ns_to_ktime(t << 10);
}
-void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock)
+int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id)
{
struct hlist_head *hash_list;
- unsigned int napi_id;
- struct sock *sk;
struct io_napi_entry *e;
- sk = sock->sk;
- if (!sk)
- return;
-
- napi_id = READ_ONCE(sk->sk_napi_id);
-
/* Non-NAPI IDs can be rejected. */
if (napi_id < MIN_NAPI_ID)
- return;
+ return -EINVAL;
hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))];
- rcu_read_lock();
- e = io_napi_hash_find(hash_list, napi_id);
- if (e) {
- e->timeout = jiffies + NAPI_TIMEOUT;
- rcu_read_unlock();
- return;
+ scoped_guard(rcu) {
+ e = io_napi_hash_find(hash_list, napi_id);
+ if (e) {
+ WRITE_ONCE(e->timeout, jiffies + NAPI_TIMEOUT);
+ return -EEXIST;
+ }
}
- rcu_read_unlock();
e = kmalloc(sizeof(*e), GFP_NOWAIT);
if (!e)
- return;
+ return -ENOMEM;
e->napi_id = napi_id;
e->timeout = jiffies + NAPI_TIMEOUT;
+ /*
+ * guard(spinlock) is not used to manually unlock it before calling
+ * kfree()
+ */
spin_lock(&ctx->napi_lock);
if (unlikely(io_napi_hash_find(hash_list, napi_id))) {
spin_unlock(&ctx->napi_lock);
kfree(e);
- return;
+ return -EEXIST;
}
hlist_add_tail_rcu(&e->node, hash_list);
- list_add_tail(&e->list, &ctx->napi_list);
+ list_add_tail_rcu(&e->list, &ctx->napi_list);
spin_unlock(&ctx->napi_lock);
+ return 0;
+}
+
+static int __io_napi_del_id(struct io_ring_ctx *ctx, unsigned int napi_id)
+{
+ struct hlist_head *hash_list;
+ struct io_napi_entry *e;
+
+ /* Non-NAPI IDs can be rejected. */
+ if (napi_id < MIN_NAPI_ID)
+ return -EINVAL;
+
+ hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))];
+ guard(spinlock)(&ctx->napi_lock);
+ e = io_napi_hash_find(hash_list, napi_id);
+ if (!e)
+ return -ENOENT;
+
+ list_del_rcu(&e->list);
+ hash_del_rcu(&e->node);
+ kfree_rcu(e, rcu);
+ return 0;
}
static void __io_napi_remove_stale(struct io_ring_ctx *ctx)
{
struct io_napi_entry *e;
- unsigned int i;
- spin_lock(&ctx->napi_lock);
- hash_for_each(ctx->napi_ht, i, e, node) {
- if (time_after(jiffies, e->timeout)) {
- list_del(&e->list);
+ guard(spinlock)(&ctx->napi_lock);
+ /*
+ * list_for_each_entry_safe() is not required as long as:
+ * 1. list_del_rcu() does not reset the deleted node next pointer
+ * 2. kfree_rcu() delays the memory freeing until the next quiescent
+ * state
+ */
+ list_for_each_entry(e, &ctx->napi_list, list) {
+ if (time_after(jiffies, READ_ONCE(e->timeout))) {
+ list_del_rcu(&e->list);
hash_del_rcu(&e->node);
kfree_rcu(e, rcu);
}
}
- spin_unlock(&ctx->napi_lock);
}
static inline void io_napi_remove_stale(struct io_ring_ctx *ctx, bool is_stale)
@@ -136,45 +157,73 @@ static bool io_napi_busy_loop_should_end(void *data,
return false;
}
-static bool __io_napi_do_busy_loop(struct io_ring_ctx *ctx,
- void *loop_end_arg)
+/*
+ * never report stale entries
+ */
+static bool static_tracking_do_busy_loop(struct io_ring_ctx *ctx,
+ bool (*loop_end)(void *, unsigned long),
+ void *loop_end_arg)
{
struct io_napi_entry *e;
- bool (*loop_end)(void *, unsigned long) = NULL;
- bool is_stale = false;
- if (loop_end_arg)
- loop_end = io_napi_busy_loop_should_end;
+ list_for_each_entry_rcu(e, &ctx->napi_list, list)
+ napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg,
+ ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET);
+ return false;
+}
+
+static bool
+dynamic_tracking_do_busy_loop(struct io_ring_ctx *ctx,
+ bool (*loop_end)(void *, unsigned long),
+ void *loop_end_arg)
+{
+ struct io_napi_entry *e;
+ bool is_stale = false;
list_for_each_entry_rcu(e, &ctx->napi_list, list) {
napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg,
ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET);
- if (time_after(jiffies, e->timeout))
+ if (time_after(jiffies, READ_ONCE(e->timeout)))
is_stale = true;
}
return is_stale;
}
+static inline bool
+__io_napi_do_busy_loop(struct io_ring_ctx *ctx,
+ bool (*loop_end)(void *, unsigned long),
+ void *loop_end_arg)
+{
+ if (READ_ONCE(ctx->napi_track_mode) == IO_URING_NAPI_TRACKING_STATIC)
+ return static_tracking_do_busy_loop(ctx, loop_end, loop_end_arg);
+ return dynamic_tracking_do_busy_loop(ctx, loop_end, loop_end_arg);
+}
+
static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx,
struct io_wait_queue *iowq)
{
unsigned long start_time = busy_loop_current_time();
+ bool (*loop_end)(void *, unsigned long) = NULL;
void *loop_end_arg = NULL;
bool is_stale = false;
/* Singular lists use a different napi loop end check function and are
* only executed once.
*/
- if (list_is_singular(&ctx->napi_list))
+ if (list_is_singular(&ctx->napi_list)) {
+ loop_end = io_napi_busy_loop_should_end;
loop_end_arg = iowq;
+ }
- rcu_read_lock();
- do {
- is_stale = __io_napi_do_busy_loop(ctx, loop_end_arg);
- } while (!io_napi_busy_loop_should_end(iowq, start_time) && !loop_end_arg);
- rcu_read_unlock();
+ scoped_guard(rcu) {
+ do {
+ is_stale = __io_napi_do_busy_loop(ctx, loop_end,
+ loop_end_arg);
+ } while (!io_napi_busy_loop_should_end(iowq, start_time) &&
+ !loop_end_arg);
+ }
io_napi_remove_stale(ctx, is_stale);
}
@@ -193,6 +242,7 @@ void io_napi_init(struct io_ring_ctx *ctx)
spin_lock_init(&ctx->napi_lock);
ctx->napi_prefer_busy_poll = false;
ctx->napi_busy_poll_dt = ns_to_ktime(sys_dt);
+ ctx->napi_track_mode = IO_URING_NAPI_TRACKING_INACTIVE;
}
/*
@@ -204,14 +254,31 @@ void io_napi_init(struct io_ring_ctx *ctx)
void io_napi_free(struct io_ring_ctx *ctx)
{
struct io_napi_entry *e;
- unsigned int i;
- spin_lock(&ctx->napi_lock);
- hash_for_each(ctx->napi_ht, i, e, node) {
+ guard(spinlock)(&ctx->napi_lock);
+ list_for_each_entry(e, &ctx->napi_list, list) {
hash_del_rcu(&e->node);
kfree_rcu(e, rcu);
}
- spin_unlock(&ctx->napi_lock);
+ INIT_LIST_HEAD_RCU(&ctx->napi_list);
+}
+
+static int io_napi_register_napi(struct io_ring_ctx *ctx,
+ struct io_uring_napi *napi)
+{
+ switch (napi->op_param) {
+ case IO_URING_NAPI_TRACKING_DYNAMIC:
+ case IO_URING_NAPI_TRACKING_STATIC:
+ break;
+ default:
+ return -EINVAL;
+ }
+ /* clean the napi list for new settings */
+ io_napi_free(ctx);
+ WRITE_ONCE(ctx->napi_track_mode, napi->op_param);
+ WRITE_ONCE(ctx->napi_busy_poll_dt, napi->busy_poll_to * NSEC_PER_USEC);
+ WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi->prefer_busy_poll);
+ return 0;
}
/*
@@ -225,7 +292,8 @@ int io_register_napi(struct io_ring_ctx *ctx, void __user *arg)
{
const struct io_uring_napi curr = {
.busy_poll_to = ktime_to_us(ctx->napi_busy_poll_dt),
- .prefer_busy_poll = ctx->napi_prefer_busy_poll
+ .prefer_busy_poll = ctx->napi_prefer_busy_poll,
+ .op_param = ctx->napi_track_mode
};
struct io_uring_napi napi;
@@ -233,16 +301,26 @@ int io_register_napi(struct io_ring_ctx *ctx, void __user *arg)
return -EINVAL;
if (copy_from_user(&napi, arg, sizeof(napi)))
return -EFAULT;
- if (napi.pad[0] || napi.pad[1] || napi.pad[2] || napi.resv)
+ if (napi.pad[0] || napi.pad[1] || napi.resv)
return -EINVAL;
if (copy_to_user(arg, &curr, sizeof(curr)))
return -EFAULT;
- WRITE_ONCE(ctx->napi_busy_poll_dt, napi.busy_poll_to * NSEC_PER_USEC);
- WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi.prefer_busy_poll);
- WRITE_ONCE(ctx->napi_enabled, true);
- return 0;
+ switch (napi.opcode) {
+ case IO_URING_NAPI_REGISTER_OP:
+ return io_napi_register_napi(ctx, &napi);
+ case IO_URING_NAPI_STATIC_ADD_ID:
+ if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC)
+ return -EINVAL;
+ return __io_napi_add_id(ctx, napi.op_param);
+ case IO_URING_NAPI_STATIC_DEL_ID:
+ if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC)
+ return -EINVAL;
+ return __io_napi_del_id(ctx, napi.op_param);
+ default:
+ return -EINVAL;
+ }
}
/*
@@ -265,7 +343,7 @@ int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg)
WRITE_ONCE(ctx->napi_busy_poll_dt, 0);
WRITE_ONCE(ctx->napi_prefer_busy_poll, false);
- WRITE_ONCE(ctx->napi_enabled, false);
+ WRITE_ONCE(ctx->napi_track_mode, IO_URING_NAPI_TRACKING_INACTIVE);
return 0;
}
@@ -307,9 +385,9 @@ int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx)
if (list_empty_careful(&ctx->napi_list))
return 0;
- rcu_read_lock();
- is_stale = __io_napi_do_busy_loop(ctx, NULL);
- rcu_read_unlock();
+ scoped_guard(rcu) {
+ is_stale = __io_napi_do_busy_loop(ctx, NULL, NULL);
+ }
io_napi_remove_stale(ctx, is_stale);
return 1;