diff options
author | Jens Axboe <axboe@kernel.dk> | 2019-11-14 12:09:58 -0700 |
---|---|---|
committer | Jens Axboe <axboe@kernel.dk> | 2019-11-14 12:09:58 -0700 |
commit | eac406c61cd0ec8fe7970ca46ddf23e40a86b579 (patch) | |
tree | 89f5835a2a93c1a44da3a3679ca16b4954872f12 /fs/io_uring.c | |
parent | 021d1cdda3875bf35edac9133335f622d7910abc (diff) | |
download | lwn-eac406c61cd0ec8fe7970ca46ddf23e40a86b579.tar.gz lwn-eac406c61cd0ec8fe7970ca46ddf23e40a86b579.zip |
io_uring: make POLL_ADD/POLL_REMOVE scale better
One of the obvious use cases for these commands is networking, where
it's not uncommon to have tons of sockets open and polled for. The
current implementation uses a list for insertion and lookup, which works
fine for file based use cases where the count is usually low, it breaks
down somewhat for higher number of files / sockets. A test case with
30k sockets being polled for and cancelled takes:
real 0m6.968s
user 0m0.002s
sys 0m6.936s
with the patch it takes:
real 0m0.233s
user 0m0.010s
sys 0m0.176s
If you go to 50k sockets, it gets even more abysmal with the current
code:
real 0m40.602s
user 0m0.010s
sys 0m40.555s
with the patch it takes:
real 0m0.398s
user 0m0.000s
sys 0m0.341s
Change is pretty straight forward, just replace the cancel_list with
a red/black tree instead.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Diffstat (limited to 'fs/io_uring.c')
-rw-r--r-- | fs/io_uring.c | 69 |
1 files changed, 54 insertions, 15 deletions
diff --git a/fs/io_uring.c b/fs/io_uring.c index 55f8b1d378df..5ad652fa24b8 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -271,7 +271,7 @@ struct io_ring_ctx { * manipulate the list, hence no extra locking is needed there. */ struct list_head poll_list; - struct list_head cancel_list; + struct rb_root cancel_tree; spinlock_t inflight_lock; struct list_head inflight_list; @@ -323,7 +323,10 @@ struct io_kiocb { struct sqe_submit submit; struct io_ring_ctx *ctx; - struct list_head list; + union { + struct list_head list; + struct rb_node rb_node; + }; struct list_head link_list; unsigned int flags; refcount_t refs; @@ -433,7 +436,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) init_waitqueue_head(&ctx->wait); spin_lock_init(&ctx->completion_lock); INIT_LIST_HEAD(&ctx->poll_list); - INIT_LIST_HEAD(&ctx->cancel_list); + ctx->cancel_tree = RB_ROOT; INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); init_waitqueue_head(&ctx->inflight_wait); @@ -1934,6 +1937,14 @@ static int io_accept(struct io_kiocb *req, const struct io_uring_sqe *sqe, #endif } +static inline void io_poll_remove_req(struct io_kiocb *req) +{ + if (!RB_EMPTY_NODE(&req->rb_node)) { + rb_erase(&req->rb_node, &req->ctx->cancel_tree); + RB_CLEAR_NODE(&req->rb_node); + } +} + static void io_poll_remove_one(struct io_kiocb *req) { struct io_poll_iocb *poll = &req->poll; @@ -1945,17 +1956,17 @@ static void io_poll_remove_one(struct io_kiocb *req) io_queue_async_work(req); } spin_unlock(&poll->head->lock); - - list_del_init(&req->list); + io_poll_remove_req(req); } static void io_poll_remove_all(struct io_ring_ctx *ctx) { + struct rb_node *node; struct io_kiocb *req; spin_lock_irq(&ctx->completion_lock); - while (!list_empty(&ctx->cancel_list)) { - req = list_first_entry(&ctx->cancel_list, struct io_kiocb,list); + while ((node = rb_first(&ctx->cancel_tree)) != NULL) { + req = rb_entry(node, struct io_kiocb, rb_node); io_poll_remove_one(req); } spin_unlock_irq(&ctx->completion_lock); @@ -1963,13 +1974,21 @@ static void io_poll_remove_all(struct io_ring_ctx *ctx) static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr) { + struct rb_node *p, *parent = NULL; struct io_kiocb *req; - list_for_each_entry(req, &ctx->cancel_list, list) { - if (req->user_data != sqe_addr) - continue; - io_poll_remove_one(req); - return 0; + p = ctx->cancel_tree.rb_node; + while (p) { + parent = p; + req = rb_entry(parent, struct io_kiocb, rb_node); + if (sqe_addr < req->user_data) { + p = p->rb_left; + } else if (sqe_addr > req->user_data) { + p = p->rb_right; + } else { + io_poll_remove_one(req); + return 0; + } } return -ENOENT; @@ -2039,7 +2058,7 @@ static void io_poll_complete_work(struct io_wq_work **workptr) spin_unlock_irq(&ctx->completion_lock); return; } - list_del_init(&req->list); + io_poll_remove_req(req); io_poll_complete(req, mask); spin_unlock_irq(&ctx->completion_lock); @@ -2073,7 +2092,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, * for finalizing the request, mark us as having grabbed that already. */ if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) { - list_del(&req->list); + io_poll_remove_req(req); io_poll_complete(req, mask); req->flags |= REQ_F_COMP_LOCKED; io_put_req(req); @@ -2108,6 +2127,25 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, add_wait_queue(head, &pt->req->poll.wait); } +static void io_poll_req_insert(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + struct rb_node **p = &ctx->cancel_tree.rb_node; + struct rb_node *parent = NULL; + struct io_kiocb *tmp; + + while (*p) { + parent = *p; + tmp = rb_entry(parent, struct io_kiocb, rb_node); + if (req->user_data < tmp->user_data) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + rb_link_node(&req->rb_node, parent, p); + rb_insert_color(&req->rb_node, &ctx->cancel_tree); +} + static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, struct io_kiocb **nxt) { @@ -2129,6 +2167,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, INIT_IO_WORK(&req->work, io_poll_complete_work); events = READ_ONCE(sqe->poll_events); poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; + RB_CLEAR_NODE(&req->rb_node); poll->head = NULL; poll->done = false; @@ -2161,7 +2200,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, else if (cancel) WRITE_ONCE(poll->canceled, true); else if (!poll->done) /* actually waiting for an event */ - list_add_tail(&req->list, &ctx->cancel_list); + io_poll_req_insert(req); spin_unlock(&poll->head->lock); } if (mask) { /* no async, we'd stolen it */ |