diff options
author | Christoph Hellwig <hch@lst.de> | 2021-10-12 13:12:24 +0200 |
---|---|---|
committer | Jens Axboe <axboe@kernel.dk> | 2021-10-18 06:17:36 -0600 |
commit | 3e08773c3841e9db7a520908cc2b136a77d275ff (patch) | |
tree | 5dce4a3944899f0eb1370a92130e89134da2a714 /block | |
parent | 19416123ab3e1348b3532347af221d8f60838431 (diff) | |
download | lwn-3e08773c3841e9db7a520908cc2b136a77d275ff.tar.gz lwn-3e08773c3841e9db7a520908cc2b136a77d275ff.zip |
block: switch polling to be bio based
Replace the blk_poll interface that requires the caller to keep a queue
and cookie from the submissions with polling based on the bio.
Polling for the bio itself leads to a few advantages:
- the cookie construction can made entirely private in blk-mq.c
- the caller does not need to remember the request_queue and cookie
separately and thus sidesteps their lifetime issues
- keeping the device and the cookie inside the bio allows to trivially
support polling BIOs remapping by stacking drivers
- a lot of code to propagate the cookie back up the submission path can
be removed entirely.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Tested-by: Mark Wunderlich <mark.wunderlich@intel.com>
Link: https://lore.kernel.org/r/20211012111226.760968-15-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Diffstat (limited to 'block')
-rw-r--r-- | block/bio.c | 1 | ||||
-rw-r--r-- | block/blk-core.c | 127 | ||||
-rw-r--r-- | block/blk-exec.c | 10 | ||||
-rw-r--r-- | block/blk-mq.c | 72 | ||||
-rw-r--r-- | block/blk-mq.h | 2 | ||||
-rw-r--r-- | block/fops.c | 25 |
6 files changed, 137 insertions, 100 deletions
diff --git a/block/bio.c b/block/bio.c index df45f4b996ac..a3c9ff23a036 100644 --- a/block/bio.c +++ b/block/bio.c @@ -282,6 +282,7 @@ void bio_init(struct bio *bio, struct bio_vec *table, atomic_set(&bio->__bi_remaining, 1); atomic_set(&bio->__bi_cnt, 1); + bio->bi_cookie = BLK_QC_T_NONE; bio->bi_max_vecs = max_vecs; bio->bi_io_vec = table; diff --git a/block/blk-core.c b/block/blk-core.c index 8eb0e08d5395..f008c38ae967 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -915,25 +915,22 @@ end_io: return false; } -static blk_qc_t __submit_bio(struct bio *bio) +static void __submit_bio(struct bio *bio) { struct gendisk *disk = bio->bi_bdev->bd_disk; - blk_qc_t ret = BLK_QC_T_NONE; if (unlikely(bio_queue_enter(bio) != 0)) - return BLK_QC_T_NONE; + return; if (!submit_bio_checks(bio) || !blk_crypto_bio_prep(&bio)) goto queue_exit; - if (disk->fops->submit_bio) { - ret = disk->fops->submit_bio(bio); - goto queue_exit; + if (!disk->fops->submit_bio) { + blk_mq_submit_bio(bio); + return; } - return blk_mq_submit_bio(bio); - + disk->fops->submit_bio(bio); queue_exit: blk_queue_exit(disk->queue); - return ret; } /* @@ -955,10 +952,9 @@ queue_exit: * bio_list_on_stack[1] contains bios that were submitted before the current * ->submit_bio_bio, but that haven't been processed yet. */ -static blk_qc_t __submit_bio_noacct(struct bio *bio) +static void __submit_bio_noacct(struct bio *bio) { struct bio_list bio_list_on_stack[2]; - blk_qc_t ret = BLK_QC_T_NONE; BUG_ON(bio->bi_next); @@ -975,7 +971,7 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio) bio_list_on_stack[1] = bio_list_on_stack[0]; bio_list_init(&bio_list_on_stack[0]); - ret = __submit_bio(bio); + __submit_bio(bio); /* * Sort new bios into those for a lower level and those for the @@ -998,22 +994,19 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio) } while ((bio = bio_list_pop(&bio_list_on_stack[0]))); current->bio_list = NULL; - return ret; } -static blk_qc_t __submit_bio_noacct_mq(struct bio *bio) +static void __submit_bio_noacct_mq(struct bio *bio) { struct bio_list bio_list[2] = { }; - blk_qc_t ret; current->bio_list = bio_list; do { - ret = __submit_bio(bio); + __submit_bio(bio); } while ((bio = bio_list_pop(&bio_list[0]))); current->bio_list = NULL; - return ret; } /** @@ -1025,7 +1018,7 @@ static blk_qc_t __submit_bio_noacct_mq(struct bio *bio) * systems and other upper level users of the block layer should use * submit_bio() instead. */ -blk_qc_t submit_bio_noacct(struct bio *bio) +void submit_bio_noacct(struct bio *bio) { /* * We only want one ->submit_bio to be active at a time, else stack @@ -1033,14 +1026,12 @@ blk_qc_t submit_bio_noacct(struct bio *bio) * to collect a list of requests submited by a ->submit_bio method while * it is active, and then process them after it returned. */ - if (current->bio_list) { + if (current->bio_list) bio_list_add(¤t->bio_list[0], bio); - return BLK_QC_T_NONE; - } - - if (!bio->bi_bdev->bd_disk->fops->submit_bio) - return __submit_bio_noacct_mq(bio); - return __submit_bio_noacct(bio); + else if (!bio->bi_bdev->bd_disk->fops->submit_bio) + __submit_bio_noacct_mq(bio); + else + __submit_bio_noacct(bio); } EXPORT_SYMBOL(submit_bio_noacct); @@ -1057,10 +1048,10 @@ EXPORT_SYMBOL(submit_bio_noacct); * in @bio. The bio must NOT be touched by thecaller until ->bi_end_io() has * been called. */ -blk_qc_t submit_bio(struct bio *bio) +void submit_bio(struct bio *bio) { if (blkcg_punt_bio_submit(bio)) - return BLK_QC_T_NONE; + return; /* * If it's a regular read/write or a barrier with data attached, @@ -1092,20 +1083,92 @@ blk_qc_t submit_bio(struct bio *bio) if (unlikely(bio_op(bio) == REQ_OP_READ && bio_flagged(bio, BIO_WORKINGSET))) { unsigned long pflags; - blk_qc_t ret; psi_memstall_enter(&pflags); - ret = submit_bio_noacct(bio); + submit_bio_noacct(bio); psi_memstall_leave(&pflags); - - return ret; + return; } - return submit_bio_noacct(bio); + submit_bio_noacct(bio); } EXPORT_SYMBOL(submit_bio); /** + * bio_poll - poll for BIO completions + * @bio: bio to poll for + * @flags: BLK_POLL_* flags that control the behavior + * + * Poll for completions on queue associated with the bio. Returns number of + * completed entries found. + * + * Note: the caller must either be the context that submitted @bio, or + * be in a RCU critical section to prevent freeing of @bio. + */ +int bio_poll(struct bio *bio, unsigned int flags) +{ + struct request_queue *q = bio->bi_bdev->bd_disk->queue; + blk_qc_t cookie = READ_ONCE(bio->bi_cookie); + int ret; + + if (cookie == BLK_QC_T_NONE || + !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) + return 0; + + if (current->plug) + blk_flush_plug_list(current->plug, false); + + if (blk_queue_enter(q, BLK_MQ_REQ_NOWAIT)) + return 0; + if (WARN_ON_ONCE(!queue_is_mq(q))) + ret = 0; /* not yet implemented, should not happen */ + else + ret = blk_mq_poll(q, cookie, flags); + blk_queue_exit(q); + return ret; +} +EXPORT_SYMBOL_GPL(bio_poll); + +/* + * Helper to implement file_operations.iopoll. Requires the bio to be stored + * in iocb->private, and cleared before freeing the bio. + */ +int iocb_bio_iopoll(struct kiocb *kiocb, unsigned int flags) +{ + struct bio *bio; + int ret = 0; + + /* + * Note: the bio cache only uses SLAB_TYPESAFE_BY_RCU, so bio can + * point to a freshly allocated bio at this point. If that happens + * we have a few cases to consider: + * + * 1) the bio is beeing initialized and bi_bdev is NULL. We can just + * simply nothing in this case + * 2) the bio points to a not poll enabled device. bio_poll will catch + * this and return 0 + * 3) the bio points to a poll capable device, including but not + * limited to the one that the original bio pointed to. In this + * case we will call into the actual poll method and poll for I/O, + * even if we don't need to, but it won't cause harm either. + * + * For cases 2) and 3) above the RCU grace period ensures that bi_bdev + * is still allocated. Because partitions hold a reference to the whole + * device bdev and thus disk, the disk is also still valid. Grabbing + * a reference to the queue in bio_poll() ensures the hctxs and requests + * are still valid as well. + */ + rcu_read_lock(); + bio = READ_ONCE(kiocb->private); + if (bio && bio->bi_bdev) + ret = bio_poll(bio, flags); + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL_GPL(iocb_bio_iopoll); + +/** * blk_cloned_rq_check_limits - Helper function to check a cloned request * for the new queue limits * @q: the queue diff --git a/block/blk-exec.c b/block/blk-exec.c index 1fa7f25e5726..55f0cd34b37b 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -65,13 +65,19 @@ EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); static bool blk_rq_is_poll(struct request *rq) { - return rq->mq_hctx && rq->mq_hctx->type == HCTX_TYPE_POLL; + if (!rq->mq_hctx) + return false; + if (rq->mq_hctx->type != HCTX_TYPE_POLL) + return false; + if (WARN_ON_ONCE(!rq->bio)) + return false; + return true; } static void blk_rq_poll_completion(struct request *rq, struct completion *wait) { do { - blk_poll(rq->q, request_to_qc_t(rq->mq_hctx, rq), 0); + bio_poll(rq->bio, 0); cond_resched(); } while (!completion_done(wait)); } diff --git a/block/blk-mq.c b/block/blk-mq.c index a34ffcf861c3..0860f622099f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -65,6 +65,9 @@ static int blk_mq_poll_stats_bkt(const struct request *rq) return bucket; } +#define BLK_QC_T_SHIFT 16 +#define BLK_QC_T_INTERNAL (1U << 31) + static inline struct blk_mq_hw_ctx *blk_qc_to_hctx(struct request_queue *q, blk_qc_t qc) { @@ -81,6 +84,13 @@ static inline struct request *blk_qc_to_rq(struct blk_mq_hw_ctx *hctx, return blk_mq_tag_to_rq(hctx->tags, tag); } +static inline blk_qc_t blk_rq_to_qc(struct request *rq) +{ + return (rq->mq_hctx->queue_num << BLK_QC_T_SHIFT) | + (rq->tag != -1 ? + rq->tag : (rq->internal_tag | BLK_QC_T_INTERNAL)); +} + /* * Check if any of the ctx, dispatch list or elevator * have pending work in this hardware queue. @@ -819,6 +829,8 @@ void blk_mq_start_request(struct request *rq) if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE) q->integrity.profile->prepare_fn(rq); #endif + if (rq->bio && rq->bio->bi_opf & REQ_POLLED) + WRITE_ONCE(rq->bio->bi_cookie, blk_rq_to_qc(rq)); } EXPORT_SYMBOL(blk_mq_start_request); @@ -2045,19 +2057,15 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio, } static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, - struct request *rq, - blk_qc_t *cookie, bool last) + struct request *rq, bool last) { struct request_queue *q = rq->q; struct blk_mq_queue_data bd = { .rq = rq, .last = last, }; - blk_qc_t new_cookie; blk_status_t ret; - new_cookie = request_to_qc_t(hctx, rq); - /* * For OK queue, we are done. For error, caller may kill it. * Any other error (busy), just add it to our list as we @@ -2067,7 +2075,6 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, switch (ret) { case BLK_STS_OK: blk_mq_update_dispatch_busy(hctx, false); - *cookie = new_cookie; break; case BLK_STS_RESOURCE: case BLK_STS_DEV_RESOURCE: @@ -2076,7 +2083,6 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, break; default: blk_mq_update_dispatch_busy(hctx, false); - *cookie = BLK_QC_T_NONE; break; } @@ -2085,7 +2091,6 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, struct request *rq, - blk_qc_t *cookie, bool bypass_insert, bool last) { struct request_queue *q = rq->q; @@ -2119,7 +2124,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, goto insert; } - return __blk_mq_issue_directly(hctx, rq, cookie, last); + return __blk_mq_issue_directly(hctx, rq, last); insert: if (bypass_insert) return BLK_STS_RESOURCE; @@ -2133,7 +2138,6 @@ insert: * blk_mq_try_issue_directly - Try to send a request directly to device driver. * @hctx: Pointer of the associated hardware queue. * @rq: Pointer to request to be sent. - * @cookie: Request queue cookie. * * If the device has enough resources to accept a new request now, send the * request directly to device driver. Else, insert at hctx->dispatch queue, so @@ -2141,7 +2145,7 @@ insert: * queue have higher priority. */ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, - struct request *rq, blk_qc_t *cookie) + struct request *rq) { blk_status_t ret; int srcu_idx; @@ -2150,7 +2154,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, hctx_lock(hctx, &srcu_idx); - ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false, true); + ret = __blk_mq_try_issue_directly(hctx, rq, false, true); if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) blk_mq_request_bypass_insert(rq, false, true); else if (ret != BLK_STS_OK) @@ -2163,11 +2167,10 @@ blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) { blk_status_t ret; int srcu_idx; - blk_qc_t unused_cookie; struct blk_mq_hw_ctx *hctx = rq->mq_hctx; hctx_lock(hctx, &srcu_idx); - ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true, last); + ret = __blk_mq_try_issue_directly(hctx, rq, true, last); hctx_unlock(hctx, srcu_idx); return ret; @@ -2247,10 +2250,8 @@ static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug) * * It will not queue the request if there is an error with the bio, or at the * request creation. - * - * Returns: Request queue cookie. */ -blk_qc_t blk_mq_submit_bio(struct bio *bio) +void blk_mq_submit_bio(struct bio *bio) { struct request_queue *q = bio->bi_bdev->bd_disk->queue; const int is_sync = op_is_sync(bio->bi_opf); @@ -2259,9 +2260,7 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio) struct blk_plug *plug; struct request *same_queue_rq = NULL; unsigned int nr_segs; - blk_qc_t cookie; blk_status_t ret; - bool hipri; blk_queue_bounce(q, &bio); __blk_queue_split(&bio, &nr_segs); @@ -2278,8 +2277,6 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio) rq_qos_throttle(q, bio); - hipri = bio->bi_opf & REQ_POLLED; - plug = blk_mq_plug(q, bio); if (plug && plug->cached_rq) { rq = plug->cached_rq; @@ -2310,8 +2307,6 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio) rq_qos_track(q, rq, bio); - cookie = request_to_qc_t(rq->mq_hctx, rq); - blk_mq_bio_to_request(rq, bio, nr_segs); ret = blk_crypto_init_request(rq); @@ -2319,7 +2314,7 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio) bio->bi_status = ret; bio_endio(bio); blk_mq_free_request(rq); - return BLK_QC_T_NONE; + return; } if (unlikely(is_flush_fua)) { @@ -2375,7 +2370,7 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio) if (same_queue_rq) { trace_block_unplug(q, 1, true); blk_mq_try_issue_directly(same_queue_rq->mq_hctx, - same_queue_rq, &cookie); + same_queue_rq); } } else if ((q->nr_hw_queues > 1 && is_sync) || !rq->mq_hctx->dispatch_busy) { @@ -2383,18 +2378,15 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio) * There is no scheduler and we can try to send directly * to the hardware. */ - blk_mq_try_issue_directly(rq->mq_hctx, rq, &cookie); + blk_mq_try_issue_directly(rq->mq_hctx, rq); } else { /* Default case. */ blk_mq_sched_insert_request(rq, false, true, true); } - if (!hipri) - return BLK_QC_T_NONE; - return cookie; + return; queue_exit: blk_queue_exit(q); - return BLK_QC_T_NONE; } static size_t order_to_size(unsigned int order) @@ -4084,25 +4076,8 @@ static int blk_mq_poll_classic(struct request_queue *q, blk_qc_t cookie, return 0; } -/** - * blk_poll - poll for IO completions - * @q: the queue - * @cookie: cookie passed back at IO submission time - * @flags: BLK_POLL_* flags that control the behavior - * - * Description: - * Poll for completions on the passed in queue. Returns number of - * completed entries found. - */ -int blk_poll(struct request_queue *q, blk_qc_t cookie, unsigned int flags) +int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, unsigned int flags) { - if (cookie == BLK_QC_T_NONE || - !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) - return 0; - - if (current->plug) - blk_flush_plug_list(current->plug, false); - if (!(flags & BLK_POLL_NOSLEEP) && q->poll_nsec != BLK_MQ_POLL_CLASSIC) { if (blk_mq_poll_hybrid(q, cookie)) @@ -4110,7 +4085,6 @@ int blk_poll(struct request_queue *q, blk_qc_t cookie, unsigned int flags) } return blk_mq_poll_classic(q, cookie, flags); } -EXPORT_SYMBOL_GPL(blk_poll); unsigned int blk_mq_rq_cpu(struct request *rq) { diff --git a/block/blk-mq.h b/block/blk-mq.h index a9fe01e14951..8be447995106 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -37,6 +37,8 @@ struct blk_mq_ctx { struct kobject kobj; } ____cacheline_aligned_in_smp; +void blk_mq_submit_bio(struct bio *bio); +int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, unsigned int flags); void blk_mq_exit_queue(struct request_queue *q); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); diff --git a/block/fops.c b/block/fops.c index db8f2fe68dd2..ce1255529ba2 100644 --- a/block/fops.c +++ b/block/fops.c @@ -61,7 +61,6 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, bool should_dirty = false; struct bio bio; ssize_t ret; - blk_qc_t qc; if ((pos | iov_iter_alignment(iter)) & (bdev_logical_block_size(bdev) - 1)) @@ -102,13 +101,12 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, if (iocb->ki_flags & IOCB_HIPRI) bio_set_polled(&bio, iocb); - qc = submit_bio(&bio); + submit_bio(&bio); for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); if (!READ_ONCE(bio.bi_private)) break; - if (!(iocb->ki_flags & IOCB_HIPRI) || - !blk_poll(bdev_get_queue(bdev), qc, 0)) + if (!(iocb->ki_flags & IOCB_HIPRI) || !bio_poll(&bio, 0)) blk_io_schedule(); } __set_current_state(TASK_RUNNING); @@ -141,14 +139,6 @@ struct blkdev_dio { static struct bio_set blkdev_dio_pool; -static int blkdev_iopoll(struct kiocb *kiocb, unsigned int flags) -{ - struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host); - struct request_queue *q = bdev_get_queue(bdev); - - return blk_poll(q, READ_ONCE(kiocb->ki_cookie), flags); -} - static void blkdev_bio_end_io(struct bio *bio) { struct blkdev_dio *dio = bio->bi_private; @@ -162,6 +152,8 @@ static void blkdev_bio_end_io(struct bio *bio) struct kiocb *iocb = dio->iocb; ssize_t ret; + WRITE_ONCE(iocb->private, NULL); + if (likely(!dio->bio.bi_status)) { ret = dio->size; iocb->ki_pos += ret; @@ -200,7 +192,6 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, bool do_poll = (iocb->ki_flags & IOCB_HIPRI); bool is_read = (iov_iter_rw(iter) == READ), is_sync; loff_t pos = iocb->ki_pos; - blk_qc_t qc = BLK_QC_T_NONE; int ret = 0; if ((pos | iov_iter_alignment(iter)) & @@ -262,9 +253,9 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, if (!nr_pages) { if (do_poll) bio_set_polled(bio, iocb); - qc = submit_bio(bio); + submit_bio(bio); if (do_poll) - WRITE_ONCE(iocb->ki_cookie, qc); + WRITE_ONCE(iocb->private, bio); break; } if (!dio->multi_bio) { @@ -297,7 +288,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, if (!READ_ONCE(dio->waiter)) break; - if (!do_poll || !blk_poll(bdev_get_queue(bdev), qc, 0)) + if (!do_poll || !bio_poll(bio, 0)) blk_io_schedule(); } __set_current_state(TASK_RUNNING); @@ -594,7 +585,7 @@ const struct file_operations def_blk_fops = { .llseek = blkdev_llseek, .read_iter = blkdev_read_iter, .write_iter = blkdev_write_iter, - .iopoll = blkdev_iopoll, + .iopoll = iocb_bio_iopoll, .mmap = generic_file_mmap, .fsync = blkdev_fsync, .unlocked_ioctl = blkdev_ioctl, |