author     Linus Torvalds <torvalds@linux-foundation.org>   2023-04-26 12:52:58 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>   2023-04-26 12:52:58 -0700
commit     9dd6956b38923dc1b7b349ca1eee3c0bb1f0163a
tree       c70bb7d65a50a51686378b6113a8663e0e60d9b8   /block/blk-mq.c
parent     5b9a7bb72fddbc5247f56ede55d485fab7abdf92
parent     55793ea54d77719a071b1ccc05a05056e3b5e009
Merge tag 'for-6.4/block-2023-04-21' of git://git.kernel.dk/linux
Pull block updates from Jens Axboe:
- drbd patches, bringing us closer to unifying the out-of-tree version
and the in tree one (Andreas, Christoph)
- support for auto-quiesce in the s390 dasd driver (Stefan)
- MD pull request via Song:
- md/bitmap: Optimal last page size (Jon Derrick)
- Various raid10 fixes (Yu Kuai, Li Nan)
- md: add error_handlers for raid0 and linear (Mariusz Tkaczyk)
- NVMe pull request via Christoph:
- Drop redundant pci_enable_pcie_error_reporting (Bjorn Helgaas)
- Validate nvmet module parameters (Chaitanya Kulkarni)
- Fence TCP socket on receive error (Chris Leech)
- Fix async event trace event (Keith Busch)
- Minor cleanups (Chaitanya Kulkarni, zhenwei pi)
- Fix and cleanup nvmet Identify handling (Damien Le Moal,
Christoph Hellwig)
- Fix double blk_mq_complete_request race in the timeout handler
(Lei Yin)
- Fix irq locking in nvme-fcloop (Ming Lei)
- Remove queue mapping helper for rdma devices (Sagi Grimberg)
- use structured request attribute checks for nbd (Jakub)
- fix blk-crypto race conditions between keyslot management and request completion (Eric)
- add sed-opal support for reading read locking range attributes
(Ondrej)
- make fault injection configurable for null_blk (Akinobu)
- clean up the request insertion API (Christoph)
- clean up the queue running API (Christoph); both APIs are sketched briefly after this list
- blkg config helper cleanups (Tejun)
- lazy init support for blk-iolatency (Tejun)
- various fixes and tweaks to ublk (Ming)
- remove hybrid polling. It hasn't really been useful since we got
async polled IO support, and these days we don't support sync polled
IO at all (Keith)
- misc fixes, cleanups, improvements (Zhong, Ondrej, Colin, Chengming,
Chaitanya, me)
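The request insertion and queue running cleanups make up most of the blk-mq.c diff further down: the old bool at_head/run_queue parameters are replaced by a blk_insert_t flags word, and callers now kick the hardware queue explicitly. Below is a minimal, illustrative sketch of the new calling convention, using only helpers visible in the diff (blk_mq_request_bypass_insert(), blk_mq_run_hw_queue(), BLK_MQ_INSERT_AT_HEAD); the wrapper function, its name, and its placement inside block/ are editorial assumptions, not part of the patch.

#include "blk.h"	/* block-layer internal headers; these helpers are not */
#include "blk-mq.h"	/* exported, so this sketch assumes code living in block/ */

/*
 * Illustrative only: put a request at the head of the hctx dispatch list,
 * then run the hardware queue.  Before this series the same effect was a
 * single call with boolean arguments:
 *
 *	blk_mq_request_bypass_insert(rq, true, true);
 */
static void sketch_bypass_insert_and_run(struct request *rq)
{
	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;

	/* insertion placement is now selected by BLK_MQ_INSERT_* flags ... */
	blk_mq_request_bypass_insert(rq, BLK_MQ_INSERT_AT_HEAD);

	/* ... and running the queue is a separate, explicit step */
	blk_mq_run_hw_queue(hctx, false);
}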
* tag 'for-6.4/block-2023-04-21' of git://git.kernel.dk/linux: (118 commits)
nbd: fix incomplete validation of ioctl arg
ublk: don't return 0 in case of any failure
sed-opal: geometry feature reporting command
null_blk: Always check queue mode setting from configfs
block: ublk: switch to ioctl command encoding
blk-mq: fix the blk_mq_add_to_requeue_list call in blk_kick_flush
block, bfq: Fix division by zero error on zero wsum
fault-inject: fix build error when FAULT_INJECTION_CONFIGFS=y and CONFIGFS_FS=m
block: store bdev->bd_disk->fops->submit_bio state in bdev
block: re-arrange the struct block_device fields for better layout
md/raid5: remove unused working_disks variable
md/raid10: don't call bio_start_io_acct twice for bio which experienced read error
md/raid10: fix memleak of md thread
md/raid10: fix memleak for 'conf->bio_split'
md/raid10: fix leak of 'r10bio->remaining' for recovery
md/raid10: don't BUG_ON() in raise_barrier()
md: fix soft lockup in status_resync
md: add error_handlers for raid0 and linear
md: Use optimal I/O size for last bitmap page
md: Fix types in sb writer
...
Diffstat (limited to 'block/blk-mq.c')
-rw-r--r--   block/blk-mq.c   665
1 file changed, 256 insertions, 409 deletions
diff --git a/block/blk-mq.c b/block/blk-mq.c index 2831f78f86a0..f6dad0886a2f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -32,12 +32,10 @@ #include <trace/events/block.h> -#include <linux/blk-mq.h> #include <linux/t10-pi.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" -#include "blk-mq-tag.h" #include "blk-pm.h" #include "blk-stat.h" #include "blk-mq-sched.h" @@ -46,51 +44,19 @@ static DEFINE_PER_CPU(struct llist_head, blk_cpu_done); -static void blk_mq_poll_stats_start(struct request_queue *q); -static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb); - -static int blk_mq_poll_stats_bkt(const struct request *rq) -{ - int ddir, sectors, bucket; - - ddir = rq_data_dir(rq); - sectors = blk_rq_stats_sectors(rq); - - bucket = ddir + 2 * ilog2(sectors); - - if (bucket < 0) - return -1; - else if (bucket >= BLK_MQ_POLL_STATS_BKTS) - return ddir + BLK_MQ_POLL_STATS_BKTS - 2; - - return bucket; -} - -#define BLK_QC_T_SHIFT 16 -#define BLK_QC_T_INTERNAL (1U << 31) +static void blk_mq_insert_request(struct request *rq, blk_insert_t flags); +static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, + struct list_head *list); static inline struct blk_mq_hw_ctx *blk_qc_to_hctx(struct request_queue *q, blk_qc_t qc) { - return xa_load(&q->hctx_table, - (qc & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT); -} - -static inline struct request *blk_qc_to_rq(struct blk_mq_hw_ctx *hctx, - blk_qc_t qc) -{ - unsigned int tag = qc & ((1U << BLK_QC_T_SHIFT) - 1); - - if (qc & BLK_QC_T_INTERNAL) - return blk_mq_tag_to_rq(hctx->sched_tags, tag); - return blk_mq_tag_to_rq(hctx->tags, tag); + return xa_load(&q->hctx_table, qc); } static inline blk_qc_t blk_rq_to_qc(struct request *rq) { - return (rq->mq_hctx->queue_num << BLK_QC_T_SHIFT) | - (rq->tag != -1 ? - rq->tag : (rq->internal_tag | BLK_QC_T_INTERNAL)); + return rq->mq_hctx->queue_num; } /* @@ -840,6 +806,12 @@ static void blk_complete_request(struct request *req) req->q->integrity.profile->complete_fn(req, total_bytes); #endif + /* + * Upper layers may call blk_crypto_evict_key() anytime after the last + * bio_endio(). Therefore, the keyslot must be released before that. + */ + blk_crypto_rq_put_keyslot(req); + blk_account_io_completion(req, total_bytes); do { @@ -905,6 +877,13 @@ bool blk_update_request(struct request *req, blk_status_t error, req->q->integrity.profile->complete_fn(req, nr_bytes); #endif + /* + * Upper layers may call blk_crypto_evict_key() anytime after the last + * bio_endio(). Therefore, the keyslot must be released before that. + */ + if (blk_crypto_rq_has_keyslot(req) && nr_bytes >= blk_rq_bytes(req)) + __blk_crypto_rq_put_keyslot(req); + if (unlikely(error && !blk_rq_is_passthrough(req) && !(req->rq_flags & RQF_QUIET)) && !test_bit(GD_DEAD, &req->q->disk->state)) { @@ -976,17 +955,6 @@ bool blk_update_request(struct request *req, blk_status_t error, } EXPORT_SYMBOL_GPL(blk_update_request); -static void __blk_account_io_done(struct request *req, u64 now) -{ - const int sgrp = op_stat_group(req_op(req)); - - part_stat_lock(); - update_io_ticks(req->part, jiffies, true); - part_stat_inc(req->part, ios[sgrp]); - part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns); - part_stat_unlock(); -} - static inline void blk_account_io_done(struct request *req, u64 now) { /* @@ -995,40 +963,41 @@ static inline void blk_account_io_done(struct request *req, u64 now) * containing request is enough. 
*/ if (blk_do_io_stat(req) && req->part && - !(req->rq_flags & RQF_FLUSH_SEQ)) - __blk_account_io_done(req, now); -} - -static void __blk_account_io_start(struct request *rq) -{ - /* - * All non-passthrough requests are created from a bio with one - * exception: when a flush command that is part of a flush sequence - * generated by the state machine in blk-flush.c is cloned onto the - * lower device by dm-multipath we can get here without a bio. - */ - if (rq->bio) - rq->part = rq->bio->bi_bdev; - else - rq->part = rq->q->disk->part0; + !(req->rq_flags & RQF_FLUSH_SEQ)) { + const int sgrp = op_stat_group(req_op(req)); - part_stat_lock(); - update_io_ticks(rq->part, jiffies, false); - part_stat_unlock(); + part_stat_lock(); + update_io_ticks(req->part, jiffies, true); + part_stat_inc(req->part, ios[sgrp]); + part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns); + part_stat_unlock(); + } } static inline void blk_account_io_start(struct request *req) { - if (blk_do_io_stat(req)) - __blk_account_io_start(req); + if (blk_do_io_stat(req)) { + /* + * All non-passthrough requests are created from a bio with one + * exception: when a flush command that is part of a flush sequence + * generated by the state machine in blk-flush.c is cloned onto the + * lower device by dm-multipath we can get here without a bio. + */ + if (req->bio) + req->part = req->bio->bi_bdev; + else + req->part = req->q->disk->part0; + + part_stat_lock(); + update_io_ticks(req->part, jiffies, false); + part_stat_unlock(); + } } static inline void __blk_mq_end_request_acct(struct request *rq, u64 now) { - if (rq->rq_flags & RQF_STATS) { - blk_mq_poll_stats_start(rq->q); + if (rq->rq_flags & RQF_STATS) blk_stat_add(rq, now); - } blk_mq_sched_completed_request(rq, now); blk_account_io_done(rq, now); @@ -1322,6 +1291,8 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) */ void blk_execute_rq_nowait(struct request *rq, bool at_head) { + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; + WARN_ON(irqs_disabled()); WARN_ON(!blk_rq_is_passthrough(rq)); @@ -1332,10 +1303,13 @@ void blk_execute_rq_nowait(struct request *rq, bool at_head) * device, directly accessing the plug instead of using blk_mq_plug() * should not have any consequences. */ - if (current->plug) + if (current->plug && !at_head) { blk_add_rq_to_plug(current->plug, rq); - else - blk_mq_sched_insert_request(rq, at_head, true, false); + return; + } + + blk_mq_insert_request(rq, at_head ? BLK_MQ_INSERT_AT_HEAD : 0); + blk_mq_run_hw_queue(hctx, false); } EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); @@ -1383,6 +1357,7 @@ static void blk_rq_poll_completion(struct request *rq, struct completion *wait) */ blk_status_t blk_execute_rq(struct request *rq, bool at_head) { + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; struct blk_rq_wait wait = { .done = COMPLETION_INITIALIZER_ONSTACK(wait.done), }; @@ -1394,7 +1369,8 @@ blk_status_t blk_execute_rq(struct request *rq, bool at_head) rq->end_io = blk_end_sync_rq; blk_account_io_start(rq); - blk_mq_sched_insert_request(rq, at_head, true, false); + blk_mq_insert_request(rq, at_head ? 
BLK_MQ_INSERT_AT_HEAD : 0); + blk_mq_run_hw_queue(hctx, false); if (blk_rq_is_poll(rq)) { blk_rq_poll_completion(rq, &wait.done); @@ -1434,12 +1410,17 @@ static void __blk_mq_requeue_request(struct request *rq) void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list) { + struct request_queue *q = rq->q; + __blk_mq_requeue_request(rq); /* this request will be re-inserted to io scheduler queue */ blk_mq_sched_requeue_request(rq); - blk_mq_add_to_requeue_list(rq, true, kick_requeue_list); + blk_mq_add_to_requeue_list(rq, BLK_MQ_INSERT_AT_HEAD); + + if (kick_requeue_list) + blk_mq_kick_requeue_list(q); } EXPORT_SYMBOL(blk_mq_requeue_request); @@ -1455,33 +1436,33 @@ static void blk_mq_requeue_work(struct work_struct *work) spin_unlock_irq(&q->requeue_lock); list_for_each_entry_safe(rq, next, &rq_list, queuelist) { - if (!(rq->rq_flags & (RQF_SOFTBARRIER | RQF_DONTPREP))) - continue; - - rq->rq_flags &= ~RQF_SOFTBARRIER; - list_del_init(&rq->queuelist); /* - * If RQF_DONTPREP, rq has contained some driver specific - * data, so insert it to hctx dispatch list to avoid any - * merge. + * If RQF_DONTPREP ist set, the request has been started by the + * driver already and might have driver-specific data allocated + * already. Insert it into the hctx dispatch list to avoid + * block layer merges for the request. */ - if (rq->rq_flags & RQF_DONTPREP) - blk_mq_request_bypass_insert(rq, false, false); - else - blk_mq_sched_insert_request(rq, true, false, false); + if (rq->rq_flags & RQF_DONTPREP) { + rq->rq_flags &= ~RQF_SOFTBARRIER; + list_del_init(&rq->queuelist); + blk_mq_request_bypass_insert(rq, 0); + } else if (rq->rq_flags & RQF_SOFTBARRIER) { + rq->rq_flags &= ~RQF_SOFTBARRIER; + list_del_init(&rq->queuelist); + blk_mq_insert_request(rq, BLK_MQ_INSERT_AT_HEAD); + } } while (!list_empty(&rq_list)) { rq = list_entry(rq_list.next, struct request, queuelist); list_del_init(&rq->queuelist); - blk_mq_sched_insert_request(rq, false, false, false); + blk_mq_insert_request(rq, 0); } blk_mq_run_hw_queues(q, false); } -void blk_mq_add_to_requeue_list(struct request *rq, bool at_head, - bool kick_requeue_list) +void blk_mq_add_to_requeue_list(struct request *rq, blk_insert_t insert_flags) { struct request_queue *q = rq->q; unsigned long flags; @@ -1493,16 +1474,13 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head, BUG_ON(rq->rq_flags & RQF_SOFTBARRIER); spin_lock_irqsave(&q->requeue_lock, flags); - if (at_head) { + if (insert_flags & BLK_MQ_INSERT_AT_HEAD) { rq->rq_flags |= RQF_SOFTBARRIER; list_add(&rq->queuelist, &q->requeue_list); } else { list_add_tail(&rq->queuelist, &q->requeue_list); } spin_unlock_irqrestore(&q->requeue_lock, flags); - - if (kick_requeue_list) - blk_mq_kick_requeue_list(q); } void blk_mq_kick_requeue_list(struct request_queue *q) @@ -2158,24 +2136,6 @@ out: return true; } -/** - * __blk_mq_run_hw_queue - Run a hardware queue. - * @hctx: Pointer to the hardware queue to run. - * - * Send pending requests to the hardware. - */ -static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) -{ - /* - * We can't run the queue inline with ints disabled. Ensure that - * we catch bad users of this early. 
- */ - WARN_ON_ONCE(in_interrupt()); - - blk_mq_run_dispatch_ops(hctx->queue, - blk_mq_sched_dispatch_requests(hctx)); -} - static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx) { int cpu = cpumask_first_and(hctx->cpumask, cpu_online_mask); @@ -2232,42 +2192,19 @@ select_cpu: } /** - * __blk_mq_delay_run_hw_queue - Run (or schedule to run) a hardware queue. + * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously. * @hctx: Pointer to the hardware queue to run. - * @async: If we want to run the queue asynchronously. * @msecs: Milliseconds of delay to wait before running the queue. * - * If !@async, try to run the queue now. Else, run the queue asynchronously and - * with a delay of @msecs. + * Run a hardware queue asynchronously with a delay of @msecs. */ -static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async, - unsigned long msecs) +void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) { if (unlikely(blk_mq_hctx_stopped(hctx))) return; - - if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) { - if (cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) { - __blk_mq_run_hw_queue(hctx); - return; - } - } - kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work, msecs_to_jiffies(msecs)); } - -/** - * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously. - * @hctx: Pointer to the hardware queue to run. - * @msecs: Milliseconds of delay to wait before running the queue. - * - * Run a hardware queue asynchronously with a delay of @msecs. - */ -void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) -{ - __blk_mq_delay_run_hw_queue(hctx, true, msecs); -} EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); /** @@ -2284,6 +2221,11 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) bool need_run; /* + * We can't run the queue inline with interrupts disabled. + */ + WARN_ON_ONCE(!async && in_interrupt()); + + /* * When queue is quiesced, we may be switching io scheduler, or * updating nr_hw_queues, or other things, and we can't run queue * any more, even __blk_mq_hctx_has_pending() can't be called safely. @@ -2295,8 +2237,17 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) need_run = !blk_queue_quiesced(hctx->queue) && blk_mq_hctx_has_pending(hctx)); - if (need_run) - __blk_mq_delay_run_hw_queue(hctx, async, 0); + if (!need_run) + return; + + if (async || (hctx->flags & BLK_MQ_F_BLOCKING) || + !cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) { + blk_mq_delay_run_hw_queue(hctx, 0); + return; + } + + blk_mq_run_dispatch_ops(hctx->queue, + blk_mq_sched_dispatch_requests(hctx)); } EXPORT_SYMBOL(blk_mq_run_hw_queue); @@ -2461,80 +2412,52 @@ EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); static void blk_mq_run_work_fn(struct work_struct *work) { - struct blk_mq_hw_ctx *hctx; - - hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work); - - /* - * If we are stopped, don't run the queue. 
- */ - if (blk_mq_hctx_stopped(hctx)) - return; - - __blk_mq_run_hw_queue(hctx); -} - -static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx, - struct request *rq, - bool at_head) -{ - struct blk_mq_ctx *ctx = rq->mq_ctx; - enum hctx_type type = hctx->type; - - lockdep_assert_held(&ctx->lock); - - trace_block_rq_insert(rq); - - if (at_head) - list_add(&rq->queuelist, &ctx->rq_lists[type]); - else - list_add_tail(&rq->queuelist, &ctx->rq_lists[type]); -} - -void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - bool at_head) -{ - struct blk_mq_ctx *ctx = rq->mq_ctx; - - lockdep_assert_held(&ctx->lock); + struct blk_mq_hw_ctx *hctx = + container_of(work, struct blk_mq_hw_ctx, run_work.work); - __blk_mq_insert_req_list(hctx, rq, at_head); - blk_mq_hctx_mark_pending(hctx, ctx); + blk_mq_run_dispatch_ops(hctx->queue, + blk_mq_sched_dispatch_requests(hctx)); } /** * blk_mq_request_bypass_insert - Insert a request at dispatch list. * @rq: Pointer to request to be inserted. - * @at_head: true if the request should be inserted at the head of the list. - * @run_queue: If we should run the hardware queue after inserting the request. + * @flags: BLK_MQ_INSERT_* * * Should only be used carefully, when the caller knows we want to * bypass a potential IO scheduler on the target device. */ -void blk_mq_request_bypass_insert(struct request *rq, bool at_head, - bool run_queue) +void blk_mq_request_bypass_insert(struct request *rq, blk_insert_t flags) { struct blk_mq_hw_ctx *hctx = rq->mq_hctx; spin_lock(&hctx->lock); - if (at_head) + if (flags & BLK_MQ_INSERT_AT_HEAD) list_add(&rq->queuelist, &hctx->dispatch); else list_add_tail(&rq->queuelist, &hctx->dispatch); spin_unlock(&hctx->lock); - - if (run_queue) - blk_mq_run_hw_queue(hctx, false); } -void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, - struct list_head *list) - +static void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx, struct list_head *list, + bool run_queue_async) { struct request *rq; enum hctx_type type = hctx->type; /* + * Try to issue requests directly if the hw queue isn't busy to save an + * extra enqueue & dequeue to the sw queue. + */ + if (!hctx->dispatch_busy && !run_queue_async) { + blk_mq_run_dispatch_ops(hctx->queue, + blk_mq_try_issue_list_directly(hctx, list)); + if (list_empty(list)) + goto out; + } + + /* * preemption doesn't flush plug list, so it's possible ctx->cpu is * offline now */ @@ -2547,6 +2470,70 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, list_splice_tail_init(list, &ctx->rq_lists[type]); blk_mq_hctx_mark_pending(hctx, ctx); spin_unlock(&ctx->lock); +out: + blk_mq_run_hw_queue(hctx, run_queue_async); +} + +static void blk_mq_insert_request(struct request *rq, blk_insert_t flags) +{ + struct request_queue *q = rq->q; + struct blk_mq_ctx *ctx = rq->mq_ctx; + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; + + if (blk_rq_is_passthrough(rq)) { + /* + * Passthrough request have to be added to hctx->dispatch + * directly. The device may be in a situation where it can't + * handle FS request, and always returns BLK_STS_RESOURCE for + * them, which gets them added to hctx->dispatch. + * + * If a passthrough request is required to unblock the queues, + * and it is added to the scheduler queue, there is no chance to + * dispatch it given we prioritize requests in hctx->dispatch. 
+ */ + blk_mq_request_bypass_insert(rq, flags); + } else if (rq->rq_flags & RQF_FLUSH_SEQ) { + /* + * Firstly normal IO request is inserted to scheduler queue or + * sw queue, meantime we add flush request to dispatch queue( + * hctx->dispatch) directly and there is at most one in-flight + * flush request for each hw queue, so it doesn't matter to add + * flush request to tail or front of the dispatch queue. + * + * Secondly in case of NCQ, flush request belongs to non-NCQ + * command, and queueing it will fail when there is any + * in-flight normal IO request(NCQ command). When adding flush + * rq to the front of hctx->dispatch, it is easier to introduce + * extra time to flush rq's latency because of S_SCHED_RESTART + * compared with adding to the tail of dispatch queue, then + * chance of flush merge is increased, and less flush requests + * will be issued to controller. It is observed that ~10% time + * is saved in blktests block/004 on disk attached to AHCI/NCQ + * drive when adding flush rq to the front of hctx->dispatch. + * + * Simply queue flush rq to the front of hctx->dispatch so that + * intensive flush workloads can benefit in case of NCQ HW. + */ + blk_mq_request_bypass_insert(rq, BLK_MQ_INSERT_AT_HEAD); + } else if (q->elevator) { + LIST_HEAD(list); + + WARN_ON_ONCE(rq->tag != BLK_MQ_NO_TAG); + + list_add(&rq->queuelist, &list); + q->elevator->type->ops.insert_requests(hctx, &list, flags); + } else { + trace_block_rq_insert(rq); + + spin_lock(&ctx->lock); + if (flags & BLK_MQ_INSERT_AT_HEAD) + list_add(&rq->queuelist, &ctx->rq_lists[hctx->type]); + else + list_add_tail(&rq->queuelist, + &ctx->rq_lists[hctx->type]); + blk_mq_hctx_mark_pending(hctx, ctx); + spin_unlock(&ctx->lock); + } } static void blk_mq_bio_to_request(struct request *rq, struct bio *bio, @@ -2600,49 +2587,19 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, return ret; } -static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, - struct request *rq, - bool bypass_insert, bool last) +static bool blk_mq_get_budget_and_tag(struct request *rq) { - struct request_queue *q = rq->q; - bool run_queue = true; int budget_token; - /* - * RCU or SRCU read lock is needed before checking quiesced flag. - * - * When queue is stopped or quiesced, ignore 'bypass_insert' from - * blk_mq_request_issue_directly(), and return BLK_STS_OK to caller, - * and avoid driver to try to dispatch again. 
- */ - if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) { - run_queue = false; - bypass_insert = false; - goto insert; - } - - if ((rq->rq_flags & RQF_ELV) && !bypass_insert) - goto insert; - - budget_token = blk_mq_get_dispatch_budget(q); + budget_token = blk_mq_get_dispatch_budget(rq->q); if (budget_token < 0) - goto insert; - + return false; blk_mq_set_rq_budget_token(rq, budget_token); - if (!blk_mq_get_driver_tag(rq)) { - blk_mq_put_dispatch_budget(q, budget_token); - goto insert; + blk_mq_put_dispatch_budget(rq->q, budget_token); + return false; } - - return __blk_mq_issue_directly(hctx, rq, last); -insert: - if (bypass_insert) - return BLK_STS_RESOURCE; - - blk_mq_sched_insert_request(rq, false, run_queue, false); - - return BLK_STS_OK; + return true; } /** @@ -2658,18 +2615,46 @@ insert: static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, struct request *rq) { - blk_status_t ret = - __blk_mq_try_issue_directly(hctx, rq, false, true); + blk_status_t ret; + + if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) { + blk_mq_insert_request(rq, 0); + return; + } + + if ((rq->rq_flags & RQF_ELV) || !blk_mq_get_budget_and_tag(rq)) { + blk_mq_insert_request(rq, 0); + blk_mq_run_hw_queue(hctx, false); + return; + } - if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) - blk_mq_request_bypass_insert(rq, false, true); - else if (ret != BLK_STS_OK) + ret = __blk_mq_issue_directly(hctx, rq, true); + switch (ret) { + case BLK_STS_OK: + break; + case BLK_STS_RESOURCE: + case BLK_STS_DEV_RESOURCE: + blk_mq_request_bypass_insert(rq, 0); + blk_mq_run_hw_queue(hctx, false); + break; + default: blk_mq_end_request(rq, ret); + break; + } } static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) { - return __blk_mq_try_issue_directly(rq->mq_hctx, rq, true, last); + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; + + if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) { + blk_mq_insert_request(rq, 0); + return BLK_STS_OK; + } + + if (!blk_mq_get_budget_and_tag(rq)) + return BLK_STS_RESOURCE; + return __blk_mq_issue_directly(hctx, rq, last); } static void blk_mq_plug_issue_direct(struct blk_plug *plug) @@ -2697,7 +2682,8 @@ static void blk_mq_plug_issue_direct(struct blk_plug *plug) break; case BLK_STS_RESOURCE: case BLK_STS_DEV_RESOURCE: - blk_mq_request_bypass_insert(rq, false, true); + blk_mq_request_bypass_insert(rq, 0); + blk_mq_run_hw_queue(hctx, false); goto out; default: blk_mq_end_request(rq, ret); @@ -2743,7 +2729,16 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched) plug->mq_list = requeue_list; trace_block_unplug(this_hctx->queue, depth, !from_sched); - blk_mq_sched_insert_requests(this_hctx, this_ctx, &list, from_sched); + + percpu_ref_get(&this_hctx->queue->q_usage_counter); + if (this_hctx->queue->elevator) { + this_hctx->queue->elevator->type->ops.insert_requests(this_hctx, + &list, 0); + blk_mq_run_hw_queue(this_hctx, from_sched); + } else { + blk_mq_insert_requests(this_hctx, this_ctx, &list, from_sched); + } + percpu_ref_put(&this_hctx->queue->q_usage_counter); } void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) @@ -2789,7 +2784,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) } while (!rq_list_empty(plug->mq_list)); } -void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, +static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, struct list_head *list) { int queued = 0; @@ -2807,8 +2802,9 @@ void 
blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, break; case BLK_STS_RESOURCE: case BLK_STS_DEV_RESOURCE: - blk_mq_request_bypass_insert(rq, false, - list_empty(list)); + blk_mq_request_bypass_insert(rq, 0); + if (list_empty(list)) + blk_mq_run_hw_queue(hctx, false); goto out; default: blk_mq_end_request(rq, ret); @@ -2934,6 +2930,7 @@ void blk_mq_submit_bio(struct bio *bio) struct request_queue *q = bdev_get_queue(bio->bi_bdev); struct blk_plug *plug = blk_mq_plug(bio); const int is_sync = op_is_sync(bio->bi_opf); + struct blk_mq_hw_ctx *hctx; struct request *rq; unsigned int nr_segs = 1; blk_status_t ret; @@ -2965,7 +2962,7 @@ void blk_mq_submit_bio(struct bio *bio) blk_mq_bio_to_request(rq, bio, nr_segs); - ret = blk_crypto_init_request(rq); + ret = blk_crypto_rq_get_keyslot(rq); if (ret != BLK_STS_OK) { bio->bi_status = ret; bio_endio(bio); @@ -2978,15 +2975,19 @@ void blk_mq_submit_bio(struct bio *bio) return; } - if (plug) + if (plug) { blk_add_rq_to_plug(plug, rq); - else if ((rq->rq_flags & RQF_ELV) || - (rq->mq_hctx->dispatch_busy && - (q->nr_hw_queues == 1 || !is_sync))) - blk_mq_sched_insert_request(rq, false, true, true); - else - blk_mq_run_dispatch_ops(rq->q, - blk_mq_try_issue_directly(rq->mq_hctx, rq)); + return; + } + + hctx = rq->mq_hctx; + if ((rq->rq_flags & RQF_ELV) || + (hctx->dispatch_busy && (q->nr_hw_queues == 1 || !is_sync))) { + blk_mq_insert_request(rq, 0); + blk_mq_run_hw_queue(hctx, true); + } else { + blk_mq_run_dispatch_ops(q, blk_mq_try_issue_directly(hctx, rq)); + } } #ifdef CONFIG_BLK_MQ_STACKING @@ -3034,8 +3035,9 @@ blk_status_t blk_insert_cloned_request(struct request *rq) if (q->disk && should_fail_request(q->disk->part0, blk_rq_bytes(rq))) return BLK_STS_IOERR; - if (blk_crypto_insert_cloned_request(rq)) - return BLK_STS_IOERR; + ret = blk_crypto_rq_get_keyslot(rq); + if (ret != BLK_STS_OK) + return ret; blk_account_io_start(rq); @@ -4206,14 +4208,8 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, /* mark the queue as mq asap */ q->mq_ops = set->ops; - q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn, - blk_mq_poll_stats_bkt, - BLK_MQ_POLL_STATS_BKTS, q); - if (!q->poll_cb) - goto err_exit; - if (blk_mq_alloc_ctxs(q)) - goto err_poll; + goto err_exit; /* init q->mq_kobj and sw queues' kobjects */ blk_mq_sysfs_init(q); @@ -4241,11 +4237,6 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, q->nr_requests = set->queue_depth; - /* - * Default to classic polling - */ - q->poll_nsec = BLK_MQ_POLL_CLASSIC; - blk_mq_init_cpu_queues(q, set->nr_hw_queues); blk_mq_add_queue_tag_set(set, q); blk_mq_map_swqueue(q); @@ -4253,9 +4244,6 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, err_hctxs: blk_mq_release(q); -err_poll: - blk_stat_free_callback(q->poll_cb); - q->poll_cb = NULL; err_exit: q->mq_ops = NULL; return -ENOMEM; @@ -4752,138 +4740,8 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) } EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); -/* Enable polling stats and return whether they were already enabled. */ -static bool blk_poll_stats_enable(struct request_queue *q) -{ - if (q->poll_stat) - return true; - - return blk_stats_alloc_enable(q); -} - -static void blk_mq_poll_stats_start(struct request_queue *q) -{ - /* - * We don't arm the callback if polling stats are not enabled or the - * callback is already active. 
- */ - if (!q->poll_stat || blk_stat_is_active(q->poll_cb)) - return; - - blk_stat_activate_msecs(q->poll_cb, 100); -} - -static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb) -{ - struct request_queue *q = cb->data; - int bucket; - - for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++) { - if (cb->stat[bucket].nr_samples) - q->poll_stat[bucket] = cb->stat[bucket]; - } -} - -static unsigned long blk_mq_poll_nsecs(struct request_queue *q, - struct request *rq) -{ - unsigned long ret = 0; - int bucket; - - /* - * If stats collection isn't on, don't sleep but turn it on for - * future users - */ - if (!blk_poll_stats_enable(q)) - return 0; - - /* - * As an optimistic guess, use half of the mean service time - * for this type of request. We can (and should) make this smarter. - * For instance, if the completion latencies are tight, we can - * get closer than just half the mean. This is especially - * important on devices where the completion latencies are longer - * than ~10 usec. We do use the stats for the relevant IO size - * if available which does lead to better estimates. - */ - bucket = blk_mq_poll_stats_bkt(rq); - if (bucket < 0) - return ret; - - if (q->poll_stat[bucket].nr_samples) - ret = (q->poll_stat[bucket].mean + 1) / 2; - - return ret; -} - -static bool blk_mq_poll_hybrid(struct request_queue *q, blk_qc_t qc) -{ - struct blk_mq_hw_ctx *hctx = blk_qc_to_hctx(q, qc); - struct request *rq = blk_qc_to_rq(hctx, qc); - struct hrtimer_sleeper hs; - enum hrtimer_mode mode; - unsigned int nsecs; - ktime_t kt; - - /* - * If a request has completed on queue that uses an I/O scheduler, we - * won't get back a request from blk_qc_to_rq. - */ - if (!rq || (rq->rq_flags & RQF_MQ_POLL_SLEPT)) - return false; - - /* - * If we get here, hybrid polling is enabled. Hence poll_nsec can be: - * - * 0: use half of prev avg - * >0: use this specific value - */ - if (q->poll_nsec > 0) - nsecs = q->poll_nsec; - else - nsecs = blk_mq_poll_nsecs(q, rq); - - if (!nsecs) - return false; - - rq->rq_flags |= RQF_MQ_POLL_SLEPT; - - /* - * This will be replaced with the stats tracking code, using - * 'avg_completion_time / 2' as the pre-sleep target. - */ - kt = nsecs; - - mode = HRTIMER_MODE_REL; - hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode); - hrtimer_set_expires(&hs.timer, kt); - - do { - if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE) - break; - set_current_state(TASK_UNINTERRUPTIBLE); - hrtimer_sleeper_start_expires(&hs, mode); - if (hs.task) - io_schedule(); - hrtimer_cancel(&hs.timer); - mode = HRTIMER_MODE_ABS; - } while (hs.task && !signal_pending(current)); - - __set_current_state(TASK_RUNNING); - destroy_hrtimer_on_stack(&hs.timer); - - /* - * If we sleep, have the caller restart the poll loop to reset the - * state. Like for the other success return cases, the caller is - * responsible for checking if the IO completed. If the IO isn't - * complete, we'll get called again and will go straight to the busy - * poll loop. 
- */ - return true; -} - -static int blk_mq_poll_classic(struct request_queue *q, blk_qc_t cookie, - struct io_comp_batch *iob, unsigned int flags) +int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob, + unsigned int flags) { struct blk_mq_hw_ctx *hctx = blk_qc_to_hctx(q, cookie); long state = get_current_state(); @@ -4910,17 +4768,6 @@ static int blk_mq_poll_classic(struct request_queue *q, blk_qc_t cookie, return 0; } -int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob, - unsigned int flags) -{ - if (!(flags & BLK_POLL_NOSLEEP) && - q->poll_nsec != BLK_MQ_POLL_CLASSIC) { - if (blk_mq_poll_hybrid(q, cookie)) - return 1; - } - return blk_mq_poll_classic(q, cookie, iob, flags); -} - unsigned int blk_mq_rq_cpu(struct request *rq) { return rq->mq_ctx->cpu; |