author     Linus Torvalds <torvalds@linux-foundation.org>   2022-12-13 10:43:59 -0800
committer  Linus Torvalds <torvalds@linux-foundation.org>   2022-12-13 10:43:59 -0800
commit     ce8a79d5601aab94c02ed4539c48e8605422ac94 (patch)
tree       7830a97a475d57284640c8e2d3516521722708b6 /block/blk-mq.c
parent     96f7e448b9f4546ffd0356ffceb2b9586777f316 (diff)
parent     f596da3efaf4130ff61cd029558845808df9bf99 (diff)
download   lwn-ce8a79d5601aab94c02ed4539c48e8605422ac94.tar.gz
           lwn-ce8a79d5601aab94c02ed4539c48e8605422ac94.zip
Merge tag 'for-6.2/block-2022-12-08' of git://git.kernel.dk/linux
Pull block updates from Jens Axboe:

 - NVMe pull requests via Christoph:
     - Support some passthrough commands without CAP_SYS_ADMIN (Kanchan Joshi)
     - Refactor PCIe probing and reset (Christoph Hellwig)
     - Various fabrics authentication fixes and improvements (Sagi Grimberg)
     - Avoid fallback to sequential scan due to transient issues (Uday Shankar)
     - Implement support for the DEAC bit in Write Zeroes (Christoph Hellwig)
     - Allow overriding the IEEE OUI and firmware revision in configfs for nvmet (Aleksandr Miloserdov)
     - Force reconnect when number of queue changes in nvmet (Daniel Wagner)
     - Minor fixes and improvements (Uros Bizjak, Joel Granados, Sagi Grimberg, Christoph Hellwig, Christophe JAILLET)
     - Fix and cleanup nvme-fc req allocation (Chaitanya Kulkarni)
     - Use the common tagset helpers in nvme-pci driver (Christoph Hellwig)
     - Cleanup the nvme-pci removal path (Christoph Hellwig)
     - Use kstrtobool() instead of strtobool (Christophe JAILLET)
     - Allow unprivileged passthrough of Identify Controller (Joel Granados)
     - Support io stats on the mpath device (Sagi Grimberg)
     - Minor nvmet cleanup (Sagi Grimberg)

 - MD pull requests via Song:
     - Code cleanups (Christoph)
     - Various fixes

 - Floppy pull request from Denis:
     - Fix a memory leak in the init error path (Yuan)

 - Series fixing some batch wakeup issues with sbitmap (Gabriel)

 - Removal of the pktcdvd driver that was deprecated more than 5 years ago, and subsequent removal of the devnode callback in struct block_device_operations as no users are now left (Greg)

 - Fix for partition read on an exclusively opened bdev (Jan)

 - Series of elevator API cleanups (Jinlong, Christoph)

 - Series of fixes and cleanups for blk-iocost (Kemeng)

 - Series of fixes and cleanups for blk-throttle (Kemeng)

 - Series adding concurrent support for sync queues in BFQ (Yu)

 - Series bringing drbd a bit closer to the out-of-tree maintained version (Christian, Joel, Lars, Philipp)

 - Misc drbd fixes (Wang)

 - blk-wbt fixes and tweaks for enable/disable (Yu)

 - Fixes for mq-deadline for zoned devices (Damien)

 - Add support for read-only and offline zones for null_blk (Shin'ichiro)

 - Series fixing the delayed holder tracking, as used by DM (Yu, Christoph)

 - Series enabling bio alloc caching for IRQ based IO (Pavel)

 - Series enabling userspace peer-to-peer DMA (Logan)

 - BFQ waker fixes (Khazhismel)

 - Series fixing elevator refcount issues (Christoph, Jinlong)

 - Series cleaning up references around queue destruction (Christoph)

 - Series doing quiesce by tagset, enabling cleanups in drivers (Christoph, Chao)

 - Series untangling the queue kobject and queue references (Christoph)

 - Misc fixes and cleanups (Bart, David, Dawei, Jinlong, Kemeng, Ye, Yang, Waiman, Shin'ichiro, Randy, Pankaj, Christoph)

* tag 'for-6.2/block-2022-12-08' of git://git.kernel.dk/linux: (247 commits)
  blktrace: Fix output non-blktrace event when blk_classic option enabled
  block: sed-opal: Don't include <linux/kernel.h>
  sed-opal: allow using IOC_OPAL_SAVE for locking too
  blk-cgroup: Fix typo in comment
  block: remove bio_set_op_attrs
  nvmet: don't open-code NVME_NS_ATTR_RO enumeration
  nvme-pci: use the tagset alloc/free helpers
  nvme: add the Apple shared tag workaround to nvme_alloc_io_tag_set
  nvme: only set reserved_tags in nvme_alloc_io_tag_set for fabrics controllers
  nvme: consolidate setting the tagset flags
  nvme: pass nr_maps explicitly to nvme_alloc_io_tag_set
  block: bio_copy_data_iter
  nvme-pci: split out a nvme_pci_ctrl_is_dead helper
  nvme-pci: return early on ctrl state mismatch in nvme_reset_work
  nvme-pci: rename nvme_disable_io_queues
  nvme-pci: cleanup nvme_suspend_queue
  nvme-pci: remove nvme_pci_disable
  nvme-pci: remove nvme_disable_admin_queue
  nvme: merge nvme_shutdown_ctrl into nvme_disable_ctrl
  nvme: use nvme_wait_ready in nvme_shutdown_ctrl
  ...
Diffstat (limited to 'block/blk-mq.c')
-rw-r--r--  block/blk-mq.c  229
1 file changed, 148 insertions, 81 deletions
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 228a6696d835..c5cf0dbca1db 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -254,15 +254,17 @@ EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
/**
* blk_mq_wait_quiesce_done() - wait until in-progress quiesce is done
- * @q: request queue.
+ * @set: tag_set to wait on
*
* Note: it is driver's responsibility for making sure that quiesce has
- * been started.
+ been started on one or more of the request_queues of the tag_set. This
+ * function only waits for the quiesce on those request_queues that had
+ * the quiesce flag set using blk_mq_quiesce_queue_nowait.
*/
-void blk_mq_wait_quiesce_done(struct request_queue *q)
+void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set)
{
- if (blk_queue_has_srcu(q))
- synchronize_srcu(q->srcu);
+ if (set->flags & BLK_MQ_F_BLOCKING)
+ synchronize_srcu(set->srcu);
else
synchronize_rcu();
}
@@ -280,7 +282,9 @@ EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done);
void blk_mq_quiesce_queue(struct request_queue *q)
{
blk_mq_quiesce_queue_nowait(q);
- blk_mq_wait_quiesce_done(q);
+ /* nothing to wait for non-mq queues */
+ if (queue_is_mq(q))
+ blk_mq_wait_quiesce_done(q->tag_set);
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
@@ -311,6 +315,33 @@ void blk_mq_unquiesce_queue(struct request_queue *q)
}
EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);
+void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set)
+{
+ struct request_queue *q;
+
+ mutex_lock(&set->tag_list_lock);
+ list_for_each_entry(q, &set->tag_list, tag_set_list) {
+ if (!blk_queue_skip_tagset_quiesce(q))
+ blk_mq_quiesce_queue_nowait(q);
+ }
+ blk_mq_wait_quiesce_done(set);
+ mutex_unlock(&set->tag_list_lock);
+}
+EXPORT_SYMBOL_GPL(blk_mq_quiesce_tagset);
+
+void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set)
+{
+ struct request_queue *q;
+
+ mutex_lock(&set->tag_list_lock);
+ list_for_each_entry(q, &set->tag_list, tag_set_list) {
+ if (!blk_queue_skip_tagset_quiesce(q))
+ blk_mq_unquiesce_queue(q);
+ }
+ mutex_unlock(&set->tag_list_lock);
+}
+EXPORT_SYMBOL_GPL(blk_mq_unquiesce_tagset);
+
void blk_mq_wake_waiters(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
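
The two exported helpers added above let a driver quiesce and unquiesce every request_queue sharing a tag_set in one call, skipping queues flagged with blk_queue_skip_tagset_quiesce(). A minimal, hypothetical driver-side sketch follows; my_ctrl and the function names are illustrative, not part of the patch:

	struct my_ctrl {
		struct blk_mq_tag_set tag_set;
		/* ... */
	};

	/* stop dispatch on all queues sharing the controller's tag_set */
	static void my_ctrl_stop_io(struct my_ctrl *ctrl)
	{
		blk_mq_quiesce_tagset(&ctrl->tag_set);
	}

	/* resume dispatch once the controller is usable again */
	static void my_ctrl_resume_io(struct my_ctrl *ctrl)
	{
		blk_mq_unquiesce_tagset(&ctrl->tag_set);
	}

Compared with quiescing each queue in a loop, blk_mq_quiesce_tagset() marks all queues nowait and then performs the RCU/SRCU synchronization once per tag_set via blk_mq_wait_quiesce_done().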
@@ -544,25 +575,26 @@ static struct request *blk_mq_alloc_cached_request(struct request_queue *q,
if (!plug)
return NULL;
+
if (rq_list_empty(plug->cached_rq)) {
if (plug->nr_ios == 1)
return NULL;
rq = blk_mq_rq_cache_fill(q, plug, opf, flags);
- if (rq)
- goto got_it;
- return NULL;
- }
- rq = rq_list_peek(&plug->cached_rq);
- if (!rq || rq->q != q)
- return NULL;
+ if (!rq)
+ return NULL;
+ } else {
+ rq = rq_list_peek(&plug->cached_rq);
+ if (!rq || rq->q != q)
+ return NULL;
- if (blk_mq_get_hctx_type(opf) != rq->mq_hctx->type)
- return NULL;
- if (op_is_flush(rq->cmd_flags) != op_is_flush(opf))
- return NULL;
+ if (blk_mq_get_hctx_type(opf) != rq->mq_hctx->type)
+ return NULL;
+ if (op_is_flush(rq->cmd_flags) != op_is_flush(opf))
+ return NULL;
+
+ plug->cached_rq = rq_list_next(rq);
+ }
- plug->cached_rq = rq_list_next(rq);
-got_it:
rq->cmd_flags = opf;
INIT_LIST_HEAD(&rq->queuelist);
return rq;
@@ -1529,7 +1561,13 @@ static void blk_mq_rq_timed_out(struct request *req)
blk_add_timer(req);
}
-static bool blk_mq_req_expired(struct request *rq, unsigned long *next)
+struct blk_expired_data {
+ bool has_timedout_rq;
+ unsigned long next;
+ unsigned long timeout_start;
+};
+
+static bool blk_mq_req_expired(struct request *rq, struct blk_expired_data *expired)
{
unsigned long deadline;
@@ -1539,13 +1577,13 @@ static bool blk_mq_req_expired(struct request *rq, unsigned long *next)
return false;
deadline = READ_ONCE(rq->deadline);
- if (time_after_eq(jiffies, deadline))
+ if (time_after_eq(expired->timeout_start, deadline))
return true;
- if (*next == 0)
- *next = deadline;
- else if (time_after(*next, deadline))
- *next = deadline;
+ if (expired->next == 0)
+ expired->next = deadline;
+ else if (time_after(expired->next, deadline))
+ expired->next = deadline;
return false;
}
@@ -1561,7 +1599,7 @@ void blk_mq_put_rq_ref(struct request *rq)
static bool blk_mq_check_expired(struct request *rq, void *priv)
{
- unsigned long *next = priv;
+ struct blk_expired_data *expired = priv;
/*
* blk_mq_queue_tag_busy_iter() has locked the request, so it cannot
@@ -1570,7 +1608,18 @@ static bool blk_mq_check_expired(struct request *rq, void *priv)
* it was completed and reallocated as a new request after returning
* from blk_mq_check_expired().
*/
- if (blk_mq_req_expired(rq, next))
+ if (blk_mq_req_expired(rq, expired)) {
+ expired->has_timedout_rq = true;
+ return false;
+ }
+ return true;
+}
+
+static bool blk_mq_handle_expired(struct request *rq, void *priv)
+{
+ struct blk_expired_data *expired = priv;
+
+ if (blk_mq_req_expired(rq, expired))
blk_mq_rq_timed_out(rq);
return true;
}
@@ -1579,7 +1628,9 @@ static void blk_mq_timeout_work(struct work_struct *work)
{
struct request_queue *q =
container_of(work, struct request_queue, timeout_work);
- unsigned long next = 0;
+ struct blk_expired_data expired = {
+ .timeout_start = jiffies,
+ };
struct blk_mq_hw_ctx *hctx;
unsigned long i;
@@ -1599,10 +1650,23 @@ static void blk_mq_timeout_work(struct work_struct *work)
if (!percpu_ref_tryget(&q->q_usage_counter))
return;
- blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next);
+ /* check if there is any timed-out request */
+ blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &expired);
+ if (expired.has_timedout_rq) {
+ /*
+ * Before walking tags, we must ensure any submit started
+ * before the current time has finished. Since the submit
+ * uses srcu or rcu, wait for a synchronization point to
+ * ensure all running submits have finished
+ */
+ blk_mq_wait_quiesce_done(q->tag_set);
+
+ expired.next = 0;
+ blk_mq_queue_tag_busy_iter(q, blk_mq_handle_expired, &expired);
+ }
- if (next != 0) {
- mod_timer(&q->timeout, next);
+ if (expired.next != 0) {
+ mod_timer(&q->timeout, expired.next);
} else {
/*
* Request timeouts are handled as a forward rolling timer. If
@@ -3248,21 +3312,22 @@ static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *),
GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
node);
- if (!tags->rqs) {
- blk_mq_free_tags(tags);
- return NULL;
- }
+ if (!tags->rqs)
+ goto err_free_tags;
tags->static_rqs = kcalloc_node(nr_tags, sizeof(struct request *),
GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
node);
- if (!tags->static_rqs) {
- kfree(tags->rqs);
- blk_mq_free_tags(tags);
- return NULL;
- }
+ if (!tags->static_rqs)
+ goto err_free_rqs;
return tags;
+
+err_free_rqs:
+ kfree(tags->rqs);
+err_free_tags:
+ blk_mq_free_tags(tags);
+ return NULL;
}
static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
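
The blk_mq_alloc_rq_map() hunk above replaces duplicated cleanup with the usual goto-unwind error path, freeing in reverse order of allocation. A generic sketch of the idiom, with illustrative names only:

	obj = kzalloc(sizeof(*obj), GFP_KERNEL);
	if (!obj)
		return NULL;

	obj->a = kcalloc(n, sizeof(*obj->a), GFP_KERNEL);
	if (!obj->a)
		goto err_free_obj;

	obj->b = kcalloc(n, sizeof(*obj->b), GFP_KERNEL);
	if (!obj->b)
		goto err_free_a;

	return obj;

	err_free_a:
		kfree(obj->a);
	err_free_obj:
		kfree(obj);
		return NULL;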
@@ -3975,7 +4040,7 @@ static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
struct request_queue *q;
int ret;
- q = blk_alloc_queue(set->numa_node, set->flags & BLK_MQ_F_BLOCKING);
+ q = blk_alloc_queue(set->numa_node);
if (!q)
return ERR_PTR(-ENOMEM);
q->queuedata = queuedata;
@@ -4011,14 +4076,11 @@ void blk_mq_destroy_queue(struct request_queue *q)
blk_queue_flag_set(QUEUE_FLAG_DYING, q);
blk_queue_start_drain(q);
- blk_freeze_queue(q);
+ blk_mq_freeze_queue_wait(q);
blk_sync_queue(q);
blk_mq_cancel_work_sync(q);
blk_mq_exit_queue(q);
-
- /* @q is and will stay empty, shutdown and put */
- blk_put_queue(q);
}
EXPORT_SYMBOL(blk_mq_destroy_queue);
@@ -4035,6 +4097,7 @@ struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata,
disk = __alloc_disk_node(q, set->numa_node, lkclass);
if (!disk) {
blk_mq_destroy_queue(q);
+ blk_put_queue(q);
return ERR_PTR(-ENOMEM);
}
set_bit(GD_OWNS_QUEUE, &disk->state);
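
Taken together, the two hunks above mean blk_mq_destroy_queue() no longer drops the final queue reference itself; callers now pair it with an explicit blk_put_queue(), as __blk_mq_alloc_disk() does here. A hypothetical driver owning a standalone queue would follow the same pattern (error handling trimmed, names illustrative):

	struct request_queue *q;

	q = blk_mq_init_queue(&ctrl->tag_set);
	if (IS_ERR(q))
		return PTR_ERR(q);

	/* ... queue in use ... */

	blk_mq_destroy_queue(q);	/* drain and shut the queue down */
	blk_put_queue(q);		/* drop the reference we still hold */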
@@ -4147,9 +4210,6 @@ static void blk_mq_update_poll_flag(struct request_queue *q)
int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
struct request_queue *q)
{
- WARN_ON_ONCE(blk_queue_has_srcu(q) !=
- !!(set->flags & BLK_MQ_F_BLOCKING));
-
/* mark the queue as mq asap */
q->mq_ops = set->ops;
@@ -4325,12 +4385,12 @@ static void blk_mq_update_queue_map(struct blk_mq_tag_set *set)
}
static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set,
- int cur_nr_hw_queues, int new_nr_hw_queues)
+ int new_nr_hw_queues)
{
struct blk_mq_tags **new_tags;
- if (cur_nr_hw_queues >= new_nr_hw_queues)
- return 0;
+ if (set->nr_hw_queues >= new_nr_hw_queues)
+ goto done;
new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *),
GFP_KERNEL, set->numa_node);
@@ -4338,21 +4398,15 @@ static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set,
return -ENOMEM;
if (set->tags)
- memcpy(new_tags, set->tags, cur_nr_hw_queues *
+ memcpy(new_tags, set->tags, set->nr_hw_queues *
sizeof(*set->tags));
kfree(set->tags);
set->tags = new_tags;
+done:
set->nr_hw_queues = new_nr_hw_queues;
-
return 0;
}
-static int blk_mq_alloc_tag_set_tags(struct blk_mq_tag_set *set,
- int new_nr_hw_queues)
-{
- return blk_mq_realloc_tag_set_tags(set, 0, new_nr_hw_queues);
-}
-
/*
* Alloc a tag set to be associated with one or more request queues.
* May fail with EINVAL for various error conditions. May adjust the
@@ -4406,10 +4460,22 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids)
set->nr_hw_queues = nr_cpu_ids;
- if (blk_mq_alloc_tag_set_tags(set, set->nr_hw_queues) < 0)
- return -ENOMEM;
+ if (set->flags & BLK_MQ_F_BLOCKING) {
+ set->srcu = kmalloc(sizeof(*set->srcu), GFP_KERNEL);
+ if (!set->srcu)
+ return -ENOMEM;
+ ret = init_srcu_struct(set->srcu);
+ if (ret)
+ goto out_free_srcu;
+ }
ret = -ENOMEM;
+ set->tags = kcalloc_node(set->nr_hw_queues,
+ sizeof(struct blk_mq_tags *), GFP_KERNEL,
+ set->numa_node);
+ if (!set->tags)
+ goto out_cleanup_srcu;
+
for (i = 0; i < set->nr_maps; i++) {
set->map[i].mq_map = kcalloc_node(nr_cpu_ids,
sizeof(set->map[i].mq_map[0]),
@@ -4437,6 +4503,12 @@ out_free_mq_map:
}
kfree(set->tags);
set->tags = NULL;
+out_cleanup_srcu:
+ if (set->flags & BLK_MQ_F_BLOCKING)
+ cleanup_srcu_struct(set->srcu);
+out_free_srcu:
+ if (set->flags & BLK_MQ_F_BLOCKING)
+ kfree(set->srcu);
return ret;
}
EXPORT_SYMBOL(blk_mq_alloc_tag_set);
@@ -4476,6 +4548,10 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
kfree(set->tags);
set->tags = NULL;
+ if (set->flags & BLK_MQ_F_BLOCKING) {
+ cleanup_srcu_struct(set->srcu);
+ kfree(set->srcu);
+ }
}
EXPORT_SYMBOL(blk_mq_free_tag_set);
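
With the SRCU state moved from the request_queue into the tag_set, a driver whose ->queue_rq() may sleep only has to set BLK_MQ_F_BLOCKING; allocation of set->srcu in blk_mq_alloc_tag_set() and its cleanup in blk_mq_free_tag_set() are handled by the core, as the hunks above show. A minimal, hypothetical setup sketch (my_blocking_mq_ops is illustrative):

	struct blk_mq_tag_set *set = &ctrl->tag_set;
	int ret;

	memset(set, 0, sizeof(*set));
	set->ops = &my_blocking_mq_ops;		/* ->queue_rq() may sleep */
	set->nr_hw_queues = 1;
	set->queue_depth = 64;
	set->numa_node = NUMA_NO_NODE;
	set->flags = BLK_MQ_F_BLOCKING;		/* core allocates and inits set->srcu */

	ret = blk_mq_alloc_tag_set(set);
	if (ret)
		return ret;

	/* ... */

	blk_mq_free_tag_set(set);		/* cleanup_srcu_struct() + kfree(set->srcu) */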
@@ -4564,17 +4640,10 @@ static bool blk_mq_elv_switch_none(struct list_head *head,
INIT_LIST_HEAD(&qe->node);
qe->q = q;
qe->type = q->elevator->type;
+ /* keep a reference to the elevator module as we'll switch back */
+ __elevator_get(qe->type);
list_add(&qe->node, head);
-
- /*
- * After elevator_switch, the previous elevator_queue will be
- * released by elevator_release. The reference of the io scheduler
- * module get by elevator_get will also be put. So we need to get
- * a reference of the io scheduler module here to prevent it to be
- * removed.
- */
- __module_get(qe->type->elevator_owner);
- elevator_switch(q, NULL);
+ elevator_disable(q);
mutex_unlock(&q->sysfs_lock);
return true;
@@ -4607,6 +4676,8 @@ static void blk_mq_elv_switch_back(struct list_head *head,
mutex_lock(&q->sysfs_lock);
elevator_switch(q, t);
+ /* drop the reference acquired in blk_mq_elv_switch_none */
+ elevator_put(t);
mutex_unlock(&q->sysfs_lock);
}
@@ -4643,11 +4714,9 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
}
prev_nr_hw_queues = set->nr_hw_queues;
- if (blk_mq_realloc_tag_set_tags(set, set->nr_hw_queues, nr_hw_queues) <
- 0)
+ if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0)
goto reregister;
- set->nr_hw_queues = nr_hw_queues;
fallback:
blk_mq_update_queue_map(set);
list_for_each_entry(q, &set->tag_list, tag_set_list) {
@@ -4867,15 +4936,13 @@ EXPORT_SYMBOL(blk_mq_rq_cpu);
void blk_mq_cancel_work_sync(struct request_queue *q)
{
- if (queue_is_mq(q)) {
- struct blk_mq_hw_ctx *hctx;
- unsigned long i;
+ struct blk_mq_hw_ctx *hctx;
+ unsigned long i;
- cancel_delayed_work_sync(&q->requeue_work);
+ cancel_delayed_work_sync(&q->requeue_work);
- queue_for_each_hw_ctx(q, hctx, i)
- cancel_delayed_work_sync(&hctx->run_work);
- }
+ queue_for_each_hw_ctx(q, hctx, i)
+ cancel_delayed_work_sync(&hctx->run_work);
}
static int __init blk_mq_init(void)