diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-12-13 10:43:59 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-12-13 10:43:59 -0800 |
commit | ce8a79d5601aab94c02ed4539c48e8605422ac94 (patch) | |
tree | 7830a97a475d57284640c8e2d3516521722708b6 /block/blk-mq.c | |
parent | 96f7e448b9f4546ffd0356ffceb2b9586777f316 (diff) | |
parent | f596da3efaf4130ff61cd029558845808df9bf99 (diff) | |
download | lwn-ce8a79d5601aab94c02ed4539c48e8605422ac94.tar.gz lwn-ce8a79d5601aab94c02ed4539c48e8605422ac94.zip |
Merge tag 'for-6.2/block-2022-12-08' of git://git.kernel.dk/linux
Pull block updates from Jens Axboe:
- NVMe pull requests via Christoph:
- Support some passthrough commands without CAP_SYS_ADMIN (Kanchan
Joshi)
- Refactor PCIe probing and reset (Christoph Hellwig)
- Various fabrics authentication fixes and improvements (Sagi
Grimberg)
- Avoid fallback to sequential scan due to transient issues (Uday
Shankar)
- Implement support for the DEAC bit in Write Zeroes (Christoph
Hellwig)
- Allow overriding the IEEE OUI and firmware revision in configfs
for nvmet (Aleksandr Miloserdov)
- Force reconnect when number of queue changes in nvmet (Daniel
Wagner)
- Minor fixes and improvements (Uros Bizjak, Joel Granados, Sagi
Grimberg, Christoph Hellwig, Christophe JAILLET)
- Fix and cleanup nvme-fc req allocation (Chaitanya Kulkarni)
- Use the common tagset helpers in nvme-pci driver (Christoph
Hellwig)
- Cleanup the nvme-pci removal path (Christoph Hellwig)
- Use kstrtobool() instead of strtobool (Christophe JAILLET)
- Allow unprivileged passthrough of Identify Controller (Joel
Granados)
- Support io stats on the mpath device (Sagi Grimberg)
- Minor nvmet cleanup (Sagi Grimberg)
- MD pull requests via Song:
- Code cleanups (Christoph)
- Various fixes
- Floppy pull request from Denis:
- Fix a memory leak in the init error path (Yuan)
- Series fixing some batch wakeup issues with sbitmap (Gabriel)
- Removal of the pktcdvd driver that was deprecated more than 5 years
ago, and subsequent removal of the devnode callback in struct
block_device_operations as no users are now left (Greg)
- Fix for partition read on an exclusively opened bdev (Jan)
- Series of elevator API cleanups (Jinlong, Christoph)
- Series of fixes and cleanups for blk-iocost (Kemeng)
- Series of fixes and cleanups for blk-throttle (Kemeng)
- Series adding concurrent support for sync queues in BFQ (Yu)
- Series bringing drbd a bit closer to the out-of-tree maintained
version (Christian, Joel, Lars, Philipp)
- Misc drbd fixes (Wang)
- blk-wbt fixes and tweaks for enable/disable (Yu)
- Fixes for mq-deadline for zoned devices (Damien)
- Add support for read-only and offline zones for null_blk
(Shin'ichiro)
- Series fixing the delayed holder tracking, as used by DM (Yu,
Christoph)
- Series enabling bio alloc caching for IRQ based IO (Pavel)
- Series enabling userspace peer-to-peer DMA (Logan)
- BFQ waker fixes (Khazhismel)
- Series fixing elevator refcount issues (Christoph, Jinlong)
- Series cleaning up references around queue destruction (Christoph)
- Series doing quiesce by tagset, enabling cleanups in drivers
(Christoph, Chao)
- Series untangling the queue kobject and queue references (Christoph)
- Misc fixes and cleanups (Bart, David, Dawei, Jinlong, Kemeng, Ye,
Yang, Waiman, Shin'ichiro, Randy, Pankaj, Christoph)
* tag 'for-6.2/block-2022-12-08' of git://git.kernel.dk/linux: (247 commits)
blktrace: Fix output non-blktrace event when blk_classic option enabled
block: sed-opal: Don't include <linux/kernel.h>
sed-opal: allow using IOC_OPAL_SAVE for locking too
blk-cgroup: Fix typo in comment
block: remove bio_set_op_attrs
nvmet: don't open-code NVME_NS_ATTR_RO enumeration
nvme-pci: use the tagset alloc/free helpers
nvme: add the Apple shared tag workaround to nvme_alloc_io_tag_set
nvme: only set reserved_tags in nvme_alloc_io_tag_set for fabrics controllers
nvme: consolidate setting the tagset flags
nvme: pass nr_maps explicitly to nvme_alloc_io_tag_set
block: bio_copy_data_iter
nvme-pci: split out a nvme_pci_ctrl_is_dead helper
nvme-pci: return early on ctrl state mismatch in nvme_reset_work
nvme-pci: rename nvme_disable_io_queues
nvme-pci: cleanup nvme_suspend_queue
nvme-pci: remove nvme_pci_disable
nvme-pci: remove nvme_disable_admin_queue
nvme: merge nvme_shutdown_ctrl into nvme_disable_ctrl
nvme: use nvme_wait_ready in nvme_shutdown_ctrl
...
Diffstat (limited to 'block/blk-mq.c')
-rw-r--r-- | block/blk-mq.c | 229 |
1 files changed, 148 insertions, 81 deletions
diff --git a/block/blk-mq.c b/block/blk-mq.c index 228a6696d835..c5cf0dbca1db 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -254,15 +254,17 @@ EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait); /** * blk_mq_wait_quiesce_done() - wait until in-progress quiesce is done - * @q: request queue. + * @set: tag_set to wait on * * Note: it is driver's responsibility for making sure that quiesce has - * been started. + * been started on or more of the request_queues of the tag_set. This + * function only waits for the quiesce on those request_queues that had + * the quiesce flag set using blk_mq_quiesce_queue_nowait. */ -void blk_mq_wait_quiesce_done(struct request_queue *q) +void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set) { - if (blk_queue_has_srcu(q)) - synchronize_srcu(q->srcu); + if (set->flags & BLK_MQ_F_BLOCKING) + synchronize_srcu(set->srcu); else synchronize_rcu(); } @@ -280,7 +282,9 @@ EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done); void blk_mq_quiesce_queue(struct request_queue *q) { blk_mq_quiesce_queue_nowait(q); - blk_mq_wait_quiesce_done(q); + /* nothing to wait for non-mq queues */ + if (queue_is_mq(q)) + blk_mq_wait_quiesce_done(q->tag_set); } EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue); @@ -311,6 +315,33 @@ void blk_mq_unquiesce_queue(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue); +void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set) +{ + struct request_queue *q; + + mutex_lock(&set->tag_list_lock); + list_for_each_entry(q, &set->tag_list, tag_set_list) { + if (!blk_queue_skip_tagset_quiesce(q)) + blk_mq_quiesce_queue_nowait(q); + } + blk_mq_wait_quiesce_done(set); + mutex_unlock(&set->tag_list_lock); +} +EXPORT_SYMBOL_GPL(blk_mq_quiesce_tagset); + +void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set) +{ + struct request_queue *q; + + mutex_lock(&set->tag_list_lock); + list_for_each_entry(q, &set->tag_list, tag_set_list) { + if (!blk_queue_skip_tagset_quiesce(q)) + blk_mq_unquiesce_queue(q); + } + mutex_unlock(&set->tag_list_lock); +} +EXPORT_SYMBOL_GPL(blk_mq_unquiesce_tagset); + void blk_mq_wake_waiters(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; @@ -544,25 +575,26 @@ static struct request *blk_mq_alloc_cached_request(struct request_queue *q, if (!plug) return NULL; + if (rq_list_empty(plug->cached_rq)) { if (plug->nr_ios == 1) return NULL; rq = blk_mq_rq_cache_fill(q, plug, opf, flags); - if (rq) - goto got_it; - return NULL; - } - rq = rq_list_peek(&plug->cached_rq); - if (!rq || rq->q != q) - return NULL; + if (!rq) + return NULL; + } else { + rq = rq_list_peek(&plug->cached_rq); + if (!rq || rq->q != q) + return NULL; - if (blk_mq_get_hctx_type(opf) != rq->mq_hctx->type) - return NULL; - if (op_is_flush(rq->cmd_flags) != op_is_flush(opf)) - return NULL; + if (blk_mq_get_hctx_type(opf) != rq->mq_hctx->type) + return NULL; + if (op_is_flush(rq->cmd_flags) != op_is_flush(opf)) + return NULL; + + plug->cached_rq = rq_list_next(rq); + } - plug->cached_rq = rq_list_next(rq); -got_it: rq->cmd_flags = opf; INIT_LIST_HEAD(&rq->queuelist); return rq; @@ -1529,7 +1561,13 @@ static void blk_mq_rq_timed_out(struct request *req) blk_add_timer(req); } -static bool blk_mq_req_expired(struct request *rq, unsigned long *next) +struct blk_expired_data { + bool has_timedout_rq; + unsigned long next; + unsigned long timeout_start; +}; + +static bool blk_mq_req_expired(struct request *rq, struct blk_expired_data *expired) { unsigned long deadline; @@ -1539,13 +1577,13 @@ static bool blk_mq_req_expired(struct request *rq, unsigned long *next) return false; deadline = READ_ONCE(rq->deadline); - if (time_after_eq(jiffies, deadline)) + if (time_after_eq(expired->timeout_start, deadline)) return true; - if (*next == 0) - *next = deadline; - else if (time_after(*next, deadline)) - *next = deadline; + if (expired->next == 0) + expired->next = deadline; + else if (time_after(expired->next, deadline)) + expired->next = deadline; return false; } @@ -1561,7 +1599,7 @@ void blk_mq_put_rq_ref(struct request *rq) static bool blk_mq_check_expired(struct request *rq, void *priv) { - unsigned long *next = priv; + struct blk_expired_data *expired = priv; /* * blk_mq_queue_tag_busy_iter() has locked the request, so it cannot @@ -1570,7 +1608,18 @@ static bool blk_mq_check_expired(struct request *rq, void *priv) * it was completed and reallocated as a new request after returning * from blk_mq_check_expired(). */ - if (blk_mq_req_expired(rq, next)) + if (blk_mq_req_expired(rq, expired)) { + expired->has_timedout_rq = true; + return false; + } + return true; +} + +static bool blk_mq_handle_expired(struct request *rq, void *priv) +{ + struct blk_expired_data *expired = priv; + + if (blk_mq_req_expired(rq, expired)) blk_mq_rq_timed_out(rq); return true; } @@ -1579,7 +1628,9 @@ static void blk_mq_timeout_work(struct work_struct *work) { struct request_queue *q = container_of(work, struct request_queue, timeout_work); - unsigned long next = 0; + struct blk_expired_data expired = { + .timeout_start = jiffies, + }; struct blk_mq_hw_ctx *hctx; unsigned long i; @@ -1599,10 +1650,23 @@ static void blk_mq_timeout_work(struct work_struct *work) if (!percpu_ref_tryget(&q->q_usage_counter)) return; - blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next); + /* check if there is any timed-out request */ + blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &expired); + if (expired.has_timedout_rq) { + /* + * Before walking tags, we must ensure any submit started + * before the current time has finished. Since the submit + * uses srcu or rcu, wait for a synchronization point to + * ensure all running submits have finished + */ + blk_mq_wait_quiesce_done(q->tag_set); + + expired.next = 0; + blk_mq_queue_tag_busy_iter(q, blk_mq_handle_expired, &expired); + } - if (next != 0) { - mod_timer(&q->timeout, next); + if (expired.next != 0) { + mod_timer(&q->timeout, expired.next); } else { /* * Request timeouts are handled as a forward rolling timer. If @@ -3248,21 +3312,22 @@ static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node); - if (!tags->rqs) { - blk_mq_free_tags(tags); - return NULL; - } + if (!tags->rqs) + goto err_free_tags; tags->static_rqs = kcalloc_node(nr_tags, sizeof(struct request *), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node); - if (!tags->static_rqs) { - kfree(tags->rqs); - blk_mq_free_tags(tags); - return NULL; - } + if (!tags->static_rqs) + goto err_free_rqs; return tags; + +err_free_rqs: + kfree(tags->rqs); +err_free_tags: + blk_mq_free_tags(tags); + return NULL; } static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq, @@ -3975,7 +4040,7 @@ static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, struct request_queue *q; int ret; - q = blk_alloc_queue(set->numa_node, set->flags & BLK_MQ_F_BLOCKING); + q = blk_alloc_queue(set->numa_node); if (!q) return ERR_PTR(-ENOMEM); q->queuedata = queuedata; @@ -4011,14 +4076,11 @@ void blk_mq_destroy_queue(struct request_queue *q) blk_queue_flag_set(QUEUE_FLAG_DYING, q); blk_queue_start_drain(q); - blk_freeze_queue(q); + blk_mq_freeze_queue_wait(q); blk_sync_queue(q); blk_mq_cancel_work_sync(q); blk_mq_exit_queue(q); - - /* @q is and will stay empty, shutdown and put */ - blk_put_queue(q); } EXPORT_SYMBOL(blk_mq_destroy_queue); @@ -4035,6 +4097,7 @@ struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata, disk = __alloc_disk_node(q, set->numa_node, lkclass); if (!disk) { blk_mq_destroy_queue(q); + blk_put_queue(q); return ERR_PTR(-ENOMEM); } set_bit(GD_OWNS_QUEUE, &disk->state); @@ -4147,9 +4210,6 @@ static void blk_mq_update_poll_flag(struct request_queue *q) int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q) { - WARN_ON_ONCE(blk_queue_has_srcu(q) != - !!(set->flags & BLK_MQ_F_BLOCKING)); - /* mark the queue as mq asap */ q->mq_ops = set->ops; @@ -4325,12 +4385,12 @@ static void blk_mq_update_queue_map(struct blk_mq_tag_set *set) } static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set, - int cur_nr_hw_queues, int new_nr_hw_queues) + int new_nr_hw_queues) { struct blk_mq_tags **new_tags; - if (cur_nr_hw_queues >= new_nr_hw_queues) - return 0; + if (set->nr_hw_queues >= new_nr_hw_queues) + goto done; new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *), GFP_KERNEL, set->numa_node); @@ -4338,21 +4398,15 @@ static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set, return -ENOMEM; if (set->tags) - memcpy(new_tags, set->tags, cur_nr_hw_queues * + memcpy(new_tags, set->tags, set->nr_hw_queues * sizeof(*set->tags)); kfree(set->tags); set->tags = new_tags; +done: set->nr_hw_queues = new_nr_hw_queues; - return 0; } -static int blk_mq_alloc_tag_set_tags(struct blk_mq_tag_set *set, - int new_nr_hw_queues) -{ - return blk_mq_realloc_tag_set_tags(set, 0, new_nr_hw_queues); -} - /* * Alloc a tag set to be associated with one or more request queues. * May fail with EINVAL for various error conditions. May adjust the @@ -4406,10 +4460,22 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids) set->nr_hw_queues = nr_cpu_ids; - if (blk_mq_alloc_tag_set_tags(set, set->nr_hw_queues) < 0) - return -ENOMEM; + if (set->flags & BLK_MQ_F_BLOCKING) { + set->srcu = kmalloc(sizeof(*set->srcu), GFP_KERNEL); + if (!set->srcu) + return -ENOMEM; + ret = init_srcu_struct(set->srcu); + if (ret) + goto out_free_srcu; + } ret = -ENOMEM; + set->tags = kcalloc_node(set->nr_hw_queues, + sizeof(struct blk_mq_tags *), GFP_KERNEL, + set->numa_node); + if (!set->tags) + goto out_cleanup_srcu; + for (i = 0; i < set->nr_maps; i++) { set->map[i].mq_map = kcalloc_node(nr_cpu_ids, sizeof(set->map[i].mq_map[0]), @@ -4437,6 +4503,12 @@ out_free_mq_map: } kfree(set->tags); set->tags = NULL; +out_cleanup_srcu: + if (set->flags & BLK_MQ_F_BLOCKING) + cleanup_srcu_struct(set->srcu); +out_free_srcu: + if (set->flags & BLK_MQ_F_BLOCKING) + kfree(set->srcu); return ret; } EXPORT_SYMBOL(blk_mq_alloc_tag_set); @@ -4476,6 +4548,10 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) kfree(set->tags); set->tags = NULL; + if (set->flags & BLK_MQ_F_BLOCKING) { + cleanup_srcu_struct(set->srcu); + kfree(set->srcu); + } } EXPORT_SYMBOL(blk_mq_free_tag_set); @@ -4564,17 +4640,10 @@ static bool blk_mq_elv_switch_none(struct list_head *head, INIT_LIST_HEAD(&qe->node); qe->q = q; qe->type = q->elevator->type; + /* keep a reference to the elevator module as we'll switch back */ + __elevator_get(qe->type); list_add(&qe->node, head); - - /* - * After elevator_switch, the previous elevator_queue will be - * released by elevator_release. The reference of the io scheduler - * module get by elevator_get will also be put. So we need to get - * a reference of the io scheduler module here to prevent it to be - * removed. - */ - __module_get(qe->type->elevator_owner); - elevator_switch(q, NULL); + elevator_disable(q); mutex_unlock(&q->sysfs_lock); return true; @@ -4607,6 +4676,8 @@ static void blk_mq_elv_switch_back(struct list_head *head, mutex_lock(&q->sysfs_lock); elevator_switch(q, t); + /* drop the reference acquired in blk_mq_elv_switch_none */ + elevator_put(t); mutex_unlock(&q->sysfs_lock); } @@ -4643,11 +4714,9 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, } prev_nr_hw_queues = set->nr_hw_queues; - if (blk_mq_realloc_tag_set_tags(set, set->nr_hw_queues, nr_hw_queues) < - 0) + if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0) goto reregister; - set->nr_hw_queues = nr_hw_queues; fallback: blk_mq_update_queue_map(set); list_for_each_entry(q, &set->tag_list, tag_set_list) { @@ -4867,15 +4936,13 @@ EXPORT_SYMBOL(blk_mq_rq_cpu); void blk_mq_cancel_work_sync(struct request_queue *q) { - if (queue_is_mq(q)) { - struct blk_mq_hw_ctx *hctx; - unsigned long i; + struct blk_mq_hw_ctx *hctx; + unsigned long i; - cancel_delayed_work_sync(&q->requeue_work); + cancel_delayed_work_sync(&q->requeue_work); - queue_for_each_hw_ctx(q, hctx, i) - cancel_delayed_work_sync(&hctx->run_work); - } + queue_for_each_hw_ctx(q, hctx, i) + cancel_delayed_work_sync(&hctx->run_work); } static int __init blk_mq_init(void) |