diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-12-13 10:43:59 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-12-13 10:43:59 -0800 |
commit | ce8a79d5601aab94c02ed4539c48e8605422ac94 (patch) | |
tree | 7830a97a475d57284640c8e2d3516521722708b6 /block | |
parent | 96f7e448b9f4546ffd0356ffceb2b9586777f316 (diff) | |
parent | f596da3efaf4130ff61cd029558845808df9bf99 (diff) | |
download | lwn-ce8a79d5601aab94c02ed4539c48e8605422ac94.tar.gz lwn-ce8a79d5601aab94c02ed4539c48e8605422ac94.zip |
Merge tag 'for-6.2/block-2022-12-08' of git://git.kernel.dk/linux
Pull block updates from Jens Axboe:
- NVMe pull requests via Christoph:
- Support some passthrough commands without CAP_SYS_ADMIN (Kanchan
Joshi)
- Refactor PCIe probing and reset (Christoph Hellwig)
- Various fabrics authentication fixes and improvements (Sagi
Grimberg)
- Avoid fallback to sequential scan due to transient issues (Uday
Shankar)
- Implement support for the DEAC bit in Write Zeroes (Christoph
Hellwig)
- Allow overriding the IEEE OUI and firmware revision in configfs
for nvmet (Aleksandr Miloserdov)
- Force reconnect when the number of queues changes in nvmet (Daniel
Wagner)
- Minor fixes and improvements (Uros Bizjak, Joel Granados, Sagi
Grimberg, Christoph Hellwig, Christophe JAILLET)
- Fix and cleanup nvme-fc req allocation (Chaitanya Kulkarni)
- Use the common tagset helpers in nvme-pci driver (Christoph
Hellwig)
- Cleanup the nvme-pci removal path (Christoph Hellwig)
- Use kstrtobool() instead of strtobool (Christophe JAILLET)
- Allow unprivileged passthrough of Identify Controller (Joel
Granados)
- Support io stats on the mpath device (Sagi Grimberg)
- Minor nvmet cleanup (Sagi Grimberg)
- MD pull requests via Song:
- Code cleanups (Christoph)
- Various fixes
- Floppy pull request from Denis:
- Fix a memory leak in the init error path (Yuan)
- Series fixing some batch wakeup issues with sbitmap (Gabriel)
- Removal of the pktcdvd driver that was deprecated more than 5 years
ago, and subsequent removal of the devnode callback in struct
block_device_operations as no users are now left (Greg)
- Fix for partition read on an exclusively opened bdev (Jan)
- Series of elevator API cleanups (Jinlong, Christoph)
- Series of fixes and cleanups for blk-iocost (Kemeng)
- Series of fixes and cleanups for blk-throttle (Kemeng)
- Series adding concurrent support for sync queues in BFQ (Yu)
- Series bringing drbd a bit closer to the out-of-tree maintained
version (Christian, Joel, Lars, Philipp)
- Misc drbd fixes (Wang)
- blk-wbt fixes and tweaks for enable/disable (Yu)
- Fixes for mq-deadline for zoned devices (Damien)
- Add support for read-only and offline zones for null_blk
(Shin'ichiro)
- Series fixing the delayed holder tracking, as used by DM (Yu,
Christoph)
- Series enabling bio alloc caching for IRQ based IO (Pavel)
- Series enabling userspace peer-to-peer DMA (Logan)
- BFQ waker fixes (Khazhismel)
- Series fixing elevator refcount issues (Christoph, Jinlong)
- Series cleaning up references around queue destruction (Christoph)
- Series doing quiesce by tagset, enabling cleanups in drivers
(Christoph, Chao)
- Series untangling the queue kobject and queue references (Christoph)
- Misc fixes and cleanups (Bart, David, Dawei, Jinlong, Kemeng, Ye,
Yang, Waiman, Shin'ichiro, Randy, Pankaj, Christoph)
* tag 'for-6.2/block-2022-12-08' of git://git.kernel.dk/linux: (247 commits)
blktrace: Fix output non-blktrace event when blk_classic option enabled
block: sed-opal: Don't include <linux/kernel.h>
sed-opal: allow using IOC_OPAL_SAVE for locking too
blk-cgroup: Fix typo in comment
block: remove bio_set_op_attrs
nvmet: don't open-code NVME_NS_ATTR_RO enumeration
nvme-pci: use the tagset alloc/free helpers
nvme: add the Apple shared tag workaround to nvme_alloc_io_tag_set
nvme: only set reserved_tags in nvme_alloc_io_tag_set for fabrics controllers
nvme: consolidate setting the tagset flags
nvme: pass nr_maps explicitly to nvme_alloc_io_tag_set
block: bio_copy_data_iter
nvme-pci: split out a nvme_pci_ctrl_is_dead helper
nvme-pci: return early on ctrl state mismatch in nvme_reset_work
nvme-pci: rename nvme_disable_io_queues
nvme-pci: cleanup nvme_suspend_queue
nvme-pci: remove nvme_pci_disable
nvme-pci: remove nvme_disable_admin_queue
nvme: merge nvme_shutdown_ctrl into nvme_disable_ctrl
nvme: use nvme_wait_ready in nvme_shutdown_ctrl
...
Diffstat (limited to 'block')
38 files changed, 1132 insertions, 877 deletions
diff --git a/block/bdev.c b/block/bdev.c index d699ecdb3260..edc110d90df4 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -224,7 +224,7 @@ int fsync_bdev(struct block_device *bdev) EXPORT_SYMBOL(fsync_bdev); /** - * freeze_bdev -- lock a filesystem and force it into a consistent state + * freeze_bdev - lock a filesystem and force it into a consistent state * @bdev: blockdevice to lock * * If a superblock is found on this device, we take the s_umount semaphore @@ -268,7 +268,7 @@ done: EXPORT_SYMBOL(freeze_bdev); /** - * thaw_bdev -- unlock filesystem + * thaw_bdev - unlock filesystem * @bdev: blockdevice to unlock * * Unlocks the filesystem and marks it writeable again after freeze_bdev(). diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 7d624a3a3f0f..627476bc6495 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -224,7 +224,7 @@ void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, { blkg_rwstat_add(&bfqg->stats.queued, opf, 1); bfqg_stats_end_empty_time(&bfqg->stats); - if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue)) + if (!(bfqq == bfqg->bfqd->in_service_queue)) bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq)); } @@ -552,6 +552,7 @@ static void bfq_pd_init(struct blkg_policy_data *pd) */ bfqg->bfqd = bfqd; bfqg->active_entities = 0; + bfqg->num_queues_with_pending_reqs = 0; bfqg->online = true; bfqg->rq_pos_tree = RB_ROOT; } @@ -645,6 +646,7 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, { struct bfq_entity *entity = &bfqq->entity; struct bfq_group *old_parent = bfqq_group(bfqq); + bool has_pending_reqs = false; /* * No point to move bfqq to the same group, which can happen when @@ -665,6 +667,11 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, */ bfqq->ref++; + if (entity->in_groups_with_pending_reqs) { + has_pending_reqs = true; + bfq_del_bfqq_in_groups_with_pending_reqs(bfqq); + } + /* If bfqq is empty, then bfq_bfqq_expire also invokes * 
bfq_del_bfqq_busy, thereby removing bfqq and its entity * from data structures related to current group. Otherwise we @@ -692,6 +699,9 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, /* pin down bfqg and its associated blkg */ bfqg_and_blkg_get(bfqg); + if (has_pending_reqs) + bfq_add_bfqq_in_groups_with_pending_reqs(bfqq); + if (bfq_bfqq_busy(bfqq)) { if (unlikely(!bfqd->nonrot_with_queueing)) bfq_pos_tree_add_move(bfqd, bfqq); diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 7ea427817f7f..a72304c728fc 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -820,7 +820,7 @@ bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) * much easier to maintain the needed state: * 1) all active queues have the same weight, * 2) all active queues belong to the same I/O-priority class, - * 3) there are no active groups. + * 3) there is at most one active group. * In particular, the last condition is always true if hierarchical * support or the cgroups interface are not enabled, thus no state * needs to be maintained in this case. @@ -852,7 +852,7 @@ static bool bfq_asymmetric_scenario(struct bfq_data *bfqd, return varied_queue_weights || multiple_classes_busy #ifdef CONFIG_BFQ_GROUP_IOSCHED - || bfqd->num_groups_with_pending_reqs > 0 + || bfqd->num_groups_with_pending_reqs > 1 #endif ; } @@ -870,9 +870,9 @@ static bool bfq_asymmetric_scenario(struct bfq_data *bfqd, * In most scenarios, the rate at which nodes are created/destroyed * should be low too. 
*/ -void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq, - struct rb_root_cached *root) +void bfq_weights_tree_add(struct bfq_queue *bfqq) { + struct rb_root_cached *root = &bfqq->bfqd->queue_weights_tree; struct bfq_entity *entity = &bfqq->entity; struct rb_node **new = &(root->rb_root.rb_node), *parent = NULL; bool leftmost = true; @@ -944,13 +944,14 @@ inc_counter: * See the comments to the function bfq_weights_tree_add() for considerations * about overhead. */ -void __bfq_weights_tree_remove(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - struct rb_root_cached *root) +void bfq_weights_tree_remove(struct bfq_queue *bfqq) { + struct rb_root_cached *root; + if (!bfqq->weight_counter) return; + root = &bfqq->bfqd->queue_weights_tree; bfqq->weight_counter->num_active--; if (bfqq->weight_counter->num_active > 0) goto reset_entity_pointer; @@ -964,59 +965,6 @@ reset_entity_pointer: } /* - * Invoke __bfq_weights_tree_remove on bfqq and decrement the number - * of active groups for each queue's inactive parent entity. - */ -void bfq_weights_tree_remove(struct bfq_data *bfqd, - struct bfq_queue *bfqq) -{ - struct bfq_entity *entity = bfqq->entity.parent; - - for_each_entity(entity) { - struct bfq_sched_data *sd = entity->my_sched_data; - - if (sd->next_in_service || sd->in_service_entity) { - /* - * entity is still active, because either - * next_in_service or in_service_entity is not - * NULL (see the comments on the definition of - * next_in_service for details on why - * in_service_entity must be checked too). - * - * As a consequence, its parent entities are - * active as well, and thus this loop must - * stop here. - */ - break; - } - - /* - * The decrement of num_groups_with_pending_reqs is - * not performed immediately upon the deactivation of - * entity, but it is delayed to when it also happens - * that the first leaf descendant bfqq of entity gets - * all its pending requests completed. 
The following - * instructions perform this delayed decrement, if - * needed. See the comments on - * num_groups_with_pending_reqs for details. - */ - if (entity->in_groups_with_pending_reqs) { - entity->in_groups_with_pending_reqs = false; - bfqd->num_groups_with_pending_reqs--; - } - } - - /* - * Next function is invoked last, because it causes bfqq to be - * freed if the following holds: bfqq is not in service and - * has no dispatched request. DO NOT use bfqq after the next - * function invocation. - */ - __bfq_weights_tree_remove(bfqd, bfqq, - &bfqd->queue_weights_tree); -} - -/* * Return expired entry, or NULL to just start from scratch in rbtree. */ static struct request *bfq_check_fifo(struct bfq_queue *bfqq, @@ -2135,7 +2083,9 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (!bfqd->last_completed_rq_bfqq || bfqd->last_completed_rq_bfqq == bfqq || bfq_bfqq_has_short_ttime(bfqq) || - now_ns - bfqd->last_completion >= 4 * NSEC_PER_MSEC) + now_ns - bfqd->last_completion >= 4 * NSEC_PER_MSEC || + bfqd->last_completed_rq_bfqq == &bfqd->oom_bfqq || + bfqq == &bfqd->oom_bfqq) return; /* @@ -2373,22 +2323,6 @@ static sector_t get_sdist(sector_t last_pos, struct request *rq) return 0; } -#if 0 /* Still not clear if we can do without next two functions */ -static void bfq_activate_request(struct request_queue *q, struct request *rq) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - - bfqd->rq_in_driver++; -} - -static void bfq_deactivate_request(struct request_queue *q, struct request *rq) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - - bfqd->rq_in_driver--; -} -#endif - static void bfq_remove_request(struct request_queue *q, struct request *rq) { @@ -6261,7 +6195,8 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) */ bfqq->budget_timeout = jiffies; - bfq_weights_tree_remove(bfqd, bfqq); + bfq_del_bfqq_in_groups_with_pending_reqs(bfqq); + bfq_weights_tree_remove(bfqq); } now_ns = 
ktime_get_ns(); @@ -6784,6 +6719,12 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, true, is_sync, NULL); + if (unlikely(bfqq == &bfqd->oom_bfqq)) + bfqq_already_existing = true; + } else + bfqq_already_existing = true; + + if (!bfqq_already_existing) { bfqq->waker_bfqq = old_bfqq->waker_bfqq; bfqq->tentative_waker_bfqq = NULL; @@ -6797,8 +6738,7 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) if (bfqq->waker_bfqq) hlist_add_head(&bfqq->woken_list_node, &bfqq->waker_bfqq->woken_list); - } else - bfqq_already_existing = true; + } } } @@ -7045,6 +6985,7 @@ static void bfq_exit_queue(struct elevator_queue *e) #endif blk_stat_disable_accounting(bfqd->queue); + clear_bit(ELEVATOR_FLAG_DISABLE_WBT, &e->flags); wbt_enable_default(bfqd->queue); kfree(bfqd); @@ -7190,6 +7131,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) /* We dispatch from request queue wide instead of hw queue */ blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q); + set_bit(ELEVATOR_FLAG_DISABLE_WBT, &eq->flags); wbt_disable_default(q); blk_stat_enable_accounting(q); diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 71f721670ab6..9fa89577322d 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -492,27 +492,27 @@ struct bfq_data { struct rb_root_cached queue_weights_tree; /* - * Number of groups with at least one descendant process that + * Number of groups with at least one process that * has at least one request waiting for completion. Note that * this accounts for also requests already dispatched, but not * yet completed. Therefore this number of groups may differ * (be larger) than the number of active groups, as a group is * considered active only if its corresponding entity has - * descendant queues with at least one request queued. This + * queues with at least one request queued. This * number is used to decide whether a scenario is symmetric. 
* For a detailed explanation see comments on the computation * of the variable asymmetric_scenario in the function * bfq_better_to_idle(). * * However, it is hard to compute this number exactly, for - * groups with multiple descendant processes. Consider a group - * that is inactive, i.e., that has no descendant process with + * groups with multiple processes. Consider a group + * that is inactive, i.e., that has no process with * pending I/O inside BFQ queues. Then suppose that * num_groups_with_pending_reqs is still accounting for this - * group, because the group has descendant processes with some + * group, because the group has processes with some * I/O request still in flight. num_groups_with_pending_reqs * should be decremented when the in-flight request of the - * last descendant process is finally completed (assuming that + * last process is finally completed (assuming that * nothing else has changed for the group in the meantime, in * terms of composition of the group and active/inactive state of child * groups and processes). To accomplish this, an additional @@ -521,7 +521,7 @@ struct bfq_data { * we resort to the following tradeoff between simplicity and * accuracy: for an inactive group that is still counted in * num_groups_with_pending_reqs, we decrement - * num_groups_with_pending_reqs when the first descendant + * num_groups_with_pending_reqs when the first * process of the group remains with no request waiting for * completion. * @@ -529,12 +529,12 @@ struct bfq_data { * carefulness: to avoid multiple decrements, we flag a group, * more precisely an entity representing a group, as still * counted in num_groups_with_pending_reqs when it becomes - * inactive. Then, when the first descendant queue of the + * inactive. Then, when the first queue of the * entity remains with no request waiting for completion, * num_groups_with_pending_reqs is decremented, and this flag * is reset. 
After this flag is reset for the entity, * num_groups_with_pending_reqs won't be decremented any - * longer in case a new descendant queue of the entity remains + * longer in case a new queue of the entity remains * with no request waiting for completion. */ unsigned int num_groups_with_pending_reqs; @@ -931,7 +931,7 @@ struct bfq_group { struct bfq_entity entity; struct bfq_sched_data sched_data; - void *bfqd; + struct bfq_data *bfqd; struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS]; struct bfq_queue *async_idle_bfqq; @@ -939,6 +939,7 @@ struct bfq_group { struct bfq_entity *my_entity; int active_entities; + int num_queues_with_pending_reqs; struct rb_root rq_pos_tree; @@ -968,13 +969,8 @@ struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync); void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync); struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic); void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq); -void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq, - struct rb_root_cached *root); -void __bfq_weights_tree_remove(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - struct rb_root_cached *root); -void bfq_weights_tree_remove(struct bfq_data *bfqd, - struct bfq_queue *bfqq); +void bfq_weights_tree_add(struct bfq_queue *bfqq); +void bfq_weights_tree_remove(struct bfq_queue *bfqq); void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq, bool compensate, enum bfqq_expiration reason); void bfq_put_queue(struct bfq_queue *bfqq); @@ -1078,6 +1074,8 @@ void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, bool expiration); void bfq_del_bfqq_busy(struct bfq_queue *bfqq, bool expiration); void bfq_add_bfqq_busy(struct bfq_queue *bfqq); +void bfq_add_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq); +void bfq_del_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq); /* --------------- end of interface of B-WF2Q+ ---------------- */ diff --git 
a/block/bfq-wf2q.c b/block/bfq-wf2q.c index 8fc3da4c23bb..b02b53658ed4 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -218,6 +218,24 @@ static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) return false; } +static void bfq_inc_active_entities(struct bfq_entity *entity) +{ + struct bfq_sched_data *sd = entity->sched_data; + struct bfq_group *bfqg = container_of(sd, struct bfq_group, sched_data); + + if (bfqg != bfqg->bfqd->root_group) + bfqg->active_entities++; +} + +static void bfq_dec_active_entities(struct bfq_entity *entity) +{ + struct bfq_sched_data *sd = entity->sched_data; + struct bfq_group *bfqg = container_of(sd, struct bfq_group, sched_data); + + if (bfqg != bfqg->bfqd->root_group) + bfqg->active_entities--; +} + #else /* CONFIG_BFQ_GROUP_IOSCHED */ static bool bfq_update_parent_budget(struct bfq_entity *next_in_service) @@ -230,6 +248,14 @@ static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) return true; } +static void bfq_inc_active_entities(struct bfq_entity *entity) +{ +} + +static void bfq_dec_active_entities(struct bfq_entity *entity) +{ +} + #endif /* CONFIG_BFQ_GROUP_IOSCHED */ /* @@ -456,11 +482,6 @@ static void bfq_active_insert(struct bfq_service_tree *st, { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); struct rb_node *node = &entity->rb_node; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - struct bfq_sched_data *sd = NULL; - struct bfq_group *bfqg = NULL; - struct bfq_data *bfqd = NULL; -#endif bfq_insert(&st->active, entity); @@ -471,17 +492,10 @@ static void bfq_active_insert(struct bfq_service_tree *st, bfq_update_active_tree(node); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - sd = entity->sched_data; - bfqg = container_of(sd, struct bfq_group, sched_data); - bfqd = (struct bfq_data *)bfqg->bfqd; -#endif if (bfqq) list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - if (bfqg != bfqd->root_group) - bfqg->active_entities++; -#endif + + bfq_inc_active_entities(entity); } /** @@ 
-558,29 +572,16 @@ static void bfq_active_extract(struct bfq_service_tree *st, { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); struct rb_node *node; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - struct bfq_sched_data *sd = NULL; - struct bfq_group *bfqg = NULL; - struct bfq_data *bfqd = NULL; -#endif node = bfq_find_deepest(&entity->rb_node); bfq_extract(&st->active, entity); if (node) bfq_update_active_tree(node); - -#ifdef CONFIG_BFQ_GROUP_IOSCHED - sd = entity->sched_data; - bfqg = container_of(sd, struct bfq_group, sched_data); - bfqd = (struct bfq_data *)bfqg->bfqd; -#endif if (bfqq) list_del(&bfqq->bfqq_list); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - if (bfqg != bfqd->root_group) - bfqg->active_entities--; -#endif + + bfq_dec_active_entities(entity); } /** @@ -706,22 +707,6 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, if (entity->prio_changed) { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); unsigned int prev_weight, new_weight; - struct bfq_data *bfqd = NULL; - struct rb_root_cached *root; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - struct bfq_sched_data *sd; - struct bfq_group *bfqg; -#endif - - if (bfqq) - bfqd = bfqq->bfqd; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - else { - sd = entity->my_sched_data; - bfqg = container_of(sd, struct bfq_group, sched_data); - bfqd = (struct bfq_data *)bfqg->bfqd; - } -#endif /* Matches the smp_wmb() in bfq_group_set_weight. */ smp_rmb(); @@ -770,19 +755,15 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, * queue, remove the entity from its old weight counter (if * there is a counter associated with the entity). */ - if (prev_weight != new_weight && bfqq) { - root = &bfqd->queue_weights_tree; - __bfq_weights_tree_remove(bfqd, bfqq, root); - } + if (prev_weight != new_weight && bfqq) + bfq_weights_tree_remove(bfqq); entity->weight = new_weight; /* * Add the entity, if it is not a weight-raised queue, * to the counter associated with its new weight. 
*/ - if (prev_weight != new_weight && bfqq && bfqq->wr_coeff == 1) { - /* If we get here, root has been initialized. */ - bfq_weights_tree_add(bfqd, bfqq, root); - } + if (prev_weight != new_weight && bfqq && bfqq->wr_coeff == 1) + bfq_weights_tree_add(bfqq); new_st->wsum += entity->weight; @@ -984,19 +965,6 @@ static void __bfq_activate_entity(struct bfq_entity *entity, entity->on_st_or_in_serv = true; } -#ifdef CONFIG_BFQ_GROUP_IOSCHED - if (!bfq_entity_to_bfqq(entity)) { /* bfq_group */ - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, entity); - struct bfq_data *bfqd = bfqg->bfqd; - - if (!entity->in_groups_with_pending_reqs) { - entity->in_groups_with_pending_reqs = true; - bfqd->num_groups_with_pending_reqs++; - } - } -#endif - bfq_update_fin_time_enqueue(entity, st, backshifted); } @@ -1082,12 +1050,12 @@ static void __bfq_requeue_entity(struct bfq_entity *entity) } static void __bfq_activate_requeue_entity(struct bfq_entity *entity, - struct bfq_sched_data *sd, bool non_blocking_wait_rq) { struct bfq_service_tree *st = bfq_entity_service_tree(entity); - if (sd->in_service_entity == entity || entity->tree == &st->active) + if (entity->sched_data->in_service_entity == entity || + entity->tree == &st->active) /* * in service or already queued on the active tree, * requeue or reposition @@ -1119,14 +1087,10 @@ static void bfq_activate_requeue_entity(struct bfq_entity *entity, bool non_blocking_wait_rq, bool requeue, bool expiration) { - struct bfq_sched_data *sd; - for_each_entity(entity) { - sd = entity->sched_data; - __bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq); - - if (!bfq_update_next_in_service(sd, entity, expiration) && - !requeue) + __bfq_activate_requeue_entity(entity, non_blocking_wait_rq); + if (!bfq_update_next_in_service(entity->sched_data, entity, + expiration) && !requeue) break; } } @@ -1646,6 +1610,32 @@ void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfqq == bfqd->in_service_queue, 
expiration); } +void bfq_add_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + + if (!entity->in_groups_with_pending_reqs) { + entity->in_groups_with_pending_reqs = true; +#ifdef CONFIG_BFQ_GROUP_IOSCHED + if (!(bfqq_group(bfqq)->num_queues_with_pending_reqs++)) + bfqq->bfqd->num_groups_with_pending_reqs++; +#endif + } +} + +void bfq_del_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + + if (entity->in_groups_with_pending_reqs) { + entity->in_groups_with_pending_reqs = false; +#ifdef CONFIG_BFQ_GROUP_IOSCHED + if (!(--bfqq_group(bfqq)->num_queues_with_pending_reqs)) + bfqq->bfqd->num_groups_with_pending_reqs--; +#endif + } +} + /* * Called when the bfqq no longer has requests pending, remove it from * the service tree. As a special case, it can be invoked during an @@ -1668,8 +1658,14 @@ void bfq_del_bfqq_busy(struct bfq_queue *bfqq, bool expiration) bfq_deactivate_bfqq(bfqd, bfqq, true, expiration); - if (!bfqq->dispatched) - bfq_weights_tree_remove(bfqd, bfqq); + if (!bfqq->dispatched) { + bfq_del_bfqq_in_groups_with_pending_reqs(bfqq); + /* + * Next function is invoked last, because it causes bfqq to be + * freed. DO NOT use bfqq after the next function invocation. 
+ */ + bfq_weights_tree_remove(bfqq); + } } /* @@ -1686,10 +1682,11 @@ void bfq_add_bfqq_busy(struct bfq_queue *bfqq) bfq_mark_bfqq_busy(bfqq); bfqd->busy_queues[bfqq->ioprio_class - 1]++; - if (!bfqq->dispatched) + if (!bfqq->dispatched) { + bfq_add_bfqq_in_groups_with_pending_reqs(bfqq); if (bfqq->wr_coeff == 1) - bfq_weights_tree_add(bfqd, bfqq, - &bfqd->queue_weights_tree); + bfq_weights_tree_add(bfqq); + } if (bfqq->wr_coeff > 1) bfqd->wr_busy_queues++; diff --git a/block/bio.c b/block/bio.c index 57c2f327225b..5f96fcae3f75 100644 --- a/block/bio.c +++ b/block/bio.c @@ -25,9 +25,15 @@ #include "blk-rq-qos.h" #include "blk-cgroup.h" +#define ALLOC_CACHE_THRESHOLD 16 +#define ALLOC_CACHE_SLACK 64 +#define ALLOC_CACHE_MAX 256 + struct bio_alloc_cache { struct bio *free_list; + struct bio *free_list_irq; unsigned int nr; + unsigned int nr_irq; }; static struct biovec_slab { @@ -408,6 +414,22 @@ static void punt_bios_to_rescuer(struct bio_set *bs) queue_work(bs->rescue_workqueue, &bs->rescue_work); } +static void bio_alloc_irq_cache_splice(struct bio_alloc_cache *cache) +{ + unsigned long flags; + + /* cache->free_list must be empty */ + if (WARN_ON_ONCE(cache->free_list)) + return; + + local_irq_save(flags); + cache->free_list = cache->free_list_irq; + cache->free_list_irq = NULL; + cache->nr += cache->nr_irq; + cache->nr_irq = 0; + local_irq_restore(flags); +} + static struct bio *bio_alloc_percpu_cache(struct block_device *bdev, unsigned short nr_vecs, blk_opf_t opf, gfp_t gfp, struct bio_set *bs) @@ -417,8 +439,12 @@ static struct bio *bio_alloc_percpu_cache(struct block_device *bdev, cache = per_cpu_ptr(bs->cache, get_cpu()); if (!cache->free_list) { - put_cpu(); - return NULL; + if (READ_ONCE(cache->nr_irq) >= ALLOC_CACHE_THRESHOLD) + bio_alloc_irq_cache_splice(cache); + if (!cache->free_list) { + put_cpu(); + return NULL; + } } bio = cache->free_list; cache->free_list = bio->bi_next; @@ -462,9 +488,6 @@ static struct bio *bio_alloc_percpu_cache(struct 
block_device *bdev, * submit_bio_noacct() should be avoided - instead, use bio_set's front_pad * for per bio allocations. * - * If REQ_ALLOC_CACHE is set, the final put of the bio MUST be done from process - * context, not hard/soft IRQ. - * * Returns: Pointer to new bio on success, NULL on failure. */ struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs, @@ -526,6 +549,8 @@ struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs, } if (unlikely(!p)) return NULL; + if (!mempool_is_saturated(&bs->bio_pool)) + opf &= ~REQ_ALLOC_CACHE; bio = p + bs->front_pad; if (nr_vecs > BIO_INLINE_VECS) { @@ -676,11 +701,8 @@ void guard_bio_eod(struct bio *bio) bio_truncate(bio, maxsector << 9); } -#define ALLOC_CACHE_MAX 512 -#define ALLOC_CACHE_SLACK 64 - -static void bio_alloc_cache_prune(struct bio_alloc_cache *cache, - unsigned int nr) +static int __bio_alloc_cache_prune(struct bio_alloc_cache *cache, + unsigned int nr) { unsigned int i = 0; struct bio *bio; @@ -692,6 +714,17 @@ static void bio_alloc_cache_prune(struct bio_alloc_cache *cache, if (++i == nr) break; } + return i; +} + +static void bio_alloc_cache_prune(struct bio_alloc_cache *cache, + unsigned int nr) +{ + nr -= __bio_alloc_cache_prune(cache, nr); + if (!READ_ONCE(cache->free_list)) { + bio_alloc_irq_cache_splice(cache); + __bio_alloc_cache_prune(cache, nr); + } } static int bio_cpu_dead(unsigned int cpu, struct hlist_node *node) @@ -725,6 +758,35 @@ static void bio_alloc_cache_destroy(struct bio_set *bs) bs->cache = NULL; } +static inline void bio_put_percpu_cache(struct bio *bio) +{ + struct bio_alloc_cache *cache; + + cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu()); + if (READ_ONCE(cache->nr_irq) + cache->nr > ALLOC_CACHE_MAX) { + put_cpu(); + bio_free(bio); + return; + } + + bio_uninit(bio); + + if ((bio->bi_opf & REQ_POLLED) && !WARN_ON_ONCE(in_interrupt())) { + bio->bi_next = cache->free_list; + cache->free_list = bio; + cache->nr++; + } else { + 
unsigned long flags; + + local_irq_save(flags); + bio->bi_next = cache->free_list_irq; + cache->free_list_irq = bio; + cache->nr_irq++; + local_irq_restore(flags); + } + put_cpu(); +} + /** * bio_put - release a reference to a bio * @bio: bio to release reference to @@ -740,20 +802,10 @@ void bio_put(struct bio *bio) if (!atomic_dec_and_test(&bio->__bi_cnt)) return; } - - if ((bio->bi_opf & REQ_ALLOC_CACHE) && !WARN_ON_ONCE(in_interrupt())) { - struct bio_alloc_cache *cache; - - bio_uninit(bio); - cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu()); - bio->bi_next = cache->free_list; - cache->free_list = bio; - if (++cache->nr > ALLOC_CACHE_MAX + ALLOC_CACHE_SLACK) - bio_alloc_cache_prune(cache, ALLOC_CACHE_SLACK); - put_cpu(); - } else { + if (bio->bi_opf & REQ_ALLOC_CACHE) + bio_put_percpu_cache(bio); + else bio_free(bio); - } } EXPORT_SYMBOL(bio_put); @@ -863,6 +915,8 @@ static inline bool page_is_mergeable(const struct bio_vec *bv, return false; if (xen_domain() && !xen_biovec_phys_mergeable(bv, page)) return false; + if (!zone_device_pages_have_same_pgmap(bv->bv_page, page)) + return false; *same_page = ((vec_end_addr & PAGE_MASK) == page_addr); if (*same_page) @@ -1195,6 +1249,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt; struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt; struct page **pages = (struct page **)bv; + unsigned int gup_flags = 0; ssize_t size, left; unsigned len, i = 0; size_t offset, trim; @@ -1208,6 +1263,9 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2); pages += entries_left * (PAGE_PTRS_PER_BVEC - 1); + if (bio->bi_bdev && blk_queue_pci_p2pdma(bio->bi_bdev->bd_disk->queue)) + gup_flags |= FOLL_PCI_P2PDMA; + /* * Each segment in the iov is required to be a block size multiple. 
* However, we may not be able to get the entire segment if it spans @@ -1215,8 +1273,9 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) * result to ensure the bio's total size is correct. The remainder of * the iov data will be picked up in the next bio iteration. */ - size = iov_iter_get_pages2(iter, pages, UINT_MAX - bio->bi_iter.bi_size, - nr_pages, &offset); + size = iov_iter_get_pages(iter, pages, + UINT_MAX - bio->bi_iter.bi_size, + nr_pages, &offset, gup_flags); if (unlikely(size <= 0)) return size ? size : -EFAULT; @@ -1342,27 +1401,6 @@ void __bio_advance(struct bio *bio, unsigned bytes) } EXPORT_SYMBOL(__bio_advance); -void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, - struct bio *src, struct bvec_iter *src_iter) -{ - while (src_iter->bi_size && dst_iter->bi_size) { - struct bio_vec src_bv = bio_iter_iovec(src, *src_iter); - struct bio_vec dst_bv = bio_iter_iovec(dst, *dst_iter); - unsigned int bytes = min(src_bv.bv_len, dst_bv.bv_len); - void *src_buf = bvec_kmap_local(&src_bv); - void *dst_buf = bvec_kmap_local(&dst_bv); - - memcpy(dst_buf, src_buf, bytes); - - kunmap_local(dst_buf); - kunmap_local(src_buf); - - bio_advance_iter_single(src, src_iter, bytes); - bio_advance_iter_single(dst, dst_iter, bytes); - } -} -EXPORT_SYMBOL(bio_copy_data_iter); - /** * bio_copy_data - copy contents of data buffers from one bio to another * @src: source bio @@ -1376,7 +1414,21 @@ void bio_copy_data(struct bio *dst, struct bio *src) struct bvec_iter src_iter = src->bi_iter; struct bvec_iter dst_iter = dst->bi_iter; - bio_copy_data_iter(dst, &dst_iter, src, &src_iter); + while (src_iter.bi_size && dst_iter.bi_size) { + struct bio_vec src_bv = bio_iter_iovec(src, src_iter); + struct bio_vec dst_bv = bio_iter_iovec(dst, dst_iter); + unsigned int bytes = min(src_bv.bv_len, dst_bv.bv_len); + void *src_buf = bvec_kmap_local(&src_bv); + void *dst_buf = bvec_kmap_local(&dst_bv); + + memcpy(dst_buf, src_buf, bytes); + + 
kunmap_local(dst_buf); + kunmap_local(src_buf); + + bio_advance_iter_single(src, &src_iter, bytes); + bio_advance_iter_single(dst, &dst_iter, bytes); + } } EXPORT_SYMBOL(bio_copy_data); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index ed761c62ad0a..50ac0dce95b8 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -59,6 +59,37 @@ static struct workqueue_struct *blkcg_punt_bio_wq; #define BLKG_DESTROY_BATCH_SIZE 64 +/* + * Lockless lists for tracking IO stats update + * + * New IO stats are stored in the percpu iostat_cpu within blkcg_gq (blkg). + * There are multiple blkg's (one for each block device) attached to each + * blkcg. The rstat code keeps track of which cpu has IO stats updated, + * but it doesn't know which blkg has the updated stats. If there are many + * block devices in a system, the cost of iterating all the blkg's to flush + * out the IO stats can be high. To reduce such overhead, a set of percpu + * lockless lists (lhead) per blkcg are used to track the set of recently + * updated iostat_cpu's since the last flush. An iostat_cpu will be put + * onto the lockless list on the update side [blk_cgroup_bio_start()] if + * not there yet and then removed when being flushed [blkcg_rstat_flush()]. + * References to blkg are gotten and then put back in the process to + * protect against blkg removal. + * + * Return: 0 if successful or -ENOMEM if allocation fails. 
+ */ +static int init_blkcg_llists(struct blkcg *blkcg) +{ + int cpu; + + blkcg->lhead = alloc_percpu_gfp(struct llist_head, GFP_KERNEL); + if (!blkcg->lhead) + return -ENOMEM; + + for_each_possible_cpu(cpu) + init_llist_head(per_cpu_ptr(blkcg->lhead, cpu)); + return 0; +} + /** * blkcg_css - find the current css * @@ -236,8 +267,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk, blkg->blkcg = blkcg; u64_stats_init(&blkg->iostat.sync); - for_each_possible_cpu(cpu) + for_each_possible_cpu(cpu) { u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync); + per_cpu_ptr(blkg->iostat_cpu, cpu)->blkg = blkg; + } for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; @@ -577,7 +610,7 @@ EXPORT_SYMBOL_GPL(blkcg_print_blkgs); * @pd: policy private data of interest * @v: value to print * - * Print @v to @sf for the device assocaited with @pd. + * Print @v to @sf for the device associated with @pd. */ u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v) { @@ -765,7 +798,7 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep); /** * blkg_conf_finish - finish up per-blkg config update - * @ctx: blkg_conf_ctx intiailized by blkg_conf_prep() + * @ctx: blkg_conf_ctx initialized by blkg_conf_prep() * * Finish up after per-blkg config update. This function must be paired * with blkg_conf_prep(). 
@@ -827,7 +860,9 @@ static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur, static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) { struct blkcg *blkcg = css_to_blkcg(css); - struct blkcg_gq *blkg; + struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu); + struct llist_node *lnode; + struct blkg_iostat_set *bisc, *next_bisc; /* Root-level stats are sourced from system-wide IO stats */ if (!cgroup_parent(css->cgroup)) @@ -835,12 +870,21 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) rcu_read_lock(); - hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { + lnode = llist_del_all(lhead); + if (!lnode) + goto out; + + /* + * Iterate only the iostat_cpu's queued in the lockless list. + */ + llist_for_each_entry_safe(bisc, next_bisc, lnode, lnode) { + struct blkcg_gq *blkg = bisc->blkg; struct blkcg_gq *parent = blkg->parent; - struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu); struct blkg_iostat cur; unsigned int seq; + WRITE_ONCE(bisc->lqueued, false); + /* fetch the current per-cpu values */ do { seq = u64_stats_fetch_begin(&bisc->sync); @@ -853,8 +897,10 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) if (parent && parent->parent) blkcg_iostat_update(parent, &blkg->iostat.cur, &blkg->iostat.last); + percpu_ref_put(&blkg->refcnt); } +out: rcu_read_unlock(); } @@ -1132,6 +1178,7 @@ static void blkcg_css_free(struct cgroup_subsys_state *css) mutex_unlock(&blkcg_pol_mutex); + free_percpu(blkcg->lhead); kfree(blkcg); } @@ -1139,7 +1186,6 @@ static struct cgroup_subsys_state * blkcg_css_alloc(struct cgroup_subsys_state *parent_css) { struct blkcg *blkcg; - struct cgroup_subsys_state *ret; int i; mutex_lock(&blkcg_pol_mutex); @@ -1148,12 +1194,13 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) blkcg = &blkcg_root; } else { blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); - if (!blkcg) { - ret = ERR_PTR(-ENOMEM); + if (!blkcg) goto unlock; - } 
} + if (init_blkcg_llists(blkcg)) + goto free_blkcg; + for (i = 0; i < BLKCG_MAX_POLS ; i++) { struct blkcg_policy *pol = blkcg_policy[i]; struct blkcg_policy_data *cpd; @@ -1168,10 +1215,9 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) continue; cpd = pol->cpd_alloc_fn(GFP_KERNEL); - if (!cpd) { - ret = ERR_PTR(-ENOMEM); + if (!cpd) goto free_pd_blkcg; - } + blkcg->cpd[i] = cpd; cpd->blkcg = blkcg; cpd->plid = i; @@ -1195,12 +1241,13 @@ free_pd_blkcg: for (i--; i >= 0; i--) if (blkcg->cpd[i]) blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]); - + free_percpu(blkcg->lhead); +free_blkcg: if (blkcg != &blkcg_root) kfree(blkcg); unlock: mutex_unlock(&blkcg_pol_mutex); - return ret; + return ERR_PTR(-ENOMEM); } static int blkcg_css_online(struct cgroup_subsys_state *css) @@ -1784,7 +1831,7 @@ out: /** * blkcg_schedule_throttle - this task needs to check for throttling - * @gendisk: disk to throttle + * @disk: disk to throttle * @use_memdelay: do we charge this to memory delay for PSI * * This is called by the IO controller when we know there's delay accumulated @@ -1943,6 +1990,7 @@ static int blk_cgroup_io_type(struct bio *bio) void blk_cgroup_bio_start(struct bio *bio) { + struct blkcg *blkcg = bio->bi_blkg->blkcg; int rwd = blk_cgroup_io_type(bio), cpu; struct blkg_iostat_set *bis; unsigned long flags; @@ -1961,9 +2009,21 @@ void blk_cgroup_bio_start(struct bio *bio) } bis->cur.ios[rwd]++; + /* + * If the iostat_cpu isn't in a lockless list, put it into the + * list to indicate that a stat update is pending. 
+ */ + if (!READ_ONCE(bis->lqueued)) { + struct llist_head *lhead = this_cpu_ptr(blkcg->lhead); + + llist_add(&bis->lnode, lhead); + WRITE_ONCE(bis->lqueued, true); + percpu_ref_get(&bis->blkg->refcnt); + } + u64_stats_update_end_irqrestore(&bis->sync, flags); if (cgroup_subsys_on_dfl(io_cgrp_subsys)) - cgroup_rstat_updated(bio->bi_blkg->blkcg->css.cgroup, cpu); + cgroup_rstat_updated(blkcg->css.cgroup, cpu); put_cpu(); } diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index aa2b286bc825..1e94e404eaa8 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -18,6 +18,7 @@ #include <linux/cgroup.h> #include <linux/kthread.h> #include <linux/blk-mq.h> +#include <linux/llist.h> struct blkcg_gq; struct blkg_policy_data; @@ -43,6 +44,9 @@ struct blkg_iostat { struct blkg_iostat_set { struct u64_stats_sync sync; + struct blkcg_gq *blkg; + struct llist_node lnode; + int lqueued; /* queued in llist */ struct blkg_iostat cur; struct blkg_iostat last; }; @@ -97,6 +101,12 @@ struct blkcg { struct blkcg_policy_data *cpd[BLKCG_MAX_POLS]; struct list_head all_blkcgs_node; + + /* + * List of updated percpu blkg_iostat_set's since the last flush. 
+ */ + struct llist_head __percpu *lhead; + #ifdef CONFIG_BLK_CGROUP_FC_APPID char fc_app_id[FC_APPID_LEN]; #endif diff --git a/block/blk-core.c b/block/blk-core.c index 5487912befe8..3866b6c4cd88 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -59,13 +59,12 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(block_split); EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_insert); -DEFINE_IDA(blk_queue_ida); +static DEFINE_IDA(blk_queue_ida); /* * For queue allocation */ -struct kmem_cache *blk_requestq_cachep; -struct kmem_cache *blk_requestq_srcu_cachep; +static struct kmem_cache *blk_requestq_cachep; /* * Controlling structure to kblockd @@ -253,19 +252,44 @@ void blk_clear_pm_only(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_clear_pm_only); +static void blk_free_queue_rcu(struct rcu_head *rcu_head) +{ + kmem_cache_free(blk_requestq_cachep, + container_of(rcu_head, struct request_queue, rcu_head)); +} + +static void blk_free_queue(struct request_queue *q) +{ + percpu_ref_exit(&q->q_usage_counter); + + if (q->poll_stat) + blk_stat_remove_callback(q, q->poll_cb); + blk_stat_free_callback(q->poll_cb); + + blk_free_queue_stats(q->stats); + kfree(q->poll_stat); + + if (queue_is_mq(q)) + blk_mq_release(q); + + ida_free(&blk_queue_ida, q->id); + call_rcu(&q->rcu_head, blk_free_queue_rcu); +} + /** * blk_put_queue - decrement the request_queue refcount * @q: the request_queue structure to decrement the refcount for * - * Decrements the refcount of the request_queue kobject. When this reaches 0 - * we'll have blk_release_queue() called. + * Decrements the refcount of the request_queue and free it when the refcount + * reaches 0. * - * Context: Any context, but the last reference must not be dropped from - * atomic context. + * Context: Can sleep. 
*/ void blk_put_queue(struct request_queue *q) { - kobject_put(&q->kobj); + might_sleep(); + if (refcount_dec_and_test(&q->refs)) + blk_free_queue(q); } EXPORT_SYMBOL(blk_put_queue); @@ -373,26 +397,20 @@ static void blk_timeout_work(struct work_struct *work) { } -struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu) +struct request_queue *blk_alloc_queue(int node_id) { struct request_queue *q; - q = kmem_cache_alloc_node(blk_get_queue_kmem_cache(alloc_srcu), - GFP_KERNEL | __GFP_ZERO, node_id); + q = kmem_cache_alloc_node(blk_requestq_cachep, GFP_KERNEL | __GFP_ZERO, + node_id); if (!q) return NULL; - if (alloc_srcu) { - blk_queue_flag_set(QUEUE_FLAG_HAS_SRCU, q); - if (init_srcu_struct(q->srcu) != 0) - goto fail_q; - } - q->last_merge = NULL; q->id = ida_alloc(&blk_queue_ida, GFP_KERNEL); if (q->id < 0) - goto fail_srcu; + goto fail_q; q->stats = blk_alloc_queue_stats(); if (!q->stats) @@ -406,8 +424,7 @@ struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu) INIT_WORK(&q->timeout_work, blk_timeout_work); INIT_LIST_HEAD(&q->icq_list); - kobject_init(&q->kobj, &blk_queue_ktype); - + refcount_set(&q->refs, 1); mutex_init(&q->debugfs_mutex); mutex_init(&q->sysfs_lock); mutex_init(&q->sysfs_dir_lock); @@ -434,11 +451,8 @@ fail_stats: blk_free_queue_stats(q->stats); fail_id: ida_free(&blk_queue_ida, q->id); -fail_srcu: - if (alloc_srcu) - cleanup_srcu_struct(q->srcu); fail_q: - kmem_cache_free(blk_get_queue_kmem_cache(alloc_srcu), q); + kmem_cache_free(blk_requestq_cachep, q); return NULL; } @@ -454,7 +468,7 @@ bool blk_get_queue(struct request_queue *q) { if (unlikely(blk_queue_dying(q))) return false; - kobject_get(&q->kobj); + refcount_inc(&q->refs); return true; } EXPORT_SYMBOL(blk_get_queue); @@ -945,18 +959,6 @@ unsigned long bdev_start_io_acct(struct block_device *bdev, EXPORT_SYMBOL(bdev_start_io_acct); /** - * bio_start_io_acct_time - start I/O accounting for bio based drivers - * @bio: bio to start account for - * @start_time: 
start time that should be passed back to bio_end_io_acct(). - */ -void bio_start_io_acct_time(struct bio *bio, unsigned long start_time) -{ - bdev_start_io_acct(bio->bi_bdev, bio_sectors(bio), - bio_op(bio), start_time); -} -EXPORT_SYMBOL_GPL(bio_start_io_acct_time); - -/** * bio_start_io_acct - start I/O accounting for bio based drivers * @bio: bio to start account for * @@ -1183,9 +1185,6 @@ int __init blk_dev_init(void) sizeof_field(struct request, cmd_flags)); BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 * sizeof_field(struct bio, bi_opf)); - BUILD_BUG_ON(ALIGN(offsetof(struct request_queue, srcu), - __alignof__(struct request_queue)) != - sizeof(struct request_queue)); /* used for unplugging and affects IO latency/throughput - HIGHPRI */ kblockd_workqueue = alloc_workqueue("kblockd", @@ -1196,10 +1195,6 @@ int __init blk_dev_init(void) blk_requestq_cachep = kmem_cache_create("request_queue", sizeof(struct request_queue), 0, SLAB_PANIC, NULL); - blk_requestq_srcu_cachep = kmem_cache_create("request_queue_srcu", - sizeof(struct request_queue) + - sizeof(struct srcu_struct), 0, SLAB_PANIC, NULL); - blk_debugfs_root = debugfs_create_dir("block", NULL); return 0; diff --git a/block/blk-crypto-internal.h b/block/blk-crypto-internal.h index e6818ffaddbf..a8cdaf26851e 100644 --- a/block/blk-crypto-internal.h +++ b/block/blk-crypto-internal.h @@ -21,9 +21,9 @@ extern const struct blk_crypto_mode blk_crypto_modes[]; #ifdef CONFIG_BLK_INLINE_ENCRYPTION -int blk_crypto_sysfs_register(struct request_queue *q); +int blk_crypto_sysfs_register(struct gendisk *disk); -void blk_crypto_sysfs_unregister(struct request_queue *q); +void blk_crypto_sysfs_unregister(struct gendisk *disk); void bio_crypt_dun_increment(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], unsigned int inc); @@ -65,14 +65,28 @@ static inline bool blk_crypto_rq_is_encrypted(struct request *rq) return rq->crypt_ctx; } +blk_status_t blk_crypto_get_keyslot(struct blk_crypto_profile *profile, + const struct blk_crypto_key 
*key, + struct blk_crypto_keyslot **slot_ptr); + +void blk_crypto_put_keyslot(struct blk_crypto_keyslot *slot); + +int __blk_crypto_evict_key(struct blk_crypto_profile *profile, + const struct blk_crypto_key *key); + +bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile, + const struct blk_crypto_config *cfg); + #else /* CONFIG_BLK_INLINE_ENCRYPTION */ -static inline int blk_crypto_sysfs_register(struct request_queue *q) +static inline int blk_crypto_sysfs_register(struct gendisk *disk) { return 0; } -static inline void blk_crypto_sysfs_unregister(struct request_queue *q) { } +static inline void blk_crypto_sysfs_unregister(struct gendisk *disk) +{ +} static inline bool bio_crypt_rq_ctx_compatible(struct request *rq, struct bio *bio) diff --git a/block/blk-crypto-profile.c b/block/blk-crypto-profile.c index 96c511967386..0307fb0d95d3 100644 --- a/block/blk-crypto-profile.c +++ b/block/blk-crypto-profile.c @@ -32,6 +32,7 @@ #include <linux/wait.h> #include <linux/blkdev.h> #include <linux/blk-integrity.h> +#include "blk-crypto-internal.h" struct blk_crypto_keyslot { atomic_t slot_refs; diff --git a/block/blk-crypto-sysfs.c b/block/blk-crypto-sysfs.c index fd93bd2f33b7..55268edc0625 100644 --- a/block/blk-crypto-sysfs.c +++ b/block/blk-crypto-sysfs.c @@ -126,8 +126,9 @@ static struct kobj_type blk_crypto_ktype = { * If the request_queue has a blk_crypto_profile, create the "crypto" * subdirectory in sysfs (/sys/block/$disk/queue/crypto/). 
*/ -int blk_crypto_sysfs_register(struct request_queue *q) +int blk_crypto_sysfs_register(struct gendisk *disk) { + struct request_queue *q = disk->queue; struct blk_crypto_kobj *obj; int err; @@ -139,8 +140,8 @@ int blk_crypto_sysfs_register(struct request_queue *q) return -ENOMEM; obj->profile = q->crypto_profile; - err = kobject_init_and_add(&obj->kobj, &blk_crypto_ktype, &q->kobj, - "crypto"); + err = kobject_init_and_add(&obj->kobj, &blk_crypto_ktype, + &disk->queue_kobj, "crypto"); if (err) { kobject_put(&obj->kobj); return err; @@ -149,9 +150,9 @@ int blk_crypto_sysfs_register(struct request_queue *q) return 0; } -void blk_crypto_sysfs_unregister(struct request_queue *q) +void blk_crypto_sysfs_unregister(struct gendisk *disk) { - kobject_put(q->crypto_kobject); + kobject_put(disk->queue->crypto_kobject); } static int __init blk_crypto_sysfs_init(void) diff --git a/block/blk-crypto.c b/block/blk-crypto.c index e44709fc6a08..45378586151f 100644 --- a/block/blk-crypto.c +++ b/block/blk-crypto.c @@ -273,7 +273,6 @@ bool __blk_crypto_bio_prep(struct bio **bio_ptr) { struct bio *bio = *bio_ptr; const struct blk_crypto_key *bc_key = bio->bi_crypt_context->bc_key; - struct blk_crypto_profile *profile; /* Error if bio has no data. */ if (WARN_ON_ONCE(!bio_has_data(bio))) { @@ -290,10 +289,9 @@ bool __blk_crypto_bio_prep(struct bio **bio_ptr) * Success if device supports the encryption context, or if we succeeded * in falling back to the crypto API. 
*/ - profile = bdev_get_queue(bio->bi_bdev)->crypto_profile; - if (__blk_crypto_cfg_supported(profile, &bc_key->crypto_cfg)) + if (blk_crypto_config_supported_natively(bio->bi_bdev, + &bc_key->crypto_cfg)) return true; - if (blk_crypto_fallback_bio_prep(bio_ptr)) return true; fail: @@ -358,22 +356,29 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key, return 0; } +bool blk_crypto_config_supported_natively(struct block_device *bdev, + const struct blk_crypto_config *cfg) +{ + return __blk_crypto_cfg_supported(bdev_get_queue(bdev)->crypto_profile, + cfg); +} + /* * Check if bios with @cfg can be en/decrypted by blk-crypto (i.e. either the - * request queue it's submitted to supports inline crypto, or the + * block_device it's submitted to supports inline crypto, or the * blk-crypto-fallback is enabled and supports the cfg). */ -bool blk_crypto_config_supported(struct request_queue *q, +bool blk_crypto_config_supported(struct block_device *bdev, const struct blk_crypto_config *cfg) { return IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) || - __blk_crypto_cfg_supported(q->crypto_profile, cfg); + blk_crypto_config_supported_natively(bdev, cfg); } /** * blk_crypto_start_using_key() - Start using a blk_crypto_key on a device + * @bdev: block device to operate on * @key: A key to use on the device - * @q: the request queue for the device * * Upper layers must call this function to ensure that either the hardware * supports the key's crypto settings, or the crypto API fallback has transforms @@ -385,10 +390,10 @@ bool blk_crypto_config_supported(struct request_queue *q, * blk-crypto-fallback is either disabled or the needed algorithm * is disabled in the crypto API; or another -errno code. 
*/ -int blk_crypto_start_using_key(const struct blk_crypto_key *key, - struct request_queue *q) +int blk_crypto_start_using_key(struct block_device *bdev, + const struct blk_crypto_key *key) { - if (__blk_crypto_cfg_supported(q->crypto_profile, &key->crypto_cfg)) + if (blk_crypto_config_supported_natively(bdev, &key->crypto_cfg)) return 0; return blk_crypto_fallback_start_using_mode(key->crypto_cfg.crypto_mode); } @@ -396,7 +401,7 @@ int blk_crypto_start_using_key(const struct blk_crypto_key *key, /** * blk_crypto_evict_key() - Evict a key from any inline encryption hardware * it may have been programmed into - * @q: The request queue who's associated inline encryption hardware this key + * @bdev: The block_device who's associated inline encryption hardware this key * might have been programmed into * @key: The key to evict * @@ -406,14 +411,16 @@ int blk_crypto_start_using_key(const struct blk_crypto_key *key, * * Return: 0 on success or if the key wasn't in any keyslot; -errno on error. */ -int blk_crypto_evict_key(struct request_queue *q, +int blk_crypto_evict_key(struct block_device *bdev, const struct blk_crypto_key *key) { - if (__blk_crypto_cfg_supported(q->crypto_profile, &key->crypto_cfg)) + struct request_queue *q = bdev_get_queue(bdev); + + if (blk_crypto_config_supported_natively(bdev, &key->crypto_cfg)) return __blk_crypto_evict_key(q->crypto_profile, key); /* - * If the request_queue didn't support the key, then blk-crypto-fallback + * If the block_device didn't support the key, then blk-crypto-fallback * may have been used, so try to evict the key from blk-crypto-fallback. 
*/ return blk_crypto_fallback_evict_key(key); diff --git a/block/blk-ia-ranges.c b/block/blk-ia-ranges.c index 2bd1d311033b..2141931ddd37 100644 --- a/block/blk-ia-ranges.c +++ b/block/blk-ia-ranges.c @@ -123,7 +123,8 @@ int disk_register_independent_access_ranges(struct gendisk *disk) */ WARN_ON(iars->sysfs_registered); ret = kobject_init_and_add(&iars->kobj, &blk_ia_ranges_ktype, - &q->kobj, "%s", "independent_access_ranges"); + &disk->queue_kobj, "%s", + "independent_access_ranges"); if (ret) { disk->ia_ranges = NULL; kobject_put(&iars->kobj); diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 495396425bad..d1bdc12deaa7 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -111,7 +111,7 @@ * busy signal. * * As devices can have deep queues and be unfair in how the queued commands - * are executed, soley depending on rq wait may not result in satisfactory + * are executed, solely depending on rq wait may not result in satisfactory * control quality. For a better control quality, completion latency QoS * parameters can be configured so that the device is considered saturated * if N'th percentile completion latency rises above the set point. 
@@ -556,7 +556,6 @@ struct ioc_now { u64 now_ns; u64 now; u64 vnow; - u64 vrate; }; struct iocg_wait { @@ -906,8 +905,10 @@ static bool ioc_refresh_params(struct ioc *ioc, bool force) if (idx == ioc->autop_idx && !force) return false; - if (idx != ioc->autop_idx) + if (idx != ioc->autop_idx) { atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC); + ioc->vtime_base_rate = VTIME_PER_USEC; + } ioc->autop_idx = idx; ioc->autop_too_fast_at = 0; @@ -975,7 +976,7 @@ static void ioc_adjust_base_vrate(struct ioc *ioc, u32 rq_wait_pct, if (!ioc->busy_level || (ioc->busy_level < 0 && nr_lagging)) { if (ioc->busy_level != prev_busy_level || nr_lagging) - trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate), + trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct, nr_lagging, nr_shortages); @@ -1018,10 +1019,11 @@ static void ioc_adjust_base_vrate(struct ioc *ioc, u32 rq_wait_pct, static void ioc_now(struct ioc *ioc, struct ioc_now *now) { unsigned seq; + u64 vrate; now->now_ns = ktime_get(); now->now = ktime_to_us(now->now_ns); - now->vrate = atomic64_read(&ioc->vtime_rate); + vrate = atomic64_read(&ioc->vtime_rate); /* * The current vtime is @@ -1034,7 +1036,7 @@ static void ioc_now(struct ioc *ioc, struct ioc_now *now) do { seq = read_seqcount_begin(&ioc->period_seqcount); now->vnow = ioc->period_at_vtime + - (now->now - ioc->period_at) * now->vrate; + (now->now - ioc->period_at) * vrate; } while (read_seqcount_retry(&ioc->period_seqcount, seq)); } @@ -2203,8 +2205,8 @@ static void ioc_timer_fn(struct timer_list *timer) LIST_HEAD(surpluses); int nr_debtors, nr_shortages = 0, nr_lagging = 0; u64 usage_us_sum = 0; - u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM]; - u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM]; + u32 ppm_rthr; + u32 ppm_wthr; u32 missed_ppm[2], rq_wait_pct; u64 period_vtime; int prev_busy_level; @@ -2215,6 +2217,8 @@ static void ioc_timer_fn(struct timer_list *timer) /* take care of active iocgs */ spin_lock_irq(&ioc->lock); + ppm_rthr = 
MILLION - ioc->params.qos[QOS_RPPM]; + ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM]; ioc_now(ioc, &now); period_vtime = now.vnow - ioc->period_at_vtime; @@ -2878,7 +2882,7 @@ static int blk_iocost_init(struct gendisk *disk) spin_unlock_irq(&ioc->lock); /* - * rqos must be added before activation to allow iocg_pd_init() to + * rqos must be added before activation to allow ioc_pd_init() to * lookup the ioc from q. This means that the rqos methods may get * called before policy activation completion, can't assume that the * target bio has an iocg associated and need to test for NULL iocg. @@ -3187,11 +3191,13 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, ioc = q_to_ioc(disk->queue); } + blk_mq_freeze_queue(disk->queue); + blk_mq_quiesce_queue(disk->queue); + spin_lock_irq(&ioc->lock); memcpy(qos, ioc->params.qos, sizeof(qos)); enable = ioc->enabled; user = ioc->user_qos_params; - spin_unlock_irq(&ioc->lock); while ((p = strsep(&input, " \t\n"))) { substring_t args[MAX_OPT_ARGS]; @@ -3258,15 +3264,15 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, if (qos[QOS_MIN] > qos[QOS_MAX]) goto einval; - spin_lock_irq(&ioc->lock); - if (enable) { blk_stat_enable_accounting(disk->queue); blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue); ioc->enabled = true; + wbt_disable_default(disk->queue); } else { blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue); ioc->enabled = false; + wbt_enable_default(disk->queue); } if (user) { @@ -3279,9 +3285,17 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, ioc_refresh_params(ioc, true); spin_unlock_irq(&ioc->lock); + blk_mq_unquiesce_queue(disk->queue); + blk_mq_unfreeze_queue(disk->queue); + blkdev_put_no_open(bdev); return nbytes; einval: + spin_unlock_irq(&ioc->lock); + + blk_mq_unquiesce_queue(disk->queue); + blk_mq_unfreeze_queue(disk->queue); + ret = -EINVAL; err: blkdev_put_no_open(bdev); @@ -3336,6 +3350,7 @@ static ssize_t 
ioc_cost_model_write(struct kernfs_open_file *of, char *input, size_t nbytes, loff_t off) { struct block_device *bdev; + struct request_queue *q; struct ioc *ioc; u64 u[NR_I_LCOEFS]; bool user; @@ -3346,18 +3361,21 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, if (IS_ERR(bdev)) return PTR_ERR(bdev); - ioc = q_to_ioc(bdev_get_queue(bdev)); + q = bdev_get_queue(bdev); + ioc = q_to_ioc(q); if (!ioc) { ret = blk_iocost_init(bdev->bd_disk); if (ret) goto err; - ioc = q_to_ioc(bdev_get_queue(bdev)); + ioc = q_to_ioc(q); } + blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); + spin_lock_irq(&ioc->lock); memcpy(u, ioc->params.i_lcoefs, sizeof(u)); user = ioc->user_cost_model; - spin_unlock_irq(&ioc->lock); while ((p = strsep(&input, " \t\n"))) { substring_t args[MAX_OPT_ARGS]; @@ -3394,7 +3412,6 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, user = true; } - spin_lock_irq(&ioc->lock); if (user) { memcpy(ioc->params.i_lcoefs, u, sizeof(u)); ioc->user_cost_model = true; @@ -3404,10 +3421,18 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, ioc_refresh_params(ioc, true); spin_unlock_irq(&ioc->lock); + blk_mq_unquiesce_queue(q); + blk_mq_unfreeze_queue(q); + blkdev_put_no_open(bdev); return nbytes; einval: + spin_unlock_irq(&ioc->lock); + + blk_mq_unquiesce_queue(q); + blk_mq_unfreeze_queue(q); + ret = -EINVAL; err: blkdev_put_no_open(bdev); diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 571fa95aafe9..778a0057193e 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -141,7 +141,7 @@ struct iolatency_grp { struct latency_stat __percpu *stats; struct latency_stat cur_stat; struct blk_iolatency *blkiolat; - struct rq_depth rq_depth; + unsigned int max_depth; struct rq_wait rq_wait; atomic64_t window_start; atomic_t scale_cookie; @@ -280,7 +280,7 @@ static void iolat_cleanup_cb(struct rq_wait *rqw, void *private_data) static bool 
iolat_acquire_inflight(struct rq_wait *rqw, void *private_data) { struct iolatency_grp *iolat = private_data; - return rq_wait_inc_below(rqw, iolat->rq_depth.max_depth); + return rq_wait_inc_below(rqw, iolat->max_depth); } static void __blkcg_iolatency_throttle(struct rq_qos *rqos, @@ -364,15 +364,17 @@ static void scale_cookie_change(struct blk_iolatency *blkiolat, } /* - * Change the queue depth of the iolatency_grp. We add/subtract 1/16th of the + * Change the queue depth of the iolatency_grp. We add 1/16th of the * queue depth at a time so we don't get wild swings and hopefully dial in to - * fairer distribution of the overall queue depth. + * fairer distribution of the overall queue depth. We halve the queue depth + * at a time so we can scale down queue depth quickly from default unlimited + * to target. */ static void scale_change(struct iolatency_grp *iolat, bool up) { unsigned long qd = iolat->blkiolat->rqos.q->nr_requests; unsigned long scale = scale_amount(qd, up); - unsigned long old = iolat->rq_depth.max_depth; + unsigned long old = iolat->max_depth; if (old > qd) old = qd; @@ -384,12 +386,12 @@ static void scale_change(struct iolatency_grp *iolat, bool up) if (old < qd) { old += scale; old = min(old, qd); - iolat->rq_depth.max_depth = old; + iolat->max_depth = old; wake_up_all(&iolat->rq_wait.wait); } } else { old >>= 1; - iolat->rq_depth.max_depth = max(old, 1UL); + iolat->max_depth = max(old, 1UL); } } @@ -403,9 +405,6 @@ static void check_scale_change(struct iolatency_grp *iolat) u64 scale_lat; int direction = 0; - if (lat_to_blkg(iolat)->parent == NULL) - return; - parent = blkg_to_lat(lat_to_blkg(iolat)->parent); if (!parent) return; @@ -445,7 +444,7 @@ static void check_scale_change(struct iolatency_grp *iolat) } /* We're as low as we can go. 
*/ - if (iolat->rq_depth.max_depth == 1 && direction < 0) { + if (iolat->max_depth == 1 && direction < 0) { blkcg_use_delay(lat_to_blkg(iolat)); return; } @@ -453,7 +452,7 @@ static void check_scale_change(struct iolatency_grp *iolat) /* We're back to the default cookie, unthrottle all the things. */ if (cur_cookie == DEFAULT_SCALE_COOKIE) { blkcg_clear_delay(lat_to_blkg(iolat)); - iolat->rq_depth.max_depth = UINT_MAX; + iolat->max_depth = UINT_MAX; wake_up_all(&iolat->rq_wait.wait); return; } @@ -508,7 +507,7 @@ static void iolatency_record_time(struct iolatency_grp *iolat, * We don't want to count issue_as_root bio's in the cgroups latency * statistics as it could skew the numbers downwards. */ - if (unlikely(issue_as_root && iolat->rq_depth.max_depth != UINT_MAX)) { + if (unlikely(issue_as_root && iolat->max_depth != UINT_MAX)) { u64 sub = iolat->min_lat_nsec; if (req_time < sub) blkcg_add_delay(lat_to_blkg(iolat), now, sub - req_time); @@ -920,7 +919,7 @@ static void iolatency_ssd_stat(struct iolatency_grp *iolat, struct seq_file *s) } preempt_enable(); - if (iolat->rq_depth.max_depth == UINT_MAX) + if (iolat->max_depth == UINT_MAX) seq_printf(s, " missed=%llu total=%llu depth=max", (unsigned long long)stat.ps.missed, (unsigned long long)stat.ps.total); @@ -928,7 +927,7 @@ static void iolatency_ssd_stat(struct iolatency_grp *iolat, struct seq_file *s) seq_printf(s, " missed=%llu total=%llu depth=%u", (unsigned long long)stat.ps.missed, (unsigned long long)stat.ps.total, - iolat->rq_depth.max_depth); + iolat->max_depth); } static void iolatency_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) @@ -945,12 +944,12 @@ static void iolatency_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC); cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC); - if (iolat->rq_depth.max_depth == UINT_MAX) + if (iolat->max_depth == UINT_MAX) seq_printf(s, " depth=max avg_lat=%llu win=%llu", avg_lat, cur_win); else 
seq_printf(s, " depth=%u avg_lat=%llu win=%llu", - iolat->rq_depth.max_depth, avg_lat, cur_win); + iolat->max_depth, avg_lat, cur_win); } static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp, @@ -994,9 +993,7 @@ static void iolatency_pd_init(struct blkg_policy_data *pd) latency_stat_init(iolat, &iolat->cur_stat); rq_wait_init(&iolat->rq_wait); spin_lock_init(&iolat->child_lat.lock); - iolat->rq_depth.queue_depth = blkg->q->nr_requests; - iolat->rq_depth.max_depth = UINT_MAX; - iolat->rq_depth.default_depth = iolat->rq_depth.queue_depth; + iolat->max_depth = UINT_MAX; iolat->blkiolat = blkiolat; iolat->cur_win_nsec = 100 * NSEC_PER_MSEC; atomic64_set(&iolat->window_start, now); diff --git a/block/blk-map.c b/block/blk-map.c index 34735626b00f..19940c978c73 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -267,6 +267,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, { unsigned int max_sectors = queue_max_hw_sectors(rq->q); unsigned int nr_vecs = iov_iter_npages(iter, BIO_MAX_VECS); + unsigned int gup_flags = 0; struct bio *bio; int ret; int j; @@ -278,6 +279,9 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, if (bio == NULL) return -ENOMEM; + if (blk_queue_pci_p2pdma(rq->q)) + gup_flags |= FOLL_PCI_P2PDMA; + while (iov_iter_count(iter)) { struct page **pages, *stack_pages[UIO_FASTIOV]; ssize_t bytes; @@ -286,11 +290,11 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, if (nr_vecs <= ARRAY_SIZE(stack_pages)) { pages = stack_pages; - bytes = iov_iter_get_pages2(iter, pages, LONG_MAX, - nr_vecs, &offs); + bytes = iov_iter_get_pages(iter, pages, LONG_MAX, + nr_vecs, &offs, gup_flags); } else { - bytes = iov_iter_get_pages_alloc2(iter, &pages, - LONG_MAX, &offs); + bytes = iov_iter_get_pages_alloc(iter, &pages, + LONG_MAX, &offs, gup_flags); } if (unlikely(bytes <= 0)) { ret = bytes ? 
bytes : -EFAULT; @@ -555,7 +559,7 @@ static int blk_rq_map_user_bvec(struct request *rq, const struct iov_iter *iter) size_t nr_iter = iov_iter_count(iter); size_t nr_segs = iter->nr_segs; struct bio_vec *bvecs, *bvprvp = NULL; - struct queue_limits *lim = &q->limits; + const struct queue_limits *lim = &q->limits; unsigned int nsegs = 0, bytes = 0; struct bio *bio; size_t i; diff --git a/block/blk-merge.c b/block/blk-merge.c index ff04e9290715..35a8f75cc45d 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -100,13 +100,14 @@ static inline bool req_gap_front_merge(struct request *req, struct bio *bio) * is defined as 'unsigned int', meantime it has to be aligned to with the * logical block size, which is the minimum accepted unit by hardware. */ -static unsigned int bio_allowed_max_sectors(struct queue_limits *lim) +static unsigned int bio_allowed_max_sectors(const struct queue_limits *lim) { return round_down(UINT_MAX, lim->logical_block_size) >> SECTOR_SHIFT; } -static struct bio *bio_split_discard(struct bio *bio, struct queue_limits *lim, - unsigned *nsegs, struct bio_set *bs) +static struct bio *bio_split_discard(struct bio *bio, + const struct queue_limits *lim, + unsigned *nsegs, struct bio_set *bs) { unsigned int max_discard_sectors, granularity; sector_t tmp; @@ -146,7 +147,8 @@ static struct bio *bio_split_discard(struct bio *bio, struct queue_limits *lim, } static struct bio *bio_split_write_zeroes(struct bio *bio, - struct queue_limits *lim, unsigned *nsegs, struct bio_set *bs) + const struct queue_limits *lim, + unsigned *nsegs, struct bio_set *bs) { *nsegs = 0; if (!lim->max_write_zeroes_sectors) @@ -165,7 +167,7 @@ static struct bio *bio_split_write_zeroes(struct bio *bio, * aligned to a physical block boundary. 
*/ static inline unsigned get_max_io_size(struct bio *bio, - struct queue_limits *lim) + const struct queue_limits *lim) { unsigned pbs = lim->physical_block_size >> SECTOR_SHIFT; unsigned lbs = lim->logical_block_size >> SECTOR_SHIFT; @@ -184,7 +186,15 @@ static inline unsigned get_max_io_size(struct bio *bio, return max_sectors & ~(lbs - 1); } -static inline unsigned get_max_segment_size(struct queue_limits *lim, +/** + * get_max_segment_size() - maximum number of bytes to add as a single segment + * @lim: Request queue limits. + * @start_page: See below. + * @offset: Offset from @start_page where to add a segment. + * + * Returns the maximum number of bytes that can be added as a single segment. + */ +static inline unsigned get_max_segment_size(const struct queue_limits *lim, struct page *start_page, unsigned long offset) { unsigned long mask = lim->seg_boundary_mask; @@ -192,11 +202,10 @@ static inline unsigned get_max_segment_size(struct queue_limits *lim, offset = mask & (page_to_phys(start_page) + offset); /* - * overflow may be triggered in case of zero page physical address - * on 32bit arch, use queue's max segment size when that happens. + * Prevent an overflow if mask = ULONG_MAX and offset = 0 by adding 1 + * after having calculated the minimum. */ - return min_not_zero(mask - offset + 1, - (unsigned long)lim->max_segment_size); + return min(mask - offset, (unsigned long)lim->max_segment_size - 1) + 1; } /** @@ -219,9 +228,9 @@ static inline unsigned get_max_segment_size(struct queue_limits *lim, * *@nsegs segments and *@sectors sectors would make that bio unacceptable for * the block driver. 
*/ -static bool bvec_split_segs(struct queue_limits *lim, const struct bio_vec *bv, - unsigned *nsegs, unsigned *bytes, unsigned max_segs, - unsigned max_bytes) +static bool bvec_split_segs(const struct queue_limits *lim, + const struct bio_vec *bv, unsigned *nsegs, unsigned *bytes, + unsigned max_segs, unsigned max_bytes) { unsigned max_len = min(max_bytes, UINT_MAX) - *bytes; unsigned len = min(bv->bv_len, max_len); @@ -267,7 +276,7 @@ static bool bvec_split_segs(struct queue_limits *lim, const struct bio_vec *bv, * responsible for ensuring that @bs is only destroyed after processing of the * split bio has finished. */ -static struct bio *bio_split_rw(struct bio *bio, struct queue_limits *lim, +static struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, unsigned *segs, struct bio_set *bs, unsigned max_bytes) { struct bio_vec bv, bvprv, *bvprvp = NULL; @@ -331,8 +340,9 @@ split: * The split bio is allocated from @q->bio_split, which is provided by the * block layer. 
*/ -struct bio *__bio_split_to_limits(struct bio *bio, struct queue_limits *lim, - unsigned int *nr_segs) +struct bio *__bio_split_to_limits(struct bio *bio, + const struct queue_limits *lim, + unsigned int *nr_segs) { struct bio_set *bs = &bio->bi_bdev->bd_disk->bio_split; struct bio *split; @@ -377,7 +387,7 @@ struct bio *__bio_split_to_limits(struct bio *bio, struct queue_limits *lim, */ struct bio *bio_split_to_limits(struct bio *bio) { - struct queue_limits *lim = &bdev_get_queue(bio->bi_bdev)->limits; + const struct queue_limits *lim = &bdev_get_queue(bio->bi_bdev)->limits; unsigned int nr_segs; if (bio_may_exceed_limits(bio, lim)) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index a4f7c101b53b..23d1a90fec42 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -555,6 +555,7 @@ static int blk_mq_init_sched_shared_tags(struct request_queue *queue) return 0; } +/* caller must have a reference to @e, will grab another one if successful */ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) { unsigned int flags = q->tag_set->flags; @@ -563,13 +564,6 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) unsigned long i; int ret; - if (!e) { - blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q); - q->elevator = NULL; - q->nr_requests = q->tag_set->queue_depth; - return 0; - } - /* * Default to double of smaller one between hw queue_depth and 128, * since we don't split into sync/async like the old code did. 
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 93997d297d42..4515288fbe35 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -185,7 +185,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct blk_mq_ctx *ctx; - int i, ret; + int i, j, ret; if (!hctx->nr_ctx) return 0; @@ -197,9 +197,16 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx) hctx_for_each_ctx(hctx, ctx, i) { ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu); if (ret) - break; + goto out; } + return 0; +out: + hctx_for_each_ctx(hctx, ctx, j) { + if (j < i) + kobject_del(&ctx->kobj); + } + kobject_del(&hctx->kobj); return ret; } diff --git a/block/blk-mq.c b/block/blk-mq.c index 228a6696d835..c5cf0dbca1db 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -254,15 +254,17 @@ EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait); /** * blk_mq_wait_quiesce_done() - wait until in-progress quiesce is done - * @q: request queue. + * @set: tag_set to wait on * * Note: it is driver's responsibility for making sure that quiesce has - * been started. + * been started on one or more of the request_queues of the tag_set. This + * function only waits for the quiesce on those request_queues that had + * the quiesce flag set using blk_mq_quiesce_queue_nowait. 
*/ -void blk_mq_wait_quiesce_done(struct request_queue *q) +void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set) { - if (blk_queue_has_srcu(q)) - synchronize_srcu(q->srcu); + if (set->flags & BLK_MQ_F_BLOCKING) + synchronize_srcu(set->srcu); else synchronize_rcu(); } @@ -280,7 +282,9 @@ EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done); void blk_mq_quiesce_queue(struct request_queue *q) { blk_mq_quiesce_queue_nowait(q); - blk_mq_wait_quiesce_done(q); + /* nothing to wait for non-mq queues */ + if (queue_is_mq(q)) + blk_mq_wait_quiesce_done(q->tag_set); } EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue); @@ -311,6 +315,33 @@ void blk_mq_unquiesce_queue(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue); +void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set) +{ + struct request_queue *q; + + mutex_lock(&set->tag_list_lock); + list_for_each_entry(q, &set->tag_list, tag_set_list) { + if (!blk_queue_skip_tagset_quiesce(q)) + blk_mq_quiesce_queue_nowait(q); + } + blk_mq_wait_quiesce_done(set); + mutex_unlock(&set->tag_list_lock); +} +EXPORT_SYMBOL_GPL(blk_mq_quiesce_tagset); + +void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set) +{ + struct request_queue *q; + + mutex_lock(&set->tag_list_lock); + list_for_each_entry(q, &set->tag_list, tag_set_list) { + if (!blk_queue_skip_tagset_quiesce(q)) + blk_mq_unquiesce_queue(q); + } + mutex_unlock(&set->tag_list_lock); +} +EXPORT_SYMBOL_GPL(blk_mq_unquiesce_tagset); + void blk_mq_wake_waiters(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; @@ -544,25 +575,26 @@ static struct request *blk_mq_alloc_cached_request(struct request_queue *q, if (!plug) return NULL; + if (rq_list_empty(plug->cached_rq)) { if (plug->nr_ios == 1) return NULL; rq = blk_mq_rq_cache_fill(q, plug, opf, flags); - if (rq) - goto got_it; - return NULL; - } - rq = rq_list_peek(&plug->cached_rq); - if (!rq || rq->q != q) - return NULL; + if (!rq) + return NULL; + } else { + rq = rq_list_peek(&plug->cached_rq); + if (!rq || rq->q != q) + return 
NULL; - if (blk_mq_get_hctx_type(opf) != rq->mq_hctx->type) - return NULL; - if (op_is_flush(rq->cmd_flags) != op_is_flush(opf)) - return NULL; + if (blk_mq_get_hctx_type(opf) != rq->mq_hctx->type) + return NULL; + if (op_is_flush(rq->cmd_flags) != op_is_flush(opf)) + return NULL; + + plug->cached_rq = rq_list_next(rq); + } - plug->cached_rq = rq_list_next(rq); -got_it: rq->cmd_flags = opf; INIT_LIST_HEAD(&rq->queuelist); return rq; @@ -1529,7 +1561,13 @@ static void blk_mq_rq_timed_out(struct request *req) blk_add_timer(req); } -static bool blk_mq_req_expired(struct request *rq, unsigned long *next) +struct blk_expired_data { + bool has_timedout_rq; + unsigned long next; + unsigned long timeout_start; +}; + +static bool blk_mq_req_expired(struct request *rq, struct blk_expired_data *expired) { unsigned long deadline; @@ -1539,13 +1577,13 @@ static bool blk_mq_req_expired(struct request *rq, unsigned long *next) return false; deadline = READ_ONCE(rq->deadline); - if (time_after_eq(jiffies, deadline)) + if (time_after_eq(expired->timeout_start, deadline)) return true; - if (*next == 0) - *next = deadline; - else if (time_after(*next, deadline)) - *next = deadline; + if (expired->next == 0) + expired->next = deadline; + else if (time_after(expired->next, deadline)) + expired->next = deadline; return false; } @@ -1561,7 +1599,7 @@ void blk_mq_put_rq_ref(struct request *rq) static bool blk_mq_check_expired(struct request *rq, void *priv) { - unsigned long *next = priv; + struct blk_expired_data *expired = priv; /* * blk_mq_queue_tag_busy_iter() has locked the request, so it cannot @@ -1570,7 +1608,18 @@ static bool blk_mq_check_expired(struct request *rq, void *priv) * it was completed and reallocated as a new request after returning * from blk_mq_check_expired(). 
*/ - if (blk_mq_req_expired(rq, next)) + if (blk_mq_req_expired(rq, expired)) { + expired->has_timedout_rq = true; + return false; + } + return true; +} + +static bool blk_mq_handle_expired(struct request *rq, void *priv) +{ + struct blk_expired_data *expired = priv; + + if (blk_mq_req_expired(rq, expired)) blk_mq_rq_timed_out(rq); return true; } @@ -1579,7 +1628,9 @@ static void blk_mq_timeout_work(struct work_struct *work) { struct request_queue *q = container_of(work, struct request_queue, timeout_work); - unsigned long next = 0; + struct blk_expired_data expired = { + .timeout_start = jiffies, + }; struct blk_mq_hw_ctx *hctx; unsigned long i; @@ -1599,10 +1650,23 @@ static void blk_mq_timeout_work(struct work_struct *work) if (!percpu_ref_tryget(&q->q_usage_counter)) return; - blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next); + /* check if there is any timed-out request */ + blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &expired); + if (expired.has_timedout_rq) { + /* + * Before walking tags, we must ensure any submit started + * before the current time has finished. Since the submit + * uses srcu or rcu, wait for a synchronization point to + * ensure all running submits have finished + */ + blk_mq_wait_quiesce_done(q->tag_set); + + expired.next = 0; + blk_mq_queue_tag_busy_iter(q, blk_mq_handle_expired, &expired); + } - if (next != 0) { - mod_timer(&q->timeout, next); + if (expired.next != 0) { + mod_timer(&q->timeout, expired.next); } else { /* * Request timeouts are handled as a forward rolling timer. 
If @@ -3248,21 +3312,22 @@ static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node); - if (!tags->rqs) { - blk_mq_free_tags(tags); - return NULL; - } + if (!tags->rqs) + goto err_free_tags; tags->static_rqs = kcalloc_node(nr_tags, sizeof(struct request *), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node); - if (!tags->static_rqs) { - kfree(tags->rqs); - blk_mq_free_tags(tags); - return NULL; - } + if (!tags->static_rqs) + goto err_free_rqs; return tags; + +err_free_rqs: + kfree(tags->rqs); +err_free_tags: + blk_mq_free_tags(tags); + return NULL; } static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq, @@ -3975,7 +4040,7 @@ static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, struct request_queue *q; int ret; - q = blk_alloc_queue(set->numa_node, set->flags & BLK_MQ_F_BLOCKING); + q = blk_alloc_queue(set->numa_node); if (!q) return ERR_PTR(-ENOMEM); q->queuedata = queuedata; @@ -4011,14 +4076,11 @@ void blk_mq_destroy_queue(struct request_queue *q) blk_queue_flag_set(QUEUE_FLAG_DYING, q); blk_queue_start_drain(q); - blk_freeze_queue(q); + blk_mq_freeze_queue_wait(q); blk_sync_queue(q); blk_mq_cancel_work_sync(q); blk_mq_exit_queue(q); - - /* @q is and will stay empty, shutdown and put */ - blk_put_queue(q); } EXPORT_SYMBOL(blk_mq_destroy_queue); @@ -4035,6 +4097,7 @@ struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata, disk = __alloc_disk_node(q, set->numa_node, lkclass); if (!disk) { blk_mq_destroy_queue(q); + blk_put_queue(q); return ERR_PTR(-ENOMEM); } set_bit(GD_OWNS_QUEUE, &disk->state); @@ -4147,9 +4210,6 @@ static void blk_mq_update_poll_flag(struct request_queue *q) int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q) { - WARN_ON_ONCE(blk_queue_has_srcu(q) != - !!(set->flags & BLK_MQ_F_BLOCKING)); - /* mark the queue as mq asap 
*/ q->mq_ops = set->ops; @@ -4325,12 +4385,12 @@ static void blk_mq_update_queue_map(struct blk_mq_tag_set *set) } static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set, - int cur_nr_hw_queues, int new_nr_hw_queues) + int new_nr_hw_queues) { struct blk_mq_tags **new_tags; - if (cur_nr_hw_queues >= new_nr_hw_queues) - return 0; + if (set->nr_hw_queues >= new_nr_hw_queues) + goto done; new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *), GFP_KERNEL, set->numa_node); @@ -4338,21 +4398,15 @@ static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set, return -ENOMEM; if (set->tags) - memcpy(new_tags, set->tags, cur_nr_hw_queues * + memcpy(new_tags, set->tags, set->nr_hw_queues * sizeof(*set->tags)); kfree(set->tags); set->tags = new_tags; +done: set->nr_hw_queues = new_nr_hw_queues; - return 0; } -static int blk_mq_alloc_tag_set_tags(struct blk_mq_tag_set *set, - int new_nr_hw_queues) -{ - return blk_mq_realloc_tag_set_tags(set, 0, new_nr_hw_queues); -} - /* * Alloc a tag set to be associated with one or more request queues. * May fail with EINVAL for various error conditions. 
May adjust the @@ -4406,10 +4460,22 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids) set->nr_hw_queues = nr_cpu_ids; - if (blk_mq_alloc_tag_set_tags(set, set->nr_hw_queues) < 0) - return -ENOMEM; + if (set->flags & BLK_MQ_F_BLOCKING) { + set->srcu = kmalloc(sizeof(*set->srcu), GFP_KERNEL); + if (!set->srcu) + return -ENOMEM; + ret = init_srcu_struct(set->srcu); + if (ret) + goto out_free_srcu; + } ret = -ENOMEM; + set->tags = kcalloc_node(set->nr_hw_queues, + sizeof(struct blk_mq_tags *), GFP_KERNEL, + set->numa_node); + if (!set->tags) + goto out_cleanup_srcu; + for (i = 0; i < set->nr_maps; i++) { set->map[i].mq_map = kcalloc_node(nr_cpu_ids, sizeof(set->map[i].mq_map[0]), @@ -4437,6 +4503,12 @@ out_free_mq_map: } kfree(set->tags); set->tags = NULL; +out_cleanup_srcu: + if (set->flags & BLK_MQ_F_BLOCKING) + cleanup_srcu_struct(set->srcu); +out_free_srcu: + if (set->flags & BLK_MQ_F_BLOCKING) + kfree(set->srcu); return ret; } EXPORT_SYMBOL(blk_mq_alloc_tag_set); @@ -4476,6 +4548,10 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) kfree(set->tags); set->tags = NULL; + if (set->flags & BLK_MQ_F_BLOCKING) { + cleanup_srcu_struct(set->srcu); + kfree(set->srcu); + } } EXPORT_SYMBOL(blk_mq_free_tag_set); @@ -4564,17 +4640,10 @@ static bool blk_mq_elv_switch_none(struct list_head *head, INIT_LIST_HEAD(&qe->node); qe->q = q; qe->type = q->elevator->type; + /* keep a reference to the elevator module as we'll switch back */ + __elevator_get(qe->type); list_add(&qe->node, head); - - /* - * After elevator_switch, the previous elevator_queue will be - * released by elevator_release. The reference of the io scheduler - * module get by elevator_get will also be put. So we need to get - * a reference of the io scheduler module here to prevent it to be - * removed. 
- */ - __module_get(qe->type->elevator_owner); - elevator_switch(q, NULL); + elevator_disable(q); mutex_unlock(&q->sysfs_lock); return true; @@ -4607,6 +4676,8 @@ static void blk_mq_elv_switch_back(struct list_head *head, mutex_lock(&q->sysfs_lock); elevator_switch(q, t); + /* drop the reference acquired in blk_mq_elv_switch_none */ + elevator_put(t); mutex_unlock(&q->sysfs_lock); } @@ -4643,11 +4714,9 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, } prev_nr_hw_queues = set->nr_hw_queues; - if (blk_mq_realloc_tag_set_tags(set, set->nr_hw_queues, nr_hw_queues) < - 0) + if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0) goto reregister; - set->nr_hw_queues = nr_hw_queues; fallback: blk_mq_update_queue_map(set); list_for_each_entry(q, &set->tag_list, tag_set_list) { @@ -4867,15 +4936,13 @@ EXPORT_SYMBOL(blk_mq_rq_cpu); void blk_mq_cancel_work_sync(struct request_queue *q) { - if (queue_is_mq(q)) { - struct blk_mq_hw_ctx *hctx; - unsigned long i; + struct blk_mq_hw_ctx *hctx; + unsigned long i; - cancel_delayed_work_sync(&q->requeue_work); + cancel_delayed_work_sync(&q->requeue_work); - queue_for_each_hw_ctx(q, hctx, i) - cancel_delayed_work_sync(&hctx->run_work); - } + queue_for_each_hw_ctx(q, hctx, i) + cancel_delayed_work_sync(&hctx->run_work); } static int __init blk_mq_init(void) diff --git a/block/blk-mq.h b/block/blk-mq.h index 0b2870839cdd..ef59fee62780 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -377,17 +377,17 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, /* run the code block in @dispatch_ops with rcu/srcu read lock held */ #define __blk_mq_run_dispatch_ops(q, check_sleep, dispatch_ops) \ do { \ - if (!blk_queue_has_srcu(q)) { \ - rcu_read_lock(); \ - (dispatch_ops); \ - rcu_read_unlock(); \ - } else { \ + if ((q)->tag_set->flags & BLK_MQ_F_BLOCKING) { \ int srcu_idx; \ \ might_sleep_if(check_sleep); \ - srcu_idx = srcu_read_lock((q)->srcu); \ + srcu_idx = srcu_read_lock((q)->tag_set->srcu); \ 
(dispatch_ops); \ - srcu_read_unlock((q)->srcu, srcu_idx); \ + srcu_read_unlock((q)->tag_set->srcu, srcu_idx); \ + } else { \ + rcu_read_lock(); \ + (dispatch_ops); \ + rcu_read_unlock(); \ } \ } while (0) diff --git a/block/blk-settings.c b/block/blk-settings.c index 8ac1038d0c79..0477c4d527fe 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -481,7 +481,7 @@ void blk_queue_io_opt(struct request_queue *q, unsigned int opt) } EXPORT_SYMBOL(blk_queue_io_opt); -static int queue_limit_alignment_offset(struct queue_limits *lim, +static int queue_limit_alignment_offset(const struct queue_limits *lim, sector_t sector) { unsigned int granularity = max(lim->physical_block_size, lim->io_min); @@ -491,8 +491,8 @@ static int queue_limit_alignment_offset(struct queue_limits *lim, return (granularity + lim->alignment_offset - alignment) % granularity; } -static unsigned int queue_limit_discard_alignment(struct queue_limits *lim, - sector_t sector) +static unsigned int queue_limit_discard_alignment( + const struct queue_limits *lim, sector_t sector) { unsigned int alignment, granularity, offset; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index e71b3b43927c..93d9e9c9a6ea 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -470,6 +470,9 @@ static ssize_t queue_wb_lat_show(struct request_queue *q, char *page) if (!wbt_rq_qos(q)) return -EINVAL; + if (wbt_disabled(q)) + return sprintf(page, "0\n"); + return sprintf(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000)); } @@ -680,8 +683,8 @@ static struct attribute *queue_attrs[] = { static umode_t queue_attr_visible(struct kobject *kobj, struct attribute *attr, int n) { - struct request_queue *q = - container_of(kobj, struct request_queue, kobj); + struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj); + struct request_queue *q = disk->queue; if (attr == &queue_io_timeout_entry.attr && (!q->mq_ops || !q->mq_ops->timeout)) @@ -707,8 +710,8 @@ static ssize_t queue_attr_show(struct kobject 
*kobj, struct attribute *attr, char *page) { struct queue_sysfs_entry *entry = to_queue(attr); - struct request_queue *q = - container_of(kobj, struct request_queue, kobj); + struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj); + struct request_queue *q = disk->queue; ssize_t res; if (!entry->show) @@ -724,68 +727,19 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr, const char *page, size_t length) { struct queue_sysfs_entry *entry = to_queue(attr); - struct request_queue *q; + struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj); + struct request_queue *q = disk->queue; ssize_t res; if (!entry->store) return -EIO; - q = container_of(kobj, struct request_queue, kobj); mutex_lock(&q->sysfs_lock); res = entry->store(q, page, length); mutex_unlock(&q->sysfs_lock); return res; } -static void blk_free_queue_rcu(struct rcu_head *rcu_head) -{ - struct request_queue *q = container_of(rcu_head, struct request_queue, - rcu_head); - - kmem_cache_free(blk_get_queue_kmem_cache(blk_queue_has_srcu(q)), q); -} - -/** - * blk_release_queue - releases all allocated resources of the request_queue - * @kobj: pointer to a kobject, whose container is a request_queue - * - * This function releases all allocated resources of the request queue. - * - * The struct request_queue refcount is incremented with blk_get_queue() and - * decremented with blk_put_queue(). Once the refcount reaches 0 this function - * is called. - * - * Drivers exist which depend on the release of the request_queue to be - * synchronous, it should not be deferred. 
- * - * Context: can sleep - */ -static void blk_release_queue(struct kobject *kobj) -{ - struct request_queue *q = - container_of(kobj, struct request_queue, kobj); - - might_sleep(); - - percpu_ref_exit(&q->q_usage_counter); - - if (q->poll_stat) - blk_stat_remove_callback(q, q->poll_cb); - blk_stat_free_callback(q->poll_cb); - - blk_free_queue_stats(q->stats); - kfree(q->poll_stat); - - if (queue_is_mq(q)) - blk_mq_release(q); - - if (blk_queue_has_srcu(q)) - cleanup_srcu_struct(q->srcu); - - ida_free(&blk_queue_ida, q->id); - call_rcu(&q->rcu_head, blk_free_queue_rcu); -} - static const struct sysfs_ops queue_sysfs_ops = { .show = queue_attr_show, .store = queue_attr_store, @@ -796,12 +750,30 @@ static const struct attribute_group *blk_queue_attr_groups[] = { NULL }; -struct kobj_type blk_queue_ktype = { +static void blk_queue_release(struct kobject *kobj) +{ + /* nothing to do here, all data is associated with the parent gendisk */ +} + +static struct kobj_type blk_queue_ktype = { .default_groups = blk_queue_attr_groups, .sysfs_ops = &queue_sysfs_ops, - .release = blk_release_queue, + .release = blk_queue_release, }; +static void blk_debugfs_remove(struct gendisk *disk) +{ + struct request_queue *q = disk->queue; + + mutex_lock(&q->debugfs_mutex); + blk_trace_shutdown(q); + debugfs_remove_recursive(q->debugfs_dir); + q->debugfs_dir = NULL; + q->sched_debugfs_dir = NULL; + q->rqos_debugfs_dir = NULL; + mutex_unlock(&q->debugfs_mutex); +} + /** * blk_register_queue - register a block layer queue with sysfs * @disk: Disk of which the request queue should be registered with sysfs. 
@@ -812,47 +784,47 @@ int blk_register_queue(struct gendisk *disk) int ret; mutex_lock(&q->sysfs_dir_lock); - - ret = kobject_add(&q->kobj, &disk_to_dev(disk)->kobj, "queue"); + kobject_init(&disk->queue_kobj, &blk_queue_ktype); + ret = kobject_add(&disk->queue_kobj, &disk_to_dev(disk)->kobj, "queue"); if (ret < 0) - goto unlock; + goto out_put_queue_kobj; - if (queue_is_mq(q)) - blk_mq_sysfs_register(disk); + if (queue_is_mq(q)) { + ret = blk_mq_sysfs_register(disk); + if (ret) + goto out_put_queue_kobj; + } mutex_lock(&q->sysfs_lock); mutex_lock(&q->debugfs_mutex); - q->debugfs_dir = debugfs_create_dir(kobject_name(q->kobj.parent), - blk_debugfs_root); + q->debugfs_dir = debugfs_create_dir(disk->disk_name, blk_debugfs_root); if (queue_is_mq(q)) blk_mq_debugfs_register(q); mutex_unlock(&q->debugfs_mutex); ret = disk_register_independent_access_ranges(disk); if (ret) - goto put_dev; + goto out_debugfs_remove; if (q->elevator) { ret = elv_register_queue(q, false); if (ret) - goto put_dev; + goto out_unregister_ia_ranges; } - ret = blk_crypto_sysfs_register(q); + ret = blk_crypto_sysfs_register(disk); if (ret) - goto put_dev; + goto out_elv_unregister; blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); wbt_enable_default(q); blk_throtl_register(disk); /* Now everything is ready and send out KOBJ_ADD uevent */ - kobject_uevent(&q->kobj, KOBJ_ADD); + kobject_uevent(&disk->queue_kobj, KOBJ_ADD); if (q->elevator) kobject_uevent(&q->elevator->kobj, KOBJ_ADD); mutex_unlock(&q->sysfs_lock); - -unlock: mutex_unlock(&q->sysfs_dir_lock); /* @@ -871,13 +843,16 @@ unlock: return ret; -put_dev: +out_elv_unregister: elv_unregister_queue(q); +out_unregister_ia_ranges: disk_unregister_independent_access_ranges(disk); +out_debugfs_remove: + blk_debugfs_remove(disk); mutex_unlock(&q->sysfs_lock); +out_put_queue_kobj: + kobject_put(&disk->queue_kobj); mutex_unlock(&q->sysfs_dir_lock); - kobject_del(&q->kobj); - return ret; } @@ -915,7 +890,7 @@ void blk_unregister_queue(struct gendisk 
*disk) */ if (queue_is_mq(q)) blk_mq_sysfs_unregister(disk); - blk_crypto_sysfs_unregister(q); + blk_crypto_sysfs_unregister(disk); mutex_lock(&q->sysfs_lock); elv_unregister_queue(q); @@ -923,15 +898,9 @@ void blk_unregister_queue(struct gendisk *disk) mutex_unlock(&q->sysfs_lock); /* Now that we've deleted all child objects, we can delete the queue. */ - kobject_uevent(&q->kobj, KOBJ_REMOVE); - kobject_del(&q->kobj); + kobject_uevent(&disk->queue_kobj, KOBJ_REMOVE); + kobject_del(&disk->queue_kobj); mutex_unlock(&q->sysfs_dir_lock); - mutex_lock(&q->debugfs_mutex); - blk_trace_shutdown(q); - debugfs_remove_recursive(q->debugfs_dir); - q->debugfs_dir = NULL; - q->sched_debugfs_dir = NULL; - q->rqos_debugfs_dir = NULL; - mutex_unlock(&q->debugfs_mutex); + blk_debugfs_remove(disk); } diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 847721dc2b2b..6fb5a2f9e1ee 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -129,7 +129,7 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq) /* * cgroup's limit in LIMIT_MAX is scaled if low limit is set. This scale is to * make the IO dispatch more smooth. - * Scale up: linearly scale up according to lapsed time since upgrade. For + * Scale up: linearly scale up according to elapsed time since upgrade. For * every throtl_slice, the limit scales up 1/2 .low limit till the * limit hits .max limit * Scale down: exponentially scale down if a cgroup doesn't hit its .low limit @@ -395,8 +395,9 @@ static void throtl_pd_init(struct blkg_policy_data *pd) * If on the default hierarchy, we switch to properly hierarchical * behavior where limits on a given throtl_grp are applied to the * whole subtree rather than just the group itself. e.g. If 16M - * read_bps limit is set on the root group, the whole system can't - * exceed 16M for the device. + * read_bps limit is set on a parent group, summary bps of + * parent group and its subtree groups can't exceed 16M for the + * device. 
* * If not on the default hierarchy, the broken flat hierarchy * behavior is retained where all throtl_grps are treated as if @@ -644,7 +645,7 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg, * that bandwidth. Do try to make use of that bandwidth while giving * credit. */ - if (time_after_eq(start, tg->slice_start[rw])) + if (time_after(start, tg->slice_start[rw])) tg->slice_start[rw] = start; tg->slice_end[rw] = jiffies + tg->td->throtl_slice; @@ -821,17 +822,15 @@ static void tg_update_carryover(struct throtl_grp *tg) tg->carryover_ios[READ], tg->carryover_ios[WRITE]); } -static bool tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio, - u32 iops_limit, unsigned long *wait) +static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio, + u32 iops_limit) { bool rw = bio_data_dir(bio); unsigned int io_allowed; unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; if (iops_limit == UINT_MAX) { - if (wait) - *wait = 0; - return true; + return 0; } jiffy_elapsed = jiffies - tg->slice_start[rw]; @@ -841,21 +840,16 @@ static bool tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio, io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd) + tg->carryover_ios[rw]; if (tg->io_disp[rw] + 1 <= io_allowed) { - if (wait) - *wait = 0; - return true; + return 0; } /* Calc approx time to dispatch */ jiffy_wait = jiffy_elapsed_rnd - jiffy_elapsed; - - if (wait) - *wait = jiffy_wait; - return false; + return jiffy_wait; } -static bool tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio, - u64 bps_limit, unsigned long *wait) +static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio, + u64 bps_limit) { bool rw = bio_data_dir(bio); u64 bytes_allowed, extra_bytes; @@ -864,9 +858,7 @@ static bool tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio, /* no need to throttle if this bio's bytes have been accounted */ if (bps_limit == U64_MAX || bio_flagged(bio, 
BIO_BPS_THROTTLED)) { - if (wait) - *wait = 0; - return true; + return 0; } jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; @@ -879,9 +871,7 @@ static bool tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio, bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd) + tg->carryover_bytes[rw]; if (tg->bytes_disp[rw] + bio_size <= bytes_allowed) { - if (wait) - *wait = 0; - return true; + return 0; } /* Calc approx time to dispatch */ @@ -896,9 +886,7 @@ static bool tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio, * up we did. Add that time also. */ jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed); - if (wait) - *wait = jiffy_wait; - return false; + return jiffy_wait; } /* @@ -946,8 +934,9 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, jiffies + tg->td->throtl_slice); } - if (tg_within_bps_limit(tg, bio, bps_limit, &bps_wait) && - tg_within_iops_limit(tg, bio, iops_limit, &iops_wait)) { + bps_wait = tg_within_bps_limit(tg, bio, bps_limit); + iops_wait = tg_within_iops_limit(tg, bio, iops_limit); + if (bps_wait + iops_wait == 0) { if (wait) *wait = 0; return true; @@ -1066,7 +1055,6 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw) sq->nr_queued[rw]--; throtl_charge_bio(tg, bio); - bio_set_flag(bio, BIO_BPS_THROTTLED); /* * If our parent is another tg, we just need to transfer @bio to @@ -1079,6 +1067,7 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw) throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg); start_parent_slice_with_credit(tg, parent_tg, rw); } else { + bio_set_flag(bio, BIO_BPS_THROTTLED); throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw], &parent_sq->queued[rw]); BUG_ON(tg->td->nr_queued[rw] <= 0); @@ -1737,7 +1726,18 @@ void blk_throtl_cancel_bios(struct gendisk *disk) * Set the flag to make sure throtl_pending_timer_fn() won't * stop until all throttled bios are dispatched. 
*/ - blkg_to_tg(blkg)->flags |= THROTL_TG_CANCELING; + tg->flags |= THROTL_TG_CANCELING; + + /* + * Do not dispatch cgroup without THROTL_TG_PENDING or cgroup + * will be inserted to service queue without THROTL_TG_PENDING + * set in tg_update_disptime below. Then IO dispatched from + * child in tg_dispatch_one_bio will trigger double insertion + * and corrupt the tree. + */ + if (!(tg->flags & THROTL_TG_PENDING)) + continue; + /* * Update disptime after setting the above flag to make sure * throtl_select_dispatch() won't exit without dispatching. @@ -1762,7 +1762,6 @@ static unsigned long __tg_last_low_overflow_time(struct throtl_grp *tg) return min(rtime, wtime); } -/* tg should not be an intermediate node */ static unsigned long tg_last_low_overflow_time(struct throtl_grp *tg) { struct throtl_service_queue *parent_sq; @@ -1816,24 +1815,29 @@ static bool throtl_tg_is_idle(struct throtl_grp *tg) return ret; } -static bool throtl_tg_can_upgrade(struct throtl_grp *tg) +static bool throtl_low_limit_reached(struct throtl_grp *tg, int rw) { struct throtl_service_queue *sq = &tg->service_queue; - bool read_limit, write_limit; + bool limit = tg->bps[rw][LIMIT_LOW] || tg->iops[rw][LIMIT_LOW]; /* - * if cgroup reaches low limit (if low limit is 0, the cgroup always - * reaches), it's ok to upgrade to next limit + * if low limit is zero, low limit is always reached. + * if low limit is non-zero, we can check if there is any request + * is queued to determine if low limit is reached as we throttle + * request according to limit. 
*/ - read_limit = tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW]; - write_limit = tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]; - if (!read_limit && !write_limit) - return true; - if (read_limit && sq->nr_queued[READ] && - (!write_limit || sq->nr_queued[WRITE])) - return true; - if (write_limit && sq->nr_queued[WRITE] && - (!read_limit || sq->nr_queued[READ])) + return !limit || sq->nr_queued[rw]; +} + +static bool throtl_tg_can_upgrade(struct throtl_grp *tg) +{ + /* + * cgroup reaches low limit when low limit of READ and WRITE are + * both reached, it's ok to upgrade to next limit if cgroup reaches + * low limit + */ + if (throtl_low_limit_reached(tg, READ) && + throtl_low_limit_reached(tg, WRITE)) return true; if (time_after_eq(jiffies, @@ -1951,8 +1955,7 @@ static bool throtl_tg_can_downgrade(struct throtl_grp *tg) * If cgroup is below low limit, consider downgrade and throttle other * cgroups */ - if (time_after_eq(now, td->low_upgrade_time + td->throtl_slice) && - time_after_eq(now, tg_last_low_overflow_time(tg) + + if (time_after_eq(now, tg_last_low_overflow_time(tg) + td->throtl_slice) && (!throtl_tg_is_idle(tg) || !list_empty(&tg_to_blkg(tg)->blkcg->css.children))) @@ -1962,6 +1965,11 @@ static bool throtl_tg_can_downgrade(struct throtl_grp *tg) static bool throtl_hierarchy_can_downgrade(struct throtl_grp *tg) { + struct throtl_data *td = tg->td; + + if (time_before(jiffies, td->low_upgrade_time + td->throtl_slice)) + return false; + while (true) { if (!throtl_tg_can_downgrade(tg)) return false; diff --git a/block/blk-wbt.c b/block/blk-wbt.c index c293e08b301f..68a774d7a7c9 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -27,6 +27,7 @@ #include "blk-wbt.h" #include "blk-rq-qos.h" +#include "elevator.h" #define CREATE_TRACE_POINTS #include <trace/events/wbt.h> @@ -422,6 +423,14 @@ static void wbt_update_limits(struct rq_wb *rwb) rwb_wake_all(rwb); } +bool wbt_disabled(struct request_queue *q) +{ + struct rq_qos *rqos = wbt_rq_qos(q); 
+ + return !rqos || RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT || + RQWB(rqos)->enable_state == WBT_STATE_OFF_MANUAL; +} + u64 wbt_get_min_lat(struct request_queue *q) { struct rq_qos *rqos = wbt_rq_qos(q); @@ -435,8 +444,13 @@ void wbt_set_min_lat(struct request_queue *q, u64 val) struct rq_qos *rqos = wbt_rq_qos(q); if (!rqos) return; + RQWB(rqos)->min_lat_nsec = val; - RQWB(rqos)->enable_state = WBT_STATE_ON_MANUAL; + if (val) + RQWB(rqos)->enable_state = WBT_STATE_ON_MANUAL; + else + RQWB(rqos)->enable_state = WBT_STATE_OFF_MANUAL; + wbt_update_limits(RQWB(rqos)); } @@ -638,11 +652,15 @@ void wbt_set_write_cache(struct request_queue *q, bool write_cache_on) */ void wbt_enable_default(struct request_queue *q) { - struct rq_qos *rqos = wbt_rq_qos(q); + struct rq_qos *rqos; + bool disable_flag = q->elevator && + test_bit(ELEVATOR_FLAG_DISABLE_WBT, &q->elevator->flags); /* Throttling already enabled? */ + rqos = wbt_rq_qos(q); if (rqos) { - if (RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT) + if (!disable_flag && + RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT) RQWB(rqos)->enable_state = WBT_STATE_ON_DEFAULT; return; } @@ -651,7 +669,7 @@ void wbt_enable_default(struct request_queue *q) if (!blk_queue_registered(q)) return; - if (queue_is_mq(q) && IS_ENABLED(CONFIG_BLK_WBT_MQ)) + if (queue_is_mq(q) && !disable_flag) wbt_init(q); } EXPORT_SYMBOL_GPL(wbt_enable_default); diff --git a/block/blk-wbt.h b/block/blk-wbt.h index 7e44eccc676d..e3ea6e7e2900 100644 --- a/block/blk-wbt.h +++ b/block/blk-wbt.h @@ -28,13 +28,15 @@ enum { }; /* - * Enable states. Either off, or on by default (done at init time), - * or on through manual setup in sysfs. + * If current state is WBT_STATE_ON/OFF_DEFAULT, it can be covered to any other + * state, if current state is WBT_STATE_ON/OFF_MANUAL, it can only be covered + * to WBT_STATE_OFF/ON_MANUAL. 
*/ enum { - WBT_STATE_ON_DEFAULT = 1, - WBT_STATE_ON_MANUAL = 2, - WBT_STATE_OFF_DEFAULT + WBT_STATE_ON_DEFAULT = 1, /* on by default */ + WBT_STATE_ON_MANUAL = 2, /* on manually by sysfs */ + WBT_STATE_OFF_DEFAULT = 3, /* off by default */ + WBT_STATE_OFF_MANUAL = 4, /* off manually by sysfs */ }; struct rq_wb { @@ -94,6 +96,7 @@ void wbt_enable_default(struct request_queue *); u64 wbt_get_min_lat(struct request_queue *q); void wbt_set_min_lat(struct request_queue *q, u64 val); +bool wbt_disabled(struct request_queue *); void wbt_set_write_cache(struct request_queue *, bool); @@ -125,6 +128,10 @@ static inline u64 wbt_default_latency_nsec(struct request_queue *q) { return 0; } +static inline bool wbt_disabled(struct request_queue *q) +{ + return true; +} #endif /* CONFIG_BLK_WBT */ diff --git a/block/blk.h b/block/blk.h index a186ea20f39d..4c3b3325219a 100644 --- a/block/blk.h +++ b/block/blk.h @@ -26,11 +26,6 @@ struct blk_flush_queue { spinlock_t mq_flush_lock; }; -extern struct kmem_cache *blk_requestq_cachep; -extern struct kmem_cache *blk_requestq_srcu_cachep; -extern struct kobj_type blk_queue_ktype; -extern struct ida blk_queue_ida; - bool is_flush_rq(struct request *req); struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size, @@ -104,7 +99,7 @@ static inline bool biovec_phys_mergeable(struct request_queue *q, return true; } -static inline bool __bvec_gap_to_prev(struct queue_limits *lim, +static inline bool __bvec_gap_to_prev(const struct queue_limits *lim, struct bio_vec *bprv, unsigned int offset) { return (offset & lim->virt_boundary_mask) || @@ -115,7 +110,7 @@ static inline bool __bvec_gap_to_prev(struct queue_limits *lim, * Check if adding a bio_vec after bprv with offset would create a gap in * the SG list. Most drivers don't care about this, but some do. 
*/ -static inline bool bvec_gap_to_prev(struct queue_limits *lim, +static inline bool bvec_gap_to_prev(const struct queue_limits *lim, struct bio_vec *bprv, unsigned int offset) { if (!lim->virt_boundary_mask) @@ -278,6 +273,7 @@ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list, void blk_insert_flush(struct request *rq); int elevator_switch(struct request_queue *q, struct elevator_type *new_e); +void elevator_disable(struct request_queue *q); void elevator_exit(struct request_queue *q); int elv_register_queue(struct request_queue *q, bool uevent); void elv_unregister_queue(struct request_queue *q); @@ -297,7 +293,7 @@ ssize_t part_timeout_store(struct device *, struct device_attribute *, const char *, size_t); static inline bool bio_may_exceed_limits(struct bio *bio, - struct queue_limits *lim) + const struct queue_limits *lim) { switch (bio_op(bio)) { case REQ_OP_DISCARD: @@ -320,8 +316,9 @@ static inline bool bio_may_exceed_limits(struct bio *bio, bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > PAGE_SIZE; } -struct bio *__bio_split_to_limits(struct bio *bio, struct queue_limits *lim, - unsigned int *nr_segs); +struct bio *__bio_split_to_limits(struct bio *bio, + const struct queue_limits *lim, + unsigned int *nr_segs); int ll_back_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs); bool blk_attempt_req_merge(struct request_queue *q, struct request *rq, @@ -428,15 +425,9 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, unsigned int max_sectors, bool *same_page); -static inline struct kmem_cache *blk_get_queue_kmem_cache(bool srcu) -{ - if (srcu) - return blk_requestq_srcu_cachep; - return blk_requestq_cachep; -} -struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu); +struct request_queue *blk_alloc_queue(int node_id); -int disk_scan_partitions(struct gendisk *disk, fmode_t mode); +int disk_scan_partitions(struct gendisk *disk, 
fmode_t mode, void *owner); int disk_alloc_events(struct gendisk *disk); void disk_add_events(struct gendisk *disk); diff --git a/block/bsg-lib.c b/block/bsg-lib.c index d6f5dcdce748..435c32373cd6 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -325,6 +325,7 @@ void bsg_remove_queue(struct request_queue *q) bsg_unregister_queue(bset->bd); blk_mq_destroy_queue(q); + blk_put_queue(q); blk_mq_free_tag_set(&bset->tag_set); kfree(bset); } @@ -400,6 +401,7 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name, return q; out_cleanup_queue: blk_mq_destroy_queue(q); + blk_put_queue(q); out_queue: blk_mq_free_tag_set(set); out_tag_set: diff --git a/block/bsg.c b/block/bsg.c index 2ab1351eb082..8eba57b9bb46 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -175,8 +175,10 @@ static void bsg_device_release(struct device *dev) void bsg_unregister_queue(struct bsg_device *bd) { - if (bd->queue->kobj.sd) - sysfs_remove_link(&bd->queue->kobj, "bsg"); + struct gendisk *disk = bd->queue->disk; + + if (disk && disk->queue_kobj.sd) + sysfs_remove_link(&disk->queue_kobj, "bsg"); cdev_device_del(&bd->cdev, &bd->device); put_device(&bd->device); } @@ -216,8 +218,9 @@ struct bsg_device *bsg_register_queue(struct request_queue *q, if (ret) goto out_put_device; - if (q->kobj.sd) { - ret = sysfs_create_link(&q->kobj, &bd->device.kobj, "bsg"); + if (q->disk && q->disk->queue_kobj.sd) { + ret = sysfs_create_link(&q->disk->queue_kobj, &bd->device.kobj, + "bsg"); if (ret) goto out_device_del; } diff --git a/block/elevator.c b/block/elevator.c index bd71f0fc4e4b..adee58e48e2d 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -57,7 +57,7 @@ static LIST_HEAD(elv_list); * Query io scheduler to see if the current process issuing bio may be * merged with rq. 
*/ -static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio) +static bool elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio) { struct request_queue *q = rq->q; struct elevator_queue *e = q->elevator; @@ -65,7 +65,7 @@ static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio) if (e->type->ops.allow_merge) return e->type->ops.allow_merge(q, rq, bio); - return 1; + return true; } /* @@ -83,78 +83,45 @@ bool elv_bio_merge_ok(struct request *rq, struct bio *bio) } EXPORT_SYMBOL(elv_bio_merge_ok); -static inline bool elv_support_features(unsigned int elv_features, - unsigned int required_features) +static inline bool elv_support_features(struct request_queue *q, + const struct elevator_type *e) { - return (required_features & elv_features) == required_features; + return (q->required_elevator_features & e->elevator_features) == + q->required_elevator_features; } /** - * elevator_match - Test an elevator name and features + * elevator_match - Check whether @e's name or alias matches @name * @e: Scheduler to test * @name: Elevator name to test - * @required_features: Features that the elevator must provide * - * Return true if the elevator @e name matches @name and if @e provides all - * the features specified by @required_features. + * Return true if the elevator @e's name or alias matches @name. 
*/ -static bool elevator_match(const struct elevator_type *e, const char *name, - unsigned int required_features) +static bool elevator_match(const struct elevator_type *e, const char *name) { - if (!elv_support_features(e->elevator_features, required_features)) - return false; - if (!strcmp(e->elevator_name, name)) - return true; - if (e->elevator_alias && !strcmp(e->elevator_alias, name)) - return true; - - return false; + return !strcmp(e->elevator_name, name) || + (e->elevator_alias && !strcmp(e->elevator_alias, name)); } -/** - * elevator_find - Find an elevator - * @name: Name of the elevator to find - * @required_features: Features that the elevator must provide - * - * Return the first registered scheduler with name @name and supporting the - * features @required_features and NULL otherwise. - */ -static struct elevator_type *elevator_find(const char *name, - unsigned int required_features) +static struct elevator_type *__elevator_find(const char *name) { struct elevator_type *e; - list_for_each_entry(e, &elv_list, list) { - if (elevator_match(e, name, required_features)) + list_for_each_entry(e, &elv_list, list) + if (elevator_match(e, name)) return e; - } - return NULL; } -static void elevator_put(struct elevator_type *e) -{ - module_put(e->elevator_owner); -} - -static struct elevator_type *elevator_get(struct request_queue *q, - const char *name, bool try_loading) +static struct elevator_type *elevator_find_get(struct request_queue *q, + const char *name) { struct elevator_type *e; spin_lock(&elv_list_lock); - - e = elevator_find(name, q->required_elevator_features); - if (!e && try_loading) { - spin_unlock(&elv_list_lock); - request_module("%s-iosched", name); - spin_lock(&elv_list_lock); - e = elevator_find(name, q->required_elevator_features); - } - - if (e && !try_module_get(e->elevator_owner)) + e = __elevator_find(name); + if (e && (!elv_support_features(q, e) || !elevator_tryget(e))) e = NULL; - spin_unlock(&elv_list_lock); return e; } @@ -170,6 
+137,7 @@ struct elevator_queue *elevator_alloc(struct request_queue *q, if (unlikely(!eq)) return NULL; + __elevator_get(e); eq->type = e; kobject_init(&eq->kobj, &elv_ktype); mutex_init(&eq->sysfs_lock); @@ -499,7 +467,7 @@ int elv_register_queue(struct request_queue *q, bool uevent) lockdep_assert_held(&q->sysfs_lock); - error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched"); + error = kobject_add(&e->kobj, &q->disk->queue_kobj, "iosched"); if (!error) { struct elv_fs_entry *attr = e->type->elevator_attrs; if (attr) { @@ -512,7 +480,7 @@ int elv_register_queue(struct request_queue *q, bool uevent) if (uevent) kobject_uevent(&e->kobj, KOBJ_ADD); - e->registered = 1; + set_bit(ELEVATOR_FLAG_REGISTERED, &e->flags); } return error; } @@ -523,13 +491,9 @@ void elv_unregister_queue(struct request_queue *q) lockdep_assert_held(&q->sysfs_lock); - if (e && e->registered) { - struct elevator_queue *e = q->elevator; - + if (e && test_and_clear_bit(ELEVATOR_FLAG_REGISTERED, &e->flags)) { kobject_uevent(&e->kobj, KOBJ_REMOVE); kobject_del(&e->kobj); - - e->registered = 0; } } @@ -555,7 +519,7 @@ int elv_register(struct elevator_type *e) /* register, don't allow duplicate names */ spin_lock(&elv_list_lock); - if (elevator_find(e->elevator_name, 0)) { + if (__elevator_find(e->elevator_name)) { spin_unlock(&elv_list_lock); kmem_cache_destroy(e->icq_cache); return -EBUSY; @@ -588,39 +552,6 @@ void elv_unregister(struct elevator_type *e) } EXPORT_SYMBOL_GPL(elv_unregister); -static int elevator_switch_mq(struct request_queue *q, - struct elevator_type *new_e) -{ - int ret; - - lockdep_assert_held(&q->sysfs_lock); - - if (q->elevator) { - elv_unregister_queue(q); - elevator_exit(q); - } - - ret = blk_mq_init_sched(q, new_e); - if (ret) - goto out; - - if (new_e) { - ret = elv_register_queue(q, true); - if (ret) { - elevator_exit(q); - goto out; - } - } - - if (new_e) - blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); - else - blk_add_trace_msg(q, "elv switch: 
none"); - -out: - return ret; -} - static inline bool elv_support_iosched(struct request_queue *q) { if (!queue_is_mq(q) || @@ -642,7 +573,7 @@ static struct elevator_type *elevator_get_default(struct request_queue *q) !blk_mq_is_shared_tags(q->tag_set->flags)) return NULL; - return elevator_get(q, "mq-deadline", false); + return elevator_find_get(q, "mq-deadline"); } /* @@ -656,14 +587,13 @@ static struct elevator_type *elevator_get_by_features(struct request_queue *q) spin_lock(&elv_list_lock); list_for_each_entry(e, &elv_list, list) { - if (elv_support_features(e->elevator_features, - q->required_elevator_features)) { + if (elv_support_features(q, e)) { found = e; break; } } - if (found && !try_module_get(found->elevator_owner)) + if (found && !elevator_tryget(found)) found = NULL; spin_unlock(&elv_list_lock); @@ -713,115 +643,147 @@ void elevator_init_mq(struct request_queue *q) if (err) { pr_warn("\"%s\" elevator initialization failed, " "falling back to \"none\"\n", e->elevator_name); - elevator_put(e); } + + elevator_put(e); } /* - * switch to new_e io scheduler. be careful not to introduce deadlocks - - * we don't free the old io scheduler, before we have allocated what we - * need for the new one. this way we have a chance of going back to the old - * one, if the new one fails init for some reason. + * Switch to new_e io scheduler. + * + * If switching fails, we are most likely running out of memory and not able + * to restore the old io scheduler, so leaving the io scheduler being none. 
*/ int elevator_switch(struct request_queue *q, struct elevator_type *new_e) { - int err; + int ret; lockdep_assert_held(&q->sysfs_lock); blk_mq_freeze_queue(q); blk_mq_quiesce_queue(q); - err = elevator_switch_mq(q, new_e); + if (q->elevator) { + elv_unregister_queue(q); + elevator_exit(q); + } + + ret = blk_mq_init_sched(q, new_e); + if (ret) + goto out_unfreeze; + + ret = elv_register_queue(q, true); + if (ret) { + elevator_exit(q); + goto out_unfreeze; + } + blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); +out_unfreeze: blk_mq_unquiesce_queue(q); blk_mq_unfreeze_queue(q); - return err; + if (ret) { + pr_warn("elv: switch to \"%s\" failed, falling back to \"none\"\n", + new_e->elevator_name); + } + + return ret; +} + +void elevator_disable(struct request_queue *q) +{ + lockdep_assert_held(&q->sysfs_lock); + + blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); + + elv_unregister_queue(q); + elevator_exit(q); + blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q); + q->elevator = NULL; + q->nr_requests = q->tag_set->queue_depth; + blk_add_trace_msg(q, "elv switch: none"); + + blk_mq_unquiesce_queue(q); + blk_mq_unfreeze_queue(q); } /* * Switch this queue to the given IO scheduler. 
*/ -static int __elevator_change(struct request_queue *q, const char *name) +static int elevator_change(struct request_queue *q, const char *elevator_name) { - char elevator_name[ELV_NAME_MAX]; struct elevator_type *e; + int ret; /* Make sure queue is not in the middle of being removed */ if (!blk_queue_registered(q)) return -ENOENT; - /* - * Special case for mq, turn off scheduling - */ - if (!strncmp(name, "none", 4)) { - if (!q->elevator) - return 0; - return elevator_switch(q, NULL); + if (!strncmp(elevator_name, "none", 4)) { + if (q->elevator) + elevator_disable(q); + return 0; } - strlcpy(elevator_name, name, sizeof(elevator_name)); - e = elevator_get(q, strstrip(elevator_name), true); - if (!e) - return -EINVAL; - - if (q->elevator && - elevator_match(q->elevator->type, elevator_name, 0)) { - elevator_put(e); + if (q->elevator && elevator_match(q->elevator->type, elevator_name)) return 0; - } - return elevator_switch(q, e); + e = elevator_find_get(q, elevator_name); + if (!e) { + request_module("%s-iosched", elevator_name); + e = elevator_find_get(q, elevator_name); + if (!e) + return -EINVAL; + } + ret = elevator_switch(q, e); + elevator_put(e); + return ret; } -ssize_t elv_iosched_store(struct request_queue *q, const char *name, +ssize_t elv_iosched_store(struct request_queue *q, const char *buf, size_t count) { + char elevator_name[ELV_NAME_MAX]; int ret; if (!elv_support_iosched(q)) return count; - ret = __elevator_change(q, name); + strlcpy(elevator_name, buf, sizeof(elevator_name)); + ret = elevator_change(q, strstrip(elevator_name)); if (!ret) return count; - return ret; } ssize_t elv_iosched_show(struct request_queue *q, char *name) { - struct elevator_queue *e = q->elevator; - struct elevator_type *elv = NULL; - struct elevator_type *__e; + struct elevator_queue *eq = q->elevator; + struct elevator_type *cur = NULL, *e; int len = 0; - if (!queue_is_mq(q)) + if (!elv_support_iosched(q)) return sprintf(name, "none\n"); - if (!q->elevator) + if 
(!q->elevator) { len += sprintf(name+len, "[none] "); - else - elv = e->type; + } else { + len += sprintf(name+len, "none "); + cur = eq->type; + } spin_lock(&elv_list_lock); - list_for_each_entry(__e, &elv_list, list) { - if (elv && elevator_match(elv, __e->elevator_name, 0)) { - len += sprintf(name+len, "[%s] ", elv->elevator_name); - continue; - } - if (elv_support_iosched(q) && - elevator_match(__e, __e->elevator_name, - q->required_elevator_features)) - len += sprintf(name+len, "%s ", __e->elevator_name); + list_for_each_entry(e, &elv_list, list) { + if (e == cur) + len += sprintf(name+len, "[%s] ", e->elevator_name); + else if (elv_support_features(q, e)) + len += sprintf(name+len, "%s ", e->elevator_name); } spin_unlock(&elv_list_lock); - if (q->elevator) - len += sprintf(name+len, "none"); - - len += sprintf(len+name, "\n"); + len += sprintf(name+len, "\n"); return len; } diff --git a/block/elevator.h b/block/elevator.h index 3f0593b3bf9d..774a8f6b99e6 100644 --- a/block/elevator.h +++ b/block/elevator.h @@ -84,6 +84,21 @@ struct elevator_type struct list_head list; }; +static inline bool elevator_tryget(struct elevator_type *e) +{ + return try_module_get(e->elevator_owner); +} + +static inline void __elevator_get(struct elevator_type *e) +{ + __module_get(e->elevator_owner); +} + +static inline void elevator_put(struct elevator_type *e) +{ + module_put(e->elevator_owner); +} + #define ELV_HASH_BITS 6 void elv_rqhash_del(struct request_queue *q, struct request *rq); @@ -100,10 +115,13 @@ struct elevator_queue void *elevator_data; struct kobject kobj; struct mutex sysfs_lock; - unsigned int registered:1; + unsigned long flags; DECLARE_HASHTABLE(hash, ELV_HASH_BITS); }; +#define ELEVATOR_FLAG_REGISTERED 0 +#define ELEVATOR_FLAG_DISABLE_WBT 1 + /* * block elevator interface */ diff --git a/block/fops.c b/block/fops.c index b90742595317..50d245e8c913 100644 --- a/block/fops.c +++ b/block/fops.c @@ -405,12 +405,6 @@ static int blkdev_write_end(struct file *file, 
struct address_space *mapping, return ret; } -static int blkdev_writepages(struct address_space *mapping, - struct writeback_control *wbc) -{ - return generic_writepages(mapping, wbc); -} - const struct address_space_operations def_blk_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, @@ -419,7 +413,6 @@ const struct address_space_operations def_blk_aops = { .writepage = blkdev_writepage, .write_begin = blkdev_write_begin, .write_end = blkdev_write_end, - .writepages = blkdev_writepages, .direct_IO = blkdev_direct_IO, .migrate_folio = buffer_migrate_folio_norefs, .is_dirty_writeback = buffer_check_dirty_writeback, diff --git a/block/genhd.c b/block/genhd.c index 0f9769db2de8..08f76135a637 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -356,7 +356,7 @@ void disk_uevent(struct gendisk *disk, enum kobject_action action) } EXPORT_SYMBOL_GPL(disk_uevent); -int disk_scan_partitions(struct gendisk *disk, fmode_t mode) +int disk_scan_partitions(struct gendisk *disk, fmode_t mode, void *owner) { struct block_device *bdev; @@ -366,6 +366,9 @@ int disk_scan_partitions(struct gendisk *disk, fmode_t mode) return -EINVAL; if (disk->open_partitions) return -EBUSY; + /* Someone else has bdev exclusively open? 
*/ + if (disk->part0->bd_holder && disk->part0->bd_holder != owner) + return -EBUSY; set_bit(GD_NEED_PART_SCAN, &disk->state); bdev = blkdev_get_by_dev(disk_devt(disk), mode, NULL); @@ -479,10 +482,6 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, goto out_put_holder_dir; } - ret = bd_register_pending_holders(disk); - if (ret < 0) - goto out_put_slave_dir; - ret = blk_register_queue(disk); if (ret) goto out_put_slave_dir; @@ -500,7 +499,7 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, bdev_add(disk->part0, ddev->devt); if (get_capacity(disk)) - disk_scan_partitions(disk, FMODE_READ); + disk_scan_partitions(disk, FMODE_READ, NULL); /* * Announce the disk and partitions after all partitions are @@ -530,6 +529,7 @@ out_unregister_queue: rq_qos_exit(disk->queue); out_put_slave_dir: kobject_put(disk->slave_dir); + disk->slave_dir = NULL; out_put_holder_dir: kobject_put(disk->part0->bd_holder_dir); out_del_integrity: @@ -560,6 +560,11 @@ void blk_mark_disk_dead(struct gendisk *disk) { set_bit(GD_DEAD, &disk->state); blk_queue_start_drain(disk->queue); + + /* + * Stop buffered writers from dirtying pages that can't be written out. 
+ */ + set_capacity_and_notify(disk, 0); } EXPORT_SYMBOL_GPL(blk_mark_disk_dead); @@ -629,6 +634,7 @@ void del_gendisk(struct gendisk *disk) kobject_put(disk->part0->bd_holder_dir); kobject_put(disk->slave_dir); + disk->slave_dir = NULL; part_stat_set_all(disk->part0, 0); disk->part0->bd_stamp = 0; @@ -643,7 +649,9 @@ void del_gendisk(struct gendisk *disk) blk_sync_queue(q); blk_flush_integrity(); - blk_mq_cancel_work_sync(q); + + if (queue_is_mq(q)) + blk_mq_cancel_work_sync(q); blk_mq_quiesce_queue(q); if (q->elevator) { @@ -1193,21 +1201,10 @@ struct class block_class = { .dev_uevent = block_uevent, }; -static char *block_devnode(struct device *dev, umode_t *mode, - kuid_t *uid, kgid_t *gid) -{ - struct gendisk *disk = dev_to_disk(dev); - - if (disk->fops->devnode) - return disk->fops->devnode(disk, mode); - return NULL; -} - const struct device_type disk_type = { .name = "disk", .groups = disk_attr_groups, .release = disk_release, - .devnode = block_devnode, }; #ifdef CONFIG_PROC_FS @@ -1412,7 +1409,7 @@ struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass) struct request_queue *q; struct gendisk *disk; - q = blk_alloc_queue(node, false); + q = blk_alloc_queue(node); if (!q) return NULL; diff --git a/block/holder.c b/block/holder.c index 5283bc804cc1..37d18c13d958 100644 --- a/block/holder.c +++ b/block/holder.c @@ -4,7 +4,7 @@ struct bd_holder_disk { struct list_head list; - struct block_device *bdev; + struct kobject *holder_dir; int refcnt; }; @@ -14,7 +14,7 @@ static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev, struct bd_holder_disk *holder; list_for_each_entry(holder, &disk->slave_bdevs, list) - if (holder->bdev == bdev) + if (holder->holder_dir == bdev->bd_holder_dir) return holder; return NULL; } @@ -29,19 +29,6 @@ static void del_symlink(struct kobject *from, struct kobject *to) sysfs_remove_link(from, kobject_name(to)); } -static int __link_disk_holder(struct block_device *bdev, struct gendisk *disk) -{ - 
int ret; - - ret = add_symlink(disk->slave_dir, bdev_kobj(bdev)); - if (ret) - return ret; - ret = add_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); - if (ret) - del_symlink(disk->slave_dir, bdev_kobj(bdev)); - return ret; -} - /** * bd_link_disk_holder - create symlinks between holding disk and slave bdev * @bdev: the claimed slave bdev @@ -75,12 +62,30 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) struct bd_holder_disk *holder; int ret = 0; - mutex_lock(&disk->open_mutex); + if (WARN_ON_ONCE(!disk->slave_dir)) + return -EINVAL; + + if (bdev->bd_disk == disk) + return -EINVAL; + + /* + * del_gendisk drops the initial reference to bd_holder_dir, so we + * need to keep our own here to allow for cleanup past that point. + */ + mutex_lock(&bdev->bd_disk->open_mutex); + if (!disk_live(bdev->bd_disk)) { + mutex_unlock(&bdev->bd_disk->open_mutex); + return -ENODEV; + } + kobject_get(bdev->bd_holder_dir); + mutex_unlock(&bdev->bd_disk->open_mutex); + mutex_lock(&disk->open_mutex); WARN_ON_ONCE(!bdev->bd_holder); holder = bd_find_holder_disk(bdev, disk); if (holder) { + kobject_put(bdev->bd_holder_dir); holder->refcnt++; goto out_unlock; } @@ -92,36 +97,32 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) } INIT_LIST_HEAD(&holder->list); - holder->bdev = bdev; holder->refcnt = 1; - if (disk->slave_dir) { - ret = __link_disk_holder(bdev, disk); - if (ret) { - kfree(holder); - goto out_unlock; - } - } + holder->holder_dir = bdev->bd_holder_dir; + ret = add_symlink(disk->slave_dir, bdev_kobj(bdev)); + if (ret) + goto out_free_holder; + ret = add_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); + if (ret) + goto out_del_symlink; list_add(&holder->list, &disk->slave_bdevs); - /* - * del_gendisk drops the initial reference to bd_holder_dir, so we need - * to keep our own here to allow for cleanup past that point. 
- */ - kobject_get(bdev->bd_holder_dir); + mutex_unlock(&disk->open_mutex); + return 0; + +out_del_symlink: + del_symlink(disk->slave_dir, bdev_kobj(bdev)); +out_free_holder: + kfree(holder); out_unlock: mutex_unlock(&disk->open_mutex); + if (ret) + kobject_put(bdev->bd_holder_dir); return ret; } EXPORT_SYMBOL_GPL(bd_link_disk_holder); -static void __unlink_disk_holder(struct block_device *bdev, - struct gendisk *disk) -{ - del_symlink(disk->slave_dir, bdev_kobj(bdev)); - del_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); -} - /** * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder() * @bdev: the calimed slave bdev @@ -136,36 +137,18 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) { struct bd_holder_disk *holder; + if (WARN_ON_ONCE(!disk->slave_dir)) + return; + mutex_lock(&disk->open_mutex); holder = bd_find_holder_disk(bdev, disk); if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { - if (disk->slave_dir) - __unlink_disk_holder(bdev, disk); - kobject_put(bdev->bd_holder_dir); + del_symlink(disk->slave_dir, bdev_kobj(bdev)); + del_symlink(holder->holder_dir, &disk_to_dev(disk)->kobj); + kobject_put(holder->holder_dir); list_del_init(&holder->list); kfree(holder); } mutex_unlock(&disk->open_mutex); } EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); - -int bd_register_pending_holders(struct gendisk *disk) -{ - struct bd_holder_disk *holder; - int ret; - - mutex_lock(&disk->open_mutex); - list_for_each_entry(holder, &disk->slave_bdevs, list) { - ret = __link_disk_holder(holder->bdev, disk); - if (ret) - goto out_undo; - } - mutex_unlock(&disk->open_mutex); - return 0; - -out_undo: - list_for_each_entry_continue_reverse(holder, &disk->slave_bdevs, list) - __unlink_disk_holder(holder->bdev, disk); - mutex_unlock(&disk->open_mutex); - return ret; -} diff --git a/block/ioctl.c b/block/ioctl.c index 60121e89052b..96617512982e 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -467,9 +467,10 @@ static int 
blkdev_bszset(struct block_device *bdev, fmode_t mode, * user space. Note the separate arg/argp parameters that are needed * to deal with the compat_ptr() conversion. */ -static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode, - unsigned cmd, unsigned long arg, void __user *argp) +static int blkdev_common_ioctl(struct file *file, fmode_t mode, unsigned cmd, + unsigned long arg, void __user *argp) { + struct block_device *bdev = I_BDEV(file->f_mapping->host); unsigned int max_sectors; switch (cmd) { @@ -527,7 +528,8 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode, return -EACCES; if (bdev_is_partition(bdev)) return -EINVAL; - return disk_scan_partitions(bdev->bd_disk, mode & ~FMODE_EXCL); + return disk_scan_partitions(bdev->bd_disk, mode & ~FMODE_EXCL, + file); case BLKTRACESTART: case BLKTRACESTOP: case BLKTRACETEARDOWN: @@ -605,7 +607,7 @@ long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) break; } - ret = blkdev_common_ioctl(bdev, mode, cmd, arg, argp); + ret = blkdev_common_ioctl(file, mode, cmd, arg, argp); if (ret != -ENOIOCTLCMD) return ret; @@ -674,7 +676,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) break; } - ret = blkdev_common_ioctl(bdev, mode, cmd, arg, argp); + ret = blkdev_common_ioctl(file, mode, cmd, arg, argp); if (ret == -ENOIOCTLCMD && disk->fops->compat_ioctl) ret = disk->fops->compat_ioctl(bdev, mode, cmd, arg); diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 5639921dfa92..f10c2a0d18d4 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -131,6 +131,20 @@ static u8 dd_rq_ioclass(struct request *rq) } /* + * get the request before `rq' in sector-sorted order + */ +static inline struct request * +deadline_earlier_request(struct request *rq) +{ + struct rb_node *node = rb_prev(&rq->rb_node); + + if (node) + return rb_entry_rq(node); + + return NULL; +} + +/* * get the request after `rq' in sector-sorted order */ static inline 
struct request * @@ -278,6 +292,39 @@ static inline int deadline_check_fifo(struct dd_per_prio *per_prio, } /* + * Check if rq has a sequential request preceding it. + */ +static bool deadline_is_seq_write(struct deadline_data *dd, struct request *rq) +{ + struct request *prev = deadline_earlier_request(rq); + + if (!prev) + return false; + + return blk_rq_pos(prev) + blk_rq_sectors(prev) == blk_rq_pos(rq); +} + +/* + * Skip all write requests that are sequential from @rq, even if we cross + * a zone boundary. + */ +static struct request *deadline_skip_seq_writes(struct deadline_data *dd, + struct request *rq) +{ + sector_t pos = blk_rq_pos(rq); + sector_t skipped_sectors = 0; + + while (rq) { + if (blk_rq_pos(rq) != pos + skipped_sectors) + break; + skipped_sectors += blk_rq_sectors(rq); + rq = deadline_latter_request(rq); + } + + return rq; +} + +/* * For the specified data direction, return the next request to * dispatch using arrival ordered lists. */ @@ -297,11 +344,16 @@ deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio, /* * Look for a write request that can be dispatched, that is one with - * an unlocked target zone. + * an unlocked target zone. For some HDDs, breaking a sequential + * write stream can lead to lower throughput, so make sure to preserve + * sequential write streams, even if that stream crosses into the next + * zones and these zones are unlocked. */ spin_lock_irqsave(&dd->zone_lock, flags); list_for_each_entry(rq, &per_prio->fifo_list[DD_WRITE], queuelist) { - if (blk_req_can_dispatch_to_zone(rq)) + if (blk_req_can_dispatch_to_zone(rq) && + (blk_queue_nonrot(rq->q) || + !deadline_is_seq_write(dd, rq))) goto out; } rq = NULL; @@ -331,13 +383,19 @@ deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio, /* * Look for a write request that can be dispatched, that is one with - * an unlocked target zone. + * an unlocked target zone. 
For some HDDs, breaking a sequential + * write stream can lead to lower throughput, so make sure to preserve + * sequential write streams, even if that stream crosses into the next + * zones and these zones are unlocked. */ spin_lock_irqsave(&dd->zone_lock, flags); while (rq) { if (blk_req_can_dispatch_to_zone(rq)) break; - rq = deadline_latter_request(rq); + if (blk_queue_nonrot(rq->q)) + rq = deadline_latter_request(rq); + else + rq = deadline_skip_seq_writes(dd, rq); } spin_unlock_irqrestore(&dd->zone_lock, flags); @@ -789,6 +847,18 @@ static void dd_prepare_request(struct request *rq) rq->elv.priv[0] = NULL; } +static bool dd_has_write_work(struct blk_mq_hw_ctx *hctx) +{ + struct deadline_data *dd = hctx->queue->elevator->elevator_data; + enum dd_prio p; + + for (p = 0; p <= DD_PRIO_MAX; p++) + if (!list_empty_careful(&dd->per_prio[p].fifo_list[DD_WRITE])) + return true; + + return false; +} + /* * Callback from inside blk_mq_free_request(). * @@ -828,9 +898,10 @@ static void dd_finish_request(struct request *rq) spin_lock_irqsave(&dd->zone_lock, flags); blk_req_zone_write_unlock(rq); - if (!list_empty(&per_prio->fifo_list[DD_WRITE])) - blk_mq_sched_mark_restart_hctx(rq->mq_hctx); spin_unlock_irqrestore(&dd->zone_lock, flags); + + if (dd_has_write_work(rq->mq_hctx)) + blk_mq_sched_mark_restart_hctx(rq->mq_hctx); } } diff --git a/block/sed-opal.c b/block/sed-opal.c index 9bdb833e5817..463873f61e01 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -2461,6 +2461,44 @@ static int __opal_set_mbr_done(struct opal_dev *dev, struct opal_key *key) return execute_steps(dev, mbrdone_step, ARRAY_SIZE(mbrdone_step)); } +static void opal_lock_check_for_saved_key(struct opal_dev *dev, + struct opal_lock_unlock *lk_unlk) +{ + struct opal_suspend_data *iter; + + if (lk_unlk->l_state != OPAL_LK || + lk_unlk->session.opal_key.key_len > 0) + return; + + /* + * Usually when closing a crypto device (eg: dm-crypt with LUKS) the + * volume key is not required, as it requires 
root privileges anyway, + * and root can deny access to a disk in many ways regardless. + * Requiring the volume key to lock the device is a peculiarity of the + * OPAL specification. Given we might already have saved the key if + * the user requested it via the 'IOC_OPAL_SAVE' ioctl, we can use + * that key to lock the device if no key was provided here, the + * locking range matches and the appropriate flag was passed with + * 'IOC_OPAL_SAVE'. + * This allows integrating OPAL with tools and libraries that are used + * to the common behaviour and do not ask for the volume key when + * closing a device. + */ + setup_opal_dev(dev); + list_for_each_entry(iter, &dev->unlk_lst, node) { + if ((iter->unlk.flags & OPAL_SAVE_FOR_LOCK) && + iter->lr == lk_unlk->session.opal_key.lr && + iter->unlk.session.opal_key.key_len > 0) { + lk_unlk->session.opal_key.key_len = + iter->unlk.session.opal_key.key_len; + memcpy(lk_unlk->session.opal_key.key, + iter->unlk.session.opal_key.key, + iter->unlk.session.opal_key.key_len); + break; + } + } +} + static int opal_lock_unlock(struct opal_dev *dev, struct opal_lock_unlock *lk_unlk) { @@ -2470,6 +2508,7 @@ static int opal_lock_unlock(struct opal_dev *dev, return -EINVAL; mutex_lock(&dev->dev_lock); + opal_lock_check_for_saved_key(dev, lk_unlk); ret = __opal_lock_unlock(dev, lk_unlk); mutex_unlock(&dev->dev_lock); |