author     Linus Torvalds <torvalds@linux-foundation.org>  2020-08-03 11:57:03 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>  2020-08-03 11:57:03 -0700
commit     382625d0d4325fb14a29444eb8dce8dcc2eb9b51 (patch)
tree       de35ff523e65c3a98fd3ac3a3595d517856d03da /block
parent     99f6cf61f175c1239ed8e86d4a1757c380da52d1 (diff)
parent     d958e343bdc3de2643ce25225bed082dc222858d (diff)
Merge tag 'for-5.9/block-20200802' of git://git.kernel.dk/linux-block
Pull core block updates from Jens Axboe:
"Good amount of cleanups and tech debt removals in here, and as a
result, the diffstat shows a nice net reduction in code.
- Softirq completion cleanups (Christoph)
- Stop using ->queuedata (Christoph)
- Cleanup bd claiming (Christoph)
- Use check_events, moving away from the legacy media change
(Christoph)
- Use inode i_blkbits consistently (Christoph)
- Remove old unused writeback congestion bits (Christoph)
- Cleanup/unify submission path (Christoph)
- Use bio_uninit consistently, instead of bio_disassociate_blkg
(Christoph)
- sbitmap cleared bits handling (John)
- Request merging blktrace event addition (Jan)
- sysfs add/remove race fixes (Luis)
- blk-mq tag fixes/optimizations (Ming)
- Duplicate words in comments (Randy)
- Flush deferral cleanup (Yufen)
- IO context locking/retry fixes (John)
- struct_size() usage (Gustavo)
- blk-iocost fixes (Chengming)
- blk-cgroup IO stats fixes (Boris)
- Various little fixes"
* tag 'for-5.9/block-20200802' of git://git.kernel.dk/linux-block: (135 commits)
block: blk-timeout: delete duplicated word
block: blk-mq-sched: delete duplicated word
block: blk-mq: delete duplicated word
block: genhd: delete duplicated words
block: elevator: delete duplicated word and fix typos
block: bio: delete duplicated words
block: bfq-iosched: fix duplicated word
iocost_monitor: start from the oldest usage index
iocost: Fix check condition of iocg abs_vdebt
block: Remove callback typedefs for blk_mq_ops
block: Use non _rcu version of list functions for tag_set_list
blk-cgroup: show global disk stats in root cgroup io.stat
blk-cgroup: make iostat functions visible to stat printing
block: improve discard bio alignment in __blkdev_issue_discard()
block: change REQ_OP_ZONE_RESET and REQ_OP_ZONE_RESET_ALL to be odd numbers
block: defer flush request no matter whether we have elevator
block: make blk_timeout_init() static
block: remove retry loop in ioc_release_fn()
block: remove unnecessary ioc nested locking
block: integrate bd_start_claiming into __blkdev_get
...
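
The "struct_size() usage (Gustavo)" item above shows up in the bio_alloc_bioset() and ioc_pd_alloc() hunks further down. As a rough sketch of the pattern (names taken from the bio.c hunk; the surrounding allocation context is elided):

	/* Before: open-coded size of a struct with a trailing flexible array.
	 * The multiplication can overflow and silently undersize the allocation. */
	p = kmalloc(sizeof(struct bio) + nr_iovecs * sizeof(struct bio_vec), gfp_mask);

	/* After: struct_size() (from <linux/overflow.h>) computes the same size
	 * but saturates on overflow, so the allocation fails instead. */
	p = kmalloc(struct_size(bio, bi_inline_vecs, nr_iovecs), gfp_mask);
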
Diffstat (limited to 'block')
-rw-r--r--  block/Makefile              |   2
-rw-r--r--  block/bfq-iosched.c         |   2
-rw-r--r--  block/bio.c                 | 165
-rw-r--r--  block/blk-cgroup.c          | 402
-rw-r--r--  block/blk-core.c            | 286
-rw-r--r--  block/blk-crypto-fallback.c |   2
-rw-r--r--  block/blk-crypto.c          |   2
-rw-r--r--  block/blk-flush.c           |  23
-rw-r--r--  block/blk-ioc.c             |  42
-rw-r--r--  block/blk-iocost.c          |   5
-rw-r--r--  block/blk-iolatency.c       |   3
-rw-r--r--  block/blk-lib.c             |  31
-rw-r--r--  block/blk-merge.c           |  25
-rw-r--r--  block/blk-mq-debugfs.c      |   8
-rw-r--r--  block/blk-mq-sched.c        | 103
-rw-r--r--  block/blk-mq-tag.c          |  62
-rw-r--r--  block/blk-mq-tag.h          |  41
-rw-r--r--  block/blk-mq.c              | 396
-rw-r--r--  block/blk-mq.h              |  17
-rw-r--r--  block/blk-softirq.c         | 156
-rw-r--r--  block/blk-sysfs.c           |  52
-rw-r--r--  block/blk-throttle.c        |  14
-rw-r--r--  block/blk-timeout.c         |  30
-rw-r--r--  block/blk.h                 |  37
-rw-r--r--  block/bounce.c              |   2
-rw-r--r--  block/bsg-lib.c             |   5
-rw-r--r--  block/elevator.c            |   4
-rw-r--r--  block/genhd.c               |  85
-rw-r--r--  block/partitions/core.c     |   2
29 files changed, 1054 insertions, 950 deletions
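
The bulk of the blk-core.c and blk-mq.c churn below is the submission-path unification: generic_make_request() is renamed to submit_bio_noacct(), the per-queue make_request_fn is dropped, and bio-based drivers instead supply a ->submit_bio method in their block_device_operations. A minimal sketch of the driver-side shape after this series (the mydrv_* names and the backing_bdev field are hypothetical, not from this diff):

	static blk_qc_t mydrv_submit_bio(struct bio *bio)
	{
		struct mydrv *dev = bio->bi_disk->private_data;

		/* The split helper now derives the queue from the bio itself. */
		blk_queue_split(&bio);

		/* Remap to the backing device and hand the bio back to the core;
		 * submit_bio_noacct() replaces generic_make_request() here. */
		bio_set_dev(bio, dev->backing_bdev);
		return submit_bio_noacct(bio);
	}

	static const struct block_device_operations mydrv_fops = {
		.owner		= THIS_MODULE,
		.submit_bio	= mydrv_submit_bio,
	};

	/* Queue allocation no longer takes a make_request callback. */
	q = blk_alloc_queue(NUMA_NO_NODE);
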
diff --git a/block/Makefile b/block/Makefile index 78719169fb2a..8d841f5f986f 100644 --- a/block/Makefile +++ b/block/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-sysfs.o \ blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ - blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ + blk-exec.o blk-merge.o blk-timeout.o \ blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \ blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \ genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 50c8f034c01c..a4c0bec920cb 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -4714,7 +4714,7 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) * some unlucky request wait for as long as the device * wishes. * - * Of course, serving one request at at time may cause loss of + * Of course, serving one request at a time may cause loss of * throughput. */ if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0) diff --git a/block/bio.c b/block/bio.c index a7366c02c9b5..c63ba04bd629 100644 --- a/block/bio.c +++ b/block/bio.c @@ -234,8 +234,12 @@ fallback: void bio_uninit(struct bio *bio) { - bio_disassociate_blkg(bio); - +#ifdef CONFIG_BLK_CGROUP + if (bio->bi_blkg) { + blkg_put(bio->bi_blkg); + bio->bi_blkg = NULL; + } +#endif if (bio_integrity(bio)) bio_integrity_free(bio); @@ -354,7 +358,7 @@ static void bio_alloc_rescue(struct work_struct *work) if (!bio) break; - generic_make_request(bio); + submit_bio_noacct(bio); } } @@ -412,19 +416,19 @@ static void punt_bios_to_rescuer(struct bio_set *bs) * submit the previously allocated bio for IO before attempting to allocate * a new one. Failure to do so can cause deadlocks under memory pressure. * - * Note that when running under generic_make_request() (i.e. any block + * Note that when running under submit_bio_noacct() (i.e. any block * driver), bios are not submitted until after you return - see the code in - * generic_make_request() that converts recursion into iteration, to prevent + * submit_bio_noacct() that converts recursion into iteration, to prevent * stack overflows. * * This would normally mean allocating multiple bios under - * generic_make_request() would be susceptible to deadlocks, but we have + * submit_bio_noacct() would be susceptible to deadlocks, but we have * deadlock avoidance code that resubmits any blocked bios from a rescuer * thread. * * However, we do not guarantee forward progress for allocations from other * mempools. Doing multiple allocations from the same mempool under - * generic_make_request() should be avoided - instead, use bio_set's front_pad + * submit_bio_noacct() should be avoided - instead, use bio_set's front_pad * for per bio allocations. * * RETURNS: @@ -444,9 +448,7 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs, if (nr_iovecs > UIO_MAXIOV) return NULL; - p = kmalloc(sizeof(struct bio) + - nr_iovecs * sizeof(struct bio_vec), - gfp_mask); + p = kmalloc(struct_size(bio, bi_inline_vecs, nr_iovecs), gfp_mask); front_pad = 0; inline_vecs = nr_iovecs; } else { @@ -455,14 +457,14 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs, nr_iovecs > 0)) return NULL; /* - * generic_make_request() converts recursion to iteration; this + * submit_bio_noacct() converts recursion to iteration; this * means if we're running beneath it, any bios we allocate and * submit will not be submitted (and thus freed) until after we * return. 
* * This exposes us to a potential deadlock if we allocate * multiple bios from the same bio_set() while running - * underneath generic_make_request(). If we were to allocate + * underneath submit_bio_noacct(). If we were to allocate * multiple bios (say a stacking block driver that was splitting * bios), we would deadlock if we exhausted the mempool's * reserve. @@ -860,7 +862,7 @@ EXPORT_SYMBOL(bio_add_pc_page); * @same_page: return if the segment has been merged inside the same page * * Try to add the data at @page + @off to the last bvec of @bio. This is a - * a useful optimisation for file systems with a block size smaller than the + * useful optimisation for file systems with a block size smaller than the * page size. * * Warn if (@len, @off) crosses pages in case that @same_page is true. @@ -986,7 +988,7 @@ static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter) * Pins pages from *iter and appends them to @bio's bvec array. The * pages will have to be released using put_page() when done. * For multi-segment *iter, this function only adds pages from the - * the next non-empty segment of the iov iterator. + * next non-empty segment of the iov iterator. */ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) { @@ -1625,141 +1627,6 @@ int bioset_init_from_src(struct bio_set *bs, struct bio_set *src) } EXPORT_SYMBOL(bioset_init_from_src); -#ifdef CONFIG_BLK_CGROUP - -/** - * bio_disassociate_blkg - puts back the blkg reference if associated - * @bio: target bio - * - * Helper to disassociate the blkg from @bio if a blkg is associated. - */ -void bio_disassociate_blkg(struct bio *bio) -{ - if (bio->bi_blkg) { - blkg_put(bio->bi_blkg); - bio->bi_blkg = NULL; - } -} -EXPORT_SYMBOL_GPL(bio_disassociate_blkg); - -/** - * __bio_associate_blkg - associate a bio with the a blkg - * @bio: target bio - * @blkg: the blkg to associate - * - * This tries to associate @bio with the specified @blkg. Association failure - * is handled by walking up the blkg tree. Therefore, the blkg associated can - * be anything between @blkg and the root_blkg. This situation only happens - * when a cgroup is dying and then the remaining bios will spill to the closest - * alive blkg. - * - * A reference will be taken on the @blkg and will be released when @bio is - * freed. - */ -static void __bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg) -{ - bio_disassociate_blkg(bio); - - bio->bi_blkg = blkg_tryget_closest(blkg); -} - -/** - * bio_associate_blkg_from_css - associate a bio with a specified css - * @bio: target bio - * @css: target css - * - * Associate @bio with the blkg found by combining the css's blkg and the - * request_queue of the @bio. This falls back to the queue's root_blkg if - * the association fails with the css. - */ -void bio_associate_blkg_from_css(struct bio *bio, - struct cgroup_subsys_state *css) -{ - struct request_queue *q = bio->bi_disk->queue; - struct blkcg_gq *blkg; - - rcu_read_lock(); - - if (!css || !css->parent) - blkg = q->root_blkg; - else - blkg = blkg_lookup_create(css_to_blkcg(css), q); - - __bio_associate_blkg(bio, blkg); - - rcu_read_unlock(); -} -EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css); - -#ifdef CONFIG_MEMCG -/** - * bio_associate_blkg_from_page - associate a bio with the page's blkg - * @bio: target bio - * @page: the page to lookup the blkcg from - * - * Associate @bio with the blkg from @page's owning memcg and the respective - * request_queue. If cgroup_e_css returns %NULL, fall back to the queue's - * root_blkg. 
- */ -void bio_associate_blkg_from_page(struct bio *bio, struct page *page) -{ - struct cgroup_subsys_state *css; - - if (!page->mem_cgroup) - return; - - rcu_read_lock(); - - css = cgroup_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys); - bio_associate_blkg_from_css(bio, css); - - rcu_read_unlock(); -} -#endif /* CONFIG_MEMCG */ - -/** - * bio_associate_blkg - associate a bio with a blkg - * @bio: target bio - * - * Associate @bio with the blkg found from the bio's css and request_queue. - * If one is not found, bio_lookup_blkg() creates the blkg. If a blkg is - * already associated, the css is reused and association redone as the - * request_queue may have changed. - */ -void bio_associate_blkg(struct bio *bio) -{ - struct cgroup_subsys_state *css; - - rcu_read_lock(); - - if (bio->bi_blkg) - css = &bio_blkcg(bio)->css; - else - css = blkcg_css(); - - bio_associate_blkg_from_css(bio, css); - - rcu_read_unlock(); -} -EXPORT_SYMBOL_GPL(bio_associate_blkg); - -/** - * bio_clone_blkg_association - clone blkg association from src to dst bio - * @dst: destination bio - * @src: source bio - */ -void bio_clone_blkg_association(struct bio *dst, struct bio *src) -{ - rcu_read_lock(); - - if (src->bi_blkg) - __bio_associate_blkg(dst, src->bi_blkg); - - rcu_read_unlock(); -} -EXPORT_SYMBOL_GPL(bio_clone_blkg_association); -#endif /* CONFIG_BLK_CGROUP */ - static void __init biovec_init_slabs(void) { int i; diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 0ecc897b225c..619a79b51068 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -95,9 +95,6 @@ static void __blkg_release(struct rcu_head *rcu) css_put(&blkg->blkcg->css); if (blkg->parent) blkg_put(blkg->parent); - - wb_congested_put(blkg->wb_congested); - blkg_free(blkg); } @@ -227,7 +224,6 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct blkcg_gq *new_blkg) { struct blkcg_gq *blkg; - struct bdi_writeback_congested *wb_congested; int i, ret; WARN_ON_ONCE(!rcu_read_lock_held()); @@ -245,31 +241,22 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, goto err_free_blkg; } - wb_congested = wb_congested_get_create(q->backing_dev_info, - blkcg->css.id, - GFP_NOWAIT | __GFP_NOWARN); - if (!wb_congested) { - ret = -ENOMEM; - goto err_put_css; - } - /* allocate */ if (!new_blkg) { new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN); if (unlikely(!new_blkg)) { ret = -ENOMEM; - goto err_put_congested; + goto err_put_css; } } blkg = new_blkg; - blkg->wb_congested = wb_congested; /* link parent */ if (blkcg_parent(blkcg)) { blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false); if (WARN_ON_ONCE(!blkg->parent)) { ret = -ENODEV; - goto err_put_congested; + goto err_put_css; } blkg_get(blkg->parent); } @@ -306,8 +293,6 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, blkg_put(blkg); return ERR_PTR(ret); -err_put_congested: - wb_congested_put(wb_congested); err_put_css: css_put(&blkcg->css); err_free_blkg: @@ -316,30 +301,35 @@ err_free_blkg: } /** - * __blkg_lookup_create - lookup blkg, try to create one if not there + * blkg_lookup_create - lookup blkg, try to create one if not there * @blkcg: blkcg of interest * @q: request_queue of interest * * Lookup blkg for the @blkcg - @q pair. If it doesn't exist, try to * create one. blkg creation is performed recursively from blkcg_root such * that all non-root blkg's have access to the parent blkg. This function - * should be called under RCU read lock and @q->queue_lock. + * should be called under RCU read lock and takes @q->queue_lock. 
* * Returns the blkg or the closest blkg if blkg_create() fails as it walks * down from root. */ -struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, - struct request_queue *q) +static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, + struct request_queue *q) { struct blkcg_gq *blkg; + unsigned long flags; WARN_ON_ONCE(!rcu_read_lock_held()); - lockdep_assert_held(&q->queue_lock); - blkg = __blkg_lookup(blkcg, q, true); + blkg = blkg_lookup(blkcg, q); if (blkg) return blkg; + spin_lock_irqsave(&q->queue_lock, flags); + blkg = __blkg_lookup(blkcg, q, true); + if (blkg) + goto found; + /* * Create blkgs walking down from blkcg_root to @blkcg, so that all * non-root blkgs have access to their parents. Returns the closest @@ -362,34 +352,16 @@ struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, } blkg = blkg_create(pos, q, NULL); - if (IS_ERR(blkg)) - return ret_blkg; + if (IS_ERR(blkg)) { + blkg = ret_blkg; + break; + } if (pos == blkcg) - return blkg; - } -} - -/** - * blkg_lookup_create - find or create a blkg - * @blkcg: target block cgroup - * @q: target request_queue - * - * This looks up or creates the blkg representing the unique pair - * of the blkcg and the request_queue. - */ -struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, - struct request_queue *q) -{ - struct blkcg_gq *blkg = blkg_lookup(blkcg, q); - - if (unlikely(!blkg)) { - unsigned long flags; - - spin_lock_irqsave(&q->queue_lock, flags); - blkg = __blkg_lookup_create(blkcg, q); - spin_unlock_irqrestore(&q->queue_lock, flags); + break; } +found: + spin_unlock_irqrestore(&q->queue_lock, flags); return blkg; } @@ -739,12 +711,137 @@ void blkg_conf_finish(struct blkg_conf_ctx *ctx) } EXPORT_SYMBOL_GPL(blkg_conf_finish); +static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src) +{ + int i; + + for (i = 0; i < BLKG_IOSTAT_NR; i++) { + dst->bytes[i] = src->bytes[i]; + dst->ios[i] = src->ios[i]; + } +} + +static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src) +{ + int i; + + for (i = 0; i < BLKG_IOSTAT_NR; i++) { + dst->bytes[i] += src->bytes[i]; + dst->ios[i] += src->ios[i]; + } +} + +static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src) +{ + int i; + + for (i = 0; i < BLKG_IOSTAT_NR; i++) { + dst->bytes[i] -= src->bytes[i]; + dst->ios[i] -= src->ios[i]; + } +} + +static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) +{ + struct blkcg *blkcg = css_to_blkcg(css); + struct blkcg_gq *blkg; + + rcu_read_lock(); + + hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { + struct blkcg_gq *parent = blkg->parent; + struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu); + struct blkg_iostat cur, delta; + unsigned int seq; + + /* fetch the current per-cpu values */ + do { + seq = u64_stats_fetch_begin(&bisc->sync); + blkg_iostat_set(&cur, &bisc->cur); + } while (u64_stats_fetch_retry(&bisc->sync, seq)); + + /* propagate percpu delta to global */ + u64_stats_update_begin(&blkg->iostat.sync); + blkg_iostat_set(&delta, &cur); + blkg_iostat_sub(&delta, &bisc->last); + blkg_iostat_add(&blkg->iostat.cur, &delta); + blkg_iostat_add(&bisc->last, &delta); + u64_stats_update_end(&blkg->iostat.sync); + + /* propagate global delta to parent */ + if (parent) { + u64_stats_update_begin(&parent->iostat.sync); + blkg_iostat_set(&delta, &blkg->iostat.cur); + blkg_iostat_sub(&delta, &blkg->iostat.last); + blkg_iostat_add(&parent->iostat.cur, &delta); + blkg_iostat_add(&blkg->iostat.last, &delta); + 
u64_stats_update_end(&parent->iostat.sync); + } + } + + rcu_read_unlock(); +} + +/* + * The rstat algorithms intentionally don't handle the root cgroup to avoid + * incurring overhead when no cgroups are defined. For that reason, + * cgroup_rstat_flush in blkcg_print_stat does not actually fill out the + * iostat in the root cgroup's blkcg_gq. + * + * However, we would like to re-use the printing code between the root and + * non-root cgroups to the extent possible. For that reason, we simulate + * flushing the root cgroup's stats by explicitly filling in the iostat + * with disk level statistics. + */ +static void blkcg_fill_root_iostats(void) +{ + struct class_dev_iter iter; + struct device *dev; + + class_dev_iter_init(&iter, &block_class, NULL, &disk_type); + while ((dev = class_dev_iter_next(&iter))) { + struct gendisk *disk = dev_to_disk(dev); + struct hd_struct *part = disk_get_part(disk, 0); + struct blkcg_gq *blkg = blk_queue_root_blkg(disk->queue); + struct blkg_iostat tmp; + int cpu; + + memset(&tmp, 0, sizeof(tmp)); + for_each_possible_cpu(cpu) { + struct disk_stats *cpu_dkstats; + + cpu_dkstats = per_cpu_ptr(part->dkstats, cpu); + tmp.ios[BLKG_IOSTAT_READ] += + cpu_dkstats->ios[STAT_READ]; + tmp.ios[BLKG_IOSTAT_WRITE] += + cpu_dkstats->ios[STAT_WRITE]; + tmp.ios[BLKG_IOSTAT_DISCARD] += + cpu_dkstats->ios[STAT_DISCARD]; + // convert sectors to bytes + tmp.bytes[BLKG_IOSTAT_READ] += + cpu_dkstats->sectors[STAT_READ] << 9; + tmp.bytes[BLKG_IOSTAT_WRITE] += + cpu_dkstats->sectors[STAT_WRITE] << 9; + tmp.bytes[BLKG_IOSTAT_DISCARD] += + cpu_dkstats->sectors[STAT_DISCARD] << 9; + + u64_stats_update_begin(&blkg->iostat.sync); + blkg_iostat_set(&blkg->iostat.cur, &tmp); + u64_stats_update_end(&blkg->iostat.sync); + } + } +} + static int blkcg_print_stat(struct seq_file *sf, void *v) { struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); struct blkcg_gq *blkg; - cgroup_rstat_flush(blkcg->css.cgroup); + if (!seq_css(sf)->parent) + blkcg_fill_root_iostats(); + else + cgroup_rstat_flush(blkcg->css.cgroup); + rcu_read_lock(); hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { @@ -833,7 +930,6 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) static struct cftype blkcg_files[] = { { .name = "stat", - .flags = CFTYPE_NOT_ON_ROOT, .seq_show = blkcg_print_stat, }, { } /* terminate */ @@ -1025,7 +1121,7 @@ static int blkcg_css_online(struct cgroup_subsys_state *css) * blkcg_init_queue - initialize blkcg part of request queue * @q: request_queue to initialize * - * Called from __blk_alloc_queue(). Responsible for initializing blkcg + * Called from blk_alloc_queue(). Responsible for initializing blkcg * part of new request_queue @q. 
* * RETURNS: @@ -1114,77 +1210,6 @@ static int blkcg_can_attach(struct cgroup_taskset *tset) return ret; } -static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src) -{ - int i; - - for (i = 0; i < BLKG_IOSTAT_NR; i++) { - dst->bytes[i] = src->bytes[i]; - dst->ios[i] = src->ios[i]; - } -} - -static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src) -{ - int i; - - for (i = 0; i < BLKG_IOSTAT_NR; i++) { - dst->bytes[i] += src->bytes[i]; - dst->ios[i] += src->ios[i]; - } -} - -static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src) -{ - int i; - - for (i = 0; i < BLKG_IOSTAT_NR; i++) { - dst->bytes[i] -= src->bytes[i]; - dst->ios[i] -= src->ios[i]; - } -} - -static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) -{ - struct blkcg *blkcg = css_to_blkcg(css); - struct blkcg_gq *blkg; - - rcu_read_lock(); - - hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { - struct blkcg_gq *parent = blkg->parent; - struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu); - struct blkg_iostat cur, delta; - unsigned seq; - - /* fetch the current per-cpu values */ - do { - seq = u64_stats_fetch_begin(&bisc->sync); - blkg_iostat_set(&cur, &bisc->cur); - } while (u64_stats_fetch_retry(&bisc->sync, seq)); - - /* propagate percpu delta to global */ - u64_stats_update_begin(&blkg->iostat.sync); - blkg_iostat_set(&delta, &cur); - blkg_iostat_sub(&delta, &bisc->last); - blkg_iostat_add(&blkg->iostat.cur, &delta); - blkg_iostat_add(&bisc->last, &delta); - u64_stats_update_end(&blkg->iostat.sync); - - /* propagate global delta to parent */ - if (parent) { - u64_stats_update_begin(&parent->iostat.sync); - blkg_iostat_set(&delta, &blkg->iostat.cur); - blkg_iostat_sub(&delta, &blkg->iostat.last); - blkg_iostat_add(&parent->iostat.cur, &delta); - blkg_iostat_add(&blkg->iostat.last, &delta); - u64_stats_update_end(&parent->iostat.sync); - } - } - - rcu_read_unlock(); -} - static void blkcg_bind(struct cgroup_subsys_state *root_css) { int i; @@ -1727,6 +1752,139 @@ void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta) atomic64_add(delta, &blkg->delay_nsec); } +/** + * blkg_tryget_closest - try and get a blkg ref on the closet blkg + * @bio: target bio + * @css: target css + * + * As the failure mode here is to walk up the blkg tree, this ensure that the + * blkg->parent pointers are always valid. This returns the blkg that it ended + * up taking a reference on or %NULL if no reference was taken. + */ +static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio, + struct cgroup_subsys_state *css) +{ + struct blkcg_gq *blkg, *ret_blkg = NULL; + + rcu_read_lock(); + blkg = blkg_lookup_create(css_to_blkcg(css), bio->bi_disk->queue); + while (blkg) { + if (blkg_tryget(blkg)) { + ret_blkg = blkg; + break; + } + blkg = blkg->parent; + } + rcu_read_unlock(); + + return ret_blkg; +} + +/** + * bio_associate_blkg_from_css - associate a bio with a specified css + * @bio: target bio + * @css: target css + * + * Associate @bio with the blkg found by combining the css's blkg and the + * request_queue of the @bio. An association failure is handled by walking up + * the blkg tree. Therefore, the blkg associated can be anything between @blkg + * and q->root_blkg. This situation only happens when a cgroup is dying and + * then the remaining bios will spill to the closest alive blkg. + * + * A reference will be taken on the blkg and will be released when @bio is + * freed. 
+ */ +void bio_associate_blkg_from_css(struct bio *bio, + struct cgroup_subsys_state *css) +{ + if (bio->bi_blkg) + blkg_put(bio->bi_blkg); + + if (css && css->parent) { + bio->bi_blkg = blkg_tryget_closest(bio, css); + } else { + blkg_get(bio->bi_disk->queue->root_blkg); + bio->bi_blkg = bio->bi_disk->queue->root_blkg; + } +} +EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css); + +/** + * bio_associate_blkg - associate a bio with a blkg + * @bio: target bio + * + * Associate @bio with the blkg found from the bio's css and request_queue. + * If one is not found, bio_lookup_blkg() creates the blkg. If a blkg is + * already associated, the css is reused and association redone as the + * request_queue may have changed. + */ +void bio_associate_blkg(struct bio *bio) +{ + struct cgroup_subsys_state *css; + + rcu_read_lock(); + + if (bio->bi_blkg) + css = &bio_blkcg(bio)->css; + else + css = blkcg_css(); + + bio_associate_blkg_from_css(bio, css); + + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(bio_associate_blkg); + +/** + * bio_clone_blkg_association - clone blkg association from src to dst bio + * @dst: destination bio + * @src: source bio + */ +void bio_clone_blkg_association(struct bio *dst, struct bio *src) +{ + if (src->bi_blkg) { + if (dst->bi_blkg) + blkg_put(dst->bi_blkg); + blkg_get(src->bi_blkg); + dst->bi_blkg = src->bi_blkg; + } +} +EXPORT_SYMBOL_GPL(bio_clone_blkg_association); + +static int blk_cgroup_io_type(struct bio *bio) +{ + if (op_is_discard(bio->bi_opf)) + return BLKG_IOSTAT_DISCARD; + if (op_is_write(bio->bi_opf)) + return BLKG_IOSTAT_WRITE; + return BLKG_IOSTAT_READ; +} + +void blk_cgroup_bio_start(struct bio *bio) +{ + int rwd = blk_cgroup_io_type(bio), cpu; + struct blkg_iostat_set *bis; + + cpu = get_cpu(); + bis = per_cpu_ptr(bio->bi_blkg->iostat_cpu, cpu); + u64_stats_update_begin(&bis->sync); + + /* + * If the bio is flagged with BIO_CGROUP_ACCT it means this is a split + * bio and we would have already accounted for the size of the bio. + */ + if (!bio_flagged(bio, BIO_CGROUP_ACCT)) { + bio_set_flag(bio, BIO_CGROUP_ACCT); + bis->cur.bytes[rwd] += bio->bi_iter.bi_size; + } + bis->cur.ios[rwd]++; + + u64_stats_update_end(&bis->sync); + if (cgroup_subsys_on_dfl(io_cgrp_subsys)) + cgroup_rstat_updated(bio->bi_blkg->blkcg->css.cgroup, cpu); + put_cpu(); +} + static int __init blkcg_init(void) { blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio", diff --git a/block/blk-core.c b/block/blk-core.c index 03252af8c82c..93104c7470e8 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -51,9 +51,7 @@ #include "blk-pm.h" #include "blk-rq-qos.h" -#ifdef CONFIG_DEBUG_FS struct dentry *blk_debugfs_root; -#endif EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); @@ -285,7 +283,7 @@ EXPORT_SYMBOL(blk_dump_rq_flags); * A block device may call blk_sync_queue to ensure that any * such activity is cancelled, thus allowing it to release resources * that the callbacks might use. The caller must already have made sure - * that its ->make_request_fn will not re-add plugging prior to calling + * that its ->submit_bio will not re-add plugging prior to calling * this function. * * This function does not cancel any asynchronous activity arising @@ -321,6 +319,16 @@ void blk_clear_pm_only(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_clear_pm_only); +/** + * blk_put_queue - decrement the request_queue refcount + * @q: the request_queue structure to decrement the refcount for + * + * Decrements the refcount of the request_queue kobject. 
When this reaches 0 + * we'll have blk_release_queue() called. + * + * Context: Any context, but the last reference must not be dropped from + * atomic context. + */ void blk_put_queue(struct request_queue *q) { kobject_put(&q->kobj); @@ -352,9 +360,14 @@ EXPORT_SYMBOL_GPL(blk_set_queue_dying); * * Mark @q DYING, drain all pending requests, mark @q DEAD, destroy and * put it. All future requests will be failed immediately with -ENODEV. + * + * Context: can sleep */ void blk_cleanup_queue(struct request_queue *q) { + /* cannot be called from atomic context */ + might_sleep(); + WARN_ON_ONCE(blk_queue_registered(q)); /* mark @q DYING, no new request or merges will be allowed afterwards */ @@ -497,7 +510,7 @@ static void blk_timeout_work(struct work_struct *work) { } -struct request_queue *__blk_alloc_queue(int node_id) +struct request_queue *blk_alloc_queue(int node_id) { struct request_queue *q; int ret; @@ -540,9 +553,7 @@ struct request_queue *__blk_alloc_queue(int node_id) kobject_init(&q->kobj, &blk_queue_ktype); -#ifdef CONFIG_BLK_DEV_IO_TRACE - mutex_init(&q->blk_trace_mutex); -#endif + mutex_init(&q->debugfs_mutex); mutex_init(&q->sysfs_lock); mutex_init(&q->sysfs_dir_lock); spin_lock_init(&q->queue_lock); @@ -564,6 +575,7 @@ struct request_queue *__blk_alloc_queue(int node_id) blk_queue_dma_alignment(q, 511); blk_set_default_limits(&q->limits); + q->nr_requests = BLKDEV_MAX_RQ; return q; @@ -581,23 +593,16 @@ fail_q: kmem_cache_free(blk_requestq_cachep, q); return NULL; } - -struct request_queue *blk_alloc_queue(make_request_fn make_request, int node_id) -{ - struct request_queue *q; - - if (WARN_ON_ONCE(!make_request)) - return NULL; - - q = __blk_alloc_queue(node_id); - if (!q) - return NULL; - q->make_request_fn = make_request; - q->nr_requests = BLKDEV_MAX_RQ; - return q; -} EXPORT_SYMBOL(blk_alloc_queue); +/** + * blk_get_queue - increment the request_queue refcount + * @q: the request_queue structure to increment the refcount for + * + * Increment the refcount of the request_queue kobject. + * + * Context: Any context. + */ bool blk_get_queue(struct request_queue *q) { if (likely(!blk_queue_dying(q))) { @@ -850,8 +855,7 @@ static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part) return false; WARN_ONCE(1, - "generic_make_request: Trying to write " - "to read-only block-device %s (partno %d)\n", + "Trying to write to read-only block-device %s (partno %d)\n", bio_devname(bio, b), part->partno); /* Older lvm-tools actually trigger this */ return false; @@ -952,25 +956,13 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q, return BLK_STS_OK; } -static noinline_for_stack bool -generic_make_request_checks(struct bio *bio) +static noinline_for_stack bool submit_bio_checks(struct bio *bio) { - struct request_queue *q; - int nr_sectors = bio_sectors(bio); + struct request_queue *q = bio->bi_disk->queue; blk_status_t status = BLK_STS_IOERR; - char b[BDEVNAME_SIZE]; might_sleep(); - q = bio->bi_disk->queue; - if (unlikely(!q)) { - printk(KERN_ERR - "generic_make_request: Trying to access " - "nonexistent block-device %s (%Lu)\n", - bio_devname(bio, b), (long long)bio->bi_iter.bi_sector); - goto end_io; - } - /* * For a REQ_NOWAIT based request, return -EOPNOTSUPP * if queue is not a request based queue. @@ -992,14 +984,13 @@ generic_make_request_checks(struct bio *bio) } /* - * Filter flush bio's early so that make_request based - * drivers without flush support don't have to worry - * about them. 
+ * Filter flush bio's early so that bio based drivers without flush + * support don't have to worry about them. */ if (op_is_flush(bio->bi_opf) && !test_bit(QUEUE_FLAG_WC, &q->queue_flags)) { bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA); - if (!nr_sectors) { + if (!bio_sectors(bio)) { status = BLK_STS_OK; goto end_io; } @@ -1054,8 +1045,13 @@ generic_make_request_checks(struct bio *bio) if (unlikely(!current->io_context)) create_task_io_context(current, GFP_ATOMIC, q->node); - if (!blkcg_bio_issue_check(q, bio)) + if (blk_throtl_bio(bio)) { + blkcg_bio_issue_init(bio); return false; + } + + blk_cgroup_bio_start(bio); + blkcg_bio_issue_init(bio); if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) { trace_block_bio_queue(q, bio); @@ -1074,138 +1070,144 @@ end_io: return false; } -static blk_qc_t do_make_request(struct bio *bio) +static blk_qc_t __submit_bio(struct bio *bio) { - struct request_queue *q = bio->bi_disk->queue; + struct gendisk *disk = bio->bi_disk; blk_qc_t ret = BLK_QC_T_NONE; if (blk_crypto_bio_prep(&bio)) { - if (!q->make_request_fn) - return blk_mq_make_request(q, bio); - ret = q->make_request_fn(q, bio); + if (!disk->fops->submit_bio) + return blk_mq_submit_bio(bio); + ret = disk->fops->submit_bio(bio); } - blk_queue_exit(q); + blk_queue_exit(disk->queue); return ret; } -/** - * generic_make_request - re-submit a bio to the block device layer for I/O - * @bio: The bio describing the location in memory and on the device. +/* + * The loop in this function may be a bit non-obvious, and so deserves some + * explanation: * - * This is a version of submit_bio() that shall only be used for I/O that is - * resubmitted to lower level drivers by stacking block drivers. All file - * systems and other upper level users of the block layer should use - * submit_bio() instead. + * - Before entering the loop, bio->bi_next is NULL (as all callers ensure + * that), so we have a list with a single bio. + * - We pretend that we have just taken it off a longer list, so we assign + * bio_list to a pointer to the bio_list_on_stack, thus initialising the + * bio_list of new bios to be added. ->submit_bio() may indeed add some more + * bios through a recursive call to submit_bio_noacct. If it did, we find a + * non-NULL value in bio_list and re-enter the loop from the top. + * - In this case we really did just take the bio of the top of the list (no + * pretending) and so remove it from bio_list, and call into ->submit_bio() + * again. + * + * bio_list_on_stack[0] contains bios submitted by the current ->submit_bio. + * bio_list_on_stack[1] contains bios that were submitted before the current + * ->submit_bio_bio, but that haven't been processed yet. */ -blk_qc_t generic_make_request(struct bio *bio) +static blk_qc_t __submit_bio_noacct(struct bio *bio) { - /* - * bio_list_on_stack[0] contains bios submitted by the current - * make_request_fn. - * bio_list_on_stack[1] contains bios that were submitted before - * the current make_request_fn, but that haven't been processed - * yet. - */ struct bio_list bio_list_on_stack[2]; blk_qc_t ret = BLK_QC_T_NONE; - if (!generic_make_request_checks(bio)) - goto out; - - /* - * We only want one ->make_request_fn to be active at a time, else - * stack usage with stacked devices could be a problem. So use - * current->bio_list to keep a list of requests submited by a - * make_request_fn function. current->bio_list is also used as a - * flag to say if generic_make_request is currently active in this - * task or not. If it is NULL, then no make_request is active. 
If - * it is non-NULL, then a make_request is active, and new requests - * should be added at the tail - */ - if (current->bio_list) { - bio_list_add(¤t->bio_list[0], bio); - goto out; - } - - /* following loop may be a bit non-obvious, and so deserves some - * explanation. - * Before entering the loop, bio->bi_next is NULL (as all callers - * ensure that) so we have a list with a single bio. - * We pretend that we have just taken it off a longer list, so - * we assign bio_list to a pointer to the bio_list_on_stack, - * thus initialising the bio_list of new bios to be - * added. ->make_request() may indeed add some more bios - * through a recursive call to generic_make_request. If it - * did, we find a non-NULL value in bio_list and re-enter the loop - * from the top. In this case we really did just take the bio - * of the top of the list (no pretending) and so remove it from - * bio_list, and call into ->make_request() again. - */ BUG_ON(bio->bi_next); + bio_list_init(&bio_list_on_stack[0]); current->bio_list = bio_list_on_stack; + do { struct request_queue *q = bio->bi_disk->queue; + struct bio_list lower, same; + + if (unlikely(bio_queue_enter(bio) != 0)) + continue; - if (likely(bio_queue_enter(bio) == 0)) { - struct bio_list lower, same; + /* + * Create a fresh bio_list for all subordinate requests. + */ + bio_list_on_stack[1] = bio_list_on_stack[0]; + bio_list_init(&bio_list_on_stack[0]); - /* Create a fresh bio_list for all subordinate requests */ - bio_list_on_stack[1] = bio_list_on_stack[0]; - bio_list_init(&bio_list_on_stack[0]); - ret = do_make_request(bio); + ret = __submit_bio(bio); - /* sort new bios into those for a lower level - * and those for the same level - */ - bio_list_init(&lower); - bio_list_init(&same); - while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL) - if (q == bio->bi_disk->queue) - bio_list_add(&same, bio); - else - bio_list_add(&lower, bio); - /* now assemble so we handle the lowest level first */ - bio_list_merge(&bio_list_on_stack[0], &lower); - bio_list_merge(&bio_list_on_stack[0], &same); - bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]); + /* + * Sort new bios into those for a lower level and those for the + * same level. + */ + bio_list_init(&lower); + bio_list_init(&same); + while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL) + if (q == bio->bi_disk->queue) + bio_list_add(&same, bio); + else + bio_list_add(&lower, bio); + + /* + * Now assemble so we handle the lowest level first. 
+ */ + bio_list_merge(&bio_list_on_stack[0], &lower); + bio_list_merge(&bio_list_on_stack[0], &same); + bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]); + } while ((bio = bio_list_pop(&bio_list_on_stack[0]))); + + current->bio_list = NULL; + return ret; +} + +static blk_qc_t __submit_bio_noacct_mq(struct bio *bio) +{ + struct bio_list bio_list[2] = { }; + blk_qc_t ret = BLK_QC_T_NONE; + + current->bio_list = bio_list; + + do { + struct gendisk *disk = bio->bi_disk; + + if (unlikely(bio_queue_enter(bio) != 0)) + continue; + + if (!blk_crypto_bio_prep(&bio)) { + blk_queue_exit(disk->queue); + ret = BLK_QC_T_NONE; + continue; } - bio = bio_list_pop(&bio_list_on_stack[0]); - } while (bio); - current->bio_list = NULL; /* deactivate */ -out: + ret = blk_mq_submit_bio(bio); + } while ((bio = bio_list_pop(&bio_list[0]))); + + current->bio_list = NULL; return ret; } -EXPORT_SYMBOL(generic_make_request); /** - * direct_make_request - hand a buffer directly to its device driver for I/O + * submit_bio_noacct - re-submit a bio to the block device layer for I/O * @bio: The bio describing the location in memory and on the device. * - * This function behaves like generic_make_request(), but does not protect - * against recursion. Must only be used if the called driver is known - * to be blk-mq based. + * This is a version of submit_bio() that shall only be used for I/O that is + * resubmitted to lower level drivers by stacking block drivers. All file + * systems and other upper level users of the block layer should use + * submit_bio() instead. */ -blk_qc_t direct_make_request(struct bio *bio) +blk_qc_t submit_bio_noacct(struct bio *bio) { - struct request_queue *q = bio->bi_disk->queue; - - if (WARN_ON_ONCE(q->make_request_fn)) { - bio_io_error(bio); - return BLK_QC_T_NONE; - } - if (!generic_make_request_checks(bio)) - return BLK_QC_T_NONE; - if (unlikely(bio_queue_enter(bio))) + if (!submit_bio_checks(bio)) return BLK_QC_T_NONE; - if (!blk_crypto_bio_prep(&bio)) { - blk_queue_exit(q); + + /* + * We only want one ->submit_bio to be active at a time, else stack + * usage with stacked devices could be a problem. Use current->bio_list + * to collect a list of requests submited by a ->submit_bio method while + * it is active, and then process them after it returned. 
+ */ + if (current->bio_list) { + bio_list_add(¤t->bio_list[0], bio); return BLK_QC_T_NONE; } - return blk_mq_make_request(q, bio); + + if (!bio->bi_disk->fops->submit_bio) + return __submit_bio_noacct_mq(bio); + return __submit_bio_noacct(bio); } -EXPORT_SYMBOL_GPL(direct_make_request); +EXPORT_SYMBOL(submit_bio_noacct); /** * submit_bio - submit a bio to the block device layer for I/O @@ -1266,13 +1268,13 @@ blk_qc_t submit_bio(struct bio *bio) blk_qc_t ret; psi_memstall_enter(&pflags); - ret = generic_make_request(bio); + ret = submit_bio_noacct(bio); psi_memstall_leave(&pflags); return ret; } - return generic_make_request(bio); + return submit_bio_noacct(bio); } EXPORT_SYMBOL(submit_bio); @@ -1908,9 +1910,7 @@ int __init blk_dev_init(void) blk_requestq_cachep = kmem_cache_create("request_queue", sizeof(struct request_queue), 0, SLAB_PANIC, NULL); -#ifdef CONFIG_DEBUG_FS blk_debugfs_root = debugfs_create_dir("block", NULL); -#endif return 0; } diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index 6e49688a2d80..c162b754efbd 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -228,7 +228,7 @@ static bool blk_crypto_split_bio_if_needed(struct bio **bio_ptr) return false; } bio_chain(split_bio, bio); - generic_make_request(bio); + submit_bio_noacct(bio); *bio_ptr = split_bio; } diff --git a/block/blk-crypto.c b/block/blk-crypto.c index 6533c9b36ab8..2d5e60023b08 100644 --- a/block/blk-crypto.c +++ b/block/blk-crypto.c @@ -239,7 +239,7 @@ void __blk_crypto_free_request(struct request *rq) * kernel crypto API. When the crypto API fallback is used for encryption, * blk-crypto may choose to split the bio into 2 - the first one that will * continue to be processed and the second one that will be resubmitted via - * generic_make_request. A bounce bio will be allocated to encrypt the contents + * submit_bio_noacct. A bounce bio will be allocated to encrypt the contents * of the aforementioned "first one", and *bio_ptr will be updated to this * bounce bio. * diff --git a/block/blk-flush.c b/block/blk-flush.c index 15ae0155ec07..6e1543c10493 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -219,7 +219,6 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) struct request *rq, *n; unsigned long flags = 0; struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx); - struct blk_mq_hw_ctx *hctx; blk_account_io_flush(flush_rq); @@ -235,13 +234,11 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) if (fq->rq_status != BLK_STS_OK) error = fq->rq_status; - hctx = flush_rq->mq_hctx; if (!q->elevator) { - blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq); - flush_rq->tag = -1; + flush_rq->tag = BLK_MQ_NO_TAG; } else { blk_mq_put_driver_tag(flush_rq); - flush_rq->internal_tag = -1; + flush_rq->internal_tag = BLK_MQ_NO_TAG; } running = &fq->flush_queue[fq->flush_running_idx]; @@ -286,13 +283,8 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, if (fq->flush_pending_idx != fq->flush_running_idx || list_empty(pending)) return; - /* C2 and C3 - * - * For blk-mq + scheduling, we can risk having all driver tags - * assigned to empty flushes, and we deadlock if we are expecting - * other requests to make progress. Don't defer for that case. 
- */ - if (!list_empty(&fq->flush_data_in_flight) && q->elevator && + /* C2 and C3 */ + if (!list_empty(&fq->flush_data_in_flight) && time_before(jiffies, fq->flush_pending_since + FLUSH_PENDING_TIMEOUT)) return; @@ -316,13 +308,10 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, flush_rq->mq_ctx = first_rq->mq_ctx; flush_rq->mq_hctx = first_rq->mq_hctx; - if (!q->elevator) { - fq->orig_rq = first_rq; + if (!q->elevator) flush_rq->tag = first_rq->tag; - blk_mq_tag_set_rq(flush_rq->mq_hctx, first_rq->tag, flush_rq); - } else { + else flush_rq->internal_tag = first_rq->internal_tag; - } flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH; flush_rq->cmd_flags |= (flags & REQ_DRV) | (flags & REQ_FAILFAST_MASK); diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 9df50fb507ca..57299f860d41 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -96,15 +96,7 @@ static void ioc_release_fn(struct work_struct *work) { struct io_context *ioc = container_of(work, struct io_context, release_work); - unsigned long flags; - - /* - * Exiting icq may call into put_io_context() through elevator - * which will trigger lockdep warning. The ioc's are guaranteed to - * be different, use a different locking subclass here. Use - * irqsave variant as there's no spin_lock_irq_nested(). - */ - spin_lock_irqsave_nested(&ioc->lock, flags, 1); + spin_lock_irq(&ioc->lock); while (!hlist_empty(&ioc->icq_list)) { struct io_cq *icq = hlist_entry(ioc->icq_list.first, @@ -115,13 +107,27 @@ static void ioc_release_fn(struct work_struct *work) ioc_destroy_icq(icq); spin_unlock(&q->queue_lock); } else { - spin_unlock_irqrestore(&ioc->lock, flags); - cpu_relax(); - spin_lock_irqsave_nested(&ioc->lock, flags, 1); + /* Make sure q and icq cannot be freed. */ + rcu_read_lock(); + + /* Re-acquire the locks in the correct order. */ + spin_unlock(&ioc->lock); + spin_lock(&q->queue_lock); + spin_lock(&ioc->lock); + + /* + * The icq may have been destroyed when the ioc lock + * was released. + */ + if (!(icq->flags & ICQ_DESTROYED)) + ioc_destroy_icq(icq); + + spin_unlock(&q->queue_lock); + rcu_read_unlock(); } } - spin_unlock_irqrestore(&ioc->lock, flags); + spin_unlock_irq(&ioc->lock); kmem_cache_free(iocontext_cachep, ioc); } @@ -170,7 +176,6 @@ void put_io_context(struct io_context *ioc) */ void put_io_context_active(struct io_context *ioc) { - unsigned long flags; struct io_cq *icq; if (!atomic_dec_and_test(&ioc->active_ref)) { @@ -178,19 +183,14 @@ void put_io_context_active(struct io_context *ioc) return; } - /* - * Need ioc lock to walk icq_list and q lock to exit icq. Perform - * reverse double locking. Read comment in ioc_release_fn() for - * explanation on the nested locking annotation. - */ - spin_lock_irqsave_nested(&ioc->lock, flags, 1); + spin_lock_irq(&ioc->lock); hlist_for_each_entry(icq, &ioc->icq_list, ioc_node) { if (icq->flags & ICQ_EXITED) continue; ioc_exit_icq(icq); } - spin_unlock_irqrestore(&ioc->lock, flags); + spin_unlock_irq(&ioc->lock); put_io_context(ioc); } diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 8ac4aad66ebc..521c29b8ae29 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -1370,7 +1370,7 @@ static void ioc_timer_fn(struct timer_list *timer) * should have woken up in the last period and expire idle iocgs. 
*/ list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) { - if (!waitqueue_active(&iocg->waitq) && iocg->abs_vdebt && + if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt && !iocg_is_idle(iocg)) continue; @@ -2045,8 +2045,7 @@ static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q, int levels = blkcg->css.cgroup->level + 1; struct ioc_gq *iocg; - iocg = kzalloc_node(sizeof(*iocg) + levels * sizeof(iocg->ancestors[0]), - gfp, q->node); + iocg = kzalloc_node(struct_size(iocg, ancestors, levels), gfp, q->node); if (!iocg) return NULL; diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index c128d50cb410..f90429cf4edf 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -591,7 +591,7 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio) struct rq_wait *rqw; struct iolatency_grp *iolat; u64 window_start; - u64 now = ktime_to_ns(ktime_get()); + u64 now; bool issue_as_root = bio_issue_as_root_blkg(bio); bool enabled = false; int inflight = 0; @@ -608,6 +608,7 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio) if (!enabled) return; + now = ktime_to_ns(ktime_get()); while (blkg && blkg->parent) { iolat = blkg_to_lat(blkg); if (!iolat) { diff --git a/block/blk-lib.c b/block/blk-lib.c index 5f2c429d4378..019e09bb9c0e 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -29,7 +29,7 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, struct request_queue *q = bdev_get_queue(bdev); struct bio *bio = *biop; unsigned int op; - sector_t bs_mask; + sector_t bs_mask, part_offset = 0; if (!q) return -ENXIO; @@ -54,9 +54,34 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, if (!nr_sects) return -EINVAL; + /* In case the discard request is in a partition */ + if (bdev->bd_partno) + part_offset = bdev->bd_part->start_sect; + while (nr_sects) { - sector_t req_sects = min_t(sector_t, nr_sects, - bio_allowed_max_sectors(q)); + sector_t granularity_aligned_lba, req_sects; + sector_t sector_mapped = sector + part_offset; + + granularity_aligned_lba = round_up(sector_mapped, + q->limits.discard_granularity >> SECTOR_SHIFT); + + /* + * Check whether the discard bio starts at a discard_granularity + * aligned LBA, + * - If no: set (granularity_aligned_lba - sector_mapped) to + * bi_size of the first split bio, then the second bio will + * start at a discard_granularity aligned LBA on the device. + * - If yes: use bio_aligned_discard_max_sectors() as the max + * possible bi_size of the first split bio. Then when this bio + * is split in device drive, the split ones are very probably + * to be aligned to discard_granularity of the device's queue. + */ + if (granularity_aligned_lba == sector_mapped) + req_sects = min_t(sector_t, nr_sects, + bio_aligned_discard_max_sectors(q)); + else + req_sects = min_t(sector_t, nr_sects, + granularity_aligned_lba - sector_mapped); WARN_ON_ONCE((req_sects << 9) > UINT_MAX); diff --git a/block/blk-merge.c b/block/blk-merge.c index f0b0bae075a0..5196dc145270 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -283,20 +283,20 @@ split: /** * __blk_queue_split - split a bio and submit the second half - * @q: [in] request queue pointer * @bio: [in, out] bio to be split * @nr_segs: [out] number of segments in the first bio * * Split a bio into two bios, chain the two bios, submit the second half and * store a pointer to the first half in *@bio. 
If the second bio is still too * big it will be split by a recursive call to this function. Since this - * function may allocate a new bio from @q->bio_split, it is the responsibility - * of the caller to ensure that @q is only released after processing of the + * function may allocate a new bio from @bio->bi_disk->queue->bio_split, it is + * the responsibility of the caller to ensure that + * @bio->bi_disk->queue->bio_split is only released after processing of the * split bio has finished. */ -void __blk_queue_split(struct request_queue *q, struct bio **bio, - unsigned int *nr_segs) +void __blk_queue_split(struct bio **bio, unsigned int *nr_segs) { + struct request_queue *q = (*bio)->bi_disk->queue; struct bio *split = NULL; switch (bio_op(*bio)) { @@ -338,27 +338,26 @@ void __blk_queue_split(struct request_queue *q, struct bio **bio, bio_chain(split, *bio); trace_block_split(q, split, (*bio)->bi_iter.bi_sector); - generic_make_request(*bio); + submit_bio_noacct(*bio); *bio = split; } } /** * blk_queue_split - split a bio and submit the second half - * @q: [in] request queue pointer * @bio: [in, out] bio to be split * * Split a bio into two bios, chains the two bios, submit the second half and * store a pointer to the first half in *@bio. Since this function may allocate - * a new bio from @q->bio_split, it is the responsibility of the caller to - * ensure that @q is only released after processing of the split bio has - * finished. + * a new bio from @bio->bi_disk->queue->bio_split, it is the responsibility of + * the caller to ensure that @bio->bi_disk->queue->bio_split is only released + * after processing of the split bio has finished. */ -void blk_queue_split(struct request_queue *q, struct bio **bio) +void blk_queue_split(struct bio **bio) { unsigned int nr_segs; - __blk_queue_split(q, bio, &nr_segs); + __blk_queue_split(bio, &nr_segs); } EXPORT_SYMBOL(blk_queue_split); @@ -793,6 +792,8 @@ static struct request *attempt_merge(struct request_queue *q, */ blk_account_io_merge_request(next); + trace_block_rq_merge(q, next); + /* * ownership of bio passed from next to req, return 'next' for * the caller to free diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index e0b2bc131bf5..3f09bcb8a6fd 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -404,8 +404,7 @@ static bool hctx_show_busy_rq(struct request *rq, void *data, bool reserved) const struct show_busy_params *params = data; if (rq->mq_hctx == params->hctx) - __blk_mq_debugfs_rq_show(params->m, - list_entry_rq(&rq->queuelist)); + __blk_mq_debugfs_rq_show(params->m, rq); return true; } @@ -827,9 +826,6 @@ void blk_mq_debugfs_register(struct request_queue *q) struct blk_mq_hw_ctx *hctx; int i; - q->debugfs_dir = debugfs_create_dir(kobject_name(q->kobj.parent), - blk_debugfs_root); - debugfs_create_files(q->debugfs_dir, q, blk_mq_debugfs_queue_attrs); /* @@ -860,9 +856,7 @@ void blk_mq_debugfs_register(struct request_queue *q) void blk_mq_debugfs_unregister(struct request_queue *q) { - debugfs_remove_recursive(q->debugfs_dir); q->sched_debugfs_dir = NULL; - q->debugfs_dir = NULL; } static void blk_mq_debugfs_register_ctx(struct blk_mq_hw_ctx *hctx, diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index fdcc2c1dd178..a19cdf159b75 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -7,6 +7,7 @@ #include <linux/kernel.h> #include <linux/module.h> #include <linux/blk-mq.h> +#include <linux/list_sort.h> #include <trace/events/block.h> @@ -80,6 +81,35 @@ void blk_mq_sched_restart(struct 
blk_mq_hw_ctx *hctx) blk_mq_run_hw_queue(hctx, true); } +static int sched_rq_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct request *rqa = container_of(a, struct request, queuelist); + struct request *rqb = container_of(b, struct request, queuelist); + + return rqa->mq_hctx > rqb->mq_hctx; +} + +static bool blk_mq_dispatch_hctx_list(struct list_head *rq_list) +{ + struct blk_mq_hw_ctx *hctx = + list_first_entry(rq_list, struct request, queuelist)->mq_hctx; + struct request *rq; + LIST_HEAD(hctx_list); + unsigned int count = 0; + + list_for_each_entry(rq, rq_list, queuelist) { + if (rq->mq_hctx != hctx) { + list_cut_before(&hctx_list, rq_list, &rq->queuelist); + goto dispatch; + } + count++; + } + list_splice_tail_init(rq_list, &hctx_list); + +dispatch: + return blk_mq_dispatch_rq_list(hctx, &hctx_list, count); +} + #define BLK_MQ_BUDGET_DELAY 3 /* ms units */ /* @@ -90,12 +120,20 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to * be run again. This is necessary to avoid starving flushes. */ -static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) +static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct elevator_queue *e = q->elevator; + bool multi_hctxs = false, run_queue = false; + bool dispatched = false, busy = false; + unsigned int max_dispatch; LIST_HEAD(rq_list); - int ret = 0; + int count = 0; + + if (hctx->dispatch_busy) + max_dispatch = 1; + else + max_dispatch = hctx->queue->nr_requests; do { struct request *rq; @@ -104,16 +142,16 @@ static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) break; if (!list_empty_careful(&hctx->dispatch)) { - ret = -EAGAIN; + busy = true; break; } - if (!blk_mq_get_dispatch_budget(hctx)) + if (!blk_mq_get_dispatch_budget(q)) break; rq = e->type->ops.dispatch_request(hctx); if (!rq) { - blk_mq_put_dispatch_budget(hctx); + blk_mq_put_dispatch_budget(q); /* * We're releasing without dispatching. Holding the * budget could have blocked any "hctx"s with the @@ -121,7 +159,7 @@ static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) * no guarantee anyone will kick the queue. Kick it * ourselves. */ - blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY); + run_queue = true; break; } @@ -130,8 +168,42 @@ static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) * if this rq won't be queued to driver via .queue_rq() * in blk_mq_dispatch_rq_list(). */ - list_add(&rq->queuelist, &rq_list); - } while (blk_mq_dispatch_rq_list(q, &rq_list, true)); + list_add_tail(&rq->queuelist, &rq_list); + if (rq->mq_hctx != hctx) + multi_hctxs = true; + } while (++count < max_dispatch); + + if (!count) { + if (run_queue) + blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY); + } else if (multi_hctxs) { + /* + * Requests from different hctx may be dequeued from some + * schedulers, such as bfq and deadline. + * + * Sort the requests in the list according to their hctx, + * dispatch batching requests from same hctx at a time. 
+ */ + list_sort(NULL, &rq_list, sched_rq_cmp); + do { + dispatched |= blk_mq_dispatch_hctx_list(&rq_list); + } while (!list_empty(&rq_list)); + } else { + dispatched = blk_mq_dispatch_rq_list(hctx, &rq_list, count); + } + + if (busy) + return -EAGAIN; + return !!dispatched; +} + +static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) +{ + int ret; + + do { + ret = __blk_mq_do_dispatch_sched(hctx); + } while (ret == 1); return ret; } @@ -153,7 +225,7 @@ static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx, * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE. * * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to - * to be run again. This is necessary to avoid starving flushes. + * be run again. This is necessary to avoid starving flushes. */ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) { @@ -161,10 +233,9 @@ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) LIST_HEAD(rq_list); struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from); int ret = 0; + struct request *rq; do { - struct request *rq; - if (!list_empty_careful(&hctx->dispatch)) { ret = -EAGAIN; break; @@ -173,12 +244,12 @@ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) if (!sbitmap_any_bit_set(&hctx->ctx_map)) break; - if (!blk_mq_get_dispatch_budget(hctx)) + if (!blk_mq_get_dispatch_budget(q)) break; rq = blk_mq_dequeue_from_ctx(hctx, ctx); if (!rq) { - blk_mq_put_dispatch_budget(hctx); + blk_mq_put_dispatch_budget(q); /* * We're releasing without dispatching. Holding the * budget could have blocked any "hctx"s with the @@ -200,7 +271,7 @@ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) /* round robin for fair dispatch */ ctx = blk_mq_next_ctx(hctx, rq->mq_ctx); - } while (blk_mq_dispatch_rq_list(q, &rq_list, true)); + } while (blk_mq_dispatch_rq_list(rq->mq_hctx, &rq_list, 1)); WRITE_ONCE(hctx->dispatch_from, ctx); return ret; @@ -240,7 +311,7 @@ static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) */ if (!list_empty(&rq_list)) { blk_mq_sched_mark_restart_hctx(hctx); - if (blk_mq_dispatch_rq_list(q, &rq_list, false)) { + if (blk_mq_dispatch_rq_list(hctx, &rq_list, 0)) { if (has_sched_dispatch) ret = blk_mq_do_dispatch_sched(hctx); else @@ -253,7 +324,7 @@ static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) ret = blk_mq_do_dispatch_ctx(hctx); } else { blk_mq_flush_busy_ctxs(hctx, &rq_list); - blk_mq_dispatch_rq_list(q, &rq_list, false); + blk_mq_dispatch_rq_list(hctx, &rq_list, 0); } return ret; diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index ae722f8b13fb..32d82e23b095 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -56,43 +56,12 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) blk_mq_tag_wakeup_all(tags, false); } -/* - * For shared tag users, we track the number of currently active users - * and attempt to provide a fair share of the tag depth for each of them. 
- */ -static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, - struct sbitmap_queue *bt) -{ - unsigned int depth, users; - - if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_SHARED)) - return true; - if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) - return true; - - /* - * Don't try dividing an ant - */ - if (bt->sb.depth == 1) - return true; - - users = atomic_read(&hctx->tags->active_queues); - if (!users) - return true; - - /* - * Allow at least some tags - */ - depth = max((bt->sb.depth + users - 1) / users, 4U); - return atomic_read(&hctx->nr_active) < depth; -} - static int __blk_mq_get_tag(struct blk_mq_alloc_data *data, struct sbitmap_queue *bt) { - if (!(data->flags & BLK_MQ_REQ_INTERNAL) && - !hctx_may_queue(data->hctx, bt)) + if (!data->q->elevator && !hctx_may_queue(data->hctx, bt)) return BLK_MQ_NO_TAG; + if (data->shallow_depth) return __sbitmap_queue_get_shallow(bt, data->shallow_depth); else @@ -191,33 +160,6 @@ found_tag: return tag + tag_offset; } -bool __blk_mq_get_driver_tag(struct request *rq) -{ - struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags; - unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags; - bool shared = blk_mq_tag_busy(rq->mq_hctx); - int tag; - - if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) { - bt = &rq->mq_hctx->tags->breserved_tags; - tag_offset = 0; - } - - if (!hctx_may_queue(rq->mq_hctx, bt)) - return false; - tag = __sbitmap_queue_get(bt); - if (tag == BLK_MQ_NO_TAG) - return false; - - rq->tag = tag + tag_offset; - if (shared) { - rq->rq_flags |= RQF_MQ_INFLIGHT; - atomic_inc(&rq->mq_hctx->nr_active); - } - rq->mq_hctx->tags->rqs[rq->tag] = rq; - return true; -} - void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, unsigned int tag) { diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 2e4ef51cdb32..b1acac518c4e 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -51,14 +51,6 @@ enum { BLK_MQ_TAG_MAX = BLK_MQ_NO_TAG - 1, }; -bool __blk_mq_get_driver_tag(struct request *rq); -static inline bool blk_mq_get_driver_tag(struct request *rq) -{ - if (rq->tag != BLK_MQ_NO_TAG) - return true; - return __blk_mq_get_driver_tag(rq); -} - extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *); extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *); @@ -79,15 +71,34 @@ static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) } /* - * This helper should only be used for flush request to share tag - * with the request cloned from, and both the two requests can't be - * in flight at the same time. The caller has to make sure the tag - * can't be freed. + * For shared tag users, we track the number of currently active users + * and attempt to provide a fair share of the tag depth for each of them. 
*/ -static inline void blk_mq_tag_set_rq(struct blk_mq_hw_ctx *hctx, - unsigned int tag, struct request *rq) +static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, + struct sbitmap_queue *bt) { - hctx->tags->rqs[tag] = rq; + unsigned int depth, users; + + if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_SHARED)) + return true; + if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) + return true; + + /* + * Don't try dividing an ant + */ + if (bt->sb.depth == 1) + return true; + + users = atomic_read(&hctx->tags->active_queues); + if (!users) + return true; + + /* + * Allow at least some tags + */ + depth = max((bt->sb.depth + users - 1) / users, 4U); + return atomic_read(&hctx->nr_active) < depth; } static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags, diff --git a/block/blk-mq.c b/block/blk-mq.c index 4e0d173beaa3..0015a1892153 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -41,6 +41,8 @@ #include "blk-mq-sched.h" #include "blk-rq-qos.h" +static DEFINE_PER_CPU(struct list_head, blk_cpu_done); + static void blk_mq_poll_stats_start(struct request_queue *q); static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb); @@ -275,26 +277,20 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, { struct blk_mq_tags *tags = blk_mq_tags_from_data(data); struct request *rq = tags->static_rqs[tag]; - req_flags_t rq_flags = 0; - if (data->flags & BLK_MQ_REQ_INTERNAL) { + if (data->q->elevator) { rq->tag = BLK_MQ_NO_TAG; rq->internal_tag = tag; } else { - if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) { - rq_flags = RQF_MQ_INFLIGHT; - atomic_inc(&data->hctx->nr_active); - } rq->tag = tag; rq->internal_tag = BLK_MQ_NO_TAG; - data->hctx->tags->rqs[rq->tag] = rq; } /* csd/requeue_work/fifo_time is initialized before use */ rq->q = data->q; rq->mq_ctx = data->ctx; rq->mq_hctx = data->hctx; - rq->rq_flags = rq_flags; + rq->rq_flags = 0; rq->cmd_flags = data->cmd_flags; if (data->flags & BLK_MQ_REQ_PREEMPT) rq->rq_flags |= RQF_PREEMPT; @@ -362,8 +358,6 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data) data->flags |= BLK_MQ_REQ_NOWAIT; if (e) { - data->flags |= BLK_MQ_REQ_INTERNAL; - /* * Flush requests are special and go directly to the * dispatch list. Don't include reserved tags in the @@ -378,7 +372,7 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data) retry: data->ctx = blk_mq_get_ctx(q); data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx); - if (!(data->flags & BLK_MQ_REQ_INTERNAL)) + if (!e) blk_mq_tag_busy(data->hctx); /* @@ -394,7 +388,7 @@ retry: /* * Give up the CPU and sleep for a random short time to ensure * that thread using a realtime scheduling class are migrated - * off the the CPU, and thus off the hctx that is going away. + * off the CPU, and thus off the hctx that is going away. 
*/ msleep(3); goto retry; @@ -474,9 +468,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask); data.ctx = __blk_mq_get_ctx(q, cpu); - if (q->elevator) - data.flags |= BLK_MQ_REQ_INTERNAL; - else + if (!q->elevator) blk_mq_tag_busy(data.hctx); ret = -EWOULDBLOCK; @@ -552,8 +544,7 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error) blk_stat_add(rq, now); } - if (rq->internal_tag != BLK_MQ_NO_TAG) - blk_mq_sched_completed_request(rq, now); + blk_mq_sched_completed_request(rq, now); blk_account_io_done(rq, now); @@ -574,71 +565,139 @@ void blk_mq_end_request(struct request *rq, blk_status_t error) } EXPORT_SYMBOL(blk_mq_end_request); -static void __blk_mq_complete_request_remote(void *data) +/* + * Softirq action handler - move entries to local list and loop over them + * while passing them to the queue registered handler. + */ +static __latent_entropy void blk_done_softirq(struct softirq_action *h) { - struct request *rq = data; - struct request_queue *q = rq->q; + struct list_head *cpu_list, local_list; - q->mq_ops->complete(rq); + local_irq_disable(); + cpu_list = this_cpu_ptr(&blk_cpu_done); + list_replace_init(cpu_list, &local_list); + local_irq_enable(); + + while (!list_empty(&local_list)) { + struct request *rq; + + rq = list_entry(local_list.next, struct request, ipi_list); + list_del_init(&rq->ipi_list); + rq->q->mq_ops->complete(rq); + } } -/** - * blk_mq_force_complete_rq() - Force complete the request, bypassing any error - * injection that could drop the completion. - * @rq: Request to be force completed - * - * Drivers should use blk_mq_complete_request() to complete requests in their - * normal IO path. For timeout error recovery, drivers may call this forced - * completion routine after they've reclaimed timed out requests to bypass - * potentially subsequent fake timeouts. - */ -void blk_mq_force_complete_rq(struct request *rq) +static void blk_mq_trigger_softirq(struct request *rq) { - struct blk_mq_ctx *ctx = rq->mq_ctx; - struct request_queue *q = rq->q; - bool shared = false; - int cpu; + struct list_head *list; + unsigned long flags; + + local_irq_save(flags); + list = this_cpu_ptr(&blk_cpu_done); + list_add_tail(&rq->ipi_list, list); + + /* + * If the list only contains our just added request, signal a raise of + * the softirq. If there are already entries there, someone already + * raised the irq but it hasn't run yet. + */ + if (list->next == &rq->ipi_list) + raise_softirq_irqoff(BLOCK_SOFTIRQ); + local_irq_restore(flags); +} + +static int blk_softirq_cpu_dead(unsigned int cpu) +{ + /* + * If a CPU goes away, splice its entries to the current CPU + * and trigger a run of the softirq + */ + local_irq_disable(); + list_splice_init(&per_cpu(blk_cpu_done, cpu), + this_cpu_ptr(&blk_cpu_done)); + raise_softirq_irqoff(BLOCK_SOFTIRQ); + local_irq_enable(); + + return 0; +} + + +static void __blk_mq_complete_request_remote(void *data) +{ + struct request *rq = data; - WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); /* - * Most of single queue controllers, there is only one irq vector - * for handling IO completion, and the only irq's affinity is set - * as all possible CPUs. On most of ARCHs, this affinity means the - * irq is handled on one specific CPU. + * For most of single queue controllers, there is only one irq vector + * for handling I/O completion, and the only irq's affinity is set + * to all possible CPUs. 
On most of ARCHs, this affinity means the irq + * is handled on one specific CPU. * - * So complete IO reqeust in softirq context in case of single queue - * for not degrading IO performance by irqsoff latency. + * So complete I/O requests in softirq context in case of single queue + * devices to avoid degrading I/O performance due to irqsoff latency. */ - if (q->nr_hw_queues == 1) { - __blk_complete_request(rq); - return; - } + if (rq->q->nr_hw_queues == 1) + blk_mq_trigger_softirq(rq); + else + rq->q->mq_ops->complete(rq); +} + +static inline bool blk_mq_complete_need_ipi(struct request *rq) +{ + int cpu = raw_smp_processor_id(); + + if (!IS_ENABLED(CONFIG_SMP) || + !test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) + return false; + + /* same CPU or cache domain? Complete locally */ + if (cpu == rq->mq_ctx->cpu || + (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) && + cpus_share_cache(cpu, rq->mq_ctx->cpu))) + return false; + + /* don't try to IPI to an offline CPU */ + return cpu_online(rq->mq_ctx->cpu); +} + +bool blk_mq_complete_request_remote(struct request *rq) +{ + WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); /* * For a polled request, always complete locallly, it's pointless * to redirect the completion. */ - if ((rq->cmd_flags & REQ_HIPRI) || - !test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) { - q->mq_ops->complete(rq); - return; - } - - cpu = get_cpu(); - if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) - shared = cpus_share_cache(cpu, ctx->cpu); + if (rq->cmd_flags & REQ_HIPRI) + return false; - if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) { + if (blk_mq_complete_need_ipi(rq)) { rq->csd.func = __blk_mq_complete_request_remote; rq->csd.info = rq; rq->csd.flags = 0; - smp_call_function_single_async(ctx->cpu, &rq->csd); + smp_call_function_single_async(rq->mq_ctx->cpu, &rq->csd); } else { - q->mq_ops->complete(rq); + if (rq->q->nr_hw_queues > 1) + return false; + blk_mq_trigger_softirq(rq); } - put_cpu(); + + return true; +} +EXPORT_SYMBOL_GPL(blk_mq_complete_request_remote); + +/** + * blk_mq_complete_request - end I/O on a request + * @rq: the request being processed + * + * Description: + * Complete a request by scheduling the ->complete_rq operation. + **/ +void blk_mq_complete_request(struct request *rq) +{ + if (!blk_mq_complete_request_remote(rq)) + rq->q->mq_ops->complete(rq); } -EXPORT_SYMBOL_GPL(blk_mq_force_complete_rq); +EXPORT_SYMBOL(blk_mq_complete_request); static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx) __releases(hctx->srcu) @@ -661,23 +720,6 @@ static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx) } /** - * blk_mq_complete_request - end I/O on a request - * @rq: the request being processed - * - * Description: - * Ends all I/O on a request. It does not handle partial completions. - * The actual completion happens out-of-order, through a IPI handler. 
- **/ -bool blk_mq_complete_request(struct request *rq) -{ - if (unlikely(blk_should_fake_timeout(rq->q))) - return false; - blk_mq_force_complete_rq(rq); - return true; -} -EXPORT_SYMBOL(blk_mq_complete_request); - -/** * blk_mq_start_request - Start processing a request * @rq: Pointer to request to be started * @@ -1052,6 +1094,45 @@ static inline unsigned int queued_to_index(unsigned int queued) return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1); } +static bool __blk_mq_get_driver_tag(struct request *rq) +{ + struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags; + unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags; + int tag; + + blk_mq_tag_busy(rq->mq_hctx); + + if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) { + bt = &rq->mq_hctx->tags->breserved_tags; + tag_offset = 0; + } + + if (!hctx_may_queue(rq->mq_hctx, bt)) + return false; + tag = __sbitmap_queue_get(bt); + if (tag == BLK_MQ_NO_TAG) + return false; + + rq->tag = tag + tag_offset; + return true; +} + +static bool blk_mq_get_driver_tag(struct request *rq) +{ + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; + + if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_get_driver_tag(rq)) + return false; + + if ((hctx->flags & BLK_MQ_F_TAG_SHARED) && + !(rq->rq_flags & RQF_MQ_INFLIGHT)) { + rq->rq_flags |= RQF_MQ_INFLIGHT; + atomic_inc(&hctx->nr_active); + } + hctx->tags->rqs[rq->tag] = rq; + return true; +} + static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags, void *key) { @@ -1204,25 +1285,70 @@ static void blk_mq_handle_zone_resource(struct request *rq, __blk_mq_requeue_request(rq); } +enum prep_dispatch { + PREP_DISPATCH_OK, + PREP_DISPATCH_NO_TAG, + PREP_DISPATCH_NO_BUDGET, +}; + +static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq, + bool need_budget) +{ + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; + + if (need_budget && !blk_mq_get_dispatch_budget(rq->q)) { + blk_mq_put_driver_tag(rq); + return PREP_DISPATCH_NO_BUDGET; + } + + if (!blk_mq_get_driver_tag(rq)) { + /* + * The initial allocation attempt failed, so we need to + * rerun the hardware queue when a tag is freed. The + * waitqueue takes care of that. If the queue is run + * before we add this entry back on the dispatch list, + * we'll re-run it below. + */ + if (!blk_mq_mark_tag_wait(hctx, rq)) { + /* + * All budgets not got from this function will be put + * together during handling partial dispatch + */ + if (need_budget) + blk_mq_put_dispatch_budget(rq->q); + return PREP_DISPATCH_NO_TAG; + } + } + + return PREP_DISPATCH_OK; +} + +/* release all allocated budgets before calling to blk_mq_dispatch_rq_list */ +static void blk_mq_release_budgets(struct request_queue *q, + unsigned int nr_budgets) +{ + int i; + + for (i = 0; i < nr_budgets; i++) + blk_mq_put_dispatch_budget(q); +} + /* * Returns true if we did some work AND can potentially do more. */ -bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, - bool got_budget) +bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, + unsigned int nr_budgets) { - struct blk_mq_hw_ctx *hctx; + enum prep_dispatch prep; + struct request_queue *q = hctx->queue; struct request *rq, *nxt; - bool no_tag = false; int errors, queued; blk_status_t ret = BLK_STS_OK; - bool no_budget_avail = false; LIST_HEAD(zone_list); if (list_empty(list)) return false; - WARN_ON(!list_is_singular(list) && got_budget); - /* * Now process all the entries, sending them to the driver. 
*/ @@ -1232,32 +1358,10 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, rq = list_first_entry(list, struct request, queuelist); - hctx = rq->mq_hctx; - if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) { - blk_mq_put_driver_tag(rq); - no_budget_avail = true; + WARN_ON_ONCE(hctx != rq->mq_hctx); + prep = blk_mq_prep_dispatch_rq(rq, !nr_budgets); + if (prep != PREP_DISPATCH_OK) break; - } - - if (!blk_mq_get_driver_tag(rq)) { - /* - * The initial allocation attempt failed, so we need to - * rerun the hardware queue when a tag is freed. The - * waitqueue takes care of that. If the queue is run - * before we add this entry back on the dispatch list, - * we'll re-run it below. - */ - if (!blk_mq_mark_tag_wait(hctx, rq)) { - blk_mq_put_dispatch_budget(hctx); - /* - * For non-shared tags, the RESTART check - * will suffice. - */ - if (hctx->flags & BLK_MQ_F_TAG_SHARED) - no_tag = true; - break; - } - } list_del_init(&rq->queuelist); @@ -1274,31 +1378,35 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, bd.last = !blk_mq_get_driver_tag(nxt); } + /* + * once the request is queued to lld, no need to cover the + * budget any more + */ + if (nr_budgets) + nr_budgets--; ret = q->mq_ops->queue_rq(hctx, &bd); - if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) { - blk_mq_handle_dev_resource(rq, list); + switch (ret) { + case BLK_STS_OK: + queued++; break; - } else if (ret == BLK_STS_ZONE_RESOURCE) { + case BLK_STS_RESOURCE: + case BLK_STS_DEV_RESOURCE: + blk_mq_handle_dev_resource(rq, list); + goto out; + case BLK_STS_ZONE_RESOURCE: /* * Move the request to zone_list and keep going through * the dispatch list to find more requests the drive can * accept. */ blk_mq_handle_zone_resource(rq, &zone_list); - if (list_empty(list)) - break; - continue; - } - - if (unlikely(ret != BLK_STS_OK)) { + break; + default: errors++; blk_mq_end_request(rq, BLK_STS_IOERR); - continue; } - - queued++; } while (!list_empty(list)); - +out: if (!list_empty(&zone_list)) list_splice_tail_init(&zone_list, list); @@ -1310,6 +1418,12 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, */ if (!list_empty(list)) { bool needs_restart; + /* For non-shared tags, the RESTART check will suffice */ + bool no_tag = prep == PREP_DISPATCH_NO_TAG && + (hctx->flags & BLK_MQ_F_TAG_SHARED); + bool no_budget_avail = prep == PREP_DISPATCH_NO_BUDGET; + + blk_mq_release_budgets(q, nr_budgets); /* * If we didn't flush the entire list, we could have told @@ -1361,13 +1475,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, } else blk_mq_update_dispatch_busy(hctx, false); - /* - * If the host/device is unable to accept more work, inform the - * caller of that. - */ - if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) - return false; - return (queued + errors) != 0; } @@ -1896,11 +2003,11 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, if (q->elevator && !bypass_insert) goto insert; - if (!blk_mq_get_dispatch_budget(hctx)) + if (!blk_mq_get_dispatch_budget(q)) goto insert; if (!blk_mq_get_driver_tag(rq)) { - blk_mq_put_dispatch_budget(hctx); + blk_mq_put_dispatch_budget(q); goto insert; } @@ -2005,8 +2112,7 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) } /** - * blk_mq_make_request - Create and send a request to block device. - * @q: Request queue pointer. + * blk_mq_submit_bio - Create and send a request to block device. * @bio: Bio pointer. 
* * Builds up a request structure from @q and @bio and send to the device. The @@ -2020,8 +2126,9 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) * * Returns: Request queue cookie. */ -blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) +blk_qc_t blk_mq_submit_bio(struct bio *bio) { + struct request_queue *q = bio->bi_disk->queue; const int is_sync = op_is_sync(bio->bi_opf); const int is_flush_fua = op_is_flush(bio->bi_opf); struct blk_mq_alloc_data data = { @@ -2035,7 +2142,7 @@ blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_status_t ret; blk_queue_bounce(q, &bio); - __blk_queue_split(q, &bio, &nr_segs); + __blk_queue_split(&bio, &nr_segs); if (!bio_integrity_prep(bio)) goto queue_exit; @@ -2146,7 +2253,7 @@ queue_exit: blk_queue_exit(q); return BLK_QC_T_NONE; } -EXPORT_SYMBOL_GPL(blk_mq_make_request); /* only for request based dm */ +EXPORT_SYMBOL_GPL(blk_mq_submit_bio); /* only for request based dm */ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx) @@ -2792,7 +2899,7 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q) struct blk_mq_tag_set *set = q->tag_set; mutex_lock(&set->tag_list_lock); - list_del_rcu(&q->tag_set_list); + list_del(&q->tag_set_list); if (list_is_singular(&set->tag_list)) { /* just transitioned to unshared */ set->flags &= ~BLK_MQ_F_TAG_SHARED; @@ -2819,7 +2926,7 @@ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, } if (set->flags & BLK_MQ_F_TAG_SHARED) queue_set_hctx_shared(q, true); - list_add_tail_rcu(&q->tag_set_list, &set->tag_list); + list_add_tail(&q->tag_set_list, &set->tag_list); mutex_unlock(&set->tag_list_lock); } @@ -2886,7 +2993,7 @@ struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, { struct request_queue *uninit_q, *q; - uninit_q = __blk_alloc_queue(set->numa_node); + uninit_q = blk_alloc_queue(set->numa_node); if (!uninit_q) return ERR_PTR(-ENOMEM); uninit_q->queuedata = queuedata; @@ -3760,6 +3867,15 @@ EXPORT_SYMBOL(blk_mq_rq_cpu); static int __init blk_mq_init(void) { + int i; + + for_each_possible_cpu(i) + INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i)); + open_softirq(BLOCK_SOFTIRQ, blk_done_softirq); + + cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD, + "block/softirq:dead", NULL, + blk_softirq_cpu_dead); cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL, blk_mq_hctx_notify_dead); cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online", diff --git a/block/blk-mq.h b/block/blk-mq.h index b3ce0f3a2ad2..863a2f3346d4 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -40,7 +40,8 @@ struct blk_mq_ctx { void blk_mq_exit_queue(struct request_queue *q); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); -bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, bool); +bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *, + unsigned int); void blk_mq_add_to_requeue_list(struct request *rq, bool at_head, bool kick_requeue_list); void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); @@ -159,7 +160,7 @@ struct blk_mq_alloc_data { static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data) { - if (data->flags & BLK_MQ_REQ_INTERNAL) + if (data->q->elevator) return data->hctx->sched_tags; return data->hctx->tags; @@ -179,20 +180,16 @@ unsigned int blk_mq_in_flight(struct request_queue *q, struct 
hd_struct *part); void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, unsigned int inflight[2]); -static inline void blk_mq_put_dispatch_budget(struct blk_mq_hw_ctx *hctx) +static inline void blk_mq_put_dispatch_budget(struct request_queue *q) { - struct request_queue *q = hctx->queue; - if (q->mq_ops->put_budget) - q->mq_ops->put_budget(hctx); + q->mq_ops->put_budget(q); } -static inline bool blk_mq_get_dispatch_budget(struct blk_mq_hw_ctx *hctx) +static inline bool blk_mq_get_dispatch_budget(struct request_queue *q) { - struct request_queue *q = hctx->queue; - if (q->mq_ops->get_budget) - return q->mq_ops->get_budget(hctx); + return q->mq_ops->get_budget(q); return true; } diff --git a/block/blk-softirq.c b/block/blk-softirq.c deleted file mode 100644 index 6e7ec87d49fa..000000000000 --- a/block/blk-softirq.c +++ /dev/null @@ -1,156 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Functions related to softirq rq completions - */ -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/bio.h> -#include <linux/blkdev.h> -#include <linux/interrupt.h> -#include <linux/cpu.h> -#include <linux/sched.h> -#include <linux/sched/topology.h> - -#include "blk.h" - -static DEFINE_PER_CPU(struct list_head, blk_cpu_done); - -/* - * Softirq action handler - move entries to local list and loop over them - * while passing them to the queue registered handler. - */ -static __latent_entropy void blk_done_softirq(struct softirq_action *h) -{ - struct list_head *cpu_list, local_list; - - local_irq_disable(); - cpu_list = this_cpu_ptr(&blk_cpu_done); - list_replace_init(cpu_list, &local_list); - local_irq_enable(); - - while (!list_empty(&local_list)) { - struct request *rq; - - rq = list_entry(local_list.next, struct request, ipi_list); - list_del_init(&rq->ipi_list); - rq->q->mq_ops->complete(rq); - } -} - -#ifdef CONFIG_SMP -static void trigger_softirq(void *data) -{ - struct request *rq = data; - struct list_head *list; - - list = this_cpu_ptr(&blk_cpu_done); - list_add_tail(&rq->ipi_list, list); - - if (list->next == &rq->ipi_list) - raise_softirq_irqoff(BLOCK_SOFTIRQ); -} - -/* - * Setup and invoke a run of 'trigger_softirq' on the given cpu. - */ -static int raise_blk_irq(int cpu, struct request *rq) -{ - if (cpu_online(cpu)) { - call_single_data_t *data = &rq->csd; - - data->func = trigger_softirq; - data->info = rq; - data->flags = 0; - - smp_call_function_single_async(cpu, data); - return 0; - } - - return 1; -} -#else /* CONFIG_SMP */ -static int raise_blk_irq(int cpu, struct request *rq) -{ - return 1; -} -#endif - -static int blk_softirq_cpu_dead(unsigned int cpu) -{ - /* - * If a CPU goes away, splice its entries to the current CPU - * and trigger a run of the softirq - */ - local_irq_disable(); - list_splice_init(&per_cpu(blk_cpu_done, cpu), - this_cpu_ptr(&blk_cpu_done)); - raise_softirq_irqoff(BLOCK_SOFTIRQ); - local_irq_enable(); - - return 0; -} - -void __blk_complete_request(struct request *req) -{ - struct request_queue *q = req->q; - int cpu, ccpu = req->mq_ctx->cpu; - unsigned long flags; - bool shared = false; - - BUG_ON(!q->mq_ops->complete); - - local_irq_save(flags); - cpu = smp_processor_id(); - - /* - * Select completion CPU - */ - if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && ccpu != -1) { - if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) - shared = cpus_share_cache(cpu, ccpu); - } else - ccpu = cpu; - - /* - * If current CPU and requested CPU share a cache, run the softirq on - * the current CPU. 
One might concern this is just like - * QUEUE_FLAG_SAME_FORCE, but actually not. blk_complete_request() is - * running in interrupt handler, and currently I/O controller doesn't - * support multiple interrupts, so current CPU is unique actually. This - * avoids IPI sending from current CPU to the first CPU of a group. - */ - if (ccpu == cpu || shared) { - struct list_head *list; -do_local: - list = this_cpu_ptr(&blk_cpu_done); - list_add_tail(&req->ipi_list, list); - - /* - * if the list only contains our just added request, - * signal a raise of the softirq. If there are already - * entries there, someone already raised the irq but it - * hasn't run yet. - */ - if (list->next == &req->ipi_list) - raise_softirq_irqoff(BLOCK_SOFTIRQ); - } else if (raise_blk_irq(ccpu, req)) - goto do_local; - - local_irq_restore(flags); -} - -static __init int blk_softirq_init(void) -{ - int i; - - for_each_possible_cpu(i) - INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i)); - - open_softirq(BLOCK_SOFTIRQ, blk_done_softirq); - cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD, - "block/softirq:dead", NULL, - blk_softirq_cpu_dead); - return 0; -} -subsys_initcall(blk_softirq_init); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 02643e149d5e..be67952e7be2 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -11,6 +11,7 @@ #include <linux/blktrace_api.h> #include <linux/blk-mq.h> #include <linux/blk-cgroup.h> +#include <linux/debugfs.h> #include "blk.h" #include "blk-mq.h" @@ -873,22 +874,32 @@ static void blk_exit_queue(struct request_queue *q) bdi_put(q->backing_dev_info); } - /** - * __blk_release_queue - release a request queue - * @work: pointer to the release_work member of the request queue to be released + * blk_release_queue - releases all allocated resources of the request_queue + * @kobj: pointer to a kobject, whose container is a request_queue + * + * This function releases all allocated resources of the request queue. + * + * The struct request_queue refcount is incremented with blk_get_queue() and + * decremented with blk_put_queue(). Once the refcount reaches 0 this function + * is called. + * + * For drivers that have a request_queue on a gendisk and added with + * __device_add_disk() the refcount to request_queue will reach 0 with + * the last put_disk() called by the driver. For drivers which don't use + * __device_add_disk() this happens with blk_cleanup_queue(). + * + * Drivers exist which depend on the release of the request_queue to be + * synchronous, it should not be deferred. * - * Description: - * This function is called when a block device is being unregistered. The - * process of releasing a request queue starts with blk_cleanup_queue, which - * set the appropriate flags and then calls blk_put_queue, that decrements - * the reference counter of the request queue. Once the reference counter - * of the request queue reaches zero, blk_release_queue is called to release - * all allocated resources of the request queue. 
+ * Context: can sleep */ -static void __blk_release_queue(struct work_struct *work) +static void blk_release_queue(struct kobject *kobj) { - struct request_queue *q = container_of(work, typeof(*q), release_work); + struct request_queue *q = + container_of(kobj, struct request_queue, kobj); + + might_sleep(); if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags)) blk_stat_remove_callback(q, q->poll_cb); @@ -907,6 +918,9 @@ static void __blk_release_queue(struct work_struct *work) blk_mq_release(q); blk_trace_shutdown(q); + mutex_lock(&q->debugfs_mutex); + debugfs_remove_recursive(q->debugfs_dir); + mutex_unlock(&q->debugfs_mutex); if (queue_is_mq(q)) blk_mq_debugfs_unregister(q); @@ -917,15 +931,6 @@ static void __blk_release_queue(struct work_struct *work) call_rcu(&q->rcu_head, blk_free_queue_rcu); } -static void blk_release_queue(struct kobject *kobj) -{ - struct request_queue *q = - container_of(kobj, struct request_queue, kobj); - - INIT_WORK(&q->release_work, __blk_release_queue); - schedule_work(&q->release_work); -} - static const struct sysfs_ops queue_sysfs_ops = { .show = queue_attr_show, .store = queue_attr_store, @@ -988,6 +993,11 @@ int blk_register_queue(struct gendisk *disk) goto unlock; } + mutex_lock(&q->debugfs_mutex); + q->debugfs_dir = debugfs_create_dir(kobject_name(q->kobj.parent), + blk_debugfs_root); + mutex_unlock(&q->debugfs_mutex); + if (queue_is_mq(q)) { __blk_mq_register_dev(dev, q); blk_mq_debugfs_register(q); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 209fdd8939fb..fee3325edf27 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1339,8 +1339,8 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work) if (!bio_list_empty(&bio_list_on_stack)) { blk_start_plug(&plug); - while((bio = bio_list_pop(&bio_list_on_stack))) - generic_make_request(bio); + while ((bio = bio_list_pop(&bio_list_on_stack))) + submit_bio_noacct(bio); blk_finish_plug(&plug); } } @@ -2158,17 +2158,18 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td) } #endif -bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, - struct bio *bio) +bool blk_throtl_bio(struct bio *bio) { + struct request_queue *q = bio->bi_disk->queue; + struct blkcg_gq *blkg = bio->bi_blkg; struct throtl_qnode *qn = NULL; - struct throtl_grp *tg = blkg_to_tg(blkg ?: q->root_blkg); + struct throtl_grp *tg = blkg_to_tg(blkg); struct throtl_service_queue *sq; bool rw = bio_data_dir(bio); bool throttled = false; struct throtl_data *td = tg->td; - WARN_ON_ONCE(!rcu_read_lock_held()); + rcu_read_lock(); /* see throtl_charge_bio() */ if (bio_flagged(bio, BIO_THROTTLED)) @@ -2273,6 +2274,7 @@ out: if (throttled || !td->track_bio_latency) bio->bi_issue.value |= BIO_ISSUE_THROTL_SKIP_LATENCY; #endif + rcu_read_unlock(); return throttled; } diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 8aa68fae96ad..1b8de0417fc1 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -20,13 +20,11 @@ static int __init setup_fail_io_timeout(char *str) } __setup("fail_io_timeout=", setup_fail_io_timeout); -int blk_should_fake_timeout(struct request_queue *q) +bool __blk_should_fake_timeout(struct request_queue *q) { - if (!test_bit(QUEUE_FLAG_FAIL_IO, &q->queue_flags)) - return 0; - return should_fail(&fail_io_timeout, 1); } +EXPORT_SYMBOL_GPL(__blk_should_fake_timeout); static int __init fail_io_timeout_debugfs(void) { @@ -70,7 +68,7 @@ ssize_t part_timeout_store(struct device *dev, struct device_attribute *attr, #endif /* CONFIG_FAIL_IO_TIMEOUT */ /** 
- * blk_abort_request -- Request request recovery for the specified command + * blk_abort_request - Request recovery for the specified command * @req: pointer to the request of interest * * This function requests that the block layer start recovery for the @@ -90,11 +88,29 @@ void blk_abort_request(struct request *req) } EXPORT_SYMBOL_GPL(blk_abort_request); +static unsigned long blk_timeout_mask __read_mostly; + +static int __init blk_timeout_init(void) +{ + blk_timeout_mask = roundup_pow_of_two(HZ) - 1; + return 0; +} + +late_initcall(blk_timeout_init); + +/* + * Just a rough estimate, we don't care about specific values for timeouts. + */ +static inline unsigned long blk_round_jiffies(unsigned long j) +{ + return (j + blk_timeout_mask) + 1; +} + unsigned long blk_rq_timeout(unsigned long timeout) { unsigned long maxt; - maxt = round_jiffies_up(jiffies + BLK_MAX_TIMEOUT); + maxt = blk_round_jiffies(jiffies + BLK_MAX_TIMEOUT); if (time_after(timeout, maxt)) timeout = maxt; @@ -131,7 +147,7 @@ void blk_add_timer(struct request *req) * than an existing one, modify the timer. Round up to next nearest * second. */ - expiry = blk_rq_timeout(round_jiffies_up(expiry)); + expiry = blk_rq_timeout(blk_round_jiffies(expiry)); if (!timer_pending(&q->timeout) || time_before(expiry, q->timeout.expires)) { diff --git a/block/blk.h b/block/blk.h index b5d1f0fc6547..49e2928a1632 100644 --- a/block/blk.h +++ b/block/blk.h @@ -14,9 +14,7 @@ /* Max future timer expiry for timeouts */ #define BLK_MAX_TIMEOUT (5 * HZ) -#ifdef CONFIG_DEBUG_FS extern struct dentry *blk_debugfs_root; -#endif struct blk_flush_queue { unsigned int flush_pending_idx:1; @@ -27,11 +25,6 @@ struct blk_flush_queue { struct list_head flush_data_in_flight; struct request *flush_rq; - /* - * flush_rq shares tag with this rq, both can't be active - * at the same time - */ - struct request *orig_rq; struct lock_class_key key; spinlock_t mq_flush_lock; }; @@ -223,21 +216,11 @@ ssize_t part_fail_show(struct device *dev, struct device_attribute *attr, char *buf); ssize_t part_fail_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count); - -#ifdef CONFIG_FAIL_IO_TIMEOUT -int blk_should_fake_timeout(struct request_queue *); ssize_t part_timeout_show(struct device *, struct device_attribute *, char *); ssize_t part_timeout_store(struct device *, struct device_attribute *, const char *, size_t); -#else -static inline int blk_should_fake_timeout(struct request_queue *q) -{ - return 0; -} -#endif -void __blk_queue_split(struct request_queue *q, struct bio **bio, - unsigned int *nr_segs); +void __blk_queue_split(struct bio **bio, unsigned int *nr_segs); int ll_back_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs); int ll_front_merge_fn(struct request *req, struct bio *bio, @@ -282,6 +265,20 @@ static inline unsigned int bio_allowed_max_sectors(struct request_queue *q) } /* + * The max bio size which is aligned to q->limits.discard_granularity. This + * is a hint to split large discard bio in generic block layer, then if device + * driver needs to split the discard bio into smaller ones, their bi_size can + * be very probably and easily aligned to discard_granularity of the device's + * queue. 
+ */ +static inline unsigned int bio_aligned_discard_max_sectors( + struct request_queue *q) +{ + return round_down(UINT_MAX, q->limits.discard_granularity) >> + SECTOR_SHIFT; +} + +/* * Internal io_context interface */ void get_io_context(struct io_context *ioc); @@ -299,10 +296,12 @@ int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node); extern int blk_throtl_init(struct request_queue *q); extern void blk_throtl_exit(struct request_queue *q); extern void blk_throtl_register_queue(struct request_queue *q); +bool blk_throtl_bio(struct bio *bio); #else /* CONFIG_BLK_DEV_THROTTLING */ static inline int blk_throtl_init(struct request_queue *q) { return 0; } static inline void blk_throtl_exit(struct request_queue *q) { } static inline void blk_throtl_register_queue(struct request_queue *q) { } +static inline bool blk_throtl_bio(struct bio *bio) { return false; } #endif /* CONFIG_BLK_DEV_THROTTLING */ #ifdef CONFIG_BLK_DEV_THROTTLING_LOW extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page); @@ -434,8 +433,6 @@ static inline void part_nr_sects_write(struct hd_struct *part, sector_t size) #endif } -struct request_queue *__blk_alloc_queue(int node_id); - int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, unsigned int max_sectors, bool *same_page); diff --git a/block/bounce.c b/block/bounce.c index c3aaed070124..431be88a0240 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -309,7 +309,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, if (!passthrough && sectors < bio_sectors(*bio_orig)) { bio = bio_split(*bio_orig, sectors, GFP_NOIO, &bounce_bio_split); bio_chain(bio, *bio_orig); - generic_make_request(*bio_orig); + submit_bio_noacct(*bio_orig); *bio_orig = bio; } bio = bounce_clone_bio(*bio_orig, GFP_NOIO, passthrough ? NULL : diff --git a/block/bsg-lib.c b/block/bsg-lib.c index 6cbb7926534c..fb7b347f8010 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -181,9 +181,12 @@ EXPORT_SYMBOL_GPL(bsg_job_get); void bsg_job_done(struct bsg_job *job, int result, unsigned int reply_payload_rcv_len) { + struct request *rq = blk_mq_rq_from_pdu(job); + job->result = result; job->reply_payload_rcv_len = reply_payload_rcv_len; - blk_mq_complete_request(blk_mq_rq_from_pdu(job)); + if (likely(!blk_should_fake_timeout(rq->q))) + blk_mq_complete_request(rq); } EXPORT_SYMBOL_GPL(bsg_job_done); diff --git a/block/elevator.c b/block/elevator.c index 4eab3d70e880..90ed7a28c21d 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -95,8 +95,8 @@ static inline bool elv_support_features(unsigned int elv_features, * @name: Elevator name to test * @required_features: Features that the elevator must provide * - * Return true is the elevator @e name matches @name and if @e provides all the - * the feratures spcified by @required_features. + * Return true if the elevator @e name matches @name and if @e provides all + * the features specified by @required_features. 
*/ static bool elevator_match(const struct elevator_type *e, const char *name, unsigned int required_features) diff --git a/block/genhd.c b/block/genhd.c index 1a7659327664..99c64641c314 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -38,8 +38,6 @@ static struct kobject *block_depr; static DEFINE_SPINLOCK(ext_devt_lock); static DEFINE_IDR(ext_devt_idr); -static const struct device_type disk_type; - static void disk_check_events(struct disk_events *ev, unsigned int *clearing_ptr); static void disk_alloc_events(struct gendisk *disk); @@ -876,11 +874,32 @@ static void invalidate_partition(struct gendisk *disk, int partno) bdput(bdev); } +/** + * del_gendisk - remove the gendisk + * @disk: the struct gendisk to remove + * + * Removes the gendisk and all its associated resources. This deletes the + * partitions associated with the gendisk, and unregisters the associated + * request_queue. + * + * This is the counter to the respective __device_add_disk() call. + * + * The final removal of the struct gendisk happens when its refcount reaches 0 + * with put_disk(), which should be called after del_gendisk(), if + * __device_add_disk() was used. + * + * Drivers exist which depend on the release of the gendisk to be synchronous, + * it should not be deferred. + * + * Context: can sleep + */ void del_gendisk(struct gendisk *disk) { struct disk_part_iter piter; struct hd_struct *part; + might_sleep(); + blk_integrity_del(disk); disk_del_events(disk); @@ -971,11 +990,15 @@ static ssize_t disk_badblocks_store(struct device *dev, * * This function gets the structure containing partitioning * information for the given device @devt. + * + * Context: can sleep */ struct gendisk *get_gendisk(dev_t devt, int *partno) { struct gendisk *disk = NULL; + might_sleep(); + if (MAJOR(devt) != BLOCK_EXT_MAJOR) { struct kobject *kobj; @@ -1514,10 +1537,31 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno) return 0; } +/** + * disk_release - releases all allocated resources of the gendisk + * @dev: the device representing this disk + * + * This function releases all allocated resources of the gendisk. + * + * The struct gendisk refcount is incremented with get_gendisk() or + * get_disk_and_module(), and its refcount is decremented with + * put_disk_and_module() or put_disk(). Once the refcount reaches 0 this + * function is called. + * + * Drivers which used __device_add_disk() have a gendisk with a request_queue + * assigned. Since the request_queue sits on top of the gendisk for these + * drivers we also call blk_put_queue() for them, and we expect the + * request_queue refcount to reach 0 at this point, and so the request_queue + * will also be freed prior to the disk. + * + * Context: can sleep + */ static void disk_release(struct device *dev) { struct gendisk *disk = dev_to_disk(dev); + might_sleep(); + blk_free_devt(dev->devt); disk_release_events(disk); kfree(disk->random); @@ -1541,7 +1585,7 @@ static char *block_devnode(struct device *dev, umode_t *mode, return NULL; } -static const struct device_type disk_type = { +const struct device_type disk_type = { .name = "disk", .groups = disk_attr_groups, .release = disk_release, @@ -1727,6 +1771,15 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) } EXPORT_SYMBOL(__alloc_disk_node); +/** + * get_disk_and_module - increments the gendisk and gendisk fops module refcount + * @disk: the struct gendisk to increment the refcount for + * + * This increments the refcount for the struct gendisk, and the gendisk's + * fops module owner. 
+ * + * Context: Any context. + */ struct kobject *get_disk_and_module(struct gendisk *disk) { struct module *owner; @@ -1747,6 +1800,16 @@ struct kobject *get_disk_and_module(struct gendisk *disk) } EXPORT_SYMBOL(get_disk_and_module); +/** + * put_disk - decrements the gendisk refcount + * @disk: the struct gendisk to decrement the refcount for + * + * This decrements the refcount for the struct gendisk. When this reaches 0 + * we'll have disk_release() called. + * + * Context: Any context, but the last reference must not be dropped from + * atomic context. + */ void put_disk(struct gendisk *disk) { if (disk) @@ -1754,9 +1817,15 @@ void put_disk(struct gendisk *disk) } EXPORT_SYMBOL(put_disk); -/* +/** + * put_disk_and_module - decrements the module and gendisk refcount + * @disk: the struct gendisk to decrement the refcount for + * * This is a counterpart of get_disk_and_module() and thus also of * get_gendisk(). + * + * Context: Any context, but the last reference must not be dropped from + * atomic context. */ void put_disk_and_module(struct gendisk *disk) { @@ -1985,18 +2054,12 @@ void disk_flush_events(struct gendisk *disk, unsigned int mask) */ unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask) { - const struct block_device_operations *bdops = disk->fops; struct disk_events *ev = disk->ev; unsigned int pending; unsigned int clearing = mask; - if (!ev) { - /* for drivers still using the old ->media_changed method */ - if ((mask & DISK_EVENT_MEDIA_CHANGE) && - bdops->media_changed && bdops->media_changed(disk)) - return DISK_EVENT_MEDIA_CHANGE; + if (!ev) return 0; - } disk_block_events(disk); diff --git a/block/partitions/core.c b/block/partitions/core.c index 78951e33b2d7..e62a98a8eeb7 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -619,8 +619,6 @@ int blk_drop_partitions(struct block_device *bdev) struct disk_part_iter piter; struct hd_struct *part; - if (!disk_part_scan_enabled(bdev->bd_disk)) - return 0; if (bdev->bd_part_count) return -EBUSY; |
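
As an aside on the blk-mq-sched.c hunk above: the new sched_rq_cmp()/blk_mq_dispatch_hctx_list() pair exists because requests dequeued from a scheduler such as BFQ or mq-deadline may target different hardware queues, so the list is sorted by hctx and handed to the driver one same-hctx batch at a time. Below is a minimal userspace sketch of that batching idea only — it uses an array and qsort() where the kernel uses list_sort() and list_cut_before(), and every name in it is local to the sketch, not a kernel symbol.

/* Illustration of per-hctx batch dispatch: sort by hctx, then hand each
 * run of same-hctx requests to the "driver" in one call. Not kernel code. */
#include <stdio.h>
#include <stdlib.h>

struct fake_rq {
	int hctx;	/* which hardware queue this request targets */
	int tag;
};

static int rq_cmp(const void *a, const void *b)
{
	const struct fake_rq *ra = a, *rb = b;

	return ra->hctx - rb->hctx;
}

static void dispatch_batch(const struct fake_rq *rqs, size_t n)
{
	printf("dispatch %zu request(s) to hctx %d\n", n, rqs[0].hctx);
}

int main(void)
{
	struct fake_rq rqs[] = {
		{ 1, 10 }, { 0, 11 }, { 1, 12 }, { 2, 13 }, { 0, 14 },
	};
	size_t n = sizeof(rqs) / sizeof(rqs[0]);
	size_t start = 0, i;

	qsort(rqs, n, sizeof(rqs[0]), rq_cmp);

	/* walk the sorted array; dispatch each same-hctx run as one batch */
	for (i = 1; i <= n; i++) {
		if (i == n || rqs[i].hctx != rqs[start].hctx) {
			dispatch_batch(&rqs[start], i - start);
			start = i;
		}
	}
	return 0;
}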