Diffstat (limited to 'block')
 block/bfq-iosched.c     |   2
 block/bio.c             |   4
 block/blk-cgroup.c      | 218
 block/blk-core.c        |   6
 block/blk-flush.c       |  23
 block/blk-ioc.c         |  42
 block/blk-iocost.c      |   2
 block/blk-lib.c         |  31
 block/blk-merge.c       |   2
 block/blk-mq-sched.c    |   6
 block/blk-mq-tag.h      |  12
 block/blk-mq.c          |  68
 block/blk-rq-qos.c      |   2
 block/blk-timeout.c     |  24
 block/blk.h             |  19
 block/elevator.c        |   4
 block/genhd.c           |  18
 block/partitions/core.c |   2
18 files changed, 272 insertions, 213 deletions
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 50c8f034c01c..a4c0bec920cb 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -4714,7 +4714,7 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) * some unlucky request wait for as long as the device * wishes. * - * Of course, serving one request at at time may cause loss of + * Of course, serving one request at a time may cause loss of * throughput. */ if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0) diff --git a/block/bio.c b/block/bio.c index ef91782fd668..c63ba04bd629 100644 --- a/block/bio.c +++ b/block/bio.c @@ -862,7 +862,7 @@ EXPORT_SYMBOL(bio_add_pc_page); * @same_page: return if the segment has been merged inside the same page * * Try to add the data at @page + @off to the last bvec of @bio. This is a - * a useful optimisation for file systems with a block size smaller than the + * useful optimisation for file systems with a block size smaller than the * page size. * * Warn if (@len, @off) crosses pages in case that @same_page is true. @@ -988,7 +988,7 @@ static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter) * Pins pages from *iter and appends them to @bio's bvec array. The * pages will have to be released using put_page() when done. * For multi-segment *iter, this function only adds pages from the - * the next non-empty segment of the iov iterator. + * next non-empty segment of the iov iterator. */ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) { diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 594f1d0b0e5a..619a79b51068 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -95,9 +95,6 @@ static void __blkg_release(struct rcu_head *rcu) css_put(&blkg->blkcg->css); if (blkg->parent) blkg_put(blkg->parent); - - wb_congested_put(blkg->wb_congested); - blkg_free(blkg); } @@ -227,7 +224,6 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct blkcg_gq *new_blkg) { struct blkcg_gq *blkg; - struct bdi_writeback_congested *wb_congested; int i, ret; WARN_ON_ONCE(!rcu_read_lock_held()); @@ -245,31 +241,22 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, goto err_free_blkg; } - wb_congested = wb_congested_get_create(q->backing_dev_info, - blkcg->css.id, - GFP_NOWAIT | __GFP_NOWARN); - if (!wb_congested) { - ret = -ENOMEM; - goto err_put_css; - } - /* allocate */ if (!new_blkg) { new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN); if (unlikely(!new_blkg)) { ret = -ENOMEM; - goto err_put_congested; + goto err_put_css; } } blkg = new_blkg; - blkg->wb_congested = wb_congested; /* link parent */ if (blkcg_parent(blkcg)) { blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false); if (WARN_ON_ONCE(!blkg->parent)) { ret = -ENODEV; - goto err_put_congested; + goto err_put_css; } blkg_get(blkg->parent); } @@ -306,8 +293,6 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, blkg_put(blkg); return ERR_PTR(ret); -err_put_congested: - wb_congested_put(wb_congested); err_put_css: css_put(&blkcg->css); err_free_blkg: @@ -726,12 +711,137 @@ void blkg_conf_finish(struct blkg_conf_ctx *ctx) } EXPORT_SYMBOL_GPL(blkg_conf_finish); +static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src) +{ + int i; + + for (i = 0; i < BLKG_IOSTAT_NR; i++) { + dst->bytes[i] = src->bytes[i]; + dst->ios[i] = src->ios[i]; + } +} + +static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src) +{ + int i; + + for (i = 0; i < BLKG_IOSTAT_NR; i++) { + dst->bytes[i] += src->bytes[i]; + 
dst->ios[i] += src->ios[i]; + } +} + +static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src) +{ + int i; + + for (i = 0; i < BLKG_IOSTAT_NR; i++) { + dst->bytes[i] -= src->bytes[i]; + dst->ios[i] -= src->ios[i]; + } +} + +static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) +{ + struct blkcg *blkcg = css_to_blkcg(css); + struct blkcg_gq *blkg; + + rcu_read_lock(); + + hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { + struct blkcg_gq *parent = blkg->parent; + struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu); + struct blkg_iostat cur, delta; + unsigned int seq; + + /* fetch the current per-cpu values */ + do { + seq = u64_stats_fetch_begin(&bisc->sync); + blkg_iostat_set(&cur, &bisc->cur); + } while (u64_stats_fetch_retry(&bisc->sync, seq)); + + /* propagate percpu delta to global */ + u64_stats_update_begin(&blkg->iostat.sync); + blkg_iostat_set(&delta, &cur); + blkg_iostat_sub(&delta, &bisc->last); + blkg_iostat_add(&blkg->iostat.cur, &delta); + blkg_iostat_add(&bisc->last, &delta); + u64_stats_update_end(&blkg->iostat.sync); + + /* propagate global delta to parent */ + if (parent) { + u64_stats_update_begin(&parent->iostat.sync); + blkg_iostat_set(&delta, &blkg->iostat.cur); + blkg_iostat_sub(&delta, &blkg->iostat.last); + blkg_iostat_add(&parent->iostat.cur, &delta); + blkg_iostat_add(&blkg->iostat.last, &delta); + u64_stats_update_end(&parent->iostat.sync); + } + } + + rcu_read_unlock(); +} + +/* + * The rstat algorithms intentionally don't handle the root cgroup to avoid + * incurring overhead when no cgroups are defined. For that reason, + * cgroup_rstat_flush in blkcg_print_stat does not actually fill out the + * iostat in the root cgroup's blkcg_gq. + * + * However, we would like to re-use the printing code between the root and + * non-root cgroups to the extent possible. For that reason, we simulate + * flushing the root cgroup's stats by explicitly filling in the iostat + * with disk level statistics. 
+ */ +static void blkcg_fill_root_iostats(void) +{ + struct class_dev_iter iter; + struct device *dev; + + class_dev_iter_init(&iter, &block_class, NULL, &disk_type); + while ((dev = class_dev_iter_next(&iter))) { + struct gendisk *disk = dev_to_disk(dev); + struct hd_struct *part = disk_get_part(disk, 0); + struct blkcg_gq *blkg = blk_queue_root_blkg(disk->queue); + struct blkg_iostat tmp; + int cpu; + + memset(&tmp, 0, sizeof(tmp)); + for_each_possible_cpu(cpu) { + struct disk_stats *cpu_dkstats; + + cpu_dkstats = per_cpu_ptr(part->dkstats, cpu); + tmp.ios[BLKG_IOSTAT_READ] += + cpu_dkstats->ios[STAT_READ]; + tmp.ios[BLKG_IOSTAT_WRITE] += + cpu_dkstats->ios[STAT_WRITE]; + tmp.ios[BLKG_IOSTAT_DISCARD] += + cpu_dkstats->ios[STAT_DISCARD]; + // convert sectors to bytes + tmp.bytes[BLKG_IOSTAT_READ] += + cpu_dkstats->sectors[STAT_READ] << 9; + tmp.bytes[BLKG_IOSTAT_WRITE] += + cpu_dkstats->sectors[STAT_WRITE] << 9; + tmp.bytes[BLKG_IOSTAT_DISCARD] += + cpu_dkstats->sectors[STAT_DISCARD] << 9; + + u64_stats_update_begin(&blkg->iostat.sync); + blkg_iostat_set(&blkg->iostat.cur, &tmp); + u64_stats_update_end(&blkg->iostat.sync); + } + } +} + static int blkcg_print_stat(struct seq_file *sf, void *v) { struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); struct blkcg_gq *blkg; - cgroup_rstat_flush(blkcg->css.cgroup); + if (!seq_css(sf)->parent) + blkcg_fill_root_iostats(); + else + cgroup_rstat_flush(blkcg->css.cgroup); + rcu_read_lock(); hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { @@ -820,7 +930,6 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) static struct cftype blkcg_files[] = { { .name = "stat", - .flags = CFTYPE_NOT_ON_ROOT, .seq_show = blkcg_print_stat, }, { } /* terminate */ @@ -1101,77 +1210,6 @@ static int blkcg_can_attach(struct cgroup_taskset *tset) return ret; } -static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src) -{ - int i; - - for (i = 0; i < BLKG_IOSTAT_NR; i++) { - dst->bytes[i] = src->bytes[i]; - dst->ios[i] = src->ios[i]; - } -} - -static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src) -{ - int i; - - for (i = 0; i < BLKG_IOSTAT_NR; i++) { - dst->bytes[i] += src->bytes[i]; - dst->ios[i] += src->ios[i]; - } -} - -static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src) -{ - int i; - - for (i = 0; i < BLKG_IOSTAT_NR; i++) { - dst->bytes[i] -= src->bytes[i]; - dst->ios[i] -= src->ios[i]; - } -} - -static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) -{ - struct blkcg *blkcg = css_to_blkcg(css); - struct blkcg_gq *blkg; - - rcu_read_lock(); - - hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { - struct blkcg_gq *parent = blkg->parent; - struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu); - struct blkg_iostat cur, delta; - unsigned seq; - - /* fetch the current per-cpu values */ - do { - seq = u64_stats_fetch_begin(&bisc->sync); - blkg_iostat_set(&cur, &bisc->cur); - } while (u64_stats_fetch_retry(&bisc->sync, seq)); - - /* propagate percpu delta to global */ - u64_stats_update_begin(&blkg->iostat.sync); - blkg_iostat_set(&delta, &cur); - blkg_iostat_sub(&delta, &bisc->last); - blkg_iostat_add(&blkg->iostat.cur, &delta); - blkg_iostat_add(&bisc->last, &delta); - u64_stats_update_end(&blkg->iostat.sync); - - /* propagate global delta to parent */ - if (parent) { - u64_stats_update_begin(&parent->iostat.sync); - blkg_iostat_set(&delta, &blkg->iostat.cur); - blkg_iostat_sub(&delta, &blkg->iostat.last); - 
blkg_iostat_add(&parent->iostat.cur, &delta); - blkg_iostat_add(&blkg->iostat.last, &delta); - u64_stats_update_end(&parent->iostat.sync); - } - } - - rcu_read_unlock(); -} - static void blkcg_bind(struct cgroup_subsys_state *root_css) { int i; diff --git a/block/blk-core.c b/block/blk-core.c index 93104c7470e8..d9d632639bd1 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -960,9 +960,14 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio) { struct request_queue *q = bio->bi_disk->queue; blk_status_t status = BLK_STS_IOERR; + struct blk_plug *plug; might_sleep(); + plug = blk_mq_plug(q, bio); + if (plug && plug->nowait) + bio->bi_opf |= REQ_NOWAIT; + /* * For a REQ_NOWAIT based request, return -EOPNOTSUPP * if queue is not a request based queue. @@ -1802,6 +1807,7 @@ void blk_start_plug(struct blk_plug *plug) INIT_LIST_HEAD(&plug->cb_list); plug->rq_count = 0; plug->multiple_queues = false; + plug->nowait = false; /* * Store ordering should not be needed here, since a potential diff --git a/block/blk-flush.c b/block/blk-flush.c index 15ae0155ec07..6e1543c10493 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -219,7 +219,6 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) struct request *rq, *n; unsigned long flags = 0; struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx); - struct blk_mq_hw_ctx *hctx; blk_account_io_flush(flush_rq); @@ -235,13 +234,11 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) if (fq->rq_status != BLK_STS_OK) error = fq->rq_status; - hctx = flush_rq->mq_hctx; if (!q->elevator) { - blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq); - flush_rq->tag = -1; + flush_rq->tag = BLK_MQ_NO_TAG; } else { blk_mq_put_driver_tag(flush_rq); - flush_rq->internal_tag = -1; + flush_rq->internal_tag = BLK_MQ_NO_TAG; } running = &fq->flush_queue[fq->flush_running_idx]; @@ -286,13 +283,8 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, if (fq->flush_pending_idx != fq->flush_running_idx || list_empty(pending)) return; - /* C2 and C3 - * - * For blk-mq + scheduling, we can risk having all driver tags - * assigned to empty flushes, and we deadlock if we are expecting - * other requests to make progress. Don't defer for that case. - */ - if (!list_empty(&fq->flush_data_in_flight) && q->elevator && + /* C2 and C3 */ + if (!list_empty(&fq->flush_data_in_flight) && time_before(jiffies, fq->flush_pending_since + FLUSH_PENDING_TIMEOUT)) return; @@ -316,13 +308,10 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, flush_rq->mq_ctx = first_rq->mq_ctx; flush_rq->mq_hctx = first_rq->mq_hctx; - if (!q->elevator) { - fq->orig_rq = first_rq; + if (!q->elevator) flush_rq->tag = first_rq->tag; - blk_mq_tag_set_rq(flush_rq->mq_hctx, first_rq->tag, flush_rq); - } else { + else flush_rq->internal_tag = first_rq->internal_tag; - } flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH; flush_rq->cmd_flags |= (flags & REQ_DRV) | (flags & REQ_FAILFAST_MASK); diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 9df50fb507ca..57299f860d41 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -96,15 +96,7 @@ static void ioc_release_fn(struct work_struct *work) { struct io_context *ioc = container_of(work, struct io_context, release_work); - unsigned long flags; - - /* - * Exiting icq may call into put_io_context() through elevator - * which will trigger lockdep warning. The ioc's are guaranteed to - * be different, use a different locking subclass here. 
Use - * irqsave variant as there's no spin_lock_irq_nested(). - */ - spin_lock_irqsave_nested(&ioc->lock, flags, 1); + spin_lock_irq(&ioc->lock); while (!hlist_empty(&ioc->icq_list)) { struct io_cq *icq = hlist_entry(ioc->icq_list.first, @@ -115,13 +107,27 @@ static void ioc_release_fn(struct work_struct *work) ioc_destroy_icq(icq); spin_unlock(&q->queue_lock); } else { - spin_unlock_irqrestore(&ioc->lock, flags); - cpu_relax(); - spin_lock_irqsave_nested(&ioc->lock, flags, 1); + /* Make sure q and icq cannot be freed. */ + rcu_read_lock(); + + /* Re-acquire the locks in the correct order. */ + spin_unlock(&ioc->lock); + spin_lock(&q->queue_lock); + spin_lock(&ioc->lock); + + /* + * The icq may have been destroyed when the ioc lock + * was released. + */ + if (!(icq->flags & ICQ_DESTROYED)) + ioc_destroy_icq(icq); + + spin_unlock(&q->queue_lock); + rcu_read_unlock(); } } - spin_unlock_irqrestore(&ioc->lock, flags); + spin_unlock_irq(&ioc->lock); kmem_cache_free(iocontext_cachep, ioc); } @@ -170,7 +176,6 @@ void put_io_context(struct io_context *ioc) */ void put_io_context_active(struct io_context *ioc) { - unsigned long flags; struct io_cq *icq; if (!atomic_dec_and_test(&ioc->active_ref)) { @@ -178,19 +183,14 @@ void put_io_context_active(struct io_context *ioc) return; } - /* - * Need ioc lock to walk icq_list and q lock to exit icq. Perform - * reverse double locking. Read comment in ioc_release_fn() for - * explanation on the nested locking annotation. - */ - spin_lock_irqsave_nested(&ioc->lock, flags, 1); + spin_lock_irq(&ioc->lock); hlist_for_each_entry(icq, &ioc->icq_list, ioc_node) { if (icq->flags & ICQ_EXITED) continue; ioc_exit_icq(icq); } - spin_unlock_irqrestore(&ioc->lock, flags); + spin_unlock_irq(&ioc->lock); put_io_context(ioc); } diff --git a/block/blk-iocost.c b/block/blk-iocost.c index cea5ee9be639..521c29b8ae29 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -1370,7 +1370,7 @@ static void ioc_timer_fn(struct timer_list *timer) * should have woken up in the last period and expire idle iocgs. */ list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) { - if (!waitqueue_active(&iocg->waitq) && iocg->abs_vdebt && + if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt && !iocg_is_idle(iocg)) continue; diff --git a/block/blk-lib.c b/block/blk-lib.c index 5f2c429d4378..019e09bb9c0e 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -29,7 +29,7 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, struct request_queue *q = bdev_get_queue(bdev); struct bio *bio = *biop; unsigned int op; - sector_t bs_mask; + sector_t bs_mask, part_offset = 0; if (!q) return -ENXIO; @@ -54,9 +54,34 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, if (!nr_sects) return -EINVAL; + /* In case the discard request is in a partition */ + if (bdev->bd_partno) + part_offset = bdev->bd_part->start_sect; + while (nr_sects) { - sector_t req_sects = min_t(sector_t, nr_sects, - bio_allowed_max_sectors(q)); + sector_t granularity_aligned_lba, req_sects; + sector_t sector_mapped = sector + part_offset; + + granularity_aligned_lba = round_up(sector_mapped, + q->limits.discard_granularity >> SECTOR_SHIFT); + + /* + * Check whether the discard bio starts at a discard_granularity + * aligned LBA, + * - If no: set (granularity_aligned_lba - sector_mapped) to + * bi_size of the first split bio, then the second bio will + * start at a discard_granularity aligned LBA on the device. 
+ * - If yes: use bio_aligned_discard_max_sectors() as the max + * possible bi_size of the first split bio. Then when this bio + * is split in device drive, the split ones are very probably + * to be aligned to discard_granularity of the device's queue. + */ + if (granularity_aligned_lba == sector_mapped) + req_sects = min_t(sector_t, nr_sects, + bio_aligned_discard_max_sectors(q)); + else + req_sects = min_t(sector_t, nr_sects, + granularity_aligned_lba - sector_mapped); WARN_ON_ONCE((req_sects << 9) > UINT_MAX); diff --git a/block/blk-merge.c b/block/blk-merge.c index 5196dc145270..6529e3aab001 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -472,7 +472,7 @@ static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio, struct scatterlist *sglist, struct scatterlist **sg) { - struct bio_vec uninitialized_var(bvec), bvprv = { NULL }; + struct bio_vec bvec, bvprv = { NULL }; struct bvec_iter iter; int nsegs = 0; bool new_bio = false; diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 1c52e56a19b1..a19cdf159b75 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -96,7 +96,6 @@ static bool blk_mq_dispatch_hctx_list(struct list_head *rq_list) struct request *rq; LIST_HEAD(hctx_list); unsigned int count = 0; - bool ret; list_for_each_entry(rq, rq_list, queuelist) { if (rq->mq_hctx != hctx) { @@ -108,8 +107,7 @@ static bool blk_mq_dispatch_hctx_list(struct list_head *rq_list) list_splice_tail_init(rq_list, &hctx_list); dispatch: - ret = blk_mq_dispatch_rq_list(hctx, &hctx_list, count); - return ret; + return blk_mq_dispatch_rq_list(hctx, &hctx_list, count); } #define BLK_MQ_BUDGET_DELAY 3 /* ms units */ @@ -227,7 +225,7 @@ static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx, * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE. * * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to - * to be run again. This is necessary to avoid starving flushes. + * be run again. This is necessary to avoid starving flushes. */ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) { diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 3945c7f5b944..b1acac518c4e 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -101,18 +101,6 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, return atomic_read(&hctx->nr_active) < depth; } -/* - * This helper should only be used for flush request to share tag - * with the request cloned from, and both the two requests can't be - * in flight at the same time. The caller has to make sure the tag - * can't be freed. 
- */ -static inline void blk_mq_tag_set_rq(struct blk_mq_hw_ctx *hctx, - unsigned int tag, struct request *rq) -{ - hctx->tags->rqs[tag] = rq; -} - static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags, unsigned int tag) { diff --git a/block/blk-mq.c b/block/blk-mq.c index abcf590f6238..0015a1892153 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -277,26 +277,20 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, { struct blk_mq_tags *tags = blk_mq_tags_from_data(data); struct request *rq = tags->static_rqs[tag]; - req_flags_t rq_flags = 0; if (data->q->elevator) { rq->tag = BLK_MQ_NO_TAG; rq->internal_tag = tag; } else { - if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) { - rq_flags = RQF_MQ_INFLIGHT; - atomic_inc(&data->hctx->nr_active); - } rq->tag = tag; rq->internal_tag = BLK_MQ_NO_TAG; - data->hctx->tags->rqs[rq->tag] = rq; } /* csd/requeue_work/fifo_time is initialized before use */ rq->q = data->q; rq->mq_ctx = data->ctx; rq->mq_hctx = data->hctx; - rq->rq_flags = rq_flags; + rq->rq_flags = 0; rq->cmd_flags = data->cmd_flags; if (data->flags & BLK_MQ_REQ_PREEMPT) rq->rq_flags |= RQF_PREEMPT; @@ -394,7 +388,7 @@ retry: /* * Give up the CPU and sleep for a random short time to ensure * that thread using a realtime scheduling class are migrated - * off the the CPU, and thus off the hctx that is going away. + * off the CPU, and thus off the hctx that is going away. */ msleep(3); goto retry; @@ -550,8 +544,7 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error) blk_stat_add(rq, now); } - if (rq->internal_tag != BLK_MQ_NO_TAG) - blk_mq_sched_completed_request(rq, now); + blk_mq_sched_completed_request(rq, now); blk_account_io_done(rq, now); @@ -877,10 +870,10 @@ static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq, void *priv, bool reserved) { /* - * If we find a request that is inflight and the queue matches, + * If we find a request that isn't idle and the queue matches, * we know the queue is busy. Return false to stop the iteration. 
*/ - if (rq->state == MQ_RQ_IN_FLIGHT && rq->q == hctx->queue) { + if (blk_mq_request_started(rq) && rq->q == hctx->queue) { bool *busy = priv; *busy = true; @@ -1105,9 +1098,10 @@ static bool __blk_mq_get_driver_tag(struct request *rq) { struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags; unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags; - bool shared = blk_mq_tag_busy(rq->mq_hctx); int tag; + blk_mq_tag_busy(rq->mq_hctx); + if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) { bt = &rq->mq_hctx->tags->breserved_tags; tag_offset = 0; @@ -1120,19 +1114,23 @@ static bool __blk_mq_get_driver_tag(struct request *rq) return false; rq->tag = tag + tag_offset; - if (shared) { - rq->rq_flags |= RQF_MQ_INFLIGHT; - atomic_inc(&rq->mq_hctx->nr_active); - } - rq->mq_hctx->tags->rqs[rq->tag] = rq; return true; } static bool blk_mq_get_driver_tag(struct request *rq) { - if (rq->tag != BLK_MQ_NO_TAG) - return true; - return __blk_mq_get_driver_tag(rq); + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; + + if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_get_driver_tag(rq)) + return false; + + if ((hctx->flags & BLK_MQ_F_TAG_SHARED) && + !(rq->rq_flags & RQF_MQ_INFLIGHT)) { + rq->rq_flags |= RQF_MQ_INFLIGHT; + atomic_inc(&hctx->nr_active); + } + hctx->tags->rqs[rq->tag] = rq; + return true; } static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, @@ -1387,30 +1385,28 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, if (nr_budgets) nr_budgets--; ret = q->mq_ops->queue_rq(hctx, &bd); - if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) { - blk_mq_handle_dev_resource(rq, list); + switch (ret) { + case BLK_STS_OK: + queued++; break; - } else if (ret == BLK_STS_ZONE_RESOURCE) { + case BLK_STS_RESOURCE: + case BLK_STS_DEV_RESOURCE: + blk_mq_handle_dev_resource(rq, list); + goto out; + case BLK_STS_ZONE_RESOURCE: /* * Move the request to zone_list and keep going through * the dispatch list to find more requests the drive can * accept. */ blk_mq_handle_zone_resource(rq, &zone_list); - if (list_empty(list)) - break; - continue; - } - - if (unlikely(ret != BLK_STS_OK)) { + break; + default: errors++; blk_mq_end_request(rq, BLK_STS_IOERR); - continue; } - - queued++; } while (!list_empty(list)); - +out: if (!list_empty(&zone_list)) list_splice_tail_init(&zone_list, list); @@ -2903,7 +2899,7 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q) struct blk_mq_tag_set *set = q->tag_set; mutex_lock(&set->tag_list_lock); - list_del_rcu(&q->tag_set_list); + list_del(&q->tag_set_list); if (list_is_singular(&set->tag_list)) { /* just transitioned to unshared */ set->flags &= ~BLK_MQ_F_TAG_SHARED; @@ -2930,7 +2926,7 @@ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, } if (set->flags & BLK_MQ_F_TAG_SHARED) queue_set_hctx_shared(q, true); - list_add_tail_rcu(&q->tag_set_list, &set->tag_list); + list_add_tail(&q->tag_set_list, &set->tag_list); mutex_unlock(&set->tag_list_lock); } diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index 18f3eab9f768..656460636ad3 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -273,6 +273,8 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data, if (data.got_token) break; if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) { + finish_wait(&rqw->wait, &data.wq); + /* * We raced with wbt_wake_function() getting a token, * which means we now have two. 
Put our local token diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 3a1ac6434758..1b8de0417fc1 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -68,7 +68,7 @@ ssize_t part_timeout_store(struct device *dev, struct device_attribute *attr, #endif /* CONFIG_FAIL_IO_TIMEOUT */ /** - * blk_abort_request -- Request request recovery for the specified command + * blk_abort_request - Request recovery for the specified command * @req: pointer to the request of interest * * This function requests that the block layer start recovery for the @@ -88,11 +88,29 @@ void blk_abort_request(struct request *req) } EXPORT_SYMBOL_GPL(blk_abort_request); +static unsigned long blk_timeout_mask __read_mostly; + +static int __init blk_timeout_init(void) +{ + blk_timeout_mask = roundup_pow_of_two(HZ) - 1; + return 0; +} + +late_initcall(blk_timeout_init); + +/* + * Just a rough estimate, we don't care about specific values for timeouts. + */ +static inline unsigned long blk_round_jiffies(unsigned long j) +{ + return (j + blk_timeout_mask) + 1; +} + unsigned long blk_rq_timeout(unsigned long timeout) { unsigned long maxt; - maxt = round_jiffies_up(jiffies + BLK_MAX_TIMEOUT); + maxt = blk_round_jiffies(jiffies + BLK_MAX_TIMEOUT); if (time_after(timeout, maxt)) timeout = maxt; @@ -129,7 +147,7 @@ void blk_add_timer(struct request *req) * than an existing one, modify the timer. Round up to next nearest * second. */ - expiry = blk_rq_timeout(round_jiffies_up(expiry)); + expiry = blk_rq_timeout(blk_round_jiffies(expiry)); if (!timer_pending(&q->timeout) || time_before(expiry, q->timeout.expires)) { diff --git a/block/blk.h b/block/blk.h index 94f7c084f68f..49e2928a1632 100644 --- a/block/blk.h +++ b/block/blk.h @@ -25,11 +25,6 @@ struct blk_flush_queue { struct list_head flush_data_in_flight; struct request *flush_rq; - /* - * flush_rq shares tag with this rq, both can't be active - * at the same time - */ - struct request *orig_rq; struct lock_class_key key; spinlock_t mq_flush_lock; }; @@ -270,6 +265,20 @@ static inline unsigned int bio_allowed_max_sectors(struct request_queue *q) } /* + * The max bio size which is aligned to q->limits.discard_granularity. This + * is a hint to split large discard bio in generic block layer, then if device + * driver needs to split the discard bio into smaller ones, their bi_size can + * be very probably and easily aligned to discard_granularity of the device's + * queue. + */ +static inline unsigned int bio_aligned_discard_max_sectors( + struct request_queue *q) +{ + return round_down(UINT_MAX, q->limits.discard_granularity) >> + SECTOR_SHIFT; +} + +/* * Internal io_context interface */ void get_io_context(struct io_context *ioc); diff --git a/block/elevator.c b/block/elevator.c index 4eab3d70e880..90ed7a28c21d 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -95,8 +95,8 @@ static inline bool elv_support_features(unsigned int elv_features, * @name: Elevator name to test * @required_features: Features that the elevator must provide * - * Return true is the elevator @e name matches @name and if @e provides all the - * the feratures spcified by @required_features. + * Return true if the elevator @e name matches @name and if @e provides all + * the features specified by @required_features. 
*/ static bool elevator_match(const struct elevator_type *e, const char *name, unsigned int required_features) diff --git a/block/genhd.c b/block/genhd.c index 60ae4e1b4d38..99c64641c314 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -38,8 +38,6 @@ static struct kobject *block_depr; static DEFINE_SPINLOCK(ext_devt_lock); static DEFINE_IDR(ext_devt_idr); -static const struct device_type disk_type; - static void disk_check_events(struct disk_events *ev, unsigned int *clearing_ptr); static void disk_alloc_events(struct gendisk *disk); @@ -1587,7 +1585,7 @@ static char *block_devnode(struct device *dev, umode_t *mode, return NULL; } -static const struct device_type disk_type = { +const struct device_type disk_type = { .name = "disk", .groups = disk_attr_groups, .release = disk_release, @@ -1775,7 +1773,7 @@ EXPORT_SYMBOL(__alloc_disk_node); /** * get_disk_and_module - increments the gendisk and gendisk fops module refcount - * @disk: the struct gendisk to to increment the refcount for + * @disk: the struct gendisk to increment the refcount for * * This increments the refcount for the struct gendisk, and the gendisk's * fops module owner. @@ -1804,7 +1802,7 @@ EXPORT_SYMBOL(get_disk_and_module); /** * put_disk - decrements the gendisk refcount - * @disk: the struct gendisk to to decrement the refcount for + * @disk: the struct gendisk to decrement the refcount for * * This decrements the refcount for the struct gendisk. When this reaches 0 * we'll have disk_release() called. @@ -1821,7 +1819,7 @@ EXPORT_SYMBOL(put_disk); /** * put_disk_and_module - decrements the module and gendisk refcount - * @disk: the struct gendisk to to decrement the refcount for + * @disk: the struct gendisk to decrement the refcount for * * This is a counterpart of get_disk_and_module() and thus also of * get_gendisk(). @@ -2056,18 +2054,12 @@ void disk_flush_events(struct gendisk *disk, unsigned int mask) */ unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask) { - const struct block_device_operations *bdops = disk->fops; struct disk_events *ev = disk->ev; unsigned int pending; unsigned int clearing = mask; - if (!ev) { - /* for drivers still using the old ->media_changed method */ - if ((mask & DISK_EVENT_MEDIA_CHANGE) && - bdops->media_changed && bdops->media_changed(disk)) - return DISK_EVENT_MEDIA_CHANGE; + if (!ev) return 0; - } disk_block_events(disk); diff --git a/block/partitions/core.c b/block/partitions/core.c index 78951e33b2d7..e62a98a8eeb7 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -619,8 +619,6 @@ int blk_drop_partitions(struct block_device *bdev) struct disk_part_iter piter; struct hd_struct *part; - if (!disk_part_scan_enabled(bdev->bd_disk)) - return 0; if (bdev->bd_part_count) return -EBUSY; |