summary | refs | log | tree | commit | diff
path: root/block
diff options
context:
space:
mode:
Diffstat (limited to 'block')
-rw-r--r-- block/bfq-iosched.c 2
-rw-r--r-- block/bio.c 4
-rw-r--r-- block/blk-cgroup.c 218
-rw-r--r-- block/blk-core.c 6
-rw-r--r-- block/blk-flush.c 23
-rw-r--r-- block/blk-ioc.c 42
-rw-r--r-- block/blk-iocost.c 2
-rw-r--r-- block/blk-lib.c 31
-rw-r--r-- block/blk-merge.c 2
-rw-r--r-- block/blk-mq-sched.c 6
-rw-r--r-- block/blk-mq-tag.h 12
-rw-r--r-- block/blk-mq.c 68
-rw-r--r-- block/blk-rq-qos.c 2
-rw-r--r-- block/blk-timeout.c 24
-rw-r--r-- block/blk.h 19
-rw-r--r-- block/elevator.c 4
-rw-r--r-- block/genhd.c 18
-rw-r--r-- block/partitions/core.c 2
18 files changed, 272 insertions, 213 deletions
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 50c8f034c01c..a4c0bec920cb 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -4714,7 +4714,7 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
* some unlucky request wait for as long as the device
* wishes.
*
- * Of course, serving one request at at time may cause loss of
+ * Of course, serving one request at a time may cause loss of
* throughput.
*/
if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0)
diff --git a/block/bio.c b/block/bio.c
index ef91782fd668..c63ba04bd629 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -862,7 +862,7 @@ EXPORT_SYMBOL(bio_add_pc_page);
* @same_page: return if the segment has been merged inside the same page
*
* Try to add the data at @page + @off to the last bvec of @bio. This is a
- * a useful optimisation for file systems with a block size smaller than the
+ * useful optimisation for file systems with a block size smaller than the
* page size.
*
* Warn if (@len, @off) crosses pages in case that @same_page is true.
@@ -988,7 +988,7 @@ static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter)
* Pins pages from *iter and appends them to @bio's bvec array. The
* pages will have to be released using put_page() when done.
* For multi-segment *iter, this function only adds pages from the
- * the next non-empty segment of the iov iterator.
+ * next non-empty segment of the iov iterator.
*/
static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
{
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 594f1d0b0e5a..619a79b51068 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -95,9 +95,6 @@ static void __blkg_release(struct rcu_head *rcu)
css_put(&blkg->blkcg->css);
if (blkg->parent)
blkg_put(blkg->parent);
-
- wb_congested_put(blkg->wb_congested);
-
blkg_free(blkg);
}
@@ -227,7 +224,6 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
struct blkcg_gq *new_blkg)
{
struct blkcg_gq *blkg;
- struct bdi_writeback_congested *wb_congested;
int i, ret;
WARN_ON_ONCE(!rcu_read_lock_held());
@@ -245,31 +241,22 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
goto err_free_blkg;
}
- wb_congested = wb_congested_get_create(q->backing_dev_info,
- blkcg->css.id,
- GFP_NOWAIT | __GFP_NOWARN);
- if (!wb_congested) {
- ret = -ENOMEM;
- goto err_put_css;
- }
-
/* allocate */
if (!new_blkg) {
new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN);
if (unlikely(!new_blkg)) {
ret = -ENOMEM;
- goto err_put_congested;
+ goto err_put_css;
}
}
blkg = new_blkg;
- blkg->wb_congested = wb_congested;
/* link parent */
if (blkcg_parent(blkcg)) {
blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
if (WARN_ON_ONCE(!blkg->parent)) {
ret = -ENODEV;
- goto err_put_congested;
+ goto err_put_css;
}
blkg_get(blkg->parent);
}
@@ -306,8 +293,6 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
blkg_put(blkg);
return ERR_PTR(ret);
-err_put_congested:
- wb_congested_put(wb_congested);
err_put_css:
css_put(&blkcg->css);
err_free_blkg:
@@ -726,12 +711,137 @@ void blkg_conf_finish(struct blkg_conf_ctx *ctx)
}
EXPORT_SYMBOL_GPL(blkg_conf_finish);
+static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src)
+{
+ int i;
+
+ for (i = 0; i < BLKG_IOSTAT_NR; i++) {
+ dst->bytes[i] = src->bytes[i];
+ dst->ios[i] = src->ios[i];
+ }
+}
+
+static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src)
+{
+ int i;
+
+ for (i = 0; i < BLKG_IOSTAT_NR; i++) {
+ dst->bytes[i] += src->bytes[i];
+ dst->ios[i] += src->ios[i];
+ }
+}
+
+static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src)
+{
+ int i;
+
+ for (i = 0; i < BLKG_IOSTAT_NR; i++) {
+ dst->bytes[i] -= src->bytes[i];
+ dst->ios[i] -= src->ios[i];
+ }
+}
+
+static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
+{
+ struct blkcg *blkcg = css_to_blkcg(css);
+ struct blkcg_gq *blkg;
+
+ rcu_read_lock();
+
+ hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
+ struct blkcg_gq *parent = blkg->parent;
+ struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu);
+ struct blkg_iostat cur, delta;
+ unsigned int seq;
+
+ /* fetch the current per-cpu values */
+ do {
+ seq = u64_stats_fetch_begin(&bisc->sync);
+ blkg_iostat_set(&cur, &bisc->cur);
+ } while (u64_stats_fetch_retry(&bisc->sync, seq));
+
+ /* propagate percpu delta to global */
+ u64_stats_update_begin(&blkg->iostat.sync);
+ blkg_iostat_set(&delta, &cur);
+ blkg_iostat_sub(&delta, &bisc->last);
+ blkg_iostat_add(&blkg->iostat.cur, &delta);
+ blkg_iostat_add(&bisc->last, &delta);
+ u64_stats_update_end(&blkg->iostat.sync);
+
+ /* propagate global delta to parent */
+ if (parent) {
+ u64_stats_update_begin(&parent->iostat.sync);
+ blkg_iostat_set(&delta, &blkg->iostat.cur);
+ blkg_iostat_sub(&delta, &blkg->iostat.last);
+ blkg_iostat_add(&parent->iostat.cur, &delta);
+ blkg_iostat_add(&blkg->iostat.last, &delta);
+ u64_stats_update_end(&parent->iostat.sync);
+ }
+ }
+
+ rcu_read_unlock();
+}
+
+/*
+ * The rstat algorithms intentionally don't handle the root cgroup to avoid
+ * incurring overhead when no cgroups are defined. For that reason,
+ * cgroup_rstat_flush in blkcg_print_stat does not actually fill out the
+ * iostat in the root cgroup's blkcg_gq.
+ *
+ * However, we would like to re-use the printing code between the root and
+ * non-root cgroups to the extent possible. For that reason, we simulate
+ * flushing the root cgroup's stats by explicitly filling in the iostat
+ * with disk level statistics.
+ */
+static void blkcg_fill_root_iostats(void)
+{
+ struct class_dev_iter iter;
+ struct device *dev;
+
+ class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
+ while ((dev = class_dev_iter_next(&iter))) {
+ struct gendisk *disk = dev_to_disk(dev);
+ struct hd_struct *part = disk_get_part(disk, 0);
+ struct blkcg_gq *blkg = blk_queue_root_blkg(disk->queue);
+ struct blkg_iostat tmp;
+ int cpu;
+
+ memset(&tmp, 0, sizeof(tmp));
+ for_each_possible_cpu(cpu) {
+ struct disk_stats *cpu_dkstats;
+
+ cpu_dkstats = per_cpu_ptr(part->dkstats, cpu);
+ tmp.ios[BLKG_IOSTAT_READ] +=
+ cpu_dkstats->ios[STAT_READ];
+ tmp.ios[BLKG_IOSTAT_WRITE] +=
+ cpu_dkstats->ios[STAT_WRITE];
+ tmp.ios[BLKG_IOSTAT_DISCARD] +=
+ cpu_dkstats->ios[STAT_DISCARD];
+ // convert sectors to bytes
+ tmp.bytes[BLKG_IOSTAT_READ] +=
+ cpu_dkstats->sectors[STAT_READ] << 9;
+ tmp.bytes[BLKG_IOSTAT_WRITE] +=
+ cpu_dkstats->sectors[STAT_WRITE] << 9;
+ tmp.bytes[BLKG_IOSTAT_DISCARD] +=
+ cpu_dkstats->sectors[STAT_DISCARD] << 9;
+
+ u64_stats_update_begin(&blkg->iostat.sync);
+ blkg_iostat_set(&blkg->iostat.cur, &tmp);
+ u64_stats_update_end(&blkg->iostat.sync);
+ }
+ }
+}
+
static int blkcg_print_stat(struct seq_file *sf, void *v)
{
struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
struct blkcg_gq *blkg;
- cgroup_rstat_flush(blkcg->css.cgroup);
+ if (!seq_css(sf)->parent)
+ blkcg_fill_root_iostats();
+ else
+ cgroup_rstat_flush(blkcg->css.cgroup);
+
rcu_read_lock();
hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
@@ -820,7 +930,6 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
static struct cftype blkcg_files[] = {
{
.name = "stat",
- .flags = CFTYPE_NOT_ON_ROOT,
.seq_show = blkcg_print_stat,
},
{ } /* terminate */
@@ -1101,77 +1210,6 @@ static int blkcg_can_attach(struct cgroup_taskset *tset)
return ret;
}
-static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src)
-{
- int i;
-
- for (i = 0; i < BLKG_IOSTAT_NR; i++) {
- dst->bytes[i] = src->bytes[i];
- dst->ios[i] = src->ios[i];
- }
-}
-
-static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src)
-{
- int i;
-
- for (i = 0; i < BLKG_IOSTAT_NR; i++) {
- dst->bytes[i] += src->bytes[i];
- dst->ios[i] += src->ios[i];
- }
-}
-
-static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src)
-{
- int i;
-
- for (i = 0; i < BLKG_IOSTAT_NR; i++) {
- dst->bytes[i] -= src->bytes[i];
- dst->ios[i] -= src->ios[i];
- }
-}
-
-static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
-{
- struct blkcg *blkcg = css_to_blkcg(css);
- struct blkcg_gq *blkg;
-
- rcu_read_lock();
-
- hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
- struct blkcg_gq *parent = blkg->parent;
- struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu);
- struct blkg_iostat cur, delta;
- unsigned seq;
-
- /* fetch the current per-cpu values */
- do {
- seq = u64_stats_fetch_begin(&bisc->sync);
- blkg_iostat_set(&cur, &bisc->cur);
- } while (u64_stats_fetch_retry(&bisc->sync, seq));
-
- /* propagate percpu delta to global */
- u64_stats_update_begin(&blkg->iostat.sync);
- blkg_iostat_set(&delta, &cur);
- blkg_iostat_sub(&delta, &bisc->last);
- blkg_iostat_add(&blkg->iostat.cur, &delta);
- blkg_iostat_add(&bisc->last, &delta);
- u64_stats_update_end(&blkg->iostat.sync);
-
- /* propagate global delta to parent */
- if (parent) {
- u64_stats_update_begin(&parent->iostat.sync);
- blkg_iostat_set(&delta, &blkg->iostat.cur);
- blkg_iostat_sub(&delta, &blkg->iostat.last);
- blkg_iostat_add(&parent->iostat.cur, &delta);
- blkg_iostat_add(&blkg->iostat.last, &delta);
- u64_stats_update_end(&parent->iostat.sync);
- }
- }
-
- rcu_read_unlock();
-}
-
static void blkcg_bind(struct cgroup_subsys_state *root_css)
{
int i;
diff --git a/block/blk-core.c b/block/blk-core.c
index 93104c7470e8..d9d632639bd1 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -960,9 +960,14 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio)
{
struct request_queue *q = bio->bi_disk->queue;
blk_status_t status = BLK_STS_IOERR;
+ struct blk_plug *plug;
might_sleep();
+ plug = blk_mq_plug(q, bio);
+ if (plug && plug->nowait)
+ bio->bi_opf |= REQ_NOWAIT;
+
/*
* For a REQ_NOWAIT based request, return -EOPNOTSUPP
* if queue is not a request based queue.
@@ -1802,6 +1807,7 @@ void blk_start_plug(struct blk_plug *plug)
INIT_LIST_HEAD(&plug->cb_list);
plug->rq_count = 0;
plug->multiple_queues = false;
+ plug->nowait = false;
/*
* Store ordering should not be needed here, since a potential
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 15ae0155ec07..6e1543c10493 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -219,7 +219,6 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
struct request *rq, *n;
unsigned long flags = 0;
struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx);
- struct blk_mq_hw_ctx *hctx;
blk_account_io_flush(flush_rq);
@@ -235,13 +234,11 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
if (fq->rq_status != BLK_STS_OK)
error = fq->rq_status;
- hctx = flush_rq->mq_hctx;
if (!q->elevator) {
- blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq);
- flush_rq->tag = -1;
+ flush_rq->tag = BLK_MQ_NO_TAG;
} else {
blk_mq_put_driver_tag(flush_rq);
- flush_rq->internal_tag = -1;
+ flush_rq->internal_tag = BLK_MQ_NO_TAG;
}
running = &fq->flush_queue[fq->flush_running_idx];
@@ -286,13 +283,8 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
if (fq->flush_pending_idx != fq->flush_running_idx || list_empty(pending))
return;
- /* C2 and C3
- *
- * For blk-mq + scheduling, we can risk having all driver tags
- * assigned to empty flushes, and we deadlock if we are expecting
- * other requests to make progress. Don't defer for that case.
- */
- if (!list_empty(&fq->flush_data_in_flight) && q->elevator &&
+ /* C2 and C3 */
+ if (!list_empty(&fq->flush_data_in_flight) &&
time_before(jiffies,
fq->flush_pending_since + FLUSH_PENDING_TIMEOUT))
return;
@@ -316,13 +308,10 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
flush_rq->mq_ctx = first_rq->mq_ctx;
flush_rq->mq_hctx = first_rq->mq_hctx;
- if (!q->elevator) {
- fq->orig_rq = first_rq;
+ if (!q->elevator)
flush_rq->tag = first_rq->tag;
- blk_mq_tag_set_rq(flush_rq->mq_hctx, first_rq->tag, flush_rq);
- } else {
+ else
flush_rq->internal_tag = first_rq->internal_tag;
- }
flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH;
flush_rq->cmd_flags |= (flags & REQ_DRV) | (flags & REQ_FAILFAST_MASK);
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 9df50fb507ca..57299f860d41 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -96,15 +96,7 @@ static void ioc_release_fn(struct work_struct *work)
{
struct io_context *ioc = container_of(work, struct io_context,
release_work);
- unsigned long flags;
-
- /*
- * Exiting icq may call into put_io_context() through elevator
- * which will trigger lockdep warning. The ioc's are guaranteed to
- * be different, use a different locking subclass here. Use
- * irqsave variant as there's no spin_lock_irq_nested().
- */
- spin_lock_irqsave_nested(&ioc->lock, flags, 1);
+ spin_lock_irq(&ioc->lock);
while (!hlist_empty(&ioc->icq_list)) {
struct io_cq *icq = hlist_entry(ioc->icq_list.first,
@@ -115,13 +107,27 @@ static void ioc_release_fn(struct work_struct *work)
ioc_destroy_icq(icq);
spin_unlock(&q->queue_lock);
} else {
- spin_unlock_irqrestore(&ioc->lock, flags);
- cpu_relax();
- spin_lock_irqsave_nested(&ioc->lock, flags, 1);
+ /* Make sure q and icq cannot be freed. */
+ rcu_read_lock();
+
+ /* Re-acquire the locks in the correct order. */
+ spin_unlock(&ioc->lock);
+ spin_lock(&q->queue_lock);
+ spin_lock(&ioc->lock);
+
+ /*
+ * The icq may have been destroyed when the ioc lock
+ * was released.
+ */
+ if (!(icq->flags & ICQ_DESTROYED))
+ ioc_destroy_icq(icq);
+
+ spin_unlock(&q->queue_lock);
+ rcu_read_unlock();
}
}
- spin_unlock_irqrestore(&ioc->lock, flags);
+ spin_unlock_irq(&ioc->lock);
kmem_cache_free(iocontext_cachep, ioc);
}
@@ -170,7 +176,6 @@ void put_io_context(struct io_context *ioc)
*/
void put_io_context_active(struct io_context *ioc)
{
- unsigned long flags;
struct io_cq *icq;
if (!atomic_dec_and_test(&ioc->active_ref)) {
@@ -178,19 +183,14 @@ void put_io_context_active(struct io_context *ioc)
return;
}
- /*
- * Need ioc lock to walk icq_list and q lock to exit icq. Perform
- * reverse double locking. Read comment in ioc_release_fn() for
- * explanation on the nested locking annotation.
- */
- spin_lock_irqsave_nested(&ioc->lock, flags, 1);
+ spin_lock_irq(&ioc->lock);
hlist_for_each_entry(icq, &ioc->icq_list, ioc_node) {
if (icq->flags & ICQ_EXITED)
continue;
ioc_exit_icq(icq);
}
- spin_unlock_irqrestore(&ioc->lock, flags);
+ spin_unlock_irq(&ioc->lock);
put_io_context(ioc);
}
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index cea5ee9be639..521c29b8ae29 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -1370,7 +1370,7 @@ static void ioc_timer_fn(struct timer_list *timer)
* should have woken up in the last period and expire idle iocgs.
*/
list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
- if (!waitqueue_active(&iocg->waitq) && iocg->abs_vdebt &&
+ if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
!iocg_is_idle(iocg))
continue;
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 5f2c429d4378..019e09bb9c0e 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -29,7 +29,7 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
struct request_queue *q = bdev_get_queue(bdev);
struct bio *bio = *biop;
unsigned int op;
- sector_t bs_mask;
+ sector_t bs_mask, part_offset = 0;
if (!q)
return -ENXIO;
@@ -54,9 +54,34 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
if (!nr_sects)
return -EINVAL;
+ /* In case the discard request is in a partition */
+ if (bdev->bd_partno)
+ part_offset = bdev->bd_part->start_sect;
+
while (nr_sects) {
- sector_t req_sects = min_t(sector_t, nr_sects,
- bio_allowed_max_sectors(q));
+ sector_t granularity_aligned_lba, req_sects;
+ sector_t sector_mapped = sector + part_offset;
+
+ granularity_aligned_lba = round_up(sector_mapped,
+ q->limits.discard_granularity >> SECTOR_SHIFT);
+
+ /*
+ * Check whether the discard bio starts at a discard_granularity
+ * aligned LBA,
+ * - If no: set (granularity_aligned_lba - sector_mapped) to
+ * bi_size of the first split bio, then the second bio will
+ * start at a discard_granularity aligned LBA on the device.
+ * - If yes: use bio_aligned_discard_max_sectors() as the max
+ * possible bi_size of the first split bio. Then when this bio
+ * is split in device drive, the split ones are very probably
+ * to be aligned to discard_granularity of the device's queue.
+ */
+ if (granularity_aligned_lba == sector_mapped)
+ req_sects = min_t(sector_t, nr_sects,
+ bio_aligned_discard_max_sectors(q));
+ else
+ req_sects = min_t(sector_t, nr_sects,
+ granularity_aligned_lba - sector_mapped);
WARN_ON_ONCE((req_sects << 9) > UINT_MAX);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 5196dc145270..6529e3aab001 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -472,7 +472,7 @@ static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio,
struct scatterlist *sglist,
struct scatterlist **sg)
{
- struct bio_vec uninitialized_var(bvec), bvprv = { NULL };
+ struct bio_vec bvec, bvprv = { NULL };
struct bvec_iter iter;
int nsegs = 0;
bool new_bio = false;
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 1c52e56a19b1..a19cdf159b75 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -96,7 +96,6 @@ static bool blk_mq_dispatch_hctx_list(struct list_head *rq_list)
struct request *rq;
LIST_HEAD(hctx_list);
unsigned int count = 0;
- bool ret;
list_for_each_entry(rq, rq_list, queuelist) {
if (rq->mq_hctx != hctx) {
@@ -108,8 +107,7 @@ static bool blk_mq_dispatch_hctx_list(struct list_head *rq_list)
list_splice_tail_init(rq_list, &hctx_list);
dispatch:
- ret = blk_mq_dispatch_rq_list(hctx, &hctx_list, count);
- return ret;
+ return blk_mq_dispatch_rq_list(hctx, &hctx_list, count);
}
#define BLK_MQ_BUDGET_DELAY 3 /* ms units */
@@ -227,7 +225,7 @@ static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
* restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
*
* Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to
- * to be run again. This is necessary to avoid starving flushes.
+ * be run again. This is necessary to avoid starving flushes.
*/
static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
{
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 3945c7f5b944..b1acac518c4e 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -101,18 +101,6 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
return atomic_read(&hctx->nr_active) < depth;
}
-/*
- * This helper should only be used for flush request to share tag
- * with the request cloned from, and both the two requests can't be
- * in flight at the same time. The caller has to make sure the tag
- * can't be freed.
- */
-static inline void blk_mq_tag_set_rq(struct blk_mq_hw_ctx *hctx,
- unsigned int tag, struct request *rq)
-{
- hctx->tags->rqs[tag] = rq;
-}
-
static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags,
unsigned int tag)
{
diff --git a/block/blk-mq.c b/block/blk-mq.c
index abcf590f6238..0015a1892153 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -277,26 +277,20 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
{
struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
struct request *rq = tags->static_rqs[tag];
- req_flags_t rq_flags = 0;
if (data->q->elevator) {
rq->tag = BLK_MQ_NO_TAG;
rq->internal_tag = tag;
} else {
- if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) {
- rq_flags = RQF_MQ_INFLIGHT;
- atomic_inc(&data->hctx->nr_active);
- }
rq->tag = tag;
rq->internal_tag = BLK_MQ_NO_TAG;
- data->hctx->tags->rqs[rq->tag] = rq;
}
/* csd/requeue_work/fifo_time is initialized before use */
rq->q = data->q;
rq->mq_ctx = data->ctx;
rq->mq_hctx = data->hctx;
- rq->rq_flags = rq_flags;
+ rq->rq_flags = 0;
rq->cmd_flags = data->cmd_flags;
if (data->flags & BLK_MQ_REQ_PREEMPT)
rq->rq_flags |= RQF_PREEMPT;
@@ -394,7 +388,7 @@ retry:
/*
* Give up the CPU and sleep for a random short time to ensure
* that thread using a realtime scheduling class are migrated
- * off the the CPU, and thus off the hctx that is going away.
+ * off the CPU, and thus off the hctx that is going away.
*/
msleep(3);
goto retry;
@@ -550,8 +544,7 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
blk_stat_add(rq, now);
}
- if (rq->internal_tag != BLK_MQ_NO_TAG)
- blk_mq_sched_completed_request(rq, now);
+ blk_mq_sched_completed_request(rq, now);
blk_account_io_done(rq, now);
@@ -877,10 +870,10 @@ static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq,
void *priv, bool reserved)
{
/*
- * If we find a request that is inflight and the queue matches,
+ * If we find a request that isn't idle and the queue matches,
* we know the queue is busy. Return false to stop the iteration.
*/
- if (rq->state == MQ_RQ_IN_FLIGHT && rq->q == hctx->queue) {
+ if (blk_mq_request_started(rq) && rq->q == hctx->queue) {
bool *busy = priv;
*busy = true;
@@ -1105,9 +1098,10 @@ static bool __blk_mq_get_driver_tag(struct request *rq)
{
struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags;
unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
- bool shared = blk_mq_tag_busy(rq->mq_hctx);
int tag;
+ blk_mq_tag_busy(rq->mq_hctx);
+
if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
bt = &rq->mq_hctx->tags->breserved_tags;
tag_offset = 0;
@@ -1120,19 +1114,23 @@ static bool __blk_mq_get_driver_tag(struct request *rq)
return false;
rq->tag = tag + tag_offset;
- if (shared) {
- rq->rq_flags |= RQF_MQ_INFLIGHT;
- atomic_inc(&rq->mq_hctx->nr_active);
- }
- rq->mq_hctx->tags->rqs[rq->tag] = rq;
return true;
}
static bool blk_mq_get_driver_tag(struct request *rq)
{
- if (rq->tag != BLK_MQ_NO_TAG)
- return true;
- return __blk_mq_get_driver_tag(rq);
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
+
+ if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_get_driver_tag(rq))
+ return false;
+
+ if ((hctx->flags & BLK_MQ_F_TAG_SHARED) &&
+ !(rq->rq_flags & RQF_MQ_INFLIGHT)) {
+ rq->rq_flags |= RQF_MQ_INFLIGHT;
+ atomic_inc(&hctx->nr_active);
+ }
+ hctx->tags->rqs[rq->tag] = rq;
+ return true;
}
static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
@@ -1387,30 +1385,28 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
if (nr_budgets)
nr_budgets--;
ret = q->mq_ops->queue_rq(hctx, &bd);
- if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
- blk_mq_handle_dev_resource(rq, list);
+ switch (ret) {
+ case BLK_STS_OK:
+ queued++;
break;
- } else if (ret == BLK_STS_ZONE_RESOURCE) {
+ case BLK_STS_RESOURCE:
+ case BLK_STS_DEV_RESOURCE:
+ blk_mq_handle_dev_resource(rq, list);
+ goto out;
+ case BLK_STS_ZONE_RESOURCE:
/*
* Move the request to zone_list and keep going through
* the dispatch list to find more requests the drive can
* accept.
*/
blk_mq_handle_zone_resource(rq, &zone_list);
- if (list_empty(list))
- break;
- continue;
- }
-
- if (unlikely(ret != BLK_STS_OK)) {
+ break;
+ default:
errors++;
blk_mq_end_request(rq, BLK_STS_IOERR);
- continue;
}
-
- queued++;
} while (!list_empty(list));
-
+out:
if (!list_empty(&zone_list))
list_splice_tail_init(&zone_list, list);
@@ -2903,7 +2899,7 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q)
struct blk_mq_tag_set *set = q->tag_set;
mutex_lock(&set->tag_list_lock);
- list_del_rcu(&q->tag_set_list);
+ list_del(&q->tag_set_list);
if (list_is_singular(&set->tag_list)) {
/* just transitioned to unshared */
set->flags &= ~BLK_MQ_F_TAG_SHARED;
@@ -2930,7 +2926,7 @@ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
}
if (set->flags & BLK_MQ_F_TAG_SHARED)
queue_set_hctx_shared(q, true);
- list_add_tail_rcu(&q->tag_set_list, &set->tag_list);
+ list_add_tail(&q->tag_set_list, &set->tag_list);
mutex_unlock(&set->tag_list_lock);
}
diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
index 18f3eab9f768..656460636ad3 100644
--- a/block/blk-rq-qos.c
+++ b/block/blk-rq-qos.c
@@ -273,6 +273,8 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data,
if (data.got_token)
break;
if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) {
+ finish_wait(&rqw->wait, &data.wq);
+
/*
* We raced with wbt_wake_function() getting a token,
* which means we now have two. Put our local token
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 3a1ac6434758..1b8de0417fc1 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -68,7 +68,7 @@ ssize_t part_timeout_store(struct device *dev, struct device_attribute *attr,
#endif /* CONFIG_FAIL_IO_TIMEOUT */
/**
- * blk_abort_request -- Request request recovery for the specified command
+ * blk_abort_request - Request recovery for the specified command
* @req: pointer to the request of interest
*
* This function requests that the block layer start recovery for the
@@ -88,11 +88,29 @@ void blk_abort_request(struct request *req)
}
EXPORT_SYMBOL_GPL(blk_abort_request);
+static unsigned long blk_timeout_mask __read_mostly;
+
+static int __init blk_timeout_init(void)
+{
+ blk_timeout_mask = roundup_pow_of_two(HZ) - 1;
+ return 0;
+}
+
+late_initcall(blk_timeout_init);
+
+/*
+ * Just a rough estimate, we don't care about specific values for timeouts.
+ */
+static inline unsigned long blk_round_jiffies(unsigned long j)
+{
+ return (j + blk_timeout_mask) + 1;
+}
+
unsigned long blk_rq_timeout(unsigned long timeout)
{
unsigned long maxt;
- maxt = round_jiffies_up(jiffies + BLK_MAX_TIMEOUT);
+ maxt = blk_round_jiffies(jiffies + BLK_MAX_TIMEOUT);
if (time_after(timeout, maxt))
timeout = maxt;
@@ -129,7 +147,7 @@ void blk_add_timer(struct request *req)
* than an existing one, modify the timer. Round up to next nearest
* second.
*/
- expiry = blk_rq_timeout(round_jiffies_up(expiry));
+ expiry = blk_rq_timeout(blk_round_jiffies(expiry));
if (!timer_pending(&q->timeout) ||
time_before(expiry, q->timeout.expires)) {
diff --git a/block/blk.h b/block/blk.h
index 94f7c084f68f..49e2928a1632 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -25,11 +25,6 @@ struct blk_flush_queue {
struct list_head flush_data_in_flight;
struct request *flush_rq;
- /*
- * flush_rq shares tag with this rq, both can't be active
- * at the same time
- */
- struct request *orig_rq;
struct lock_class_key key;
spinlock_t mq_flush_lock;
};
@@ -270,6 +265,20 @@ static inline unsigned int bio_allowed_max_sectors(struct request_queue *q)
}
/*
+ * The max bio size which is aligned to q->limits.discard_granularity. This
+ * is a hint to split large discard bio in generic block layer, then if device
+ * driver needs to split the discard bio into smaller ones, their bi_size can
+ * be very probably and easily aligned to discard_granularity of the device's
+ * queue.
+ */
+static inline unsigned int bio_aligned_discard_max_sectors(
+ struct request_queue *q)
+{
+ return round_down(UINT_MAX, q->limits.discard_granularity) >>
+ SECTOR_SHIFT;
+}
+
+/*
* Internal io_context interface
*/
void get_io_context(struct io_context *ioc);
diff --git a/block/elevator.c b/block/elevator.c
index 4eab3d70e880..90ed7a28c21d 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -95,8 +95,8 @@ static inline bool elv_support_features(unsigned int elv_features,
* @name: Elevator name to test
* @required_features: Features that the elevator must provide
*
- * Return true is the elevator @e name matches @name and if @e provides all the
- * the feratures spcified by @required_features.
+ * Return true if the elevator @e name matches @name and if @e provides all
+ * the features specified by @required_features.
*/
static bool elevator_match(const struct elevator_type *e, const char *name,
unsigned int required_features)
diff --git a/block/genhd.c b/block/genhd.c
index 60ae4e1b4d38..99c64641c314 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -38,8 +38,6 @@ static struct kobject *block_depr;
static DEFINE_SPINLOCK(ext_devt_lock);
static DEFINE_IDR(ext_devt_idr);
-static const struct device_type disk_type;
-
static void disk_check_events(struct disk_events *ev,
unsigned int *clearing_ptr);
static void disk_alloc_events(struct gendisk *disk);
@@ -1587,7 +1585,7 @@ static char *block_devnode(struct device *dev, umode_t *mode,
return NULL;
}
-static const struct device_type disk_type = {
+const struct device_type disk_type = {
.name = "disk",
.groups = disk_attr_groups,
.release = disk_release,
@@ -1775,7 +1773,7 @@ EXPORT_SYMBOL(__alloc_disk_node);
/**
* get_disk_and_module - increments the gendisk and gendisk fops module refcount
- * @disk: the struct gendisk to to increment the refcount for
+ * @disk: the struct gendisk to increment the refcount for
*
* This increments the refcount for the struct gendisk, and the gendisk's
* fops module owner.
@@ -1804,7 +1802,7 @@ EXPORT_SYMBOL(get_disk_and_module);
/**
* put_disk - decrements the gendisk refcount
- * @disk: the struct gendisk to to decrement the refcount for
+ * @disk: the struct gendisk to decrement the refcount for
*
* This decrements the refcount for the struct gendisk. When this reaches 0
* we'll have disk_release() called.
@@ -1821,7 +1819,7 @@ EXPORT_SYMBOL(put_disk);
/**
* put_disk_and_module - decrements the module and gendisk refcount
- * @disk: the struct gendisk to to decrement the refcount for
+ * @disk: the struct gendisk to decrement the refcount for
*
* This is a counterpart of get_disk_and_module() and thus also of
* get_gendisk().
@@ -2056,18 +2054,12 @@ void disk_flush_events(struct gendisk *disk, unsigned int mask)
*/
unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask)
{
- const struct block_device_operations *bdops = disk->fops;
struct disk_events *ev = disk->ev;
unsigned int pending;
unsigned int clearing = mask;
- if (!ev) {
- /* for drivers still using the old ->media_changed method */
- if ((mask & DISK_EVENT_MEDIA_CHANGE) &&
- bdops->media_changed && bdops->media_changed(disk))
- return DISK_EVENT_MEDIA_CHANGE;
+ if (!ev)
return 0;
- }
disk_block_events(disk);
diff --git a/block/partitions/core.c b/block/partitions/core.c
index 78951e33b2d7..e62a98a8eeb7 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -619,8 +619,6 @@ int blk_drop_partitions(struct block_device *bdev)
struct disk_part_iter piter;
struct hd_struct *part;
- if (!disk_part_scan_enabled(bdev->bd_disk))
- return 0;
if (bdev->bd_part_count)
return -EBUSY;