Diffstat (limited to 'block')
-rw-r--r--  block/Makefile               2
-rw-r--r--  block/bfq-iosched.c         14
-rw-r--r--  block/bio-integrity.c       84
-rw-r--r--  block/bio.c                113
-rw-r--r--  block/blk-cgroup-rwstat.h    5
-rw-r--r--  block/blk-cgroup.c          11
-rw-r--r--  block/blk-cgroup.h          10
-rw-r--r--  block/blk-core.c            22
-rw-r--r--  block/blk-ia-ranges.c        4
-rw-r--r--  block/blk-integrity.c       14
-rw-r--r--  block/blk-iocost.c          14
-rw-r--r--  block/blk-iolatency.c        6
-rw-r--r--  block/blk-map.c            128
-rw-r--r--  block/blk-merge.c          184
-rw-r--r--  block/blk-mq-cpumap.c       36
-rw-r--r--  block/blk-mq-debugfs.c      27
-rw-r--r--  block/blk-mq-pci.c          46
-rw-r--r--  block/blk-mq-sched.c         3
-rw-r--r--  block/blk-mq-sysfs.c        40
-rw-r--r--  block/blk-mq-tag.c          41
-rw-r--r--  block/blk-mq-virtio.c       46
-rw-r--r--  block/blk-mq.c              92
-rw-r--r--  block/blk-mq.h              11
-rw-r--r--  block/blk-pm.c               2
-rw-r--r--  block/blk-rq-qos.c          12
-rw-r--r--  block/blk-settings.c        57
-rw-r--r--  block/blk-sysfs.c          143
-rw-r--r--  block/blk-throttle.c         5
-rw-r--r--  block/blk-zoned.c          146
-rw-r--r--  block/blk.h                 42
-rw-r--r--  block/bsg-lib.c              2
-rw-r--r--  block/elevator.c            47
-rw-r--r--  block/elevator.h             2
-rw-r--r--  block/fops.c                50
-rw-r--r--  block/genhd.c               63
-rw-r--r--  block/kyber-iosched.c        2
-rw-r--r--  block/mq-deadline.c          2
-rw-r--r--  block/partitions/efi.c       2
-rw-r--r--  block/partitions/ldm.h       2
-rw-r--r--  block/partitions/mac.c      18
40 files changed, 743 insertions, 807 deletions
diff --git a/block/Makefile b/block/Makefile
index ddfd21c1a9ff..33748123710b 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -27,8 +27,6 @@ bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o
obj-$(CONFIG_IOSCHED_BFQ) += bfq.o
obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o
-obj-$(CONFIG_BLK_MQ_PCI) += blk-mq-pci.o
-obj-$(CONFIG_BLK_MQ_VIRTIO) += blk-mq-virtio.o
obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o
obj-$(CONFIG_BLK_WBT) += blk-wbt.o
obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 95dd7b795935..167542201603 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -6844,16 +6844,24 @@ static struct bfq_queue *bfq_waker_bfqq(struct bfq_queue *bfqq)
if (new_bfqq == waker_bfqq) {
/*
* If waker_bfqq is in the merge chain, and current
- * is the only procress.
+ * is the only process, waker_bfqq can be freed.
*/
if (bfqq_process_refs(waker_bfqq) == 1)
return NULL;
- break;
+
+ return waker_bfqq;
}
new_bfqq = new_bfqq->new_bfqq;
}
+ /*
+ * If waker_bfqq is not in the merge chain, and its process reference
+ * is 0, waker_bfqq can be freed.
+ */
+ if (bfqq_process_refs(waker_bfqq) == 0)
+ return NULL;
+
return waker_bfqq;
}
@@ -7614,7 +7622,7 @@ static ssize_t bfq_low_latency_store(struct elevator_queue *e,
#define BFQ_ATTR(name) \
__ATTR(name, 0644, bfq_##name##_show, bfq_##name##_store)
-static struct elv_fs_entry bfq_attrs[] = {
+static const struct elv_fs_entry bfq_attrs[] = {
BFQ_ATTR(fifo_expire_sync),
BFQ_ATTR(fifo_expire_async),
BFQ_ATTR(back_seek_max),
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 2a4bd6611692..5d81ad9a3d20 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -118,17 +118,18 @@ static void bio_integrity_unpin_bvec(struct bio_vec *bv, int nr_vecs,
static void bio_integrity_uncopy_user(struct bio_integrity_payload *bip)
{
- unsigned short nr_vecs = bip->bip_max_vcnt - 1;
- struct bio_vec *copy = &bip->bip_vec[1];
- size_t bytes = bip->bip_iter.bi_size;
- struct iov_iter iter;
+ unsigned short orig_nr_vecs = bip->bip_max_vcnt - 1;
+ struct bio_vec *orig_bvecs = &bip->bip_vec[1];
+ struct bio_vec *bounce_bvec = &bip->bip_vec[0];
+ size_t bytes = bounce_bvec->bv_len;
+ struct iov_iter orig_iter;
int ret;
- iov_iter_bvec(&iter, ITER_DEST, copy, nr_vecs, bytes);
- ret = copy_to_iter(bvec_virt(bip->bip_vec), bytes, &iter);
+ iov_iter_bvec(&orig_iter, ITER_DEST, orig_bvecs, orig_nr_vecs, bytes);
+ ret = copy_to_iter(bvec_virt(bounce_bvec), bytes, &orig_iter);
WARN_ON_ONCE(ret != bytes);
- bio_integrity_unpin_bvec(copy, nr_vecs, true);
+ bio_integrity_unpin_bvec(orig_bvecs, orig_nr_vecs, true);
}
/**
@@ -301,16 +302,15 @@ static unsigned int bvec_from_pages(struct bio_vec *bvec, struct page **pages,
return nr_bvecs;
}
-int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes)
+int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter)
{
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
unsigned int align = blk_lim_dma_alignment_and_pad(&q->limits);
struct page *stack_pages[UIO_FASTIOV], **pages = stack_pages;
struct bio_vec stack_vec[UIO_FASTIOV], *bvec = stack_vec;
+ size_t offset, bytes = iter->count;
unsigned int direction, nr_bvecs;
- struct iov_iter iter;
int ret, nr_vecs;
- size_t offset;
bool copy;
if (bio_integrity(bio))
@@ -323,8 +323,7 @@ int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes)
else
direction = ITER_SOURCE;
- iov_iter_ubuf(&iter, direction, ubuf, bytes);
- nr_vecs = iov_iter_npages(&iter, BIO_MAX_VECS + 1);
+ nr_vecs = iov_iter_npages(iter, BIO_MAX_VECS + 1);
if (nr_vecs > BIO_MAX_VECS)
return -E2BIG;
if (nr_vecs > UIO_FASTIOV) {
@@ -334,8 +333,8 @@ int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes)
pages = NULL;
}
- copy = !iov_iter_is_aligned(&iter, align, align);
- ret = iov_iter_extract_pages(&iter, &pages, bytes, nr_vecs, 0, &offset);
+ copy = !iov_iter_is_aligned(iter, align, align);
+ ret = iov_iter_extract_pages(iter, &pages, bytes, nr_vecs, 0, &offset);
if (unlikely(ret < 0))
goto free_bvec;
@@ -365,6 +364,55 @@ free_bvec:
return ret;
}
+static void bio_uio_meta_to_bip(struct bio *bio, struct uio_meta *meta)
+{
+ struct bio_integrity_payload *bip = bio_integrity(bio);
+
+ if (meta->flags & IO_INTEGRITY_CHK_GUARD)
+ bip->bip_flags |= BIP_CHECK_GUARD;
+ if (meta->flags & IO_INTEGRITY_CHK_APPTAG)
+ bip->bip_flags |= BIP_CHECK_APPTAG;
+ if (meta->flags & IO_INTEGRITY_CHK_REFTAG)
+ bip->bip_flags |= BIP_CHECK_REFTAG;
+
+ bip->app_tag = meta->app_tag;
+}
+
+int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta)
+{
+ struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
+ unsigned int integrity_bytes;
+ int ret;
+ struct iov_iter it;
+
+ if (!bi)
+ return -EINVAL;
+ /*
+ * original meta iterator can be bigger.
+ * process integrity info corresponding to current data buffer only.
+ */
+ it = meta->iter;
+ integrity_bytes = bio_integrity_bytes(bi, bio_sectors(bio));
+ if (it.count < integrity_bytes)
+ return -EINVAL;
+
+ /* should fit into two bytes */
+ BUILD_BUG_ON(IO_INTEGRITY_VALID_FLAGS >= (1 << 16));
+
+ if (meta->flags && (meta->flags & ~IO_INTEGRITY_VALID_FLAGS))
+ return -EINVAL;
+
+ it.count = integrity_bytes;
+ ret = bio_integrity_map_user(bio, &it);
+ if (!ret) {
+ bio_uio_meta_to_bip(bio, meta);
+ bip_set_seed(bio_integrity(bio), meta->seed);
+ iov_iter_advance(&meta->iter, integrity_bytes);
+ meta->seed += bio_integrity_intervals(bi, bio_sectors(bio));
+ }
+ return ret;
+}
+
/**
* bio_integrity_prep - Prepare bio for integrity I/O
* @bio: bio to prepare
@@ -435,6 +483,11 @@ bool bio_integrity_prep(struct bio *bio)
if (bi->csum_type == BLK_INTEGRITY_CSUM_IP)
bip->bip_flags |= BIP_IP_CHECKSUM;
+ /* describe what tags to check in payload */
+ if (bi->csum_type)
+ bip->bip_flags |= BIP_CHECK_GUARD;
+ if (bi->flags & BLK_INTEGRITY_REF_TAG)
+ bip->bip_flags |= BIP_CHECK_REFTAG;
if (bio_integrity_add_page(bio, virt_to_page(buf), len,
offset_in_page(buf)) < len) {
printk(KERN_ERR "could not attach integrity payload\n");
@@ -559,7 +612,8 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
bip->bip_vec = bip_src->bip_vec;
bip->bip_iter = bip_src->bip_iter;
- bip->bip_flags = bip_src->bip_flags & ~BIP_BLOCK_INTEGRITY;
+ bip->bip_flags = bip_src->bip_flags & BIP_CLONE_FLAGS;
+ bip->app_tag = bip_src->app_tag;
return 0;
}
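
Editorial note: a minimal usage sketch of the new bio_integrity_map_iter() interface, not part of this patch. The helper name attach_user_pi and its parameters are hypothetical; it assumes the caller owns a user PI buffer that may span several bios, of which bio_integrity_map_iter() consumes only the slice covering the current bio and then advances meta.iter/meta.seed.

static int attach_user_pi(struct bio *bio, void __user *pi_buf,
			  size_t pi_len, u16 app_tag, u64 seed)
{
	struct uio_meta meta = {
		.flags   = IO_INTEGRITY_CHK_GUARD | IO_INTEGRITY_CHK_REFTAG,
		.app_tag = app_tag,
		.seed    = seed,
	};

	/* iterator over the whole user PI buffer; the direction is assumed
	 * to match the data direction of @bio (ITER_SOURCE for a write) */
	iov_iter_ubuf(&meta.iter, ITER_SOURCE, pi_buf, pi_len);

	/* maps only the integrity bytes for this bio and advances the
	 * iterator and reference-tag seed for the next one */
	return bio_integrity_map_iter(bio, &meta);
}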
diff --git a/block/bio.c b/block/bio.c
index d5bdc31d88d3..6ac5983ba51e 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -77,7 +77,7 @@ struct bio_slab {
struct kmem_cache *slab;
unsigned int slab_ref;
unsigned int slab_size;
- char name[8];
+ char name[12];
};
static DEFINE_MUTEX(bio_slab_lock);
static DEFINE_XARRAY(bio_slabs);
@@ -946,8 +946,11 @@ static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page,
/*
* Try to merge a page into a segment, while obeying the hardware segment
- * size limit. This is not for normal read/write bios, but for passthrough
- * or Zone Append operations that we can't split.
+ * size limit.
+ *
+ * This is kept around for the integrity metadata, which still tries
+ * to build the initial bio to the hardware limit and doesn't have proper
+ * helpers to split. Hopefully this will go away soon.
*/
bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv,
struct page *page, unsigned len, unsigned offset,
@@ -965,106 +968,6 @@ bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv,
}
/**
- * bio_add_hw_page - attempt to add a page to a bio with hw constraints
- * @q: the target queue
- * @bio: destination bio
- * @page: page to add
- * @len: vec entry length
- * @offset: vec entry offset
- * @max_sectors: maximum number of sectors that can be added
- * @same_page: return if the segment has been merged inside the same page
- *
- * Add a page to a bio while respecting the hardware max_sectors, max_segment
- * and gap limitations.
- */
-int bio_add_hw_page(struct request_queue *q, struct bio *bio,
- struct page *page, unsigned int len, unsigned int offset,
- unsigned int max_sectors, bool *same_page)
-{
- unsigned int max_size = max_sectors << SECTOR_SHIFT;
-
- if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
- return 0;
-
- len = min3(len, max_size, queue_max_segment_size(q));
- if (len > max_size - bio->bi_iter.bi_size)
- return 0;
-
- if (bio->bi_vcnt > 0) {
- struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
-
- if (bvec_try_merge_hw_page(q, bv, page, len, offset,
- same_page)) {
- bio->bi_iter.bi_size += len;
- return len;
- }
-
- if (bio->bi_vcnt >=
- min(bio->bi_max_vecs, queue_max_segments(q)))
- return 0;
-
- /*
- * If the queue doesn't support SG gaps and adding this segment
- * would create a gap, disallow it.
- */
- if (bvec_gap_to_prev(&q->limits, bv, offset))
- return 0;
- }
-
- bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, offset);
- bio->bi_vcnt++;
- bio->bi_iter.bi_size += len;
- return len;
-}
-
-/**
- * bio_add_hw_folio - attempt to add a folio to a bio with hw constraints
- * @q: the target queue
- * @bio: destination bio
- * @folio: folio to add
- * @len: vec entry length
- * @offset: vec entry offset in the folio
- * @max_sectors: maximum number of sectors that can be added
- * @same_page: return if the segment has been merged inside the same folio
- *
- * Add a folio to a bio while respecting the hardware max_sectors, max_segment
- * and gap limitations.
- */
-int bio_add_hw_folio(struct request_queue *q, struct bio *bio,
- struct folio *folio, size_t len, size_t offset,
- unsigned int max_sectors, bool *same_page)
-{
- if (len > UINT_MAX || offset > UINT_MAX)
- return 0;
- return bio_add_hw_page(q, bio, folio_page(folio, 0), len, offset,
- max_sectors, same_page);
-}
-
-/**
- * bio_add_pc_page - attempt to add page to passthrough bio
- * @q: the target queue
- * @bio: destination bio
- * @page: page to add
- * @len: vec entry length
- * @offset: vec entry offset
- *
- * Attempt to add a page to the bio_vec maplist. This can fail for a
- * number of reasons, such as the bio being full or target block device
- * limitations. The target block device must allow bio's up to PAGE_SIZE,
- * so it is always possible to add a single page to an empty bio.
- *
- * This should only be used by passthrough bios.
- */
-int bio_add_pc_page(struct request_queue *q, struct bio *bio,
- struct page *page, unsigned int len, unsigned int offset)
-{
- bool same_page = false;
- return bio_add_hw_page(q, bio, page, len, offset,
- queue_max_hw_sectors(q), &same_page);
-}
-EXPORT_SYMBOL(bio_add_pc_page);
-
-/**
* __bio_add_page - add page(s) to a bio in a new segment
* @bio: destination bio
* @page: start page to add
@@ -1707,6 +1610,10 @@ EXPORT_SYMBOL(bio_split);
*/
void bio_trim(struct bio *bio, sector_t offset, sector_t size)
{
+ /* We should never trim an atomic write */
+ if (WARN_ON_ONCE(bio->bi_opf & REQ_ATOMIC && size))
+ return;
+
if (WARN_ON_ONCE(offset > BIO_MAX_SECTORS || size > BIO_MAX_SECTORS ||
offset + size > bio_sectors(bio)))
return;
diff --git a/block/blk-cgroup-rwstat.h b/block/blk-cgroup-rwstat.h
index 022527b0b043..703a16fe1404 100644
--- a/block/blk-cgroup-rwstat.h
+++ b/block/blk-cgroup-rwstat.h
@@ -52,7 +52,7 @@ void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol,
/**
* blkg_rwstat_add - add a value to a blkg_rwstat
* @rwstat: target blkg_rwstat
- * @op: REQ_OP and flags
+ * @opf: REQ_OP and flags
* @val: value to add
*
* Add @val to @rwstat. The counters are chosen according to @rw. The
@@ -83,8 +83,9 @@ static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
/**
* blkg_rwstat_read - read the current values of a blkg_rwstat
* @rwstat: blkg_rwstat to read
+ * @result: where to put the current values
*
- * Read the current snapshot of @rwstat and return it in the aux counts.
+ * Read the current snapshot of @rwstat and return it in the @result counts.
*/
static inline void blkg_rwstat_read(struct blkg_rwstat *rwstat,
struct blkg_rwstat_sample *result)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 45a395862fbc..9ed93d91d754 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1138,6 +1138,7 @@ static void blkcg_fill_root_iostats(void)
blkg_iostat_set(&blkg->iostat.cur, &tmp);
u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
}
+ class_dev_iter_exit(&iter);
}
static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s)
@@ -1545,6 +1546,7 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
struct request_queue *q = disk->queue;
struct blkg_policy_data *pd_prealloc = NULL;
struct blkcg_gq *blkg, *pinned_blkg = NULL;
+ unsigned int memflags;
int ret;
if (blkcg_policy_enabled(q, pol))
@@ -1559,7 +1561,7 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
return -EINVAL;
if (queue_is_mq(q))
- blk_mq_freeze_queue(q);
+ memflags = blk_mq_freeze_queue(q);
retry:
spin_lock_irq(&q->queue_lock);
@@ -1623,7 +1625,7 @@ retry:
spin_unlock_irq(&q->queue_lock);
out:
if (queue_is_mq(q))
- blk_mq_unfreeze_queue(q);
+ blk_mq_unfreeze_queue(q, memflags);
if (pinned_blkg)
blkg_put(pinned_blkg);
if (pd_prealloc)
@@ -1667,12 +1669,13 @@ void blkcg_deactivate_policy(struct gendisk *disk,
{
struct request_queue *q = disk->queue;
struct blkcg_gq *blkg;
+ unsigned int memflags;
if (!blkcg_policy_enabled(q, pol))
return;
if (queue_is_mq(q))
- blk_mq_freeze_queue(q);
+ memflags = blk_mq_freeze_queue(q);
mutex_lock(&q->blkcg_mutex);
spin_lock_irq(&q->queue_lock);
@@ -1696,7 +1699,7 @@ void blkcg_deactivate_policy(struct gendisk *disk,
mutex_unlock(&q->blkcg_mutex);
if (queue_is_mq(q))
- blk_mq_unfreeze_queue(q);
+ blk_mq_unfreeze_queue(q, memflags);
}
EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
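
Editorial note: the pattern repeated throughout this series is that blk_mq_freeze_queue() now returns a memalloc_noio cookie which must be passed back to blk_mq_unfreeze_queue(). A minimal sketch of the updated calling convention (illustration only, function name hypothetical):

static void reconfigure_queue(struct request_queue *q)
{
	unsigned int memflags;

	/* freezing also enters a memalloc_noio scope; keep the cookie */
	memflags = blk_mq_freeze_queue(q);

	/* ... modify queue state while no I/O is in flight ... */

	/* unfreezing restores the allocation context from the cookie */
	blk_mq_unfreeze_queue(q, memflags);
}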
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index b9e3265c1eb3..2c4663bd993a 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -225,7 +225,9 @@ void blkg_conf_exit(struct blkg_conf_ctx *ctx);
/**
* bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg
- * @return: true if this bio needs to be submitted with the root blkg context.
+ * @bio: the target &bio
+ *
+ * Return: true if this bio needs to be submitted with the root blkg context.
*
* In order to avoid priority inversions we sometimes need to issue a bio as if
* it were attached to the root blkg, and then backcharge to the actual owning
@@ -245,7 +247,7 @@ static inline bool bio_issue_as_root_blkg(struct bio *bio)
* @q: request_queue of interest
*
* Lookup blkg for the @blkcg - @q pair.
-
+ *
* Must be called in a RCU critical section.
*/
static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg,
@@ -268,7 +270,7 @@ static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg,
}
/**
- * blkg_to_pdata - get policy private data
+ * blkg_to_pd - get policy private data
* @blkg: blkg of interest
* @pol: policy of interest
*
@@ -287,7 +289,7 @@ static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg,
}
/**
- * pdata_to_blkg - get blkg associated with policy private data
+ * pd_to_blkg - get blkg associated with policy private data
* @pd: policy private data of interest
*
* @pd is policy private data. Determine the blkg it's associated with.
diff --git a/block/blk-core.c b/block/blk-core.c
index 666efe8fa202..d6c4fa3943b5 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -430,7 +430,6 @@ struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id)
refcount_set(&q->refs, 1);
mutex_init(&q->debugfs_mutex);
mutex_init(&q->sysfs_lock);
- mutex_init(&q->sysfs_dir_lock);
mutex_init(&q->limits_lock);
mutex_init(&q->rq_qos_mutex);
spin_lock_init(&q->queue_lock);
@@ -629,8 +628,14 @@ static void __submit_bio(struct bio *bio)
blk_mq_submit_bio(bio);
} else if (likely(bio_queue_enter(bio) == 0)) {
struct gendisk *disk = bio->bi_bdev->bd_disk;
-
- disk->fops->submit_bio(bio);
+
+ if ((bio->bi_opf & REQ_POLLED) &&
+ !(disk->queue->limits.features & BLK_FEAT_POLL)) {
+ bio->bi_status = BLK_STS_NOTSUPP;
+ bio_endio(bio);
+ } else {
+ disk->fops->submit_bio(bio);
+ }
blk_queue_exit(disk->queue);
}
@@ -805,12 +810,6 @@ void submit_bio_noacct(struct bio *bio)
}
}
- if (!(q->limits.features & BLK_FEAT_POLL) &&
- (bio->bi_opf & REQ_POLLED)) {
- bio_clear_polled(bio);
- goto not_supported;
- }
-
switch (bio_op(bio)) {
case REQ_OP_READ:
break;
@@ -935,7 +934,7 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags)
return 0;
q = bdev_get_queue(bdev);
- if (cookie == BLK_QC_T_NONE || !(q->limits.features & BLK_FEAT_POLL))
+ if (cookie == BLK_QC_T_NONE)
return 0;
blk_flush_plug(current->plug, false);
@@ -956,7 +955,8 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags)
} else {
struct gendisk *disk = q->disk;
- if (disk && disk->fops->poll_bio)
+ if ((q->limits.features & BLK_FEAT_POLL) && disk &&
+ disk->fops->poll_bio)
ret = disk->fops->poll_bio(bio, iob, flags);
}
blk_queue_exit(q);
diff --git a/block/blk-ia-ranges.c b/block/blk-ia-ranges.c
index c9eb4241e048..d479f5481b66 100644
--- a/block/blk-ia-ranges.c
+++ b/block/blk-ia-ranges.c
@@ -111,7 +111,6 @@ int disk_register_independent_access_ranges(struct gendisk *disk)
struct request_queue *q = disk->queue;
int i, ret;
- lockdep_assert_held(&q->sysfs_dir_lock);
lockdep_assert_held(&q->sysfs_lock);
if (!iars)
@@ -155,7 +154,6 @@ void disk_unregister_independent_access_ranges(struct gendisk *disk)
struct blk_independent_access_ranges *iars = disk->ia_ranges;
int i;
- lockdep_assert_held(&q->sysfs_dir_lock);
lockdep_assert_held(&q->sysfs_lock);
if (!iars)
@@ -289,7 +287,6 @@ void disk_set_independent_access_ranges(struct gendisk *disk,
{
struct request_queue *q = disk->queue;
- mutex_lock(&q->sysfs_dir_lock);
mutex_lock(&q->sysfs_lock);
if (iars && !disk_check_ia_ranges(disk, iars)) {
kfree(iars);
@@ -313,6 +310,5 @@ void disk_set_independent_access_ranges(struct gendisk *disk,
disk_register_independent_access_ranges(disk);
unlock:
mutex_unlock(&q->sysfs_lock);
- mutex_unlock(&q->sysfs_dir_lock);
}
EXPORT_SYMBOL_GPL(disk_set_independent_access_ranges);
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index b180cac61a9d..a1678f0a9f81 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -115,8 +115,16 @@ EXPORT_SYMBOL(blk_rq_map_integrity_sg);
int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf,
ssize_t bytes)
{
- int ret = bio_integrity_map_user(rq->bio, ubuf, bytes);
+ int ret;
+ struct iov_iter iter;
+ unsigned int direction;
+ if (op_is_write(req_op(rq)))
+ direction = ITER_DEST;
+ else
+ direction = ITER_SOURCE;
+ iov_iter_ubuf(&iter, direction, ubuf, bytes);
+ ret = bio_integrity_map_user(rq->bio, &iter);
if (ret)
return ret;
@@ -218,9 +226,7 @@ static ssize_t flag_store(struct device *dev, const char *page, size_t count,
else
lim.integrity.flags |= flag;
- blk_mq_freeze_queue(q);
- err = queue_limits_commit_update(q, &lim);
- blk_mq_unfreeze_queue(q);
+ err = queue_limits_commit_update_frozen(q, &lim);
if (err)
return err;
return count;
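
Editorial note: the conversion above shows the common shape of the new helper, where an open-coded freeze/commit/unfreeze sequence collapses into queue_limits_commit_update_frozen(). A hedged sketch of a typical caller (function name and the limit being changed are illustrative only):

static int update_some_limit(struct request_queue *q, unsigned int new_max)
{
	struct queue_limits lim;

	lim = queue_limits_start_update(q);
	lim.max_sectors = new_max;	/* example field only */
	/* freezes the queue, commits the limits, and unfreezes in one call */
	return queue_limits_commit_update_frozen(q, &lim);
}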
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index a5894ec9696e..65a1d4427ccf 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -3224,6 +3224,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
u32 qos[NR_QOS_PARAMS];
bool enable, user;
char *body, *p;
+ unsigned int memflags;
int ret;
blkg_conf_init(&ctx, input);
@@ -3247,7 +3248,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
ioc = q_to_ioc(disk->queue);
}
- blk_mq_freeze_queue(disk->queue);
+ memflags = blk_mq_freeze_queue(disk->queue);
blk_mq_quiesce_queue(disk->queue);
spin_lock_irq(&ioc->lock);
@@ -3347,7 +3348,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
wbt_enable_default(disk);
blk_mq_unquiesce_queue(disk->queue);
- blk_mq_unfreeze_queue(disk->queue);
+ blk_mq_unfreeze_queue(disk->queue, memflags);
blkg_conf_exit(&ctx);
return nbytes;
@@ -3355,7 +3356,7 @@ einval:
spin_unlock_irq(&ioc->lock);
blk_mq_unquiesce_queue(disk->queue);
- blk_mq_unfreeze_queue(disk->queue);
+ blk_mq_unfreeze_queue(disk->queue, memflags);
ret = -EINVAL;
err:
@@ -3414,6 +3415,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
{
struct blkg_conf_ctx ctx;
struct request_queue *q;
+ unsigned int memflags;
struct ioc *ioc;
u64 u[NR_I_LCOEFS];
bool user;
@@ -3441,7 +3443,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
ioc = q_to_ioc(q);
}
- blk_mq_freeze_queue(q);
+ memflags = blk_mq_freeze_queue(q);
blk_mq_quiesce_queue(q);
spin_lock_irq(&ioc->lock);
@@ -3493,7 +3495,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
spin_unlock_irq(&ioc->lock);
blk_mq_unquiesce_queue(q);
- blk_mq_unfreeze_queue(q);
+ blk_mq_unfreeze_queue(q, memflags);
blkg_conf_exit(&ctx);
return nbytes;
@@ -3502,7 +3504,7 @@ einval:
spin_unlock_irq(&ioc->lock);
blk_mq_unquiesce_queue(q);
- blk_mq_unfreeze_queue(q);
+ blk_mq_unfreeze_queue(q, memflags);
ret = -EINVAL;
err:
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index ebb522788d97..42c1e0b9a68f 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -749,9 +749,11 @@ static void blkiolatency_enable_work_fn(struct work_struct *work)
*/
enabled = atomic_read(&blkiolat->enable_cnt);
if (enabled != blkiolat->enabled) {
- blk_mq_freeze_queue(blkiolat->rqos.disk->queue);
+ unsigned int memflags;
+
+ memflags = blk_mq_freeze_queue(blkiolat->rqos.disk->queue);
blkiolat->enabled = enabled;
- blk_mq_unfreeze_queue(blkiolat->rqos.disk->queue);
+ blk_mq_unfreeze_queue(blkiolat->rqos.disk->queue, memflags);
}
}
diff --git a/block/blk-map.c b/block/blk-map.c
index 894009b2d881..d2f22744b3d1 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -189,7 +189,7 @@ static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data,
}
}
- if (bio_add_pc_page(rq->q, bio, page, bytes, offset) < bytes) {
+ if (bio_add_page(bio, page, bytes, offset) < bytes) {
if (!map_data)
__free_page(page);
break;
@@ -272,86 +272,27 @@ static struct bio *blk_rq_map_bio_alloc(struct request *rq,
static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
gfp_t gfp_mask)
{
- iov_iter_extraction_t extraction_flags = 0;
- unsigned int max_sectors = queue_max_hw_sectors(rq->q);
unsigned int nr_vecs = iov_iter_npages(iter, BIO_MAX_VECS);
struct bio *bio;
int ret;
- int j;
if (!iov_iter_count(iter))
return -EINVAL;
bio = blk_rq_map_bio_alloc(rq, nr_vecs, gfp_mask);
- if (bio == NULL)
+ if (!bio)
return -ENOMEM;
-
- if (blk_queue_pci_p2pdma(rq->q))
- extraction_flags |= ITER_ALLOW_P2PDMA;
- if (iov_iter_extract_will_pin(iter))
- bio_set_flag(bio, BIO_PAGE_PINNED);
-
- while (iov_iter_count(iter)) {
- struct page *stack_pages[UIO_FASTIOV];
- struct page **pages = stack_pages;
- ssize_t bytes;
- size_t offs;
- int npages;
-
- if (nr_vecs > ARRAY_SIZE(stack_pages))
- pages = NULL;
-
- bytes = iov_iter_extract_pages(iter, &pages, LONG_MAX,
- nr_vecs, extraction_flags, &offs);
- if (unlikely(bytes <= 0)) {
- ret = bytes ? bytes : -EFAULT;
- goto out_unmap;
- }
-
- npages = DIV_ROUND_UP(offs + bytes, PAGE_SIZE);
-
- if (unlikely(offs & queue_dma_alignment(rq->q)))
- j = 0;
- else {
- for (j = 0; j < npages; j++) {
- struct page *page = pages[j];
- unsigned int n = PAGE_SIZE - offs;
- bool same_page = false;
-
- if (n > bytes)
- n = bytes;
-
- if (!bio_add_hw_page(rq->q, bio, page, n, offs,
- max_sectors, &same_page))
- break;
-
- if (same_page)
- bio_release_page(bio, page);
- bytes -= n;
- offs = 0;
- }
- }
- /*
- * release the pages we didn't map into the bio, if any
- */
- while (j < npages)
- bio_release_page(bio, pages[j++]);
- if (pages != stack_pages)
- kvfree(pages);
- /* couldn't stuff something into bio? */
- if (bytes) {
- iov_iter_revert(iter, bytes);
- break;
- }
- }
-
+ ret = bio_iov_iter_get_pages(bio, iter);
+ if (ret)
+ goto out_put;
ret = blk_rq_append_bio(rq, bio);
if (ret)
- goto out_unmap;
+ goto out_release;
return 0;
- out_unmap:
+out_release:
bio_release_pages(bio, false);
+out_put:
blk_mq_map_bio_put(bio);
return ret;
}
@@ -422,8 +363,7 @@ static struct bio *bio_map_kern(struct request_queue *q, void *data,
page = virt_to_page(data);
else
page = vmalloc_to_page(data);
- if (bio_add_pc_page(q, bio, page, bytes,
- offset) < bytes) {
+ if (bio_add_page(bio, page, bytes, offset) < bytes) {
/* we don't support partial mappings */
bio_uninit(bio);
kfree(bio);
@@ -507,7 +447,7 @@ static struct bio *bio_copy_kern(struct request_queue *q, void *data,
if (!reading)
memcpy(page_address(page), p, bytes);
- if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes)
+ if (bio_add_page(bio, page, bytes, 0) < bytes)
break;
len -= bytes;
@@ -536,24 +476,33 @@ cleanup:
*/
int blk_rq_append_bio(struct request *rq, struct bio *bio)
{
- struct bvec_iter iter;
- struct bio_vec bv;
+ const struct queue_limits *lim = &rq->q->limits;
+ unsigned int max_bytes = lim->max_hw_sectors << SECTOR_SHIFT;
unsigned int nr_segs = 0;
+ int ret;
- bio_for_each_bvec(bv, bio, iter)
- nr_segs++;
+ /* check that the data layout matches the hardware restrictions */
+ ret = bio_split_rw_at(bio, lim, &nr_segs, max_bytes);
+ if (ret) {
+ /* if we would have to split the bio, copy instead */
+ if (ret > 0)
+ ret = -EREMOTEIO;
+ return ret;
+ }
- if (!rq->bio) {
- blk_rq_bio_prep(rq, bio, nr_segs);
- } else {
+ if (rq->bio) {
if (!ll_back_merge_fn(rq, bio, nr_segs))
return -EINVAL;
rq->biotail->bi_next = bio;
rq->biotail = bio;
- rq->__data_len += (bio)->bi_iter.bi_size;
+ rq->__data_len += bio->bi_iter.bi_size;
bio_crypt_free_ctx(bio);
+ return 0;
}
+ rq->nr_phys_segments = nr_segs;
+ rq->bio = rq->biotail = bio;
+ rq->__data_len = bio->bi_iter.bi_size;
return 0;
}
EXPORT_SYMBOL(blk_rq_append_bio);
@@ -561,9 +510,7 @@ EXPORT_SYMBOL(blk_rq_append_bio);
/* Prepare bio for passthrough IO given ITER_BVEC iter */
static int blk_rq_map_user_bvec(struct request *rq, const struct iov_iter *iter)
{
- const struct queue_limits *lim = &rq->q->limits;
- unsigned int max_bytes = lim->max_hw_sectors << SECTOR_SHIFT;
- unsigned int nsegs;
+ unsigned int max_bytes = rq->q->limits.max_hw_sectors << SECTOR_SHIFT;
struct bio *bio;
int ret;
@@ -576,18 +523,10 @@ static int blk_rq_map_user_bvec(struct request *rq, const struct iov_iter *iter)
return -ENOMEM;
bio_iov_bvec_set(bio, iter);
- /* check that the data layout matches the hardware restrictions */
- ret = bio_split_rw_at(bio, lim, &nsegs, max_bytes);
- if (ret) {
- /* if we would have to split the bio, copy instead */
- if (ret > 0)
- ret = -EREMOTEIO;
+ ret = blk_rq_append_bio(rq, bio);
+ if (ret)
blk_mq_map_bio_put(bio);
- return ret;
- }
-
- blk_rq_bio_prep(rq, bio, nsegs);
- return 0;
+ return ret;
}
/**
@@ -644,8 +583,11 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
ret = bio_copy_user_iov(rq, map_data, &i, gfp_mask);
else
ret = bio_map_user_iov(rq, &i, gfp_mask);
- if (ret)
+ if (ret) {
+ if (ret == -EREMOTEIO)
+ ret = -EINVAL;
goto unmap_rq;
+ }
if (!bio)
bio = rq->bio;
} while (iov_iter_count(&i));
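
Editorial note: with the rework above, blk_rq_append_bio() itself checks the bio against the hardware limits and returns -EREMOTEIO when a split would be required, so passthrough callers can fall back to a copy. A hedged sketch of such a caller (the bounce helper is hypothetical):

static int map_passthrough_bio(struct request *rq, struct bio *bio)
{
	int ret;

	ret = blk_rq_append_bio(rq, bio);
	if (ret == -EREMOTEIO) {
		/* layout would need splitting: bounce the data instead */
		ret = copy_into_bounce_buffer(rq, bio);	/* hypothetical */
	}
	return ret;
}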
diff --git a/block/blk-merge.c b/block/blk-merge.c
index e01383c6e534..1d1589c35297 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -270,7 +270,7 @@ static bool bvec_split_segs(const struct queue_limits *lim,
const struct bio_vec *bv, unsigned *nsegs, unsigned *bytes,
unsigned max_segs, unsigned max_bytes)
{
- unsigned max_len = min(max_bytes, UINT_MAX) - *bytes;
+ unsigned max_len = max_bytes - *bytes;
unsigned len = min(bv->bv_len, max_len);
unsigned total_len = 0;
unsigned seg_size = 0;
@@ -329,7 +329,7 @@ int bio_split_rw_at(struct bio *bio, const struct queue_limits *lim,
if (nsegs < lim->max_segments &&
bytes + bv.bv_len <= max_bytes &&
- bv.bv_offset + bv.bv_len <= PAGE_SIZE) {
+ bv.bv_offset + bv.bv_len <= lim->min_segment_size) {
nsegs++;
bytes += bv.bv_len;
} else {
@@ -473,137 +473,103 @@ unsigned int blk_recalc_rq_segments(struct request *rq)
return nr_phys_segs;
}
-static inline struct scatterlist *blk_next_sg(struct scatterlist **sg,
- struct scatterlist *sglist)
-{
- if (!*sg)
- return sglist;
+struct phys_vec {
+ phys_addr_t paddr;
+ u32 len;
+};
- /*
- * If the driver previously mapped a shorter list, we could see a
- * termination bit prematurely unless it fully inits the sg table
- * on each mapping. We KNOW that there must be more entries here
- * or the driver would be buggy, so force clear the termination bit
- * to avoid doing a full sg_init_table() in drivers for each command.
- */
- sg_unmark_end(*sg);
- return sg_next(*sg);
-}
-
-static unsigned blk_bvec_map_sg(struct request_queue *q,
- struct bio_vec *bvec, struct scatterlist *sglist,
- struct scatterlist **sg)
+static bool blk_map_iter_next(struct request *req,
+ struct req_iterator *iter, struct phys_vec *vec)
{
- unsigned nbytes = bvec->bv_len;
- unsigned nsegs = 0, total = 0;
-
- while (nbytes > 0) {
- unsigned offset = bvec->bv_offset + total;
- unsigned len = get_max_segment_size(&q->limits,
- bvec_phys(bvec) + total, nbytes);
- struct page *page = bvec->bv_page;
-
- /*
- * Unfortunately a fair number of drivers barf on scatterlists
- * that have an offset larger than PAGE_SIZE, despite other
- * subsystems dealing with that invariant just fine. For now
- * stick to the legacy format where we never present those from
- * the block layer, but the code below should be removed once
- * these offenders (mostly MMC/SD drivers) are fixed.
- */
- page += (offset >> PAGE_SHIFT);
- offset &= ~PAGE_MASK;
-
- *sg = blk_next_sg(sg, sglist);
- sg_set_page(*sg, page, len, offset);
+ unsigned int max_size;
+ struct bio_vec bv;
- total += len;
- nbytes -= len;
- nsegs++;
+ if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
+ if (!iter->bio)
+ return false;
+ vec->paddr = bvec_phys(&req->special_vec);
+ vec->len = req->special_vec.bv_len;
+ iter->bio = NULL;
+ return true;
}
- return nsegs;
-}
-
-static inline int __blk_bvec_map_sg(struct bio_vec bv,
- struct scatterlist *sglist, struct scatterlist **sg)
-{
- *sg = blk_next_sg(sg, sglist);
- sg_set_page(*sg, bv.bv_page, bv.bv_len, bv.bv_offset);
- return 1;
-}
-
-/* only try to merge bvecs into one sg if they are from two bios */
-static inline bool
-__blk_segment_map_sg_merge(struct request_queue *q, struct bio_vec *bvec,
- struct bio_vec *bvprv, struct scatterlist **sg)
-{
-
- int nbytes = bvec->bv_len;
-
- if (!*sg)
+ if (!iter->iter.bi_size)
return false;
- if ((*sg)->length + nbytes > queue_max_segment_size(q))
- return false;
+ bv = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter);
+ vec->paddr = bvec_phys(&bv);
+ max_size = get_max_segment_size(&req->q->limits, vec->paddr, UINT_MAX);
+ bv.bv_len = min(bv.bv_len, max_size);
+ bio_advance_iter_single(iter->bio, &iter->iter, bv.bv_len);
- if (!biovec_phys_mergeable(q, bvprv, bvec))
- return false;
+ /*
+ * If we are entirely done with this bi_io_vec entry, check if the next
+ * one could be merged into it. This typically happens when moving to
+ * the next bio, but some callers also don't pack bvecs tight.
+ */
+ while (!iter->iter.bi_size || !iter->iter.bi_bvec_done) {
+ struct bio_vec next;
+
+ if (!iter->iter.bi_size) {
+ if (!iter->bio->bi_next)
+ break;
+ iter->bio = iter->bio->bi_next;
+ iter->iter = iter->bio->bi_iter;
+ }
- (*sg)->length += nbytes;
+ next = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter);
+ if (bv.bv_len + next.bv_len > max_size ||
+ !biovec_phys_mergeable(req->q, &bv, &next))
+ break;
+ bv.bv_len += next.bv_len;
+ bio_advance_iter_single(iter->bio, &iter->iter, next.bv_len);
+ }
+
+ vec->len = bv.bv_len;
return true;
}
-static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio,
- struct scatterlist *sglist,
- struct scatterlist **sg)
+static inline struct scatterlist *blk_next_sg(struct scatterlist **sg,
+ struct scatterlist *sglist)
{
- struct bio_vec bvec, bvprv = { NULL };
- struct bvec_iter iter;
- int nsegs = 0;
- bool new_bio = false;
-
- for_each_bio(bio) {
- bio_for_each_bvec(bvec, bio, iter) {
- /*
- * Only try to merge bvecs from two bios given we
- * have done bio internal merge when adding pages
- * to bio
- */
- if (new_bio &&
- __blk_segment_map_sg_merge(q, &bvec, &bvprv, sg))
- goto next_bvec;
-
- if (bvec.bv_offset + bvec.bv_len <= PAGE_SIZE)
- nsegs += __blk_bvec_map_sg(bvec, sglist, sg);
- else
- nsegs += blk_bvec_map_sg(q, &bvec, sglist, sg);
- next_bvec:
- new_bio = false;
- }
- if (likely(bio->bi_iter.bi_size)) {
- bvprv = bvec;
- new_bio = true;
- }
- }
+ if (!*sg)
+ return sglist;
- return nsegs;
+ /*
+ * If the driver previously mapped a shorter list, we could see a
+ * termination bit prematurely unless it fully inits the sg table
+ * on each mapping. We KNOW that there must be more entries here
+ * or the driver would be buggy, so force clear the termination bit
+ * to avoid doing a full sg_init_table() in drivers for each command.
+ */
+ sg_unmark_end(*sg);
+ return sg_next(*sg);
}
/*
- * map a request to scatterlist, return number of sg entries setup. Caller
- * must make sure sg can hold rq->nr_phys_segments entries
+ * Map a request to scatterlist, return number of sg entries setup. Caller
+ * must make sure sg can hold rq->nr_phys_segments entries.
*/
int __blk_rq_map_sg(struct request_queue *q, struct request *rq,
struct scatterlist *sglist, struct scatterlist **last_sg)
{
+ struct req_iterator iter = {
+ .bio = rq->bio,
+ };
+ struct phys_vec vec;
int nsegs = 0;
- if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
- nsegs = __blk_bvec_map_sg(rq->special_vec, sglist, last_sg);
- else if (rq->bio)
- nsegs = __blk_bios_map_sg(q, rq->bio, sglist, last_sg);
+ /* the internal flush request may not have bio attached */
+ if (iter.bio)
+ iter.iter = iter.bio->bi_iter;
+
+ while (blk_map_iter_next(rq, &iter, &vec)) {
+ *last_sg = blk_next_sg(last_sg, sglist);
+ sg_set_page(*last_sg, phys_to_page(vec.paddr), vec.len,
+ offset_in_page(vec.paddr));
+ nsegs++;
+ }
if (*last_sg)
sg_mark_end(*last_sg);
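
Editorial note: the external interface is unchanged; drivers still build their scatterlist the same way, only the internal walk now goes through phys_vec. A representative, hypothetical driver-side call for reference:

static int mydrv_map_data(struct request *rq, struct scatterlist *sgl)
{
	struct scatterlist *sg;
	int nsegs, i;

	sg_init_table(sgl, blk_rq_nr_phys_segments(rq));
	nsegs = blk_rq_map_sg(rq->q, rq, sgl);
	for_each_sg(sgl, sg, nsegs, i) {
		/* program sg_dma_address(sg)/sg_dma_len(sg) after DMA mapping */
	}
	return nsegs;
}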
diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 9638b25fd521..444798c5374f 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -11,6 +11,7 @@
#include <linux/smp.h>
#include <linux/cpu.h>
#include <linux/group_cpus.h>
+#include <linux/device/bus.h>
#include "blk.h"
#include "blk-mq.h"
@@ -54,3 +55,38 @@ int blk_mq_hw_queue_to_node(struct blk_mq_queue_map *qmap, unsigned int index)
return NUMA_NO_NODE;
}
+
+/**
+ * blk_mq_map_hw_queues - Create CPU to hardware queue mapping
+ * @qmap: CPU to hardware queue map
+ * @dev: The device to map queues
+ * @offset: Queue offset to use for the device
+ *
+ * Create a CPU to hardware queue mapping in @qmap. The struct bus_type
+ * irq_get_affinity callback will be used to retrieve the affinity.
+ */
+void blk_mq_map_hw_queues(struct blk_mq_queue_map *qmap,
+ struct device *dev, unsigned int offset)
+
+{
+ const struct cpumask *mask;
+ unsigned int queue, cpu;
+
+ if (!dev->bus->irq_get_affinity)
+ goto fallback;
+
+ for (queue = 0; queue < qmap->nr_queues; queue++) {
+ mask = dev->bus->irq_get_affinity(dev, queue + offset);
+ if (!mask)
+ goto fallback;
+
+ for_each_cpu(cpu, mask)
+ qmap->mq_map[cpu] = qmap->queue_offset + queue;
+ }
+
+ return;
+
+fallback:
+ blk_mq_map_queues(qmap);
+}
+EXPORT_SYMBOL_GPL(blk_mq_map_hw_queues);
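
Editorial note: blk_mq_map_hw_queues() replaces the PCI- and virtio-specific helpers removed later in this diff. A hypothetical driver ->map_queues() callback passing its underlying struct device (driver names are illustrative):

static void mydrv_map_queues(struct blk_mq_tag_set *set)
{
	struct mydrv *drv = set->driver_data;	/* hypothetical driver data */

	/* drv->dev is the PCI or virtio device; its bus supplies irq_get_affinity */
	blk_mq_map_hw_queues(&set->map[HCTX_TYPE_DEFAULT], drv->dev, 0);
}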
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 5463697a8442..adf5f0697b6b 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -172,21 +172,13 @@ static int hctx_state_show(void *data, struct seq_file *m)
return 0;
}
-#define BLK_TAG_ALLOC_NAME(name) [BLK_TAG_ALLOC_##name] = #name
-static const char *const alloc_policy_name[] = {
- BLK_TAG_ALLOC_NAME(FIFO),
- BLK_TAG_ALLOC_NAME(RR),
-};
-#undef BLK_TAG_ALLOC_NAME
-
#define HCTX_FLAG_NAME(name) [ilog2(BLK_MQ_F_##name)] = #name
static const char *const hctx_flag_name[] = {
- HCTX_FLAG_NAME(SHOULD_MERGE),
HCTX_FLAG_NAME(TAG_QUEUE_SHARED),
HCTX_FLAG_NAME(STACKING),
HCTX_FLAG_NAME(TAG_HCTX_SHARED),
HCTX_FLAG_NAME(BLOCKING),
- HCTX_FLAG_NAME(NO_SCHED),
+ HCTX_FLAG_NAME(TAG_RR),
HCTX_FLAG_NAME(NO_SCHED_BY_DEFAULT),
};
#undef HCTX_FLAG_NAME
@@ -194,22 +186,11 @@ static const char *const hctx_flag_name[] = {
static int hctx_flags_show(void *data, struct seq_file *m)
{
struct blk_mq_hw_ctx *hctx = data;
- const int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(hctx->flags);
- BUILD_BUG_ON(ARRAY_SIZE(hctx_flag_name) !=
- BLK_MQ_F_ALLOC_POLICY_START_BIT);
- BUILD_BUG_ON(ARRAY_SIZE(alloc_policy_name) != BLK_TAG_ALLOC_MAX);
+ BUILD_BUG_ON(ARRAY_SIZE(hctx_flag_name) != ilog2(BLK_MQ_F_MAX));
- seq_puts(m, "alloc_policy=");
- if (alloc_policy < ARRAY_SIZE(alloc_policy_name) &&
- alloc_policy_name[alloc_policy])
- seq_puts(m, alloc_policy_name[alloc_policy]);
- else
- seq_printf(m, "%d", alloc_policy);
- seq_puts(m, " ");
- blk_flags_show(m,
- hctx->flags ^ BLK_ALLOC_POLICY_TO_MQ_FLAG(alloc_policy),
- hctx_flag_name, ARRAY_SIZE(hctx_flag_name));
+ blk_flags_show(m, hctx->flags, hctx_flag_name,
+ ARRAY_SIZE(hctx_flag_name));
seq_puts(m, "\n");
return 0;
}
diff --git a/block/blk-mq-pci.c b/block/blk-mq-pci.c
deleted file mode 100644
index d47b5c73c9eb..000000000000
--- a/block/blk-mq-pci.c
+++ /dev/null
@@ -1,46 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (c) 2016 Christoph Hellwig.
- */
-#include <linux/kobject.h>
-#include <linux/blkdev.h>
-#include <linux/blk-mq-pci.h>
-#include <linux/pci.h>
-#include <linux/module.h>
-
-#include "blk-mq.h"
-
-/**
- * blk_mq_pci_map_queues - provide a default queue mapping for PCI device
- * @qmap: CPU to hardware queue map.
- * @pdev: PCI device associated with @set.
- * @offset: Offset to use for the pci irq vector
- *
- * This function assumes the PCI device @pdev has at least as many available
- * interrupt vectors as @set has queues. It will then query the vector
- * corresponding to each queue for it's affinity mask and built queue mapping
- * that maps a queue to the CPUs that have irq affinity for the corresponding
- * vector.
- */
-void blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev,
- int offset)
-{
- const struct cpumask *mask;
- unsigned int queue, cpu;
-
- for (queue = 0; queue < qmap->nr_queues; queue++) {
- mask = pci_irq_get_affinity(pdev, queue + offset);
- if (!mask)
- goto fallback;
-
- for_each_cpu(cpu, mask)
- qmap->mq_map[cpu] = qmap->queue_offset + queue;
- }
-
- return;
-
-fallback:
- WARN_ON_ONCE(qmap->nr_queues > 1);
- blk_mq_clear_mq_map(qmap);
-}
-EXPORT_SYMBOL_GPL(blk_mq_pci_map_queues);
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 451a2c1f1f32..7442ca27c2bf 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -351,8 +351,7 @@ bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
ctx = blk_mq_get_ctx(q);
hctx = blk_mq_map_queue(q, bio->bi_opf, ctx);
type = hctx->type;
- if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE) ||
- list_empty_careful(&ctx->rq_lists[type]))
+ if (list_empty_careful(&ctx->rq_lists[type]))
goto out_put;
/* default per sw-queue merge */
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 156e9bb07abf..3feeeccf8a99 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -223,30 +223,27 @@ int blk_mq_sysfs_register(struct gendisk *disk)
unsigned long i, j;
int ret;
- lockdep_assert_held(&q->sysfs_dir_lock);
-
ret = kobject_add(q->mq_kobj, &disk_to_dev(disk)->kobj, "mq");
if (ret < 0)
- goto out;
+ return ret;
kobject_uevent(q->mq_kobj, KOBJ_ADD);
+ mutex_lock(&q->tag_set->tag_list_lock);
queue_for_each_hw_ctx(q, hctx, i) {
ret = blk_mq_register_hctx(hctx);
if (ret)
- goto unreg;
+ goto out_unreg;
}
+ mutex_unlock(&q->tag_set->tag_list_lock);
+ return 0;
- q->mq_sysfs_init_done = true;
-
-out:
- return ret;
-
-unreg:
+out_unreg:
queue_for_each_hw_ctx(q, hctx, j) {
if (j < i)
blk_mq_unregister_hctx(hctx);
}
+ mutex_unlock(&q->tag_set->tag_list_lock);
kobject_uevent(q->mq_kobj, KOBJ_REMOVE);
kobject_del(q->mq_kobj);
@@ -259,15 +256,13 @@ void blk_mq_sysfs_unregister(struct gendisk *disk)
struct blk_mq_hw_ctx *hctx;
unsigned long i;
- lockdep_assert_held(&q->sysfs_dir_lock);
-
+ mutex_lock(&q->tag_set->tag_list_lock);
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_unregister_hctx(hctx);
+ mutex_unlock(&q->tag_set->tag_list_lock);
kobject_uevent(q->mq_kobj, KOBJ_REMOVE);
kobject_del(q->mq_kobj);
-
- q->mq_sysfs_init_done = false;
}
void blk_mq_sysfs_unregister_hctxs(struct request_queue *q)
@@ -275,15 +270,11 @@ void blk_mq_sysfs_unregister_hctxs(struct request_queue *q)
struct blk_mq_hw_ctx *hctx;
unsigned long i;
- mutex_lock(&q->sysfs_dir_lock);
- if (!q->mq_sysfs_init_done)
- goto unlock;
+ if (!blk_queue_registered(q))
+ return;
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_unregister_hctx(hctx);
-
-unlock:
- mutex_unlock(&q->sysfs_dir_lock);
}
int blk_mq_sysfs_register_hctxs(struct request_queue *q)
@@ -292,9 +283,8 @@ int blk_mq_sysfs_register_hctxs(struct request_queue *q)
unsigned long i;
int ret = 0;
- mutex_lock(&q->sysfs_dir_lock);
- if (!q->mq_sysfs_init_done)
- goto unlock;
+ if (!blk_queue_registered(q))
+ goto out;
queue_for_each_hw_ctx(q, hctx, i) {
ret = blk_mq_register_hctx(hctx);
@@ -302,8 +292,6 @@ int blk_mq_sysfs_register_hctxs(struct request_queue *q)
break;
}
-unlock:
- mutex_unlock(&q->sysfs_dir_lock);
-
+out:
return ret;
}
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 2cafcf11ee8b..b9f417d980b4 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -544,30 +544,11 @@ static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth,
node);
}
-int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags,
- struct sbitmap_queue *breserved_tags,
- unsigned int queue_depth, unsigned int reserved,
- int node, int alloc_policy)
-{
- unsigned int depth = queue_depth - reserved;
- bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR;
-
- if (bt_alloc(bitmap_tags, depth, round_robin, node))
- return -ENOMEM;
- if (bt_alloc(breserved_tags, reserved, round_robin, node))
- goto free_bitmap_tags;
-
- return 0;
-
-free_bitmap_tags:
- sbitmap_queue_free(bitmap_tags);
- return -ENOMEM;
-}
-
struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
- unsigned int reserved_tags,
- int node, int alloc_policy)
+ unsigned int reserved_tags, unsigned int flags, int node)
{
+ unsigned int depth = total_tags - reserved_tags;
+ bool round_robin = flags & BLK_MQ_F_TAG_RR;
struct blk_mq_tags *tags;
if (total_tags > BLK_MQ_TAG_MAX) {
@@ -582,14 +563,18 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
tags->nr_tags = total_tags;
tags->nr_reserved_tags = reserved_tags;
spin_lock_init(&tags->lock);
+ if (bt_alloc(&tags->bitmap_tags, depth, round_robin, node))
+ goto out_free_tags;
+ if (bt_alloc(&tags->breserved_tags, reserved_tags, round_robin, node))
+ goto out_free_bitmap_tags;
- if (blk_mq_init_bitmaps(&tags->bitmap_tags, &tags->breserved_tags,
- total_tags, reserved_tags, node,
- alloc_policy) < 0) {
- kfree(tags);
- return NULL;
- }
return tags;
+
+out_free_bitmap_tags:
+ sbitmap_queue_free(&tags->bitmap_tags);
+out_free_tags:
+ kfree(tags);
+ return NULL;
}
void blk_mq_free_tags(struct blk_mq_tags *tags)
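
Editorial note: tag allocation policy is now a plain hctx flag rather than an encoded alloc_policy. A hedged sketch of how a driver wanting round-robin tags would set up its tag set after this change (driver names hypothetical):

static int mydrv_init_tag_set(struct mydrv *drv)
{
	struct blk_mq_tag_set *set = &drv->tag_set;

	set->ops = &mydrv_mq_ops;
	set->nr_hw_queues = 1;
	set->queue_depth = 64;
	set->numa_node = NUMA_NO_NODE;
	/* was BLK_ALLOC_POLICY_TO_MQ_FLAG(BLK_TAG_ALLOC_RR) */
	set->flags = BLK_MQ_F_TAG_RR;
	return blk_mq_alloc_tag_set(set);
}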
diff --git a/block/blk-mq-virtio.c b/block/blk-mq-virtio.c
deleted file mode 100644
index 68d0945c0b08..000000000000
--- a/block/blk-mq-virtio.c
+++ /dev/null
@@ -1,46 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (c) 2016 Christoph Hellwig.
- */
-#include <linux/device.h>
-#include <linux/blk-mq-virtio.h>
-#include <linux/virtio_config.h>
-#include <linux/module.h>
-#include "blk-mq.h"
-
-/**
- * blk_mq_virtio_map_queues - provide a default queue mapping for virtio device
- * @qmap: CPU to hardware queue map.
- * @vdev: virtio device to provide a mapping for.
- * @first_vec: first interrupt vectors to use for queues (usually 0)
- *
- * This function assumes the virtio device @vdev has at least as many available
- * interrupt vectors as @set has queues. It will then query the vector
- * corresponding to each queue for it's affinity mask and built queue mapping
- * that maps a queue to the CPUs that have irq affinity for the corresponding
- * vector.
- */
-void blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap,
- struct virtio_device *vdev, int first_vec)
-{
- const struct cpumask *mask;
- unsigned int queue, cpu;
-
- if (!vdev->config->get_vq_affinity)
- goto fallback;
-
- for (queue = 0; queue < qmap->nr_queues; queue++) {
- mask = vdev->config->get_vq_affinity(vdev, first_vec + queue);
- if (!mask)
- goto fallback;
-
- for_each_cpu(cpu, mask)
- qmap->mq_map[cpu] = qmap->queue_offset + queue;
- }
-
- return;
-
-fallback:
- blk_mq_map_queues(qmap);
-}
-EXPORT_SYMBOL_GPL(blk_mq_virtio_map_queues);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 8ac19d4ae3c0..40490ac88045 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -131,6 +131,10 @@ static bool blk_freeze_set_owner(struct request_queue *q,
if (!q->mq_freeze_depth) {
q->mq_freeze_owner = owner;
q->mq_freeze_owner_depth = 1;
+ q->mq_freeze_disk_dead = !q->disk ||
+ test_bit(GD_DEAD, &q->disk->state) ||
+ !blk_queue_registered(q);
+ q->mq_freeze_queue_dying = blk_queue_dying(q);
return true;
}
@@ -142,8 +146,6 @@ static bool blk_freeze_set_owner(struct request_queue *q,
/* verify the last unfreeze in owner context */
static bool blk_unfreeze_check_owner(struct request_queue *q)
{
- if (!q->mq_freeze_owner)
- return false;
if (q->mq_freeze_owner != current)
return false;
if (--q->mq_freeze_owner_depth == 0) {
@@ -189,7 +191,7 @@ bool __blk_freeze_queue_start(struct request_queue *q,
void blk_freeze_queue_start(struct request_queue *q)
{
if (__blk_freeze_queue_start(q, current))
- blk_freeze_acquire_lock(q, false, false);
+ blk_freeze_acquire_lock(q);
}
EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
@@ -208,12 +210,12 @@ int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout);
-void blk_mq_freeze_queue(struct request_queue *q)
+void blk_mq_freeze_queue_nomemsave(struct request_queue *q)
{
blk_freeze_queue_start(q);
blk_mq_freeze_queue_wait(q);
}
-EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
+EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_nomemsave);
bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic)
{
@@ -234,12 +236,12 @@ bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic)
return unfreeze;
}
-void blk_mq_unfreeze_queue(struct request_queue *q)
+void blk_mq_unfreeze_queue_nomemrestore(struct request_queue *q)
{
if (__blk_mq_unfreeze_queue(q, false))
- blk_unfreeze_release_lock(q, false, false);
+ blk_unfreeze_release_lock(q);
}
-EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
+EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue_nomemrestore);
/*
* non_owner variant of blk_freeze_queue_start
@@ -2656,8 +2658,10 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
if (bio->bi_opf & REQ_RAHEAD)
rq->cmd_flags |= REQ_FAILFAST_MASK;
+ rq->bio = rq->biotail = bio;
rq->__sector = bio->bi_iter.bi_sector;
- blk_rq_bio_prep(rq, bio, nr_segs);
+ rq->__data_len = bio->bi_iter.bi_size;
+ rq->nr_phys_segments = nr_segs;
if (bio_integrity(bio))
rq->nr_integrity_segments = blk_rq_count_integrity_sg(rq->q,
bio);
@@ -2980,12 +2984,9 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q,
}
rq = __blk_mq_alloc_requests(&data);
- if (rq)
- return rq;
- rq_qos_cleanup(q, bio);
- if (bio->bi_opf & REQ_NOWAIT)
- bio_wouldblock_error(bio);
- return NULL;
+ if (unlikely(!rq))
+ rq_qos_cleanup(q, bio);
+ return rq;
}
/*
@@ -3092,14 +3093,21 @@ void blk_mq_submit_bio(struct bio *bio)
}
/*
- * Device reconfiguration may change logical block size, so alignment
- * check has to be done with queue usage counter held
+ * Device reconfiguration may change logical block size or reduce the
+ * number of poll queues, so the checks for alignment and poll support
+ * have to be done with queue usage counter held.
*/
if (unlikely(bio_unaligned(bio, q))) {
bio_io_error(bio);
goto queue_exit;
}
+ if ((bio->bi_opf & REQ_POLLED) && !blk_mq_can_poll(q)) {
+ bio->bi_status = BLK_STS_NOTSUPP;
+ bio_endio(bio);
+ goto queue_exit;
+ }
+
bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
if (!bio)
goto queue_exit;
@@ -3114,12 +3122,15 @@ void blk_mq_submit_bio(struct bio *bio)
goto queue_exit;
new_request:
- if (!rq) {
+ if (rq) {
+ blk_mq_use_cached_rq(rq, plug, bio);
+ } else {
rq = blk_mq_get_new_requests(q, plug, bio, nr_segs);
- if (unlikely(!rq))
+ if (unlikely(!rq)) {
+ if (bio->bi_opf & REQ_NOWAIT)
+ bio_wouldblock_error(bio);
goto queue_exit;
- } else {
- blk_mq_use_cached_rq(rq, plug, bio);
+ }
}
trace_block_getrq(bio);
@@ -3472,8 +3483,7 @@ static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
if (node == NUMA_NO_NODE)
node = set->numa_node;
- tags = blk_mq_init_tags(nr_tags, reserved_tags, node,
- BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
+ tags = blk_mq_init_tags(nr_tags, reserved_tags, set->flags, node);
if (!tags)
return NULL;
@@ -4213,13 +4223,14 @@ static void blk_mq_update_tag_set_shared(struct blk_mq_tag_set *set,
bool shared)
{
struct request_queue *q;
+ unsigned int memflags;
lockdep_assert_held(&set->tag_list_lock);
list_for_each_entry(q, &set->tag_list, tag_set_list) {
- blk_mq_freeze_queue(q);
+ memflags = blk_mq_freeze_queue(q);
queue_set_hctx_shared(q, shared);
- blk_mq_unfreeze_queue(q);
+ blk_mq_unfreeze_queue(q, memflags);
}
}
@@ -4317,12 +4328,6 @@ void blk_mq_release(struct request_queue *q)
blk_mq_sysfs_deinit(q);
}
-static bool blk_mq_can_poll(struct blk_mq_tag_set *set)
-{
- return set->nr_maps > HCTX_TYPE_POLL &&
- set->map[HCTX_TYPE_POLL].nr_queues;
-}
-
struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set,
struct queue_limits *lim, void *queuedata)
{
@@ -4333,7 +4338,7 @@ struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set,
if (!lim)
lim = &default_lim;
lim->features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT;
- if (blk_mq_can_poll(set))
+ if (set->nr_maps > HCTX_TYPE_POLL)
lim->features |= BLK_FEAT_POLL;
q = blk_alloc_queue(lim, set->numa_node);
@@ -4988,6 +4993,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
struct request_queue *q;
LIST_HEAD(head);
int prev_nr_hw_queues = set->nr_hw_queues;
+ unsigned int memflags;
int i;
lockdep_assert_held(&set->tag_list_lock);
@@ -4999,8 +5005,10 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues)
return;
+ memflags = memalloc_noio_save();
list_for_each_entry(q, &set->tag_list, tag_set_list)
- blk_mq_freeze_queue(q);
+ blk_mq_freeze_queue_nomemsave(q);
+
/*
* Switch IO scheduler to 'none', cleaning up the data associated
* with the previous scheduler. We will switch back once we are done
@@ -5021,8 +5029,6 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
fallback:
blk_mq_update_queue_map(set);
list_for_each_entry(q, &set->tag_list, tag_set_list) {
- struct queue_limits lim;
-
blk_mq_realloc_hw_ctxs(set, q);
if (q->nr_hw_queues != set->nr_hw_queues) {
@@ -5036,13 +5042,6 @@ fallback:
set->nr_hw_queues = prev_nr_hw_queues;
goto fallback;
}
- lim = queue_limits_start_update(q);
- if (blk_mq_can_poll(set))
- lim.features |= BLK_FEAT_POLL;
- else
- lim.features &= ~BLK_FEAT_POLL;
- if (queue_limits_commit_update(q, &lim) < 0)
- pr_warn("updating the poll flag failed\n");
blk_mq_map_swqueue(q);
}
@@ -5057,7 +5056,8 @@ switch_back:
blk_mq_elv_switch_back(&head, q);
list_for_each_entry(q, &set->tag_list, tag_set_list)
- blk_mq_unfreeze_queue(q);
+ blk_mq_unfreeze_queue_nomemrestore(q);
+ memalloc_noio_restore(memflags);
/* Free the excess tags when nr_hw_queues shrink. */
for (i = set->nr_hw_queues; i < prev_nr_hw_queues; i++)
@@ -5102,9 +5102,9 @@ static int blk_hctx_poll(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
int blk_mq_poll(struct request_queue *q, blk_qc_t cookie,
struct io_comp_batch *iob, unsigned int flags)
{
- struct blk_mq_hw_ctx *hctx = xa_load(&q->hctx_table, cookie);
-
- return blk_hctx_poll(q, hctx, iob, flags);
+ if (!blk_mq_can_poll(q))
+ return 0;
+ return blk_hctx_poll(q, xa_load(&q->hctx_table, cookie), iob, flags);
}
int blk_rq_poll(struct request *rq, struct io_comp_batch *iob,
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 89a20fffa4b1..44979e92b79f 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -163,11 +163,8 @@ struct blk_mq_alloc_data {
};
struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags,
- unsigned int reserved_tags, int node, int alloc_policy);
+ unsigned int reserved_tags, unsigned int flags, int node);
void blk_mq_free_tags(struct blk_mq_tags *tags);
-int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags,
- struct sbitmap_queue *breserved_tags, unsigned int queue_depth,
- unsigned int reserved, int node, int alloc_policy);
unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data);
unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags,
@@ -451,4 +448,10 @@ do { \
#define blk_mq_run_dispatch_ops(q, dispatch_ops) \
__blk_mq_run_dispatch_ops(q, true, dispatch_ops) \
+static inline bool blk_mq_can_poll(struct request_queue *q)
+{
+ return (q->limits.features & BLK_FEAT_POLL) &&
+ q->tag_set->map[HCTX_TYPE_POLL].nr_queues;
+}
+
#endif
diff --git a/block/blk-pm.c b/block/blk-pm.c
index 42e842074715..8d3e052f91da 100644
--- a/block/blk-pm.c
+++ b/block/blk-pm.c
@@ -89,7 +89,7 @@ int blk_pre_runtime_suspend(struct request_queue *q)
if (percpu_ref_is_zero(&q->q_usage_counter))
ret = 0;
/* Switch q_usage_counter back to per-cpu mode. */
- blk_mq_unfreeze_queue(q);
+ blk_mq_unfreeze_queue_nomemrestore(q);
if (ret < 0) {
spin_lock_irq(&q->queue_lock);
diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
index eb9618cd68ad..d4d4f4dc0e23 100644
--- a/block/blk-rq-qos.c
+++ b/block/blk-rq-qos.c
@@ -299,6 +299,7 @@ int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id,
const struct rq_qos_ops *ops)
{
struct request_queue *q = disk->queue;
+ unsigned int memflags;
lockdep_assert_held(&q->rq_qos_mutex);
@@ -310,14 +311,14 @@ int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id,
* No IO can be in-flight when adding rqos, so freeze queue, which
* is fine since we only support rq_qos for blk-mq queue.
*/
- blk_mq_freeze_queue(q);
+ memflags = blk_mq_freeze_queue(q);
if (rq_qos_id(q, rqos->id))
goto ebusy;
rqos->next = q->rq_qos;
q->rq_qos = rqos;
- blk_mq_unfreeze_queue(q);
+ blk_mq_unfreeze_queue(q, memflags);
if (rqos->ops->debugfs_attrs) {
mutex_lock(&q->debugfs_mutex);
@@ -327,7 +328,7 @@ int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id,
return 0;
ebusy:
- blk_mq_unfreeze_queue(q);
+ blk_mq_unfreeze_queue(q, memflags);
return -EBUSY;
}
@@ -335,17 +336,18 @@ void rq_qos_del(struct rq_qos *rqos)
{
struct request_queue *q = rqos->disk->queue;
struct rq_qos **cur;
+ unsigned int memflags;
lockdep_assert_held(&q->rq_qos_mutex);
- blk_mq_freeze_queue(q);
+ memflags = blk_mq_freeze_queue(q);
for (cur = &q->rq_qos; *cur; cur = &(*cur)->next) {
if (*cur == rqos) {
*cur = rqos->next;
break;
}
}
- blk_mq_unfreeze_queue(q);
+ blk_mq_unfreeze_queue(q, memflags);
mutex_lock(&q->debugfs_mutex);
blk_mq_debugfs_unregister_rqos(rqos);
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 8f09e33f41f6..b9c6f0ec1c49 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -175,6 +175,9 @@ static void blk_validate_atomic_write_limits(struct queue_limits *lim)
{
unsigned int boundary_sectors;
+ if (!(lim->features & BLK_FEAT_ATOMIC_WRITES))
+ goto unsupported;
+
if (!lim->atomic_write_hw_max)
goto unsupported;
@@ -243,6 +246,7 @@ int blk_validate_limits(struct queue_limits *lim)
{
unsigned int max_hw_sectors;
unsigned int logical_block_sectors;
+ unsigned long seg_size;
int err;
/*
@@ -300,7 +304,7 @@ int blk_validate_limits(struct queue_limits *lim)
max_hw_sectors = min_not_zero(lim->max_hw_sectors,
lim->max_dev_sectors);
if (lim->max_user_sectors) {
- if (lim->max_user_sectors < PAGE_SIZE / SECTOR_SIZE)
+ if (lim->max_user_sectors < BLK_MIN_SEGMENT_SIZE / SECTOR_SIZE)
return -EINVAL;
lim->max_sectors = min(max_hw_sectors, lim->max_user_sectors);
} else if (lim->io_opt > (BLK_DEF_MAX_SECTORS_CAP << SECTOR_SHIFT)) {
@@ -338,7 +342,7 @@ int blk_validate_limits(struct queue_limits *lim)
*/
if (!lim->seg_boundary_mask)
lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
- if (WARN_ON_ONCE(lim->seg_boundary_mask < PAGE_SIZE - 1))
+ if (WARN_ON_ONCE(lim->seg_boundary_mask < BLK_MIN_SEGMENT_SIZE - 1))
return -EINVAL;
/*
@@ -359,10 +363,17 @@ int blk_validate_limits(struct queue_limits *lim)
*/
if (!lim->max_segment_size)
lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
- if (WARN_ON_ONCE(lim->max_segment_size < PAGE_SIZE))
+ if (WARN_ON_ONCE(lim->max_segment_size < BLK_MIN_SEGMENT_SIZE))
return -EINVAL;
}
+ /* set up the min segment size used when building new segments in the fast path */
+ if (lim->seg_boundary_mask > lim->max_segment_size - 1)
+ seg_size = lim->max_segment_size;
+ else
+ seg_size = lim->seg_boundary_mask + 1;
+ lim->min_segment_size = min_t(unsigned int, seg_size, PAGE_SIZE);
+
/*
* We require drivers to at least do logical block aligned I/O, but
* historically could not check for that due to the separate calls
@@ -413,7 +424,8 @@ int blk_set_default_limits(struct queue_limits *lim)
* @lim: limits to apply
*
* Apply the limits in @lim that were obtained from queue_limits_start_update()
- * and updated by the caller to @q.
+ * and updated by the caller to @q. The caller must have frozen the queue or
+ * ensured that there are no outstanding I/Os by other means.
*
* Returns 0 if successful, else a negative error code.
*/
@@ -444,6 +456,31 @@ out_unlock:
EXPORT_SYMBOL_GPL(queue_limits_commit_update);
/**
+ * queue_limits_commit_update_frozen - commit an atomic update of queue limits
+ * @q: queue to update
+ * @lim: limits to apply
+ *
+ * Apply the limits in @lim that were obtained from queue_limits_start_update()
+ * and updated with the new values by the caller to @q. Freezes the queue
+ * before the update and unfreezes it after.
+ *
+ * Returns 0 if successful, else a negative error code.
+ */
+int queue_limits_commit_update_frozen(struct request_queue *q,
+ struct queue_limits *lim)
+{
+ unsigned int memflags;
+ int ret;
+
+ memflags = blk_mq_freeze_queue(q);
+ ret = queue_limits_commit_update(q, lim);
+ blk_mq_unfreeze_queue(q, memflags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(queue_limits_commit_update_frozen);
+
+/**
* queue_limits_set - apply queue limits to queue
* @q: queue to update
* @lim: limits to apply
@@ -584,12 +621,15 @@ static bool blk_stack_atomic_writes_head(struct queue_limits *t,
}
static void blk_stack_atomic_writes_limits(struct queue_limits *t,
- struct queue_limits *b)
+ struct queue_limits *b, sector_t start)
{
- if (!(t->features & BLK_FEAT_ATOMIC_WRITES_STACKED))
+ if (!(b->features & BLK_FEAT_ATOMIC_WRITES))
+ goto unsupported;
+
+ if (!b->atomic_write_hw_unit_min)
goto unsupported;
- if (!b->atomic_write_unit_min)
+ if (!blk_atomic_write_start_sect_aligned(start, b))
goto unsupported;
/*
@@ -611,7 +651,6 @@ unsupported:
t->atomic_write_hw_unit_max = 0;
t->atomic_write_hw_unit_min = 0;
t->atomic_write_hw_boundary = 0;
- t->features &= ~BLK_FEAT_ATOMIC_WRITES_STACKED;
}
/**
@@ -774,7 +813,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
t->zone_write_granularity = 0;
t->max_zone_append_sectors = 0;
}
- blk_stack_atomic_writes_limits(t, b);
+ blk_stack_atomic_writes_limits(t, b, start);
return ret;
}
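The new queue_limits_commit_update_frozen() helper folds the freeze/unfreeze pair into the limits update, which is the pattern the blk-sysfs.c and blk-zoned.c hunks below switch to. A hedged driver-side sketch (hypothetical function and limit value, not part of this diff):

	#include <linux/blkdev.h>

	static int example_set_discard_limit(struct gendisk *disk,
			unsigned int sectors)
	{
		struct queue_limits lim;

		/* Take the limits lock and snapshot the current limits. */
		lim = queue_limits_start_update(disk->queue);
		lim.max_hw_discard_sectors = sectors;	/* made-up new value */

		/* Freezes the queue, applies @lim, then unfreezes it. */
		return queue_limits_commit_update_frozen(disk->queue, &lim);
	}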
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 767598e719ab..6f548a4376aa 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -24,6 +24,8 @@ struct queue_sysfs_entry {
struct attribute attr;
ssize_t (*show)(struct gendisk *disk, char *page);
ssize_t (*store)(struct gendisk *disk, const char *page, size_t count);
+ int (*store_limit)(struct gendisk *disk, const char *page,
+ size_t count, struct queue_limits *lim);
void (*load_module)(struct gendisk *disk, const char *page, size_t count);
};
@@ -153,13 +155,11 @@ QUEUE_SYSFS_SHOW_CONST(discard_zeroes_data, 0)
QUEUE_SYSFS_SHOW_CONST(write_same_max, 0)
QUEUE_SYSFS_SHOW_CONST(poll_delay, -1)
-static ssize_t queue_max_discard_sectors_store(struct gendisk *disk,
- const char *page, size_t count)
+static int queue_max_discard_sectors_store(struct gendisk *disk,
+ const char *page, size_t count, struct queue_limits *lim)
{
unsigned long max_discard_bytes;
- struct queue_limits lim;
ssize_t ret;
- int err;
ret = queue_var_store(&max_discard_bytes, page, count);
if (ret < 0)
@@ -171,38 +171,28 @@ static ssize_t queue_max_discard_sectors_store(struct gendisk *disk,
if ((max_discard_bytes >> SECTOR_SHIFT) > UINT_MAX)
return -EINVAL;
- lim = queue_limits_start_update(disk->queue);
- lim.max_user_discard_sectors = max_discard_bytes >> SECTOR_SHIFT;
- err = queue_limits_commit_update(disk->queue, &lim);
- if (err)
- return err;
- return ret;
+ lim->max_user_discard_sectors = max_discard_bytes >> SECTOR_SHIFT;
+ return 0;
}
-static ssize_t
-queue_max_sectors_store(struct gendisk *disk, const char *page, size_t count)
+static int
+queue_max_sectors_store(struct gendisk *disk, const char *page, size_t count,
+ struct queue_limits *lim)
{
unsigned long max_sectors_kb;
- struct queue_limits lim;
ssize_t ret;
- int err;
ret = queue_var_store(&max_sectors_kb, page, count);
if (ret < 0)
return ret;
- lim = queue_limits_start_update(disk->queue);
- lim.max_user_sectors = max_sectors_kb << 1;
- err = queue_limits_commit_update(disk->queue, &lim);
- if (err)
- return err;
- return ret;
+ lim->max_user_sectors = max_sectors_kb << 1;
+ return 0;
}
static ssize_t queue_feature_store(struct gendisk *disk, const char *page,
- size_t count, blk_features_t feature)
+ size_t count, struct queue_limits *lim, blk_features_t feature)
{
- struct queue_limits lim;
unsigned long val;
ssize_t ret;
@@ -210,15 +200,11 @@ static ssize_t queue_feature_store(struct gendisk *disk, const char *page,
if (ret < 0)
return ret;
- lim = queue_limits_start_update(disk->queue);
if (val)
- lim.features |= feature;
+ lim->features |= feature;
else
- lim.features &= ~feature;
- ret = queue_limits_commit_update(disk->queue, &lim);
- if (ret)
- return ret;
- return count;
+ lim->features &= ~feature;
+ return 0;
}
#define QUEUE_SYSFS_FEATURE(_name, _feature) \
@@ -227,10 +213,10 @@ static ssize_t queue_##_name##_show(struct gendisk *disk, char *page) \
return sysfs_emit(page, "%u\n", \
!!(disk->queue->limits.features & _feature)); \
} \
-static ssize_t queue_##_name##_store(struct gendisk *disk, \
- const char *page, size_t count) \
+static int queue_##_name##_store(struct gendisk *disk, \
+ const char *page, size_t count, struct queue_limits *lim) \
{ \
- return queue_feature_store(disk, page, count, _feature); \
+ return queue_feature_store(disk, page, count, lim, _feature); \
}
QUEUE_SYSFS_FEATURE(rotational, BLK_FEAT_ROTATIONAL)
@@ -245,10 +231,17 @@ static ssize_t queue_##_name##_show(struct gendisk *disk, char *page) \
!!(disk->queue->limits.features & _feature)); \
}
-QUEUE_SYSFS_FEATURE_SHOW(poll, BLK_FEAT_POLL);
QUEUE_SYSFS_FEATURE_SHOW(fua, BLK_FEAT_FUA);
QUEUE_SYSFS_FEATURE_SHOW(dax, BLK_FEAT_DAX);
+static ssize_t queue_poll_show(struct gendisk *disk, char *page)
+{
+ if (queue_is_mq(disk->queue))
+ return sysfs_emit(page, "%u\n", blk_mq_can_poll(disk->queue));
+ return sysfs_emit(page, "%u\n",
+ !!(disk->queue->limits.features & BLK_FEAT_POLL));
+}
+
static ssize_t queue_zoned_show(struct gendisk *disk, char *page)
{
if (blk_queue_is_zoned(disk->queue))
@@ -266,10 +259,9 @@ static ssize_t queue_iostats_passthrough_show(struct gendisk *disk, char *page)
return queue_var_show(!!blk_queue_passthrough_stat(disk->queue), page);
}
-static ssize_t queue_iostats_passthrough_store(struct gendisk *disk,
- const char *page, size_t count)
+static int queue_iostats_passthrough_store(struct gendisk *disk,
+ const char *page, size_t count, struct queue_limits *lim)
{
- struct queue_limits lim;
unsigned long ios;
ssize_t ret;
@@ -277,18 +269,13 @@ static ssize_t queue_iostats_passthrough_store(struct gendisk *disk,
if (ret < 0)
return ret;
- lim = queue_limits_start_update(disk->queue);
if (ios)
- lim.flags |= BLK_FLAG_IOSTATS_PASSTHROUGH;
+ lim->flags |= BLK_FLAG_IOSTATS_PASSTHROUGH;
else
- lim.flags &= ~BLK_FLAG_IOSTATS_PASSTHROUGH;
-
- ret = queue_limits_commit_update(disk->queue, &lim);
- if (ret)
- return ret;
-
- return count;
+ lim->flags &= ~BLK_FLAG_IOSTATS_PASSTHROUGH;
+ return 0;
}
+
static ssize_t queue_nomerges_show(struct gendisk *disk, char *page)
{
return queue_var_show((blk_queue_nomerges(disk->queue) << 1) |
@@ -391,12 +378,10 @@ static ssize_t queue_wc_show(struct gendisk *disk, char *page)
return sysfs_emit(page, "write through\n");
}
-static ssize_t queue_wc_store(struct gendisk *disk, const char *page,
- size_t count)
+static int queue_wc_store(struct gendisk *disk, const char *page,
+ size_t count, struct queue_limits *lim)
{
- struct queue_limits lim;
bool disable;
- int err;
if (!strncmp(page, "write back", 10)) {
disable = false;
@@ -407,15 +392,11 @@ static ssize_t queue_wc_store(struct gendisk *disk, const char *page,
return -EINVAL;
}
- lim = queue_limits_start_update(disk->queue);
if (disable)
- lim.flags |= BLK_FLAG_WRITE_CACHE_DISABLED;
+ lim->flags |= BLK_FLAG_WRITE_CACHE_DISABLED;
else
- lim.flags &= ~BLK_FLAG_WRITE_CACHE_DISABLED;
- err = queue_limits_commit_update(disk->queue, &lim);
- if (err)
- return err;
- return count;
+ lim->flags &= ~BLK_FLAG_WRITE_CACHE_DISABLED;
+ return 0;
}
#define QUEUE_RO_ENTRY(_prefix, _name) \
@@ -431,6 +412,13 @@ static struct queue_sysfs_entry _prefix##_entry = { \
.store = _prefix##_store, \
};
+#define QUEUE_LIM_RW_ENTRY(_prefix, _name) \
+static struct queue_sysfs_entry _prefix##_entry = { \
+ .attr = { .name = _name, .mode = 0644 }, \
+ .show = _prefix##_show, \
+ .store_limit = _prefix##_store, \
+}
+
#define QUEUE_RW_LOAD_MODULE_ENTRY(_prefix, _name) \
static struct queue_sysfs_entry _prefix##_entry = { \
.attr = { .name = _name, .mode = 0644 }, \
@@ -441,7 +429,7 @@ static struct queue_sysfs_entry _prefix##_entry = { \
QUEUE_RW_ENTRY(queue_requests, "nr_requests");
QUEUE_RW_ENTRY(queue_ra, "read_ahead_kb");
-QUEUE_RW_ENTRY(queue_max_sectors, "max_sectors_kb");
+QUEUE_LIM_RW_ENTRY(queue_max_sectors, "max_sectors_kb");
QUEUE_RO_ENTRY(queue_max_hw_sectors, "max_hw_sectors_kb");
QUEUE_RO_ENTRY(queue_max_segments, "max_segments");
QUEUE_RO_ENTRY(queue_max_integrity_segments, "max_integrity_segments");
@@ -457,7 +445,7 @@ QUEUE_RO_ENTRY(queue_io_opt, "optimal_io_size");
QUEUE_RO_ENTRY(queue_max_discard_segments, "max_discard_segments");
QUEUE_RO_ENTRY(queue_discard_granularity, "discard_granularity");
QUEUE_RO_ENTRY(queue_max_hw_discard_sectors, "discard_max_hw_bytes");
-QUEUE_RW_ENTRY(queue_max_discard_sectors, "discard_max_bytes");
+QUEUE_LIM_RW_ENTRY(queue_max_discard_sectors, "discard_max_bytes");
QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data");
QUEUE_RO_ENTRY(queue_atomic_write_max_sectors, "atomic_write_max_bytes");
@@ -477,11 +465,11 @@ QUEUE_RO_ENTRY(queue_max_open_zones, "max_open_zones");
QUEUE_RO_ENTRY(queue_max_active_zones, "max_active_zones");
QUEUE_RW_ENTRY(queue_nomerges, "nomerges");
-QUEUE_RW_ENTRY(queue_iostats_passthrough, "iostats_passthrough");
+QUEUE_LIM_RW_ENTRY(queue_iostats_passthrough, "iostats_passthrough");
QUEUE_RW_ENTRY(queue_rq_affinity, "rq_affinity");
QUEUE_RW_ENTRY(queue_poll, "io_poll");
QUEUE_RW_ENTRY(queue_poll_delay, "io_poll_delay");
-QUEUE_RW_ENTRY(queue_wc, "write_cache");
+QUEUE_LIM_RW_ENTRY(queue_wc, "write_cache");
QUEUE_RO_ENTRY(queue_fua, "fua");
QUEUE_RO_ENTRY(queue_dax, "dax");
QUEUE_RW_ENTRY(queue_io_timeout, "io_timeout");
@@ -494,10 +482,10 @@ static struct queue_sysfs_entry queue_hw_sector_size_entry = {
.show = queue_logical_block_size_show,
};
-QUEUE_RW_ENTRY(queue_rotational, "rotational");
-QUEUE_RW_ENTRY(queue_iostats, "iostats");
-QUEUE_RW_ENTRY(queue_add_random, "add_random");
-QUEUE_RW_ENTRY(queue_stable_writes, "stable_writes");
+QUEUE_LIM_RW_ENTRY(queue_rotational, "rotational");
+QUEUE_LIM_RW_ENTRY(queue_iostats, "iostats");
+QUEUE_LIM_RW_ENTRY(queue_add_random, "add_random");
+QUEUE_LIM_RW_ENTRY(queue_stable_writes, "stable_writes");
#ifdef CONFIG_BLK_WBT
static ssize_t queue_var_store64(s64 *var, const char *page)
@@ -693,9 +681,10 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr,
struct queue_sysfs_entry *entry = to_queue(attr);
struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj);
struct request_queue *q = disk->queue;
+ unsigned int memflags;
ssize_t res;
- if (!entry->store)
+ if (!entry->store_limit && !entry->store)
return -EIO;
/*
@@ -706,11 +695,26 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr,
if (entry->load_module)
entry->load_module(disk, page, length);
- blk_mq_freeze_queue(q);
+ if (entry->store_limit) {
+ struct queue_limits lim = queue_limits_start_update(q);
+
+ res = entry->store_limit(disk, page, length, &lim);
+ if (res < 0) {
+ queue_limits_cancel_update(q);
+ return res;
+ }
+
+ res = queue_limits_commit_update_frozen(q, &lim);
+ if (res)
+ return res;
+ return length;
+ }
+
mutex_lock(&q->sysfs_lock);
+ memflags = blk_mq_freeze_queue(q);
res = entry->store(disk, page, length);
+ blk_mq_unfreeze_queue(q, memflags);
mutex_unlock(&q->sysfs_lock);
- blk_mq_unfreeze_queue(q);
return res;
}
@@ -758,7 +762,6 @@ int blk_register_queue(struct gendisk *disk)
struct request_queue *q = disk->queue;
int ret;
- mutex_lock(&q->sysfs_dir_lock);
kobject_init(&disk->queue_kobj, &blk_queue_ktype);
ret = kobject_add(&disk->queue_kobj, &disk_to_dev(disk)->kobj, "queue");
if (ret < 0)
@@ -799,7 +802,6 @@ int blk_register_queue(struct gendisk *disk)
if (q->elevator)
kobject_uevent(&q->elevator->kobj, KOBJ_ADD);
mutex_unlock(&q->sysfs_lock);
- mutex_unlock(&q->sysfs_dir_lock);
/*
* SCSI probing may synchronously create and destroy a lot of
@@ -824,7 +826,6 @@ out_debugfs_remove:
mutex_unlock(&q->sysfs_lock);
out_put_queue_kobj:
kobject_put(&disk->queue_kobj);
- mutex_unlock(&q->sysfs_dir_lock);
return ret;
}
@@ -855,7 +856,6 @@ void blk_unregister_queue(struct gendisk *disk)
blk_queue_flag_clear(QUEUE_FLAG_REGISTERED, q);
mutex_unlock(&q->sysfs_lock);
- mutex_lock(&q->sysfs_dir_lock);
/*
* Remove the sysfs attributes before unregistering the queue data
* structures that can be modified through sysfs.
@@ -872,7 +872,6 @@ void blk_unregister_queue(struct gendisk *disk)
/* Now that we've deleted all child objects, we can delete the queue. */
kobject_uevent(&disk->queue_kobj, KOBJ_REMOVE);
kobject_del(&disk->queue_kobj);
- mutex_unlock(&q->sysfs_dir_lock);
blk_debugfs_remove(disk);
}
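With the new ->store_limit() hook, attributes that only modify queue_limits no longer open their own limits update: queue_attr_store() starts the update, calls the hook to fill in @lim, and commits it with queue_limits_commit_update_frozen(). A hedged sketch of wiring up a new limit-backed attribute this way (hypothetical "example_kb" attribute, not part of this diff):

	static ssize_t queue_example_show(struct gendisk *disk, char *page)
	{
		/* Hypothetical: expose io_opt in KiB. */
		return sysfs_emit(page, "%u\n", disk->queue->limits.io_opt >> 10);
	}

	static int queue_example_store(struct gendisk *disk, const char *page,
			size_t count, struct queue_limits *lim)
	{
		unsigned long kb;
		ssize_t ret = queue_var_store(&kb, page, count);

		if (ret < 0)
			return ret;
		/* Only fill in @lim; queue_attr_store() commits it frozen. */
		lim->io_opt = kb << 10;
		return 0;
	}

	QUEUE_LIM_RW_ENTRY(queue_example, "example_kb");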
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 82dbaefcfa3b..8d149aff9fd0 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1202,6 +1202,7 @@ static int blk_throtl_init(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
struct throtl_data *td;
+ unsigned int memflags;
int ret;
td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
@@ -1215,7 +1216,7 @@ static int blk_throtl_init(struct gendisk *disk)
* Freeze queue before activating policy, to synchronize with IO path,
* which is protected by 'q_usage_counter'.
*/
- blk_mq_freeze_queue(disk->queue);
+ memflags = blk_mq_freeze_queue(disk->queue);
blk_mq_quiesce_queue(disk->queue);
q->td = td;
@@ -1239,7 +1240,7 @@ static int blk_throtl_init(struct gendisk *disk)
out:
blk_mq_unquiesce_queue(disk->queue);
- blk_mq_unfreeze_queue(disk->queue);
+ blk_mq_unfreeze_queue(disk->queue, memflags);
return ret;
}
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 84da1eadff64..0c77244a35c9 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -11,12 +11,8 @@
*/
#include <linux/kernel.h>
-#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
-#include <linux/mm.h>
-#include <linux/vmalloc.h>
-#include <linux/sched/mm.h>
#include <linux/spinlock.h>
#include <linux/refcount.h>
#include <linux/mempool.h>
@@ -414,13 +410,14 @@ static bool disk_insert_zone_wplug(struct gendisk *disk,
}
}
hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]);
+ atomic_inc(&disk->nr_zone_wplugs);
spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
return true;
}
-static struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk,
- sector_t sector)
+static struct blk_zone_wplug *disk_get_hashed_zone_wplug(struct gendisk *disk,
+ sector_t sector)
{
unsigned int zno = disk_zone_no(disk, sector);
unsigned int idx = hash_32(zno, disk->zone_wplugs_hash_bits);
@@ -441,6 +438,15 @@ static struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk,
return NULL;
}
+static inline struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk,
+ sector_t sector)
+{
+ if (!atomic_read(&disk->nr_zone_wplugs))
+ return NULL;
+
+ return disk_get_hashed_zone_wplug(disk, sector);
+}
+
static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head)
{
struct blk_zone_wplug *zwplug =
@@ -463,6 +469,8 @@ static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug)
static inline bool disk_should_remove_zone_wplug(struct gendisk *disk,
struct blk_zone_wplug *zwplug)
{
+ lockdep_assert_held(&zwplug->lock);
+
/* If the zone write plug was already removed, we are done. */
if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)
return false;
@@ -505,6 +513,7 @@ static void disk_remove_zone_wplug(struct gendisk *disk,
zwplug->flags |= BLK_ZONE_WPLUG_UNHASHED;
spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
hlist_del_init_rcu(&zwplug->node);
+ atomic_dec(&disk->nr_zone_wplugs);
spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
disk_put_zone_wplug(zwplug);
}
@@ -584,6 +593,7 @@ static inline void blk_zone_wplug_bio_io_error(struct blk_zone_wplug *zwplug,
bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);
bio_io_error(bio);
disk_put_zone_wplug(zwplug);
+ /* Drop the reference taken by disk_zone_wplug_add_bio(). */
blk_queue_exit(q);
}
@@ -594,6 +604,11 @@ static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug)
{
struct bio *bio;
+ if (bio_list_empty(&zwplug->bio_list))
+ return;
+
+ pr_warn_ratelimited("%s: zone %u: Aborting plugged BIOs\n",
+ zwplug->disk->disk_name, zwplug->zone_no);
while ((bio = bio_list_pop(&zwplug->bio_list)))
blk_zone_wplug_bio_io_error(zwplug, bio);
}
@@ -895,10 +910,7 @@ void blk_zone_write_plug_init_request(struct request *req)
break;
}
- /*
- * Drop the extra reference on the queue usage we got when
- * plugging the BIO and advance the write pointer offset.
- */
+ /* Drop the reference taken by disk_zone_wplug_add_bio(). */
blk_queue_exit(q);
zwplug->wp_offset += bio_sectors(bio);
@@ -917,6 +929,8 @@ static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug,
{
struct gendisk *disk = bio->bi_bdev->bd_disk;
+ lockdep_assert_held(&zwplug->lock);
+
/*
* If we lost track of the zone write pointer due to a write error,
* the user must either execute a report zones, reset the zone or finish
@@ -1042,6 +1056,47 @@ plug:
return true;
}
+static void blk_zone_wplug_handle_native_zone_append(struct bio *bio)
+{
+ struct gendisk *disk = bio->bi_bdev->bd_disk;
+ struct blk_zone_wplug *zwplug;
+ unsigned long flags;
+
+ /*
+ * We have native support for zone append operations, so we are not
+ * going to handle @bio through plugging. However, we may already have a
+ * zone write plug for the target zone if that zone was previously
+ * partially written using regular writes. In such a case, we risk leaving
+ * the plug in the disk hash table if the zone is fully written using
+ * zone append operations. Avoid this by removing the zone write plug.
+ */
+ zwplug = disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
+ if (likely(!zwplug))
+ return;
+
+ spin_lock_irqsave(&zwplug->lock, flags);
+
+ /*
+ * We are about to remove the zone write plug. But if the user
+ * (mistakenly) has issued regular writes together with native zone
+ * append, we must abort the writes as otherwise the plugged BIOs would
+ * not be executed by the plug BIO work as disk_get_zone_wplug() will
+ * return NULL after the plug is removed. Aborting the plugged write
+ * BIOs is consistent with the fact that these writes will most likely
+ * fail anyway as there are no ordering guarantees between zone append
+ * operations and regular write operations.
+ */
+ if (!bio_list_empty(&zwplug->bio_list)) {
+ pr_warn_ratelimited("%s: zone %u: Invalid mix of zone append and regular writes\n",
+ disk->disk_name, zwplug->zone_no);
+ disk_zone_wplug_abort(zwplug);
+ }
+ disk_remove_zone_wplug(disk, zwplug);
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+
+ disk_put_zone_wplug(zwplug);
+}
+
/**
* blk_zone_plug_bio - Handle a zone write BIO with zone write plugging
* @bio: The BIO being submitted
@@ -1098,8 +1153,10 @@ bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs)
*/
switch (bio_op(bio)) {
case REQ_OP_ZONE_APPEND:
- if (!bdev_emulates_zone_append(bdev))
+ if (!bdev_emulates_zone_append(bdev)) {
+ blk_zone_wplug_handle_native_zone_append(bio);
return false;
+ }
fallthrough;
case REQ_OP_WRITE:
case REQ_OP_WRITE_ZEROES:
@@ -1286,6 +1343,7 @@ static int disk_alloc_zone_resources(struct gendisk *disk,
{
unsigned int i;
+ atomic_set(&disk->nr_zone_wplugs, 0);
disk->zone_wplugs_hash_bits =
min(ilog2(pool_size) + 1, BLK_ZONE_WPLUG_MAX_HASH_BITS);
@@ -1340,6 +1398,7 @@ static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk)
}
}
+ WARN_ON_ONCE(atomic_read(&disk->nr_zone_wplugs));
kfree(disk->zone_wplugs_hash);
disk->zone_wplugs_hash = NULL;
disk->zone_wplugs_hash_bits = 0;
@@ -1446,7 +1505,6 @@ static int disk_update_zone_resources(struct gendisk *disk,
unsigned int nr_seq_zones, nr_conv_zones;
unsigned int pool_size;
struct queue_limits lim;
- int ret;
disk->nr_zones = args->nr_zones;
disk->zone_capacity = args->zone_capacity;
@@ -1497,11 +1555,7 @@ static int disk_update_zone_resources(struct gendisk *disk,
}
commit:
- blk_mq_freeze_queue(q);
- ret = queue_limits_commit_update(q, &lim);
- blk_mq_unfreeze_queue(q);
-
- return ret;
+ return queue_limits_commit_update_frozen(q, &lim);
}
static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx,
@@ -1557,11 +1611,12 @@ static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx,
}
/*
- * We need to track the write pointer of all zones that are not
- * empty nor full. So make sure we have a zone write plug for
- * such zone if the device has a zone write plug hash table.
+ * If the device needs zone append emulation, we need to track the
+ * write pointer of all zones that are not empty nor full. So make sure
+ * we have a zone write plug for such zone if the device has a zone
+ * write plug hash table.
*/
- if (!disk->zone_wplugs_hash)
+ if (!queue_emulates_zone_append(disk->queue) || !disk->zone_wplugs_hash)
return 0;
disk_zone_wplug_sync_wp_offset(disk, zone);
@@ -1724,9 +1779,10 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
else
pr_warn("%s: failed to revalidate zones\n", disk->disk_name);
if (ret) {
- blk_mq_freeze_queue(q);
+ unsigned int memflags = blk_mq_freeze_queue(q);
+
disk_free_zone_resources(disk);
- blk_mq_unfreeze_queue(q);
+ blk_mq_unfreeze_queue(q, memflags);
}
return ret;
@@ -1776,37 +1832,41 @@ int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector,
EXPORT_SYMBOL_GPL(blk_zone_issue_zeroout);
#ifdef CONFIG_BLK_DEBUG_FS
+static void queue_zone_wplug_show(struct blk_zone_wplug *zwplug,
+ struct seq_file *m)
+{
+ unsigned int zwp_wp_offset, zwp_flags;
+ unsigned int zwp_zone_no, zwp_ref;
+ unsigned int zwp_bio_list_size;
+ unsigned long flags;
+
+ spin_lock_irqsave(&zwplug->lock, flags);
+ zwp_zone_no = zwplug->zone_no;
+ zwp_flags = zwplug->flags;
+ zwp_ref = refcount_read(&zwplug->ref);
+ zwp_wp_offset = zwplug->wp_offset;
+ zwp_bio_list_size = bio_list_size(&zwplug->bio_list);
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+
+ seq_printf(m, "%u 0x%x %u %u %u\n", zwp_zone_no, zwp_flags, zwp_ref,
+ zwp_wp_offset, zwp_bio_list_size);
+}
int queue_zone_wplugs_show(void *data, struct seq_file *m)
{
struct request_queue *q = data;
struct gendisk *disk = q->disk;
struct blk_zone_wplug *zwplug;
- unsigned int zwp_wp_offset, zwp_flags;
- unsigned int zwp_zone_no, zwp_ref;
- unsigned int zwp_bio_list_size, i;
- unsigned long flags;
+ unsigned int i;
if (!disk->zone_wplugs_hash)
return 0;
rcu_read_lock();
- for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) {
- hlist_for_each_entry_rcu(zwplug,
- &disk->zone_wplugs_hash[i], node) {
- spin_lock_irqsave(&zwplug->lock, flags);
- zwp_zone_no = zwplug->zone_no;
- zwp_flags = zwplug->flags;
- zwp_ref = refcount_read(&zwplug->ref);
- zwp_wp_offset = zwplug->wp_offset;
- zwp_bio_list_size = bio_list_size(&zwplug->bio_list);
- spin_unlock_irqrestore(&zwplug->lock, flags);
-
- seq_printf(m, "%u 0x%x %u %u %u\n",
- zwp_zone_no, zwp_flags, zwp_ref,
- zwp_wp_offset, zwp_bio_list_size);
- }
- }
+ for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++)
+ hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[i],
+ node)
+ queue_zone_wplug_show(zwplug, m);
rcu_read_unlock();
return 0;
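For reference, the refactored queue_zone_wplugs_show() still emits one line per zone write plug in the order zone number, flags (hex), reference count, write pointer offset and plugged BIO count; a hypothetical line "12 0x1 2 256 3" would describe zone 12 with flags 0x1, two references, a write pointer offset of 256 sectors and three plugged BIOs (values made up for illustration).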
diff --git a/block/blk.h b/block/blk.h
index 2c26abf505b8..9cf9a0099416 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -13,6 +13,9 @@
struct elevator_type;
+#define BLK_DEV_MAX_SECTORS (LLONG_MAX >> 9)
+#define BLK_MIN_SEGMENT_SIZE 4096
+
/* Max future timer expiry for timeouts */
#define BLK_MAX_TIMEOUT (5 * HZ)
@@ -356,8 +359,12 @@ struct bio *bio_split_zone_append(struct bio *bio,
static inline bool bio_may_need_split(struct bio *bio,
const struct queue_limits *lim)
{
- return lim->chunk_sectors || bio->bi_vcnt != 1 ||
- bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > PAGE_SIZE;
+ if (lim->chunk_sectors)
+ return true;
+ if (bio->bi_vcnt != 1)
+ return true;
+ return bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset >
+ lim->min_segment_size;
}
/**
@@ -556,14 +563,6 @@ void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors);
struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
struct lock_class_key *lkclass);
-int bio_add_hw_page(struct request_queue *q, struct bio *bio,
- struct page *page, unsigned int len, unsigned int offset,
- unsigned int max_sectors, bool *same_page);
-
-int bio_add_hw_folio(struct request_queue *q, struct bio *bio,
- struct folio *folio, size_t len, size_t offset,
- unsigned int max_sectors, bool *same_page);
-
/*
* Clean up a page appropriately, where the page may be pinned, may have a
* ref taken on it or neither.
@@ -720,22 +719,29 @@ void blk_integrity_verify(struct bio *bio);
void blk_integrity_prepare(struct request *rq);
void blk_integrity_complete(struct request *rq, unsigned int nr_bytes);
-static inline void blk_freeze_acquire_lock(struct request_queue *q, bool
- disk_dead, bool queue_dying)
+#ifdef CONFIG_LOCKDEP
+static inline void blk_freeze_acquire_lock(struct request_queue *q)
{
- if (!disk_dead)
+ if (!q->mq_freeze_disk_dead)
rwsem_acquire(&q->io_lockdep_map, 0, 1, _RET_IP_);
- if (!queue_dying)
+ if (!q->mq_freeze_queue_dying)
rwsem_acquire(&q->q_lockdep_map, 0, 1, _RET_IP_);
}
-static inline void blk_unfreeze_release_lock(struct request_queue *q, bool
- disk_dead, bool queue_dying)
+static inline void blk_unfreeze_release_lock(struct request_queue *q)
{
- if (!queue_dying)
+ if (!q->mq_freeze_queue_dying)
rwsem_release(&q->q_lockdep_map, _RET_IP_);
- if (!disk_dead)
+ if (!q->mq_freeze_disk_dead)
rwsem_release(&q->io_lockdep_map, _RET_IP_);
}
+#else
+static inline void blk_freeze_acquire_lock(struct request_queue *q)
+{
+}
+static inline void blk_unfreeze_release_lock(struct request_queue *q)
+{
+}
+#endif
#endif /* BLK_INTERNAL_H */
diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index 32da4a4429ce..93523d8f8195 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -381,7 +381,7 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
set->queue_depth = 128;
set->numa_node = NUMA_NO_NODE;
set->cmd_size = sizeof(struct bsg_job) + dd_job_size;
- set->flags = BLK_MQ_F_NO_SCHED | BLK_MQ_F_BLOCKING;
+ set->flags = BLK_MQ_F_BLOCKING;
if (blk_mq_alloc_tag_set(set))
goto out_tag_set;
diff --git a/block/elevator.c b/block/elevator.c
index 7c3ba80e5ff4..cd2ce4921601 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -405,12 +405,12 @@ struct request *elv_former_request(struct request_queue *q, struct request *rq)
return NULL;
}
-#define to_elv(atr) container_of((atr), struct elv_fs_entry, attr)
+#define to_elv(atr) container_of_const((atr), struct elv_fs_entry, attr)
static ssize_t
elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
- struct elv_fs_entry *entry = to_elv(attr);
+ const struct elv_fs_entry *entry = to_elv(attr);
struct elevator_queue *e;
ssize_t error;
@@ -428,7 +428,7 @@ static ssize_t
elv_attr_store(struct kobject *kobj, struct attribute *attr,
const char *page, size_t length)
{
- struct elv_fs_entry *entry = to_elv(attr);
+ const struct elv_fs_entry *entry = to_elv(attr);
struct elevator_queue *e;
ssize_t error;
@@ -461,7 +461,7 @@ int elv_register_queue(struct request_queue *q, bool uevent)
error = kobject_add(&e->kobj, &q->disk->queue_kobj, "iosched");
if (!error) {
- struct elv_fs_entry *attr = e->type->elevator_attrs;
+ const struct elv_fs_entry *attr = e->type->elevator_attrs;
if (attr) {
while (attr->attr.name) {
if (sysfs_create_file(&e->kobj, &attr->attr))
@@ -547,14 +547,6 @@ void elv_unregister(struct elevator_type *e)
}
EXPORT_SYMBOL_GPL(elv_unregister);
-static inline bool elv_support_iosched(struct request_queue *q)
-{
- if (!queue_is_mq(q) ||
- (q->tag_set->flags & BLK_MQ_F_NO_SCHED))
- return false;
- return true;
-}
-
/*
* For single queue devices, default to using mq-deadline. If we have multiple
* queues or mq-deadline is not available, default to "none".
@@ -578,11 +570,9 @@ static struct elevator_type *elevator_get_default(struct request_queue *q)
void elevator_init_mq(struct request_queue *q)
{
struct elevator_type *e;
+ unsigned int memflags;
int err;
- if (!elv_support_iosched(q))
- return;
-
WARN_ON_ONCE(blk_queue_registered(q));
if (unlikely(q->elevator))
@@ -601,16 +591,13 @@ void elevator_init_mq(struct request_queue *q)
*
* Disk isn't added yet, so verifying queue lock only manually.
*/
- blk_freeze_queue_start_non_owner(q);
- blk_freeze_acquire_lock(q, true, false);
- blk_mq_freeze_queue_wait(q);
+ memflags = blk_mq_freeze_queue(q);
blk_mq_cancel_work_sync(q);
err = blk_mq_init_sched(q, e);
- blk_unfreeze_release_lock(q, true, false);
- blk_mq_unfreeze_queue_non_owner(q);
+ blk_mq_unfreeze_queue(q, memflags);
if (err) {
pr_warn("\"%s\" elevator initialization failed, "
@@ -628,11 +615,12 @@ void elevator_init_mq(struct request_queue *q)
*/
int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
{
+ unsigned int memflags;
int ret;
lockdep_assert_held(&q->sysfs_lock);
- blk_mq_freeze_queue(q);
+ memflags = blk_mq_freeze_queue(q);
blk_mq_quiesce_queue(q);
if (q->elevator) {
@@ -653,7 +641,7 @@ int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
out_unfreeze:
blk_mq_unquiesce_queue(q);
- blk_mq_unfreeze_queue(q);
+ blk_mq_unfreeze_queue(q, memflags);
if (ret) {
pr_warn("elv: switch to \"%s\" failed, falling back to \"none\"\n",
@@ -665,9 +653,11 @@ out_unfreeze:
void elevator_disable(struct request_queue *q)
{
+ unsigned int memflags;
+
lockdep_assert_held(&q->sysfs_lock);
- blk_mq_freeze_queue(q);
+ memflags = blk_mq_freeze_queue(q);
blk_mq_quiesce_queue(q);
elv_unregister_queue(q);
@@ -678,7 +668,7 @@ void elevator_disable(struct request_queue *q)
blk_add_trace_msg(q, "elv switch: none");
blk_mq_unquiesce_queue(q);
- blk_mq_unfreeze_queue(q);
+ blk_mq_unfreeze_queue(q, memflags);
}
/*
@@ -717,9 +707,6 @@ void elv_iosched_load_module(struct gendisk *disk, const char *buf,
struct elevator_type *found;
const char *name;
- if (!elv_support_iosched(disk->queue))
- return;
-
strscpy(elevator_name, buf, sizeof(elevator_name));
name = strstrip(elevator_name);
@@ -737,9 +724,6 @@ ssize_t elv_iosched_store(struct gendisk *disk, const char *buf,
char elevator_name[ELV_NAME_MAX];
int ret;
- if (!elv_support_iosched(disk->queue))
- return count;
-
strscpy(elevator_name, buf, sizeof(elevator_name));
ret = elevator_change(disk->queue, strstrip(elevator_name));
if (!ret)
@@ -754,9 +738,6 @@ ssize_t elv_iosched_show(struct gendisk *disk, char *name)
struct elevator_type *cur = NULL, *e;
int len = 0;
- if (!elv_support_iosched(q))
- return sprintf(name, "none\n");
-
if (!q->elevator) {
len += sprintf(name+len, "[none] ");
} else {
diff --git a/block/elevator.h b/block/elevator.h
index dbf357ef4fab..e526662c5dbb 100644
--- a/block/elevator.h
+++ b/block/elevator.h
@@ -71,7 +71,7 @@ struct elevator_type
size_t icq_size; /* see iocontext.h */
size_t icq_align; /* ditto */
- struct elv_fs_entry *elevator_attrs;
+ const struct elv_fs_entry *elevator_attrs;
const char *elevator_name;
const char *elevator_alias;
struct module *elevator_owner;
diff --git a/block/fops.c b/block/fops.c
index 13a67940d040..be9f1dbea9ce 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -54,6 +54,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
struct bio bio;
ssize_t ret;
+ WARN_ON_ONCE(iocb->ki_flags & IOCB_HAS_METADATA);
if (nr_pages <= DIO_INLINE_BIO_VECS)
vecs = inline_vecs;
else {
@@ -124,12 +125,16 @@ static void blkdev_bio_end_io(struct bio *bio)
{
struct blkdev_dio *dio = bio->bi_private;
bool should_dirty = dio->flags & DIO_SHOULD_DIRTY;
+ bool is_sync = dio->flags & DIO_IS_SYNC;
if (bio->bi_status && !dio->bio.bi_status)
dio->bio.bi_status = bio->bi_status;
+ if (!is_sync && (dio->iocb->ki_flags & IOCB_HAS_METADATA))
+ bio_integrity_unmap_user(bio);
+
if (atomic_dec_and_test(&dio->ref)) {
- if (!(dio->flags & DIO_IS_SYNC)) {
+ if (!is_sync) {
struct kiocb *iocb = dio->iocb;
ssize_t ret;
@@ -221,14 +226,16 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
* a retry of this from blocking context.
*/
if (unlikely(iov_iter_count(iter))) {
- bio_release_pages(bio, false);
- bio_clear_flag(bio, BIO_REFFED);
- bio_put(bio);
- blk_finish_plug(&plug);
- return -EAGAIN;
+ ret = -EAGAIN;
+ goto fail;
}
bio->bi_opf |= REQ_NOWAIT;
}
+ if (!is_sync && (iocb->ki_flags & IOCB_HAS_METADATA)) {
+ ret = bio_integrity_map_iter(bio, iocb->private);
+ if (unlikely(ret))
+ goto fail;
+ }
if (is_read) {
if (dio->flags & DIO_SHOULD_DIRTY)
@@ -269,6 +276,12 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
bio_put(&dio->bio);
return ret;
+fail:
+ bio_release_pages(bio, false);
+ bio_clear_flag(bio, BIO_REFFED);
+ bio_put(bio);
+ blk_finish_plug(&plug);
+ return ret;
}
static void blkdev_bio_end_io_async(struct bio *bio)
@@ -286,6 +299,9 @@ static void blkdev_bio_end_io_async(struct bio *bio)
ret = blk_status_to_errno(bio->bi_status);
}
+ if (iocb->ki_flags & IOCB_HAS_METADATA)
+ bio_integrity_unmap_user(bio);
+
iocb->ki_complete(iocb, ret);
if (dio->flags & DIO_SHOULD_DIRTY) {
@@ -330,10 +346,8 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
bio_iov_bvec_set(bio, iter);
} else {
ret = bio_iov_iter_get_pages(bio, iter);
- if (unlikely(ret)) {
- bio_put(bio);
- return ret;
- }
+ if (unlikely(ret))
+ goto out_bio_put;
}
dio->size = bio->bi_iter.bi_size;
@@ -346,6 +360,13 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
task_io_account_write(bio->bi_iter.bi_size);
}
+ if (iocb->ki_flags & IOCB_HAS_METADATA) {
+ ret = bio_integrity_map_iter(bio, iocb->private);
+ WRITE_ONCE(iocb->private, NULL);
+ if (unlikely(ret))
+ goto out_bio_put;
+ }
+
if (iocb->ki_flags & IOCB_ATOMIC)
bio->bi_opf |= REQ_ATOMIC;
@@ -360,6 +381,10 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
submit_bio(bio);
}
return -EIOCBQUEUED;
+
+out_bio_put:
+ bio_put(bio);
+ return ret;
}
static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
@@ -758,11 +783,12 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
file_accessed(iocb->ki_filp);
ret = blkdev_direct_IO(iocb, to);
- if (ret >= 0) {
+ if (ret > 0) {
iocb->ki_pos += ret;
count -= ret;
}
- iov_iter_revert(to, count - iov_iter_count(to));
+ if (ret != -EIOCBQUEUED)
+ iov_iter_revert(to, count - iov_iter_count(to));
if (ret < 0 || !count)
goto reexpand;
}
diff --git a/block/genhd.c b/block/genhd.c
index 79230c109fca..e9375e20d866 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -58,6 +58,13 @@ static DEFINE_IDA(ext_devt_ida);
void set_capacity(struct gendisk *disk, sector_t sectors)
{
+ if (sectors > BLK_DEV_MAX_SECTORS) {
+ pr_warn_once("%s: truncate capacity from %lld to %lld\n",
+ disk->disk_name, sectors,
+ BLK_DEV_MAX_SECTORS);
+ sectors = BLK_DEV_MAX_SECTORS;
+ }
+
bdev_set_nr_sectors(disk->part0, sectors);
}
EXPORT_SYMBOL(set_capacity);
@@ -400,21 +407,26 @@ int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk,
struct device *ddev = disk_to_dev(disk);
int ret;
- /* Only makes sense for bio-based to set ->poll_bio */
- if (queue_is_mq(disk->queue) && disk->fops->poll_bio)
+ if (WARN_ON_ONCE(bdev_nr_sectors(disk->part0) > BLK_DEV_MAX_SECTORS))
return -EINVAL;
- /*
- * The disk queue should now be all set with enough information about
- * the device for the elevator code to pick an adequate default
- * elevator if one is needed, that is, for devices requesting queue
- * registration.
- */
- elevator_init_mq(disk->queue);
+ if (queue_is_mq(disk->queue)) {
+ /*
+ * ->submit_bio and ->poll_bio are bypassed for blk-mq drivers.
+ */
+ if (disk->fops->submit_bio || disk->fops->poll_bio)
+ return -EINVAL;
- /* Mark bdev as having a submit_bio, if needed */
- if (disk->fops->submit_bio)
+ /*
+ * Initialize the I/O scheduler code and pick a default one if
+ * needed.
+ */
+ elevator_init_mq(disk->queue);
+ } else {
+ if (!disk->fops->submit_bio)
+ return -EINVAL;
bdev_set_flag(disk->part0, BD_HAS_SUBMIT_BIO);
+ }
/*
* If the driver provides an explicit major number it also must provide
@@ -661,7 +673,7 @@ void del_gendisk(struct gendisk *disk)
struct request_queue *q = disk->queue;
struct block_device *part;
unsigned long idx;
- bool start_drain, queue_dying;
+ bool start_drain;
might_sleep();
@@ -690,9 +702,8 @@ void del_gendisk(struct gendisk *disk)
*/
mutex_lock(&disk->open_mutex);
start_drain = __blk_mark_disk_dead(disk);
- queue_dying = blk_queue_dying(q);
if (start_drain)
- blk_freeze_acquire_lock(q, true, queue_dying);
+ blk_freeze_acquire_lock(q);
xa_for_each_start(&disk->part_tbl, idx, part, 1)
drop_partition(part);
mutex_unlock(&disk->open_mutex);
@@ -748,7 +759,7 @@ void del_gendisk(struct gendisk *disk)
blk_mq_exit_queue(q);
if (start_drain)
- blk_unfreeze_release_lock(q, true, queue_dying);
+ blk_unfreeze_release_lock(q);
}
EXPORT_SYMBOL(del_gendisk);
@@ -798,7 +809,7 @@ static ssize_t disk_badblocks_store(struct device *dev,
}
#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD
-void blk_request_module(dev_t devt)
+static bool blk_probe_dev(dev_t devt)
{
unsigned int major = MAJOR(devt);
struct blk_major_name **n;
@@ -808,14 +819,26 @@ void blk_request_module(dev_t devt)
if ((*n)->major == major && (*n)->probe) {
(*n)->probe(devt);
mutex_unlock(&major_names_lock);
- return;
+ return true;
}
}
mutex_unlock(&major_names_lock);
+ return false;
+}
+
+void blk_request_module(dev_t devt)
+{
+ int error;
+
+ if (blk_probe_dev(devt))
+ return;
- if (request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)) > 0)
- /* Make old-style 2.4 aliases work */
- request_module("block-major-%d", MAJOR(devt));
+ error = request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt));
+ /* Make old-style 2.4 aliases work */
+ if (error > 0)
+ error = request_module("block-major-%d", MAJOR(devt));
+ if (!error)
+ blk_probe_dev(devt);
}
#endif /* CONFIG_BLOCK_LEGACY_AUTOLOAD */
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index 4155594aefc6..dc31f2dfa414 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -889,7 +889,7 @@ KYBER_LAT_SHOW_STORE(KYBER_WRITE, write);
#undef KYBER_LAT_SHOW_STORE
#define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store)
-static struct elv_fs_entry kyber_sched_attrs[] = {
+static const struct elv_fs_entry kyber_sched_attrs[] = {
KYBER_LAT_ATTR(read),
KYBER_LAT_ATTR(write),
__ATTR_NULL
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 5528347b5fcf..754f6b7415cd 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -834,7 +834,7 @@ STORE_INT(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX);
#define DD_ATTR(name) \
__ATTR(name, 0644, deadline_##name##_show, deadline_##name##_store)
-static struct elv_fs_entry deadline_attrs[] = {
+static const struct elv_fs_entry deadline_attrs[] = {
DD_ATTR(read_expire),
DD_ATTR(write_expire),
DD_ATTR(writes_starved),
diff --git a/block/partitions/efi.c b/block/partitions/efi.c
index 5e9be13a56a8..7acba66eed48 100644
--- a/block/partitions/efi.c
+++ b/block/partitions/efi.c
@@ -682,7 +682,7 @@ static void utf16_le_to_7bit(const __le16 *in, unsigned int size, u8 *out)
out[size] = 0;
while (i < size) {
- u8 c = le16_to_cpu(in[i]) & 0xff;
+ u8 c = le16_to_cpu(in[i]) & 0x7f;
if (c && !isprint(c))
c = '!';
diff --git a/block/partitions/ldm.h b/block/partitions/ldm.h
index e259180c8914..aa3bd050d8cd 100644
--- a/block/partitions/ldm.h
+++ b/block/partitions/ldm.h
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-or-later
-/**
+/*
* ldm - Part of the Linux-NTFS project.
*
* Copyright (C) 2001,2002 Richard Russon <ldm@flatcap.org>
diff --git a/block/partitions/mac.c b/block/partitions/mac.c
index c80183156d68..b02530d98629 100644
--- a/block/partitions/mac.c
+++ b/block/partitions/mac.c
@@ -53,13 +53,25 @@ int mac_partition(struct parsed_partitions *state)
}
secsize = be16_to_cpu(md->block_size);
put_dev_sector(sect);
+
+ /*
+ * If the "block size" is not a power of 2, things get weird - we might
+ * end up with a partition straddling a sector boundary, so we wouldn't
+ * be able to read a partition entry with read_part_sector().
+ * Real block sizes are probably (?) powers of two, so just require
+ * that.
+ */
+ if (!is_power_of_2(secsize))
+ return -1;
datasize = round_down(secsize, 512);
data = read_part_sector(state, datasize / 512, &sect);
if (!data)
return -1;
partoffset = secsize % 512;
- if (partoffset + sizeof(*part) > datasize)
+ if (partoffset + sizeof(*part) > datasize) {
+ put_dev_sector(sect);
return -1;
+ }
part = (struct mac_partition *) (data + partoffset);
if (be16_to_cpu(part->signature) != MAC_PARTITION_MAGIC) {
put_dev_sector(sect);
@@ -112,8 +124,8 @@ int mac_partition(struct parsed_partitions *state)
int i, l;
goodness++;
- l = strlen(part->name);
- if (strcmp(part->name, "/") == 0)
+ l = strnlen(part->name, sizeof(part->name));
+ if (strncmp(part->name, "/", sizeof(part->name)) == 0)
goodness++;
for (i = 0; i <= l - 4; ++i) {
if (strncasecmp(part->name + i, "root",