Diffstat (limited to 'block')
40 files changed, 743 insertions, 807 deletions
diff --git a/block/Makefile b/block/Makefile index ddfd21c1a9ff..33748123710b 100644 --- a/block/Makefile +++ b/block/Makefile @@ -27,8 +27,6 @@ bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o obj-$(CONFIG_IOSCHED_BFQ) += bfq.o obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o -obj-$(CONFIG_BLK_MQ_PCI) += blk-mq-pci.o -obj-$(CONFIG_BLK_MQ_VIRTIO) += blk-mq-virtio.o obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o obj-$(CONFIG_BLK_WBT) += blk-wbt.o obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 95dd7b795935..167542201603 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -6844,16 +6844,24 @@ static struct bfq_queue *bfq_waker_bfqq(struct bfq_queue *bfqq) if (new_bfqq == waker_bfqq) { /* * If waker_bfqq is in the merge chain, and current - * is the only procress. + * is the only process, waker_bfqq can be freed. */ if (bfqq_process_refs(waker_bfqq) == 1) return NULL; - break; + + return waker_bfqq; } new_bfqq = new_bfqq->new_bfqq; } + /* + * If waker_bfqq is not in the merge chain, and it's procress reference + * is 0, waker_bfqq can be freed. + */ + if (bfqq_process_refs(waker_bfqq) == 0) + return NULL; + return waker_bfqq; } @@ -7614,7 +7622,7 @@ static ssize_t bfq_low_latency_store(struct elevator_queue *e, #define BFQ_ATTR(name) \ __ATTR(name, 0644, bfq_##name##_show, bfq_##name##_store) -static struct elv_fs_entry bfq_attrs[] = { +static const struct elv_fs_entry bfq_attrs[] = { BFQ_ATTR(fifo_expire_sync), BFQ_ATTR(fifo_expire_async), BFQ_ATTR(back_seek_max), diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 2a4bd6611692..5d81ad9a3d20 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -118,17 +118,18 @@ static void bio_integrity_unpin_bvec(struct bio_vec *bv, int nr_vecs, static void bio_integrity_uncopy_user(struct bio_integrity_payload *bip) { - unsigned short nr_vecs = bip->bip_max_vcnt - 1; - struct bio_vec *copy = &bip->bip_vec[1]; - size_t bytes = bip->bip_iter.bi_size; - struct iov_iter iter; + unsigned short orig_nr_vecs = bip->bip_max_vcnt - 1; + struct bio_vec *orig_bvecs = &bip->bip_vec[1]; + struct bio_vec *bounce_bvec = &bip->bip_vec[0]; + size_t bytes = bounce_bvec->bv_len; + struct iov_iter orig_iter; int ret; - iov_iter_bvec(&iter, ITER_DEST, copy, nr_vecs, bytes); - ret = copy_to_iter(bvec_virt(bip->bip_vec), bytes, &iter); + iov_iter_bvec(&orig_iter, ITER_DEST, orig_bvecs, orig_nr_vecs, bytes); + ret = copy_to_iter(bvec_virt(bounce_bvec), bytes, &orig_iter); WARN_ON_ONCE(ret != bytes); - bio_integrity_unpin_bvec(copy, nr_vecs, true); + bio_integrity_unpin_bvec(orig_bvecs, orig_nr_vecs, true); } /** @@ -301,16 +302,15 @@ static unsigned int bvec_from_pages(struct bio_vec *bvec, struct page **pages, return nr_bvecs; } -int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes) +int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); unsigned int align = blk_lim_dma_alignment_and_pad(&q->limits); struct page *stack_pages[UIO_FASTIOV], **pages = stack_pages; struct bio_vec stack_vec[UIO_FASTIOV], *bvec = stack_vec; + size_t offset, bytes = iter->count; unsigned int direction, nr_bvecs; - struct iov_iter iter; int ret, nr_vecs; - size_t offset; bool copy; if (bio_integrity(bio)) @@ -323,8 +323,7 @@ int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes) else direction = ITER_SOURCE; - iov_iter_ubuf(&iter, direction, ubuf, bytes); - nr_vecs = 
iov_iter_npages(&iter, BIO_MAX_VECS + 1); + nr_vecs = iov_iter_npages(iter, BIO_MAX_VECS + 1); if (nr_vecs > BIO_MAX_VECS) return -E2BIG; if (nr_vecs > UIO_FASTIOV) { @@ -334,8 +333,8 @@ int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes) pages = NULL; } - copy = !iov_iter_is_aligned(&iter, align, align); - ret = iov_iter_extract_pages(&iter, &pages, bytes, nr_vecs, 0, &offset); + copy = !iov_iter_is_aligned(iter, align, align); + ret = iov_iter_extract_pages(iter, &pages, bytes, nr_vecs, 0, &offset); if (unlikely(ret < 0)) goto free_bvec; @@ -365,6 +364,55 @@ free_bvec: return ret; } +static void bio_uio_meta_to_bip(struct bio *bio, struct uio_meta *meta) +{ + struct bio_integrity_payload *bip = bio_integrity(bio); + + if (meta->flags & IO_INTEGRITY_CHK_GUARD) + bip->bip_flags |= BIP_CHECK_GUARD; + if (meta->flags & IO_INTEGRITY_CHK_APPTAG) + bip->bip_flags |= BIP_CHECK_APPTAG; + if (meta->flags & IO_INTEGRITY_CHK_REFTAG) + bip->bip_flags |= BIP_CHECK_REFTAG; + + bip->app_tag = meta->app_tag; +} + +int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta) +{ + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); + unsigned int integrity_bytes; + int ret; + struct iov_iter it; + + if (!bi) + return -EINVAL; + /* + * original meta iterator can be bigger. + * process integrity info corresponding to current data buffer only. + */ + it = meta->iter; + integrity_bytes = bio_integrity_bytes(bi, bio_sectors(bio)); + if (it.count < integrity_bytes) + return -EINVAL; + + /* should fit into two bytes */ + BUILD_BUG_ON(IO_INTEGRITY_VALID_FLAGS >= (1 << 16)); + + if (meta->flags && (meta->flags & ~IO_INTEGRITY_VALID_FLAGS)) + return -EINVAL; + + it.count = integrity_bytes; + ret = bio_integrity_map_user(bio, &it); + if (!ret) { + bio_uio_meta_to_bip(bio, meta); + bip_set_seed(bio_integrity(bio), meta->seed); + iov_iter_advance(&meta->iter, integrity_bytes); + meta->seed += bio_integrity_intervals(bi, bio_sectors(bio)); + } + return ret; +} + /** * bio_integrity_prep - Prepare bio for integrity I/O * @bio: bio to prepare @@ -435,6 +483,11 @@ bool bio_integrity_prep(struct bio *bio) if (bi->csum_type == BLK_INTEGRITY_CSUM_IP) bip->bip_flags |= BIP_IP_CHECKSUM; + /* describe what tags to check in payload */ + if (bi->csum_type) + bip->bip_flags |= BIP_CHECK_GUARD; + if (bi->flags & BLK_INTEGRITY_REF_TAG) + bip->bip_flags |= BIP_CHECK_REFTAG; if (bio_integrity_add_page(bio, virt_to_page(buf), len, offset_in_page(buf)) < len) { printk(KERN_ERR "could not attach integrity payload\n"); @@ -559,7 +612,8 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, bip->bip_vec = bip_src->bip_vec; bip->bip_iter = bip_src->bip_iter; - bip->bip_flags = bip_src->bip_flags & ~BIP_BLOCK_INTEGRITY; + bip->bip_flags = bip_src->bip_flags & BIP_CLONE_FLAGS; + bip->app_tag = bip_src->app_tag; return 0; } diff --git a/block/bio.c b/block/bio.c index d5bdc31d88d3..6ac5983ba51e 100644 --- a/block/bio.c +++ b/block/bio.c @@ -77,7 +77,7 @@ struct bio_slab { struct kmem_cache *slab; unsigned int slab_ref; unsigned int slab_size; - char name[8]; + char name[12]; }; static DEFINE_MUTEX(bio_slab_lock); static DEFINE_XARRAY(bio_slabs); @@ -946,8 +946,11 @@ static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page, /* * Try to merge a page into a segment, while obeying the hardware segment - * size limit. This is not for normal read/write bios, but for passthrough - * or Zone Append operations that we can't split. + * size limit. 
+ * + * This is kept around for the integrity metadata, which is still tries + * to build the initial bio to the hardware limit and doesn't have proper + * helpers to split. Hopefully this will go away soon. */ bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, struct page *page, unsigned len, unsigned offset, @@ -965,106 +968,6 @@ bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, } /** - * bio_add_hw_page - attempt to add a page to a bio with hw constraints - * @q: the target queue - * @bio: destination bio - * @page: page to add - * @len: vec entry length - * @offset: vec entry offset - * @max_sectors: maximum number of sectors that can be added - * @same_page: return if the segment has been merged inside the same page - * - * Add a page to a bio while respecting the hardware max_sectors, max_segment - * and gap limitations. - */ -int bio_add_hw_page(struct request_queue *q, struct bio *bio, - struct page *page, unsigned int len, unsigned int offset, - unsigned int max_sectors, bool *same_page) -{ - unsigned int max_size = max_sectors << SECTOR_SHIFT; - - if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) - return 0; - - len = min3(len, max_size, queue_max_segment_size(q)); - if (len > max_size - bio->bi_iter.bi_size) - return 0; - - if (bio->bi_vcnt > 0) { - struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; - - if (bvec_try_merge_hw_page(q, bv, page, len, offset, - same_page)) { - bio->bi_iter.bi_size += len; - return len; - } - - if (bio->bi_vcnt >= - min(bio->bi_max_vecs, queue_max_segments(q))) - return 0; - - /* - * If the queue doesn't support SG gaps and adding this segment - * would create a gap, disallow it. - */ - if (bvec_gap_to_prev(&q->limits, bv, offset)) - return 0; - } - - bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, offset); - bio->bi_vcnt++; - bio->bi_iter.bi_size += len; - return len; -} - -/** - * bio_add_hw_folio - attempt to add a folio to a bio with hw constraints - * @q: the target queue - * @bio: destination bio - * @folio: folio to add - * @len: vec entry length - * @offset: vec entry offset in the folio - * @max_sectors: maximum number of sectors that can be added - * @same_page: return if the segment has been merged inside the same folio - * - * Add a folio to a bio while respecting the hardware max_sectors, max_segment - * and gap limitations. - */ -int bio_add_hw_folio(struct request_queue *q, struct bio *bio, - struct folio *folio, size_t len, size_t offset, - unsigned int max_sectors, bool *same_page) -{ - if (len > UINT_MAX || offset > UINT_MAX) - return 0; - return bio_add_hw_page(q, bio, folio_page(folio, 0), len, offset, - max_sectors, same_page); -} - -/** - * bio_add_pc_page - attempt to add page to passthrough bio - * @q: the target queue - * @bio: destination bio - * @page: page to add - * @len: vec entry length - * @offset: vec entry offset - * - * Attempt to add a page to the bio_vec maplist. This can fail for a - * number of reasons, such as the bio being full or target block device - * limitations. The target block device must allow bio's up to PAGE_SIZE, - * so it is always possible to add a single page to an empty bio. - * - * This should only be used by passthrough bios. 
- */ -int bio_add_pc_page(struct request_queue *q, struct bio *bio, - struct page *page, unsigned int len, unsigned int offset) -{ - bool same_page = false; - return bio_add_hw_page(q, bio, page, len, offset, - queue_max_hw_sectors(q), &same_page); -} -EXPORT_SYMBOL(bio_add_pc_page); - -/** * __bio_add_page - add page(s) to a bio in a new segment * @bio: destination bio * @page: start page to add @@ -1707,6 +1610,10 @@ EXPORT_SYMBOL(bio_split); */ void bio_trim(struct bio *bio, sector_t offset, sector_t size) { + /* We should never trim an atomic write */ + if (WARN_ON_ONCE(bio->bi_opf & REQ_ATOMIC && size)) + return; + if (WARN_ON_ONCE(offset > BIO_MAX_SECTORS || size > BIO_MAX_SECTORS || offset + size > bio_sectors(bio))) return; diff --git a/block/blk-cgroup-rwstat.h b/block/blk-cgroup-rwstat.h index 022527b0b043..703a16fe1404 100644 --- a/block/blk-cgroup-rwstat.h +++ b/block/blk-cgroup-rwstat.h @@ -52,7 +52,7 @@ void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, /** * blkg_rwstat_add - add a value to a blkg_rwstat * @rwstat: target blkg_rwstat - * @op: REQ_OP and flags + * @opf: REQ_OP and flags * @val: value to add * * Add @val to @rwstat. The counters are chosen according to @rw. The @@ -83,8 +83,9 @@ static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, /** * blkg_rwstat_read - read the current values of a blkg_rwstat * @rwstat: blkg_rwstat to read + * @result: where to put the current values * - * Read the current snapshot of @rwstat and return it in the aux counts. + * Read the current snapshot of @rwstat and return it in the @result counts. */ static inline void blkg_rwstat_read(struct blkg_rwstat *rwstat, struct blkg_rwstat_sample *result) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 45a395862fbc..9ed93d91d754 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1138,6 +1138,7 @@ static void blkcg_fill_root_iostats(void) blkg_iostat_set(&blkg->iostat.cur, &tmp); u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags); } + class_dev_iter_exit(&iter); } static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s) @@ -1545,6 +1546,7 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol) struct request_queue *q = disk->queue; struct blkg_policy_data *pd_prealloc = NULL; struct blkcg_gq *blkg, *pinned_blkg = NULL; + unsigned int memflags; int ret; if (blkcg_policy_enabled(q, pol)) @@ -1559,7 +1561,7 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol) return -EINVAL; if (queue_is_mq(q)) - blk_mq_freeze_queue(q); + memflags = blk_mq_freeze_queue(q); retry: spin_lock_irq(&q->queue_lock); @@ -1623,7 +1625,7 @@ retry: spin_unlock_irq(&q->queue_lock); out: if (queue_is_mq(q)) - blk_mq_unfreeze_queue(q); + blk_mq_unfreeze_queue(q, memflags); if (pinned_blkg) blkg_put(pinned_blkg); if (pd_prealloc) @@ -1667,12 +1669,13 @@ void blkcg_deactivate_policy(struct gendisk *disk, { struct request_queue *q = disk->queue; struct blkcg_gq *blkg; + unsigned int memflags; if (!blkcg_policy_enabled(q, pol)) return; if (queue_is_mq(q)) - blk_mq_freeze_queue(q); + memflags = blk_mq_freeze_queue(q); mutex_lock(&q->blkcg_mutex); spin_lock_irq(&q->queue_lock); @@ -1696,7 +1699,7 @@ void blkcg_deactivate_policy(struct gendisk *disk, mutex_unlock(&q->blkcg_mutex); if (queue_is_mq(q)) - blk_mq_unfreeze_queue(q); + blk_mq_unfreeze_queue(q, memflags); } EXPORT_SYMBOL_GPL(blkcg_deactivate_policy); diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 
b9e3265c1eb3..2c4663bd993a 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -225,7 +225,9 @@ void blkg_conf_exit(struct blkg_conf_ctx *ctx); /** * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg - * @return: true if this bio needs to be submitted with the root blkg context. + * @bio: the target &bio + * + * Return: true if this bio needs to be submitted with the root blkg context. * * In order to avoid priority inversions we sometimes need to issue a bio as if * it were attached to the root blkg, and then backcharge to the actual owning @@ -245,7 +247,7 @@ static inline bool bio_issue_as_root_blkg(struct bio *bio) * @q: request_queue of interest * * Lookup blkg for the @blkcg - @q pair. - + * * Must be called in a RCU critical section. */ static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, @@ -268,7 +270,7 @@ static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, } /** - * blkg_to_pdata - get policy private data + * blkg_to_pd - get policy private data * @blkg: blkg of interest * @pol: policy of interest * @@ -287,7 +289,7 @@ static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg, } /** - * pdata_to_blkg - get blkg associated with policy private data + * pd_to_blkg - get blkg associated with policy private data * @pd: policy private data of interest * * @pd is policy private data. Determine the blkg it's associated with. diff --git a/block/blk-core.c b/block/blk-core.c index 666efe8fa202..d6c4fa3943b5 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -430,7 +430,6 @@ struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id) refcount_set(&q->refs, 1); mutex_init(&q->debugfs_mutex); mutex_init(&q->sysfs_lock); - mutex_init(&q->sysfs_dir_lock); mutex_init(&q->limits_lock); mutex_init(&q->rq_qos_mutex); spin_lock_init(&q->queue_lock); @@ -629,8 +628,14 @@ static void __submit_bio(struct bio *bio) blk_mq_submit_bio(bio); } else if (likely(bio_queue_enter(bio) == 0)) { struct gendisk *disk = bio->bi_bdev->bd_disk; - - disk->fops->submit_bio(bio); + + if ((bio->bi_opf & REQ_POLLED) && + !(disk->queue->limits.features & BLK_FEAT_POLL)) { + bio->bi_status = BLK_STS_NOTSUPP; + bio_endio(bio); + } else { + disk->fops->submit_bio(bio); + } blk_queue_exit(disk->queue); } @@ -805,12 +810,6 @@ void submit_bio_noacct(struct bio *bio) } } - if (!(q->limits.features & BLK_FEAT_POLL) && - (bio->bi_opf & REQ_POLLED)) { - bio_clear_polled(bio); - goto not_supported; - } - switch (bio_op(bio)) { case REQ_OP_READ: break; @@ -935,7 +934,7 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags) return 0; q = bdev_get_queue(bdev); - if (cookie == BLK_QC_T_NONE || !(q->limits.features & BLK_FEAT_POLL)) + if (cookie == BLK_QC_T_NONE) return 0; blk_flush_plug(current->plug, false); @@ -956,7 +955,8 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags) } else { struct gendisk *disk = q->disk; - if (disk && disk->fops->poll_bio) + if ((q->limits.features & BLK_FEAT_POLL) && disk && + disk->fops->poll_bio) ret = disk->fops->poll_bio(bio, iob, flags); } blk_queue_exit(q); diff --git a/block/blk-ia-ranges.c b/block/blk-ia-ranges.c index c9eb4241e048..d479f5481b66 100644 --- a/block/blk-ia-ranges.c +++ b/block/blk-ia-ranges.c @@ -111,7 +111,6 @@ int disk_register_independent_access_ranges(struct gendisk *disk) struct request_queue *q = disk->queue; int i, ret; - lockdep_assert_held(&q->sysfs_dir_lock); lockdep_assert_held(&q->sysfs_lock); if (!iars) @@ -155,7 +154,6 @@ void 
disk_unregister_independent_access_ranges(struct gendisk *disk) struct blk_independent_access_ranges *iars = disk->ia_ranges; int i; - lockdep_assert_held(&q->sysfs_dir_lock); lockdep_assert_held(&q->sysfs_lock); if (!iars) @@ -289,7 +287,6 @@ void disk_set_independent_access_ranges(struct gendisk *disk, { struct request_queue *q = disk->queue; - mutex_lock(&q->sysfs_dir_lock); mutex_lock(&q->sysfs_lock); if (iars && !disk_check_ia_ranges(disk, iars)) { kfree(iars); @@ -313,6 +310,5 @@ void disk_set_independent_access_ranges(struct gendisk *disk, disk_register_independent_access_ranges(disk); unlock: mutex_unlock(&q->sysfs_lock); - mutex_unlock(&q->sysfs_dir_lock); } EXPORT_SYMBOL_GPL(disk_set_independent_access_ranges); diff --git a/block/blk-integrity.c b/block/blk-integrity.c index b180cac61a9d..a1678f0a9f81 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -115,8 +115,16 @@ EXPORT_SYMBOL(blk_rq_map_integrity_sg); int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf, ssize_t bytes) { - int ret = bio_integrity_map_user(rq->bio, ubuf, bytes); + int ret; + struct iov_iter iter; + unsigned int direction; + if (op_is_write(req_op(rq))) + direction = ITER_DEST; + else + direction = ITER_SOURCE; + iov_iter_ubuf(&iter, direction, ubuf, bytes); + ret = bio_integrity_map_user(rq->bio, &iter); if (ret) return ret; @@ -218,9 +226,7 @@ static ssize_t flag_store(struct device *dev, const char *page, size_t count, else lim.integrity.flags |= flag; - blk_mq_freeze_queue(q); - err = queue_limits_commit_update(q, &lim); - blk_mq_unfreeze_queue(q); + err = queue_limits_commit_update_frozen(q, &lim); if (err) return err; return count; diff --git a/block/blk-iocost.c b/block/blk-iocost.c index a5894ec9696e..65a1d4427ccf 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -3224,6 +3224,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, u32 qos[NR_QOS_PARAMS]; bool enable, user; char *body, *p; + unsigned int memflags; int ret; blkg_conf_init(&ctx, input); @@ -3247,7 +3248,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, ioc = q_to_ioc(disk->queue); } - blk_mq_freeze_queue(disk->queue); + memflags = blk_mq_freeze_queue(disk->queue); blk_mq_quiesce_queue(disk->queue); spin_lock_irq(&ioc->lock); @@ -3347,7 +3348,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, wbt_enable_default(disk); blk_mq_unquiesce_queue(disk->queue); - blk_mq_unfreeze_queue(disk->queue); + blk_mq_unfreeze_queue(disk->queue, memflags); blkg_conf_exit(&ctx); return nbytes; @@ -3355,7 +3356,7 @@ einval: spin_unlock_irq(&ioc->lock); blk_mq_unquiesce_queue(disk->queue); - blk_mq_unfreeze_queue(disk->queue); + blk_mq_unfreeze_queue(disk->queue, memflags); ret = -EINVAL; err: @@ -3414,6 +3415,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, { struct blkg_conf_ctx ctx; struct request_queue *q; + unsigned int memflags; struct ioc *ioc; u64 u[NR_I_LCOEFS]; bool user; @@ -3441,7 +3443,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, ioc = q_to_ioc(q); } - blk_mq_freeze_queue(q); + memflags = blk_mq_freeze_queue(q); blk_mq_quiesce_queue(q); spin_lock_irq(&ioc->lock); @@ -3493,7 +3495,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, spin_unlock_irq(&ioc->lock); blk_mq_unquiesce_queue(q); - blk_mq_unfreeze_queue(q); + blk_mq_unfreeze_queue(q, memflags); blkg_conf_exit(&ctx); return nbytes; @@ -3502,7 +3504,7 @@ einval: 
spin_unlock_irq(&ioc->lock); blk_mq_unquiesce_queue(q); - blk_mq_unfreeze_queue(q); + blk_mq_unfreeze_queue(q, memflags); ret = -EINVAL; err: diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index ebb522788d97..42c1e0b9a68f 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -749,9 +749,11 @@ static void blkiolatency_enable_work_fn(struct work_struct *work) */ enabled = atomic_read(&blkiolat->enable_cnt); if (enabled != blkiolat->enabled) { - blk_mq_freeze_queue(blkiolat->rqos.disk->queue); + unsigned int memflags; + + memflags = blk_mq_freeze_queue(blkiolat->rqos.disk->queue); blkiolat->enabled = enabled; - blk_mq_unfreeze_queue(blkiolat->rqos.disk->queue); + blk_mq_unfreeze_queue(blkiolat->rqos.disk->queue, memflags); } } diff --git a/block/blk-map.c b/block/blk-map.c index 894009b2d881..d2f22744b3d1 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -189,7 +189,7 @@ static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data, } } - if (bio_add_pc_page(rq->q, bio, page, bytes, offset) < bytes) { + if (bio_add_page(bio, page, bytes, offset) < bytes) { if (!map_data) __free_page(page); break; @@ -272,86 +272,27 @@ static struct bio *blk_rq_map_bio_alloc(struct request *rq, static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, gfp_t gfp_mask) { - iov_iter_extraction_t extraction_flags = 0; - unsigned int max_sectors = queue_max_hw_sectors(rq->q); unsigned int nr_vecs = iov_iter_npages(iter, BIO_MAX_VECS); struct bio *bio; int ret; - int j; if (!iov_iter_count(iter)) return -EINVAL; bio = blk_rq_map_bio_alloc(rq, nr_vecs, gfp_mask); - if (bio == NULL) + if (!bio) return -ENOMEM; - - if (blk_queue_pci_p2pdma(rq->q)) - extraction_flags |= ITER_ALLOW_P2PDMA; - if (iov_iter_extract_will_pin(iter)) - bio_set_flag(bio, BIO_PAGE_PINNED); - - while (iov_iter_count(iter)) { - struct page *stack_pages[UIO_FASTIOV]; - struct page **pages = stack_pages; - ssize_t bytes; - size_t offs; - int npages; - - if (nr_vecs > ARRAY_SIZE(stack_pages)) - pages = NULL; - - bytes = iov_iter_extract_pages(iter, &pages, LONG_MAX, - nr_vecs, extraction_flags, &offs); - if (unlikely(bytes <= 0)) { - ret = bytes ? bytes : -EFAULT; - goto out_unmap; - } - - npages = DIV_ROUND_UP(offs + bytes, PAGE_SIZE); - - if (unlikely(offs & queue_dma_alignment(rq->q))) - j = 0; - else { - for (j = 0; j < npages; j++) { - struct page *page = pages[j]; - unsigned int n = PAGE_SIZE - offs; - bool same_page = false; - - if (n > bytes) - n = bytes; - - if (!bio_add_hw_page(rq->q, bio, page, n, offs, - max_sectors, &same_page)) - break; - - if (same_page) - bio_release_page(bio, page); - bytes -= n; - offs = 0; - } - } - /* - * release the pages we didn't map into the bio, if any - */ - while (j < npages) - bio_release_page(bio, pages[j++]); - if (pages != stack_pages) - kvfree(pages); - /* couldn't stuff something into bio? 
*/ - if (bytes) { - iov_iter_revert(iter, bytes); - break; - } - } - + ret = bio_iov_iter_get_pages(bio, iter); + if (ret) + goto out_put; ret = blk_rq_append_bio(rq, bio); if (ret) - goto out_unmap; + goto out_release; return 0; - out_unmap: +out_release: bio_release_pages(bio, false); +out_put: blk_mq_map_bio_put(bio); return ret; } @@ -422,8 +363,7 @@ static struct bio *bio_map_kern(struct request_queue *q, void *data, page = virt_to_page(data); else page = vmalloc_to_page(data); - if (bio_add_pc_page(q, bio, page, bytes, - offset) < bytes) { + if (bio_add_page(bio, page, bytes, offset) < bytes) { /* we don't support partial mappings */ bio_uninit(bio); kfree(bio); @@ -507,7 +447,7 @@ static struct bio *bio_copy_kern(struct request_queue *q, void *data, if (!reading) memcpy(page_address(page), p, bytes); - if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes) + if (bio_add_page(bio, page, bytes, 0) < bytes) break; len -= bytes; @@ -536,24 +476,33 @@ cleanup: */ int blk_rq_append_bio(struct request *rq, struct bio *bio) { - struct bvec_iter iter; - struct bio_vec bv; + const struct queue_limits *lim = &rq->q->limits; + unsigned int max_bytes = lim->max_hw_sectors << SECTOR_SHIFT; unsigned int nr_segs = 0; + int ret; - bio_for_each_bvec(bv, bio, iter) - nr_segs++; + /* check that the data layout matches the hardware restrictions */ + ret = bio_split_rw_at(bio, lim, &nr_segs, max_bytes); + if (ret) { + /* if we would have to split the bio, copy instead */ + if (ret > 0) + ret = -EREMOTEIO; + return ret; + } - if (!rq->bio) { - blk_rq_bio_prep(rq, bio, nr_segs); - } else { + if (rq->bio) { if (!ll_back_merge_fn(rq, bio, nr_segs)) return -EINVAL; rq->biotail->bi_next = bio; rq->biotail = bio; - rq->__data_len += (bio)->bi_iter.bi_size; + rq->__data_len += bio->bi_iter.bi_size; bio_crypt_free_ctx(bio); + return 0; } + rq->nr_phys_segments = nr_segs; + rq->bio = rq->biotail = bio; + rq->__data_len = bio->bi_iter.bi_size; return 0; } EXPORT_SYMBOL(blk_rq_append_bio); @@ -561,9 +510,7 @@ EXPORT_SYMBOL(blk_rq_append_bio); /* Prepare bio for passthrough IO given ITER_BVEC iter */ static int blk_rq_map_user_bvec(struct request *rq, const struct iov_iter *iter) { - const struct queue_limits *lim = &rq->q->limits; - unsigned int max_bytes = lim->max_hw_sectors << SECTOR_SHIFT; - unsigned int nsegs; + unsigned int max_bytes = rq->q->limits.max_hw_sectors << SECTOR_SHIFT; struct bio *bio; int ret; @@ -576,18 +523,10 @@ static int blk_rq_map_user_bvec(struct request *rq, const struct iov_iter *iter) return -ENOMEM; bio_iov_bvec_set(bio, iter); - /* check that the data layout matches the hardware restrictions */ - ret = bio_split_rw_at(bio, lim, &nsegs, max_bytes); - if (ret) { - /* if we would have to split the bio, copy instead */ - if (ret > 0) - ret = -EREMOTEIO; + ret = blk_rq_append_bio(rq, bio); + if (ret) blk_mq_map_bio_put(bio); - return ret; - } - - blk_rq_bio_prep(rq, bio, nsegs); - return 0; + return ret; } /** @@ -644,8 +583,11 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, ret = bio_copy_user_iov(rq, map_data, &i, gfp_mask); else ret = bio_map_user_iov(rq, &i, gfp_mask); - if (ret) + if (ret) { + if (ret == -EREMOTEIO) + ret = -EINVAL; goto unmap_rq; + } if (!bio) bio = rq->bio; } while (iov_iter_count(&i)); diff --git a/block/blk-merge.c b/block/blk-merge.c index e01383c6e534..1d1589c35297 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -270,7 +270,7 @@ static bool bvec_split_segs(const struct queue_limits *lim, const struct bio_vec *bv, unsigned *nsegs, 
unsigned *bytes, unsigned max_segs, unsigned max_bytes) { - unsigned max_len = min(max_bytes, UINT_MAX) - *bytes; + unsigned max_len = max_bytes - *bytes; unsigned len = min(bv->bv_len, max_len); unsigned total_len = 0; unsigned seg_size = 0; @@ -329,7 +329,7 @@ int bio_split_rw_at(struct bio *bio, const struct queue_limits *lim, if (nsegs < lim->max_segments && bytes + bv.bv_len <= max_bytes && - bv.bv_offset + bv.bv_len <= PAGE_SIZE) { + bv.bv_offset + bv.bv_len <= lim->min_segment_size) { nsegs++; bytes += bv.bv_len; } else { @@ -473,137 +473,103 @@ unsigned int blk_recalc_rq_segments(struct request *rq) return nr_phys_segs; } -static inline struct scatterlist *blk_next_sg(struct scatterlist **sg, - struct scatterlist *sglist) -{ - if (!*sg) - return sglist; +struct phys_vec { + phys_addr_t paddr; + u32 len; +}; - /* - * If the driver previously mapped a shorter list, we could see a - * termination bit prematurely unless it fully inits the sg table - * on each mapping. We KNOW that there must be more entries here - * or the driver would be buggy, so force clear the termination bit - * to avoid doing a full sg_init_table() in drivers for each command. - */ - sg_unmark_end(*sg); - return sg_next(*sg); -} - -static unsigned blk_bvec_map_sg(struct request_queue *q, - struct bio_vec *bvec, struct scatterlist *sglist, - struct scatterlist **sg) +static bool blk_map_iter_next(struct request *req, + struct req_iterator *iter, struct phys_vec *vec) { - unsigned nbytes = bvec->bv_len; - unsigned nsegs = 0, total = 0; - - while (nbytes > 0) { - unsigned offset = bvec->bv_offset + total; - unsigned len = get_max_segment_size(&q->limits, - bvec_phys(bvec) + total, nbytes); - struct page *page = bvec->bv_page; - - /* - * Unfortunately a fair number of drivers barf on scatterlists - * that have an offset larger than PAGE_SIZE, despite other - * subsystems dealing with that invariant just fine. For now - * stick to the legacy format where we never present those from - * the block layer, but the code below should be removed once - * these offenders (mostly MMC/SD drivers) are fixed. 
- */ - page += (offset >> PAGE_SHIFT); - offset &= ~PAGE_MASK; - - *sg = blk_next_sg(sg, sglist); - sg_set_page(*sg, page, len, offset); + unsigned int max_size; + struct bio_vec bv; - total += len; - nbytes -= len; - nsegs++; + if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { + if (!iter->bio) + return false; + vec->paddr = bvec_phys(&req->special_vec); + vec->len = req->special_vec.bv_len; + iter->bio = NULL; + return true; } - return nsegs; -} - -static inline int __blk_bvec_map_sg(struct bio_vec bv, - struct scatterlist *sglist, struct scatterlist **sg) -{ - *sg = blk_next_sg(sg, sglist); - sg_set_page(*sg, bv.bv_page, bv.bv_len, bv.bv_offset); - return 1; -} - -/* only try to merge bvecs into one sg if they are from two bios */ -static inline bool -__blk_segment_map_sg_merge(struct request_queue *q, struct bio_vec *bvec, - struct bio_vec *bvprv, struct scatterlist **sg) -{ - - int nbytes = bvec->bv_len; - - if (!*sg) + if (!iter->iter.bi_size) return false; - if ((*sg)->length + nbytes > queue_max_segment_size(q)) - return false; + bv = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter); + vec->paddr = bvec_phys(&bv); + max_size = get_max_segment_size(&req->q->limits, vec->paddr, UINT_MAX); + bv.bv_len = min(bv.bv_len, max_size); + bio_advance_iter_single(iter->bio, &iter->iter, bv.bv_len); - if (!biovec_phys_mergeable(q, bvprv, bvec)) - return false; + /* + * If we are entirely done with this bi_io_vec entry, check if the next + * one could be merged into it. This typically happens when moving to + * the next bio, but some callers also don't pack bvecs tight. + */ + while (!iter->iter.bi_size || !iter->iter.bi_bvec_done) { + struct bio_vec next; + + if (!iter->iter.bi_size) { + if (!iter->bio->bi_next) + break; + iter->bio = iter->bio->bi_next; + iter->iter = iter->bio->bi_iter; + } - (*sg)->length += nbytes; + next = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter); + if (bv.bv_len + next.bv_len > max_size || + !biovec_phys_mergeable(req->q, &bv, &next)) + break; + bv.bv_len += next.bv_len; + bio_advance_iter_single(iter->bio, &iter->iter, next.bv_len); + } + + vec->len = bv.bv_len; return true; } -static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio, - struct scatterlist *sglist, - struct scatterlist **sg) +static inline struct scatterlist *blk_next_sg(struct scatterlist **sg, + struct scatterlist *sglist) { - struct bio_vec bvec, bvprv = { NULL }; - struct bvec_iter iter; - int nsegs = 0; - bool new_bio = false; - - for_each_bio(bio) { - bio_for_each_bvec(bvec, bio, iter) { - /* - * Only try to merge bvecs from two bios given we - * have done bio internal merge when adding pages - * to bio - */ - if (new_bio && - __blk_segment_map_sg_merge(q, &bvec, &bvprv, sg)) - goto next_bvec; - - if (bvec.bv_offset + bvec.bv_len <= PAGE_SIZE) - nsegs += __blk_bvec_map_sg(bvec, sglist, sg); - else - nsegs += blk_bvec_map_sg(q, &bvec, sglist, sg); - next_bvec: - new_bio = false; - } - if (likely(bio->bi_iter.bi_size)) { - bvprv = bvec; - new_bio = true; - } - } + if (!*sg) + return sglist; - return nsegs; + /* + * If the driver previously mapped a shorter list, we could see a + * termination bit prematurely unless it fully inits the sg table + * on each mapping. We KNOW that there must be more entries here + * or the driver would be buggy, so force clear the termination bit + * to avoid doing a full sg_init_table() in drivers for each command. + */ + sg_unmark_end(*sg); + return sg_next(*sg); } /* - * map a request to scatterlist, return number of sg entries setup. 
Caller - * must make sure sg can hold rq->nr_phys_segments entries + * Map a request to scatterlist, return number of sg entries setup. Caller + * must make sure sg can hold rq->nr_phys_segments entries. */ int __blk_rq_map_sg(struct request_queue *q, struct request *rq, struct scatterlist *sglist, struct scatterlist **last_sg) { + struct req_iterator iter = { + .bio = rq->bio, + }; + struct phys_vec vec; int nsegs = 0; - if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) - nsegs = __blk_bvec_map_sg(rq->special_vec, sglist, last_sg); - else if (rq->bio) - nsegs = __blk_bios_map_sg(q, rq->bio, sglist, last_sg); + /* the internal flush request may not have bio attached */ + if (iter.bio) + iter.iter = iter.bio->bi_iter; + + while (blk_map_iter_next(rq, &iter, &vec)) { + *last_sg = blk_next_sg(last_sg, sglist); + sg_set_page(*last_sg, phys_to_page(vec.paddr), vec.len, + offset_in_page(vec.paddr)); + nsegs++; + } if (*last_sg) sg_mark_end(*last_sg); diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c index 9638b25fd521..444798c5374f 100644 --- a/block/blk-mq-cpumap.c +++ b/block/blk-mq-cpumap.c @@ -11,6 +11,7 @@ #include <linux/smp.h> #include <linux/cpu.h> #include <linux/group_cpus.h> +#include <linux/device/bus.h> #include "blk.h" #include "blk-mq.h" @@ -54,3 +55,38 @@ int blk_mq_hw_queue_to_node(struct blk_mq_queue_map *qmap, unsigned int index) return NUMA_NO_NODE; } + +/** + * blk_mq_map_hw_queues - Create CPU to hardware queue mapping + * @qmap: CPU to hardware queue map + * @dev: The device to map queues + * @offset: Queue offset to use for the device + * + * Create a CPU to hardware queue mapping in @qmap. The struct bus_type + * irq_get_affinity callback will be used to retrieve the affinity. + */ +void blk_mq_map_hw_queues(struct blk_mq_queue_map *qmap, + struct device *dev, unsigned int offset) + +{ + const struct cpumask *mask; + unsigned int queue, cpu; + + if (!dev->bus->irq_get_affinity) + goto fallback; + + for (queue = 0; queue < qmap->nr_queues; queue++) { + mask = dev->bus->irq_get_affinity(dev, queue + offset); + if (!mask) + goto fallback; + + for_each_cpu(cpu, mask) + qmap->mq_map[cpu] = qmap->queue_offset + queue; + } + + return; + +fallback: + blk_mq_map_queues(qmap); +} +EXPORT_SYMBOL_GPL(blk_mq_map_hw_queues); diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 5463697a8442..adf5f0697b6b 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -172,21 +172,13 @@ static int hctx_state_show(void *data, struct seq_file *m) return 0; } -#define BLK_TAG_ALLOC_NAME(name) [BLK_TAG_ALLOC_##name] = #name -static const char *const alloc_policy_name[] = { - BLK_TAG_ALLOC_NAME(FIFO), - BLK_TAG_ALLOC_NAME(RR), -}; -#undef BLK_TAG_ALLOC_NAME - #define HCTX_FLAG_NAME(name) [ilog2(BLK_MQ_F_##name)] = #name static const char *const hctx_flag_name[] = { - HCTX_FLAG_NAME(SHOULD_MERGE), HCTX_FLAG_NAME(TAG_QUEUE_SHARED), HCTX_FLAG_NAME(STACKING), HCTX_FLAG_NAME(TAG_HCTX_SHARED), HCTX_FLAG_NAME(BLOCKING), - HCTX_FLAG_NAME(NO_SCHED), + HCTX_FLAG_NAME(TAG_RR), HCTX_FLAG_NAME(NO_SCHED_BY_DEFAULT), }; #undef HCTX_FLAG_NAME @@ -194,22 +186,11 @@ static const char *const hctx_flag_name[] = { static int hctx_flags_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; - const int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(hctx->flags); - BUILD_BUG_ON(ARRAY_SIZE(hctx_flag_name) != - BLK_MQ_F_ALLOC_POLICY_START_BIT); - BUILD_BUG_ON(ARRAY_SIZE(alloc_policy_name) != BLK_TAG_ALLOC_MAX); + BUILD_BUG_ON(ARRAY_SIZE(hctx_flag_name) != ilog2(BLK_MQ_F_MAX)); - 
seq_puts(m, "alloc_policy="); - if (alloc_policy < ARRAY_SIZE(alloc_policy_name) && - alloc_policy_name[alloc_policy]) - seq_puts(m, alloc_policy_name[alloc_policy]); - else - seq_printf(m, "%d", alloc_policy); - seq_puts(m, " "); - blk_flags_show(m, - hctx->flags ^ BLK_ALLOC_POLICY_TO_MQ_FLAG(alloc_policy), - hctx_flag_name, ARRAY_SIZE(hctx_flag_name)); + blk_flags_show(m, hctx->flags, hctx_flag_name, + ARRAY_SIZE(hctx_flag_name)); seq_puts(m, "\n"); return 0; } diff --git a/block/blk-mq-pci.c b/block/blk-mq-pci.c deleted file mode 100644 index d47b5c73c9eb..000000000000 --- a/block/blk-mq-pci.c +++ /dev/null @@ -1,46 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (c) 2016 Christoph Hellwig. - */ -#include <linux/kobject.h> -#include <linux/blkdev.h> -#include <linux/blk-mq-pci.h> -#include <linux/pci.h> -#include <linux/module.h> - -#include "blk-mq.h" - -/** - * blk_mq_pci_map_queues - provide a default queue mapping for PCI device - * @qmap: CPU to hardware queue map. - * @pdev: PCI device associated with @set. - * @offset: Offset to use for the pci irq vector - * - * This function assumes the PCI device @pdev has at least as many available - * interrupt vectors as @set has queues. It will then query the vector - * corresponding to each queue for it's affinity mask and built queue mapping - * that maps a queue to the CPUs that have irq affinity for the corresponding - * vector. - */ -void blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev, - int offset) -{ - const struct cpumask *mask; - unsigned int queue, cpu; - - for (queue = 0; queue < qmap->nr_queues; queue++) { - mask = pci_irq_get_affinity(pdev, queue + offset); - if (!mask) - goto fallback; - - for_each_cpu(cpu, mask) - qmap->mq_map[cpu] = qmap->queue_offset + queue; - } - - return; - -fallback: - WARN_ON_ONCE(qmap->nr_queues > 1); - blk_mq_clear_mq_map(qmap); -} -EXPORT_SYMBOL_GPL(blk_mq_pci_map_queues); diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 451a2c1f1f32..7442ca27c2bf 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -351,8 +351,7 @@ bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, ctx = blk_mq_get_ctx(q); hctx = blk_mq_map_queue(q, bio->bi_opf, ctx); type = hctx->type; - if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE) || - list_empty_careful(&ctx->rq_lists[type])) + if (list_empty_careful(&ctx->rq_lists[type])) goto out_put; /* default per sw-queue merge */ diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 156e9bb07abf..3feeeccf8a99 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -223,30 +223,27 @@ int blk_mq_sysfs_register(struct gendisk *disk) unsigned long i, j; int ret; - lockdep_assert_held(&q->sysfs_dir_lock); - ret = kobject_add(q->mq_kobj, &disk_to_dev(disk)->kobj, "mq"); if (ret < 0) - goto out; + return ret; kobject_uevent(q->mq_kobj, KOBJ_ADD); + mutex_lock(&q->tag_set->tag_list_lock); queue_for_each_hw_ctx(q, hctx, i) { ret = blk_mq_register_hctx(hctx); if (ret) - goto unreg; + goto out_unreg; } + mutex_unlock(&q->tag_set->tag_list_lock); + return 0; - q->mq_sysfs_init_done = true; - -out: - return ret; - -unreg: +out_unreg: queue_for_each_hw_ctx(q, hctx, j) { if (j < i) blk_mq_unregister_hctx(hctx); } + mutex_unlock(&q->tag_set->tag_list_lock); kobject_uevent(q->mq_kobj, KOBJ_REMOVE); kobject_del(q->mq_kobj); @@ -259,15 +256,13 @@ void blk_mq_sysfs_unregister(struct gendisk *disk) struct blk_mq_hw_ctx *hctx; unsigned long i; - lockdep_assert_held(&q->sysfs_dir_lock); - + 
mutex_lock(&q->tag_set->tag_list_lock); queue_for_each_hw_ctx(q, hctx, i) blk_mq_unregister_hctx(hctx); + mutex_unlock(&q->tag_set->tag_list_lock); kobject_uevent(q->mq_kobj, KOBJ_REMOVE); kobject_del(q->mq_kobj); - - q->mq_sysfs_init_done = false; } void blk_mq_sysfs_unregister_hctxs(struct request_queue *q) @@ -275,15 +270,11 @@ void blk_mq_sysfs_unregister_hctxs(struct request_queue *q) struct blk_mq_hw_ctx *hctx; unsigned long i; - mutex_lock(&q->sysfs_dir_lock); - if (!q->mq_sysfs_init_done) - goto unlock; + if (!blk_queue_registered(q)) + return; queue_for_each_hw_ctx(q, hctx, i) blk_mq_unregister_hctx(hctx); - -unlock: - mutex_unlock(&q->sysfs_dir_lock); } int blk_mq_sysfs_register_hctxs(struct request_queue *q) @@ -292,9 +283,8 @@ int blk_mq_sysfs_register_hctxs(struct request_queue *q) unsigned long i; int ret = 0; - mutex_lock(&q->sysfs_dir_lock); - if (!q->mq_sysfs_init_done) - goto unlock; + if (!blk_queue_registered(q)) + goto out; queue_for_each_hw_ctx(q, hctx, i) { ret = blk_mq_register_hctx(hctx); @@ -302,8 +292,6 @@ int blk_mq_sysfs_register_hctxs(struct request_queue *q) break; } -unlock: - mutex_unlock(&q->sysfs_dir_lock); - +out: return ret; } diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 2cafcf11ee8b..b9f417d980b4 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -544,30 +544,11 @@ static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth, node); } -int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags, - struct sbitmap_queue *breserved_tags, - unsigned int queue_depth, unsigned int reserved, - int node, int alloc_policy) -{ - unsigned int depth = queue_depth - reserved; - bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR; - - if (bt_alloc(bitmap_tags, depth, round_robin, node)) - return -ENOMEM; - if (bt_alloc(breserved_tags, reserved, round_robin, node)) - goto free_bitmap_tags; - - return 0; - -free_bitmap_tags: - sbitmap_queue_free(bitmap_tags); - return -ENOMEM; -} - struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, - unsigned int reserved_tags, - int node, int alloc_policy) + unsigned int reserved_tags, unsigned int flags, int node) { + unsigned int depth = total_tags - reserved_tags; + bool round_robin = flags & BLK_MQ_F_TAG_RR; struct blk_mq_tags *tags; if (total_tags > BLK_MQ_TAG_MAX) { @@ -582,14 +563,18 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, tags->nr_tags = total_tags; tags->nr_reserved_tags = reserved_tags; spin_lock_init(&tags->lock); + if (bt_alloc(&tags->bitmap_tags, depth, round_robin, node)) + goto out_free_tags; + if (bt_alloc(&tags->breserved_tags, reserved_tags, round_robin, node)) + goto out_free_bitmap_tags; - if (blk_mq_init_bitmaps(&tags->bitmap_tags, &tags->breserved_tags, - total_tags, reserved_tags, node, - alloc_policy) < 0) { - kfree(tags); - return NULL; - } return tags; + +out_free_bitmap_tags: + sbitmap_queue_free(&tags->bitmap_tags); +out_free_tags: + kfree(tags); + return NULL; } void blk_mq_free_tags(struct blk_mq_tags *tags) diff --git a/block/blk-mq-virtio.c b/block/blk-mq-virtio.c deleted file mode 100644 index 68d0945c0b08..000000000000 --- a/block/blk-mq-virtio.c +++ /dev/null @@ -1,46 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (c) 2016 Christoph Hellwig. - */ -#include <linux/device.h> -#include <linux/blk-mq-virtio.h> -#include <linux/virtio_config.h> -#include <linux/module.h> -#include "blk-mq.h" - -/** - * blk_mq_virtio_map_queues - provide a default queue mapping for virtio device - * @qmap: CPU to hardware queue map. 
- * @vdev: virtio device to provide a mapping for. - * @first_vec: first interrupt vectors to use for queues (usually 0) - * - * This function assumes the virtio device @vdev has at least as many available - * interrupt vectors as @set has queues. It will then query the vector - * corresponding to each queue for it's affinity mask and built queue mapping - * that maps a queue to the CPUs that have irq affinity for the corresponding - * vector. - */ -void blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap, - struct virtio_device *vdev, int first_vec) -{ - const struct cpumask *mask; - unsigned int queue, cpu; - - if (!vdev->config->get_vq_affinity) - goto fallback; - - for (queue = 0; queue < qmap->nr_queues; queue++) { - mask = vdev->config->get_vq_affinity(vdev, first_vec + queue); - if (!mask) - goto fallback; - - for_each_cpu(cpu, mask) - qmap->mq_map[cpu] = qmap->queue_offset + queue; - } - - return; - -fallback: - blk_mq_map_queues(qmap); -} -EXPORT_SYMBOL_GPL(blk_mq_virtio_map_queues); diff --git a/block/blk-mq.c b/block/blk-mq.c index 8ac19d4ae3c0..40490ac88045 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -131,6 +131,10 @@ static bool blk_freeze_set_owner(struct request_queue *q, if (!q->mq_freeze_depth) { q->mq_freeze_owner = owner; q->mq_freeze_owner_depth = 1; + q->mq_freeze_disk_dead = !q->disk || + test_bit(GD_DEAD, &q->disk->state) || + !blk_queue_registered(q); + q->mq_freeze_queue_dying = blk_queue_dying(q); return true; } @@ -142,8 +146,6 @@ static bool blk_freeze_set_owner(struct request_queue *q, /* verify the last unfreeze in owner context */ static bool blk_unfreeze_check_owner(struct request_queue *q) { - if (!q->mq_freeze_owner) - return false; if (q->mq_freeze_owner != current) return false; if (--q->mq_freeze_owner_depth == 0) { @@ -189,7 +191,7 @@ bool __blk_freeze_queue_start(struct request_queue *q, void blk_freeze_queue_start(struct request_queue *q) { if (__blk_freeze_queue_start(q, current)) - blk_freeze_acquire_lock(q, false, false); + blk_freeze_acquire_lock(q); } EXPORT_SYMBOL_GPL(blk_freeze_queue_start); @@ -208,12 +210,12 @@ int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, } EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout); -void blk_mq_freeze_queue(struct request_queue *q) +void blk_mq_freeze_queue_nomemsave(struct request_queue *q) { blk_freeze_queue_start(q); blk_mq_freeze_queue_wait(q); } -EXPORT_SYMBOL_GPL(blk_mq_freeze_queue); +EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_nomemsave); bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic) { @@ -234,12 +236,12 @@ bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic) return unfreeze; } -void blk_mq_unfreeze_queue(struct request_queue *q) +void blk_mq_unfreeze_queue_nomemrestore(struct request_queue *q) { if (__blk_mq_unfreeze_queue(q, false)) - blk_unfreeze_release_lock(q, false, false); + blk_unfreeze_release_lock(q); } -EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); +EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue_nomemrestore); /* * non_owner variant of blk_freeze_queue_start @@ -2656,8 +2658,10 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio, if (bio->bi_opf & REQ_RAHEAD) rq->cmd_flags |= REQ_FAILFAST_MASK; + rq->bio = rq->biotail = bio; rq->__sector = bio->bi_iter.bi_sector; - blk_rq_bio_prep(rq, bio, nr_segs); + rq->__data_len = bio->bi_iter.bi_size; + rq->nr_phys_segments = nr_segs; if (bio_integrity(bio)) rq->nr_integrity_segments = blk_rq_count_integrity_sg(rq->q, bio); @@ -2980,12 +2984,9 @@ static struct request 
*blk_mq_get_new_requests(struct request_queue *q, } rq = __blk_mq_alloc_requests(&data); - if (rq) - return rq; - rq_qos_cleanup(q, bio); - if (bio->bi_opf & REQ_NOWAIT) - bio_wouldblock_error(bio); - return NULL; + if (unlikely(!rq)) + rq_qos_cleanup(q, bio); + return rq; } /* @@ -3092,14 +3093,21 @@ void blk_mq_submit_bio(struct bio *bio) } /* - * Device reconfiguration may change logical block size, so alignment - * check has to be done with queue usage counter held + * Device reconfiguration may change logical block size or reduce the + * number of poll queues, so the checks for alignment and poll support + * have to be done with queue usage counter held. */ if (unlikely(bio_unaligned(bio, q))) { bio_io_error(bio); goto queue_exit; } + if ((bio->bi_opf & REQ_POLLED) && !blk_mq_can_poll(q)) { + bio->bi_status = BLK_STS_NOTSUPP; + bio_endio(bio); + goto queue_exit; + } + bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); if (!bio) goto queue_exit; @@ -3114,12 +3122,15 @@ void blk_mq_submit_bio(struct bio *bio) goto queue_exit; new_request: - if (!rq) { + if (rq) { + blk_mq_use_cached_rq(rq, plug, bio); + } else { rq = blk_mq_get_new_requests(q, plug, bio, nr_segs); - if (unlikely(!rq)) + if (unlikely(!rq)) { + if (bio->bi_opf & REQ_NOWAIT) + bio_wouldblock_error(bio); goto queue_exit; - } else { - blk_mq_use_cached_rq(rq, plug, bio); + } } trace_block_getrq(bio); @@ -3472,8 +3483,7 @@ static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, if (node == NUMA_NO_NODE) node = set->numa_node; - tags = blk_mq_init_tags(nr_tags, reserved_tags, node, - BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags)); + tags = blk_mq_init_tags(nr_tags, reserved_tags, set->flags, node); if (!tags) return NULL; @@ -4213,13 +4223,14 @@ static void blk_mq_update_tag_set_shared(struct blk_mq_tag_set *set, bool shared) { struct request_queue *q; + unsigned int memflags; lockdep_assert_held(&set->tag_list_lock); list_for_each_entry(q, &set->tag_list, tag_set_list) { - blk_mq_freeze_queue(q); + memflags = blk_mq_freeze_queue(q); queue_set_hctx_shared(q, shared); - blk_mq_unfreeze_queue(q); + blk_mq_unfreeze_queue(q, memflags); } } @@ -4317,12 +4328,6 @@ void blk_mq_release(struct request_queue *q) blk_mq_sysfs_deinit(q); } -static bool blk_mq_can_poll(struct blk_mq_tag_set *set) -{ - return set->nr_maps > HCTX_TYPE_POLL && - set->map[HCTX_TYPE_POLL].nr_queues; -} - struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set, struct queue_limits *lim, void *queuedata) { @@ -4333,7 +4338,7 @@ struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set, if (!lim) lim = &default_lim; lim->features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT; - if (blk_mq_can_poll(set)) + if (set->nr_maps > HCTX_TYPE_POLL) lim->features |= BLK_FEAT_POLL; q = blk_alloc_queue(lim, set->numa_node); @@ -4988,6 +4993,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, struct request_queue *q; LIST_HEAD(head); int prev_nr_hw_queues = set->nr_hw_queues; + unsigned int memflags; int i; lockdep_assert_held(&set->tag_list_lock); @@ -4999,8 +5005,10 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues) return; + memflags = memalloc_noio_save(); list_for_each_entry(q, &set->tag_list, tag_set_list) - blk_mq_freeze_queue(q); + blk_mq_freeze_queue_nomemsave(q); + /* * Switch IO scheduler to 'none', cleaning up the data associated * with the previous scheduler. 
We will switch back once we are done @@ -5021,8 +5029,6 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, fallback: blk_mq_update_queue_map(set); list_for_each_entry(q, &set->tag_list, tag_set_list) { - struct queue_limits lim; - blk_mq_realloc_hw_ctxs(set, q); if (q->nr_hw_queues != set->nr_hw_queues) { @@ -5036,13 +5042,6 @@ fallback: set->nr_hw_queues = prev_nr_hw_queues; goto fallback; } - lim = queue_limits_start_update(q); - if (blk_mq_can_poll(set)) - lim.features |= BLK_FEAT_POLL; - else - lim.features &= ~BLK_FEAT_POLL; - if (queue_limits_commit_update(q, &lim) < 0) - pr_warn("updating the poll flag failed\n"); blk_mq_map_swqueue(q); } @@ -5057,7 +5056,8 @@ switch_back: blk_mq_elv_switch_back(&head, q); list_for_each_entry(q, &set->tag_list, tag_set_list) - blk_mq_unfreeze_queue(q); + blk_mq_unfreeze_queue_nomemrestore(q); + memalloc_noio_restore(memflags); /* Free the excess tags when nr_hw_queues shrink. */ for (i = set->nr_hw_queues; i < prev_nr_hw_queues; i++) @@ -5102,9 +5102,9 @@ static int blk_hctx_poll(struct request_queue *q, struct blk_mq_hw_ctx *hctx, int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob, unsigned int flags) { - struct blk_mq_hw_ctx *hctx = xa_load(&q->hctx_table, cookie); - - return blk_hctx_poll(q, hctx, iob, flags); + if (!blk_mq_can_poll(q)) + return 0; + return blk_hctx_poll(q, xa_load(&q->hctx_table, cookie), iob, flags); } int blk_rq_poll(struct request *rq, struct io_comp_batch *iob, diff --git a/block/blk-mq.h b/block/blk-mq.h index 89a20fffa4b1..44979e92b79f 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -163,11 +163,8 @@ struct blk_mq_alloc_data { }; struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, - unsigned int reserved_tags, int node, int alloc_policy); + unsigned int reserved_tags, unsigned int flags, int node); void blk_mq_free_tags(struct blk_mq_tags *tags); -int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags, - struct sbitmap_queue *breserved_tags, unsigned int queue_depth, - unsigned int reserved, int node, int alloc_policy); unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags, @@ -451,4 +448,10 @@ do { \ #define blk_mq_run_dispatch_ops(q, dispatch_ops) \ __blk_mq_run_dispatch_ops(q, true, dispatch_ops) \ +static inline bool blk_mq_can_poll(struct request_queue *q) +{ + return (q->limits.features & BLK_FEAT_POLL) && + q->tag_set->map[HCTX_TYPE_POLL].nr_queues; +} + #endif diff --git a/block/blk-pm.c b/block/blk-pm.c index 42e842074715..8d3e052f91da 100644 --- a/block/blk-pm.c +++ b/block/blk-pm.c @@ -89,7 +89,7 @@ int blk_pre_runtime_suspend(struct request_queue *q) if (percpu_ref_is_zero(&q->q_usage_counter)) ret = 0; /* Switch q_usage_counter back to per-cpu mode. */ - blk_mq_unfreeze_queue(q); + blk_mq_unfreeze_queue_nomemrestore(q); if (ret < 0) { spin_lock_irq(&q->queue_lock); diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index eb9618cd68ad..d4d4f4dc0e23 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -299,6 +299,7 @@ int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id, const struct rq_qos_ops *ops) { struct request_queue *q = disk->queue; + unsigned int memflags; lockdep_assert_held(&q->rq_qos_mutex); @@ -310,14 +311,14 @@ int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id, * No IO can be in-flight when adding rqos, so freeze queue, which * is fine since we only support rq_qos for blk-mq queue. 
*/ - blk_mq_freeze_queue(q); + memflags = blk_mq_freeze_queue(q); if (rq_qos_id(q, rqos->id)) goto ebusy; rqos->next = q->rq_qos; q->rq_qos = rqos; - blk_mq_unfreeze_queue(q); + blk_mq_unfreeze_queue(q, memflags); if (rqos->ops->debugfs_attrs) { mutex_lock(&q->debugfs_mutex); @@ -327,7 +328,7 @@ int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id, return 0; ebusy: - blk_mq_unfreeze_queue(q); + blk_mq_unfreeze_queue(q, memflags); return -EBUSY; } @@ -335,17 +336,18 @@ void rq_qos_del(struct rq_qos *rqos) { struct request_queue *q = rqos->disk->queue; struct rq_qos **cur; + unsigned int memflags; lockdep_assert_held(&q->rq_qos_mutex); - blk_mq_freeze_queue(q); + memflags = blk_mq_freeze_queue(q); for (cur = &q->rq_qos; *cur; cur = &(*cur)->next) { if (*cur == rqos) { *cur = rqos->next; break; } } - blk_mq_unfreeze_queue(q); + blk_mq_unfreeze_queue(q, memflags); mutex_lock(&q->debugfs_mutex); blk_mq_debugfs_unregister_rqos(rqos); diff --git a/block/blk-settings.c b/block/blk-settings.c index 8f09e33f41f6..b9c6f0ec1c49 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -175,6 +175,9 @@ static void blk_validate_atomic_write_limits(struct queue_limits *lim) { unsigned int boundary_sectors; + if (!(lim->features & BLK_FEAT_ATOMIC_WRITES)) + goto unsupported; + if (!lim->atomic_write_hw_max) goto unsupported; @@ -243,6 +246,7 @@ int blk_validate_limits(struct queue_limits *lim) { unsigned int max_hw_sectors; unsigned int logical_block_sectors; + unsigned long seg_size; int err; /* @@ -300,7 +304,7 @@ int blk_validate_limits(struct queue_limits *lim) max_hw_sectors = min_not_zero(lim->max_hw_sectors, lim->max_dev_sectors); if (lim->max_user_sectors) { - if (lim->max_user_sectors < PAGE_SIZE / SECTOR_SIZE) + if (lim->max_user_sectors < BLK_MIN_SEGMENT_SIZE / SECTOR_SIZE) return -EINVAL; lim->max_sectors = min(max_hw_sectors, lim->max_user_sectors); } else if (lim->io_opt > (BLK_DEF_MAX_SECTORS_CAP << SECTOR_SHIFT)) { @@ -338,7 +342,7 @@ int blk_validate_limits(struct queue_limits *lim) */ if (!lim->seg_boundary_mask) lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; - if (WARN_ON_ONCE(lim->seg_boundary_mask < PAGE_SIZE - 1)) + if (WARN_ON_ONCE(lim->seg_boundary_mask < BLK_MIN_SEGMENT_SIZE - 1)) return -EINVAL; /* @@ -359,10 +363,17 @@ int blk_validate_limits(struct queue_limits *lim) */ if (!lim->max_segment_size) lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; - if (WARN_ON_ONCE(lim->max_segment_size < PAGE_SIZE)) + if (WARN_ON_ONCE(lim->max_segment_size < BLK_MIN_SEGMENT_SIZE)) return -EINVAL; } + /* setup min segment size for building new segment in fast path */ + if (lim->seg_boundary_mask > lim->max_segment_size - 1) + seg_size = lim->max_segment_size; + else + seg_size = lim->seg_boundary_mask + 1; + lim->min_segment_size = min_t(unsigned int, seg_size, PAGE_SIZE); + /* * We require drivers to at least do logical block aligned I/O, but * historically could not check for that due to the separate calls @@ -413,7 +424,8 @@ int blk_set_default_limits(struct queue_limits *lim) * @lim: limits to apply * * Apply the limits in @lim that were obtained from queue_limits_start_update() - * and updated by the caller to @q. + * and updated by the caller to @q. The caller must have frozen the queue or + * ensure that there are no outstanding I/Os by other means. * * Returns 0 if successful, else a negative error code. 
*/ @@ -444,6 +456,31 @@ out_unlock: EXPORT_SYMBOL_GPL(queue_limits_commit_update); /** + * queue_limits_commit_update_frozen - commit an atomic update of queue limits + * @q: queue to update + * @lim: limits to apply + * + * Apply the limits in @lim that were obtained from queue_limits_start_update() + * and updated with the new values by the caller to @q. Freezes the queue + * before the update and unfreezes it after. + * + * Returns 0 if successful, else a negative error code. + */ +int queue_limits_commit_update_frozen(struct request_queue *q, + struct queue_limits *lim) +{ + unsigned int memflags; + int ret; + + memflags = blk_mq_freeze_queue(q); + ret = queue_limits_commit_update(q, lim); + blk_mq_unfreeze_queue(q, memflags); + + return ret; +} +EXPORT_SYMBOL_GPL(queue_limits_commit_update_frozen); + +/** * queue_limits_set - apply queue limits to queue * @q: queue to update * @lim: limits to apply @@ -584,12 +621,15 @@ static bool blk_stack_atomic_writes_head(struct queue_limits *t, } static void blk_stack_atomic_writes_limits(struct queue_limits *t, - struct queue_limits *b) + struct queue_limits *b, sector_t start) { - if (!(t->features & BLK_FEAT_ATOMIC_WRITES_STACKED)) + if (!(b->features & BLK_FEAT_ATOMIC_WRITES)) + goto unsupported; + + if (!b->atomic_write_hw_unit_min) goto unsupported; - if (!b->atomic_write_unit_min) + if (!blk_atomic_write_start_sect_aligned(start, b)) goto unsupported; /* @@ -611,7 +651,6 @@ unsupported: t->atomic_write_hw_unit_max = 0; t->atomic_write_hw_unit_min = 0; t->atomic_write_hw_boundary = 0; - t->features &= ~BLK_FEAT_ATOMIC_WRITES_STACKED; } /** @@ -774,7 +813,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->zone_write_granularity = 0; t->max_zone_append_sectors = 0; } - blk_stack_atomic_writes_limits(t, b); + blk_stack_atomic_writes_limits(t, b, start); return ret; } diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 767598e719ab..6f548a4376aa 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -24,6 +24,8 @@ struct queue_sysfs_entry { struct attribute attr; ssize_t (*show)(struct gendisk *disk, char *page); ssize_t (*store)(struct gendisk *disk, const char *page, size_t count); + int (*store_limit)(struct gendisk *disk, const char *page, + size_t count, struct queue_limits *lim); void (*load_module)(struct gendisk *disk, const char *page, size_t count); }; @@ -153,13 +155,11 @@ QUEUE_SYSFS_SHOW_CONST(discard_zeroes_data, 0) QUEUE_SYSFS_SHOW_CONST(write_same_max, 0) QUEUE_SYSFS_SHOW_CONST(poll_delay, -1) -static ssize_t queue_max_discard_sectors_store(struct gendisk *disk, - const char *page, size_t count) +static int queue_max_discard_sectors_store(struct gendisk *disk, + const char *page, size_t count, struct queue_limits *lim) { unsigned long max_discard_bytes; - struct queue_limits lim; ssize_t ret; - int err; ret = queue_var_store(&max_discard_bytes, page, count); if (ret < 0) @@ -171,38 +171,28 @@ static ssize_t queue_max_discard_sectors_store(struct gendisk *disk, if ((max_discard_bytes >> SECTOR_SHIFT) > UINT_MAX) return -EINVAL; - lim = queue_limits_start_update(disk->queue); - lim.max_user_discard_sectors = max_discard_bytes >> SECTOR_SHIFT; - err = queue_limits_commit_update(disk->queue, &lim); - if (err) - return err; - return ret; + lim->max_user_discard_sectors = max_discard_bytes >> SECTOR_SHIFT; + return 0; } -static ssize_t -queue_max_sectors_store(struct gendisk *disk, const char *page, size_t count) +static int +queue_max_sectors_store(struct gendisk *disk, const char *page, size_t 
count, + struct queue_limits *lim) { unsigned long max_sectors_kb; - struct queue_limits lim; ssize_t ret; - int err; ret = queue_var_store(&max_sectors_kb, page, count); if (ret < 0) return ret; - lim = queue_limits_start_update(disk->queue); - lim.max_user_sectors = max_sectors_kb << 1; - err = queue_limits_commit_update(disk->queue, &lim); - if (err) - return err; - return ret; + lim->max_user_sectors = max_sectors_kb << 1; + return 0; } static ssize_t queue_feature_store(struct gendisk *disk, const char *page, - size_t count, blk_features_t feature) + size_t count, struct queue_limits *lim, blk_features_t feature) { - struct queue_limits lim; unsigned long val; ssize_t ret; @@ -210,15 +200,11 @@ static ssize_t queue_feature_store(struct gendisk *disk, const char *page, if (ret < 0) return ret; - lim = queue_limits_start_update(disk->queue); if (val) - lim.features |= feature; + lim->features |= feature; else - lim.features &= ~feature; - ret = queue_limits_commit_update(disk->queue, &lim); - if (ret) - return ret; - return count; + lim->features &= ~feature; + return 0; } #define QUEUE_SYSFS_FEATURE(_name, _feature) \ @@ -227,10 +213,10 @@ static ssize_t queue_##_name##_show(struct gendisk *disk, char *page) \ return sysfs_emit(page, "%u\n", \ !!(disk->queue->limits.features & _feature)); \ } \ -static ssize_t queue_##_name##_store(struct gendisk *disk, \ - const char *page, size_t count) \ +static int queue_##_name##_store(struct gendisk *disk, \ + const char *page, size_t count, struct queue_limits *lim) \ { \ - return queue_feature_store(disk, page, count, _feature); \ + return queue_feature_store(disk, page, count, lim, _feature); \ } QUEUE_SYSFS_FEATURE(rotational, BLK_FEAT_ROTATIONAL) @@ -245,10 +231,17 @@ static ssize_t queue_##_name##_show(struct gendisk *disk, char *page) \ !!(disk->queue->limits.features & _feature)); \ } -QUEUE_SYSFS_FEATURE_SHOW(poll, BLK_FEAT_POLL); QUEUE_SYSFS_FEATURE_SHOW(fua, BLK_FEAT_FUA); QUEUE_SYSFS_FEATURE_SHOW(dax, BLK_FEAT_DAX); +static ssize_t queue_poll_show(struct gendisk *disk, char *page) +{ + if (queue_is_mq(disk->queue)) + return sysfs_emit(page, "%u\n", blk_mq_can_poll(disk->queue)); + return sysfs_emit(page, "%u\n", + !!(disk->queue->limits.features & BLK_FEAT_POLL)); +} + static ssize_t queue_zoned_show(struct gendisk *disk, char *page) { if (blk_queue_is_zoned(disk->queue)) @@ -266,10 +259,9 @@ static ssize_t queue_iostats_passthrough_show(struct gendisk *disk, char *page) return queue_var_show(!!blk_queue_passthrough_stat(disk->queue), page); } -static ssize_t queue_iostats_passthrough_store(struct gendisk *disk, - const char *page, size_t count) +static int queue_iostats_passthrough_store(struct gendisk *disk, + const char *page, size_t count, struct queue_limits *lim) { - struct queue_limits lim; unsigned long ios; ssize_t ret; @@ -277,18 +269,13 @@ static ssize_t queue_iostats_passthrough_store(struct gendisk *disk, if (ret < 0) return ret; - lim = queue_limits_start_update(disk->queue); if (ios) - lim.flags |= BLK_FLAG_IOSTATS_PASSTHROUGH; + lim->flags |= BLK_FLAG_IOSTATS_PASSTHROUGH; else - lim.flags &= ~BLK_FLAG_IOSTATS_PASSTHROUGH; - - ret = queue_limits_commit_update(disk->queue, &lim); - if (ret) - return ret; - - return count; + lim->flags &= ~BLK_FLAG_IOSTATS_PASSTHROUGH; + return 0; } + static ssize_t queue_nomerges_show(struct gendisk *disk, char *page) { return queue_var_show((blk_queue_nomerges(disk->queue) << 1) | @@ -391,12 +378,10 @@ static ssize_t queue_wc_show(struct gendisk *disk, char *page) return 
sysfs_emit(page, "write through\n"); } -static ssize_t queue_wc_store(struct gendisk *disk, const char *page, - size_t count) +static int queue_wc_store(struct gendisk *disk, const char *page, + size_t count, struct queue_limits *lim) { - struct queue_limits lim; bool disable; - int err; if (!strncmp(page, "write back", 10)) { disable = false; @@ -407,15 +392,11 @@ static ssize_t queue_wc_store(struct gendisk *disk, const char *page, return -EINVAL; } - lim = queue_limits_start_update(disk->queue); if (disable) - lim.flags |= BLK_FLAG_WRITE_CACHE_DISABLED; + lim->flags |= BLK_FLAG_WRITE_CACHE_DISABLED; else - lim.flags &= ~BLK_FLAG_WRITE_CACHE_DISABLED; - err = queue_limits_commit_update(disk->queue, &lim); - if (err) - return err; - return count; + lim->flags &= ~BLK_FLAG_WRITE_CACHE_DISABLED; + return 0; } #define QUEUE_RO_ENTRY(_prefix, _name) \ @@ -431,6 +412,13 @@ static struct queue_sysfs_entry _prefix##_entry = { \ .store = _prefix##_store, \ }; +#define QUEUE_LIM_RW_ENTRY(_prefix, _name) \ +static struct queue_sysfs_entry _prefix##_entry = { \ + .attr = { .name = _name, .mode = 0644 }, \ + .show = _prefix##_show, \ + .store_limit = _prefix##_store, \ +} + #define QUEUE_RW_LOAD_MODULE_ENTRY(_prefix, _name) \ static struct queue_sysfs_entry _prefix##_entry = { \ .attr = { .name = _name, .mode = 0644 }, \ @@ -441,7 +429,7 @@ static struct queue_sysfs_entry _prefix##_entry = { \ QUEUE_RW_ENTRY(queue_requests, "nr_requests"); QUEUE_RW_ENTRY(queue_ra, "read_ahead_kb"); -QUEUE_RW_ENTRY(queue_max_sectors, "max_sectors_kb"); +QUEUE_LIM_RW_ENTRY(queue_max_sectors, "max_sectors_kb"); QUEUE_RO_ENTRY(queue_max_hw_sectors, "max_hw_sectors_kb"); QUEUE_RO_ENTRY(queue_max_segments, "max_segments"); QUEUE_RO_ENTRY(queue_max_integrity_segments, "max_integrity_segments"); @@ -457,7 +445,7 @@ QUEUE_RO_ENTRY(queue_io_opt, "optimal_io_size"); QUEUE_RO_ENTRY(queue_max_discard_segments, "max_discard_segments"); QUEUE_RO_ENTRY(queue_discard_granularity, "discard_granularity"); QUEUE_RO_ENTRY(queue_max_hw_discard_sectors, "discard_max_hw_bytes"); -QUEUE_RW_ENTRY(queue_max_discard_sectors, "discard_max_bytes"); +QUEUE_LIM_RW_ENTRY(queue_max_discard_sectors, "discard_max_bytes"); QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data"); QUEUE_RO_ENTRY(queue_atomic_write_max_sectors, "atomic_write_max_bytes"); @@ -477,11 +465,11 @@ QUEUE_RO_ENTRY(queue_max_open_zones, "max_open_zones"); QUEUE_RO_ENTRY(queue_max_active_zones, "max_active_zones"); QUEUE_RW_ENTRY(queue_nomerges, "nomerges"); -QUEUE_RW_ENTRY(queue_iostats_passthrough, "iostats_passthrough"); +QUEUE_LIM_RW_ENTRY(queue_iostats_passthrough, "iostats_passthrough"); QUEUE_RW_ENTRY(queue_rq_affinity, "rq_affinity"); QUEUE_RW_ENTRY(queue_poll, "io_poll"); QUEUE_RW_ENTRY(queue_poll_delay, "io_poll_delay"); -QUEUE_RW_ENTRY(queue_wc, "write_cache"); +QUEUE_LIM_RW_ENTRY(queue_wc, "write_cache"); QUEUE_RO_ENTRY(queue_fua, "fua"); QUEUE_RO_ENTRY(queue_dax, "dax"); QUEUE_RW_ENTRY(queue_io_timeout, "io_timeout"); @@ -494,10 +482,10 @@ static struct queue_sysfs_entry queue_hw_sector_size_entry = { .show = queue_logical_block_size_show, }; -QUEUE_RW_ENTRY(queue_rotational, "rotational"); -QUEUE_RW_ENTRY(queue_iostats, "iostats"); -QUEUE_RW_ENTRY(queue_add_random, "add_random"); -QUEUE_RW_ENTRY(queue_stable_writes, "stable_writes"); +QUEUE_LIM_RW_ENTRY(queue_rotational, "rotational"); +QUEUE_LIM_RW_ENTRY(queue_iostats, "iostats"); +QUEUE_LIM_RW_ENTRY(queue_add_random, "add_random"); +QUEUE_LIM_RW_ENTRY(queue_stable_writes, "stable_writes"); #ifdef 
CONFIG_BLK_WBT static ssize_t queue_var_store64(s64 *var, const char *page) @@ -693,9 +681,10 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr, struct queue_sysfs_entry *entry = to_queue(attr); struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj); struct request_queue *q = disk->queue; + unsigned int memflags; ssize_t res; - if (!entry->store) + if (!entry->store_limit && !entry->store) return -EIO; /* @@ -706,11 +695,26 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr, if (entry->load_module) entry->load_module(disk, page, length); - blk_mq_freeze_queue(q); + if (entry->store_limit) { + struct queue_limits lim = queue_limits_start_update(q); + + res = entry->store_limit(disk, page, length, &lim); + if (res < 0) { + queue_limits_cancel_update(q); + return res; + } + + res = queue_limits_commit_update_frozen(q, &lim); + if (res) + return res; + return length; + } + mutex_lock(&q->sysfs_lock); + memflags = blk_mq_freeze_queue(q); res = entry->store(disk, page, length); + blk_mq_unfreeze_queue(q, memflags); mutex_unlock(&q->sysfs_lock); - blk_mq_unfreeze_queue(q); return res; } @@ -758,7 +762,6 @@ int blk_register_queue(struct gendisk *disk) struct request_queue *q = disk->queue; int ret; - mutex_lock(&q->sysfs_dir_lock); kobject_init(&disk->queue_kobj, &blk_queue_ktype); ret = kobject_add(&disk->queue_kobj, &disk_to_dev(disk)->kobj, "queue"); if (ret < 0) @@ -799,7 +802,6 @@ int blk_register_queue(struct gendisk *disk) if (q->elevator) kobject_uevent(&q->elevator->kobj, KOBJ_ADD); mutex_unlock(&q->sysfs_lock); - mutex_unlock(&q->sysfs_dir_lock); /* * SCSI probing may synchronously create and destroy a lot of @@ -824,7 +826,6 @@ out_debugfs_remove: mutex_unlock(&q->sysfs_lock); out_put_queue_kobj: kobject_put(&disk->queue_kobj); - mutex_unlock(&q->sysfs_dir_lock); return ret; } @@ -855,7 +856,6 @@ void blk_unregister_queue(struct gendisk *disk) blk_queue_flag_clear(QUEUE_FLAG_REGISTERED, q); mutex_unlock(&q->sysfs_lock); - mutex_lock(&q->sysfs_dir_lock); /* * Remove the sysfs attributes before unregistering the queue data * structures that can be modified through sysfs. @@ -872,7 +872,6 @@ void blk_unregister_queue(struct gendisk *disk) /* Now that we've deleted all child objects, we can delete the queue. */ kobject_uevent(&disk->queue_kobj, KOBJ_REMOVE); kobject_del(&disk->queue_kobj); - mutex_unlock(&q->sysfs_dir_lock); blk_debugfs_remove(disk); } diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 82dbaefcfa3b..8d149aff9fd0 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1202,6 +1202,7 @@ static int blk_throtl_init(struct gendisk *disk) { struct request_queue *q = disk->queue; struct throtl_data *td; + unsigned int memflags; int ret; td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); @@ -1215,7 +1216,7 @@ static int blk_throtl_init(struct gendisk *disk) * Freeze queue before activating policy, to synchronize with IO path, * which is protected by 'q_usage_counter'. 
*/ - blk_mq_freeze_queue(disk->queue); + memflags = blk_mq_freeze_queue(disk->queue); blk_mq_quiesce_queue(disk->queue); q->td = td; @@ -1239,7 +1240,7 @@ static int blk_throtl_init(struct gendisk *disk) out: blk_mq_unquiesce_queue(disk->queue); - blk_mq_unfreeze_queue(disk->queue); + blk_mq_unfreeze_queue(disk->queue, memflags); return ret; } diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 84da1eadff64..0c77244a35c9 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -11,12 +11,8 @@ */ #include <linux/kernel.h> -#include <linux/module.h> #include <linux/blkdev.h> #include <linux/blk-mq.h> -#include <linux/mm.h> -#include <linux/vmalloc.h> -#include <linux/sched/mm.h> #include <linux/spinlock.h> #include <linux/refcount.h> #include <linux/mempool.h> @@ -414,13 +410,14 @@ static bool disk_insert_zone_wplug(struct gendisk *disk, } } hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]); + atomic_inc(&disk->nr_zone_wplugs); spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); return true; } -static struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk, - sector_t sector) +static struct blk_zone_wplug *disk_get_hashed_zone_wplug(struct gendisk *disk, + sector_t sector) { unsigned int zno = disk_zone_no(disk, sector); unsigned int idx = hash_32(zno, disk->zone_wplugs_hash_bits); @@ -441,6 +438,15 @@ static struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk, return NULL; } +static inline struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk, + sector_t sector) +{ + if (!atomic_read(&disk->nr_zone_wplugs)) + return NULL; + + return disk_get_hashed_zone_wplug(disk, sector); +} + static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head) { struct blk_zone_wplug *zwplug = @@ -463,6 +469,8 @@ static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug) static inline bool disk_should_remove_zone_wplug(struct gendisk *disk, struct blk_zone_wplug *zwplug) { + lockdep_assert_held(&zwplug->lock); + /* If the zone write plug was already removed, we are done. */ if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) return false; @@ -505,6 +513,7 @@ static void disk_remove_zone_wplug(struct gendisk *disk, zwplug->flags |= BLK_ZONE_WPLUG_UNHASHED; spin_lock_irqsave(&disk->zone_wplugs_lock, flags); hlist_del_init_rcu(&zwplug->node); + atomic_dec(&disk->nr_zone_wplugs); spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); disk_put_zone_wplug(zwplug); } @@ -584,6 +593,7 @@ static inline void blk_zone_wplug_bio_io_error(struct blk_zone_wplug *zwplug, bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING); bio_io_error(bio); disk_put_zone_wplug(zwplug); + /* Drop the reference taken by disk_zone_wplug_add_bio(). */ blk_queue_exit(q); } @@ -594,6 +604,11 @@ static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug) { struct bio *bio; + if (bio_list_empty(&zwplug->bio_list)) + return; + + pr_warn_ratelimited("%s: zone %u: Aborting plugged BIOs\n", + zwplug->disk->disk_name, zwplug->zone_no); while ((bio = bio_list_pop(&zwplug->bio_list))) blk_zone_wplug_bio_io_error(zwplug, bio); } @@ -895,10 +910,7 @@ void blk_zone_write_plug_init_request(struct request *req) break; } - /* - * Drop the extra reference on the queue usage we got when - * plugging the BIO and advance the write pointer offset. - */ + /* Drop the reference taken by disk_zone_wplug_add_bio().
*/ blk_queue_exit(q); zwplug->wp_offset += bio_sectors(bio); @@ -917,6 +929,8 @@ static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug, { struct gendisk *disk = bio->bi_bdev->bd_disk; + lockdep_assert_held(&zwplug->lock); + /* * If we lost track of the zone write pointer due to a write error, * the user must either execute a report zones, reset the zone or finish @@ -1042,6 +1056,47 @@ plug: return true; } +static void blk_zone_wplug_handle_native_zone_append(struct bio *bio) +{ + struct gendisk *disk = bio->bi_bdev->bd_disk; + struct blk_zone_wplug *zwplug; + unsigned long flags; + + /* + * We have native support for zone append operations, so we are not + * going to handle @bio through plugging. However, we may already have a + * zone write plug for the target zone if that zone was previously + * partially written using regular writes. In such a case, we risk leaving + * the plug in the disk hash table if the zone is fully written using + * zone append operations. Avoid this by removing the zone write plug. + */ + zwplug = disk_get_zone_wplug(disk, bio->bi_iter.bi_sector); + if (likely(!zwplug)) + return; + + spin_lock_irqsave(&zwplug->lock, flags); + + /* + * We are about to remove the zone write plug. But if the user + * (mistakenly) has issued regular writes together with native zone + * append, we must abort the writes as otherwise the plugged BIOs would + * not be executed by the plug BIO work as disk_get_zone_wplug() will + * return NULL after the plug is removed. Aborting the plugged write + * BIOs is consistent with the fact that these writes will most likely + * fail anyway as there are no ordering guarantees between zone append + * operations and regular write operations. + */ + if (!bio_list_empty(&zwplug->bio_list)) { + pr_warn_ratelimited("%s: zone %u: Invalid mix of zone append and regular writes\n", + disk->disk_name, zwplug->zone_no); + disk_zone_wplug_abort(zwplug); + } + disk_remove_zone_wplug(disk, zwplug); + spin_unlock_irqrestore(&zwplug->lock, flags); + + disk_put_zone_wplug(zwplug); +} + /** * blk_zone_plug_bio - Handle a zone write BIO with zone write plugging * @bio: The BIO being submitted @@ -1098,8 +1153,10 @@ bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs) */ switch (bio_op(bio)) { case REQ_OP_ZONE_APPEND: - if (!bdev_emulates_zone_append(bdev)) + if (!bdev_emulates_zone_append(bdev)) { + blk_zone_wplug_handle_native_zone_append(bio); return false; + } fallthrough; case REQ_OP_WRITE: case REQ_OP_WRITE_ZEROES: @@ -1286,6 +1343,7 @@ static int disk_alloc_zone_resources(struct gendisk *disk, { unsigned int i; + atomic_set(&disk->nr_zone_wplugs, 0); disk->zone_wplugs_hash_bits = min(ilog2(pool_size) + 1, BLK_ZONE_WPLUG_MAX_HASH_BITS); @@ -1340,6 +1398,7 @@ static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk) } } + WARN_ON_ONCE(atomic_read(&disk->nr_zone_wplugs)); kfree(disk->zone_wplugs_hash); disk->zone_wplugs_hash = NULL; disk->zone_wplugs_hash_bits = 0; @@ -1446,7 +1505,6 @@ static int disk_update_zone_resources(struct gendisk *disk, unsigned int nr_seq_zones, nr_conv_zones; unsigned int pool_size; struct queue_limits lim; - int ret; disk->nr_zones = args->nr_zones; disk->zone_capacity = args->zone_capacity; @@ -1497,11 +1555,7 @@ static int disk_update_zone_resources(struct gendisk *disk, } commit: - blk_mq_freeze_queue(q); - ret = queue_limits_commit_update(q, &lim); - blk_mq_unfreeze_queue(q); - - return ret; + return queue_limits_commit_update_frozen(q, &lim); } static int blk_revalidate_conv_zone(struct blk_zone
*zone, unsigned int idx, @@ -1557,11 +1611,12 @@ static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx, } /* - * We need to track the write pointer of all zones that are not - * empty nor full. So make sure we have a zone write plug for - * such zone if the device has a zone write plug hash table. + * If the device needs zone append emulation, we need to track the + * write pointer of all zones that are not empty nor full. So make sure + * we have a zone write plug for such zone if the device has a zone + * write plug hash table. */ - if (!disk->zone_wplugs_hash) + if (!queue_emulates_zone_append(disk->queue) || !disk->zone_wplugs_hash) return 0; disk_zone_wplug_sync_wp_offset(disk, zone); @@ -1724,9 +1779,10 @@ int blk_revalidate_disk_zones(struct gendisk *disk) else pr_warn("%s: failed to revalidate zones\n", disk->disk_name); if (ret) { - blk_mq_freeze_queue(q); + unsigned int memflags = blk_mq_freeze_queue(q); + disk_free_zone_resources(disk); - blk_mq_unfreeze_queue(q); + blk_mq_unfreeze_queue(q, memflags); } return ret; @@ -1776,37 +1832,41 @@ int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector, EXPORT_SYMBOL_GPL(blk_zone_issue_zeroout); #ifdef CONFIG_BLK_DEBUG_FS +static void queue_zone_wplug_show(struct blk_zone_wplug *zwplug, + struct seq_file *m) +{ + unsigned int zwp_wp_offset, zwp_flags; + unsigned int zwp_zone_no, zwp_ref; + unsigned int zwp_bio_list_size; + unsigned long flags; + + spin_lock_irqsave(&zwplug->lock, flags); + zwp_zone_no = zwplug->zone_no; + zwp_flags = zwplug->flags; + zwp_ref = refcount_read(&zwplug->ref); + zwp_wp_offset = zwplug->wp_offset; + zwp_bio_list_size = bio_list_size(&zwplug->bio_list); + spin_unlock_irqrestore(&zwplug->lock, flags); + + seq_printf(m, "%u 0x%x %u %u %u\n", zwp_zone_no, zwp_flags, zwp_ref, + zwp_wp_offset, zwp_bio_list_size); +} int queue_zone_wplugs_show(void *data, struct seq_file *m) { struct request_queue *q = data; struct gendisk *disk = q->disk; struct blk_zone_wplug *zwplug; - unsigned int zwp_wp_offset, zwp_flags; - unsigned int zwp_zone_no, zwp_ref; - unsigned int zwp_bio_list_size, i; - unsigned long flags; + unsigned int i; if (!disk->zone_wplugs_hash) return 0; rcu_read_lock(); - for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) { - hlist_for_each_entry_rcu(zwplug, - &disk->zone_wplugs_hash[i], node) { - spin_lock_irqsave(&zwplug->lock, flags); - zwp_zone_no = zwplug->zone_no; - zwp_flags = zwplug->flags; - zwp_ref = refcount_read(&zwplug->ref); - zwp_wp_offset = zwplug->wp_offset; - zwp_bio_list_size = bio_list_size(&zwplug->bio_list); - spin_unlock_irqrestore(&zwplug->lock, flags); - - seq_printf(m, "%u 0x%x %u %u %u\n", - zwp_zone_no, zwp_flags, zwp_ref, - zwp_wp_offset, zwp_bio_list_size); - } - } + for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) + hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[i], + node) + queue_zone_wplug_show(zwplug, m); rcu_read_unlock(); return 0; diff --git a/block/blk.h b/block/blk.h index 2c26abf505b8..9cf9a0099416 100644 --- a/block/blk.h +++ b/block/blk.h @@ -13,6 +13,9 @@ struct elevator_type; +#define BLK_DEV_MAX_SECTORS (LLONG_MAX >> 9) +#define BLK_MIN_SEGMENT_SIZE 4096 + /* Max future timer expiry for timeouts */ #define BLK_MAX_TIMEOUT (5 * HZ) @@ -356,8 +359,12 @@ struct bio *bio_split_zone_append(struct bio *bio, static inline bool bio_may_need_split(struct bio *bio, const struct queue_limits *lim) { - return lim->chunk_sectors || bio->bi_vcnt != 1 || - bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > PAGE_SIZE; + if 
(lim->chunk_sectors) + return true; + if (bio->bi_vcnt != 1) + return true; + return bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > + lim->min_segment_size; } /** @@ -556,14 +563,6 @@ void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors); struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, struct lock_class_key *lkclass); -int bio_add_hw_page(struct request_queue *q, struct bio *bio, - struct page *page, unsigned int len, unsigned int offset, - unsigned int max_sectors, bool *same_page); - -int bio_add_hw_folio(struct request_queue *q, struct bio *bio, - struct folio *folio, size_t len, size_t offset, - unsigned int max_sectors, bool *same_page); - /* * Clean up a page appropriately, where the page may be pinned, may have a * ref taken on it or neither. @@ -720,22 +719,29 @@ void blk_integrity_verify(struct bio *bio); void blk_integrity_prepare(struct request *rq); void blk_integrity_complete(struct request *rq, unsigned int nr_bytes); -static inline void blk_freeze_acquire_lock(struct request_queue *q, bool - disk_dead, bool queue_dying) +#ifdef CONFIG_LOCKDEP +static inline void blk_freeze_acquire_lock(struct request_queue *q) { - if (!disk_dead) + if (!q->mq_freeze_disk_dead) rwsem_acquire(&q->io_lockdep_map, 0, 1, _RET_IP_); - if (!queue_dying) + if (!q->mq_freeze_queue_dying) rwsem_acquire(&q->q_lockdep_map, 0, 1, _RET_IP_); } -static inline void blk_unfreeze_release_lock(struct request_queue *q, bool - disk_dead, bool queue_dying) +static inline void blk_unfreeze_release_lock(struct request_queue *q) { - if (!queue_dying) + if (!q->mq_freeze_queue_dying) rwsem_release(&q->q_lockdep_map, _RET_IP_); - if (!disk_dead) + if (!q->mq_freeze_disk_dead) rwsem_release(&q->io_lockdep_map, _RET_IP_); } +#else +static inline void blk_freeze_acquire_lock(struct request_queue *q) +{ +} +static inline void blk_unfreeze_release_lock(struct request_queue *q) +{ +} +#endif #endif /* BLK_INTERNAL_H */ diff --git a/block/bsg-lib.c b/block/bsg-lib.c index 32da4a4429ce..93523d8f8195 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -381,7 +381,7 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name, set->queue_depth = 128; set->numa_node = NUMA_NO_NODE; set->cmd_size = sizeof(struct bsg_job) + dd_job_size; - set->flags = BLK_MQ_F_NO_SCHED | BLK_MQ_F_BLOCKING; + set->flags = BLK_MQ_F_BLOCKING; if (blk_mq_alloc_tag_set(set)) goto out_tag_set; diff --git a/block/elevator.c b/block/elevator.c index 7c3ba80e5ff4..cd2ce4921601 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -405,12 +405,12 @@ struct request *elv_former_request(struct request_queue *q, struct request *rq) return NULL; } -#define to_elv(atr) container_of((atr), struct elv_fs_entry, attr) +#define to_elv(atr) container_of_const((atr), struct elv_fs_entry, attr) static ssize_t elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { - struct elv_fs_entry *entry = to_elv(attr); + const struct elv_fs_entry *entry = to_elv(attr); struct elevator_queue *e; ssize_t error; @@ -428,7 +428,7 @@ static ssize_t elv_attr_store(struct kobject *kobj, struct attribute *attr, const char *page, size_t length) { - struct elv_fs_entry *entry = to_elv(attr); + const struct elv_fs_entry *entry = to_elv(attr); struct elevator_queue *e; ssize_t error; @@ -461,7 +461,7 @@ int elv_register_queue(struct request_queue *q, bool uevent) error = kobject_add(&e->kobj, &q->disk->queue_kobj, "iosched"); if (!error) { - struct elv_fs_entry *attr = e->type->elevator_attrs; + const struct 
elv_fs_entry *attr = e->type->elevator_attrs; if (attr) { while (attr->attr.name) { if (sysfs_create_file(&e->kobj, &attr->attr)) @@ -547,14 +547,6 @@ void elv_unregister(struct elevator_type *e) } EXPORT_SYMBOL_GPL(elv_unregister); -static inline bool elv_support_iosched(struct request_queue *q) -{ - if (!queue_is_mq(q) || - (q->tag_set->flags & BLK_MQ_F_NO_SCHED)) - return false; - return true; -} - /* * For single queue devices, default to using mq-deadline. If we have multiple * queues or mq-deadline is not available, default to "none". @@ -578,11 +570,9 @@ static struct elevator_type *elevator_get_default(struct request_queue *q) void elevator_init_mq(struct request_queue *q) { struct elevator_type *e; + unsigned int memflags; int err; - if (!elv_support_iosched(q)) - return; - WARN_ON_ONCE(blk_queue_registered(q)); if (unlikely(q->elevator)) @@ -601,16 +591,13 @@ void elevator_init_mq(struct request_queue *q) * * Disk isn't added yet, so verifying queue lock only manually. */ - blk_freeze_queue_start_non_owner(q); - blk_freeze_acquire_lock(q, true, false); - blk_mq_freeze_queue_wait(q); + memflags = blk_mq_freeze_queue(q); blk_mq_cancel_work_sync(q); err = blk_mq_init_sched(q, e); - blk_unfreeze_release_lock(q, true, false); - blk_mq_unfreeze_queue_non_owner(q); + blk_mq_unfreeze_queue(q, memflags); if (err) { pr_warn("\"%s\" elevator initialization failed, " @@ -628,11 +615,12 @@ void elevator_init_mq(struct request_queue *q) */ int elevator_switch(struct request_queue *q, struct elevator_type *new_e) { + unsigned int memflags; int ret; lockdep_assert_held(&q->sysfs_lock); - blk_mq_freeze_queue(q); + memflags = blk_mq_freeze_queue(q); blk_mq_quiesce_queue(q); if (q->elevator) { @@ -653,7 +641,7 @@ int elevator_switch(struct request_queue *q, struct elevator_type *new_e) out_unfreeze: blk_mq_unquiesce_queue(q); - blk_mq_unfreeze_queue(q); + blk_mq_unfreeze_queue(q, memflags); if (ret) { pr_warn("elv: switch to \"%s\" failed, falling back to \"none\"\n", @@ -665,9 +653,11 @@ out_unfreeze: void elevator_disable(struct request_queue *q) { + unsigned int memflags; + lockdep_assert_held(&q->sysfs_lock); - blk_mq_freeze_queue(q); + memflags = blk_mq_freeze_queue(q); blk_mq_quiesce_queue(q); elv_unregister_queue(q); @@ -678,7 +668,7 @@ void elevator_disable(struct request_queue *q) blk_add_trace_msg(q, "elv switch: none"); blk_mq_unquiesce_queue(q); - blk_mq_unfreeze_queue(q); + blk_mq_unfreeze_queue(q, memflags); } /* @@ -717,9 +707,6 @@ void elv_iosched_load_module(struct gendisk *disk, const char *buf, struct elevator_type *found; const char *name; - if (!elv_support_iosched(disk->queue)) - return; - strscpy(elevator_name, buf, sizeof(elevator_name)); name = strstrip(elevator_name); @@ -737,9 +724,6 @@ ssize_t elv_iosched_store(struct gendisk *disk, const char *buf, char elevator_name[ELV_NAME_MAX]; int ret; - if (!elv_support_iosched(disk->queue)) - return count; - strscpy(elevator_name, buf, sizeof(elevator_name)); ret = elevator_change(disk->queue, strstrip(elevator_name)); if (!ret) @@ -754,9 +738,6 @@ ssize_t elv_iosched_show(struct gendisk *disk, char *name) struct elevator_type *cur = NULL, *e; int len = 0; - if (!elv_support_iosched(q)) - return sprintf(name, "none\n"); - if (!q->elevator) { len += sprintf(name+len, "[none] "); } else { diff --git a/block/elevator.h b/block/elevator.h index dbf357ef4fab..e526662c5dbb 100644 --- a/block/elevator.h +++ b/block/elevator.h @@ -71,7 +71,7 @@ struct elevator_type size_t icq_size; /* see iocontext.h */ size_t icq_align; /* ditto */ - 
struct elv_fs_entry *elevator_attrs; + const struct elv_fs_entry *elevator_attrs; const char *elevator_name; const char *elevator_alias; struct module *elevator_owner; diff --git a/block/fops.c b/block/fops.c index 13a67940d040..be9f1dbea9ce 100644 --- a/block/fops.c +++ b/block/fops.c @@ -54,6 +54,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, struct bio bio; ssize_t ret; + WARN_ON_ONCE(iocb->ki_flags & IOCB_HAS_METADATA); if (nr_pages <= DIO_INLINE_BIO_VECS) vecs = inline_vecs; else { @@ -124,12 +125,16 @@ static void blkdev_bio_end_io(struct bio *bio) { struct blkdev_dio *dio = bio->bi_private; bool should_dirty = dio->flags & DIO_SHOULD_DIRTY; + bool is_sync = dio->flags & DIO_IS_SYNC; if (bio->bi_status && !dio->bio.bi_status) dio->bio.bi_status = bio->bi_status; + if (!is_sync && (dio->iocb->ki_flags & IOCB_HAS_METADATA)) + bio_integrity_unmap_user(bio); + if (atomic_dec_and_test(&dio->ref)) { - if (!(dio->flags & DIO_IS_SYNC)) { + if (!is_sync) { struct kiocb *iocb = dio->iocb; ssize_t ret; @@ -221,14 +226,16 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, * a retry of this from blocking context. */ if (unlikely(iov_iter_count(iter))) { - bio_release_pages(bio, false); - bio_clear_flag(bio, BIO_REFFED); - bio_put(bio); - blk_finish_plug(&plug); - return -EAGAIN; + ret = -EAGAIN; + goto fail; } bio->bi_opf |= REQ_NOWAIT; } + if (!is_sync && (iocb->ki_flags & IOCB_HAS_METADATA)) { + ret = bio_integrity_map_iter(bio, iocb->private); + if (unlikely(ret)) + goto fail; + } if (is_read) { if (dio->flags & DIO_SHOULD_DIRTY) @@ -269,6 +276,12 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, bio_put(&dio->bio); return ret; +fail: + bio_release_pages(bio, false); + bio_clear_flag(bio, BIO_REFFED); + bio_put(bio); + blk_finish_plug(&plug); + return ret; } static void blkdev_bio_end_io_async(struct bio *bio) @@ -286,6 +299,9 @@ static void blkdev_bio_end_io_async(struct bio *bio) ret = blk_status_to_errno(bio->bi_status); } + if (iocb->ki_flags & IOCB_HAS_METADATA) + bio_integrity_unmap_user(bio); + iocb->ki_complete(iocb, ret); if (dio->flags & DIO_SHOULD_DIRTY) { @@ -330,10 +346,8 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, bio_iov_bvec_set(bio, iter); } else { ret = bio_iov_iter_get_pages(bio, iter); - if (unlikely(ret)) { - bio_put(bio); - return ret; - } + if (unlikely(ret)) + goto out_bio_put; } dio->size = bio->bi_iter.bi_size; @@ -346,6 +360,13 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, task_io_account_write(bio->bi_iter.bi_size); } + if (iocb->ki_flags & IOCB_HAS_METADATA) { + ret = bio_integrity_map_iter(bio, iocb->private); + WRITE_ONCE(iocb->private, NULL); + if (unlikely(ret)) + goto out_bio_put; + } + if (iocb->ki_flags & IOCB_ATOMIC) bio->bi_opf |= REQ_ATOMIC; @@ -360,6 +381,10 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, submit_bio(bio); } return -EIOCBQUEUED; + +out_bio_put: + bio_put(bio); + return ret; } static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) @@ -758,11 +783,12 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) file_accessed(iocb->ki_filp); ret = blkdev_direct_IO(iocb, to); - if (ret >= 0) { + if (ret > 0) { iocb->ki_pos += ret; count -= ret; } - iov_iter_revert(to, count - iov_iter_count(to)); + if (ret != -EIOCBQUEUED) + iov_iter_revert(to, count - iov_iter_count(to)); if (ret < 0 || !count) goto reexpand; } diff --git a/block/genhd.c b/block/genhd.c index 
79230c109fca..e9375e20d866 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -58,6 +58,13 @@ static DEFINE_IDA(ext_devt_ida); void set_capacity(struct gendisk *disk, sector_t sectors) { + if (sectors > BLK_DEV_MAX_SECTORS) { + pr_warn_once("%s: truncate capacity from %lld to %lld\n", + disk->disk_name, sectors, + BLK_DEV_MAX_SECTORS); + sectors = BLK_DEV_MAX_SECTORS; + } + bdev_set_nr_sectors(disk->part0, sectors); } EXPORT_SYMBOL(set_capacity); @@ -400,21 +407,26 @@ int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk, struct device *ddev = disk_to_dev(disk); int ret; - /* Only makes sense for bio-based to set ->poll_bio */ - if (queue_is_mq(disk->queue) && disk->fops->poll_bio) + if (WARN_ON_ONCE(bdev_nr_sectors(disk->part0) > BLK_DEV_MAX_SECTORS)) return -EINVAL; - /* - * The disk queue should now be all set with enough information about - * the device for the elevator code to pick an adequate default - * elevator if one is needed, that is, for devices requesting queue - * registration. - */ - elevator_init_mq(disk->queue); + if (queue_is_mq(disk->queue)) { + /* + * ->submit_bio and ->poll_bio are bypassed for blk-mq drivers. + */ + if (disk->fops->submit_bio || disk->fops->poll_bio) + return -EINVAL; - /* Mark bdev as having a submit_bio, if needed */ - if (disk->fops->submit_bio) + /* + * Initialize the I/O scheduler code and pick a default one if + * needed. + */ + elevator_init_mq(disk->queue); + } else { + if (!disk->fops->submit_bio) + return -EINVAL; bdev_set_flag(disk->part0, BD_HAS_SUBMIT_BIO); + } /* * If the driver provides an explicit major number it also must provide @@ -661,7 +673,7 @@ void del_gendisk(struct gendisk *disk) struct request_queue *q = disk->queue; struct block_device *part; unsigned long idx; - bool start_drain, queue_dying; + bool start_drain; might_sleep(); @@ -690,9 +702,8 @@ void del_gendisk(struct gendisk *disk) */ mutex_lock(&disk->open_mutex); start_drain = __blk_mark_disk_dead(disk); - queue_dying = blk_queue_dying(q); if (start_drain) - blk_freeze_acquire_lock(q, true, queue_dying); + blk_freeze_acquire_lock(q); xa_for_each_start(&disk->part_tbl, idx, part, 1) drop_partition(part); mutex_unlock(&disk->open_mutex); @@ -748,7 +759,7 @@ void del_gendisk(struct gendisk *disk) blk_mq_exit_queue(q); if (start_drain) - blk_unfreeze_release_lock(q, true, queue_dying); + blk_unfreeze_release_lock(q); } EXPORT_SYMBOL(del_gendisk); @@ -798,7 +809,7 @@ static ssize_t disk_badblocks_store(struct device *dev, } #ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD -void blk_request_module(dev_t devt) +static bool blk_probe_dev(dev_t devt) { unsigned int major = MAJOR(devt); struct blk_major_name **n; @@ -808,14 +819,26 @@ void blk_request_module(dev_t devt) if ((*n)->major == major && (*n)->probe) { (*n)->probe(devt); mutex_unlock(&major_names_lock); - return; + return true; } } mutex_unlock(&major_names_lock); + return false; +} + +void blk_request_module(dev_t devt) +{ + int error; + + if (blk_probe_dev(devt)) + return; - if (request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)) > 0) - /* Make old-style 2.4 aliases work */ - request_module("block-major-%d", MAJOR(devt)); + error = request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)); + /* Make old-style 2.4 aliases work */ + if (error > 0) + error = request_module("block-major-%d", MAJOR(devt)); + if (!error) + blk_probe_dev(devt); } #endif /* CONFIG_BLOCK_LEGACY_AUTOLOAD */ diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index 4155594aefc6..dc31f2dfa414 100644 --- 
a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -889,7 +889,7 @@ KYBER_LAT_SHOW_STORE(KYBER_WRITE, write); #undef KYBER_LAT_SHOW_STORE #define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store) -static struct elv_fs_entry kyber_sched_attrs[] = { +static const struct elv_fs_entry kyber_sched_attrs[] = { KYBER_LAT_ATTR(read), KYBER_LAT_ATTR(write), __ATTR_NULL diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 5528347b5fcf..754f6b7415cd 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -834,7 +834,7 @@ STORE_INT(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX); #define DD_ATTR(name) \ __ATTR(name, 0644, deadline_##name##_show, deadline_##name##_store) -static struct elv_fs_entry deadline_attrs[] = { +static const struct elv_fs_entry deadline_attrs[] = { DD_ATTR(read_expire), DD_ATTR(write_expire), DD_ATTR(writes_starved), diff --git a/block/partitions/efi.c b/block/partitions/efi.c index 5e9be13a56a8..7acba66eed48 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -682,7 +682,7 @@ static void utf16_le_to_7bit(const __le16 *in, unsigned int size, u8 *out) out[size] = 0; while (i < size) { - u8 c = le16_to_cpu(in[i]) & 0xff; + u8 c = le16_to_cpu(in[i]) & 0x7f; if (c && !isprint(c)) c = '!'; diff --git a/block/partitions/ldm.h b/block/partitions/ldm.h index e259180c8914..aa3bd050d8cd 100644 --- a/block/partitions/ldm.h +++ b/block/partitions/ldm.h @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/** +/* * ldm - Part of the Linux-NTFS project. * * Copyright (C) 2001,2002 Richard Russon <ldm@flatcap.org> diff --git a/block/partitions/mac.c b/block/partitions/mac.c index c80183156d68..b02530d98629 100644 --- a/block/partitions/mac.c +++ b/block/partitions/mac.c @@ -53,13 +53,25 @@ int mac_partition(struct parsed_partitions *state) } secsize = be16_to_cpu(md->block_size); put_dev_sector(sect); + + /* + * If the "block size" is not a power of 2, things get weird - we might + * end up with a partition straddling a sector boundary, so we wouldn't + * be able to read a partition entry with read_part_sector(). + * Real block sizes are probably (?) powers of two, so just require + * that. + */ + if (!is_power_of_2(secsize)) + return -1; datasize = round_down(secsize, 512); data = read_part_sector(state, datasize / 512, &sect); if (!data) return -1; partoffset = secsize % 512; - if (partoffset + sizeof(*part) > datasize) + if (partoffset + sizeof(*part) > datasize) { + put_dev_sector(sect); return -1; + } part = (struct mac_partition *) (data + partoffset); if (be16_to_cpu(part->signature) != MAC_PARTITION_MAGIC) { put_dev_sector(sect); @@ -112,8 +124,8 @@ int mac_partition(struct parsed_partitions *state) int i, l; goodness++; - l = strlen(part->name); - if (strcmp(part->name, "/") == 0) + l = strnlen(part->name, sizeof(part->name)); + if (strncmp(part->name, "/", sizeof(part->name)) == 0) goodness++; for (i = 0; i <= l - 4; ++i) { if (strncasecmp(part->name + i, "root",