From eeadd68e2a5f6bfe0bf1038ec49e3a8d99eb5fe8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 5 May 2025 10:11:25 +0200 Subject: block: remove bounce buffering support The block layer bounce buffering support is unused now, remove it. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: John Garry Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20250505081138.3435992-7-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 1 - include/linux/blkdev.h | 5 +---- 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index dce7615c35e7..5a46067e85b1 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -286,7 +286,6 @@ struct bio { enum { BIO_PAGE_PINNED, /* Unpin pages in bio_release_pages() */ BIO_CLONED, /* doesn't own data */ - BIO_BOUNCED, /* bio is a bounce bio */ BIO_QUIET, /* Make BIO Quiet */ BIO_CHAIN, /* chained bio, ->bi_remaining in effect */ BIO_REFFED, /* bio has elevated ->bi_cnt */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f3d74f9dae8e..5ccb961ee2ae 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -331,9 +331,6 @@ typedef unsigned int __bitwise blk_features_t; /* skip this queue in blk_mq_(un)quiesce_tagset */ #define BLK_FEAT_SKIP_TAGSET_QUIESCE ((__force blk_features_t)(1u << 13)) -/* bounce all highmem pages */ -#define BLK_FEAT_BOUNCE_HIGH ((__force blk_features_t)(1u << 14)) - /* undocumented magic for bcache */ #define BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE \ ((__force blk_features_t)(1u << 15)) @@ -347,7 +344,7 @@ typedef unsigned int __bitwise blk_features_t; */ #define BLK_FEAT_INHERIT_MASK \ (BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_ROTATIONAL | \ - BLK_FEAT_STABLE_WRITES | BLK_FEAT_ZONED | BLK_FEAT_BOUNCE_HIGH | \ + BLK_FEAT_STABLE_WRITES | BLK_FEAT_ZONED | \ BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE) /* internal flags in queue_limits.flags */ -- cgit v1.2.3 From 194df9f66db8d6f74f03c78c2ad47b74a5a8b886 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 5 May 2025 10:11:26 +0200 Subject: mm: remove NR_BOUNCE zone stat The stat is always 0 now, so remove it and hardwire the user visible output to 0. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20250505081138.3435992-8-hch@lst.de Signed-off-by: Jens Axboe --- drivers/base/node.c | 2 +- fs/proc/meminfo.c | 3 +-- include/linux/mmzone.h | 1 - mm/show_mem.c | 4 ++-- 4 files changed, 4 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/node.c b/drivers/base/node.c index cd13ef287011..618712071a1e 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -468,7 +468,7 @@ static ssize_t node_read_meminfo(struct device *dev, nid, K(node_page_state(pgdat, NR_PAGETABLE)), nid, K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)), nid, 0UL, - nid, K(sum_zone_node_page_state(nid, NR_BOUNCE)), + nid, 0UL, nid, K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), nid, K(sreclaimable + node_page_state(pgdat, NR_KERNEL_MISC_RECLAIMABLE)), diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 83be312159c9..bc2bc60c36cc 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -120,8 +120,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) global_node_page_state(NR_SECONDARY_PAGETABLE)); show_val_kb(m, "NFS_Unstable: ", 0); - show_val_kb(m, "Bounce: ", - global_zone_page_state(NR_BOUNCE)); + show_val_kb(m, "Bounce: ", 0); show_val_kb(m, "WritebackTmp: ", global_node_page_state(NR_WRITEBACK_TEMP)); show_val_kb(m, "CommitLimit: ", vm_commit_limit()); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 6ccec1bf2896..b1c459f7a485 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -148,7 +148,6 @@ enum zone_stat_item { NR_ZONE_WRITE_PENDING, /* Count of dirty, writeback and unstable pages */ NR_MLOCK, /* mlock()ed pages found and moved off LRU */ /* Second 128 byte cacheline */ - NR_BOUNCE, #if IS_ENABLED(CONFIG_ZSMALLOC) NR_ZSPAGES, /* allocated in zsmalloc */ #endif diff --git a/mm/show_mem.c b/mm/show_mem.c index 6af13bcd2ab3..5acb51a9fc49 100644 --- a/mm/show_mem.c +++ b/mm/show_mem.c @@ -223,7 +223,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z global_node_page_state(NR_SHMEM), global_node_page_state(NR_PAGETABLE), global_node_page_state(NR_SECONDARY_PAGETABLE), - global_zone_page_state(NR_BOUNCE), + 0UL, global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE), global_zone_page_state(NR_FREE_PAGES), free_pcp, @@ -341,7 +341,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z K(zone->present_pages), K(zone_managed_pages(zone)), K(zone_page_state(zone, NR_MLOCK)), - K(zone_page_state(zone, NR_BOUNCE)), + 0UL, K(free_pcp), K(this_cpu_read(zone->per_cpu_pageset->count)), K(zone_page_state(zone, NR_FREE_CMA_PAGES))); -- cgit v1.2.3 From 56dee46ff47f0ef9944dddd1fd137c94b7c2d9de Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 5 May 2025 22:17:40 +0800 Subject: block: move ELEVATOR_FLAG_DISABLE_WBT a request queue flag ELEVATOR_FLAG_DISABLE_WBT is only used by BFQ to disallow wbt when BFQ is in use. The flag is set in BFQ's init(), and cleared in BFQ's exit(). Making it as request queue flag, so that we can avoid to deal with elevator switch race. Also it isn't graceful to checking one scheduler flag in wbt_enable_default(). Reviewed-by: Christoph Hellwig Reviewed-by: Nilay Shroff Reviewed-by: Hannes Reinecke Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250505141805.2751237-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 4 ++-- block/blk-mq-debugfs.c | 1 + block/blk-wbt.c | 3 +-- block/elevator.h | 1 - include/linux/blkdev.h | 3 +++ 5 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index abd80dc13562..cc6f59836dcd 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -7210,7 +7210,7 @@ static void bfq_exit_queue(struct elevator_queue *e) #endif blk_stat_disable_accounting(bfqd->queue); - clear_bit(ELEVATOR_FLAG_DISABLE_WBT, &e->flags); + blk_queue_flag_clear(QUEUE_FLAG_DISABLE_WBT_DEF, bfqd->queue); wbt_enable_default(bfqd->queue->disk); kfree(bfqd); @@ -7397,7 +7397,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) /* We dispatch from request queue wide instead of hw queue */ blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q); - set_bit(ELEVATOR_FLAG_DISABLE_WBT, &eq->flags); + blk_queue_flag_set(QUEUE_FLAG_DISABLE_WBT_DEF, q); wbt_disable_default(q->disk); blk_stat_enable_accounting(q); diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 3421b5521fe2..7710c409e432 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -93,6 +93,7 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(RQ_ALLOC_TIME), QUEUE_FLAG_NAME(HCTX_ACTIVE), QUEUE_FLAG_NAME(SQ_SCHED), + QUEUE_FLAG_NAME(DISABLE_WBT_DEF), }; #undef QUEUE_FLAG_NAME diff --git a/block/blk-wbt.c b/block/blk-wbt.c index f1754d07f7e0..29cd2e33666f 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -704,8 +704,7 @@ void wbt_enable_default(struct gendisk *disk) struct rq_qos *rqos; bool enable = IS_ENABLED(CONFIG_BLK_WBT_MQ); - if (q->elevator && - test_bit(ELEVATOR_FLAG_DISABLE_WBT, &q->elevator->flags)) + if (blk_queue_disable_wbt(q)) enable = false; /* Throttling already enabled? */ diff --git a/block/elevator.h b/block/elevator.h index e4e44dfac503..e27af5492cdb 100644 --- a/block/elevator.h +++ b/block/elevator.h @@ -121,7 +121,6 @@ struct elevator_queue }; #define ELEVATOR_FLAG_REGISTERED 0 -#define ELEVATOR_FLAG_DISABLE_WBT 1 /* * block elevator interface diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5ccb961ee2ae..c36d7a1c2cc0 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -641,6 +641,7 @@ enum { QUEUE_FLAG_RQ_ALLOC_TIME, /* record rq->alloc_time_ns */ QUEUE_FLAG_HCTX_ACTIVE, /* at least one blk-mq hctx is active */ QUEUE_FLAG_SQ_SCHED, /* single queue style io dispatch */ + QUEUE_FLAG_DISABLE_WBT_DEF, /* for sched to disable/enable wbt */ QUEUE_FLAG_MAX }; @@ -676,6 +677,8 @@ void blk_queue_flag_clear(unsigned int flag, struct request_queue *q); #define blk_queue_sq_sched(q) test_bit(QUEUE_FLAG_SQ_SCHED, &(q)->queue_flags) #define blk_queue_skip_tagset_quiesce(q) \ ((q)->limits.features & BLK_FEAT_SKIP_TAGSET_QUIESCE) +#define blk_queue_disable_wbt(q) \ + test_bit(QUEUE_FLAG_DISABLE_WBT_DEF, &(q)->queue_flags) extern void blk_set_pm_only(struct request_queue *q); extern void blk_clear_pm_only(struct request_queue *q); -- cgit v1.2.3 From 98e68f67020ce30e1a4d8e2d05d85a453738dfb8 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 5 May 2025 22:17:46 +0800 Subject: block: prevent adding/deleting disk during updating nr_hw_queues Both adding/deleting disk code are reader of `nr_hw_queues`, so we can't allow them in-progress when updating nr_hw_queues, kernel panic and kasan has been reported in [1]. Prevent adding/deleting disk during updating nr_hw_queues by adding rw_semaphore to tagset, write lock is grabbed in blk_mq_update_nr_hw_queues(), and read lock is acquired when adding/deleting disk. Also mark GFP_NOIO allocation scope for adding/deleting disk because blk_mq_update_nr_hw_queues() is part of some driver's error handler. This way avoids lot of trouble. Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Nilay Shroff Suggested-by: Nilay Shroff Reported-by: Nilay Shroff Closes: https://lore.kernel.org/linux-block/a5896cdb-a59a-4a37-9f99-20522f5d2987@linux.ibm.com/ Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250505141805.2751237-9-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 4 ++ block/genhd.c | 113 ++++++++++++++++++++++++++++++++++--------------- include/linux/blk-mq.h | 3 ++ 3 files changed, 86 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq.c b/block/blk-mq.c index 8caff40c7511..74f4b1d37376 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4846,6 +4846,8 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) goto out_free_srcu; } + init_rwsem(&set->update_nr_hwq_lock); + ret = -ENOMEM; set->tags = kcalloc_node(set->nr_hw_queues, sizeof(struct blk_mq_tags *), GFP_KERNEL, @@ -5141,9 +5143,11 @@ switch_back: void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) { + down_write(&set->update_nr_hwq_lock); mutex_lock(&set->tag_list_lock); __blk_mq_update_nr_hw_queues(set, nr_hw_queues); mutex_unlock(&set->tag_list_lock); + up_write(&set->update_nr_hwq_lock); } EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); diff --git a/block/genhd.c b/block/genhd.c index 50f3deeec5e3..e0dd8ecc925f 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -415,19 +415,9 @@ static void add_disk_final(struct gendisk *disk) set_bit(GD_ADDED, &disk->state); } -/** - * add_disk_fwnode - add disk information to kernel list with fwnode - * @parent: parent device for the disk - * @disk: per-device partitioning information - * @groups: Additional per-device sysfs groups - * @fwnode: attached disk fwnode - * - * This function registers the partitioning information in @disk - * with the kernel. Also attach a fwnode to the disk device. - */ -int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk, - const struct attribute_group **groups, - struct fwnode_handle *fwnode) +static int __add_disk(struct device *parent, struct gendisk *disk, + const struct attribute_group **groups, + struct fwnode_handle *fwnode) { struct device *ddev = disk_to_dev(disk); @@ -550,7 +540,6 @@ int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk, */ disk->part0->bd_dev = MKDEV(disk->major, disk->first_minor); } - add_disk_final(disk); return 0; out_unregister_bdi: @@ -580,6 +569,45 @@ out_exit_elevator: } return ret; } + +/** + * add_disk_fwnode - add disk information to kernel list with fwnode + * @parent: parent device for the disk + * @disk: per-device partitioning information + * @groups: Additional per-device sysfs groups + * @fwnode: attached disk fwnode + * + * This function registers the partitioning information in @disk + * with the kernel. Also attach a fwnode to the disk device. + */ +int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk, + const struct attribute_group **groups, + struct fwnode_handle *fwnode) +{ + struct blk_mq_tag_set *set; + unsigned int memflags; + int ret; + + if (queue_is_mq(disk->queue)) { + set = disk->queue->tag_set; + memflags = memalloc_noio_save(); + down_read(&set->update_nr_hwq_lock); + ret = __add_disk(parent, disk, groups, fwnode); + up_read(&set->update_nr_hwq_lock); + memalloc_noio_restore(memflags); + } else { + ret = __add_disk(parent, disk, groups, fwnode); + } + + /* + * add_disk_final() needn't to read `nr_hw_queues`, so move it out + * of read lock `set->update_nr_hwq_lock` for avoiding unnecessary + * lock dependency on `disk->open_mutex` from scanning partition. + */ + if (!ret) + add_disk_final(disk); + return ret; +} EXPORT_SYMBOL_GPL(add_disk_fwnode); /** @@ -660,26 +688,7 @@ void blk_mark_disk_dead(struct gendisk *disk) } EXPORT_SYMBOL_GPL(blk_mark_disk_dead); -/** - * del_gendisk - remove the gendisk - * @disk: the struct gendisk to remove - * - * Removes the gendisk and all its associated resources. This deletes the - * partitions associated with the gendisk, and unregisters the associated - * request_queue. - * - * This is the counter to the respective __device_add_disk() call. - * - * The final removal of the struct gendisk happens when its refcount reaches 0 - * with put_disk(), which should be called after del_gendisk(), if - * __device_add_disk() was used. - * - * Drivers exist which depend on the release of the gendisk to be synchronous, - * it should not be deferred. - * - * Context: can sleep - */ -void del_gendisk(struct gendisk *disk) +static void __del_gendisk(struct gendisk *disk) { struct request_queue *q = disk->queue; struct block_device *part; @@ -772,6 +781,42 @@ void del_gendisk(struct gendisk *disk) if (start_drain) blk_unfreeze_release_lock(q); } + +/** + * del_gendisk - remove the gendisk + * @disk: the struct gendisk to remove + * + * Removes the gendisk and all its associated resources. This deletes the + * partitions associated with the gendisk, and unregisters the associated + * request_queue. + * + * This is the counter to the respective __device_add_disk() call. + * + * The final removal of the struct gendisk happens when its refcount reaches 0 + * with put_disk(), which should be called after del_gendisk(), if + * __device_add_disk() was used. + * + * Drivers exist which depend on the release of the gendisk to be synchronous, + * it should not be deferred. + * + * Context: can sleep + */ +void del_gendisk(struct gendisk *disk) +{ + struct blk_mq_tag_set *set; + unsigned int memflags; + + if (!queue_is_mq(disk->queue)) { + __del_gendisk(disk); + } else { + set = disk->queue->tag_set; + memflags = memalloc_noio_save(); + down_read(&set->update_nr_hwq_lock); + __del_gendisk(disk); + up_read(&set->update_nr_hwq_lock); + memalloc_noio_restore(memflags); + } +} EXPORT_SYMBOL(del_gendisk); /** diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 8eb9b3310167..ef84d53095a6 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -9,6 +9,7 @@ #include #include #include +#include struct blk_mq_tags; struct blk_flush_queue; @@ -527,6 +528,8 @@ struct blk_mq_tag_set { struct mutex tag_list_lock; struct list_head tag_list; struct srcu_struct *srcu; + + struct rw_semaphore update_nr_hwq_lock; }; /** -- cgit v1.2.3 From 21eed794ab4bd1a6c82a55df4416d18fb4d21da9 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 5 May 2025 22:17:58 +0800 Subject: block: add new helper for disabling elevator switch when deleting disk Add new helper disable_elv_switch() and new flag QUEUE_FLAG_NO_ELV_SWITCH for disabling elevator switch before deleting disk: - originally flag QUEUE_FLAG_REGISTERED is added for preventing elevator switch during removing disk, but this flag has been used widely for other purposes, so add one new flag for disabling elevator switch only - for avoiding deadlock risk, we have to move elevator queue register/unregister out of elevator lock and queue freeze, which will be done in next patch. However, this way adds small race window between elevator switch and deleting ->queue_kobj, in which elevator queue register/unregister could be run concurrently. The added helper will be used for avoiding the race in the following patch. - drain in-progress elevator switch before deleting disk Suggested-by: Nilay Shroff Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Ming Lei Reviewed-by: Nilay Shroff Link: https://lore.kernel.org/r/20250505141805.2751237-21-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 1 + block/elevator.c | 13 ++++++++++--- block/genhd.c | 13 +++++++++++++ include/linux/blkdev.h | 3 +++ 4 files changed, 27 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 2837a8ce8054..29b3540dd180 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -94,6 +94,7 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(HCTX_ACTIVE), QUEUE_FLAG_NAME(SQ_SCHED), QUEUE_FLAG_NAME(DISABLE_WBT_DEF), + QUEUE_FLAG_NAME(NO_ELV_SWITCH), }; #undef QUEUE_FLAG_NAME diff --git a/block/elevator.c b/block/elevator.c index 2edaf84900fc..f7e333abefe3 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -680,6 +680,9 @@ void elevator_set_default(struct request_queue *q) }; int err = 0; + /* now we allow to switch elevator */ + blk_queue_flag_clear(QUEUE_FLAG_NO_ELV_SWITCH, q); + if (q->tag_set->flags & BLK_MQ_F_NO_SCHED_BY_DEFAULT) return; @@ -744,9 +747,13 @@ ssize_t elv_iosched_store(struct gendisk *disk, const char *buf, elv_iosched_load_module(ctx.name); down_read(&set->update_nr_hwq_lock); - ret = elevator_change(q, &ctx); - if (!ret) - ret = count; + if (!blk_queue_no_elv_switch(q)) { + ret = elevator_change(q, &ctx); + if (!ret) + ret = count; + } else { + ret = -ENOENT; + } up_read(&set->update_nr_hwq_lock); return ret; } diff --git a/block/genhd.c b/block/genhd.c index f192fe4808b9..a8cb5607b6e3 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -764,6 +764,16 @@ static void __del_gendisk(struct gendisk *disk) blk_unfreeze_release_lock(q); } +static void disable_elv_switch(struct request_queue *q) +{ + struct blk_mq_tag_set *set = q->tag_set; + WARN_ON_ONCE(!queue_is_mq(q)); + + down_write(&set->update_nr_hwq_lock); + blk_queue_flag_set(QUEUE_FLAG_NO_ELV_SWITCH, q); + up_write(&set->update_nr_hwq_lock); +} + /** * del_gendisk - remove the gendisk * @disk: the struct gendisk to remove @@ -792,6 +802,9 @@ void del_gendisk(struct gendisk *disk) __del_gendisk(disk); } else { set = disk->queue->tag_set; + + disable_elv_switch(disk->queue); + memflags = memalloc_noio_save(); down_read(&set->update_nr_hwq_lock); __del_gendisk(disk); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c36d7a1c2cc0..3aa1fd637d57 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -642,6 +642,7 @@ enum { QUEUE_FLAG_HCTX_ACTIVE, /* at least one blk-mq hctx is active */ QUEUE_FLAG_SQ_SCHED, /* single queue style io dispatch */ QUEUE_FLAG_DISABLE_WBT_DEF, /* for sched to disable/enable wbt */ + QUEUE_FLAG_NO_ELV_SWITCH, /* can't switch elevator any more */ QUEUE_FLAG_MAX }; @@ -679,6 +680,8 @@ void blk_queue_flag_clear(unsigned int flag, struct request_queue *q); ((q)->limits.features & BLK_FEAT_SKIP_TAGSET_QUIESCE) #define blk_queue_disable_wbt(q) \ test_bit(QUEUE_FLAG_DISABLE_WBT_DEF, &(q)->queue_flags) +#define blk_queue_no_elv_switch(q) \ + test_bit(QUEUE_FLAG_NO_ELV_SWITCH, &(q)->queue_flags) extern void blk_set_pm_only(struct request_queue *q); extern void blk_clear_pm_only(struct request_queue *q); -- cgit v1.2.3 From 78c271344b6f64ce24c845e54903e09928cf2061 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 5 May 2025 22:18:03 +0800 Subject: block: move wbt_enable_default() out of queue freezing from sched ->exit() scheduler's ->exit() is called with queue frozen and elevator lock is held, and wbt_enable_default() can't be called with queue frozen, otherwise the following lockdep warning is triggered: #6 (&q->rq_qos_mutex){+.+.}-{4:4}: #5 (&eq->sysfs_lock){+.+.}-{4:4}: #4 (&q->elevator_lock){+.+.}-{4:4}: #3 (&q->q_usage_counter(io)#3){++++}-{0:0}: #2 (fs_reclaim){+.+.}-{0:0}: #1 (&sb->s_type->i_mutex_key#3){+.+.}-{4:4}: #0 (&q->debugfs_mutex){+.+.}-{4:4}: Fix the issue by moving wbt_enable_default() out of bfq's exit(), and call it from elevator_change_done(). Meantime add disk->rqos_state_mutex for covering wbt state change, which matches the purpose more than ->elevator_lock. Reviewed-by: Hannes Reinecke Reviewed-by: Nilay Shroff Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250505141805.2751237-26-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 2 +- block/blk-sysfs.c | 10 ++++------ block/blk-wbt.c | 6 ++++++ block/elevator.c | 5 +++++ block/elevator.h | 1 + block/genhd.c | 1 + include/linux/blkdev.h | 2 ++ 7 files changed, 20 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index cc6f59836dcd..0cb1e9873aab 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -7211,7 +7211,7 @@ static void bfq_exit_queue(struct elevator_queue *e) blk_stat_disable_accounting(bfqd->queue); blk_queue_flag_clear(QUEUE_FLAG_DISABLE_WBT_DEF, bfqd->queue); - wbt_enable_default(bfqd->queue->disk); + set_bit(ELEVATOR_FLAG_ENABLE_WBT_ON_EXIT, &e->flags); kfree(bfqd); } diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 741e607dfab6..01e0ead13278 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -560,7 +560,7 @@ static ssize_t queue_wb_lat_show(struct gendisk *disk, char *page) ssize_t ret; struct request_queue *q = disk->queue; - mutex_lock(&q->elevator_lock); + mutex_lock(&disk->rqos_state_mutex); if (!wbt_rq_qos(q)) { ret = -EINVAL; goto out; @@ -573,7 +573,7 @@ static ssize_t queue_wb_lat_show(struct gendisk *disk, char *page) ret = sysfs_emit(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000)); out: - mutex_unlock(&q->elevator_lock); + mutex_unlock(&disk->rqos_state_mutex); return ret; } @@ -593,7 +593,6 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page, return -EINVAL; memflags = blk_mq_freeze_queue(q); - mutex_lock(&q->elevator_lock); rqos = wbt_rq_qos(q); if (!rqos) { @@ -618,11 +617,12 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page, */ blk_mq_quiesce_queue(q); + mutex_lock(&disk->rqos_state_mutex); wbt_set_min_lat(q, val); + mutex_unlock(&disk->rqos_state_mutex); blk_mq_unquiesce_queue(q); out: - mutex_unlock(&q->elevator_lock); blk_mq_unfreeze_queue(q, memflags); return ret; @@ -871,9 +871,7 @@ int blk_register_queue(struct gendisk *disk) if (queue_is_mq(q)) elevator_set_default(q); - mutex_lock(&q->elevator_lock); wbt_enable_default(disk); - mutex_unlock(&q->elevator_lock); blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 29cd2e33666f..74ae7131ada9 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -704,6 +704,8 @@ void wbt_enable_default(struct gendisk *disk) struct rq_qos *rqos; bool enable = IS_ENABLED(CONFIG_BLK_WBT_MQ); + mutex_lock(&disk->rqos_state_mutex); + if (blk_queue_disable_wbt(q)) enable = false; @@ -712,8 +714,10 @@ void wbt_enable_default(struct gendisk *disk) if (rqos) { if (enable && RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT) RQWB(rqos)->enable_state = WBT_STATE_ON_DEFAULT; + mutex_unlock(&disk->rqos_state_mutex); return; } + mutex_unlock(&disk->rqos_state_mutex); /* Queue not registered? Maybe shutting down... */ if (!blk_queue_registered(q)) @@ -773,11 +777,13 @@ void wbt_disable_default(struct gendisk *disk) struct rq_wb *rwb; if (!rqos) return; + mutex_lock(&disk->rqos_state_mutex); rwb = RQWB(rqos); if (rwb->enable_state == WBT_STATE_ON_DEFAULT) { blk_stat_deactivate(rwb->cb); rwb->enable_state = WBT_STATE_OFF_DEFAULT; } + mutex_unlock(&disk->rqos_state_mutex); } EXPORT_SYMBOL_GPL(wbt_disable_default); diff --git a/block/elevator.c b/block/elevator.c index 8578b969e173..f8d72bd20610 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -637,8 +637,13 @@ static int elevator_change_done(struct request_queue *q, int ret = 0; if (ctx->old) { + bool enable_wbt = test_bit(ELEVATOR_FLAG_ENABLE_WBT_ON_EXIT, + &ctx->old->flags); + elv_unregister_queue(q, ctx->old); kobject_put(&ctx->old->kobj); + if (enable_wbt) + wbt_enable_default(q->disk); } if (ctx->new) { ret = elv_register_queue(q, ctx->new, !ctx->no_uevent); diff --git a/block/elevator.h b/block/elevator.h index 76a90a1b7ed6..a07ce773a38f 100644 --- a/block/elevator.h +++ b/block/elevator.h @@ -122,6 +122,7 @@ struct elevator_queue #define ELEVATOR_FLAG_REGISTERED 0 #define ELEVATOR_FLAG_DYING 1 +#define ELEVATOR_FLAG_ENABLE_WBT_ON_EXIT 2 /* * block elevator interface diff --git a/block/genhd.c b/block/genhd.c index a8cb5607b6e3..9c7c657380db 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1470,6 +1470,7 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED INIT_LIST_HEAD(&disk->slave_bdevs); #endif + mutex_init(&disk->rqos_state_mutex); return disk; out_erase_part0: diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 3aa1fd637d57..94323f303b37 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -218,6 +218,8 @@ struct gendisk { * devices that do not have multiple independent access ranges. */ struct blk_independent_access_ranges *ia_ranges; + + struct mutex rqos_state_mutex; /* rqos state change mutex */ }; /** -- cgit v1.2.3 From 732f25a2895a8c1c54fb56544f0b1e23770ef4d7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 May 2025 17:47:22 +0530 Subject: fs: add a write stream field to the kiocb Prepare for io_uring passthrough of write streams. The write stream field in the kiocb structure fits into an existing 2-byte hole, so its size is not changed. Reviewed-by: Hannes Reinecke Reviewed-by: Nitesh Shetty Signed-off-by: Christoph Hellwig Signed-off-by: Keith Busch Signed-off-by: Kanchan Joshi Link: https://lore.kernel.org/r/20250506121732.8211-2-joshi.k@samsung.com Signed-off-by: Jens Axboe --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 016b0fe1536e..d5988867fe31 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -408,6 +408,7 @@ struct kiocb { void *private; int ki_flags; u16 ki_ioprio; /* See linux/ioprio.h */ + u8 ki_write_stream; union { /* * Only used for async buffered reads, where it denotes the -- cgit v1.2.3 From 5006f85ea23ea0bda9a8e31fdda126f4fca48f20 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 May 2025 17:47:23 +0530 Subject: block: add a bi_write_stream field Add the ability to pass a write stream for placement control in the bio. The new field fits in an existing hole, so does not change the size of the struct. Reviewed-by: Hannes Reinecke Reviewed-by: Nitesh Shetty Signed-off-by: Christoph Hellwig Signed-off-by: Keith Busch Signed-off-by: Kanchan Joshi Link: https://lore.kernel.org/r/20250506121732.8211-3-joshi.k@samsung.com Signed-off-by: Jens Axboe --- block/bio.c | 2 ++ block/blk-crypto-fallback.c | 1 + block/blk-merge.c | 4 ++++ include/linux/blk_types.h | 1 + 4 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/block/bio.c b/block/bio.c index 4e6c85a33d74..1e42aefc7377 100644 --- a/block/bio.c +++ b/block/bio.c @@ -251,6 +251,7 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table, bio->bi_flags = 0; bio->bi_ioprio = 0; bio->bi_write_hint = 0; + bio->bi_write_stream = 0; bio->bi_status = 0; bio->bi_iter.bi_sector = 0; bio->bi_iter.bi_size = 0; @@ -827,6 +828,7 @@ static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp) bio_set_flag(bio, BIO_CLONED); bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_write_hint = bio_src->bi_write_hint; + bio->bi_write_stream = bio_src->bi_write_stream; bio->bi_iter = bio_src->bi_iter; if (bio->bi_bdev) { diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index f154be0b575a..005c9157ffb3 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -173,6 +173,7 @@ static struct bio *blk_crypto_fallback_clone_bio(struct bio *bio_src) bio_set_flag(bio, BIO_REMAPPED); bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_write_hint = bio_src->bi_write_hint; + bio->bi_write_stream = bio_src->bi_write_stream; bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; diff --git a/block/blk-merge.c b/block/blk-merge.c index fdd4efb54c6c..782308b73b53 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -832,6 +832,8 @@ static struct request *attempt_merge(struct request_queue *q, if (req->bio->bi_write_hint != next->bio->bi_write_hint) return NULL; + if (req->bio->bi_write_stream != next->bio->bi_write_stream) + return NULL; if (req->bio->bi_ioprio != next->bio->bi_ioprio) return NULL; if (!blk_atomic_write_mergeable_rqs(req, next)) @@ -953,6 +955,8 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) return false; if (rq->bio->bi_write_hint != bio->bi_write_hint) return false; + if (rq->bio->bi_write_stream != bio->bi_write_stream) + return false; if (rq->bio->bi_ioprio != bio->bi_ioprio) return false; if (blk_atomic_write_mergeable_rq_bio(rq, bio) == false) diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 5a46067e85b1..f38425338c3f 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -220,6 +220,7 @@ struct bio { unsigned short bi_flags; /* BIO_* below */ unsigned short bi_ioprio; enum rw_hint bi_write_hint; + u8 bi_write_stream; blk_status_t bi_status; atomic_t __bi_remaining; -- cgit v1.2.3 From d2f526ba27d29c442542f7c5df0a86ef0b576716 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 6 May 2025 17:47:24 +0530 Subject: block: introduce max_write_streams queue limit Drivers with hardware that support write streams need a way to export how many are available so applications can generically query this. Reviewed-by: Hannes Reinecke Reviewed-by: Nitesh Shetty Signed-off-by: Keith Busch [hch: renamed hints to streams, removed stacking] Signed-off-by: Christoph Hellwig Signed-off-by: Kanchan Joshi Link: https://lore.kernel.org/r/20250506121732.8211-4-joshi.k@samsung.com Signed-off-by: Jens Axboe --- Documentation/ABI/stable/sysfs-block | 7 +++++++ block/blk-sysfs.c | 3 +++ include/linux/blkdev.h | 9 +++++++++ 3 files changed, 19 insertions(+) (limited to 'include/linux') diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block index 11545c9e2e93..8bbe1eca28df 100644 --- a/Documentation/ABI/stable/sysfs-block +++ b/Documentation/ABI/stable/sysfs-block @@ -547,6 +547,13 @@ Description: [RO] Maximum size in bytes of a single element in a DMA scatter/gather list. +What: /sys/block//queue/max_write_streams +Date: November 2024 +Contact: linux-block@vger.kernel.org +Description: + [RO] Maximum number of write streams supported, 0 if not + supported. If supported, valid values are 1 through + max_write_streams, inclusive. What: /sys/block//queue/max_segments Date: March 2010 diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 01e0ead13278..5934150d4204 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -134,6 +134,7 @@ QUEUE_SYSFS_LIMIT_SHOW(max_segments) QUEUE_SYSFS_LIMIT_SHOW(max_discard_segments) QUEUE_SYSFS_LIMIT_SHOW(max_integrity_segments) QUEUE_SYSFS_LIMIT_SHOW(max_segment_size) +QUEUE_SYSFS_LIMIT_SHOW(max_write_streams) QUEUE_SYSFS_LIMIT_SHOW(logical_block_size) QUEUE_SYSFS_LIMIT_SHOW(physical_block_size) QUEUE_SYSFS_LIMIT_SHOW(chunk_sectors) @@ -488,6 +489,7 @@ QUEUE_LIM_RO_ENTRY(queue_max_hw_sectors, "max_hw_sectors_kb"); QUEUE_LIM_RO_ENTRY(queue_max_segments, "max_segments"); QUEUE_LIM_RO_ENTRY(queue_max_integrity_segments, "max_integrity_segments"); QUEUE_LIM_RO_ENTRY(queue_max_segment_size, "max_segment_size"); +QUEUE_LIM_RO_ENTRY(queue_max_write_streams, "max_write_streams"); QUEUE_RW_ENTRY(elv_iosched, "scheduler"); QUEUE_LIM_RO_ENTRY(queue_logical_block_size, "logical_block_size"); @@ -642,6 +644,7 @@ static struct attribute *queue_attrs[] = { &queue_max_discard_segments_entry.attr, &queue_max_integrity_segments_entry.attr, &queue_max_segment_size_entry.attr, + &queue_max_write_streams_entry.attr, &queue_hw_sector_size_entry.attr, &queue_logical_block_size_entry.attr, &queue_physical_block_size_entry.attr, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 94323f303b37..13e1426ad38d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -404,6 +404,8 @@ struct queue_limits { unsigned short max_integrity_segments; unsigned short max_discard_segments; + unsigned short max_write_streams; + unsigned int max_open_zones; unsigned int max_active_zones; @@ -1270,6 +1272,13 @@ static inline unsigned int bdev_max_segments(struct block_device *bdev) return queue_max_segments(bdev_get_queue(bdev)); } +static inline unsigned short bdev_max_write_streams(struct block_device *bdev) +{ + if (bdev_is_partition(bdev)) + return 0; + return bdev_limits(bdev)->max_write_streams; +} + static inline unsigned queue_logical_block_size(const struct request_queue *q) { return q->limits.logical_block_size; -- cgit v1.2.3 From c23acfac10786ac5062a0615e23e68b913ac8da0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 May 2025 17:47:25 +0530 Subject: block: introduce a write_stream_granularity queue limit Export the granularity that write streams should be discarded with, as it is essential for making good use of them. Reviewed-by: Hannes Reinecke Reviewed-by: Nitesh Shetty Signed-off-by: Christoph Hellwig Signed-off-by: Keith Busch Signed-off-by: Kanchan Joshi Link: https://lore.kernel.org/r/20250506121732.8211-5-joshi.k@samsung.com Signed-off-by: Jens Axboe --- Documentation/ABI/stable/sysfs-block | 8 ++++++++ block/blk-sysfs.c | 3 +++ include/linux/blkdev.h | 1 + 3 files changed, 12 insertions(+) (limited to 'include/linux') diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block index 8bbe1eca28df..4ba771b56b3b 100644 --- a/Documentation/ABI/stable/sysfs-block +++ b/Documentation/ABI/stable/sysfs-block @@ -555,6 +555,14 @@ Description: supported. If supported, valid values are 1 through max_write_streams, inclusive. +What: /sys/block//queue/write_stream_granularity +Date: November 2024 +Contact: linux-block@vger.kernel.org +Description: + [RO] Granularity of a write stream in bytes. The granularity + of a write stream is the size that should be discarded or + overwritten together to avoid write amplification in the device. + What: /sys/block//queue/max_segments Date: March 2010 Contact: linux-block@vger.kernel.org diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 5934150d4204..386374ff655b 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -135,6 +135,7 @@ QUEUE_SYSFS_LIMIT_SHOW(max_discard_segments) QUEUE_SYSFS_LIMIT_SHOW(max_integrity_segments) QUEUE_SYSFS_LIMIT_SHOW(max_segment_size) QUEUE_SYSFS_LIMIT_SHOW(max_write_streams) +QUEUE_SYSFS_LIMIT_SHOW(write_stream_granularity) QUEUE_SYSFS_LIMIT_SHOW(logical_block_size) QUEUE_SYSFS_LIMIT_SHOW(physical_block_size) QUEUE_SYSFS_LIMIT_SHOW(chunk_sectors) @@ -490,6 +491,7 @@ QUEUE_LIM_RO_ENTRY(queue_max_segments, "max_segments"); QUEUE_LIM_RO_ENTRY(queue_max_integrity_segments, "max_integrity_segments"); QUEUE_LIM_RO_ENTRY(queue_max_segment_size, "max_segment_size"); QUEUE_LIM_RO_ENTRY(queue_max_write_streams, "max_write_streams"); +QUEUE_LIM_RO_ENTRY(queue_write_stream_granularity, "write_stream_granularity"); QUEUE_RW_ENTRY(elv_iosched, "scheduler"); QUEUE_LIM_RO_ENTRY(queue_logical_block_size, "logical_block_size"); @@ -645,6 +647,7 @@ static struct attribute *queue_attrs[] = { &queue_max_integrity_segments_entry.attr, &queue_max_segment_size_entry.attr, &queue_max_write_streams_entry.attr, + &queue_write_stream_granularity_entry.attr, &queue_hw_sector_size_entry.attr, &queue_logical_block_size_entry.attr, &queue_physical_block_size_entry.attr, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 13e1426ad38d..52b7a27c46e8 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -405,6 +405,7 @@ struct queue_limits { unsigned short max_discard_segments; unsigned short max_write_streams; + unsigned int write_stream_granularity; unsigned int max_open_zones; unsigned int max_active_zones; -- cgit v1.2.3 From ee203d3d86113559b77b1723e0d10909ebbd66ad Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 May 2025 17:47:30 +0530 Subject: nvme: add FDP definitions Add the config feature result, config log page, and management receive commands needed for FDP. Partially based on a patch from Kanchan Joshi . Reviewed-by: Hannes Reinecke Reviewed-by: Nitesh Shetty Signed-off-by: Christoph Hellwig Signed-off-by: Keith Busch Signed-off-by: Kanchan Joshi Link: https://lore.kernel.org/r/20250506121732.8211-10-joshi.k@samsung.com Signed-off-by: Jens Axboe --- include/linux/nvme.h | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 2479ed10f53e..51308f65b72f 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -303,6 +303,7 @@ enum nvme_ctrl_attr { NVME_CTRL_ATTR_TBKAS = (1 << 6), NVME_CTRL_ATTR_ELBAS = (1 << 15), NVME_CTRL_ATTR_RHII = (1 << 18), + NVME_CTRL_ATTR_FDPS = (1 << 19), }; struct nvme_id_ctrl { @@ -689,6 +690,44 @@ struct nvme_rotational_media_log { __u8 rsvd24[488]; }; +struct nvme_fdp_config { + __u8 flags; +#define FDPCFG_FDPE (1U << 0) + __u8 fdpcidx; + __le16 reserved; +}; + +struct nvme_fdp_ruh_desc { + __u8 ruht; + __u8 reserved[3]; +}; + +struct nvme_fdp_config_desc { + __le16 dsze; + __u8 fdpa; + __u8 vss; + __le32 nrg; + __le16 nruh; + __le16 maxpids; + __le32 nns; + __le64 runs; + __le32 erutl; + __u8 rsvd28[36]; + struct nvme_fdp_ruh_desc ruhs[]; +}; + +struct nvme_fdp_config_log { + __le16 numfdpc; + __u8 ver; + __u8 rsvd3; + __le32 sze; + __u8 rsvd8[8]; + /* + * This is followed by variable number of nvme_fdp_config_desc + * structures, but sparse doesn't like nested variable sized arrays. + */ +}; + struct nvme_smart_log { __u8 critical_warning; __u8 temperature[2]; @@ -915,6 +954,7 @@ enum nvme_opcode { nvme_cmd_resv_register = 0x0d, nvme_cmd_resv_report = 0x0e, nvme_cmd_resv_acquire = 0x11, + nvme_cmd_io_mgmt_recv = 0x12, nvme_cmd_resv_release = 0x15, nvme_cmd_zone_mgmt_send = 0x79, nvme_cmd_zone_mgmt_recv = 0x7a, @@ -936,6 +976,7 @@ enum nvme_opcode { nvme_opcode_name(nvme_cmd_resv_register), \ nvme_opcode_name(nvme_cmd_resv_report), \ nvme_opcode_name(nvme_cmd_resv_acquire), \ + nvme_opcode_name(nvme_cmd_io_mgmt_recv), \ nvme_opcode_name(nvme_cmd_resv_release), \ nvme_opcode_name(nvme_cmd_zone_mgmt_send), \ nvme_opcode_name(nvme_cmd_zone_mgmt_recv), \ @@ -1087,6 +1128,7 @@ enum { NVME_RW_PRINFO_PRCHK_GUARD = 1 << 12, NVME_RW_PRINFO_PRACT = 1 << 13, NVME_RW_DTYPE_STREAMS = 1 << 4, + NVME_RW_DTYPE_DPLCMT = 2 << 4, NVME_WZ_DEAC = 1 << 9, }; @@ -1174,6 +1216,38 @@ struct nvme_zone_mgmt_recv_cmd { __le32 cdw14[2]; }; +struct nvme_io_mgmt_recv_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __le64 rsvd2[2]; + union nvme_data_ptr dptr; + __u8 mo; + __u8 rsvd11; + __u16 mos; + __le32 numd; + __le32 cdw12[4]; +}; + +enum { + NVME_IO_MGMT_RECV_MO_RUHS = 1, +}; + +struct nvme_fdp_ruh_status_desc { + __le16 pid; + __le16 ruhid; + __le32 earutr; + __le64 ruamw; + __u8 reserved[16]; +}; + +struct nvme_fdp_ruh_status { + __u8 rsvd0[14]; + __le16 nruhsd; + struct nvme_fdp_ruh_status_desc ruhsd[]; +}; + enum { NVME_ZRA_ZONE_REPORT = 0, NVME_ZRASF_ZONE_REPORT_ALL = 0, @@ -1309,6 +1383,7 @@ enum { NVME_FEAT_PLM_WINDOW = 0x14, NVME_FEAT_HOST_BEHAVIOR = 0x16, NVME_FEAT_SANITIZE = 0x17, + NVME_FEAT_FDP = 0x1d, NVME_FEAT_SW_PROGRESS = 0x80, NVME_FEAT_HOST_ID = 0x81, NVME_FEAT_RESV_MASK = 0x82, @@ -1329,6 +1404,7 @@ enum { NVME_LOG_ANA = 0x0c, NVME_LOG_FEATURES = 0x12, NVME_LOG_RMI = 0x16, + NVME_LOG_FDP_CONFIGS = 0x20, NVME_LOG_DISC = 0x70, NVME_LOG_RESERVATION = 0x80, NVME_FWACT_REPL = (0 << 3), @@ -1923,6 +1999,7 @@ struct nvme_command { struct nvmf_auth_receive_command auth_receive; struct nvme_dbbuf dbbuf; struct nvme_directive_cmd directive; + struct nvme_io_mgmt_recv_cmd imr; }; }; -- cgit v1.2.3 From f31acff017b1d80e649f674450d3b64d3265848c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 7 May 2025 17:25:37 +0800 Subject: block: fix warning on 'make htmldocs' Fix the following warning when running 'make htmldocs': +WARNING: include/linux/blk-mq.h:532 struct member 'update_nr_hwq_lock' not described in 'blk_mq_tag_set' Fixes: 98e68f67020c ("block: prevent adding/deleting disk during updating nr_hw_queues") Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/all/20250507163220.00141d77@canb.auug.org.au/ Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250507092537.3009112-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index ef84d53095a6..906bf330ffb5 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -507,6 +507,9 @@ enum hctx_type { * request_queue.tag_set_list. * @srcu: Use as lock when type of the request queue is blocking * (BLK_MQ_F_BLOCKING). + * @update_nr_hwq_lock: + * Synchronize updating nr_hw_queues with add/del disk & + * switching elevator. */ struct blk_mq_tag_set { const struct blk_mq_ops *ops; -- cgit v1.2.3 From 850e210d5ad21b94b55b97d4d82b4cdeb0bb05df Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 7 May 2025 14:04:25 +0200 Subject: block: add a bio_add_virt_nofail helper Add a helper to add a directly mapped kernel virtual address to a bio so that callers don't have to convert to pages or folios. For now only the _nofail variant is provided as that is what all the obvious callers want. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20250507120451.4000627-2-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 16 ++++++++++++++++ include/linux/bio.h | 2 ++ 2 files changed, 18 insertions(+) (limited to 'include/linux') diff --git a/block/bio.c b/block/bio.c index 1e42aefc7377..bd3d048d0a72 100644 --- a/block/bio.c +++ b/block/bio.c @@ -991,6 +991,22 @@ void __bio_add_page(struct bio *bio, struct page *page, } EXPORT_SYMBOL_GPL(__bio_add_page); +/** + * bio_add_virt_nofail - add data in the direct kernel mapping to a bio + * @bio: destination bio + * @vaddr: data to add + * @len: length of the data to add, may cross pages + * + * Add the data at @vaddr to @bio. The caller must have ensure a segment + * is available for the added data. No merging into an existing segment + * will be performed. + */ +void bio_add_virt_nofail(struct bio *bio, void *vaddr, unsigned len) +{ + __bio_add_page(bio, virt_to_page(vaddr), len, offset_in_page(vaddr)); +} +EXPORT_SYMBOL_GPL(bio_add_virt_nofail); + /** * bio_add_page - attempt to add page(s) to bio * @bio: destination bio diff --git a/include/linux/bio.h b/include/linux/bio.h index cafc7c215de8..acca7464080c 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -417,6 +417,8 @@ void __bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int off); void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len, size_t off); +void bio_add_virt_nofail(struct bio *bio, void *vaddr, unsigned len); + int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter); void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter); void __bio_release_pages(struct bio *bio, bool mark_dirty); -- cgit v1.2.3 From 10b1e59cdadabff16fc78eb2ca4c341b1502293c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 7 May 2025 14:04:26 +0200 Subject: block: add a bdev_rw_virt helper Add a helper to perform synchronous I/O on a kernel direct map range. Currently this is implemented in various places in usually not very efficient ways, so provide a generic helper instead. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20250507120451.4000627-3-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 30 ++++++++++++++++++++++++++++++ include/linux/bio.h | 5 ++++- 2 files changed, 34 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/block/bio.c b/block/bio.c index bd3d048d0a72..26782ff85dbf 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1319,6 +1319,36 @@ int submit_bio_wait(struct bio *bio) } EXPORT_SYMBOL(submit_bio_wait); +/** + * bdev_rw_virt - synchronously read into / write from kernel mapping + * @bdev: block device to access + * @sector: sector to access + * @data: data to read/write + * @len: length in byte to read/write + * @op: operation (e.g. REQ_OP_READ/REQ_OP_WRITE) + * + * Performs synchronous I/O to @bdev for @data/@len. @data must be in + * the kernel direct mapping and not a vmalloc address. + */ +int bdev_rw_virt(struct block_device *bdev, sector_t sector, void *data, + size_t len, enum req_op op) +{ + struct bio_vec bv; + struct bio bio; + int error; + + if (WARN_ON_ONCE(is_vmalloc_addr(data))) + return -EIO; + + bio_init(&bio, bdev, &bv, 1, op); + bio.bi_iter.bi_sector = sector; + bio_add_virt_nofail(&bio, data, len); + error = submit_bio_wait(&bio); + bio_uninit(&bio); + return error; +} +EXPORT_SYMBOL_GPL(bdev_rw_virt); + static void bio_wait_end_io(struct bio *bio) { complete(bio->bi_private); diff --git a/include/linux/bio.h b/include/linux/bio.h index acca7464080c..ad54e6af20dc 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -402,7 +402,6 @@ static inline int bio_iov_vecs_to_alloc(struct iov_iter *iter, int max_segs) struct request_queue; -extern int submit_bio_wait(struct bio *bio); void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table, unsigned short max_vecs, blk_opf_t opf); extern void bio_uninit(struct bio *); @@ -419,6 +418,10 @@ void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len, size_t off); void bio_add_virt_nofail(struct bio *bio, void *vaddr, unsigned len); +int submit_bio_wait(struct bio *bio); +int bdev_rw_virt(struct block_device *bdev, sector_t sector, void *data, + size_t len, enum req_op op); + int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter); void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter); void __bio_release_pages(struct bio *bio, bool mark_dirty); -- cgit v1.2.3 From 75f88659e47dc570bdebddf77d7a3cd5f0845612 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 7 May 2025 14:04:27 +0200 Subject: block: add a bio_add_max_vecs helper Add a helper to check how many bio_vecs are needed to add a kernel virtual address range to a bio, accounting for the always contiguous direct mapping and vmalloc mappings that usually need a bio_vec per page sized chunk. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20250507120451.4000627-4-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bio.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index ad54e6af20dc..128b1c6ca648 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -418,6 +418,21 @@ void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len, size_t off); void bio_add_virt_nofail(struct bio *bio, void *vaddr, unsigned len); +/** + * bio_add_max_vecs - number of bio_vecs needed to add data to a bio + * @kaddr: kernel virtual address to add + * @len: length in bytes to add + * + * Calculate how many bio_vecs need to be allocated to add the kernel virtual + * address range in [@kaddr:@len] in the worse case. + */ +static inline unsigned int bio_add_max_vecs(void *kaddr, unsigned int len) +{ + if (is_vmalloc_addr(kaddr)) + return DIV_ROUND_UP(offset_in_page(kaddr) + len, PAGE_SIZE); + return 1; +} + int submit_bio_wait(struct bio *bio); int bdev_rw_virt(struct block_device *bdev, sector_t sector, void *data, size_t len, enum req_op op); -- cgit v1.2.3 From 8dd16f5e34693aa0aa6a4ed48427045008097e64 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 7 May 2025 14:04:28 +0200 Subject: block: add a bio_add_vmalloc helpers Add a helper to add a vmalloc region to a bio, abstracting away the vmalloc addresses from the underlying pages and another one wrapping it for the simple case where all data fits into a single bio. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20250507120451.4000627-5-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/bio.h | 3 +++ 2 files changed, 58 insertions(+) (limited to 'include/linux') diff --git a/block/bio.c b/block/bio.c index 26782ff85dbf..988f5de3c02c 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1076,6 +1076,61 @@ bool bio_add_folio(struct bio *bio, struct folio *folio, size_t len, } EXPORT_SYMBOL(bio_add_folio); +/** + * bio_add_vmalloc_chunk - add a vmalloc chunk to a bio + * @bio: destination bio + * @vaddr: vmalloc address to add + * @len: total length in bytes of the data to add + * + * Add data starting at @vaddr to @bio and return how many bytes were added. + * This may be less than the amount originally asked. Returns 0 if no data + * could be added to @bio. + * + * This helper calls flush_kernel_vmap_range() for the range added. For reads + * the caller still needs to manually call invalidate_kernel_vmap_range() in + * the completion handler. + */ +unsigned int bio_add_vmalloc_chunk(struct bio *bio, void *vaddr, unsigned len) +{ + unsigned int offset = offset_in_page(vaddr); + + len = min(len, PAGE_SIZE - offset); + if (bio_add_page(bio, vmalloc_to_page(vaddr), len, offset) < len) + return 0; + if (op_is_write(bio_op(bio))) + flush_kernel_vmap_range(vaddr, len); + return len; +} +EXPORT_SYMBOL_GPL(bio_add_vmalloc_chunk); + +/** + * bio_add_vmalloc - add a vmalloc region to a bio + * @bio: destination bio + * @vaddr: vmalloc address to add + * @len: total length in bytes of the data to add + * + * Add data starting at @vaddr to @bio. Return %true on success or %false if + * @bio does not have enough space for the payload. + * + * This helper calls flush_kernel_vmap_range() for the range added. For reads + * the caller still needs to manually call invalidate_kernel_vmap_range() in + * the completion handler. + */ +bool bio_add_vmalloc(struct bio *bio, void *vaddr, unsigned int len) +{ + do { + unsigned int added = bio_add_vmalloc_chunk(bio, vaddr, len); + + if (!added) + return false; + vaddr += added; + len -= added; + } while (len); + + return true; +} +EXPORT_SYMBOL_GPL(bio_add_vmalloc); + void __bio_release_pages(struct bio *bio, bool mark_dirty) { struct folio_iter fi; diff --git a/include/linux/bio.h b/include/linux/bio.h index 128b1c6ca648..5d880903bcc5 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -433,6 +433,9 @@ static inline unsigned int bio_add_max_vecs(void *kaddr, unsigned int len) return 1; } +unsigned int bio_add_vmalloc_chunk(struct bio *bio, void *vaddr, unsigned len); +bool bio_add_vmalloc(struct bio *bio, void *vaddr, unsigned int len); + int submit_bio_wait(struct bio *bio); int bdev_rw_virt(struct block_device *bdev, sector_t sector, void *data, size_t len, enum req_op op); -- cgit v1.2.3 From af78428ed3f3eebad7be9d0463251046e9582cf6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 7 May 2025 14:04:29 +0200 Subject: block: remove the q argument from blk_rq_map_kern Remove the q argument from blk_rq_map_kern and the internal helpers called by it as the queue can trivially be derived from the request. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20250507120451.4000627-6-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-map.c | 22 +++++++++------------- drivers/block/pktcdvd.c | 2 +- drivers/block/ublk_drv.c | 3 +-- drivers/block/virtio_blk.c | 4 ++-- drivers/nvme/host/core.c | 2 +- drivers/scsi/scsi_ioctl.c | 2 +- drivers/scsi/scsi_lib.c | 3 +-- include/linux/blk-mq.h | 4 ++-- 8 files changed, 18 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/block/blk-map.c b/block/blk-map.c index cadbf11b50a3..9002cfe855b9 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -319,7 +319,6 @@ static void bio_map_kern_endio(struct bio *bio) /** * bio_map_kern - map kernel address into bio - * @q: the struct request_queue for the bio * @data: pointer to buffer to map * @len: length in bytes * @gfp_mask: allocation flags for bio allocation @@ -327,8 +326,7 @@ static void bio_map_kern_endio(struct bio *bio) * Map the kernel address into a bio suitable for io to a block * device. Returns an error pointer in case of error. */ -static struct bio *bio_map_kern(struct request_queue *q, void *data, - unsigned int len, gfp_t gfp_mask) +static struct bio *bio_map_kern(void *data, unsigned int len, gfp_t gfp_mask) { unsigned long kaddr = (unsigned long)data; unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; @@ -402,7 +400,6 @@ static void bio_copy_kern_endio_read(struct bio *bio) /** * bio_copy_kern - copy kernel address into bio - * @q: the struct request_queue for the bio * @data: pointer to buffer to copy * @len: length in bytes * @gfp_mask: allocation flags for bio and page allocation @@ -411,8 +408,8 @@ static void bio_copy_kern_endio_read(struct bio *bio) * copy the kernel address into a bio suitable for io to a block * device. Returns an error pointer in case of error. */ -static struct bio *bio_copy_kern(struct request_queue *q, void *data, - unsigned int len, gfp_t gfp_mask, int reading) +static struct bio *bio_copy_kern(void *data, unsigned int len, gfp_t gfp_mask, + int reading) { unsigned long kaddr = (unsigned long)data; unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; @@ -687,7 +684,6 @@ EXPORT_SYMBOL(blk_rq_unmap_user); /** * blk_rq_map_kern - map kernel data to a request, for passthrough requests - * @q: request queue where request should be inserted * @rq: request to fill * @kbuf: the kernel buffer * @len: length of user data @@ -698,23 +694,23 @@ EXPORT_SYMBOL(blk_rq_unmap_user); * buffer is used. Can be called multiple times to append multiple * buffers. */ -int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, - unsigned int len, gfp_t gfp_mask) +int blk_rq_map_kern(struct request *rq, void *kbuf, unsigned int len, + gfp_t gfp_mask) { int reading = rq_data_dir(rq) == READ; unsigned long addr = (unsigned long) kbuf; struct bio *bio; int ret; - if (len > (queue_max_hw_sectors(q) << 9)) + if (len > (queue_max_hw_sectors(rq->q) << SECTOR_SHIFT)) return -EINVAL; if (!len || !kbuf) return -EINVAL; - if (!blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf)) - bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading); + if (!blk_rq_aligned(rq->q, addr, len) || object_is_on_stack(kbuf)) + bio = bio_copy_kern(kbuf, len, gfp_mask, reading); else - bio = bio_map_kern(q, kbuf, len, gfp_mask); + bio = bio_map_kern(kbuf, len, gfp_mask); if (IS_ERR(bio)) return PTR_ERR(bio); diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 65b96c083b3c..d5cc7bd2875c 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -725,7 +725,7 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command * scmd = blk_mq_rq_to_pdu(rq); if (cgc->buflen) { - ret = blk_rq_map_kern(q, rq, cgc->buffer, cgc->buflen, + ret = blk_rq_map_kern(rq, cgc->buffer, cgc->buflen, GFP_NOIO); if (ret) goto out; diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 3650bab40dd0..cb612151e9a1 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -368,8 +368,7 @@ static int ublk_report_zones(struct gendisk *disk, sector_t sector, if (ret) goto free_req; - ret = blk_rq_map_kern(disk->queue, req, buffer, buffer_length, - GFP_KERNEL); + ret = blk_rq_map_kern(req, buffer, buffer_length, GFP_KERNEL); if (ret) goto erase_desc; diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 7cffea01d868..30bca8cb7106 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -571,7 +571,7 @@ static int virtblk_submit_zone_report(struct virtio_blk *vblk, vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_ZONE_REPORT); vbr->out_hdr.sector = cpu_to_virtio64(vblk->vdev, sector); - err = blk_rq_map_kern(q, req, report_buf, report_len, GFP_KERNEL); + err = blk_rq_map_kern(req, report_buf, report_len, GFP_KERNEL); if (err) goto out; @@ -817,7 +817,7 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str) vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_GET_ID); vbr->out_hdr.sector = 0; - err = blk_rq_map_kern(q, req, id_str, VIRTIO_BLK_ID_BYTES, GFP_KERNEL); + err = blk_rq_map_kern(req, id_str, VIRTIO_BLK_ID_BYTES, GFP_KERNEL); if (err) goto out; diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index a8444d1e8398..af871d268fcb 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1174,7 +1174,7 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, req->cmd_flags &= ~REQ_FAILFAST_DRIVER; if (buffer && bufflen) { - ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL); + ret = blk_rq_map_kern(req, buffer, bufflen, GFP_KERNEL); if (ret) goto out; } diff --git a/drivers/scsi/scsi_ioctl.c b/drivers/scsi/scsi_ioctl.c index 2fa45556e1ea..0ddc95bafc71 100644 --- a/drivers/scsi/scsi_ioctl.c +++ b/drivers/scsi/scsi_ioctl.c @@ -601,7 +601,7 @@ static int sg_scsi_ioctl(struct request_queue *q, bool open_for_write, } if (bytes) { - err = blk_rq_map_kern(q, rq, buffer, bytes, GFP_NOIO); + err = blk_rq_map_kern(rq, buffer, bytes, GFP_NOIO); if (err) goto error; } diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 39320dccbdd5..6f006cca0b5e 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -313,8 +313,7 @@ retry: return PTR_ERR(req); if (bufflen) { - ret = blk_rq_map_kern(sdev->request_queue, req, - buffer, bufflen, GFP_NOIO); + ret = blk_rq_map_kern(req, buffer, bufflen, GFP_NOIO); if (ret) goto out; } diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 906bf330ffb5..de8c85a03bb7 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -1037,8 +1037,8 @@ int blk_rq_map_user_io(struct request *, struct rq_map_data *, int blk_rq_map_user_iov(struct request_queue *, struct request *, struct rq_map_data *, const struct iov_iter *, gfp_t); int blk_rq_unmap_user(struct bio *); -int blk_rq_map_kern(struct request_queue *, struct request *, void *, - unsigned int, gfp_t); +int blk_rq_map_kern(struct request *rq, void *kbuf, unsigned int len, + gfp_t gfp); int blk_rq_append_bio(struct request *rq, struct bio *bio); void blk_execute_rq_nowait(struct request *rq, bool at_head); blk_status_t blk_execute_rq(struct request *rq, bool at_head); -- cgit v1.2.3 From f2987c5816bda01a8ffdd4eb5dfaaa2b41b67039 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 6 May 2025 20:48:59 +0800 Subject: block: export API to get the number of bdev inflight IO - rename part_in_{flight, flight_rw} to bdev_count_{inflight, inflight_rw} - export bdev_count_inflight, to fix a problem in mdraid that foreground IO can be starved by background sync IO in later patches Link: https://lore.kernel.org/linux-raid/20250506124903.2540268-6-yukuai1@huaweicloud.com Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke --- block/blk-core.c | 2 +- block/blk.h | 1 - block/genhd.c | 22 ++++++++++++++++------ include/linux/part_stat.h | 2 ++ 4 files changed, 19 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index e8cc270a453f..b862c66018f2 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1018,7 +1018,7 @@ again: stamp = READ_ONCE(part->bd_stamp); if (unlikely(time_after(now, stamp)) && likely(try_cmpxchg(&part->bd_stamp, &stamp, now)) && - (end || part_in_flight(part))) + (end || bdev_count_inflight(part))) __part_stat_add(part, io_ticks, now - stamp); if (bdev_is_partition(part)) { diff --git a/block/blk.h b/block/blk.h index 665b3d1fb504..21af4f0c4c00 100644 --- a/block/blk.h +++ b/block/blk.h @@ -419,7 +419,6 @@ void blk_apply_bdi_limits(struct backing_dev_info *bdi, int blk_dev_init(void); void update_io_ticks(struct block_device *part, unsigned long now, bool end); -unsigned int part_in_flight(struct block_device *part); static inline void req_set_nomerge(struct request_queue *q, struct request *req) { diff --git a/block/genhd.c b/block/genhd.c index 1c1a642b321e..8171a6bc3210 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -125,7 +125,7 @@ static void part_stat_read_all(struct block_device *part, } } -static void part_in_flight_rw(struct block_device *part, +static void bdev_count_inflight_rw(struct block_device *part, unsigned int inflight[2], bool mq_driver) { int cpu; @@ -147,14 +147,24 @@ static void part_in_flight_rw(struct block_device *part, inflight[WRITE] = 0; } -unsigned int part_in_flight(struct block_device *part) +/** + * bdev_count_inflight - get the number of inflight IOs for a block device. + * + * @part: the block device. + * + * Inflight here means started IO accounting, from bdev_start_io_acct() for + * bio-based block device, and from blk_account_io_start() for rq-based block + * device. + */ +unsigned int bdev_count_inflight(struct block_device *part) { unsigned int inflight[2] = {0}; - part_in_flight_rw(part, inflight, false); + bdev_count_inflight_rw(part, inflight, false); return inflight[READ] + inflight[WRITE]; } +EXPORT_SYMBOL_GPL(bdev_count_inflight); /* * Can be deleted altogether. Later. @@ -1052,7 +1062,7 @@ ssize_t part_stat_show(struct device *dev, struct disk_stats stat; unsigned int inflight; - inflight = part_in_flight(bdev); + inflight = bdev_count_inflight(bdev); if (inflight) { part_stat_lock(); update_io_ticks(bdev, jiffies, true); @@ -1101,7 +1111,7 @@ ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, struct request_queue *q = bdev_get_queue(bdev); unsigned int inflight[2] = {0}; - part_in_flight_rw(bdev, inflight, queue_is_mq(q)); + bdev_count_inflight_rw(bdev, inflight, queue_is_mq(q)); return sysfs_emit(buf, "%8u %8u\n", inflight[READ], inflight[WRITE]); } @@ -1356,7 +1366,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) if (bdev_is_partition(hd) && !bdev_nr_sectors(hd)) continue; - inflight = part_in_flight(hd); + inflight = bdev_count_inflight(hd); if (inflight) { part_stat_lock(); update_io_ticks(hd, jiffies, true); diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h index c5e9cac0575e..eeeff2a04529 100644 --- a/include/linux/part_stat.h +++ b/include/linux/part_stat.h @@ -79,4 +79,6 @@ static inline void part_stat_set_all(struct block_device *part, int value) #define part_stat_local_read_cpu(part, field, cpu) \ local_read(&(part_stat_get_cpu(part, field, cpu))) +unsigned int bdev_count_inflight(struct block_device *part); + #endif /* _LINUX_PART_STAT_H */ -- cgit v1.2.3 From 752d0464b78a5b28682256ed7a057106119e1d1a Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 6 May 2025 20:49:03 +0800 Subject: md: clean up accounting for issued sync IO It's no longer used and can be removed, also remove the field 'gendisk->sync_io'. Link: https://lore.kernel.org/linux-raid/20250506124903.2540268-10-yukuai1@huaweicloud.com Signed-off-by: Yu Kuai Reviewed-by: Xiao Ni --- drivers/md/md.h | 11 ----------- drivers/md/raid1.c | 3 --- drivers/md/raid10.c | 9 --------- drivers/md/raid5.c | 8 -------- include/linux/blkdev.h | 1 - 5 files changed, 32 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/md.h b/drivers/md/md.h index 1982f1f18627..d45a9e6ead80 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -717,17 +717,6 @@ static inline int mddev_trylock(struct mddev *mddev) } extern void mddev_unlock(struct mddev *mddev); -static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors) -{ - if (blk_queue_io_stat(bdev->bd_disk->queue)) - atomic_add(nr_sectors, &bdev->bd_disk->sync_io); -} - -static inline void md_sync_acct_bio(struct bio *bio, unsigned long nr_sectors) -{ - md_sync_acct(bio->bi_bdev, nr_sectors); -} - struct md_personality { struct md_submodule_head head; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index de9bccbe7337..657d481525be 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2382,7 +2382,6 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio) wbio->bi_end_io = end_sync_write; atomic_inc(&r1_bio->remaining); - md_sync_acct(conf->mirrors[i].rdev->bdev, bio_sectors(wbio)); submit_bio_noacct(wbio); } @@ -3055,7 +3054,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, bio = r1_bio->bios[i]; if (bio->bi_end_io == end_sync_read) { read_targets--; - md_sync_acct_bio(bio, nr_sectors); if (read_targets == 1) bio->bi_opf &= ~MD_FAILFAST; submit_bio_noacct(bio); @@ -3064,7 +3062,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, } else { atomic_set(&r1_bio->remaining, 1); bio = r1_bio->bios[r1_bio->read_disk]; - md_sync_acct_bio(bio, nr_sectors); if (read_targets == 1) bio->bi_opf &= ~MD_FAILFAST; submit_bio_noacct(bio); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index ba32bac975b8..dce06bf65016 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2426,7 +2426,6 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) atomic_inc(&conf->mirrors[d].rdev->nr_pending); atomic_inc(&r10_bio->remaining); - md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio)); if (test_bit(FailFast, &conf->mirrors[d].rdev->flags)) tbio->bi_opf |= MD_FAILFAST; @@ -2448,8 +2447,6 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) bio_copy_data(tbio, fbio); d = r10_bio->devs[i].devnum; atomic_inc(&r10_bio->remaining); - md_sync_acct(conf->mirrors[d].replacement->bdev, - bio_sectors(tbio)); submit_bio_noacct(tbio); } @@ -2583,13 +2580,10 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio) d = r10_bio->devs[1].devnum; if (wbio->bi_end_io) { atomic_inc(&conf->mirrors[d].rdev->nr_pending); - md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio)); submit_bio_noacct(wbio); } if (wbio2) { atomic_inc(&conf->mirrors[d].replacement->nr_pending); - md_sync_acct(conf->mirrors[d].replacement->bdev, - bio_sectors(wbio2)); submit_bio_noacct(wbio2); } } @@ -3757,7 +3751,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, r10_bio->sectors = nr_sectors; if (bio->bi_end_io == end_sync_read) { - md_sync_acct_bio(bio, nr_sectors); bio->bi_status = 0; submit_bio_noacct(bio); } @@ -4880,7 +4873,6 @@ read_more: r10_bio->sectors = nr_sectors; /* Now submit the read */ - md_sync_acct_bio(read_bio, r10_bio->sectors); atomic_inc(&r10_bio->remaining); read_bio->bi_next = NULL; submit_bio_noacct(read_bio); @@ -4940,7 +4932,6 @@ static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio) continue; atomic_inc(&rdev->nr_pending); - md_sync_acct_bio(b, r10_bio->sectors); atomic_inc(&r10_bio->remaining); b->bi_next = NULL; submit_bio_noacct(b); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 6389383166c0..ca5b0e8ba707 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1240,10 +1240,6 @@ again: } if (rdev) { - if (s->syncing || s->expanding || s->expanded - || s->replacing) - md_sync_acct(rdev->bdev, RAID5_STRIPE_SECTORS(conf)); - set_bit(STRIPE_IO_STARTED, &sh->state); bio_init(bi, rdev->bdev, &dev->vec, 1, op | op_flags); @@ -1300,10 +1296,6 @@ again: submit_bio_noacct(bi); } if (rrdev) { - if (s->syncing || s->expanding || s->expanded - || s->replacing) - md_sync_acct(rrdev->bdev, RAID5_STRIPE_SECTORS(conf)); - set_bit(STRIPE_IO_STARTED, &sh->state); bio_init(rbi, rrdev->bdev, &dev->rvec, 1, op | op_flags); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 52b7a27c46e8..ad75dee3b213 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -182,7 +182,6 @@ struct gendisk { struct list_head slave_bdevs; #endif struct timer_rand_state *random; - atomic_t sync_io; /* RAID */ struct disk_events *ev; #ifdef CONFIG_BLK_DEV_ZONED -- cgit v1.2.3 From c4da7bf54b1f76e7c5c8cc6d1c4db8b19af67c5d Mon Sep 17 00:00:00 2001 From: Zizhi Wo Date: Tue, 6 May 2025 10:09:31 +0800 Subject: blk-throttle: Introduce flag "BIO_TG_BPS_THROTTLED" Subsequent patches will split the single queue into separate bps and iops queues. To prevent IO that has already passed through the bps queue at a single tg level from being counted toward bps wait time again, we introduce "BIO_TG_BPS_THROTTLED" flag. Since throttle and QoS operate at different levels, we reuse the value as "BIO_QOS_THROTTLED". We set this flag when charge bps and clear it when charge iops, as the bio will move to the upper-level tg or be dispatched. This patch does not involve functional changes. Signed-off-by: Zizhi Wo Reviewed-by: Yu Kuai Signed-off-by: Zizhi Wo Link: https://lore.kernel.org/r/20250506020935.655574-5-wozizhi@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-throttle.c | 9 +++++++-- include/linux/blk_types.h | 8 ++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index fea09a91c20b..ee4eeee8f21f 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -792,12 +792,16 @@ static void throtl_charge_bps_bio(struct throtl_grp *tg, struct bio *bio) unsigned int bio_size = throtl_bio_data_size(bio); /* Charge the bio to the group */ - if (!bio_flagged(bio, BIO_BPS_THROTTLED)) + if (!bio_flagged(bio, BIO_BPS_THROTTLED) && + !bio_flagged(bio, BIO_TG_BPS_THROTTLED)) { + bio_set_flag(bio, BIO_TG_BPS_THROTTLED); tg->bytes_disp[bio_data_dir(bio)] += bio_size; + } } static void throtl_charge_iops_bio(struct throtl_grp *tg, struct bio *bio) { + bio_clear_flag(bio, BIO_TG_BPS_THROTTLED); tg->io_disp[bio_data_dir(bio)]++; } @@ -823,7 +827,8 @@ static unsigned long tg_dispatch_bps_time(struct throtl_grp *tg, struct bio *bio /* no need to throttle if this bio's bytes have been accounted */ if (bps_limit == U64_MAX || tg->flags & THROTL_TG_CANCELING || - bio_flagged(bio, BIO_BPS_THROTTLED)) + bio_flagged(bio, BIO_BPS_THROTTLED) || + bio_flagged(bio, BIO_TG_BPS_THROTTLED)) return 0; tg_update_slice(tg, rw); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index f38425338c3f..3d1577f07c1c 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -296,6 +296,14 @@ enum { * of this bio. */ BIO_CGROUP_ACCT, /* has been accounted to a cgroup */ BIO_QOS_THROTTLED, /* bio went through rq_qos throttle path */ + /* + * This bio has completed bps throttling at the single tg granularity, + * which is different from BIO_BPS_THROTTLED. When the bio is enqueued + * into the sq->queued of the upper tg, or is about to be dispatched, + * this flag needs to be cleared. Since blk-throttle and rq_qos are not + * on the same hierarchical level, reuse the value. + */ + BIO_TG_BPS_THROTTLED = BIO_QOS_THROTTLED, BIO_QOS_MERGED, /* but went through rq_qos merge path */ BIO_REMAPPED, BIO_ZONE_WRITE_PLUGGING, /* bio handled through zone write plugging */ -- cgit v1.2.3 From 1c9a93bf1d01729c54e9acbaa538db81f16563b2 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 25 Apr 2025 20:06:34 -0600 Subject: dmapool: add NUMA affinity support Introduce dma_pool_create_node(), like dma_pool_create() but taking an additional NUMA node argument. Allocate struct dma_pool on the desired node, and store the node on dma_pool for allocating struct dma_page. Make dma_pool_create() an alias for dma_pool_create_node() with node set to NUMA_NO_NODE. Signed-off-by: Keith Busch Signed-off-by: Caleb Sander Mateos Reviewed-by: Jens Axboe Reviewed-by: John Garry Reviewed-by: Sagi Grimberg Reviewed-by: Kanchan Joshi Signed-off-by: Christoph Hellwig --- include/linux/dmapool.h | 21 ++++++++++++++++----- mm/dmapool.c | 15 +++++++++------ 2 files changed, 25 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dmapool.h b/include/linux/dmapool.h index f632ecfb4238..06c4de602b2f 100644 --- a/include/linux/dmapool.h +++ b/include/linux/dmapool.h @@ -11,6 +11,7 @@ #ifndef LINUX_DMAPOOL_H #define LINUX_DMAPOOL_H +#include #include #include @@ -18,8 +19,8 @@ struct device; #ifdef CONFIG_HAS_DMA -struct dma_pool *dma_pool_create(const char *name, struct device *dev, - size_t size, size_t align, size_t allocation); +struct dma_pool *dma_pool_create_node(const char *name, struct device *dev, + size_t size, size_t align, size_t boundary, int node); void dma_pool_destroy(struct dma_pool *pool); @@ -35,9 +36,12 @@ struct dma_pool *dmam_pool_create(const char *name, struct device *dev, void dmam_pool_destroy(struct dma_pool *pool); #else /* !CONFIG_HAS_DMA */ -static inline struct dma_pool *dma_pool_create(const char *name, - struct device *dev, size_t size, size_t align, size_t allocation) -{ return NULL; } +static inline struct dma_pool *dma_pool_create_node(const char *name, + struct device *dev, size_t size, size_t align, size_t boundary, + int node) +{ + return NULL; +} static inline void dma_pool_destroy(struct dma_pool *pool) { } static inline void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, dma_addr_t *handle) { return NULL; } @@ -49,6 +53,13 @@ static inline struct dma_pool *dmam_pool_create(const char *name, static inline void dmam_pool_destroy(struct dma_pool *pool) { } #endif /* !CONFIG_HAS_DMA */ +static inline struct dma_pool *dma_pool_create(const char *name, + struct device *dev, size_t size, size_t align, size_t boundary) +{ + return dma_pool_create_node(name, dev, size, align, boundary, + NUMA_NO_NODE); +} + static inline void *dma_pool_zalloc(struct dma_pool *pool, gfp_t mem_flags, dma_addr_t *handle) { diff --git a/mm/dmapool.c b/mm/dmapool.c index f0bfc6c490f4..5be8cc1c6529 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -56,6 +56,7 @@ struct dma_pool { /* the pool */ unsigned int size; unsigned int allocation; unsigned int boundary; + int node; char name[32]; struct list_head pools; }; @@ -199,12 +200,13 @@ static void pool_block_push(struct dma_pool *pool, struct dma_block *block, /** - * dma_pool_create - Creates a pool of consistent memory blocks, for dma. + * dma_pool_create_node - Creates a pool of consistent memory blocks, for dma. * @name: name of pool, for diagnostics * @dev: device that will be doing the DMA * @size: size of the blocks in this pool. * @align: alignment requirement for blocks; must be a power of two * @boundary: returned blocks won't cross this power of two boundary + * @node: optional NUMA node to allocate structs 'dma_pool' and 'dma_page' on * Context: not in_interrupt() * * Given one of these pools, dma_pool_alloc() @@ -221,8 +223,8 @@ static void pool_block_push(struct dma_pool *pool, struct dma_block *block, * Return: a dma allocation pool with the requested characteristics, or * %NULL if one can't be created. */ -struct dma_pool *dma_pool_create(const char *name, struct device *dev, - size_t size, size_t align, size_t boundary) +struct dma_pool *dma_pool_create_node(const char *name, struct device *dev, + size_t size, size_t align, size_t boundary, int node) { struct dma_pool *retval; size_t allocation; @@ -251,7 +253,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, boundary = min(boundary, allocation); - retval = kzalloc(sizeof(*retval), GFP_KERNEL); + retval = kzalloc_node(sizeof(*retval), GFP_KERNEL, node); if (!retval) return retval; @@ -264,6 +266,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, retval->size = size; retval->boundary = boundary; retval->allocation = allocation; + retval->node = node; INIT_LIST_HEAD(&retval->pools); /* @@ -295,7 +298,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, mutex_unlock(&pools_reg_lock); return retval; } -EXPORT_SYMBOL(dma_pool_create); +EXPORT_SYMBOL(dma_pool_create_node); static void pool_initialise_page(struct dma_pool *pool, struct dma_page *page) { @@ -335,7 +338,7 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags) { struct dma_page *page; - page = kmalloc(sizeof(*page), mem_flags); + page = kmalloc_node(sizeof(*page), mem_flags, pool->node); if (!page) return NULL; -- cgit v1.2.3 From 3a91f28fab43f093c72312148288d125ae160c2d Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 22 May 2025 23:20:39 +0800 Subject: io_uring: add helper io_uring_cmd_ctx_handle() Add helper io_uring_cmd_ctx_handle() for driver to track per-context resource, such as registered kernel io buffer. Suggested-by: Caleb Sander Mateos Signed-off-by: Ming Lei Reviewed-by: Caleb Sander Mateos Link: https://lore.kernel.org/r/20250522152043.399824-2-ming.lei@redhat.com Signed-off-by: Jens Axboe --- include/linux/io_uring/cmd.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 0634a3de1782..53408124c1e5 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -140,6 +140,15 @@ static inline struct io_uring_cmd_data *io_uring_cmd_get_async_data(struct io_ur return cmd_to_io_kiocb(cmd)->async_data; } +/* + * Return uring_cmd's context reference as its context handle for driver to + * track per-context resource, such as registered kernel IO buffer + */ +static inline void *io_uring_cmd_ctx_handle(struct io_uring_cmd *cmd) +{ + return cmd_to_io_kiocb(cmd)->ctx; +} + int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq, void (*release)(void *), unsigned int index, unsigned int issue_flags); -- cgit v1.2.3