From 0fbb4d93b38bce1f8235aacfa37e90ad8f011473 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 17 Feb 2022 23:40:32 -0500 Subject: dm: add dm_submit_bio_remap interface Where possible, switch from early bio-based IO accounting (at the time DM clones each incoming bio) to late IO accounting just before each remapped bio is issued to underlying device via submit_bio_noacct(). Allows more precise bio-based IO accounting for DM targets that use their own workqueues to perform additional processing of each bio in conjunction with their DM_MAPIO_SUBMITTED return from their map function. When a target is updated to use dm_submit_bio_remap() they must also set ti->accounts_remapped_io to true. Use xchg() in start_io_acct(), as suggested by Mikulas, to ensure each IO is only started once. The xchg race only happens if __send_duplicate_bios() sends multiple bios -- that case is reflected via tio->is_duplicate_bio. Given the niche nature of this race, it is best to avoid any xchg performance penalty for normal IO. For IO that was never submitted with dm_bio_submit_remap(), but the target completes the clone with bio_endio, accounting is started then ended and pending_io counter decremented. Reviewed-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- include/linux/device-mapper.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux/device-mapper.h') diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index b26fecf6c8e8..7752d14d13f8 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -362,6 +362,12 @@ struct dm_target { * zone append operations using regular writes. */ bool emulate_zone_append:1; + + /* + * Set if the target will submit IO using dm_submit_bio_remap() + * after returning DM_MAPIO_SUBMITTED from its map function. + */ + bool accounts_remapped_io:1; }; void *dm_per_bio_data(struct bio *bio, size_t data_size); @@ -465,6 +471,7 @@ int dm_suspended(struct dm_target *ti); int dm_post_suspending(struct dm_target *ti); int dm_noflush_suspending(struct dm_target *ti); void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors); +void dm_submit_bio_remap(struct bio *clone, struct bio *tgt_clone, bool from_wq); union map_info *dm_get_rq_mapinfo(struct request *rq); #ifdef CONFIG_BLK_DEV_ZONED -- cgit v1.2.3 From a8b9d116cda047f38ba29ce26df49c57479ffaa2 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Wed, 26 Jan 2022 05:18:26 -0800 Subject: dm: cleanup double word in comment Remove the second 'a'. Signed-off-by: Tom Rix Signed-off-by: Mike Snitzer --- include/linux/device-mapper.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/device-mapper.h') diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 7752d14d13f8..70cd9449275a 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -358,7 +358,7 @@ struct dm_target { bool limit_swap_bios:1; /* - * Set if this target implements a a zoned device and needs emulation of + * Set if this target implements a zoned device and needs emulation of * zone append operations using regular writes. */ bool emulate_zone_append:1; -- cgit v1.2.3 From a773187e37fa5c3bcd92ffda9085707df34f903a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 9 Feb 2022 09:28:27 +0100 Subject: scsi: dm: Remove WRITE_SAME support There are no more end-users of REQ_OP_WRITE_SAME left, so we can start deleting it. Link: https://lore.kernel.org/r/20220209082828.2629273-7-hch@lst.de Reviewed-by: Mike Snitzer Signed-off-by: Christoph Hellwig Signed-off-by: Martin K. Petersen --- drivers/md/dm-core.h | 1 - drivers/md/dm-crypt.c | 1 - drivers/md/dm-ebs-target.c | 1 - drivers/md/dm-io.c | 22 ++-------------------- drivers/md/dm-linear.c | 1 - drivers/md/dm-mpath.c | 1 - drivers/md/dm-rq.c | 3 --- drivers/md/dm-stripe.c | 4 +--- drivers/md/dm-table.c | 29 ----------------------------- drivers/md/dm-zone.c | 4 ---- drivers/md/dm.c | 15 --------------- include/linux/device-mapper.h | 6 ------ 12 files changed, 3 insertions(+), 85 deletions(-) (limited to 'include/linux/device-mapper.h') diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index b855fef4f38a..219407a7b77e 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -141,7 +141,6 @@ struct mapped_device { #define DMF_EMULATE_ZONE_APPEND 9 void disable_discard(struct mapped_device *md); -void disable_write_same(struct mapped_device *md); void disable_write_zeroes(struct mapped_device *md); static inline sector_t dm_get_size(struct mapped_device *md) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index d4ae31558826..4727b72073fe 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -2006,7 +2006,6 @@ static bool kcryptd_crypt_write_inline(struct crypt_config *cc, */ switch (bio_op(ctx->bio_in)) { case REQ_OP_WRITE: - case REQ_OP_WRITE_SAME: case REQ_OP_WRITE_ZEROES: return true; default: diff --git a/drivers/md/dm-ebs-target.c b/drivers/md/dm-ebs-target.c index 7ce5d509b940..0221fa63f888 100644 --- a/drivers/md/dm-ebs-target.c +++ b/drivers/md/dm-ebs-target.c @@ -335,7 +335,6 @@ static int ebs_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->num_flush_bios = 1; ti->num_discard_bios = 1; ti->num_secure_erase_bios = 0; - ti->num_write_same_bios = 0; ti->num_write_zeroes_bios = 0; return 0; bad: diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 2d3cda0acacb..82d431677bb6 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -304,7 +304,6 @@ static void do_region(int op, int op_flags, unsigned region, unsigned num_bvecs; sector_t remaining = where->count; struct request_queue *q = bdev_get_queue(where->bdev); - unsigned short logical_block_size = queue_logical_block_size(q); sector_t num_sectors; unsigned int special_cmd_max_sectors; @@ -315,10 +314,8 @@ static void do_region(int op, int op_flags, unsigned region, special_cmd_max_sectors = q->limits.max_discard_sectors; else if (op == REQ_OP_WRITE_ZEROES) special_cmd_max_sectors = q->limits.max_write_zeroes_sectors; - else if (op == REQ_OP_WRITE_SAME) - special_cmd_max_sectors = q->limits.max_write_same_sectors; - if ((op == REQ_OP_DISCARD || op == REQ_OP_WRITE_ZEROES || - op == REQ_OP_WRITE_SAME) && special_cmd_max_sectors == 0) { + if ((op == REQ_OP_DISCARD || op == REQ_OP_WRITE_ZEROES) && + special_cmd_max_sectors == 0) { atomic_inc(&io->count); dec_count(io, region, BLK_STS_NOTSUPP); return; @@ -337,9 +334,6 @@ static void do_region(int op, int op_flags, unsigned region, case REQ_OP_WRITE_ZEROES: num_bvecs = 0; break; - case REQ_OP_WRITE_SAME: - num_bvecs = 1; - break; default: num_bvecs = bio_max_segs(dm_sector_div_up(remaining, (PAGE_SIZE >> SECTOR_SHIFT))); @@ -356,18 +350,6 @@ static void do_region(int op, int op_flags, unsigned region, num_sectors = min_t(sector_t, special_cmd_max_sectors, remaining); bio->bi_iter.bi_size = num_sectors << SECTOR_SHIFT; remaining -= num_sectors; - } else if (op == REQ_OP_WRITE_SAME) { - /* - * WRITE SAME only uses a single page. - */ - dp->get_page(dp, &page, &len, &offset); - bio_add_page(bio, page, logical_block_size, offset); - num_sectors = min_t(sector_t, special_cmd_max_sectors, remaining); - bio->bi_iter.bi_size = num_sectors << SECTOR_SHIFT; - - offset = 0; - remaining -= num_sectors; - dp->next_page(dp); } else while (remaining) { /* * Try and add as many pages as possible. diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 1b97a11d7151..76b486e4d2be 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -60,7 +60,6 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->num_flush_bios = 1; ti->num_discard_bios = 1; ti->num_secure_erase_bios = 1; - ti->num_write_same_bios = 1; ti->num_write_zeroes_bios = 1; ti->private = lc; return 0; diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index f4719b65e5e3..9ab971834a53 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -1252,7 +1252,6 @@ static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv) ti->num_flush_bios = 1; ti->num_discard_bios = 1; - ti->num_write_same_bios = 1; ti->num_write_zeroes_bios = 1; if (m->queue_mode == DM_TYPE_BIO_BASED) ti->per_io_data_size = multipath_per_bio_data_size(); diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 579ab6183d4d..3907950a0ddc 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -217,9 +217,6 @@ static void dm_done(struct request *clone, blk_status_t error, bool mapped) if (req_op(clone) == REQ_OP_DISCARD && !clone->q->limits.max_discard_sectors) disable_discard(tio->md); - else if (req_op(clone) == REQ_OP_WRITE_SAME && - !clone->q->limits.max_write_same_sectors) - disable_write_same(tio->md); else if (req_op(clone) == REQ_OP_WRITE_ZEROES && !clone->q->limits.max_write_zeroes_sectors) disable_write_zeroes(tio->md); diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index e566115ec0bb..c81d331d1afe 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -157,7 +157,6 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->num_flush_bios = stripes; ti->num_discard_bios = stripes; ti->num_secure_erase_bios = stripes; - ti->num_write_same_bios = stripes; ti->num_write_zeroes_bios = stripes; sc->chunk_size = chunk_size; @@ -284,8 +283,7 @@ static int stripe_map(struct dm_target *ti, struct bio *bio) } if (unlikely(bio_op(bio) == REQ_OP_DISCARD) || unlikely(bio_op(bio) == REQ_OP_SECURE_ERASE) || - unlikely(bio_op(bio) == REQ_OP_WRITE_ZEROES) || - unlikely(bio_op(bio) == REQ_OP_WRITE_SAME)) { + unlikely(bio_op(bio) == REQ_OP_WRITE_ZEROES)) { target_bio_nr = dm_bio_get_target_bio_nr(bio); BUG_ON(target_bio_nr >= sc->stripes); return stripe_map_range(sc, bio, target_bio_nr); diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index e43096cfe9e2..88ab98637934 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1822,33 +1822,6 @@ static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev, return !blk_queue_add_random(q); } -static int device_not_write_same_capable(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) -{ - struct request_queue *q = bdev_get_queue(dev->bdev); - - return !q->limits.max_write_same_sectors; -} - -static bool dm_table_supports_write_same(struct dm_table *t) -{ - struct dm_target *ti; - unsigned i; - - for (i = 0; i < dm_table_get_num_targets(t); i++) { - ti = dm_table_get_target(t, i); - - if (!ti->num_write_same_bios) - return false; - - if (!ti->type->iterate_devices || - ti->type->iterate_devices(ti, device_not_write_same_capable, NULL)) - return false; - } - - return true; -} - static int device_not_write_zeroes_capable(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { @@ -2027,8 +2000,6 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, else blk_queue_flag_set(QUEUE_FLAG_NONROT, q); - if (!dm_table_supports_write_same(t)) - q->limits.max_write_same_sectors = 0; if (!dm_table_supports_write_zeroes(t)) q->limits.max_write_zeroes_sectors = 0; diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c index 6d82a34438c8..c1ca9be4b79e 100644 --- a/drivers/md/dm-zone.c +++ b/drivers/md/dm-zone.c @@ -130,7 +130,6 @@ bool dm_is_zone_write(struct mapped_device *md, struct bio *bio) switch (bio_op(bio)) { case REQ_OP_WRITE_ZEROES: - case REQ_OP_WRITE_SAME: case REQ_OP_WRITE: return !op_is_flush(bio->bi_opf) && bio_sectors(bio); default: @@ -390,7 +389,6 @@ static bool dm_zone_map_bio_begin(struct mapped_device *md, case REQ_OP_ZONE_FINISH: return true; case REQ_OP_WRITE_ZEROES: - case REQ_OP_WRITE_SAME: case REQ_OP_WRITE: /* Writes must be aligned to the zone write pointer */ if ((clone->bi_iter.bi_sector & (zsectors - 1)) != zwp_offset) @@ -446,7 +444,6 @@ static blk_status_t dm_zone_map_bio_end(struct mapped_device *md, blk_queue_zone_sectors(md->queue)); return BLK_STS_OK; case REQ_OP_WRITE_ZEROES: - case REQ_OP_WRITE_SAME: case REQ_OP_WRITE: WRITE_ONCE(md->zwp_offset[zno], zwp_offset + nr_sectors); return BLK_STS_OK; @@ -503,7 +500,6 @@ static bool dm_need_zone_wp_tracking(struct bio *orig_bio) return false; switch (bio_op(orig_bio)) { case REQ_OP_WRITE_ZEROES: - case REQ_OP_WRITE_SAME: case REQ_OP_WRITE: case REQ_OP_ZONE_RESET: case REQ_OP_ZONE_FINISH: diff --git a/drivers/md/dm.c b/drivers/md/dm.c index c0ae8087c602..79e273924afd 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -855,14 +855,6 @@ void disable_discard(struct mapped_device *md) blk_queue_flag_clear(QUEUE_FLAG_DISCARD, md->queue); } -void disable_write_same(struct mapped_device *md) -{ - struct queue_limits *limits = dm_get_queue_limits(md); - - /* device doesn't really support WRITE SAME, disable it */ - limits->max_write_same_sectors = 0; -} - void disable_write_zeroes(struct mapped_device *md) { struct queue_limits *limits = dm_get_queue_limits(md); @@ -889,9 +881,6 @@ static void clone_endio(struct bio *bio) if (bio_op(bio) == REQ_OP_DISCARD && !q->limits.max_discard_sectors) disable_discard(md); - else if (bio_op(bio) == REQ_OP_WRITE_SAME && - !q->limits.max_write_same_sectors) - disable_write_same(md); else if (bio_op(bio) == REQ_OP_WRITE_ZEROES && !q->limits.max_write_zeroes_sectors) disable_write_zeroes(md); @@ -1370,7 +1359,6 @@ static bool is_abnormal_io(struct bio *bio) switch (bio_op(bio)) { case REQ_OP_DISCARD: case REQ_OP_SECURE_ERASE: - case REQ_OP_WRITE_SAME: case REQ_OP_WRITE_ZEROES: r = true; break; @@ -1392,9 +1380,6 @@ static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti, case REQ_OP_SECURE_ERASE: num_bios = ti->num_secure_erase_bios; break; - case REQ_OP_WRITE_SAME: - num_bios = ti->num_write_same_bios; - break; case REQ_OP_WRITE_ZEROES: num_bios = ti->num_write_zeroes_bios; break; diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index b26fecf6c8e8..721db99f4f63 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -316,12 +316,6 @@ struct dm_target { */ unsigned num_secure_erase_bios; - /* - * The number of WRITE SAME bios that will be submitted to the target. - * The bio number can be accessed with dm_bio_get_target_bio_nr. - */ - unsigned num_write_same_bios; - /* * The number of WRITE ZEROES bios that will be submitted to the target. * The bio number can be accessed with dm_bio_get_target_bio_nr. -- cgit v1.2.3 From b7f8dff09827c96032c34a945ee7757e394b5952 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 10 Mar 2022 11:45:58 -0500 Subject: dm: simplify dm_sumbit_bio_remap interface Remove the from_wq argument from dm_sumbit_bio_remap(). Eliminates the need for dm_sumbit_bio_remap() callers to know whether they are calling for a workqueue or from the original dm_submit_bio(). Add map_task to dm_io struct, record the map_task in alloc_io and clear it after all target ->map() calls have completed. Update dm_sumbit_bio_remap to check if 'current' matches io->map_task rather than rely on passed 'from_rq' argument. This change really simplifies the chore of porting each DM target to using dm_sumbit_bio_remap() because there is no longer the risk of programming error by not completely knowing all the different contexts a particular method that calls dm_sumbit_bio_remap() might be used in. Signed-off-by: Mike Snitzer --- drivers/md/dm-core.h | 1 + drivers/md/dm-crypt.c | 6 +++--- drivers/md/dm-delay.c | 2 +- drivers/md/dm-thin.c | 4 ++-- drivers/md/dm.c | 8 ++++---- include/linux/device-mapper.h | 2 +- 6 files changed, 12 insertions(+), 11 deletions(-) (limited to 'include/linux/device-mapper.h') diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 8cc03c0c262e..8d3d11887343 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -237,6 +237,7 @@ struct dm_io { unsigned long start_time; void *data; struct hlist_node node; + struct task_struct *map_task; spinlock_t endio_lock; struct dm_stats_aux stats_aux; /* last member of dm_target_io is 'struct bio' */ diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index ec746cc4b1f8..a1730cc8ae27 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1857,7 +1857,7 @@ static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) return 1; } - dm_submit_bio_remap(io->base_bio, clone, (gfp != CRYPT_MAP_READ_GFP)); + dm_submit_bio_remap(io->base_bio, clone); return 0; } @@ -1883,7 +1883,7 @@ static void kcryptd_io_write(struct dm_crypt_io *io) { struct bio *clone = io->ctx.bio_out; - dm_submit_bio_remap(io->base_bio, clone, true); + dm_submit_bio_remap(io->base_bio, clone); } #define crypt_io_from_node(node) rb_entry((node), struct dm_crypt_io, rb_node) @@ -1962,7 +1962,7 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async) if ((likely(!async) && test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags)) || test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags)) { - dm_submit_bio_remap(io->base_bio, clone, true); + dm_submit_bio_remap(io->base_bio, clone); return; } diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index b25b45011b11..9a51bf51a859 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -72,7 +72,7 @@ static void flush_bios(struct bio *bio) while (bio) { n = bio->bi_next; bio->bi_next = NULL; - dm_submit_bio_remap(bio, NULL, true); + dm_submit_bio_remap(bio, NULL); bio = n; } } diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index ba74bc22ba42..4d25d0e27031 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -755,7 +755,7 @@ static void issue(struct thin_c *tc, struct bio *bio) struct pool *pool = tc->pool; if (!bio_triggers_commit(tc, bio)) { - dm_submit_bio_remap(bio, NULL, true); + dm_submit_bio_remap(bio, NULL); return; } @@ -2383,7 +2383,7 @@ static void process_deferred_bios(struct pool *pool) if (bio->bi_opf & REQ_PREFLUSH) bio_endio(bio); else - dm_submit_bio_remap(bio, NULL, true); + dm_submit_bio_remap(bio, NULL); } } diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 1c5c9036a20e..c470f54f9193 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -574,6 +574,7 @@ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio) this_cpu_inc(*md->pending_io); io->orig_bio = NULL; io->md = md; + io->map_task = current; spin_lock_init(&io->endio_lock); io->start_time = jiffies; @@ -1189,15 +1190,13 @@ static inline void __dm_submit_bio_remap(struct bio *clone, /* * @clone: clone bio that DM core passed to target's .map function * @tgt_clone: clone of @clone bio that target needs submitted - * @from_wq: caller is a workqueue thread managed by DM target * * Targets should use this interface to submit bios they take * ownership of when returning DM_MAPIO_SUBMITTED. * * Target should also enable ti->accounts_remapped_io */ -void dm_submit_bio_remap(struct bio *clone, struct bio *tgt_clone, - bool from_wq) +void dm_submit_bio_remap(struct bio *clone, struct bio *tgt_clone) { struct dm_target_io *tio = clone_to_tio(clone); struct dm_io *io = tio->io; @@ -1212,7 +1211,7 @@ void dm_submit_bio_remap(struct bio *clone, struct bio *tgt_clone, * Account io->origin_bio to DM dev on behalf of target * that took ownership of IO with DM_MAPIO_SUBMITTED. */ - if (!from_wq) { + if (io->map_task == current) { /* Still in target's map function */ io->start_io_acct = true; } else { @@ -1568,6 +1567,7 @@ static void dm_split_and_process_bio(struct mapped_device *md, } error = __split_and_process_bio(&ci); + ci.io->map_task = NULL; if (error || !ci.sector_count) goto out; diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 70cd9449275a..901ec191250c 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -471,7 +471,7 @@ int dm_suspended(struct dm_target *ti); int dm_post_suspending(struct dm_target *ti); int dm_noflush_suspending(struct dm_target *ti); void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors); -void dm_submit_bio_remap(struct bio *clone, struct bio *tgt_clone, bool from_wq); +void dm_submit_bio_remap(struct bio *clone, struct bio *tgt_clone); union map_info *dm_get_rq_mapinfo(struct request *rq); #ifdef CONFIG_BLK_DEV_ZONED -- cgit v1.2.3 From e511c4a3d2a1f64aafc1f5df37a2ffcf7ef91b55 Mon Sep 17 00:00:00 2001 From: Jane Chu Date: Fri, 13 May 2022 15:10:58 -0700 Subject: dax: introduce DAX_RECOVERY_WRITE dax access mode Up till now, dax_direct_access() is used implicitly for normal access, but for the purpose of recovery write, dax range with poison is requested. To make the interface clear, introduce enum dax_access_mode { DAX_ACCESS, DAX_RECOVERY_WRITE, } where DAX_ACCESS is used for normal dax access, and DAX_RECOVERY_WRITE is used for dax recovery write. Suggested-by: Dan Williams Signed-off-by: Jane Chu Reviewed-by: Christoph Hellwig Cc: Mike Snitzer Reviewed-by: Vivek Goyal Link: https://lore.kernel.org/r/165247982851.52965.11024212198889762949.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- drivers/dax/super.c | 5 +++-- drivers/md/dm-linear.c | 5 +++-- drivers/md/dm-log-writes.c | 5 +++-- drivers/md/dm-stripe.c | 5 +++-- drivers/md/dm-target.c | 4 +++- drivers/md/dm-writecache.c | 7 ++++--- drivers/md/dm.c | 5 +++-- drivers/nvdimm/pmem.c | 8 +++++--- drivers/nvdimm/pmem.h | 5 ++++- drivers/s390/block/dcssblk.c | 9 ++++++--- fs/dax.c | 9 +++++---- fs/fuse/dax.c | 4 ++-- fs/fuse/virtio_fs.c | 6 ++++-- include/linux/dax.h | 9 +++++++-- include/linux/device-mapper.h | 4 +++- tools/testing/nvdimm/pmem-dax.c | 4 +++- 16 files changed, 61 insertions(+), 33 deletions(-) (limited to 'include/linux/device-mapper.h') diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 0211e6f7b47a..5405eb553430 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -117,6 +117,7 @@ enum dax_device_flags { * @dax_dev: a dax_device instance representing the logical memory range * @pgoff: offset in pages from the start of the device to translate * @nr_pages: number of consecutive pages caller can handle relative to @pfn + * @mode: indicator on normal access or recovery write * @kaddr: output parameter that returns a virtual address mapping of pfn * @pfn: output parameter that returns an absolute pfn translation of @pgoff * @@ -124,7 +125,7 @@ enum dax_device_flags { * pages accessible at the device relative @pgoff. */ long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, - void **kaddr, pfn_t *pfn) + enum dax_access_mode mode, void **kaddr, pfn_t *pfn) { long avail; @@ -138,7 +139,7 @@ long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, return -EINVAL; avail = dax_dev->ops->direct_access(dax_dev, pgoff, nr_pages, - kaddr, pfn); + mode, kaddr, pfn); if (!avail) return -ERANGE; return min(avail, nr_pages); diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 76b486e4d2be..13e263299c9c 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -172,11 +172,12 @@ static struct dax_device *linear_dax_pgoff(struct dm_target *ti, pgoff_t *pgoff) } static long linear_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, - long nr_pages, void **kaddr, pfn_t *pfn) + long nr_pages, enum dax_access_mode mode, void **kaddr, + pfn_t *pfn) { struct dax_device *dax_dev = linear_dax_pgoff(ti, &pgoff); - return dax_direct_access(dax_dev, pgoff, nr_pages, kaddr, pfn); + return dax_direct_access(dax_dev, pgoff, nr_pages, mode, kaddr, pfn); } static int linear_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff, diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index c9d036d6bb2e..06bdbed65eb1 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -889,11 +889,12 @@ static struct dax_device *log_writes_dax_pgoff(struct dm_target *ti, } static long log_writes_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, - long nr_pages, void **kaddr, pfn_t *pfn) + long nr_pages, enum dax_access_mode mode, void **kaddr, + pfn_t *pfn) { struct dax_device *dax_dev = log_writes_dax_pgoff(ti, &pgoff); - return dax_direct_access(dax_dev, pgoff, nr_pages, kaddr, pfn); + return dax_direct_access(dax_dev, pgoff, nr_pages, mode, kaddr, pfn); } static int log_writes_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff, diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index c81d331d1afe..77d72900e997 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -315,11 +315,12 @@ static struct dax_device *stripe_dax_pgoff(struct dm_target *ti, pgoff_t *pgoff) } static long stripe_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, - long nr_pages, void **kaddr, pfn_t *pfn) + long nr_pages, enum dax_access_mode mode, void **kaddr, + pfn_t *pfn) { struct dax_device *dax_dev = stripe_dax_pgoff(ti, &pgoff); - return dax_direct_access(dax_dev, pgoff, nr_pages, kaddr, pfn); + return dax_direct_access(dax_dev, pgoff, nr_pages, mode, kaddr, pfn); } static int stripe_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff, diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c index 64dd0b34fcf4..8cd5184e62f0 100644 --- a/drivers/md/dm-target.c +++ b/drivers/md/dm-target.c @@ -10,6 +10,7 @@ #include #include #include +#include #define DM_MSG_PREFIX "target" @@ -142,7 +143,8 @@ static void io_err_release_clone_rq(struct request *clone, } static long io_err_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, - long nr_pages, void **kaddr, pfn_t *pfn) + long nr_pages, enum dax_access_mode mode, void **kaddr, + pfn_t *pfn) { return -EIO; } diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index 5630b470ba42..d74c5a7a0ab4 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -286,7 +286,8 @@ static int persistent_memory_claim(struct dm_writecache *wc) id = dax_read_lock(); - da = dax_direct_access(wc->ssd_dev->dax_dev, offset, p, &wc->memory_map, &pfn); + da = dax_direct_access(wc->ssd_dev->dax_dev, offset, p, DAX_ACCESS, + &wc->memory_map, &pfn); if (da < 0) { wc->memory_map = NULL; r = da; @@ -308,8 +309,8 @@ static int persistent_memory_claim(struct dm_writecache *wc) i = 0; do { long daa; - daa = dax_direct_access(wc->ssd_dev->dax_dev, offset + i, p - i, - NULL, &pfn); + daa = dax_direct_access(wc->ssd_dev->dax_dev, offset + i, + p - i, DAX_ACCESS, NULL, &pfn); if (daa <= 0) { r = daa ? daa : -EINVAL; goto err3; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 82957bd460e8..9c452641c3d5 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1093,7 +1093,8 @@ static struct dm_target *dm_dax_get_live_target(struct mapped_device *md, } static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, - long nr_pages, void **kaddr, pfn_t *pfn) + long nr_pages, enum dax_access_mode mode, void **kaddr, + pfn_t *pfn) { struct mapped_device *md = dax_get_private(dax_dev); sector_t sector = pgoff * PAGE_SECTORS; @@ -1111,7 +1112,7 @@ static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, if (len < 1) goto out; nr_pages = min(len, nr_pages); - ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn); + ret = ti->type->direct_access(ti, pgoff, nr_pages, mode, kaddr, pfn); out: dm_put_live_table(md, srcu_idx); diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 4aa17132a557..47f34c50f944 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -239,7 +239,8 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector, /* see "strong" declaration in tools/testing/nvdimm/pmem-dax.c */ __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, - long nr_pages, void **kaddr, pfn_t *pfn) + long nr_pages, enum dax_access_mode mode, void **kaddr, + pfn_t *pfn) { resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset; @@ -278,11 +279,12 @@ static int pmem_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, } static long pmem_dax_direct_access(struct dax_device *dax_dev, - pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn) + pgoff_t pgoff, long nr_pages, enum dax_access_mode mode, + void **kaddr, pfn_t *pfn) { struct pmem_device *pmem = dax_get_private(dax_dev); - return __pmem_direct_access(pmem, pgoff, nr_pages, kaddr, pfn); + return __pmem_direct_access(pmem, pgoff, nr_pages, mode, kaddr, pfn); } static const struct dax_operations pmem_dax_ops = { diff --git a/drivers/nvdimm/pmem.h b/drivers/nvdimm/pmem.h index 1f51a2361429..392b0b38acb9 100644 --- a/drivers/nvdimm/pmem.h +++ b/drivers/nvdimm/pmem.h @@ -8,6 +8,8 @@ #include #include +enum dax_access_mode; + /* this definition is in it's own header for tools/testing/nvdimm to consume */ struct pmem_device { /* One contiguous memory region per device */ @@ -28,7 +30,8 @@ struct pmem_device { }; long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, - long nr_pages, void **kaddr, pfn_t *pfn); + long nr_pages, enum dax_access_mode mode, void **kaddr, + pfn_t *pfn); #ifdef CONFIG_MEMORY_FAILURE static inline bool test_and_clear_pmem_poison(struct page *page) diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index d614843caf6c..8d0d0eaa3059 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -32,7 +32,8 @@ static int dcssblk_open(struct block_device *bdev, fmode_t mode); static void dcssblk_release(struct gendisk *disk, fmode_t mode); static void dcssblk_submit_bio(struct bio *bio); static long dcssblk_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, - long nr_pages, void **kaddr, pfn_t *pfn); + long nr_pages, enum dax_access_mode mode, void **kaddr, + pfn_t *pfn); static char dcssblk_segments[DCSSBLK_PARM_LEN] = "\0"; @@ -50,7 +51,8 @@ static int dcssblk_dax_zero_page_range(struct dax_device *dax_dev, long rc; void *kaddr; - rc = dax_direct_access(dax_dev, pgoff, nr_pages, &kaddr, NULL); + rc = dax_direct_access(dax_dev, pgoff, nr_pages, DAX_ACCESS, + &kaddr, NULL); if (rc < 0) return rc; memset(kaddr, 0, nr_pages << PAGE_SHIFT); @@ -927,7 +929,8 @@ __dcssblk_direct_access(struct dcssblk_dev_info *dev_info, pgoff_t pgoff, static long dcssblk_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, - long nr_pages, void **kaddr, pfn_t *pfn) + long nr_pages, enum dax_access_mode mode, void **kaddr, + pfn_t *pfn) { struct dcssblk_dev_info *dev_info = dax_get_private(dax_dev); diff --git a/fs/dax.c b/fs/dax.c index 67a08a32fccb..ef3103107104 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -721,7 +721,8 @@ static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter int id; id = dax_read_lock(); - rc = dax_direct_access(iter->iomap.dax_dev, pgoff, 1, &kaddr, NULL); + rc = dax_direct_access(iter->iomap.dax_dev, pgoff, 1, DAX_ACCESS, + &kaddr, NULL); if (rc < 0) { dax_read_unlock(id); return rc; @@ -1013,7 +1014,7 @@ static int dax_iomap_pfn(const struct iomap *iomap, loff_t pos, size_t size, id = dax_read_lock(); length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size), - NULL, pfnp); + DAX_ACCESS, NULL, pfnp); if (length < 0) { rc = length; goto out; @@ -1122,7 +1123,7 @@ static int dax_memzero(struct dax_device *dax_dev, pgoff_t pgoff, void *kaddr; long ret; - ret = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL); + ret = dax_direct_access(dax_dev, pgoff, 1, DAX_ACCESS, &kaddr, NULL); if (ret > 0) { memset(kaddr + offset, 0, size); dax_flush(dax_dev, kaddr + offset, size); @@ -1247,7 +1248,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, } map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), - &kaddr, NULL); + DAX_ACCESS, &kaddr, NULL); if (map_len < 0) { ret = map_len; break; diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index d7d3a7f06862..10eb50cbf398 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -1241,8 +1241,8 @@ static int fuse_dax_mem_range_init(struct fuse_conn_dax *fcd) INIT_DELAYED_WORK(&fcd->free_work, fuse_dax_free_mem_worker); id = dax_read_lock(); - nr_pages = dax_direct_access(fcd->dev, 0, PHYS_PFN(dax_size), NULL, - NULL); + nr_pages = dax_direct_access(fcd->dev, 0, PHYS_PFN(dax_size), + DAX_ACCESS, NULL, NULL); dax_read_unlock(id); if (nr_pages < 0) { pr_debug("dax_direct_access() returned %ld\n", nr_pages); diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 86b7dbb6a0d4..8db53fa67359 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -752,7 +752,8 @@ static void virtio_fs_cleanup_vqs(struct virtio_device *vdev, * offset. */ static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, - long nr_pages, void **kaddr, pfn_t *pfn) + long nr_pages, enum dax_access_mode mode, + void **kaddr, pfn_t *pfn) { struct virtio_fs *fs = dax_get_private(dax_dev); phys_addr_t offset = PFN_PHYS(pgoff); @@ -772,7 +773,8 @@ static int virtio_fs_zero_page_range(struct dax_device *dax_dev, long rc; void *kaddr; - rc = dax_direct_access(dax_dev, pgoff, nr_pages, &kaddr, NULL); + rc = dax_direct_access(dax_dev, pgoff, nr_pages, DAX_ACCESS, &kaddr, + NULL); if (rc < 0) return rc; memset(kaddr, 0, nr_pages << PAGE_SHIFT); diff --git a/include/linux/dax.h b/include/linux/dax.h index 9fc5f99a0ae2..3f1339bce3c0 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -14,6 +14,11 @@ struct iomap_ops; struct iomap_iter; struct iomap; +enum dax_access_mode { + DAX_ACCESS, + DAX_RECOVERY_WRITE, +}; + struct dax_operations { /* * direct_access: translate a device-relative @@ -21,7 +26,7 @@ struct dax_operations { * number of pages available for DAX at that pfn. */ long (*direct_access)(struct dax_device *, pgoff_t, long, - void **, pfn_t *); + enum dax_access_mode, void **, pfn_t *); /* * Validate whether this device is usable as an fsdax backing * device. @@ -178,7 +183,7 @@ static inline void dax_read_unlock(int id) bool dax_alive(struct dax_device *dax_dev); void *dax_get_private(struct dax_device *dax_dev); long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, - void **kaddr, pfn_t *pfn); + enum dax_access_mode mode, void **kaddr, pfn_t *pfn); size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i); size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index c2a3758c4aaa..acdedda0d12b 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -20,6 +20,7 @@ struct dm_table; struct dm_report_zones_args; struct mapped_device; struct bio_vec; +enum dax_access_mode; /* * Type of table, mapped_device's mempool and request_queue @@ -146,7 +147,8 @@ typedef int (*dm_busy_fn) (struct dm_target *ti); * >= 0 : the number of bytes accessible at the address */ typedef long (*dm_dax_direct_access_fn) (struct dm_target *ti, pgoff_t pgoff, - long nr_pages, void **kaddr, pfn_t *pfn); + long nr_pages, enum dax_access_mode node, void **kaddr, + pfn_t *pfn); typedef int (*dm_dax_zero_page_range_fn)(struct dm_target *ti, pgoff_t pgoff, size_t nr_pages); diff --git a/tools/testing/nvdimm/pmem-dax.c b/tools/testing/nvdimm/pmem-dax.c index af19c85558e7..c1ec099a3b1d 100644 --- a/tools/testing/nvdimm/pmem-dax.c +++ b/tools/testing/nvdimm/pmem-dax.c @@ -4,11 +4,13 @@ */ #include "test/nfit_test.h" #include +#include #include #include long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, - long nr_pages, void **kaddr, pfn_t *pfn) + long nr_pages, enum dax_access_mode mode, void **kaddr, + pfn_t *pfn) { resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset; -- cgit v1.2.3 From 047218ec904da19c45c4a70274fc3f818a1fcba1 Mon Sep 17 00:00:00 2001 From: Jane Chu Date: Fri, 22 Apr 2022 16:45:06 -0600 Subject: dax: add .recovery_write dax_operation Introduce dax_recovery_write() operation. The function is used to recover a dax range that contains poison. Typical use case is when a user process receives a SIGBUS with si_code BUS_MCEERR_AR indicating poison(s) in a dax range, in response, the user process issues a pwrite() to the page-aligned dax range, thus clears the poison and puts valid data in the range. Reviewed-by: Christoph Hellwig Signed-off-by: Jane Chu Link: https://lore.kernel.org/r/20220422224508.440670-6-jane.chu@oracle.com Signed-off-by: Dan Williams --- drivers/dax/super.c | 9 +++++++++ drivers/md/dm-linear.c | 10 ++++++++++ drivers/md/dm-log-writes.c | 10 ++++++++++ drivers/md/dm-stripe.c | 10 ++++++++++ drivers/md/dm.c | 20 ++++++++++++++++++++ drivers/nvdimm/pmem.c | 7 +++++++ fs/dax.c | 13 ++++++++++++- include/linux/dax.h | 13 +++++++++++++ include/linux/device-mapper.h | 9 +++++++++ 9 files changed, 100 insertions(+), 1 deletion(-) (limited to 'include/linux/device-mapper.h') diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 5405eb553430..50a08b2ec247 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -195,6 +195,15 @@ int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, } EXPORT_SYMBOL_GPL(dax_zero_page_range); +size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *iter) +{ + if (!dax_dev->ops->recovery_write) + return 0; + return dax_dev->ops->recovery_write(dax_dev, pgoff, addr, bytes, iter); +} +EXPORT_SYMBOL_GPL(dax_recovery_write); + #ifdef CONFIG_ARCH_HAS_PMEM_API void arch_wb_cache_pmem(void *addr, size_t size); void dax_flush(struct dax_device *dax_dev, void *addr, size_t size) diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 13e263299c9c..cdf48bc8c5b0 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -188,9 +188,18 @@ static int linear_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff, return dax_zero_page_range(dax_dev, pgoff, nr_pages); } +static size_t linear_dax_recovery_write(struct dm_target *ti, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *i) +{ + struct dax_device *dax_dev = linear_dax_pgoff(ti, &pgoff); + + return dax_recovery_write(dax_dev, pgoff, addr, bytes, i); +} + #else #define linear_dax_direct_access NULL #define linear_dax_zero_page_range NULL +#define linear_dax_recovery_write NULL #endif static struct target_type linear_target = { @@ -208,6 +217,7 @@ static struct target_type linear_target = { .iterate_devices = linear_iterate_devices, .direct_access = linear_dax_direct_access, .dax_zero_page_range = linear_dax_zero_page_range, + .dax_recovery_write = linear_dax_recovery_write, }; int __init dm_linear_init(void) diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index 06bdbed65eb1..22739dccdd17 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -905,9 +905,18 @@ static int log_writes_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff, return dax_zero_page_range(dax_dev, pgoff, nr_pages << PAGE_SHIFT); } +static size_t log_writes_dax_recovery_write(struct dm_target *ti, + pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i) +{ + struct dax_device *dax_dev = log_writes_dax_pgoff(ti, &pgoff); + + return dax_recovery_write(dax_dev, pgoff, addr, bytes, i); +} + #else #define log_writes_dax_direct_access NULL #define log_writes_dax_zero_page_range NULL +#define log_writes_dax_recovery_write NULL #endif static struct target_type log_writes_target = { @@ -925,6 +934,7 @@ static struct target_type log_writes_target = { .io_hints = log_writes_io_hints, .direct_access = log_writes_dax_direct_access, .dax_zero_page_range = log_writes_dax_zero_page_range, + .dax_recovery_write = log_writes_dax_recovery_write, }; static int __init dm_log_writes_init(void) diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 77d72900e997..baa085cc67bd 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -331,9 +331,18 @@ static int stripe_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff, return dax_zero_page_range(dax_dev, pgoff, nr_pages); } +static size_t stripe_dax_recovery_write(struct dm_target *ti, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *i) +{ + struct dax_device *dax_dev = stripe_dax_pgoff(ti, &pgoff); + + return dax_recovery_write(dax_dev, pgoff, addr, bytes, i); +} + #else #define stripe_dax_direct_access NULL #define stripe_dax_zero_page_range NULL +#define stripe_dax_recovery_write NULL #endif /* @@ -470,6 +479,7 @@ static struct target_type stripe_target = { .io_hints = stripe_io_hints, .direct_access = stripe_dax_direct_access, .dax_zero_page_range = stripe_dax_zero_page_range, + .dax_recovery_write = stripe_dax_recovery_write, }; int __init dm_stripe_init(void) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 9c452641c3d5..3fe76ab20069 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1147,6 +1147,25 @@ static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, return ret; } +static size_t dm_dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *i) +{ + struct mapped_device *md = dax_get_private(dax_dev); + sector_t sector = pgoff * PAGE_SECTORS; + struct dm_target *ti; + int srcu_idx; + long ret = 0; + + ti = dm_dax_get_live_target(md, sector, &srcu_idx); + if (!ti || !ti->type->dax_recovery_write) + goto out; + + ret = ti->type->dax_recovery_write(ti, pgoff, addr, bytes, i); +out: + dm_put_live_table(md, srcu_idx); + return ret; +} + /* * A target may call dm_accept_partial_bio only from the map routine. It is * allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_* zone management @@ -3147,6 +3166,7 @@ static const struct block_device_operations dm_rq_blk_dops = { static const struct dax_operations dm_dax_ops = { .direct_access = dm_dax_direct_access, .zero_page_range = dm_dax_zero_page_range, + .recovery_write = dm_dax_recovery_write, }; /* diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 47f34c50f944..e5e288135af7 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -287,9 +287,16 @@ static long pmem_dax_direct_access(struct dax_device *dax_dev, return __pmem_direct_access(pmem, pgoff, nr_pages, mode, kaddr, pfn); } +static size_t pmem_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *i) +{ + return 0; +} + static const struct dax_operations pmem_dax_ops = { .direct_access = pmem_dax_direct_access, .zero_page_range = pmem_dax_zero_page_range, + .recovery_write = pmem_recovery_write, }; static ssize_t write_cache_show(struct device *dev, diff --git a/fs/dax.c b/fs/dax.c index ef3103107104..a1e4b45cbf55 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1240,6 +1240,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, const size_t size = ALIGN(length + offset, PAGE_SIZE); pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); ssize_t map_len; + bool recovery = false; void *kaddr; if (fatal_signal_pending(current)) { @@ -1249,6 +1250,13 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), DAX_ACCESS, &kaddr, NULL); + if (map_len == -EIO && iov_iter_rw(iter) == WRITE) { + map_len = dax_direct_access(dax_dev, pgoff, + PHYS_PFN(size), DAX_RECOVERY_WRITE, + &kaddr, NULL); + if (map_len > 0) + recovery = true; + } if (map_len < 0) { ret = map_len; break; @@ -1260,7 +1268,10 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, if (map_len > end - pos) map_len = end - pos; - if (iov_iter_rw(iter) == WRITE) + if (recovery) + xfer = dax_recovery_write(dax_dev, pgoff, kaddr, + map_len, iter); + else if (iov_iter_rw(iter) == WRITE) xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr, map_len, iter); else diff --git a/include/linux/dax.h b/include/linux/dax.h index 3f1339bce3c0..e7b81634c52a 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -35,6 +35,12 @@ struct dax_operations { sector_t, sector_t); /* zero_page_range: required operation. Zero page range */ int (*zero_page_range)(struct dax_device *, pgoff_t, size_t); + /* + * recovery_write: recover a poisoned range by DAX device driver + * capable of clearing poison. + */ + size_t (*recovery_write)(struct dax_device *dax_dev, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *iter); }; #if IS_ENABLED(CONFIG_DAX) @@ -45,6 +51,8 @@ void dax_write_cache(struct dax_device *dax_dev, bool wc); bool dax_write_cache_enabled(struct dax_device *dax_dev); bool dax_synchronous(struct dax_device *dax_dev); void set_dax_synchronous(struct dax_device *dax_dev); +size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *i); /* * Check if given mapping is supported by the file / underlying device. */ @@ -92,6 +100,11 @@ static inline bool daxdev_mapping_supported(struct vm_area_struct *vma, { return !(vma->vm_flags & VM_SYNC); } +static inline size_t dax_recovery_write(struct dax_device *dax_dev, + pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i) +{ + return 0; +} #endif void set_dax_nocache(struct dax_device *dax_dev); diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index acdedda0d12b..47a01c7cffdf 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -152,6 +152,14 @@ typedef long (*dm_dax_direct_access_fn) (struct dm_target *ti, pgoff_t pgoff, typedef int (*dm_dax_zero_page_range_fn)(struct dm_target *ti, pgoff_t pgoff, size_t nr_pages); +/* + * Returns: + * != 0 : number of bytes transferred + * 0 : recovery write failed + */ +typedef size_t (*dm_dax_recovery_write_fn)(struct dm_target *ti, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *i); + void dm_error(const char *message); struct dm_dev { @@ -201,6 +209,7 @@ struct target_type { dm_io_hints_fn io_hints; dm_dax_direct_access_fn direct_access; dm_dax_zero_page_range_fn dax_zero_page_range; + dm_dax_recovery_write_fn dax_recovery_write; /* For internal device-mapper use. */ struct list_head list; -- cgit v1.2.3 From 2aec377a29250b942f14d3c16d49783da3e9df11 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 5 Jul 2022 14:00:36 -0400 Subject: dm table: remove dm_table_get_num_targets() wrapper More efficient and readable to just access table->num_targets directly. Suggested-by: Linus Torvalds Signed-off-by: Mike Snitzer --- drivers/md/dm-ima.c | 2 +- drivers/md/dm-ioctl.c | 6 +++--- drivers/md/dm-table.c | 37 ++++++++++++++++--------------------- drivers/md/dm-zone.c | 2 +- drivers/md/dm.c | 4 ++-- include/linux/device-mapper.h | 1 - 6 files changed, 23 insertions(+), 29 deletions(-) (limited to 'include/linux/device-mapper.h') diff --git a/drivers/md/dm-ima.c b/drivers/md/dm-ima.c index 1842d3a958ef..3c15c069947f 100644 --- a/drivers/md/dm-ima.c +++ b/drivers/md/dm-ima.c @@ -208,7 +208,7 @@ void dm_ima_measure_on_table_load(struct dm_table *table, unsigned int status_fl if (!target_data_buf) goto error; - num_targets = dm_table_get_num_targets(table); + num_targets = table->num_targets; if (dm_ima_alloc_and_copy_device_data(table->md, &device_data_buf, num_targets, noio)) goto error; diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 87310fceb0d8..98976aaa9db9 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -832,7 +832,7 @@ static void __dev_status(struct mapped_device *md, struct dm_ioctl *param) if (!(param->flags & DM_QUERY_INACTIVE_TABLE_FLAG)) { if (get_disk_ro(disk)) param->flags |= DM_READONLY_FLAG; - param->target_count = dm_table_get_num_targets(table); + param->target_count = table->num_targets; } param->flags |= DM_ACTIVE_PRESENT_FLAG; @@ -845,7 +845,7 @@ static void __dev_status(struct mapped_device *md, struct dm_ioctl *param) if (table) { if (!(dm_table_get_mode(table) & FMODE_WRITE)) param->flags |= DM_READONLY_FLAG; - param->target_count = dm_table_get_num_targets(table); + param->target_count = table->num_targets; } dm_put_live_table(md, srcu_idx); } @@ -1248,7 +1248,7 @@ static void retrieve_status(struct dm_table *table, type = STATUSTYPE_INFO; /* Get all the target info */ - num_targets = dm_table_get_num_targets(table); + num_targets = table->num_targets; for (i = 0; i < num_targets; i++) { struct dm_target *ti = dm_table_get_target(table, i); size_t l; diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 3f29b1113294..b4af34041a6f 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -593,7 +593,7 @@ static int validate_hardware_logical_block_alignment(struct dm_table *table, /* * Check each entry in the table in turn. */ - for (i = 0; i < dm_table_get_num_targets(table); i++) { + for (i = 0; i < table->num_targets; i++) { ti = dm_table_get_target(table, i); blk_set_stacking_limits(&ti_limits); @@ -832,7 +832,7 @@ static bool dm_table_supports_dax(struct dm_table *t, unsigned i; /* Ensure that all targets support DAX. */ - for (i = 0; i < dm_table_get_num_targets(t); i++) { + for (i = 0; i < t->num_targets; i++) { ti = dm_table_get_target(t, i); if (!ti->type->direct_access) @@ -987,7 +987,7 @@ struct dm_target *dm_table_get_wildcard_target(struct dm_table *t) struct dm_target *ti; unsigned i; - for (i = 0; i < dm_table_get_num_targets(t); i++) { + for (i = 0; i < t->num_targets; i++) { ti = dm_table_get_target(t, i); if (dm_target_is_wildcard(ti->type)) return ti; @@ -1127,7 +1127,7 @@ static struct gendisk * dm_table_get_integrity_disk(struct dm_table *t) struct gendisk *prev_disk = NULL, *template_disk = NULL; unsigned i; - for (i = 0; i < dm_table_get_num_targets(t); i++) { + for (i = 0; i < t->num_targets; i++) { struct dm_target *ti = dm_table_get_target(t, i); if (!dm_target_passes_integrity(ti->type)) goto no_integrity; @@ -1248,7 +1248,7 @@ static int dm_keyslot_evict(struct blk_crypto_profile *profile, t = dm_get_live_table(md, &srcu_idx); if (!t) return 0; - for (i = 0; i < dm_table_get_num_targets(t); i++) { + for (i = 0; i < t->num_targets; i++) { ti = dm_table_get_target(t, i); if (!ti->type->iterate_devices) continue; @@ -1318,7 +1318,7 @@ static int dm_table_construct_crypto_profile(struct dm_table *t) memset(profile->modes_supported, 0xFF, sizeof(profile->modes_supported)); - for (i = 0; i < dm_table_get_num_targets(t); i++) { + for (i = 0; i < t->num_targets; i++) { ti = dm_table_get_target(t, i); if (!dm_target_passes_crypto(ti->type)) { @@ -1540,7 +1540,7 @@ static bool dm_table_any_dev_attr(struct dm_table *t, struct dm_target *ti; unsigned int i; - for (i = 0; i < dm_table_get_num_targets(t); i++) { + for (i = 0; i < t->num_targets; i++) { ti = dm_table_get_target(t, i); if (ti->type->iterate_devices && @@ -1566,7 +1566,7 @@ static bool dm_table_supports_poll(struct dm_table *t) struct dm_target *ti; unsigned i = 0; - while (i < dm_table_get_num_targets(t)) { + while (i < t->num_targets) { ti = dm_table_get_target(t, i++); if (!ti->type->iterate_devices || @@ -1588,7 +1588,7 @@ bool dm_table_has_no_data_devices(struct dm_table *table) struct dm_target *ti; unsigned i, num_devices; - for (i = 0; i < dm_table_get_num_targets(table); i++) { + for (i = 0; i < table->num_targets; i++) { ti = dm_table_get_target(table, i); if (!ti->type->iterate_devices) @@ -1625,7 +1625,7 @@ static bool dm_table_supports_zoned_model(struct dm_table *t, struct dm_target *ti; unsigned i; - for (i = 0; i < dm_table_get_num_targets(t); i++) { + for (i = 0; i < t->num_targets; i++) { ti = dm_table_get_target(t, i); if (dm_target_supports_zoned_hm(ti->type)) { @@ -1699,7 +1699,7 @@ int dm_calculate_queue_limits(struct dm_table *table, blk_set_stacking_limits(limits); - for (i = 0; i < dm_table_get_num_targets(table); i++) { + for (i = 0; i < table->num_targets; i++) { blk_set_stacking_limits(&ti_limits); ti = dm_table_get_target(table, i); @@ -1819,7 +1819,7 @@ static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush) * so we need to use iterate_devices here, which targets * supporting flushes must provide. */ - for (i = 0; i < dm_table_get_num_targets(t); i++) { + for (i = 0; i < t->num_targets; i++) { ti = dm_table_get_target(t, i); if (!ti->num_flush_bios) @@ -1877,7 +1877,7 @@ static bool dm_table_supports_write_zeroes(struct dm_table *t) struct dm_target *ti; unsigned i = 0; - while (i < dm_table_get_num_targets(t)) { + while (i < t->num_targets) { ti = dm_table_get_target(t, i++); if (!ti->num_write_zeroes_bios) @@ -1904,7 +1904,7 @@ static bool dm_table_supports_nowait(struct dm_table *t) struct dm_target *ti; unsigned i = 0; - while (i < dm_table_get_num_targets(t)) { + while (i < t->num_targets) { ti = dm_table_get_target(t, i++); if (!dm_target_supports_nowait(ti->type)) @@ -1929,7 +1929,7 @@ static bool dm_table_supports_discards(struct dm_table *t) struct dm_target *ti; unsigned i; - for (i = 0; i < dm_table_get_num_targets(t); i++) { + for (i = 0; i < t->num_targets; i++) { ti = dm_table_get_target(t, i); if (!ti->num_discard_bios) @@ -1961,7 +1961,7 @@ static bool dm_table_supports_secure_erase(struct dm_table *t) struct dm_target *ti; unsigned int i; - for (i = 0; i < dm_table_get_num_targets(t); i++) { + for (i = 0; i < t->num_targets; i++) { ti = dm_table_get_target(t, i); if (!ti->num_secure_erase_bios) @@ -2092,11 +2092,6 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, return 0; } -unsigned int dm_table_get_num_targets(struct dm_table *t) -{ - return t->num_targets; -} - struct list_head *dm_table_get_devices(struct dm_table *t) { return &t->devices; diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c index 3e7b1fe1580b..16dc9ec4bb11 100644 --- a/drivers/md/dm-zone.c +++ b/drivers/md/dm-zone.c @@ -278,7 +278,7 @@ static bool dm_table_supports_zone_append(struct dm_table *t) struct dm_target *ti; unsigned int i; - for (i = 0; i < dm_table_get_num_targets(t); i++) { + for (i = 0; i < t->num_targets; i++) { ti = dm_table_get_target(t, i); if (ti->emulate_zone_append) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index fa6839141118..44dae3869f29 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -422,7 +422,7 @@ retry: return r; /* We only support devices that have a single target */ - if (dm_table_get_num_targets(map) != 1) + if (map->num_targets != 1) return r; tgt = dm_table_get_target(map, 0); @@ -3092,7 +3092,7 @@ static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn, goto out; /* We only support devices that have a single target */ - if (dm_table_get_num_targets(table) != 1) + if (table->num_targets != 1) goto out; ti = dm_table_get_target(table, 0); diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 47a01c7cffdf..920085dd7f3b 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -561,7 +561,6 @@ void dm_sync_table(struct mapped_device *md); * Queries */ sector_t dm_table_get_size(struct dm_table *t); -unsigned int dm_table_get_num_targets(struct dm_table *t); fmode_t dm_table_get_mode(struct dm_table *t); struct mapped_device *dm_table_get_md(struct dm_table *t); const char *dm_table_device_name(struct dm_table *t); -- cgit v1.2.3 From 9dd1cd3220eca534f2d47afad7ce85f4c40118d8 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 20 Jul 2022 13:58:04 -0400 Subject: dm: fix dm-raid crash if md_handle_request() splits bio Commit ca522482e3eaf ("dm: pass NULL bdev to bio_alloc_clone") introduced the optimization to _not_ perform bio_associate_blkg()'s relatively costly work when DM core clones its bio. But in doing so it exposed the possibility for DM's cloned bio to alter DM target behavior (e.g. crash) if a target were to issue IO without first calling bio_set_dev(). The DM raid target can trigger an MD crash due to its need to split the DM bio that is passed to md_handle_request(). The split will recurse to submit_bio_noacct() using a bio with an uninitialized ->bi_blkg. This NULL bio->bi_blkg causes blk_throtl_bio() to dereference a NULL blkg_to_tg(bio->bi_blkg). Fix this in DM core by adding a new 'needs_bio_set_dev' target flag that will make alloc_tio() call bio_set_dev() on behalf of the target. dm-raid is the only target that requires this flag. bio_set_dev() initializes the DM cloned bio's ->bi_blkg, using bio_associate_blkg, before passing the bio to md_handle_request(). Long-term fix would be to audit and refactor MD code to rely on DM to split its bio, using dm_accept_partial_bio(), but there are MD raid personalities (e.g. raid1 and raid10) whose implementation are tightly coupled to handling the bio splitting inline. Fixes: ca522482e3eaf ("dm: pass NULL bdev to bio_alloc_clone") Cc: stable@vger.kernel.org Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 1 + drivers/md/dm.c | 13 ++++++------- include/linux/device-mapper.h | 6 ++++++ include/uapi/linux/dm-ioctl.h | 4 ++-- 4 files changed, 15 insertions(+), 9 deletions(-) (limited to 'include/linux/device-mapper.h') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index e6a9b8cb22d3..3203aecd6961 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3095,6 +3095,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) INIT_WORK(&rs->md.event_work, do_table_event); ti->private = rs; ti->num_flush_bios = 1; + ti->needs_bio_set_dev = true; /* Restore any requested new layout for conversion decision */ rs_config_restore(rs, &rs_layout); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 47bcc5081b2b..f6a6437d0a7c 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -574,9 +574,6 @@ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio) struct bio *clone; clone = bio_alloc_clone(NULL, bio, GFP_NOIO, &md->mempools->io_bs); - /* Set default bdev, but target must bio_set_dev() before issuing IO */ - clone->bi_bdev = md->disk->part0; - tio = clone_to_tio(clone); tio->flags = 0; dm_tio_set_flag(tio, DM_TIO_INSIDE_DM_IO); @@ -609,6 +606,7 @@ static void free_io(struct dm_io *io) static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti, unsigned target_bio_nr, unsigned *len, gfp_t gfp_mask) { + struct mapped_device *md = ci->io->md; struct dm_target_io *tio; struct bio *clone; @@ -618,14 +616,10 @@ static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti, /* alloc_io() already initialized embedded clone */ clone = &tio->clone; } else { - struct mapped_device *md = ci->io->md; - clone = bio_alloc_clone(NULL, ci->bio, gfp_mask, &md->mempools->bs); if (!clone) return NULL; - /* Set default bdev, but target must bio_set_dev() before issuing IO */ - clone->bi_bdev = md->disk->part0; /* REQ_DM_POLL_LIST shouldn't be inherited */ clone->bi_opf &= ~REQ_DM_POLL_LIST; @@ -641,6 +635,11 @@ static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti, tio->len_ptr = len; tio->old_sector = 0; + /* Set default bdev, but target must bio_set_dev() before issuing IO */ + clone->bi_bdev = md->disk->part0; + if (unlikely(ti->needs_bio_set_dev)) + bio_set_dev(clone, md->disk->part0); + if (len) { clone->bi_iter.bi_size = to_bytes(*len); if (bio_integrity(clone)) diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 920085dd7f3b..04c6acf7faaa 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -373,6 +373,12 @@ struct dm_target { * after returning DM_MAPIO_SUBMITTED from its map function. */ bool accounts_remapped_io:1; + + /* + * Set if the target will submit the DM bio without first calling + * bio_set_dev(). NOTE: ideally a target should _not_ need this. + */ + bool needs_bio_set_dev:1; }; void *dm_per_bio_data(struct bio *bio, size_t data_size); diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h index 2e9550fef90f..27ad9671f2df 100644 --- a/include/uapi/linux/dm-ioctl.h +++ b/include/uapi/linux/dm-ioctl.h @@ -286,9 +286,9 @@ enum { #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 46 +#define DM_VERSION_MINOR 47 #define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2022-02-22)" +#define DM_VERSION_EXTRA "-ioctl (2022-07-28)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ -- cgit v1.2.3