diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2019-11-25 11:53:26 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2019-11-25 11:53:26 -0800 |
commit | eeee2827ae75ca58a6965e1b6d208576a5a01920 (patch) | |
tree | 9d5f6301a3db7d11c9dee0188bfbe5168c7b185d | |
parent | 7e5192b93c3b8661791f65f0d477d0da234ca202 (diff) | |
parent | f612b2132db529feac4f965f28a1b9258ea7c22b (diff) | |
download | lwn-eeee2827ae75ca58a6965e1b6d208576a5a01920.tar.gz lwn-eeee2827ae75ca58a6965e1b6d208576a5a01920.zip |
Merge tag 'for-5.5/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
Pull device mapper updates from Mike Snitzer:
- Fix DM core to disallow stacking request-based DM on partitions.
- Fix DM raid target to properly resync raidset even if bitmap needed
additional pages.
- Fix DM crypt performance regression due to use of WQ_HIGHPRI for the
IO and crypt workqueues.
- Fix DM integrity metadata layout that was aligned on 128K boundary
rather than the intended 4K boundary (removes 124K of wasted space
for each metadata block).
- Improve the DM thin, cache and clone targets to use spin_lock_irq
rather than spin_lock_irqsave where possible.
- Fix DM thin single thread performance that was lost due to needless
workqueue wakeups.
- Fix DM zoned target performance that was lost due to excessive
backing device checks.
- Add ability to trigger write failure with the DM dust test target.
- Fix whitespace indentation in drivers/md/Kconfig.
- Various small fixes and cleanups (e.g. use struct_size, fix
uninitialized variable, variable renames, etc).
* tag 'for-5.5/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (22 commits)
Revert "dm crypt: use WQ_HIGHPRI for the IO and crypt workqueues"
dm: Fix Kconfig indentation
dm thin: wakeup worker only when deferred bios exist
dm integrity: fix excessive alignment of metadata runs
dm raid: Remove unnecessary negation of a shift in raid10_format_to_md_layout
dm zoned: reduce overhead of backing device checks
dm dust: add limited write failure mode
dm dust: change ret to r in dust_map_read and dust_map
dm dust: change result vars to r
dm cache: replace spin_lock_irqsave with spin_lock_irq
dm bio prison: replace spin_lock_irqsave with spin_lock_irq
dm thin: replace spin_lock_irqsave with spin_lock_irq
dm clone: add bucket_lock_irq/bucket_unlock_irq helpers
dm clone: replace spin_lock_irqsave with spin_lock_irq
dm writecache: handle REQ_FUA
dm writecache: fix uninitialized variable warning
dm stripe: use struct_size() in kmalloc()
dm raid: streamline rs_get_progress() and its raid_status() caller side
dm raid: simplify rs_setup_recovery call chain
dm raid: to ensure resynchronization, perform raid set grow in preresume
...
-rw-r--r-- | Documentation/admin-guide/device-mapper/dm-integrity.rst | 5 | ||||
-rw-r--r-- | Documentation/admin-guide/device-mapper/dm-raid.rst | 2 | ||||
-rw-r--r-- | drivers/md/Kconfig | 54 | ||||
-rw-r--r-- | drivers/md/dm-bio-prison-v1.c | 27 | ||||
-rw-r--r-- | drivers/md/dm-bio-prison-v2.c | 26 | ||||
-rw-r--r-- | drivers/md/dm-cache-target.c | 77 | ||||
-rw-r--r-- | drivers/md/dm-clone-metadata.c | 29 | ||||
-rw-r--r-- | drivers/md/dm-clone-metadata.h | 4 | ||||
-rw-r--r-- | drivers/md/dm-clone-target.c | 62 | ||||
-rw-r--r-- | drivers/md/dm-crypt.c | 9 | ||||
-rw-r--r-- | drivers/md/dm-dust.c | 97 | ||||
-rw-r--r-- | drivers/md/dm-integrity.c | 28 | ||||
-rw-r--r-- | drivers/md/dm-raid.c | 164 | ||||
-rw-r--r-- | drivers/md/dm-stripe.c | 15 | ||||
-rw-r--r-- | drivers/md/dm-table.c | 27 | ||||
-rw-r--r-- | drivers/md/dm-thin.c | 118 | ||||
-rw-r--r-- | drivers/md/dm-writecache.c | 5 | ||||
-rw-r--r-- | drivers/md/dm-zoned-metadata.c | 29 | ||||
-rw-r--r-- | drivers/md/dm-zoned-reclaim.c | 8 | ||||
-rw-r--r-- | drivers/md/dm-zoned-target.c | 54 | ||||
-rw-r--r-- | drivers/md/dm-zoned.h | 2 | ||||
-rw-r--r-- | include/linux/device-mapper.h | 3 |
22 files changed, 433 insertions, 412 deletions
diff --git a/Documentation/admin-guide/device-mapper/dm-integrity.rst b/Documentation/admin-guide/device-mapper/dm-integrity.rst index a30aa91b5fbe..594095b54b29 100644 --- a/Documentation/admin-guide/device-mapper/dm-integrity.rst +++ b/Documentation/admin-guide/device-mapper/dm-integrity.rst @@ -177,6 +177,11 @@ bitmap_flush_interval:number The bitmap flush interval in milliseconds. The metadata buffers are synchronized when this interval expires. +fix_padding + Use a smaller padding of the tag area that is more + space-efficient. If this option is not present, large padding is + used - that is for compatibility with older kernels. + The journal mode (D/J), buffer_sectors, journal_watermark, commit_time can be changed when reloading the target (load an inactive table and swap the diff --git a/Documentation/admin-guide/device-mapper/dm-raid.rst b/Documentation/admin-guide/device-mapper/dm-raid.rst index 2fe255b130fb..f6344675e395 100644 --- a/Documentation/admin-guide/device-mapper/dm-raid.rst +++ b/Documentation/admin-guide/device-mapper/dm-raid.rst @@ -417,3 +417,5 @@ Version History deadlock/potential data corruption. Update superblock when specific devices are requested via rebuild. Fix RAID leg rebuild errors. + 1.15.0 Fix size extensions not being synchronized in case of new MD bitmap + pages allocated; also fix those not occuring after previous reductions diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index aa98953f4462..d6d5ab23c088 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -38,9 +38,9 @@ config MD_AUTODETECT default y ---help--- If you say Y here, then the kernel will try to autodetect raid - arrays as part of its boot process. + arrays as part of its boot process. - If you don't use raid and say Y, this autodetection can cause + If you don't use raid and say Y, this autodetection can cause a several-second delay in the boot time due to various synchronisation steps that are part of this step. 
@@ -290,7 +290,7 @@ config DM_SNAPSHOT depends on BLK_DEV_DM select DM_BUFIO ---help--- - Allow volume managers to take writable snapshots of a device. + Allow volume managers to take writable snapshots of a device. config DM_THIN_PROVISIONING tristate "Thin provisioning target" @@ -298,7 +298,7 @@ config DM_THIN_PROVISIONING select DM_PERSISTENT_DATA select DM_BIO_PRISON ---help--- - Provides thin provisioning and snapshots that share a data store. + Provides thin provisioning and snapshots that share a data store. config DM_CACHE tristate "Cache target (EXPERIMENTAL)" @@ -307,23 +307,23 @@ config DM_CACHE select DM_PERSISTENT_DATA select DM_BIO_PRISON ---help--- - dm-cache attempts to improve performance of a block device by - moving frequently used data to a smaller, higher performance - device. Different 'policy' plugins can be used to change the - algorithms used to select which blocks are promoted, demoted, - cleaned etc. It supports writeback and writethrough modes. + dm-cache attempts to improve performance of a block device by + moving frequently used data to a smaller, higher performance + device. Different 'policy' plugins can be used to change the + algorithms used to select which blocks are promoted, demoted, + cleaned etc. It supports writeback and writethrough modes. config DM_CACHE_SMQ tristate "Stochastic MQ Cache Policy (EXPERIMENTAL)" depends on DM_CACHE default y ---help--- - A cache policy that uses a multiqueue ordered by recent hits - to select which blocks should be promoted and demoted. - This is meant to be a general purpose policy. It prioritises - reads over writes. This SMQ policy (vs MQ) offers the promise - of less memory utilization, improved performance and increased - adaptability in the face of changing workloads. + A cache policy that uses a multiqueue ordered by recent hits + to select which blocks should be promoted and demoted. + This is meant to be a general purpose policy. It prioritises + reads over writes. 
This SMQ policy (vs MQ) offers the promise + of less memory utilization, improved performance and increased + adaptability in the face of changing workloads. config DM_WRITECACHE tristate "Writecache target" @@ -343,9 +343,9 @@ config DM_ERA select DM_PERSISTENT_DATA select DM_BIO_PRISON ---help--- - dm-era tracks which parts of a block device are written to - over time. Useful for maintaining cache coherency when using - vendor snapshots. + dm-era tracks which parts of a block device are written to + over time. Useful for maintaining cache coherency when using + vendor snapshots. config DM_CLONE tristate "Clone target (EXPERIMENTAL)" @@ -353,20 +353,20 @@ config DM_CLONE default n select DM_PERSISTENT_DATA ---help--- - dm-clone produces a one-to-one copy of an existing, read-only source - device into a writable destination device. The cloned device is - visible/mountable immediately and the copy of the source device to the - destination device happens in the background, in parallel with user - I/O. + dm-clone produces a one-to-one copy of an existing, read-only source + device into a writable destination device. The cloned device is + visible/mountable immediately and the copy of the source device to the + destination device happens in the background, in parallel with user + I/O. - If unsure, say N. + If unsure, say N. config DM_MIRROR tristate "Mirror target" depends on BLK_DEV_DM ---help--- - Allow volume managers to mirror logical volumes, also - needed for live data migration tools such as 'pvmove'. + Allow volume managers to mirror logical volumes, also + needed for live data migration tools such as 'pvmove'. config DM_LOG_USERSPACE tristate "Mirror userspace logging" @@ -483,7 +483,7 @@ config DM_FLAKEY tristate "Flakey target" depends on BLK_DEV_DM ---help--- - A target that intermittently fails I/O for debugging purposes. + A target that intermittently fails I/O for debugging purposes. 
config DM_VERITY tristate "Verity target support" diff --git a/drivers/md/dm-bio-prison-v1.c b/drivers/md/dm-bio-prison-v1.c index b5389890bbc3..1f8f98efd97a 100644 --- a/drivers/md/dm-bio-prison-v1.c +++ b/drivers/md/dm-bio-prison-v1.c @@ -150,11 +150,10 @@ static int bio_detain(struct dm_bio_prison *prison, struct dm_bio_prison_cell **cell_result) { int r; - unsigned long flags; - spin_lock_irqsave(&prison->lock, flags); + spin_lock_irq(&prison->lock); r = __bio_detain(prison, key, inmate, cell_prealloc, cell_result); - spin_unlock_irqrestore(&prison->lock, flags); + spin_unlock_irq(&prison->lock); return r; } @@ -198,11 +197,9 @@ void dm_cell_release(struct dm_bio_prison *prison, struct dm_bio_prison_cell *cell, struct bio_list *bios) { - unsigned long flags; - - spin_lock_irqsave(&prison->lock, flags); + spin_lock_irq(&prison->lock); __cell_release(prison, cell, bios); - spin_unlock_irqrestore(&prison->lock, flags); + spin_unlock_irq(&prison->lock); } EXPORT_SYMBOL_GPL(dm_cell_release); @@ -250,12 +247,10 @@ void dm_cell_visit_release(struct dm_bio_prison *prison, void *context, struct dm_bio_prison_cell *cell) { - unsigned long flags; - - spin_lock_irqsave(&prison->lock, flags); + spin_lock_irq(&prison->lock); visit_fn(context, cell); rb_erase(&cell->node, &prison->cells); - spin_unlock_irqrestore(&prison->lock, flags); + spin_unlock_irq(&prison->lock); } EXPORT_SYMBOL_GPL(dm_cell_visit_release); @@ -275,11 +270,10 @@ int dm_cell_promote_or_release(struct dm_bio_prison *prison, struct dm_bio_prison_cell *cell) { int r; - unsigned long flags; - spin_lock_irqsave(&prison->lock, flags); + spin_lock_irq(&prison->lock); r = __promote_or_release(prison, cell); - spin_unlock_irqrestore(&prison->lock, flags); + spin_unlock_irq(&prison->lock); return r; } @@ -379,10 +373,9 @@ EXPORT_SYMBOL_GPL(dm_deferred_entry_dec); int dm_deferred_set_add_work(struct dm_deferred_set *ds, struct list_head *work) { int r = 1; - unsigned long flags; unsigned next_entry; - 
spin_lock_irqsave(&ds->lock, flags); + spin_lock_irq(&ds->lock); if ((ds->sweeper == ds->current_entry) && !ds->entries[ds->current_entry].count) r = 0; @@ -392,7 +385,7 @@ int dm_deferred_set_add_work(struct dm_deferred_set *ds, struct list_head *work) if (!ds->entries[next_entry].count) ds->current_entry = next_entry; } - spin_unlock_irqrestore(&ds->lock, flags); + spin_unlock_irq(&ds->lock); return r; } diff --git a/drivers/md/dm-bio-prison-v2.c b/drivers/md/dm-bio-prison-v2.c index b092cdc8e1ae..8ee019eda32d 100644 --- a/drivers/md/dm-bio-prison-v2.c +++ b/drivers/md/dm-bio-prison-v2.c @@ -177,11 +177,10 @@ bool dm_cell_get_v2(struct dm_bio_prison_v2 *prison, struct dm_bio_prison_cell_v2 **cell_result) { int r; - unsigned long flags; - spin_lock_irqsave(&prison->lock, flags); + spin_lock_irq(&prison->lock); r = __get(prison, key, lock_level, inmate, cell_prealloc, cell_result); - spin_unlock_irqrestore(&prison->lock, flags); + spin_unlock_irq(&prison->lock); return r; } @@ -261,11 +260,10 @@ int dm_cell_lock_v2(struct dm_bio_prison_v2 *prison, struct dm_bio_prison_cell_v2 **cell_result) { int r; - unsigned long flags; - spin_lock_irqsave(&prison->lock, flags); + spin_lock_irq(&prison->lock); r = __lock(prison, key, lock_level, cell_prealloc, cell_result); - spin_unlock_irqrestore(&prison->lock, flags); + spin_unlock_irq(&prison->lock); return r; } @@ -285,11 +283,9 @@ void dm_cell_quiesce_v2(struct dm_bio_prison_v2 *prison, struct dm_bio_prison_cell_v2 *cell, struct work_struct *continuation) { - unsigned long flags; - - spin_lock_irqsave(&prison->lock, flags); + spin_lock_irq(&prison->lock); __quiesce(prison, cell, continuation); - spin_unlock_irqrestore(&prison->lock, flags); + spin_unlock_irq(&prison->lock); } EXPORT_SYMBOL_GPL(dm_cell_quiesce_v2); @@ -309,11 +305,10 @@ int dm_cell_lock_promote_v2(struct dm_bio_prison_v2 *prison, unsigned new_lock_level) { int r; - unsigned long flags; - spin_lock_irqsave(&prison->lock, flags); + 
spin_lock_irq(&prison->lock); r = __promote(prison, cell, new_lock_level); - spin_unlock_irqrestore(&prison->lock, flags); + spin_unlock_irq(&prison->lock); return r; } @@ -342,11 +337,10 @@ bool dm_cell_unlock_v2(struct dm_bio_prison_v2 *prison, struct bio_list *bios) { bool r; - unsigned long flags; - spin_lock_irqsave(&prison->lock, flags); + spin_lock_irq(&prison->lock); r = __unlock(prison, cell, bios); - spin_unlock_irqrestore(&prison->lock, flags); + spin_unlock_irq(&prison->lock); return r; } diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 8346e6d1816c..2d32821b3a5b 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -74,22 +74,19 @@ static bool __iot_idle_for(struct io_tracker *iot, unsigned long jifs) static bool iot_idle_for(struct io_tracker *iot, unsigned long jifs) { bool r; - unsigned long flags; - spin_lock_irqsave(&iot->lock, flags); + spin_lock_irq(&iot->lock); r = __iot_idle_for(iot, jifs); - spin_unlock_irqrestore(&iot->lock, flags); + spin_unlock_irq(&iot->lock); return r; } static void iot_io_begin(struct io_tracker *iot, sector_t len) { - unsigned long flags; - - spin_lock_irqsave(&iot->lock, flags); + spin_lock_irq(&iot->lock); iot->in_flight += len; - spin_unlock_irqrestore(&iot->lock, flags); + spin_unlock_irq(&iot->lock); } static void __iot_io_end(struct io_tracker *iot, sector_t len) @@ -172,7 +169,6 @@ static void __commit(struct work_struct *_ws) { struct batcher *b = container_of(_ws, struct batcher, commit_work); blk_status_t r; - unsigned long flags; struct list_head work_items; struct work_struct *ws, *tmp; struct continuation *k; @@ -186,12 +182,12 @@ static void __commit(struct work_struct *_ws) * We have to grab these before the commit_op to avoid a race * condition. 
*/ - spin_lock_irqsave(&b->lock, flags); + spin_lock_irq(&b->lock); list_splice_init(&b->work_items, &work_items); bio_list_merge(&bios, &b->bios); bio_list_init(&b->bios); b->commit_scheduled = false; - spin_unlock_irqrestore(&b->lock, flags); + spin_unlock_irq(&b->lock); r = b->commit_op(b->commit_context); @@ -238,13 +234,12 @@ static void async_commit(struct batcher *b) static void continue_after_commit(struct batcher *b, struct continuation *k) { - unsigned long flags; bool commit_scheduled; - spin_lock_irqsave(&b->lock, flags); + spin_lock_irq(&b->lock); commit_scheduled = b->commit_scheduled; list_add_tail(&k->ws.entry, &b->work_items); - spin_unlock_irqrestore(&b->lock, flags); + spin_unlock_irq(&b->lock); if (commit_scheduled) async_commit(b); @@ -255,13 +250,12 @@ static void continue_after_commit(struct batcher *b, struct continuation *k) */ static void issue_after_commit(struct batcher *b, struct bio *bio) { - unsigned long flags; bool commit_scheduled; - spin_lock_irqsave(&b->lock, flags); + spin_lock_irq(&b->lock); commit_scheduled = b->commit_scheduled; bio_list_add(&b->bios, bio); - spin_unlock_irqrestore(&b->lock, flags); + spin_unlock_irq(&b->lock); if (commit_scheduled) async_commit(b); @@ -273,12 +267,11 @@ static void issue_after_commit(struct batcher *b, struct bio *bio) static void schedule_commit(struct batcher *b) { bool immediate; - unsigned long flags; - spin_lock_irqsave(&b->lock, flags); + spin_lock_irq(&b->lock); immediate = !list_empty(&b->work_items) || !bio_list_empty(&b->bios); b->commit_scheduled = true; - spin_unlock_irqrestore(&b->lock, flags); + spin_unlock_irq(&b->lock); if (immediate) async_commit(b); @@ -630,23 +623,19 @@ static struct per_bio_data *init_per_bio_data(struct bio *bio) static void defer_bio(struct cache *cache, struct bio *bio) { - unsigned long flags; - - spin_lock_irqsave(&cache->lock, flags); + spin_lock_irq(&cache->lock); bio_list_add(&cache->deferred_bios, bio); - spin_unlock_irqrestore(&cache->lock, 
flags); + spin_unlock_irq(&cache->lock); wake_deferred_bio_worker(cache); } static void defer_bios(struct cache *cache, struct bio_list *bios) { - unsigned long flags; - - spin_lock_irqsave(&cache->lock, flags); + spin_lock_irq(&cache->lock); bio_list_merge(&cache->deferred_bios, bios); bio_list_init(bios); - spin_unlock_irqrestore(&cache->lock, flags); + spin_unlock_irq(&cache->lock); wake_deferred_bio_worker(cache); } @@ -756,33 +745,27 @@ static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock) static void set_discard(struct cache *cache, dm_dblock_t b) { - unsigned long flags; - BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks)); atomic_inc(&cache->stats.discard_count); - spin_lock_irqsave(&cache->lock, flags); + spin_lock_irq(&cache->lock); set_bit(from_dblock(b), cache->discard_bitset); - spin_unlock_irqrestore(&cache->lock, flags); + spin_unlock_irq(&cache->lock); } static void clear_discard(struct cache *cache, dm_dblock_t b) { - unsigned long flags; - - spin_lock_irqsave(&cache->lock, flags); + spin_lock_irq(&cache->lock); clear_bit(from_dblock(b), cache->discard_bitset); - spin_unlock_irqrestore(&cache->lock, flags); + spin_unlock_irq(&cache->lock); } static bool is_discarded(struct cache *cache, dm_dblock_t b) { int r; - unsigned long flags; - - spin_lock_irqsave(&cache->lock, flags); + spin_lock_irq(&cache->lock); r = test_bit(from_dblock(b), cache->discard_bitset); - spin_unlock_irqrestore(&cache->lock, flags); + spin_unlock_irq(&cache->lock); return r; } @@ -790,12 +773,10 @@ static bool is_discarded(struct cache *cache, dm_dblock_t b) static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b) { int r; - unsigned long flags; - - spin_lock_irqsave(&cache->lock, flags); + spin_lock_irq(&cache->lock); r = test_bit(from_dblock(oblock_to_dblock(cache, b)), cache->discard_bitset); - spin_unlock_irqrestore(&cache->lock, flags); + spin_unlock_irq(&cache->lock); return r; } @@ -827,17 +808,16 @@ static void 
remap_to_cache(struct cache *cache, struct bio *bio, static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) { - unsigned long flags; struct per_bio_data *pb; - spin_lock_irqsave(&cache->lock, flags); + spin_lock_irq(&cache->lock); if (cache->need_tick_bio && !op_is_flush(bio->bi_opf) && bio_op(bio) != REQ_OP_DISCARD) { pb = get_per_bio_data(bio); pb->tick = true; cache->need_tick_bio = false; } - spin_unlock_irqrestore(&cache->lock, flags); + spin_unlock_irq(&cache->lock); } static void __remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, @@ -1889,17 +1869,16 @@ static void process_deferred_bios(struct work_struct *ws) { struct cache *cache = container_of(ws, struct cache, deferred_bio_worker); - unsigned long flags; bool commit_needed = false; struct bio_list bios; struct bio *bio; bio_list_init(&bios); - spin_lock_irqsave(&cache->lock, flags); + spin_lock_irq(&cache->lock); bio_list_merge(&bios, &cache->deferred_bios); bio_list_init(&cache->deferred_bios); - spin_unlock_irqrestore(&cache->lock, flags); + spin_unlock_irq(&cache->lock); while ((bio = bio_list_pop(&bios))) { if (bio->bi_opf & REQ_PREFLUSH) diff --git a/drivers/md/dm-clone-metadata.c b/drivers/md/dm-clone-metadata.c index 6bc8c1d1c351..08c552e5e41b 100644 --- a/drivers/md/dm-clone-metadata.c +++ b/drivers/md/dm-clone-metadata.c @@ -712,7 +712,7 @@ static int __metadata_commit(struct dm_clone_metadata *cmd) static int __flush_dmap(struct dm_clone_metadata *cmd, struct dirty_map *dmap) { int r; - unsigned long word, flags; + unsigned long word; word = 0; do { @@ -736,9 +736,9 @@ static int __flush_dmap(struct dm_clone_metadata *cmd, struct dirty_map *dmap) return r; /* Update the changed flag */ - spin_lock_irqsave(&cmd->bitmap_lock, flags); + spin_lock_irq(&cmd->bitmap_lock); dmap->changed = 0; - spin_unlock_irqrestore(&cmd->bitmap_lock, flags); + spin_unlock_irq(&cmd->bitmap_lock); return 0; } @@ -746,7 +746,6 @@ static int __flush_dmap(struct dm_clone_metadata 
*cmd, struct dirty_map *dmap) int dm_clone_metadata_commit(struct dm_clone_metadata *cmd) { int r = -EPERM; - unsigned long flags; struct dirty_map *dmap, *next_dmap; down_write(&cmd->lock); @@ -770,9 +769,9 @@ int dm_clone_metadata_commit(struct dm_clone_metadata *cmd) } /* Swap dirty bitmaps */ - spin_lock_irqsave(&cmd->bitmap_lock, flags); + spin_lock_irq(&cmd->bitmap_lock); cmd->current_dmap = next_dmap; - spin_unlock_irqrestore(&cmd->bitmap_lock, flags); + spin_unlock_irq(&cmd->bitmap_lock); /* * No one is accessing the old dirty bitmap anymore, so we can flush @@ -817,9 +816,9 @@ int dm_clone_cond_set_range(struct dm_clone_metadata *cmd, unsigned long start, { int r = 0; struct dirty_map *dmap; - unsigned long word, region_nr, flags; + unsigned long word, region_nr; - spin_lock_irqsave(&cmd->bitmap_lock, flags); + spin_lock_irq(&cmd->bitmap_lock); if (cmd->read_only) { r = -EPERM; @@ -836,7 +835,7 @@ int dm_clone_cond_set_range(struct dm_clone_metadata *cmd, unsigned long start, } } out: - spin_unlock_irqrestore(&cmd->bitmap_lock, flags); + spin_unlock_irq(&cmd->bitmap_lock); return r; } @@ -903,13 +902,11 @@ out: void dm_clone_metadata_set_read_only(struct dm_clone_metadata *cmd) { - unsigned long flags; - down_write(&cmd->lock); - spin_lock_irqsave(&cmd->bitmap_lock, flags); + spin_lock_irq(&cmd->bitmap_lock); cmd->read_only = 1; - spin_unlock_irqrestore(&cmd->bitmap_lock, flags); + spin_unlock_irq(&cmd->bitmap_lock); if (!cmd->fail_io) dm_bm_set_read_only(cmd->bm); @@ -919,13 +916,11 @@ void dm_clone_metadata_set_read_only(struct dm_clone_metadata *cmd) void dm_clone_metadata_set_read_write(struct dm_clone_metadata *cmd) { - unsigned long flags; - down_write(&cmd->lock); - spin_lock_irqsave(&cmd->bitmap_lock, flags); + spin_lock_irq(&cmd->bitmap_lock); cmd->read_only = 0; - spin_unlock_irqrestore(&cmd->bitmap_lock, flags); + spin_unlock_irq(&cmd->bitmap_lock); if (!cmd->fail_io) dm_bm_set_read_write(cmd->bm); diff --git a/drivers/md/dm-clone-metadata.h 
b/drivers/md/dm-clone-metadata.h index 434bff08508b..3fe50a781c11 100644 --- a/drivers/md/dm-clone-metadata.h +++ b/drivers/md/dm-clone-metadata.h @@ -44,7 +44,9 @@ int dm_clone_set_region_hydrated(struct dm_clone_metadata *cmd, unsigned long re * @start: Starting region number * @nr_regions: Number of regions in the range * - * This function doesn't block, so it's safe to call it from interrupt context. + * This function doesn't block, but since it uses spin_lock_irq()/spin_unlock_irq() + * it's NOT safe to call it from any context where interrupts are disabled, e.g., + * from interrupt context. */ int dm_clone_cond_set_range(struct dm_clone_metadata *cmd, unsigned long start, unsigned long nr_regions); diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c index 4ca8f1977222..b3d89072d21c 100644 --- a/drivers/md/dm-clone-target.c +++ b/drivers/md/dm-clone-target.c @@ -332,8 +332,6 @@ static void submit_bios(struct bio_list *bios) */ static void issue_bio(struct clone *clone, struct bio *bio) { - unsigned long flags; - if (!bio_triggers_commit(clone, bio)) { generic_make_request(bio); return; @@ -352,9 +350,9 @@ static void issue_bio(struct clone *clone, struct bio *bio) * Batch together any bios that trigger commits and then issue a single * commit for them in process_deferred_flush_bios(). */ - spin_lock_irqsave(&clone->lock, flags); + spin_lock_irq(&clone->lock); bio_list_add(&clone->deferred_flush_bios, bio); - spin_unlock_irqrestore(&clone->lock, flags); + spin_unlock_irq(&clone->lock); wake_worker(clone); } @@ -469,7 +467,7 @@ static void complete_discard_bio(struct clone *clone, struct bio *bio, bool succ static void process_discard_bio(struct clone *clone, struct bio *bio) { - unsigned long rs, re, flags; + unsigned long rs, re; bio_region_range(clone, bio, &rs, &re); BUG_ON(re > clone->nr_regions); @@ -501,9 +499,9 @@ static void process_discard_bio(struct clone *clone, struct bio *bio) /* * Defer discard processing. 
*/ - spin_lock_irqsave(&clone->lock, flags); + spin_lock_irq(&clone->lock); bio_list_add(&clone->deferred_discard_bios, bio); - spin_unlock_irqrestore(&clone->lock, flags); + spin_unlock_irq(&clone->lock); wake_worker(clone); } @@ -554,6 +552,12 @@ struct hash_table_bucket { #define bucket_unlock_irqrestore(bucket, flags) \ spin_unlock_irqrestore(&(bucket)->lock, flags) +#define bucket_lock_irq(bucket) \ + spin_lock_irq(&(bucket)->lock) + +#define bucket_unlock_irq(bucket) \ + spin_unlock_irq(&(bucket)->lock) + static int hash_table_init(struct clone *clone) { unsigned int i, sz; @@ -851,7 +855,6 @@ static void hydration_overwrite(struct dm_clone_region_hydration *hd, struct bio */ static void hydrate_bio_region(struct clone *clone, struct bio *bio) { - unsigned long flags; unsigned long region_nr; struct hash_table_bucket *bucket; struct dm_clone_region_hydration *hd, *hd2; @@ -859,19 +862,19 @@ static void hydrate_bio_region(struct clone *clone, struct bio *bio) region_nr = bio_to_region(clone, bio); bucket = get_hash_table_bucket(clone, region_nr); - bucket_lock_irqsave(bucket, flags); + bucket_lock_irq(bucket); hd = __hash_find(bucket, region_nr); if (hd) { /* Someone else is hydrating the region */ bio_list_add(&hd->deferred_bios, bio); - bucket_unlock_irqrestore(bucket, flags); + bucket_unlock_irq(bucket); return; } if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) { /* The region has been hydrated */ - bucket_unlock_irqrestore(bucket, flags); + bucket_unlock_irq(bucket); issue_bio(clone, bio); return; } @@ -880,16 +883,16 @@ static void hydrate_bio_region(struct clone *clone, struct bio *bio) * We must allocate a hydration descriptor and start the hydration of * the corresponding region. */ - bucket_unlock_irqrestore(bucket, flags); + bucket_unlock_irq(bucket); hd = alloc_hydration(clone); hydration_init(hd, region_nr); - bucket_lock_irqsave(bucket, flags); + bucket_lock_irq(bucket); /* Check if the region has been hydrated in the meantime. 
*/ if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) { - bucket_unlock_irqrestore(bucket, flags); + bucket_unlock_irq(bucket); free_hydration(hd); issue_bio(clone, bio); return; @@ -899,7 +902,7 @@ static void hydrate_bio_region(struct clone *clone, struct bio *bio) if (hd2 != hd) { /* Someone else started the region's hydration. */ bio_list_add(&hd2->deferred_bios, bio); - bucket_unlock_irqrestore(bucket, flags); + bucket_unlock_irq(bucket); free_hydration(hd); return; } @@ -911,7 +914,7 @@ static void hydrate_bio_region(struct clone *clone, struct bio *bio) */ if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) { hlist_del(&hd->h); - bucket_unlock_irqrestore(bucket, flags); + bucket_unlock_irq(bucket); free_hydration(hd); bio_io_error(bio); return; @@ -925,11 +928,11 @@ static void hydrate_bio_region(struct clone *clone, struct bio *bio) * to the destination device. */ if (is_overwrite_bio(clone, bio)) { - bucket_unlock_irqrestore(bucket, flags); + bucket_unlock_irq(bucket); hydration_overwrite(hd, bio); } else { bio_list_add(&hd->deferred_bios, bio); - bucket_unlock_irqrestore(bucket, flags); + bucket_unlock_irq(bucket); hydration_copy(hd, 1); } } @@ -996,7 +999,6 @@ static unsigned long __start_next_hydration(struct clone *clone, unsigned long offset, struct batch_info *batch) { - unsigned long flags; struct hash_table_bucket *bucket; struct dm_clone_region_hydration *hd; unsigned long nr_regions = clone->nr_regions; @@ -1010,13 +1012,13 @@ static unsigned long __start_next_hydration(struct clone *clone, break; bucket = get_hash_table_bucket(clone, offset); - bucket_lock_irqsave(bucket, flags); + bucket_lock_irq(bucket); if (!dm_clone_is_region_hydrated(clone->cmd, offset) && !__hash_find(bucket, offset)) { hydration_init(hd, offset); __insert_region_hydration(bucket, hd); - bucket_unlock_irqrestore(bucket, flags); + bucket_unlock_irq(bucket); /* Batch hydration */ __batch_hydration(batch, hd); @@ -1024,7 +1026,7 @@ static unsigned long 
__start_next_hydration(struct clone *clone, return (offset + 1); } - bucket_unlock_irqrestore(bucket, flags); + bucket_unlock_irq(bucket); } while (++offset < nr_regions); @@ -1140,13 +1142,13 @@ static void process_deferred_discards(struct clone *clone) int r = -EPERM; struct bio *bio; struct blk_plug plug; - unsigned long rs, re, flags; + unsigned long rs, re; struct bio_list discards = BIO_EMPTY_LIST; - spin_lock_irqsave(&clone->lock, flags); + spin_lock_irq(&clone->lock); bio_list_merge(&discards, &clone->deferred_discard_bios); bio_list_init(&clone->deferred_discard_bios); - spin_unlock_irqrestore(&clone->lock, flags); + spin_unlock_irq(&clone->lock); if (bio_list_empty(&discards)) return; @@ -1176,13 +1178,12 @@ out: static void process_deferred_bios(struct clone *clone) { - unsigned long flags; struct bio_list bios = BIO_EMPTY_LIST; - spin_lock_irqsave(&clone->lock, flags); + spin_lock_irq(&clone->lock); bio_list_merge(&bios, &clone->deferred_bios); bio_list_init(&clone->deferred_bios); - spin_unlock_irqrestore(&clone->lock, flags); + spin_unlock_irq(&clone->lock); if (bio_list_empty(&bios)) return; @@ -1193,7 +1194,6 @@ static void process_deferred_bios(struct clone *clone) static void process_deferred_flush_bios(struct clone *clone) { struct bio *bio; - unsigned long flags; struct bio_list bios = BIO_EMPTY_LIST; struct bio_list bio_completions = BIO_EMPTY_LIST; @@ -1201,13 +1201,13 @@ static void process_deferred_flush_bios(struct clone *clone) * If there are any deferred flush bios, we must commit the metadata * before issuing them or signaling their completion. 
*/ - spin_lock_irqsave(&clone->lock, flags); + spin_lock_irq(&clone->lock); bio_list_merge(&bios, &clone->deferred_flush_bios); bio_list_init(&clone->deferred_flush_bios); bio_list_merge(&bio_completions, &clone->deferred_flush_completions); bio_list_init(&clone->deferred_flush_completions); - spin_unlock_irqrestore(&clone->lock, flags); + spin_unlock_irq(&clone->lock); if (bio_list_empty(&bios) && bio_list_empty(&bio_completions) && !(dm_clone_changed_this_transaction(clone->cmd) && need_commit_due_to_time(clone))) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index f87f6495652f..eb9782fc93fe 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -2700,21 +2700,18 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) } ret = -ENOMEM; - cc->io_queue = alloc_workqueue("kcryptd_io/%s", - WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, - 1, devname); + cc->io_queue = alloc_workqueue("kcryptd_io/%s", WQ_MEM_RECLAIM, 1, devname); if (!cc->io_queue) { ti->error = "Couldn't create kcryptd io queue"; goto bad; } if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags)) - cc->crypt_queue = alloc_workqueue("kcryptd/%s", - WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, + cc->crypt_queue = alloc_workqueue("kcryptd/%s", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1, devname); else cc->crypt_queue = alloc_workqueue("kcryptd/%s", - WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, + WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, num_online_cpus(), devname); if (!cc->crypt_queue) { ti->error = "Couldn't create kcryptd queue"; diff --git a/drivers/md/dm-dust.c b/drivers/md/dm-dust.c index 8288887b7f94..eb37584427a4 100644 --- a/drivers/md/dm-dust.c +++ b/drivers/md/dm-dust.c @@ -17,6 +17,7 @@ struct badblock { struct rb_node node; sector_t bb; + unsigned char wr_fail_cnt; }; struct dust_device { @@ -101,7 +102,8 @@ static int dust_remove_block(struct dust_device *dd, unsigned long long block) return 0; } -static int dust_add_block(struct 
dust_device *dd, unsigned long long block) +static int dust_add_block(struct dust_device *dd, unsigned long long block, + unsigned char wr_fail_cnt) { struct badblock *bblock; unsigned long flags; @@ -115,6 +117,7 @@ static int dust_add_block(struct dust_device *dd, unsigned long long block) spin_lock_irqsave(&dd->dust_lock, flags); bblock->bb = block; + bblock->wr_fail_cnt = wr_fail_cnt; if (!dust_rb_insert(&dd->badblocklist, bblock)) { if (!dd->quiet_mode) { DMERR("%s: block %llu already in badblocklist", @@ -126,8 +129,10 @@ static int dust_add_block(struct dust_device *dd, unsigned long long block) } dd->badblock_count++; - if (!dd->quiet_mode) - DMINFO("%s: badblock added at block %llu", __func__, block); + if (!dd->quiet_mode) { + DMINFO("%s: badblock added at block %llu with write fail count %hhu", + __func__, block, wr_fail_cnt); + } spin_unlock_irqrestore(&dd->dust_lock, flags); return 0; @@ -163,22 +168,27 @@ static int dust_map_read(struct dust_device *dd, sector_t thisblock, bool fail_read_on_bb) { unsigned long flags; - int ret = DM_MAPIO_REMAPPED; + int r = DM_MAPIO_REMAPPED; if (fail_read_on_bb) { thisblock >>= dd->sect_per_block_shift; spin_lock_irqsave(&dd->dust_lock, flags); - ret = __dust_map_read(dd, thisblock); + r = __dust_map_read(dd, thisblock); spin_unlock_irqrestore(&dd->dust_lock, flags); } - return ret; + return r; } -static void __dust_map_write(struct dust_device *dd, sector_t thisblock) +static int __dust_map_write(struct dust_device *dd, sector_t thisblock) { struct badblock *bblk = dust_rb_search(&dd->badblocklist, thisblock); + if (bblk && bblk->wr_fail_cnt > 0) { + bblk->wr_fail_cnt--; + return DM_MAPIO_KILL; + } + if (bblk) { rb_erase(&bblk->node, &dd->badblocklist); dd->badblock_count--; @@ -189,37 +199,40 @@ static void __dust_map_write(struct dust_device *dd, sector_t thisblock) (unsigned long long)thisblock); } } + + return DM_MAPIO_REMAPPED; } static int dust_map_write(struct dust_device *dd, sector_t thisblock, bool 
fail_read_on_bb) { unsigned long flags; + int ret = DM_MAPIO_REMAPPED; if (fail_read_on_bb) { thisblock >>= dd->sect_per_block_shift; spin_lock_irqsave(&dd->dust_lock, flags); - __dust_map_write(dd, thisblock); + ret = __dust_map_write(dd, thisblock); spin_unlock_irqrestore(&dd->dust_lock, flags); } - return DM_MAPIO_REMAPPED; + return ret; } static int dust_map(struct dm_target *ti, struct bio *bio) { struct dust_device *dd = ti->private; - int ret; + int r; bio_set_dev(bio, dd->dev->bdev); bio->bi_iter.bi_sector = dd->start + dm_target_offset(ti, bio->bi_iter.bi_sector); if (bio_data_dir(bio) == READ) - ret = dust_map_read(dd, bio->bi_iter.bi_sector, dd->fail_read_on_bb); + r = dust_map_read(dd, bio->bi_iter.bi_sector, dd->fail_read_on_bb); else - ret = dust_map_write(dd, bio->bi_iter.bi_sector, dd->fail_read_on_bb); + r = dust_map_write(dd, bio->bi_iter.bi_sector, dd->fail_read_on_bb); - return ret; + return r; } static bool __dust_clear_badblocks(struct rb_root *tree, @@ -375,8 +388,10 @@ static int dust_message(struct dm_target *ti, unsigned int argc, char **argv, struct dust_device *dd = ti->private; sector_t size = i_size_read(dd->dev->bdev->bd_inode) >> SECTOR_SHIFT; bool invalid_msg = false; - int result = -EINVAL; + int r = -EINVAL; unsigned long long tmp, block; + unsigned char wr_fail_cnt; + unsigned int tmp_ui; unsigned long flags; char dummy; @@ -388,45 +403,69 @@ static int dust_message(struct dm_target *ti, unsigned int argc, char **argv, } else if (!strcasecmp(argv[0], "disable")) { DMINFO("disabling read failures on bad sectors"); dd->fail_read_on_bb = false; - result = 0; + r = 0; } else if (!strcasecmp(argv[0], "enable")) { DMINFO("enabling read failures on bad sectors"); dd->fail_read_on_bb = true; - result = 0; + r = 0; } else if (!strcasecmp(argv[0], "countbadblocks")) { spin_lock_irqsave(&dd->dust_lock, flags); DMINFO("countbadblocks: %llu badblock(s) found", dd->badblock_count); spin_unlock_irqrestore(&dd->dust_lock, flags); - result = 0; + 
r = 0; } else if (!strcasecmp(argv[0], "clearbadblocks")) { - result = dust_clear_badblocks(dd); + r = dust_clear_badblocks(dd); } else if (!strcasecmp(argv[0], "quiet")) { if (!dd->quiet_mode) dd->quiet_mode = true; else dd->quiet_mode = false; - result = 0; + r = 0; } else { invalid_msg = true; } } else if (argc == 2) { if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1) - return result; + return r; block = tmp; sector_div(size, dd->sect_per_block); if (block > size) { DMERR("selected block value out of range"); - return result; + return r; } if (!strcasecmp(argv[0], "addbadblock")) - result = dust_add_block(dd, block); + r = dust_add_block(dd, block, 0); else if (!strcasecmp(argv[0], "removebadblock")) - result = dust_remove_block(dd, block); + r = dust_remove_block(dd, block); else if (!strcasecmp(argv[0], "queryblock")) - result = dust_query_block(dd, block); + r = dust_query_block(dd, block); + else + invalid_msg = true; + + } else if (argc == 3) { + if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1) + return r; + + if (sscanf(argv[2], "%u%c", &tmp_ui, &dummy) != 1) + return r; + + block = tmp; + if (tmp_ui > 255) { + DMERR("selected write fail count out of range"); + return r; + } + wr_fail_cnt = tmp_ui; + sector_div(size, dd->sect_per_block); + if (block > size) { + DMERR("selected block value out of range"); + return r; + } + + if (!strcasecmp(argv[0], "addbadblock")) + r = dust_add_block(dd, block, wr_fail_cnt); else invalid_msg = true; @@ -436,7 +475,7 @@ static int dust_message(struct dm_target *ti, unsigned int argc, char **argv, if (invalid_msg) DMERR("unrecognized message '%s' received", argv[0]); - return result; + return r; } static void dust_status(struct dm_target *ti, status_type_t type, @@ -499,12 +538,12 @@ static struct target_type dust_target = { static int __init dm_dust_init(void) { - int result = dm_register_target(&dust_target); + int r = dm_register_target(&dust_target); - if (result < 0) - DMERR("dm_register_target failed %d", result); + 
if (r < 0) + DMERR("dm_register_target failed %d", r); - return result; + return r; } static void __exit dm_dust_exit(void) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index dab4446fe7d8..b225b3e445fa 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -53,6 +53,7 @@ #define SB_VERSION_1 1 #define SB_VERSION_2 2 #define SB_VERSION_3 3 +#define SB_VERSION_4 4 #define SB_SECTORS 8 #define MAX_SECTORS_PER_BLOCK 8 @@ -73,6 +74,7 @@ struct superblock { #define SB_FLAG_HAVE_JOURNAL_MAC 0x1 #define SB_FLAG_RECALCULATING 0x2 #define SB_FLAG_DIRTY_BITMAP 0x4 +#define SB_FLAG_FIXED_PADDING 0x8 #define JOURNAL_ENTRY_ROUNDUP 8 @@ -250,6 +252,7 @@ struct dm_integrity_c { bool journal_uptodate; bool just_formatted; bool recalculate_flag; + bool fix_padding; struct alg_spec internal_hash_alg; struct alg_spec journal_crypt_alg; @@ -463,7 +466,9 @@ static void wraparound_section(struct dm_integrity_c *ic, unsigned *sec_ptr) static void sb_set_version(struct dm_integrity_c *ic) { - if (ic->mode == 'B' || ic->sb->flags & cpu_to_le32(SB_FLAG_DIRTY_BITMAP)) + if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING)) + ic->sb->version = SB_VERSION_4; + else if (ic->mode == 'B' || ic->sb->flags & cpu_to_le32(SB_FLAG_DIRTY_BITMAP)) ic->sb->version = SB_VERSION_3; else if (ic->meta_dev || ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) ic->sb->version = SB_VERSION_2; @@ -2955,6 +2960,7 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, arg_count += !!ic->internal_hash_alg.alg_string; arg_count += !!ic->journal_crypt_alg.alg_string; arg_count += !!ic->journal_mac_alg.alg_string; + arg_count += (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING)) != 0; DMEMIT("%s %llu %u %c %u", ic->dev->name, (unsigned long long)ic->start, ic->tag_size, ic->mode, arg_count); if (ic->meta_dev) @@ -2974,6 +2980,8 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, DMEMIT(" sectors_per_bit:%llu", (unsigned long 
long)ic->sectors_per_block << ic->log2_blocks_per_bitmap_bit); DMEMIT(" bitmap_flush_interval:%u", jiffies_to_msecs(ic->bitmap_flush_interval)); } + if ((ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING)) != 0) + DMEMIT(" fix_padding"); #define EMIT_ALG(a, n) \ do { \ @@ -3042,8 +3050,14 @@ static int calculate_device_limits(struct dm_integrity_c *ic) if (!ic->meta_dev) { sector_t last_sector, last_area, last_offset; - ic->metadata_run = roundup((__u64)ic->tag_size << (ic->sb->log2_interleave_sectors - ic->sb->log2_sectors_per_block), - (__u64)(1 << SECTOR_SHIFT << METADATA_PADDING_SECTORS)) >> SECTOR_SHIFT; + /* we have to maintain excessive padding for compatibility with existing volumes */ + __u64 metadata_run_padding = + ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING) ? + (__u64)(METADATA_PADDING_SECTORS << SECTOR_SHIFT) : + (__u64)(1 << SECTOR_SHIFT << METADATA_PADDING_SECTORS); + + ic->metadata_run = round_up((__u64)ic->tag_size << (ic->sb->log2_interleave_sectors - ic->sb->log2_sectors_per_block), + metadata_run_padding) >> SECTOR_SHIFT; if (!(ic->metadata_run & (ic->metadata_run - 1))) ic->log2_metadata_run = __ffs(ic->metadata_run); else @@ -3086,6 +3100,8 @@ static int initialize_superblock(struct dm_integrity_c *ic, unsigned journal_sec journal_sections = 1; if (!ic->meta_dev) { + if (ic->fix_padding) + ic->sb->flags |= cpu_to_le32(SB_FLAG_FIXED_PADDING); ic->sb->journal_sections = cpu_to_le32(journal_sections); if (!interleave_sectors) interleave_sectors = DEFAULT_INTERLEAVE_SECTORS; @@ -3725,6 +3741,8 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } else if (!strcmp(opt_string, "recalculate")) { ic->recalculate_flag = true; + } else if (!strcmp(opt_string, "fix_padding")) { + ic->fix_padding = true; } else { r = -EINVAL; ti->error = "Invalid argument"; @@ -3867,7 +3885,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) should_write_sb = true; } - if (!ic->sb->version || 
ic->sb->version > SB_VERSION_3) { + if (!ic->sb->version || ic->sb->version > SB_VERSION_4) { r = -EINVAL; ti->error = "Unknown version"; goto bad; @@ -4182,7 +4200,7 @@ static void dm_integrity_dtr(struct dm_target *ti) static struct target_type integrity_target = { .name = "integrity", - .version = {1, 3, 0}, + .version = {1, 4, 0}, .module = THIS_MODULE, .features = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY, .ctr = dm_integrity_ctr, diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index b0aa595e4375..c412eaa975fc 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -209,6 +209,7 @@ struct raid_dev { #define RT_FLAG_RS_SUSPENDED 5 #define RT_FLAG_RS_IN_SYNC 6 #define RT_FLAG_RS_RESYNCING 7 +#define RT_FLAG_RS_GROW 8 /* Array elements of 64 bit needed for rebuild/failed disk bits */ #define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8) @@ -241,6 +242,9 @@ struct raid_set { struct raid_type *raid_type; struct dm_target_callbacks callbacks; + sector_t array_sectors; + sector_t dev_sectors; + /* Optional raid4/5/6 journal device */ struct journal_dev { struct dm_dev *dev; @@ -616,7 +620,6 @@ static int raid10_format_to_md_layout(struct raid_set *rs, } else if (algorithm == ALGORITHM_RAID10_FAR) { f = copies; - r = !RAID10_OFFSET; if (!test_bit(__CTR_FLAG_RAID10_USE_NEAR_SETS, &rs->ctr_flags)) r |= RAID10_USE_FAR_SETS; @@ -1615,13 +1618,12 @@ static int _check_data_dev_sectors(struct raid_set *rs) } /* Calculate the sectors per device and per array used for @rs */ -static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev) +static int rs_set_dev_and_array_sectors(struct raid_set *rs, sector_t sectors, bool use_mddev) { int delta_disks; unsigned int data_stripes; + sector_t array_sectors = sectors, dev_sectors = sectors; struct mddev *mddev = &rs->md; - struct md_rdev *rdev; - sector_t array_sectors = rs->ti->len, dev_sectors = rs->ti->len; if (use_mddev) { delta_disks = 
mddev->delta_disks; @@ -1656,12 +1658,9 @@ static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev) /* Striped layouts */ array_sectors = (data_stripes + delta_disks) * dev_sectors; - rdev_for_each(rdev, mddev) - if (!test_bit(Journal, &rdev->flags)) - rdev->sectors = dev_sectors; - mddev->array_sectors = array_sectors; mddev->dev_sectors = dev_sectors; + rs_set_rdev_sectors(rs); return _check_data_dev_sectors(rs); bad: @@ -1670,7 +1669,7 @@ bad: } /* Setup recovery on @rs */ -static void __rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors) +static void rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors) { /* raid0 does not recover */ if (rs_is_raid0(rs)) @@ -1691,22 +1690,6 @@ static void __rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors) ? MaxSector : dev_sectors; } -/* Setup recovery on @rs based on raid type, device size and 'nosync' flag */ -static void rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors) -{ - if (!dev_sectors) - /* New raid set or 'sync' flag provided */ - __rs_setup_recovery(rs, 0); - else if (dev_sectors == MaxSector) - /* Prevent recovery */ - __rs_setup_recovery(rs, MaxSector); - else if (__rdev_sectors(rs) < dev_sectors) - /* Grown raid set */ - __rs_setup_recovery(rs, __rdev_sectors(rs)); - else - __rs_setup_recovery(rs, MaxSector); -} - static void do_table_event(struct work_struct *ws) { struct raid_set *rs = container_of(ws, struct raid_set, md.event_work); @@ -2474,7 +2457,7 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev) return -EINVAL; } - /* Enable bitmap creation for RAID levels != 0 */ + /* Enable bitmap creation on @rs unless no metadevs or raid0 or journaled raid4/5/6 set. */ mddev->bitmap_info.offset = (rt_is_raid0(rs->raid_type) || rs->journal_dev.dev) ? 
0 : to_sector(4096); mddev->bitmap_info.default_offset = mddev->bitmap_info.offset; @@ -2911,7 +2894,7 @@ static int rs_setup_reshape(struct raid_set *rs) /* Remove disk(s) */ } else if (rs->delta_disks < 0) { - r = rs_set_dev_and_array_sectors(rs, true); + r = rs_set_dev_and_array_sectors(rs, rs->ti->len, true); mddev->reshape_backwards = 1; /* removing disk(s) -> backward reshape */ /* Change layout and/or chunk size */ @@ -3008,7 +2991,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) bool resize = false; struct raid_type *rt; unsigned int num_raid_params, num_raid_devs; - sector_t calculated_dev_sectors, rdev_sectors, reshape_sectors; + sector_t sb_array_sectors, rdev_sectors, reshape_sectors; struct raid_set *rs = NULL; const char *arg; struct rs_layout rs_layout; @@ -3067,11 +3050,13 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) * * Any existing superblock will overwrite the array and device sizes */ - r = rs_set_dev_and_array_sectors(rs, false); + r = rs_set_dev_and_array_sectors(rs, rs->ti->len, false); if (r) goto bad; - calculated_dev_sectors = rs->md.dev_sectors; + /* Memorize just calculated, potentially larger sizes to grow the raid set in preresume */ + rs->array_sectors = rs->md.array_sectors; + rs->dev_sectors = rs->md.dev_sectors; /* * Backup any new raid set level, layout, ... @@ -3084,6 +3069,8 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (r) goto bad; + /* All in-core metadata now as of current superblocks after calling analyse_superblocks() */ + sb_array_sectors = rs->md.array_sectors; rdev_sectors = __rdev_sectors(rs); if (!rdev_sectors) { ti->error = "Invalid rdev size"; @@ -3093,8 +3080,11 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) reshape_sectors = _get_reshape_sectors(rs); - if (calculated_dev_sectors != rdev_sectors) - resize = calculated_dev_sectors != (reshape_sectors ? 
rdev_sectors - reshape_sectors : rdev_sectors); + if (rs->dev_sectors != rdev_sectors) { + resize = (rs->dev_sectors != rdev_sectors - reshape_sectors); + if (rs->dev_sectors > rdev_sectors - reshape_sectors) + set_bit(RT_FLAG_RS_GROW, &rs->runtime_flags); + } INIT_WORK(&rs->md.event_work, do_table_event); ti->private = rs; @@ -3121,13 +3111,8 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags); rs_set_new(rs); } else if (rs_is_recovering(rs)) { - /* Rebuild particular devices */ - if (test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags)) { - set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags); - rs_setup_recovery(rs, MaxSector); - } /* A recovering raid set may be resized */ - ; /* skip setup rs */ + goto size_check; } else if (rs_is_reshaping(rs)) { /* Have to reject size change request during reshape */ if (resize) { @@ -3171,6 +3156,9 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) rs_setup_recovery(rs, MaxSector); rs_set_new(rs); } else if (rs_reshape_requested(rs)) { + /* Only request grow on raid set size extensions, not on reshapes. */ + clear_bit(RT_FLAG_RS_GROW, &rs->runtime_flags); + /* * No need to check for 'ongoing' takeover here, because takeover * is an instant operation as oposed to an ongoing reshape. @@ -3201,13 +3189,31 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) } rs_set_cur(rs); } else { +size_check: /* May not set recovery when a device rebuild is requested */ if (test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags)) { - rs_setup_recovery(rs, MaxSector); + clear_bit(RT_FLAG_RS_GROW, &rs->runtime_flags); set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags); - } else - rs_setup_recovery(rs, test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags) ? - 0 : (resize ? calculated_dev_sectors : MaxSector)); + rs_setup_recovery(rs, MaxSector); + } else if (test_bit(RT_FLAG_RS_GROW, &rs->runtime_flags)) { + /* + * Set raid set to current size, i.e. 
size as of + * superblocks to grow to larger size in preresume. + */ + r = rs_set_dev_and_array_sectors(rs, sb_array_sectors, false); + if (r) + goto bad; + + rs_setup_recovery(rs, rs->md.recovery_cp < rs->md.dev_sectors ? rs->md.recovery_cp : rs->md.dev_sectors); + } else { + /* This is no size change or it is shrinking, update size and record in superblocks */ + r = rs_set_dev_and_array_sectors(rs, rs->ti->len, false); + if (r) + goto bad; + + if (sb_array_sectors > rs->array_sectors) + set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags); + } rs_set_cur(rs); } @@ -3406,10 +3412,9 @@ static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev) /* Helper to return resync/reshape progress for @rs and runtime flags for raid set in sync / resynching */ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, - sector_t resync_max_sectors) + enum sync_state state, sector_t resync_max_sectors) { sector_t r; - enum sync_state state; struct mddev *mddev = &rs->md; clear_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); @@ -3420,8 +3425,6 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); } else { - state = decipher_sync_action(mddev, recovery); - if (state == st_idle && !test_bit(MD_RECOVERY_INTR, &recovery)) r = mddev->recovery_cp; else @@ -3439,18 +3442,14 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, /* * In case we are recovering, the array is not in sync * and health chars should show the recovering legs. + * + * Already retrieved recovery offset from curr_resync_completed above. */ ; - else if (state == st_resync) - /* - * If "resync" is occurring, the raid set - * is or may be out of sync hence the health - * characters shall be 'a'. 
- */ - set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags); - else if (state == st_reshape) + + else if (state == st_resync || state == st_reshape) /* - * If "reshape" is occurring, the raid set + * If "resync/reshape" is occurring, the raid set * is or may be out of sync hence the health * characters shall be 'a'. */ @@ -3464,22 +3463,22 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, */ set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); - else { - struct md_rdev *rdev; - + else if (test_bit(MD_RECOVERY_NEEDED, &recovery)) /* * We are idle and recovery is needed, prevent 'A' chars race * caused by components still set to in-sync by constructor. */ - if (test_bit(MD_RECOVERY_NEEDED, &recovery)) - set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags); + set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags); + else { /* - * The raid set may be doing an initial sync, or it may - * be rebuilding individual components. If all the - * devices are In_sync, then it is the raid set that is - * being initialized. + * We are idle and the raid set may be doing an initial + * sync, or it may be rebuilding individual components. + * If all the devices are In_sync, then it is the raid set + * that is being initialized. 
*/ + struct md_rdev *rdev; + set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); rdev_for_each(rdev, mddev) if (!test_bit(Journal, &rdev->flags) && @@ -3512,7 +3511,7 @@ static void raid_status(struct dm_target *ti, status_type_t type, unsigned int rebuild_disks; unsigned int write_mostly_params = 0; sector_t progress, resync_max_sectors, resync_mismatches; - const char *sync_action; + enum sync_state state; struct raid_type *rt; switch (type) { @@ -3526,14 +3525,14 @@ static void raid_status(struct dm_target *ti, status_type_t type, /* Access most recent mddev properties for status output */ smp_rmb(); - recovery = rs->md.recovery; /* Get sensible max sectors even if raid set not yet started */ resync_max_sectors = test_bit(RT_FLAG_RS_PRERESUMED, &rs->runtime_flags) ? mddev->resync_max_sectors : mddev->dev_sectors; - progress = rs_get_progress(rs, recovery, resync_max_sectors); + recovery = rs->md.recovery; + state = decipher_sync_action(mddev, recovery); + progress = rs_get_progress(rs, recovery, state, resync_max_sectors); resync_mismatches = (mddev->last_sync_action && !strcasecmp(mddev->last_sync_action, "check")) ? atomic64_read(&mddev->resync_mismatches) : 0; - sync_action = sync_str(decipher_sync_action(&rs->md, recovery)); /* HM FIXME: do we want another state char for raid0? It shows 'D'/'A'/'-' now */ for (i = 0; i < rs->raid_disks; i++) @@ -3561,7 +3560,7 @@ static void raid_status(struct dm_target *ti, status_type_t type, * See Documentation/admin-guide/device-mapper/dm-raid.rst for * information on each of these states. 
*/ - DMEMIT(" %s", sync_action); + DMEMIT(" %s", sync_str(state)); /* * v1.5.0+: @@ -3955,11 +3954,22 @@ static int raid_preresume(struct dm_target *ti) if (r) return r; - /* Resize bitmap to adjust to changed region size (aka MD bitmap chunksize) */ - if (test_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags) && mddev->bitmap && - mddev->bitmap_info.chunksize != to_bytes(rs->requested_bitmap_chunk_sectors)) { - r = md_bitmap_resize(mddev->bitmap, mddev->dev_sectors, - to_bytes(rs->requested_bitmap_chunk_sectors), 0); + /* We are extending the raid set size, adjust mddev/md_rdev sizes and set capacity. */ + if (test_bit(RT_FLAG_RS_GROW, &rs->runtime_flags)) { + mddev->array_sectors = rs->array_sectors; + mddev->dev_sectors = rs->dev_sectors; + rs_set_rdev_sectors(rs); + rs_set_capacity(rs); + } + + /* Resize bitmap to adjust to changed region size (aka MD bitmap chunksize) or grown device size */ + if (test_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags) && mddev->bitmap && + (test_bit(RT_FLAG_RS_GROW, &rs->runtime_flags) || + (rs->requested_bitmap_chunk_sectors && + mddev->bitmap_info.chunksize != to_bytes(rs->requested_bitmap_chunk_sectors)))) { + int chunksize = to_bytes(rs->requested_bitmap_chunk_sectors) ?: mddev->bitmap_info.chunksize; + + r = md_bitmap_resize(mddev->bitmap, mddev->dev_sectors, chunksize, 0); if (r) DMERR("Failed to resize bitmap"); } @@ -3968,8 +3978,10 @@ static int raid_preresume(struct dm_target *ti) /* Be prepared for mddev_resume() in raid_resume() */ set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); if (mddev->recovery_cp && mddev->recovery_cp < MaxSector) { - set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); mddev->resync_min = mddev->recovery_cp; + if (test_bit(RT_FLAG_RS_GROW, &rs->runtime_flags)) + mddev->resync_max_sectors = mddev->dev_sectors; } /* Check for any reshape request unless new raid set */ @@ -4017,7 +4029,7 @@ static void raid_resume(struct dm_target *ti) static struct 
target_type raid_target = { .name = "raid", - .version = {1, 14, 0}, + .version = {1, 15, 0}, .module = THIS_MODULE, .ctr = raid_ctr, .dtr = raid_dtr, diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 8547d7594338..63bbcc20f49a 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -55,19 +55,6 @@ static void trigger_event(struct work_struct *work) dm_table_event(sc->ti->table); } -static inline struct stripe_c *alloc_context(unsigned int stripes) -{ - size_t len; - - if (dm_array_too_big(sizeof(struct stripe_c), sizeof(struct stripe), - stripes)) - return NULL; - - len = sizeof(struct stripe_c) + (sizeof(struct stripe) * stripes); - - return kmalloc(len, GFP_KERNEL); -} - /* * Parse a single <dev> <sector> pair */ @@ -142,7 +129,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) return -EINVAL; } - sc = alloc_context(stripes); + sc = kmalloc(struct_size(sc, stripe, stripes), GFP_KERNEL); if (!sc) { ti->error = "Memory allocation for striped context " "failed"; diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 52e049554f5c..2ae0c1913766 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -918,21 +918,15 @@ bool dm_table_supports_dax(struct dm_table *t, static bool dm_table_does_not_support_partial_completion(struct dm_table *t); -struct verify_rq_based_data { - unsigned sq_count; - unsigned mq_count; -}; - -static int device_is_rq_based(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) +static int device_is_rq_stackable(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) { - struct request_queue *q = bdev_get_queue(dev->bdev); - struct verify_rq_based_data *v = data; + struct block_device *bdev = dev->bdev; + struct request_queue *q = bdev_get_queue(bdev); - if (queue_is_mq(q)) - v->mq_count++; - else - v->sq_count++; + /* request-based cannot stack on partitions! 
*/ + if (bdev != bdev->bd_contains) + return false; return queue_is_mq(q); } @@ -941,7 +935,6 @@ static int dm_table_determine_type(struct dm_table *t) { unsigned i; unsigned bio_based = 0, request_based = 0, hybrid = 0; - struct verify_rq_based_data v = {.sq_count = 0, .mq_count = 0}; struct dm_target *tgt; struct list_head *devices = dm_table_get_devices(t); enum dm_queue_mode live_md_type = dm_get_md_type(t->md); @@ -1045,14 +1038,10 @@ verify_rq_based: /* Non-request-stackable devices can't be used for request-based dm */ if (!tgt->type->iterate_devices || - !tgt->type->iterate_devices(tgt, device_is_rq_based, &v)) { + !tgt->type->iterate_devices(tgt, device_is_rq_stackable, NULL)) { DMERR("table load rejected: including non-request-stackable devices"); return -EINVAL; } - if (v.sq_count > 0) { - DMERR("table load rejected: not all devices are blk-mq request-stackable"); - return -EINVAL; - } return 0; } diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index fcd887703f95..5a2c494cb552 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -609,13 +609,12 @@ static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master, blk_status_t error) { struct bio_list bios; - unsigned long flags; bio_list_init(&bios); - spin_lock_irqsave(&tc->lock, flags); + spin_lock_irq(&tc->lock); __merge_bio_list(&bios, master); - spin_unlock_irqrestore(&tc->lock, flags); + spin_unlock_irq(&tc->lock); error_bio_list(&bios, error); } @@ -623,15 +622,14 @@ static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master, static void requeue_deferred_cells(struct thin_c *tc) { struct pool *pool = tc->pool; - unsigned long flags; struct list_head cells; struct dm_bio_prison_cell *cell, *tmp; INIT_LIST_HEAD(&cells); - spin_lock_irqsave(&tc->lock, flags); + spin_lock_irq(&tc->lock); list_splice_init(&tc->deferred_cells, &cells); - spin_unlock_irqrestore(&tc->lock, flags); + spin_unlock_irq(&tc->lock); list_for_each_entry_safe(cell, tmp, &cells, 
user_list) cell_requeue(pool, cell); @@ -640,14 +638,13 @@ static void requeue_deferred_cells(struct thin_c *tc) static void requeue_io(struct thin_c *tc) { struct bio_list bios; - unsigned long flags; bio_list_init(&bios); - spin_lock_irqsave(&tc->lock, flags); + spin_lock_irq(&tc->lock); __merge_bio_list(&bios, &tc->deferred_bio_list); __merge_bio_list(&bios, &tc->retry_on_resume_list); - spin_unlock_irqrestore(&tc->lock, flags); + spin_unlock_irq(&tc->lock); error_bio_list(&bios, BLK_STS_DM_REQUEUE); requeue_deferred_cells(tc); @@ -756,7 +753,6 @@ static void inc_all_io_entry(struct pool *pool, struct bio *bio) static void issue(struct thin_c *tc, struct bio *bio) { struct pool *pool = tc->pool; - unsigned long flags; if (!bio_triggers_commit(tc, bio)) { generic_make_request(bio); @@ -777,9 +773,9 @@ static void issue(struct thin_c *tc, struct bio *bio) * Batch together any bios that trigger commits and then issue a * single commit for them in process_deferred_bios(). */ - spin_lock_irqsave(&pool->lock, flags); + spin_lock_irq(&pool->lock); bio_list_add(&pool->deferred_flush_bios, bio); - spin_unlock_irqrestore(&pool->lock, flags); + spin_unlock_irq(&pool->lock); } static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio) @@ -886,12 +882,15 @@ static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *c { struct pool *pool = tc->pool; unsigned long flags; + int has_work; spin_lock_irqsave(&tc->lock, flags); cell_release_no_holder(pool, cell, &tc->deferred_bio_list); + has_work = !bio_list_empty(&tc->deferred_bio_list); spin_unlock_irqrestore(&tc->lock, flags); - wake_worker(pool); + if (has_work) + wake_worker(pool); } static void thin_defer_bio(struct thin_c *tc, struct bio *bio); @@ -960,7 +959,6 @@ static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m) static void complete_overwrite_bio(struct thin_c *tc, struct bio *bio) { struct pool *pool = tc->pool; - unsigned long flags; /* * If the bio has the 
REQ_FUA flag set we must commit the metadata @@ -985,9 +983,9 @@ static void complete_overwrite_bio(struct thin_c *tc, struct bio *bio) * Batch together any bios that trigger commits and then issue a * single commit for them in process_deferred_bios(). */ - spin_lock_irqsave(&pool->lock, flags); + spin_lock_irq(&pool->lock); bio_list_add(&pool->deferred_flush_completions, bio); - spin_unlock_irqrestore(&pool->lock, flags); + spin_unlock_irq(&pool->lock); } static void process_prepared_mapping(struct dm_thin_new_mapping *m) @@ -1226,14 +1224,13 @@ static void process_prepared_discard_passdown_pt2(struct dm_thin_new_mapping *m) static void process_prepared(struct pool *pool, struct list_head *head, process_mapping_fn *fn) { - unsigned long flags; struct list_head maps; struct dm_thin_new_mapping *m, *tmp; INIT_LIST_HEAD(&maps); - spin_lock_irqsave(&pool->lock, flags); + spin_lock_irq(&pool->lock); list_splice_init(head, &maps); - spin_unlock_irqrestore(&pool->lock, flags); + spin_unlock_irq(&pool->lock); list_for_each_entry_safe(m, tmp, &maps, list) (*fn)(m); @@ -1510,14 +1507,12 @@ static int commit(struct pool *pool) static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks) { - unsigned long flags; - if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) { DMWARN("%s: reached low water mark for data device: sending event.", dm_device_name(pool->pool_md)); - spin_lock_irqsave(&pool->lock, flags); + spin_lock_irq(&pool->lock); pool->low_water_triggered = true; - spin_unlock_irqrestore(&pool->lock, flags); + spin_unlock_irq(&pool->lock); dm_table_event(pool->ti->table); } } @@ -1593,11 +1588,10 @@ static void retry_on_resume(struct bio *bio) { struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); struct thin_c *tc = h->tc; - unsigned long flags; - spin_lock_irqsave(&tc->lock, flags); + spin_lock_irq(&tc->lock); bio_list_add(&tc->retry_on_resume_list, bio); - spin_unlock_irqrestore(&tc->lock, flags); 
+ spin_unlock_irq(&tc->lock); } static blk_status_t should_error_unserviceable_bio(struct pool *pool) @@ -2170,7 +2164,6 @@ static void __sort_thin_deferred_bios(struct thin_c *tc) static void process_thin_deferred_bios(struct thin_c *tc) { struct pool *pool = tc->pool; - unsigned long flags; struct bio *bio; struct bio_list bios; struct blk_plug plug; @@ -2184,10 +2177,10 @@ static void process_thin_deferred_bios(struct thin_c *tc) bio_list_init(&bios); - spin_lock_irqsave(&tc->lock, flags); + spin_lock_irq(&tc->lock); if (bio_list_empty(&tc->deferred_bio_list)) { - spin_unlock_irqrestore(&tc->lock, flags); + spin_unlock_irq(&tc->lock); return; } @@ -2196,7 +2189,7 @@ static void process_thin_deferred_bios(struct thin_c *tc) bio_list_merge(&bios, &tc->deferred_bio_list); bio_list_init(&tc->deferred_bio_list); - spin_unlock_irqrestore(&tc->lock, flags); + spin_unlock_irq(&tc->lock); blk_start_plug(&plug); while ((bio = bio_list_pop(&bios))) { @@ -2206,10 +2199,10 @@ static void process_thin_deferred_bios(struct thin_c *tc) * prepared mappings to process. 
*/ if (ensure_next_mapping(pool)) { - spin_lock_irqsave(&tc->lock, flags); + spin_lock_irq(&tc->lock); bio_list_add(&tc->deferred_bio_list, bio); bio_list_merge(&tc->deferred_bio_list, &bios); - spin_unlock_irqrestore(&tc->lock, flags); + spin_unlock_irq(&tc->lock); break; } @@ -2264,16 +2257,15 @@ static unsigned sort_cells(struct pool *pool, struct list_head *cells) static void process_thin_deferred_cells(struct thin_c *tc) { struct pool *pool = tc->pool; - unsigned long flags; struct list_head cells; struct dm_bio_prison_cell *cell; unsigned i, j, count; INIT_LIST_HEAD(&cells); - spin_lock_irqsave(&tc->lock, flags); + spin_lock_irq(&tc->lock); list_splice_init(&tc->deferred_cells, &cells); - spin_unlock_irqrestore(&tc->lock, flags); + spin_unlock_irq(&tc->lock); if (list_empty(&cells)) return; @@ -2294,9 +2286,9 @@ static void process_thin_deferred_cells(struct thin_c *tc) for (j = i; j < count; j++) list_add(&pool->cell_sort_array[j]->user_list, &cells); - spin_lock_irqsave(&tc->lock, flags); + spin_lock_irq(&tc->lock); list_splice(&cells, &tc->deferred_cells); - spin_unlock_irqrestore(&tc->lock, flags); + spin_unlock_irq(&tc->lock); return; } @@ -2349,7 +2341,6 @@ static struct thin_c *get_next_thin(struct pool *pool, struct thin_c *tc) static void process_deferred_bios(struct pool *pool) { - unsigned long flags; struct bio *bio; struct bio_list bios, bio_completions; struct thin_c *tc; @@ -2368,13 +2359,13 @@ static void process_deferred_bios(struct pool *pool) bio_list_init(&bios); bio_list_init(&bio_completions); - spin_lock_irqsave(&pool->lock, flags); + spin_lock_irq(&pool->lock); bio_list_merge(&bios, &pool->deferred_flush_bios); bio_list_init(&pool->deferred_flush_bios); bio_list_merge(&bio_completions, &pool->deferred_flush_completions); bio_list_init(&pool->deferred_flush_completions); - spin_unlock_irqrestore(&pool->lock, flags); + spin_unlock_irq(&pool->lock); if (bio_list_empty(&bios) && bio_list_empty(&bio_completions) && 
!(dm_pool_changed_this_transaction(pool->pmd) && need_commit_due_to_time(pool))) @@ -2657,12 +2648,11 @@ static void metadata_operation_failed(struct pool *pool, const char *op, int r) */ static void thin_defer_bio(struct thin_c *tc, struct bio *bio) { - unsigned long flags; struct pool *pool = tc->pool; - spin_lock_irqsave(&tc->lock, flags); + spin_lock_irq(&tc->lock); bio_list_add(&tc->deferred_bio_list, bio); - spin_unlock_irqrestore(&tc->lock, flags); + spin_unlock_irq(&tc->lock); wake_worker(pool); } @@ -2678,13 +2668,12 @@ static void thin_defer_bio_with_throttle(struct thin_c *tc, struct bio *bio) static void thin_defer_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell) { - unsigned long flags; struct pool *pool = tc->pool; throttle_lock(&pool->throttle); - spin_lock_irqsave(&tc->lock, flags); + spin_lock_irq(&tc->lock); list_add_tail(&cell->user_list, &tc->deferred_cells); - spin_unlock_irqrestore(&tc->lock, flags); + spin_unlock_irq(&tc->lock); throttle_unlock(&pool->throttle); wake_worker(pool); @@ -2810,15 +2799,14 @@ static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits) static void requeue_bios(struct pool *pool) { - unsigned long flags; struct thin_c *tc; rcu_read_lock(); list_for_each_entry_rcu(tc, &pool->active_thins, list) { - spin_lock_irqsave(&tc->lock, flags); + spin_lock_irq(&tc->lock); bio_list_merge(&tc->deferred_bio_list, &tc->retry_on_resume_list); bio_list_init(&tc->retry_on_resume_list); - spin_unlock_irqrestore(&tc->lock, flags); + spin_unlock_irq(&tc->lock); } rcu_read_unlock(); } @@ -3412,15 +3400,14 @@ static int pool_map(struct dm_target *ti, struct bio *bio) int r; struct pool_c *pt = ti->private; struct pool *pool = pt->pool; - unsigned long flags; /* * As this is a singleton target, ti->begin is always zero. 
*/ - spin_lock_irqsave(&pool->lock, flags); + spin_lock_irq(&pool->lock); bio_set_dev(bio, pt->data_dev->bdev); r = DM_MAPIO_REMAPPED; - spin_unlock_irqrestore(&pool->lock, flags); + spin_unlock_irq(&pool->lock); return r; } @@ -3591,7 +3578,6 @@ static void pool_resume(struct dm_target *ti) { struct pool_c *pt = ti->private; struct pool *pool = pt->pool; - unsigned long flags; /* * Must requeue active_thins' bios and then resume @@ -3600,10 +3586,10 @@ static void pool_resume(struct dm_target *ti) requeue_bios(pool); pool_resume_active_thins(pool); - spin_lock_irqsave(&pool->lock, flags); + spin_lock_irq(&pool->lock); pool->low_water_triggered = false; pool->suspended = false; - spin_unlock_irqrestore(&pool->lock, flags); + spin_unlock_irq(&pool->lock); do_waker(&pool->waker.work); } @@ -3612,11 +3598,10 @@ static void pool_presuspend(struct dm_target *ti) { struct pool_c *pt = ti->private; struct pool *pool = pt->pool; - unsigned long flags; - spin_lock_irqsave(&pool->lock, flags); + spin_lock_irq(&pool->lock); pool->suspended = true; - spin_unlock_irqrestore(&pool->lock, flags); + spin_unlock_irq(&pool->lock); pool_suspend_active_thins(pool); } @@ -3625,13 +3610,12 @@ static void pool_presuspend_undo(struct dm_target *ti) { struct pool_c *pt = ti->private; struct pool *pool = pt->pool; - unsigned long flags; pool_resume_active_thins(pool); - spin_lock_irqsave(&pool->lock, flags); + spin_lock_irq(&pool->lock); pool->suspended = false; - spin_unlock_irqrestore(&pool->lock, flags); + spin_unlock_irq(&pool->lock); } static void pool_postsuspend(struct dm_target *ti) @@ -4110,11 +4094,10 @@ static void thin_put(struct thin_c *tc) static void thin_dtr(struct dm_target *ti) { struct thin_c *tc = ti->private; - unsigned long flags; - spin_lock_irqsave(&tc->pool->lock, flags); + spin_lock_irq(&tc->pool->lock); list_del_rcu(&tc->list); - spin_unlock_irqrestore(&tc->pool->lock, flags); + spin_unlock_irq(&tc->pool->lock); synchronize_rcu(); thin_put(tc); @@ -4150,7 +4133,6 
@@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) struct thin_c *tc; struct dm_dev *pool_dev, *origin_dev; struct mapped_device *pool_md; - unsigned long flags; mutex_lock(&dm_thin_pool_table.mutex); @@ -4244,9 +4226,9 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) mutex_unlock(&dm_thin_pool_table.mutex); - spin_lock_irqsave(&tc->pool->lock, flags); + spin_lock_irq(&tc->pool->lock); if (tc->pool->suspended) { - spin_unlock_irqrestore(&tc->pool->lock, flags); + spin_unlock_irq(&tc->pool->lock); mutex_lock(&dm_thin_pool_table.mutex); /* reacquire for __pool_dec */ ti->error = "Unable to activate thin device while pool is suspended"; r = -EINVAL; @@ -4255,7 +4237,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) refcount_set(&tc->refcount, 1); init_completion(&tc->can_destroy); list_add_tail_rcu(&tc->list, &tc->pool->active_thins); - spin_unlock_irqrestore(&tc->pool->lock, flags); + spin_unlock_irq(&tc->pool->lock); /* * This synchronize_rcu() call is needed here otherwise we risk a * wake_worker() call finding no bios to process (because the newly diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index d06b8aa41e26..7d727a72aa13 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -1218,7 +1218,8 @@ bio_copy: } } while (bio->bi_iter.bi_size); - if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) + if (unlikely(bio->bi_opf & REQ_FUA || + wc->uncommitted_blocks >= wc->autocommit_blocks)) writecache_flush(wc); else writecache_schedule_autocommit(wc); @@ -1561,7 +1562,7 @@ static void writecache_writeback(struct work_struct *work) { struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work); struct blk_plug plug; - struct wc_entry *f, *g, *e = NULL; + struct wc_entry *f, *uninitialized_var(g), *e = NULL; struct rb_node *node, *next_node; struct list_head skipped; struct writeback_list wbl; diff --git 
a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 069e4675da6b..22b3cb0050a7 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -554,6 +554,7 @@ static struct dmz_mblock *dmz_get_mblock(struct dmz_metadata *zmd, TASK_UNINTERRUPTIBLE); if (test_bit(DMZ_META_ERROR, &mblk->state)) { dmz_release_mblock(zmd, mblk); + dmz_check_bdev(zmd->dev); return ERR_PTR(-EIO); } @@ -625,6 +626,8 @@ static int dmz_rdwr_block(struct dmz_metadata *zmd, int op, sector_t block, ret = submit_bio_wait(bio); bio_put(bio); + if (ret) + dmz_check_bdev(zmd->dev); return ret; } @@ -691,6 +694,7 @@ static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd, TASK_UNINTERRUPTIBLE); if (test_bit(DMZ_META_ERROR, &mblk->state)) { clear_bit(DMZ_META_ERROR, &mblk->state); + dmz_check_bdev(zmd->dev); ret = -EIO; } nr_mblks_submitted--; @@ -768,7 +772,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd) /* If there are no dirty metadata blocks, just flush the device cache */ if (list_empty(&write_list)) { ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL); - goto out; + goto err; } /* @@ -778,7 +782,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd) */ ret = dmz_log_dirty_mblocks(zmd, &write_list); if (ret) - goto out; + goto err; /* * The log is on disk. 
It is now safe to update in place @@ -786,11 +790,11 @@ int dmz_flush_metadata(struct dmz_metadata *zmd) */ ret = dmz_write_dirty_mblocks(zmd, &write_list, zmd->mblk_primary); if (ret) - goto out; + goto err; ret = dmz_write_sb(zmd, zmd->mblk_primary); if (ret) - goto out; + goto err; while (!list_empty(&write_list)) { mblk = list_first_entry(&write_list, struct dmz_mblock, link); @@ -805,16 +809,20 @@ int dmz_flush_metadata(struct dmz_metadata *zmd) zmd->sb_gen++; out: - if (ret && !list_empty(&write_list)) { - spin_lock(&zmd->mblk_lock); - list_splice(&write_list, &zmd->mblk_dirty_list); - spin_unlock(&zmd->mblk_lock); - } - dmz_unlock_flush(zmd); up_write(&zmd->mblk_sem); return ret; + +err: + if (!list_empty(&write_list)) { + spin_lock(&zmd->mblk_lock); + list_splice(&write_list, &zmd->mblk_dirty_list); + spin_unlock(&zmd->mblk_lock); + } + if (!dmz_check_bdev(zmd->dev)) + ret = -EIO; + goto out; } /* @@ -1221,6 +1229,7 @@ static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone) if (ret < 0) { dmz_dev_err(zmd->dev, "Get zone %u report failed", dmz_id(zmd, zone)); + dmz_check_bdev(zmd->dev); return ret; } diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index d240d7ca8a8a..e7ace908a9b7 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -82,6 +82,7 @@ static int dmz_reclaim_align_wp(struct dmz_reclaim *zrc, struct dm_zone *zone, "Align zone %u wp %llu to %llu (wp+%u) blocks failed %d", dmz_id(zmd, zone), (unsigned long long)wp_block, (unsigned long long)block, nr_blocks, ret); + dmz_check_bdev(zrc->dev); return ret; } @@ -489,12 +490,7 @@ static void dmz_reclaim_work(struct work_struct *work) ret = dmz_do_reclaim(zrc); if (ret) { dmz_dev_debug(zrc->dev, "Reclaim error %d\n", ret); - if (ret == -EIO) - /* - * LLD might be performing some error handling sequence - * at the underlying device. To not interfere, do not - * attempt to schedule the next reclaim run immediately. 
- */ + if (!dmz_check_bdev(zrc->dev)) return; } diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index d3bcc4197f5d..4574e0dedbd6 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -80,6 +80,8 @@ static inline void dmz_bio_endio(struct bio *bio, blk_status_t status) if (status != BLK_STS_OK && bio->bi_status == BLK_STS_OK) bio->bi_status = status; + if (bio->bi_status != BLK_STS_OK) + bioctx->target->dev->flags |= DMZ_CHECK_BDEV; if (refcount_dec_and_test(&bioctx->ref)) { struct dm_zone *zone = bioctx->zone; @@ -565,32 +567,52 @@ out: } /* - * Check the backing device availability. If it's on the way out, + * Check if the backing device is being removed. If it's on the way out, * start failing I/O. Reclaim and metadata components also call this * function to cleanly abort operation in the event of such failure. */ bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev) { - struct gendisk *disk; + if (dmz_dev->flags & DMZ_BDEV_DYING) + return true; - if (!(dmz_dev->flags & DMZ_BDEV_DYING)) { - disk = dmz_dev->bdev->bd_disk; - if (blk_queue_dying(bdev_get_queue(dmz_dev->bdev))) { - dmz_dev_warn(dmz_dev, "Backing device queue dying"); - dmz_dev->flags |= DMZ_BDEV_DYING; - } else if (disk->fops->check_events) { - if (disk->fops->check_events(disk, 0) & - DISK_EVENT_MEDIA_CHANGE) { - dmz_dev_warn(dmz_dev, "Backing device offline"); - dmz_dev->flags |= DMZ_BDEV_DYING; - } - } + if (dmz_dev->flags & DMZ_CHECK_BDEV) + return !dmz_check_bdev(dmz_dev); + + if (blk_queue_dying(bdev_get_queue(dmz_dev->bdev))) { + dmz_dev_warn(dmz_dev, "Backing device queue dying"); + dmz_dev->flags |= DMZ_BDEV_DYING; } return dmz_dev->flags & DMZ_BDEV_DYING; } /* + * Check the backing device availability. This detects such events as + * backing device going offline due to errors, media removals, etc. + * This check is less efficient than dmz_bdev_is_dying() and should + * only be performed as a part of error handling. 
+ */ +bool dmz_check_bdev(struct dmz_dev *dmz_dev) +{ + struct gendisk *disk; + + dmz_dev->flags &= ~DMZ_CHECK_BDEV; + + if (dmz_bdev_is_dying(dmz_dev)) + return false; + + disk = dmz_dev->bdev->bd_disk; + if (disk->fops->check_events && + disk->fops->check_events(disk, 0) & DISK_EVENT_MEDIA_CHANGE) { + dmz_dev_warn(dmz_dev, "Backing device offline"); + dmz_dev->flags |= DMZ_BDEV_DYING; + } + + return !(dmz_dev->flags & DMZ_BDEV_DYING); +} + +/* * Process a new BIO. */ static int dmz_map(struct dm_target *ti, struct bio *bio) @@ -902,8 +924,8 @@ static int dmz_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) { struct dmz_target *dmz = ti->private; - if (dmz_bdev_is_dying(dmz->dev)) - return -ENODEV; + if (!dmz_check_bdev(dmz->dev)) + return -EIO; *bdev = dmz->dev->bdev; diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h index d8e70b0ade35..5b5e493d479c 100644 --- a/drivers/md/dm-zoned.h +++ b/drivers/md/dm-zoned.h @@ -72,6 +72,7 @@ struct dmz_dev { /* Device flags. */ #define DMZ_BDEV_DYING (1 << 0) +#define DMZ_CHECK_BDEV (2 << 0) /* * Zone descriptor. @@ -255,5 +256,6 @@ void dmz_schedule_reclaim(struct dmz_reclaim *zrc); * Functions defined in dm-zoned-target.c */ bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev); +bool dmz_check_bdev(struct dmz_dev *dmz_dev); #endif /* DM_ZONED_H */ diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index a164cc81b710..475668c69dbc 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -608,9 +608,6 @@ void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size); */ #define dm_round_up(n, sz) (dm_div_up((n), (sz)) * (sz)) -#define dm_array_too_big(fixed, obj, num) \ - ((num) > (UINT_MAX - (fixed)) / (obj)) - /* * Sector offset taken relative to the start of the target instead of * relative to the start of the device. |