diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2021-06-30 18:19:39 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2021-06-30 18:19:39 -0700 |
commit | 2cfa582be80081fb8db02d4d9b44bff34b82ac54 (patch) | |
tree | 2faf8db8426b389ca8c9ed76065c688431bb7eb9 | |
parent | dbe69e43372212527abf48609aba7fc39a6daa27 (diff) | |
parent | 5c0de3d72f8c05678ed769bea24e98128f7ab570 (diff) | |
download | lwn-2cfa582be80081fb8db02d4d9b44bff34b82ac54.tar.gz lwn-2cfa582be80081fb8db02d4d9b44bff34b82ac54.zip |
Merge tag 'for-5.14/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
Pull device mapper updates from Mike Snitzer:
- Various DM persistent-data library improvements and fixes that
benefit both the DM thinp and cache targets.
- A few small DM kcopyd efficiency improvements.
- Significant zoned related block core, DM core and DM zoned target
changes that culminate with adding zoned append emulation (which is
required to properly fix DM crypt's zoned support).
- Various DM writecache target changes that improve efficiency. Adds an
optional "metadata_only" feature that only promotes bios flagged with
REQ_META. But the most significant improvement is writecache's
ability to pause writeback, for a confiurable time, if/when the
working set is larger than the cache (and the cache is full) -- this
ensures performance is no worse than the slower origin device.
* tag 'for-5.14/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (35 commits)
dm writecache: make writeback pause configurable
dm writecache: pause writeback if cache full and origin being written directly
dm io tracker: factor out IO tracker
dm btree remove: assign new_root only when removal succeeds
dm zone: fix dm_revalidate_zones() memory allocation
dm ps io affinity: remove redundant continue statement
dm writecache: add optional "metadata_only" parameter
dm writecache: add "cleaner" and "max_age" to Documentation
dm writecache: write at least 4k when committing
dm writecache: flush origin device when writing and cache is full
dm writecache: have ssd writeback wait if the kcopyd workqueue is busy
dm writecache: use list_move instead of list_del/list_add in writecache_writeback()
dm writecache: commit just one block, not a full page
dm writecache: remove unused gfp_t argument from wc_add_block()
dm crypt: Fix zoned block device support
dm: introduce zone append emulation
dm: rearrange core declarations for extended use from dm-zone.c
block: introduce BIO_ZONE_WRITE_LOCKED bio flag
block: introduce bio zone helpers
block: improve handling of all zones reset operation
...
38 files changed, 2548 insertions, 622 deletions
diff --git a/Documentation/admin-guide/device-mapper/writecache.rst b/Documentation/admin-guide/device-mapper/writecache.rst index dce0184e07ca..65427d8dfca6 100644 --- a/Documentation/admin-guide/device-mapper/writecache.rst +++ b/Documentation/admin-guide/device-mapper/writecache.rst @@ -12,7 +12,6 @@ first sector should contain valid superblock from previous invocation. Constructor parameters: 1. type of the cache device - "p" or "s" - - p - persistent memory - s - SSD 2. the underlying device that will be cached @@ -21,7 +20,6 @@ Constructor parameters: size) 5. the number of optional parameters (the parameters with an argument count as two) - start_sector n (default: 0) offset from the start of cache device in 512-byte sectors high_watermark n (default: 50) @@ -53,6 +51,27 @@ Constructor parameters: - some underlying devices perform better with fua, some with nofua. The user should test it + cleaner + when this option is activated (either in the constructor + arguments or by a message), the cache will not promote + new writes (however, writes to already cached blocks are + promoted, to avoid data corruption due to misordered + writes) and it will gradually writeback any cached + data. The userspace can then monitor the cleaning + process with "dmsetup status". When the number of cached + blocks drops to zero, userspace can unload the + dm-writecache target and replace it with dm-linear or + other targets. + max_age n + specifies the maximum age of a block in milliseconds. If + a block is stored in the cache for too long, it will be + written to the underlying device and cleaned up. + metadata_only + only metadata is promoted to the cache. This option + improves performance for heavier REQ_META workloads. + pause_writeback n (default: 3000) + pause writeback if there was some write I/O redirected to + the origin volume in the last n milliseconds Status: 1. error indicator - 0 if there was no error, otherwise error number @@ -77,3 +96,5 @@ Messages: 5. resume the device, so that it will use the linear target 6. the cache device is now inactive and it can be deleted + cleaner + See above "cleaner" constructor documentation. diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 250cb76ee615..86fce751bb17 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -161,18 +161,89 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector, } EXPORT_SYMBOL_GPL(blkdev_report_zones); -static inline bool blkdev_allow_reset_all_zones(struct block_device *bdev, - sector_t sector, - sector_t nr_sectors) +static inline unsigned long *blk_alloc_zone_bitmap(int node, + unsigned int nr_zones) { - if (!blk_queue_zone_resetall(bdev_get_queue(bdev))) - return false; + return kcalloc_node(BITS_TO_LONGS(nr_zones), sizeof(unsigned long), + GFP_NOIO, node); +} +static int blk_zone_need_reset_cb(struct blk_zone *zone, unsigned int idx, + void *data) +{ /* - * REQ_OP_ZONE_RESET_ALL can be executed only if the number of sectors - * of the applicable zone range is the entire disk. + * For an all-zones reset, ignore conventional, empty, read-only + * and offline zones. */ - return !sector && nr_sectors == get_capacity(bdev->bd_disk); + switch (zone->cond) { + case BLK_ZONE_COND_NOT_WP: + case BLK_ZONE_COND_EMPTY: + case BLK_ZONE_COND_READONLY: + case BLK_ZONE_COND_OFFLINE: + return 0; + default: + set_bit(idx, (unsigned long *)data); + return 0; + } +} + +static int blkdev_zone_reset_all_emulated(struct block_device *bdev, + gfp_t gfp_mask) +{ + struct request_queue *q = bdev_get_queue(bdev); + sector_t capacity = get_capacity(bdev->bd_disk); + sector_t zone_sectors = blk_queue_zone_sectors(q); + unsigned long *need_reset; + struct bio *bio = NULL; + sector_t sector = 0; + int ret; + + need_reset = blk_alloc_zone_bitmap(q->node, q->nr_zones); + if (!need_reset) + return -ENOMEM; + + ret = bdev->bd_disk->fops->report_zones(bdev->bd_disk, 0, + q->nr_zones, blk_zone_need_reset_cb, + need_reset); + if (ret < 0) + goto out_free_need_reset; + + ret = 0; + while (sector < capacity) { + if (!test_bit(blk_queue_zone_no(q, sector), need_reset)) { + sector += zone_sectors; + continue; + } + + bio = blk_next_bio(bio, 0, gfp_mask); + bio_set_dev(bio, bdev); + bio->bi_opf = REQ_OP_ZONE_RESET | REQ_SYNC; + bio->bi_iter.bi_sector = sector; + sector += zone_sectors; + + /* This may take a while, so be nice to others */ + cond_resched(); + } + + if (bio) { + ret = submit_bio_wait(bio); + bio_put(bio); + } + +out_free_need_reset: + kfree(need_reset); + return ret; +} + +static int blkdev_zone_reset_all(struct block_device *bdev, gfp_t gfp_mask) +{ + struct bio bio; + + bio_init(&bio, NULL, 0); + bio_set_dev(&bio, bdev); + bio.bi_opf = REQ_OP_ZONE_RESET_ALL | REQ_SYNC; + + return submit_bio_wait(&bio); } /** @@ -200,7 +271,7 @@ int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op, sector_t capacity = get_capacity(bdev->bd_disk); sector_t end_sector = sector + nr_sectors; struct bio *bio = NULL; - int ret; + int ret = 0; if (!blk_queue_is_zoned(q)) return -EOPNOTSUPP; @@ -222,20 +293,21 @@ int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op, if ((nr_sectors & (zone_sectors - 1)) && end_sector != capacity) return -EINVAL; + /* + * In the case of a zone reset operation over all zones, + * REQ_OP_ZONE_RESET_ALL can be used with devices supporting this + * command. For other devices, we emulate this command behavior by + * identifying the zones needing a reset. + */ + if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity) { + if (!blk_queue_zone_resetall(q)) + return blkdev_zone_reset_all_emulated(bdev, gfp_mask); + return blkdev_zone_reset_all(bdev, gfp_mask); + } + while (sector < end_sector) { bio = blk_next_bio(bio, 0, gfp_mask); bio_set_dev(bio, bdev); - - /* - * Special case for the zone reset operation that reset all - * zones, this is useful for applications like mkfs. - */ - if (op == REQ_OP_ZONE_RESET && - blkdev_allow_reset_all_zones(bdev, sector, nr_sectors)) { - bio->bi_opf = REQ_OP_ZONE_RESET_ALL | REQ_SYNC; - break; - } - bio->bi_opf = op | REQ_SYNC; bio->bi_iter.bi_sector = sector; sector += zone_sectors; @@ -396,13 +468,6 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode, return ret; } -static inline unsigned long *blk_alloc_zone_bitmap(int node, - unsigned int nr_zones) -{ - return kcalloc_node(BITS_TO_LONGS(nr_zones), sizeof(unsigned long), - GFP_NOIO, node); -} - void blk_queue_free_zone_bitmaps(struct request_queue *q) { kfree(q->conv_zones_bitmap); diff --git a/drivers/md/Makefile b/drivers/md/Makefile index ef7ddc27685c..a74aaf8b1445 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -92,6 +92,10 @@ ifeq ($(CONFIG_DM_UEVENT),y) dm-mod-objs += dm-uevent.o endif +ifeq ($(CONFIG_BLK_DEV_ZONED),y) +dm-mod-objs += dm-zone.o +endif + ifeq ($(CONFIG_DM_VERITY_FEC),y) dm-verity-objs += dm-verity-fec.o endif diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 6ab01ff25747..8e4ced5a2516 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -8,6 +8,7 @@ #include "dm-bio-prison-v2.h" #include "dm-bio-record.h" #include "dm-cache-metadata.h" +#include "dm-io-tracker.h" #include <linux/dm-io.h> #include <linux/dm-kcopyd.h> @@ -39,77 +40,6 @@ DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle, /*----------------------------------------------------------------*/ -struct io_tracker { - spinlock_t lock; - - /* - * Sectors of in-flight IO. - */ - sector_t in_flight; - - /* - * The time, in jiffies, when this device became idle (if it is - * indeed idle). - */ - unsigned long idle_time; - unsigned long last_update_time; -}; - -static void iot_init(struct io_tracker *iot) -{ - spin_lock_init(&iot->lock); - iot->in_flight = 0ul; - iot->idle_time = 0ul; - iot->last_update_time = jiffies; -} - -static bool __iot_idle_for(struct io_tracker *iot, unsigned long jifs) -{ - if (iot->in_flight) - return false; - - return time_after(jiffies, iot->idle_time + jifs); -} - -static bool iot_idle_for(struct io_tracker *iot, unsigned long jifs) -{ - bool r; - - spin_lock_irq(&iot->lock); - r = __iot_idle_for(iot, jifs); - spin_unlock_irq(&iot->lock); - - return r; -} - -static void iot_io_begin(struct io_tracker *iot, sector_t len) -{ - spin_lock_irq(&iot->lock); - iot->in_flight += len; - spin_unlock_irq(&iot->lock); -} - -static void __iot_io_end(struct io_tracker *iot, sector_t len) -{ - if (!len) - return; - - iot->in_flight -= len; - if (!iot->in_flight) - iot->idle_time = jiffies; -} - -static void iot_io_end(struct io_tracker *iot, sector_t len) -{ - unsigned long flags; - - spin_lock_irqsave(&iot->lock, flags); - __iot_io_end(iot, len); - spin_unlock_irqrestore(&iot->lock, flags); -} - -/*----------------------------------------------------------------*/ - /* * Represents a chunk of future work. 'input' allows continuations to pass * values between themselves, typically error values. @@ -470,7 +400,7 @@ struct cache { struct batcher committer; struct work_struct commit_ws; - struct io_tracker tracker; + struct dm_io_tracker tracker; mempool_t migration_pool; @@ -866,7 +796,7 @@ static void accounted_begin(struct cache *cache, struct bio *bio) if (accountable_bio(cache, bio)) { pb = get_per_bio_data(bio); pb->len = bio_sectors(bio); - iot_io_begin(&cache->tracker, pb->len); + dm_iot_io_begin(&cache->tracker, pb->len); } } @@ -874,7 +804,7 @@ static void accounted_complete(struct cache *cache, struct bio *bio) { struct per_bio_data *pb = get_per_bio_data(bio); - iot_io_end(&cache->tracker, pb->len); + dm_iot_io_end(&cache->tracker, pb->len); } static void accounted_request(struct cache *cache, struct bio *bio) @@ -1642,7 +1572,7 @@ enum busy { static enum busy spare_migration_bandwidth(struct cache *cache) { - bool idle = iot_idle_for(&cache->tracker, HZ); + bool idle = dm_iot_idle_for(&cache->tracker, HZ); sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) * cache->sectors_per_block; @@ -2603,7 +2533,7 @@ static int cache_create(struct cache_args *ca, struct cache **result) batcher_init(&cache->committer, commit_op, cache, issue_op, cache, cache->wq); - iot_init(&cache->tracker); + dm_iot_init(&cache->tracker); init_rwsem(&cache->background_work_lock); prevent_background_work(cache); diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 5953ff2bd260..edc1553c4eea 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -114,8 +114,27 @@ struct mapped_device { bool init_tio_pdu:1; struct srcu_struct io_barrier; + +#ifdef CONFIG_BLK_DEV_ZONED + unsigned int nr_zones; + unsigned int *zwp_offset; +#endif }; +/* + * Bits for the flags field of struct mapped_device. + */ +#define DMF_BLOCK_IO_FOR_SUSPEND 0 +#define DMF_SUSPENDED 1 +#define DMF_FROZEN 2 +#define DMF_FREEING 3 +#define DMF_DELETING 4 +#define DMF_NOFLUSH_SUSPENDING 5 +#define DMF_DEFERRED_REMOVE 6 +#define DMF_SUSPENDED_INTERNALLY 7 +#define DMF_POST_SUSPENDING 8 +#define DMF_EMULATE_ZONE_APPEND 9 + void disable_discard(struct mapped_device *md); void disable_write_same(struct mapped_device *md); void disable_write_zeroes(struct mapped_device *md); @@ -130,6 +149,13 @@ static inline struct dm_stats *dm_get_stats(struct mapped_device *md) return &md->stats; } +static inline bool dm_emulate_zone_append(struct mapped_device *md) +{ + if (blk_queue_is_zoned(md->queue)) + return test_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); + return false; +} + #define DM_TABLE_MAX_DEPTH 16 struct dm_table { @@ -173,6 +199,45 @@ struct dm_table { #endif }; +/* + * One of these is allocated per clone bio. + */ +#define DM_TIO_MAGIC 7282014 +struct dm_target_io { + unsigned int magic; + struct dm_io *io; + struct dm_target *ti; + unsigned int target_bio_nr; + unsigned int *len_ptr; + bool inside_dm_io; + struct bio clone; +}; + +/* + * One of these is allocated per original bio. + * It contains the first clone used for that original. + */ +#define DM_IO_MAGIC 5191977 +struct dm_io { + unsigned int magic; + struct mapped_device *md; + blk_status_t status; + atomic_t io_count; + struct bio *orig_bio; + unsigned long start_time; + spinlock_t endio_lock; + struct dm_stats_aux stats_aux; + /* last member of dm_target_io is 'struct bio' */ + struct dm_target_io tio; +}; + +static inline void dm_io_inc_pending(struct dm_io *io) +{ + atomic_inc(&io->io_count); +} + +void dm_io_dec_pending(struct dm_io *io, blk_status_t error); + static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj) { return &container_of(kobj, struct dm_kobject_holder, kobj)->completion; diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index b0ab080f2567..50f4cbd600d5 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -3138,11 +3138,10 @@ static int crypt_report_zones(struct dm_target *ti, struct dm_report_zones_args *args, unsigned int nr_zones) { struct crypt_config *cc = ti->private; - sector_t sector = cc->start + dm_target_offset(ti, args->next_sector); - args->start = cc->start; - return blkdev_report_zones(cc->dev->bdev, sector, nr_zones, - dm_report_zones_cb, args); + return dm_report_zones(cc->dev->bdev, cc->start, + cc->start + dm_target_offset(ti, args->next_sector), + args, nr_zones); } #else #define crypt_report_zones NULL @@ -3281,14 +3280,28 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) } cc->start = tmpll; - /* - * For zoned block devices, we need to preserve the issuer write - * ordering. To do so, disable write workqueues and force inline - * encryption completion. - */ if (bdev_is_zoned(cc->dev->bdev)) { + /* + * For zoned block devices, we need to preserve the issuer write + * ordering. To do so, disable write workqueues and force inline + * encryption completion. + */ set_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags); set_bit(DM_CRYPT_WRITE_INLINE, &cc->flags); + + /* + * All zone append writes to a zone of a zoned block device will + * have the same BIO sector, the start of the zone. When the + * cypher IV mode uses sector values, all data targeting a + * zone will be encrypted using the first sector numbers of the + * zone. This will not result in write errors but will + * cause most reads to fail as reads will use the sector values + * for the actual data locations, resulting in IV mismatch. + * To avoid this problem, ask DM core to emulate zone append + * operations with regular writes. + */ + DMDEBUG("Zone append operations will be emulated"); + ti->emulate_zone_append = true; } if (crypt_integrity_aead(cc) || cc->integrity_iv_size) { diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c index d9ac7372108c..3b748393fca5 100644 --- a/drivers/md/dm-era-target.c +++ b/drivers/md/dm-era-target.c @@ -363,28 +363,32 @@ static void ws_unpack(const struct writeset_disk *disk, struct writeset_metadata core->root = le64_to_cpu(disk->root); } -static void ws_inc(void *context, const void *value) +static void ws_inc(void *context, const void *value, unsigned count) { struct era_metadata *md = context; struct writeset_disk ws_d; dm_block_t b; + unsigned i; - memcpy(&ws_d, value, sizeof(ws_d)); - b = le64_to_cpu(ws_d.root); - - dm_tm_inc(md->tm, b); + for (i = 0; i < count; i++) { + memcpy(&ws_d, value + (i * sizeof(ws_d)), sizeof(ws_d)); + b = le64_to_cpu(ws_d.root); + dm_tm_inc(md->tm, b); + } } -static void ws_dec(void *context, const void *value) +static void ws_dec(void *context, const void *value, unsigned count) { struct era_metadata *md = context; struct writeset_disk ws_d; dm_block_t b; + unsigned i; - memcpy(&ws_d, value, sizeof(ws_d)); - b = le64_to_cpu(ws_d.root); - - dm_bitset_del(&md->bitset_info, b); + for (i = 0; i < count; i++) { + memcpy(&ws_d, value + (i * sizeof(ws_d)), sizeof(ws_d)); + b = le64_to_cpu(ws_d.root); + dm_bitset_del(&md->bitset_info, b); + } } static int ws_eq(void *context, const void *value1, const void *value2) diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index b7fee9936f05..5877220c01ed 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -463,11 +463,10 @@ static int flakey_report_zones(struct dm_target *ti, struct dm_report_zones_args *args, unsigned int nr_zones) { struct flakey_c *fc = ti->private; - sector_t sector = flakey_map_sector(ti, args->next_sector); - args->start = fc->start; - return blkdev_report_zones(fc->dev->bdev, sector, nr_zones, - dm_report_zones_cb, args); + return dm_report_zones(fc->dev->bdev, fc->start, + flakey_map_sector(ti, args->next_sector), + args, nr_zones); } #else #define flakey_report_zones NULL diff --git a/drivers/md/dm-io-tracker.h b/drivers/md/dm-io-tracker.h new file mode 100644 index 000000000000..bdcc6273ebf0 --- /dev/null +++ b/drivers/md/dm-io-tracker.h @@ -0,0 +1,81 @@ +/* + * Copyright (C) 2021 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. + */ + +#ifndef DM_IO_TRACKER_H +#define DM_IO_TRACKER_H + +#include <linux/jiffies.h> + +struct dm_io_tracker { + spinlock_t lock; + + /* + * Sectors of in-flight IO. + */ + sector_t in_flight; + + /* + * The time, in jiffies, when this device became idle + * (if it is indeed idle). + */ + unsigned long idle_time; + unsigned long last_update_time; +}; + +static inline void dm_iot_init(struct dm_io_tracker *iot) +{ + spin_lock_init(&iot->lock); + iot->in_flight = 0ul; + iot->idle_time = 0ul; + iot->last_update_time = jiffies; +} + +static inline bool dm_iot_idle_for(struct dm_io_tracker *iot, unsigned long j) +{ + bool r = false; + + spin_lock_irq(&iot->lock); + if (!iot->in_flight) + r = time_after(jiffies, iot->idle_time + j); + spin_unlock_irq(&iot->lock); + + return r; +} + +static inline unsigned long dm_iot_idle_time(struct dm_io_tracker *iot) +{ + unsigned long r = 0; + + spin_lock_irq(&iot->lock); + if (!iot->in_flight) + r = jiffies - iot->idle_time; + spin_unlock_irq(&iot->lock); + + return r; +} + +static inline void dm_iot_io_begin(struct dm_io_tracker *iot, sector_t len) +{ + spin_lock_irq(&iot->lock); + iot->in_flight += len; + spin_unlock_irq(&iot->lock); +} + +static inline void dm_iot_io_end(struct dm_io_tracker *iot, sector_t len) +{ + unsigned long flags; + + if (!len) + return; + + spin_lock_irqsave(&iot->lock, flags); + iot->in_flight -= len; + if (!iot->in_flight) + iot->idle_time = jiffies; + spin_unlock_irqrestore(&iot->lock, flags); +} + +#endif diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index 1bbe4a34ef4c..37b03ab7e5c9 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -341,7 +341,7 @@ static void client_free_pages(struct dm_kcopyd_client *kc) struct kcopyd_job { struct dm_kcopyd_client *kc; struct list_head list; - unsigned long flags; + unsigned flags; /* * Error state of the job. @@ -418,7 +418,7 @@ static struct kcopyd_job *pop_io_job(struct list_head *jobs, * constraint and sequential writes that are at the right position. */ list_for_each_entry(job, jobs, list) { - if (job->rw == READ || !test_bit(DM_KCOPYD_WRITE_SEQ, &job->flags)) { + if (job->rw == READ || !(job->flags & BIT(DM_KCOPYD_WRITE_SEQ))) { list_del(&job->list); return job; } @@ -437,9 +437,8 @@ static struct kcopyd_job *pop(struct list_head *jobs, struct dm_kcopyd_client *kc) { struct kcopyd_job *job = NULL; - unsigned long flags; - spin_lock_irqsave(&kc->job_lock, flags); + spin_lock_irq(&kc->job_lock); if (!list_empty(jobs)) { if (jobs == &kc->io_jobs) @@ -449,7 +448,7 @@ static struct kcopyd_job *pop(struct list_head *jobs, list_del(&job->list); } } - spin_unlock_irqrestore(&kc->job_lock, flags); + spin_unlock_irq(&kc->job_lock); return job; } @@ -467,12 +466,11 @@ static void push(struct list_head *jobs, struct kcopyd_job *job) static void push_head(struct list_head *jobs, struct kcopyd_job *job) { - unsigned long flags; struct dm_kcopyd_client *kc = job->kc; - spin_lock_irqsave(&kc->job_lock, flags); + spin_lock_irq(&kc->job_lock); list_add(&job->list, jobs); - spin_unlock_irqrestore(&kc->job_lock, flags); + spin_unlock_irq(&kc->job_lock); } /* @@ -525,7 +523,7 @@ static void complete_io(unsigned long error, void *context) else job->read_err = 1; - if (!test_bit(DM_KCOPYD_IGNORE_ERROR, &job->flags)) { + if (!(job->flags & BIT(DM_KCOPYD_IGNORE_ERROR))) { push(&kc->complete_jobs, job); wake(kc); return; @@ -565,7 +563,7 @@ static int run_io_job(struct kcopyd_job *job) * If we need to write sequentially and some reads or writes failed, * no point in continuing. */ - if (test_bit(DM_KCOPYD_WRITE_SEQ, &job->flags) && + if (job->flags & BIT(DM_KCOPYD_WRITE_SEQ) && job->master_job->write_err) { job->write_err = job->master_job->write_err; return -EIO; @@ -648,7 +646,6 @@ static void do_work(struct work_struct *work) struct dm_kcopyd_client *kc = container_of(work, struct dm_kcopyd_client, kcopyd_work); struct blk_plug plug; - unsigned long flags; /* * The order that these are called is *very* important. @@ -657,9 +654,9 @@ static void do_work(struct work_struct *work) * list. io jobs call wake when they complete and it all * starts again. */ - spin_lock_irqsave(&kc->job_lock, flags); + spin_lock_irq(&kc->job_lock); list_splice_tail_init(&kc->callback_jobs, &kc->complete_jobs); - spin_unlock_irqrestore(&kc->job_lock, flags); + spin_unlock_irq(&kc->job_lock); blk_start_plug(&plug); process_jobs(&kc->complete_jobs, kc, run_complete_job); @@ -709,7 +706,7 @@ static void segment_complete(int read_err, unsigned long write_err, * Only dispatch more work if there hasn't been an error. */ if ((!job->read_err && !job->write_err) || - test_bit(DM_KCOPYD_IGNORE_ERROR, &job->flags)) { + job->flags & BIT(DM_KCOPYD_IGNORE_ERROR)) { /* get the next chunk of work */ progress = job->progress; count = job->source.count - progress; @@ -801,10 +798,10 @@ void dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, * we need to write sequentially. If one of the destination is a * host-aware device, then leave it to the caller to choose what to do. */ - if (!test_bit(DM_KCOPYD_WRITE_SEQ, &job->flags)) { + if (!(job->flags & BIT(DM_KCOPYD_WRITE_SEQ))) { for (i = 0; i < job->num_dests; i++) { if (bdev_zoned_model(dests[i].bdev) == BLK_ZONED_HM) { - set_bit(DM_KCOPYD_WRITE_SEQ, &job->flags); + job->flags |= BIT(DM_KCOPYD_WRITE_SEQ); break; } } @@ -813,9 +810,9 @@ void dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, /* * If we need to write sequentially, errors cannot be ignored. */ - if (test_bit(DM_KCOPYD_WRITE_SEQ, &job->flags) && - test_bit(DM_KCOPYD_IGNORE_ERROR, &job->flags)) - clear_bit(DM_KCOPYD_IGNORE_ERROR, &job->flags); + if (job->flags & BIT(DM_KCOPYD_WRITE_SEQ) && + job->flags & BIT(DM_KCOPYD_IGNORE_ERROR)) + job->flags &= ~BIT(DM_KCOPYD_IGNORE_ERROR); if (from) { job->source = *from; @@ -983,3 +980,9 @@ void dm_kcopyd_client_destroy(struct dm_kcopyd_client *kc) kfree(kc); } EXPORT_SYMBOL(dm_kcopyd_client_destroy); + +void dm_kcopyd_client_flush(struct dm_kcopyd_client *kc) +{ + flush_workqueue(kc->kcopyd_wq); +} +EXPORT_SYMBOL(dm_kcopyd_client_flush); diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 92db0f5e7f28..c91f1e2e2f65 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -140,11 +140,10 @@ static int linear_report_zones(struct dm_target *ti, struct dm_report_zones_args *args, unsigned int nr_zones) { struct linear_c *lc = ti->private; - sector_t sector = linear_map_sector(ti, args->next_sector); - args->start = lc->start; - return blkdev_report_zones(lc->dev->bdev, sector, nr_zones, - dm_report_zones_cb, args); + return dm_report_zones(lc->dev->bdev, lc->start, + linear_map_sector(ti, args->next_sector), + args, nr_zones); } #else #define linear_report_zones NULL diff --git a/drivers/md/dm-ps-io-affinity.c b/drivers/md/dm-ps-io-affinity.c index 077655cd4fae..cb8e83bfb1a7 100644 --- a/drivers/md/dm-ps-io-affinity.c +++ b/drivers/md/dm-ps-io-affinity.c @@ -91,7 +91,6 @@ static int ioa_add_path(struct path_selector *ps, struct dm_path *path, cpumask_set_cpu(cpu, s->path_mask); s->path_map[cpu] = pi; refcount_inc(&pi->refcount); - continue; } if (refcount_dec_and_test(&pi->refcount)) { diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index b0a82f29a2e4..ebb4810cc3b4 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -364,7 +364,7 @@ static void recover(struct mirror_set *ms, struct dm_region *reg) /* hand to kcopyd */ if (!errors_handled(ms)) - set_bit(DM_KCOPYD_IGNORE_ERROR, &flags); + flags |= BIT(DM_KCOPYD_IGNORE_ERROR); dm_kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to, flags, recovery_complete, reg); diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index ee47a332b462..0543cdf89e92 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -249,7 +249,7 @@ static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev, * If the target is mapped to zoned block device(s), check * that the zones are not partially mapped. */ - if (bdev_zoned_model(bdev) != BLK_ZONED_NONE) { + if (bdev_is_zoned(bdev)) { unsigned int zone_sectors = bdev_zone_sectors(bdev); if (start & (zone_sectors - 1)) { @@ -1244,7 +1244,7 @@ static int dm_keyslot_evict(struct blk_keyslot_manager *ksm, return args.err; } -static struct blk_ksm_ll_ops dm_ksm_ll_ops = { +static const struct blk_ksm_ll_ops dm_ksm_ll_ops = { .keyslot_evict = dm_keyslot_evict, }; @@ -1981,11 +1981,12 @@ static int device_requires_stable_pages(struct dm_target *ti, return blk_queue_stable_writes(q); } -void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, - struct queue_limits *limits) +int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, + struct queue_limits *limits) { bool wc = false, fua = false; int page_size = PAGE_SIZE; + int r; /* * Copy table's limits to the DM device's request_queue @@ -2065,19 +2066,19 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, q); /* - * For a zoned target, the number of zones should be updated for the - * correct value to be exposed in sysfs queue/nr_zones. For a BIO based - * target, this is all that is needed. + * For a zoned target, setup the zones related queue attributes + * and resources necessary for zone append emulation if necessary. */ -#ifdef CONFIG_BLK_DEV_ZONED if (blk_queue_is_zoned(q)) { - WARN_ON_ONCE(queue_is_mq(q)); - q->nr_zones = blkdev_nr_zones(t->md->disk); + r = dm_set_zones_restrictions(t, q); + if (r) + return r; } -#endif dm_update_keyslot_manager(q, t); blk_queue_update_readahead(q); + + return 0; } unsigned int dm_table_get_num_targets(struct dm_table *t) diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index e75b20480e46..c88ed14d49e6 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -311,28 +311,53 @@ static void unpack_block_time(uint64_t v, dm_block_t *b, uint32_t *t) *t = v & ((1 << 24) - 1); } -static void data_block_inc(void *context, const void *value_le) +/* + * It's more efficient to call dm_sm_{inc,dec}_blocks as few times as + * possible. 'with_runs' reads contiguous runs of blocks, and calls the + * given sm function. + */ +typedef int (*run_fn)(struct dm_space_map *, dm_block_t, dm_block_t); + +static void with_runs(struct dm_space_map *sm, const __le64 *value_le, unsigned count, run_fn fn) { - struct dm_space_map *sm = context; - __le64 v_le; - uint64_t b; + uint64_t b, begin, end; uint32_t t; + bool in_run = false; + unsigned i; - memcpy(&v_le, value_le, sizeof(v_le)); - unpack_block_time(le64_to_cpu(v_le), &b, &t); - dm_sm_inc_block(sm, b); + for (i = 0; i < count; i++, value_le++) { + /* We know value_le is 8 byte aligned */ + unpack_block_time(le64_to_cpu(*value_le), &b, &t); + + if (in_run) { + if (b == end) { + end++; + } else { + fn(sm, begin, end); + begin = b; + end = b + 1; + } + } else { + in_run = true; + begin = b; + end = b + 1; + } + } + + if (in_run) + fn(sm, begin, end); } -static void data_block_dec(void *context, const void *value_le) +static void data_block_inc(void *context, const void *value_le, unsigned count) { - struct dm_space_map *sm = context; - __le64 v_le; - uint64_t b; - uint32_t t; + with_runs((struct dm_space_map *) context, + (const __le64 *) value_le, count, dm_sm_inc_blocks); +} - memcpy(&v_le, value_le, sizeof(v_le)); - unpack_block_time(le64_to_cpu(v_le), &b, &t); - dm_sm_dec_block(sm, b); +static void data_block_dec(void *context, const void *value_le, unsigned count) +{ + with_runs((struct dm_space_map *) context, + (const __le64 *) value_le, count, dm_sm_dec_blocks); } static int data_block_equal(void *context, const void *value1_le, const void *value2_le) @@ -349,27 +374,25 @@ static int data_block_equal(void *context, const void *value1_le, const void *va return b1 == b2; } -static void subtree_inc(void *context, const void *value) +static void subtree_inc(void *context, const void *value, unsigned count) { struct dm_btree_info *info = context; - __le64 root_le; - uint64_t root; + const __le64 *root_le = value; + unsigned i; - memcpy(&root_le, value, sizeof(root_le)); - root = le64_to_cpu(root_le); - dm_tm_inc(info->tm, root); + for (i = 0; i < count; i++, root_le++) + dm_tm_inc(info->tm, le64_to_cpu(*root_le)); } -static void subtree_dec(void *context, const void *value) +static void subtree_dec(void *context, const void *value, unsigned count) { struct dm_btree_info *info = context; - __le64 root_le; - uint64_t root; + const __le64 *root_le = value; + unsigned i; - memcpy(&root_le, value, sizeof(root_le)); - root = le64_to_cpu(root_le); - if (dm_btree_del(info, root)) - DMERR("btree delete failed"); + for (i = 0; i < count; i++, root_le++) + if (dm_btree_del(info, le64_to_cpu(*root_le))) + DMERR("btree delete failed"); } static int subtree_equal(void *context, const void *value1_le, const void *value2_le) @@ -1761,11 +1784,7 @@ int dm_pool_inc_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_ int r = 0; pmd_write_lock(pmd); - for (; b != e; b++) { - r = dm_sm_inc_block(pmd->data_sm, b); - if (r) - break; - } + r = dm_sm_inc_blocks(pmd->data_sm, b, e); pmd_write_unlock(pmd); return r; @@ -1776,11 +1795,7 @@ int dm_pool_dec_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_ int r = 0; pmd_write_lock(pmd); - for (; b != e; b++) { - r = dm_sm_dec_block(pmd->data_sm, b); - if (r) - break; - } + r = dm_sm_dec_blocks(pmd->data_sm, b, e); pmd_write_unlock(pmd); return r; diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index aecc246ade26..e21e29e81bbf 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -15,6 +15,8 @@ #include <linux/dax.h> #include <linux/pfn_t.h> #include <linux/libnvdimm.h> +#include <linux/delay.h> +#include "dm-io-tracker.h" #define DM_MSG_PREFIX "writecache" @@ -28,6 +30,7 @@ #define AUTOCOMMIT_MSEC 1000 #define MAX_AGE_DIV 16 #define MAX_AGE_UNSPECIFIED -1UL +#define PAUSE_WRITEBACK (HZ * 3) #define BITMAP_GRANULARITY 65536 #if BITMAP_GRANULARITY < PAGE_SIZE @@ -123,6 +126,7 @@ struct dm_writecache { size_t freelist_high_watermark; size_t freelist_low_watermark; unsigned long max_age; + unsigned long pause; unsigned uncommitted_blocks; unsigned autocommit_blocks; @@ -171,17 +175,22 @@ struct dm_writecache { bool flush_on_suspend:1; bool cleaner:1; bool cleaner_set:1; + bool metadata_only:1; + bool pause_set:1; unsigned high_wm_percent_value; unsigned low_wm_percent_value; unsigned autocommit_time_value; unsigned max_age_value; + unsigned pause_value; unsigned writeback_all; struct workqueue_struct *writeback_wq; struct work_struct writeback_work; struct work_struct flush_work; + struct dm_io_tracker iot; + struct dm_io_client *dm_io; raw_spinlock_t endio_list_lock; @@ -532,7 +541,7 @@ static void ssd_commit_superblock(struct dm_writecache *wc) region.bdev = wc->ssd_dev->bdev; region.sector = 0; - region.count = PAGE_SIZE >> SECTOR_SHIFT; + region.count = max(4096U, wc->block_size) >> SECTOR_SHIFT; if (unlikely(region.sector + region.count > wc->metadata_sectors)) region.count = wc->metadata_sectors - region.sector; @@ -1301,8 +1310,12 @@ static int writecache_map(struct dm_target *ti, struct bio *bio) writecache_flush(wc); if (writecache_has_error(wc)) goto unlock_error; + if (unlikely(wc->cleaner) || unlikely(wc->metadata_only)) + goto unlock_remap_origin; goto unlock_submit; } else { + if (dm_bio_get_target_bio_nr(bio)) + goto unlock_remap_origin; writecache_offload_bio(wc, bio); goto unlock_return; } @@ -1360,24 +1373,29 @@ read_next_block: } else { do { bool found_entry = false; + bool search_used = false; if (writecache_has_error(wc)) goto unlock_error; e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0); if (e) { - if (!writecache_entry_is_committed(wc, e)) + if (!writecache_entry_is_committed(wc, e)) { + search_used = true; goto bio_copy; + } if (!WC_MODE_PMEM(wc) && !e->write_in_progress) { wc->overwrote_committed = true; + search_used = true; goto bio_copy; } found_entry = true; } else { - if (unlikely(wc->cleaner)) + if (unlikely(wc->cleaner) || + (wc->metadata_only && !(bio->bi_opf & REQ_META))) goto direct_write; } e = writecache_pop_from_freelist(wc, (sector_t)-1); if (unlikely(!e)) { - if (!found_entry) { + if (!WC_MODE_PMEM(wc) && !found_entry) { direct_write: e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING); if (e) { @@ -1404,13 +1422,31 @@ bio_copy: sector_t current_cache_sec = start_cache_sec + (bio_size >> SECTOR_SHIFT); while (bio_size < bio->bi_iter.bi_size) { - struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec); - if (!f) - break; - write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector + - (bio_size >> SECTOR_SHIFT), wc->seq_count); - writecache_insert_entry(wc, f); - wc->uncommitted_blocks++; + if (!search_used) { + struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec); + if (!f) + break; + write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector + + (bio_size >> SECTOR_SHIFT), wc->seq_count); + writecache_insert_entry(wc, f); + wc->uncommitted_blocks++; + } else { + struct wc_entry *f; + struct rb_node *next = rb_next(&e->rb_node); + if (!next) + break; + f = container_of(next, struct wc_entry, rb_node); + if (f != e + 1) + break; + if (read_original_sector(wc, f) != + read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT)) + break; + if (unlikely(f->write_in_progress)) + break; + if (writecache_entry_is_committed(wc, f)) + wc->overwrote_committed = true; + e = f; + } bio_size += wc->block_size; current_cache_sec += wc->block_size >> SECTOR_SHIFT; } @@ -1438,6 +1474,12 @@ bio_copy: } unlock_remap_origin: + if (likely(wc->pause != 0)) { + if (bio_op(bio) == REQ_OP_WRITE) { + dm_iot_io_begin(&wc->iot, 1); + bio->bi_private = (void *)2; + } + } bio_set_dev(bio, wc->dev->bdev); wc_unlock(wc); return DM_MAPIO_REMAPPED; @@ -1468,11 +1510,13 @@ static int writecache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t { struct dm_writecache *wc = ti->private; - if (bio->bi_private != NULL) { + if (bio->bi_private == (void *)1) { int dir = bio_data_dir(bio); if (atomic_dec_and_test(&wc->bio_in_progress[dir])) if (unlikely(waitqueue_active(&wc->bio_in_progress_wait[dir]))) wake_up(&wc->bio_in_progress_wait[dir]); + } else if (bio->bi_private == (void *)2) { + dm_iot_io_end(&wc->iot, 1); } return 0; } @@ -1642,7 +1686,7 @@ pop_from_list: return 0; } -static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e, gfp_t gfp) +static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e) { struct dm_writecache *wc = wb->wc; unsigned block_size = wc->block_size; @@ -1703,7 +1747,7 @@ static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeba max_pages = WB_LIST_INLINE; } - BUG_ON(!wc_add_block(wb, e, GFP_NOIO)); + BUG_ON(!wc_add_block(wb, e)); wb->wc_list[0] = e; wb->wc_list_n = 1; @@ -1713,7 +1757,7 @@ static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeba if (read_original_sector(wc, f) != read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT)) break; - if (!wc_add_block(wb, f, GFP_NOWAIT | __GFP_NOWARN)) + if (!wc_add_block(wb, f)) break; wbl->size--; list_del(&f->lru); @@ -1794,6 +1838,27 @@ static void writecache_writeback(struct work_struct *work) struct writeback_list wbl; unsigned long n_walked; + if (!WC_MODE_PMEM(wc)) { + /* Wait for any active kcopyd work on behalf of ssd writeback */ + dm_kcopyd_client_flush(wc->dm_kcopyd); + } + + if (likely(wc->pause != 0)) { + while (1) { + unsigned long idle; + if (unlikely(wc->cleaner) || unlikely(wc->writeback_all) || + unlikely(dm_suspended(wc->ti))) + break; + idle = dm_iot_idle_time(&wc->iot); + if (idle >= wc->pause) + break; + idle = wc->pause - idle; + if (idle > HZ) + idle = HZ; + schedule_timeout_idle(idle); + } + } + wc_lock(wc); restart: if (writecache_has_error(wc)) { @@ -1822,8 +1887,9 @@ restart: n_walked++; if (unlikely(n_walked > WRITEBACK_LATENCY) && - likely(!wc->writeback_all) && likely(!dm_suspended(wc->ti))) { - queue_work(wc->writeback_wq, &wc->writeback_work); + likely(!wc->writeback_all)) { + if (likely(!dm_suspended(wc->ti))) + queue_work(wc->writeback_wq, &wc->writeback_work); break; } @@ -1845,15 +1911,13 @@ restart: if (unlikely(read_original_sector(wc, f) == read_original_sector(wc, e))) { BUG_ON(!f->write_in_progress); - list_del(&e->lru); - list_add(&e->lru, &skipped); + list_move(&e->lru, &skipped); cond_resched(); continue; } } wc->writeback_size++; - list_del(&e->lru); - list_add(&e->lru, &wbl.list); + list_move(&e->lru, &wbl.list); wbl.size++; e->write_in_progress = true; e->wc_list_contiguous = 1; @@ -1888,8 +1952,7 @@ restart: // break; wc->writeback_size++; - list_del(&g->lru); - list_add(&g->lru, &wbl.list); + list_move(&g->lru, &wbl.list); wbl.size++; g->write_in_progress = true; g->wc_list_contiguous = BIO_MAX_VECS; @@ -2065,7 +2128,7 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv) struct wc_memory_superblock s; static struct dm_arg _args[] = { - {0, 16, "Invalid number of feature args"}, + {0, 18, "Invalid number of feature args"}, }; as.argc = argc; @@ -2109,6 +2172,8 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv) INIT_WORK(&wc->writeback_work, writecache_writeback); INIT_WORK(&wc->flush_work, writecache_flush_work); + dm_iot_init(&wc->iot); + raw_spin_lock_init(&wc->endio_list_lock); INIT_LIST_HEAD(&wc->endio_list); wc->endio_thread = kthread_create(writecache_endio_thread, wc, "writecache_endio"); @@ -2156,6 +2221,7 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } } else { + wc->pause = PAUSE_WRITEBACK; r = mempool_init_kmalloc_pool(&wc->copy_pool, 1, sizeof(struct copy_struct)); if (r) { ti->error = "Could not allocate mempool"; @@ -2292,6 +2358,20 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv) wc->writeback_fua = false; wc->writeback_fua_set = true; } else goto invalid_optional; + } else if (!strcasecmp(string, "metadata_only")) { + wc->metadata_only = true; + } else if (!strcasecmp(string, "pause_writeback") && opt_params >= 1) { + unsigned pause_msecs; + if (WC_MODE_PMEM(wc)) + goto invalid_optional; + string = dm_shift_arg(&as), opt_params--; + if (sscanf(string, "%u%c", &pause_msecs, &dummy) != 1) + goto invalid_optional; + if (pause_msecs > 60000) + goto invalid_optional; + wc->pause = msecs_to_jiffies(pause_msecs); + wc->pause_set = true; + wc->pause_value = pause_msecs; } else { invalid_optional: r = -EINVAL; @@ -2463,7 +2543,7 @@ overflow: goto bad; } - ti->num_flush_bios = 1; + ti->num_flush_bios = WC_MODE_PMEM(wc) ? 1 : 2; ti->flush_supported = true; ti->num_discard_bios = 1; @@ -2515,6 +2595,10 @@ static void writecache_status(struct dm_target *ti, status_type_t type, extra_args++; if (wc->writeback_fua_set) extra_args++; + if (wc->metadata_only) + extra_args++; + if (wc->pause_set) + extra_args += 2; DMEMIT("%u", extra_args); if (wc->start_sector_set) @@ -2535,13 +2619,17 @@ static void writecache_status(struct dm_target *ti, status_type_t type, DMEMIT(" cleaner"); if (wc->writeback_fua_set) DMEMIT(" %sfua", wc->writeback_fua ? "" : "no"); + if (wc->metadata_only) + DMEMIT(" metadata_only"); + if (wc->pause_set) + DMEMIT(" pause_writeback %u", wc->pause_value); break; } } static struct target_type writecache_target = { .name = "writecache", - .version = {1, 4, 0}, + .version = {1, 5, 0}, .module = THIS_MODULE, .ctr = writecache_ctr, .dtr = writecache_dtr, diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c new file mode 100644 index 000000000000..6d82a34438c8 --- /dev/null +++ b/drivers/md/dm-zone.c @@ -0,0 +1,660 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021 Western Digital Corporation or its affiliates. + */ + +#include <linux/blkdev.h> +#include <linux/mm.h> +#include <linux/sched/mm.h> +#include <linux/slab.h> + +#include "dm-core.h" + +#define DM_MSG_PREFIX "zone" + +#define DM_ZONE_INVALID_WP_OFST UINT_MAX + +/* + * For internal zone reports bypassing the top BIO submission path. + */ +static int dm_blk_do_report_zones(struct mapped_device *md, struct dm_table *t, + sector_t sector, unsigned int nr_zones, + report_zones_cb cb, void *data) +{ + struct gendisk *disk = md->disk; + int ret; + struct dm_report_zones_args args = { + .next_sector = sector, + .orig_data = data, + .orig_cb = cb, + }; + + do { + struct dm_target *tgt; + + tgt = dm_table_find_target(t, args.next_sector); + if (WARN_ON_ONCE(!tgt->type->report_zones)) + return -EIO; + + args.tgt = tgt; + ret = tgt->type->report_zones(tgt, &args, + nr_zones - args.zone_idx); + if (ret < 0) + return ret; + } while (args.zone_idx < nr_zones && + args.next_sector < get_capacity(disk)); + + return args.zone_idx; +} + +/* + * User facing dm device block device report zone operation. This calls the + * report_zones operation for each target of a device table. This operation is + * generally implemented by targets using dm_report_zones(). + */ +int dm_blk_report_zones(struct gendisk *disk, sector_t sector, + unsigned int nr_zones, report_zones_cb cb, void *data) +{ + struct mapped_device *md = disk->private_data; + struct dm_table *map; + int srcu_idx, ret; + + if (dm_suspended_md(md)) + return -EAGAIN; + + map = dm_get_live_table(md, &srcu_idx); + if (!map) + return -EIO; + + ret = dm_blk_do_report_zones(md, map, sector, nr_zones, cb, data); + + dm_put_live_table(md, srcu_idx); + + return ret; +} + +static int dm_report_zones_cb(struct blk_zone *zone, unsigned int idx, + void *data) +{ + struct dm_report_zones_args *args = data; + sector_t sector_diff = args->tgt->begin - args->start; + + /* + * Ignore zones beyond the target range. + */ + if (zone->start >= args->start + args->tgt->len) + return 0; + + /* + * Remap the start sector and write pointer position of the zone + * to match its position in the target range. + */ + zone->start += sector_diff; + if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) { + if (zone->cond == BLK_ZONE_COND_FULL) + zone->wp = zone->start + zone->len; + else if (zone->cond == BLK_ZONE_COND_EMPTY) + zone->wp = zone->start; + else + zone->wp += sector_diff; + } + + args->next_sector = zone->start + zone->len; + return args->orig_cb(zone, args->zone_idx++, args->orig_data); +} + +/* + * Helper for drivers of zoned targets to implement struct target_type + * report_zones operation. + */ +int dm_report_zones(struct block_device *bdev, sector_t start, sector_t sector, + struct dm_report_zones_args *args, unsigned int nr_zones) +{ + /* + * Set the target mapping start sector first so that + * dm_report_zones_cb() can correctly remap zone information. + */ + args->start = start; + + return blkdev_report_zones(bdev, sector, nr_zones, + dm_report_zones_cb, args); +} +EXPORT_SYMBOL_GPL(dm_report_zones); + +bool dm_is_zone_write(struct mapped_device *md, struct bio *bio) +{ + struct request_queue *q = md->queue; + + if (!blk_queue_is_zoned(q)) + return false; + + switch (bio_op(bio)) { + case REQ_OP_WRITE_ZEROES: + case REQ_OP_WRITE_SAME: + case REQ_OP_WRITE: + return !op_is_flush(bio->bi_opf) && bio_sectors(bio); + default: + return false; + } +} + +void dm_cleanup_zoned_dev(struct mapped_device *md) +{ + struct request_queue *q = md->queue; + + if (q) { + kfree(q->conv_zones_bitmap); + q->conv_zones_bitmap = NULL; + kfree(q->seq_zones_wlock); + q->seq_zones_wlock = NULL; + } + + kvfree(md->zwp_offset); + md->zwp_offset = NULL; + md->nr_zones = 0; +} + +static unsigned int dm_get_zone_wp_offset(struct blk_zone *zone) +{ + switch (zone->cond) { + case BLK_ZONE_COND_IMP_OPEN: + case BLK_ZONE_COND_EXP_OPEN: + case BLK_ZONE_COND_CLOSED: + return zone->wp - zone->start; + case BLK_ZONE_COND_FULL: + return zone->len; + case BLK_ZONE_COND_EMPTY: + case BLK_ZONE_COND_NOT_WP: + case BLK_ZONE_COND_OFFLINE: + case BLK_ZONE_COND_READONLY: + default: + /* + * Conventional, offline and read-only zones do not have a valid + * write pointer. Use 0 as for an empty zone. + */ + return 0; + } +} + +static int dm_zone_revalidate_cb(struct blk_zone *zone, unsigned int idx, + void *data) +{ + struct mapped_device *md = data; + struct request_queue *q = md->queue; + + switch (zone->type) { + case BLK_ZONE_TYPE_CONVENTIONAL: + if (!q->conv_zones_bitmap) { + q->conv_zones_bitmap = + kcalloc(BITS_TO_LONGS(q->nr_zones), + sizeof(unsigned long), GFP_NOIO); + if (!q->conv_zones_bitmap) + return -ENOMEM; + } + set_bit(idx, q->conv_zones_bitmap); + break; + case BLK_ZONE_TYPE_SEQWRITE_REQ: + case BLK_ZONE_TYPE_SEQWRITE_PREF: + if (!q->seq_zones_wlock) { + q->seq_zones_wlock = + kcalloc(BITS_TO_LONGS(q->nr_zones), + sizeof(unsigned long), GFP_NOIO); + if (!q->seq_zones_wlock) + return -ENOMEM; + } + if (!md->zwp_offset) { + md->zwp_offset = + kvcalloc(q->nr_zones, sizeof(unsigned int), + GFP_KERNEL); + if (!md->zwp_offset) + return -ENOMEM; + } + md->zwp_offset[idx] = dm_get_zone_wp_offset(zone); + + break; + default: + DMERR("Invalid zone type 0x%x at sectors %llu", + (int)zone->type, zone->start); + return -ENODEV; + } + + return 0; +} + +/* + * Revalidate the zones of a mapped device to initialize resource necessary + * for zone append emulation. Note that we cannot simply use the block layer + * blk_revalidate_disk_zones() function here as the mapped device is suspended + * (this is called from __bind() context). + */ +static int dm_revalidate_zones(struct mapped_device *md, struct dm_table *t) +{ + struct request_queue *q = md->queue; + unsigned int noio_flag; + int ret; + + /* + * Check if something changed. If yes, cleanup the current resources + * and reallocate everything. + */ + if (!q->nr_zones || q->nr_zones != md->nr_zones) + dm_cleanup_zoned_dev(md); + if (md->nr_zones) + return 0; + + /* + * Scan all zones to initialize everything. Ensure that all vmalloc + * operations in this context are done as if GFP_NOIO was specified. + */ + noio_flag = memalloc_noio_save(); + ret = dm_blk_do_report_zones(md, t, 0, q->nr_zones, + dm_zone_revalidate_cb, md); + memalloc_noio_restore(noio_flag); + if (ret < 0) + goto err; + if (ret != q->nr_zones) { + ret = -EIO; + goto err; + } + + md->nr_zones = q->nr_zones; + + return 0; + +err: + DMERR("Revalidate zones failed %d", ret); + dm_cleanup_zoned_dev(md); + return ret; +} + +static int device_not_zone_append_capable(struct dm_target *ti, + struct dm_dev *dev, sector_t start, + sector_t len, void *data) +{ + return !blk_queue_is_zoned(bdev_get_queue(dev->bdev)); +} + +static bool dm_table_supports_zone_append(struct dm_table *t) +{ + struct dm_target *ti; + unsigned int i; + + for (i = 0; i < dm_table_get_num_targets(t); i++) { + ti = dm_table_get_target(t, i); + + if (ti->emulate_zone_append) + return false; + + if (!ti->type->iterate_devices || + ti->type->iterate_devices(ti, device_not_zone_append_capable, NULL)) + return false; + } + + return true; +} + +int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q) +{ + struct mapped_device *md = t->md; + + /* + * For a zoned target, the number of zones should be updated for the + * correct value to be exposed in sysfs queue/nr_zones. + */ + WARN_ON_ONCE(queue_is_mq(q)); + q->nr_zones = blkdev_nr_zones(md->disk); + + /* Check if zone append is natively supported */ + if (dm_table_supports_zone_append(t)) { + clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); + dm_cleanup_zoned_dev(md); + return 0; + } + + /* + * Mark the mapped device as needing zone append emulation and + * initialize the emulation resources once the capacity is set. + */ + set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); + if (!get_capacity(md->disk)) + return 0; + + return dm_revalidate_zones(md, t); +} + +static int dm_update_zone_wp_offset_cb(struct blk_zone *zone, unsigned int idx, + void *data) +{ + unsigned int *wp_offset = data; + + *wp_offset = dm_get_zone_wp_offset(zone); + + return 0; +} + +static int dm_update_zone_wp_offset(struct mapped_device *md, unsigned int zno, + unsigned int *wp_ofst) +{ + sector_t sector = zno * blk_queue_zone_sectors(md->queue); + unsigned int noio_flag; + struct dm_table *t; + int srcu_idx, ret; + + t = dm_get_live_table(md, &srcu_idx); + if (!t) + return -EIO; + + /* + * Ensure that all memory allocations in this context are done as if + * GFP_NOIO was specified. + */ + noio_flag = memalloc_noio_save(); + ret = dm_blk_do_report_zones(md, t, sector, 1, + dm_update_zone_wp_offset_cb, wp_ofst); + memalloc_noio_restore(noio_flag); + + dm_put_live_table(md, srcu_idx); + + if (ret != 1) + return -EIO; + + return 0; +} + +/* + * First phase of BIO mapping for targets with zone append emulation: + * check all BIO that change a zone writer pointer and change zone + * append operations into regular write operations. + */ +static bool dm_zone_map_bio_begin(struct mapped_device *md, + struct bio *orig_bio, struct bio *clone) +{ + sector_t zsectors = blk_queue_zone_sectors(md->queue); + unsigned int zno = bio_zone_no(orig_bio); + unsigned int zwp_offset = READ_ONCE(md->zwp_offset[zno]); + + /* + * If the target zone is in an error state, recover by inspecting the + * zone to get its current write pointer position. Note that since the + * target zone is already locked, a BIO issuing context should never + * see the zone write in the DM_ZONE_UPDATING_WP_OFST state. + */ + if (zwp_offset == DM_ZONE_INVALID_WP_OFST) { + if (dm_update_zone_wp_offset(md, zno, &zwp_offset)) + return false; + WRITE_ONCE(md->zwp_offset[zno], zwp_offset); + } + + switch (bio_op(orig_bio)) { + case REQ_OP_ZONE_RESET: + case REQ_OP_ZONE_FINISH: + return true; + case REQ_OP_WRITE_ZEROES: + case REQ_OP_WRITE_SAME: + case REQ_OP_WRITE: + /* Writes must be aligned to the zone write pointer */ + if ((clone->bi_iter.bi_sector & (zsectors - 1)) != zwp_offset) + return false; + break; + case REQ_OP_ZONE_APPEND: + /* + * Change zone append operations into a non-mergeable regular + * writes directed at the current write pointer position of the + * target zone. + */ + clone->bi_opf = REQ_OP_WRITE | REQ_NOMERGE | + (orig_bio->bi_opf & (~REQ_OP_MASK)); + clone->bi_iter.bi_sector = + orig_bio->bi_iter.bi_sector + zwp_offset; + break; + default: + DMWARN_LIMIT("Invalid BIO operation"); + return false; + } + + /* Cannot write to a full zone */ + if (zwp_offset >= zsectors) + return false; + + return true; +} + +/* + * Second phase of BIO mapping for targets with zone append emulation: + * update the zone write pointer offset array to account for the additional + * data written to a zone. Note that at this point, the remapped clone BIO + * may already have completed, so we do not touch it. + */ +static blk_status_t dm_zone_map_bio_end(struct mapped_device *md, + struct bio *orig_bio, + unsigned int nr_sectors) +{ + unsigned int zno = bio_zone_no(orig_bio); + unsigned int zwp_offset = READ_ONCE(md->zwp_offset[zno]); + + /* The clone BIO may already have been completed and failed */ + if (zwp_offset == DM_ZONE_INVALID_WP_OFST) + return BLK_STS_IOERR; + + /* Update the zone wp offset */ + switch (bio_op(orig_bio)) { + case REQ_OP_ZONE_RESET: + WRITE_ONCE(md->zwp_offset[zno], 0); + return BLK_STS_OK; + case REQ_OP_ZONE_FINISH: + WRITE_ONCE(md->zwp_offset[zno], + blk_queue_zone_sectors(md->queue)); + return BLK_STS_OK; + case REQ_OP_WRITE_ZEROES: + case REQ_OP_WRITE_SAME: + case REQ_OP_WRITE: + WRITE_ONCE(md->zwp_offset[zno], zwp_offset + nr_sectors); + return BLK_STS_OK; + case REQ_OP_ZONE_APPEND: + /* + * Check that the target did not truncate the write operation + * emulating a zone append. + */ + if (nr_sectors != bio_sectors(orig_bio)) { + DMWARN_LIMIT("Truncated write for zone append"); + return BLK_STS_IOERR; + } + WRITE_ONCE(md->zwp_offset[zno], zwp_offset + nr_sectors); + return BLK_STS_OK; + default: + DMWARN_LIMIT("Invalid BIO operation"); + return BLK_STS_IOERR; + } +} + +static inline void dm_zone_lock(struct request_queue *q, + unsigned int zno, struct bio *clone) +{ + if (WARN_ON_ONCE(bio_flagged(clone, BIO_ZONE_WRITE_LOCKED))) + return; + + wait_on_bit_lock_io(q->seq_zones_wlock, zno, TASK_UNINTERRUPTIBLE); + bio_set_flag(clone, BIO_ZONE_WRITE_LOCKED); +} + +static inline void dm_zone_unlock(struct request_queue *q, + unsigned int zno, struct bio *clone) +{ + if (!bio_flagged(clone, BIO_ZONE_WRITE_LOCKED)) + return; + + WARN_ON_ONCE(!test_bit(zno, q->seq_zones_wlock)); + clear_bit_unlock(zno, q->seq_zones_wlock); + smp_mb__after_atomic(); + wake_up_bit(q->seq_zones_wlock, zno); + + bio_clear_flag(clone, BIO_ZONE_WRITE_LOCKED); +} + +static bool dm_need_zone_wp_tracking(struct bio *orig_bio) +{ + /* + * Special processing is not needed for operations that do not need the + * zone write lock, that is, all operations that target conventional + * zones and all operations that do not modify directly a sequential + * zone write pointer. + */ + if (op_is_flush(orig_bio->bi_opf) && !bio_sectors(orig_bio)) + return false; + switch (bio_op(orig_bio)) { + case REQ_OP_WRITE_ZEROES: + case REQ_OP_WRITE_SAME: + case REQ_OP_WRITE: + case REQ_OP_ZONE_RESET: + case REQ_OP_ZONE_FINISH: + case REQ_OP_ZONE_APPEND: + return bio_zone_is_seq(orig_bio); + default: + return false; + } +} + +/* + * Special IO mapping for targets needing zone append emulation. + */ +int dm_zone_map_bio(struct dm_target_io *tio) +{ + struct dm_io *io = tio->io; + struct dm_target *ti = tio->ti; + struct mapped_device *md = io->md; + struct request_queue *q = md->queue; + struct bio *orig_bio = io->orig_bio; + struct bio *clone = &tio->clone; + unsigned int zno; + blk_status_t sts; + int r; + + /* + * IOs that do not change a zone write pointer do not need + * any additional special processing. + */ + if (!dm_need_zone_wp_tracking(orig_bio)) + return ti->type->map(ti, clone); + + /* Lock the target zone */ + zno = bio_zone_no(orig_bio); + dm_zone_lock(q, zno, clone); + + /* + * Check that the bio and the target zone write pointer offset are + * both valid, and if the bio is a zone append, remap it to a write. + */ + if (!dm_zone_map_bio_begin(md, orig_bio, clone)) { + dm_zone_unlock(q, zno, clone); + return DM_MAPIO_KILL; + } + + /* + * The target map function may issue and complete the IO quickly. + * Take an extra reference on the IO to make sure it does disappear + * until we run dm_zone_map_bio_end(). + */ + dm_io_inc_pending(io); + + /* Let the target do its work */ + r = ti->type->map(ti, clone); + switch (r) { + case DM_MAPIO_SUBMITTED: + /* + * The target submitted the clone BIO. The target zone will + * be unlocked on completion of the clone. + */ + sts = dm_zone_map_bio_end(md, orig_bio, *tio->len_ptr); + break; + case DM_MAPIO_REMAPPED: + /* + * The target only remapped the clone BIO. In case of error, + * unlock the target zone here as the clone will not be + * submitted. + */ + sts = dm_zone_map_bio_end(md, orig_bio, *tio->len_ptr); + if (sts != BLK_STS_OK) + dm_zone_unlock(q, zno, clone); + break; + case DM_MAPIO_REQUEUE: + case DM_MAPIO_KILL: + default: + dm_zone_unlock(q, zno, clone); + sts = BLK_STS_IOERR; + break; + } + + /* Drop the extra reference on the IO */ + dm_io_dec_pending(io, sts); + + if (sts != BLK_STS_OK) + return DM_MAPIO_KILL; + + return r; +} + +/* + * IO completion callback called from clone_endio(). + */ +void dm_zone_endio(struct dm_io *io, struct bio *clone) +{ + struct mapped_device *md = io->md; + struct request_queue *q = md->queue; + struct bio *orig_bio = io->orig_bio; + unsigned int zwp_offset; + unsigned int zno; + + /* + * For targets that do not emulate zone append, we only need to + * handle native zone-append bios. + */ + if (!dm_emulate_zone_append(md)) { + /* + * Get the offset within the zone of the written sector + * and add that to the original bio sector position. + */ + if (clone->bi_status == BLK_STS_OK && + bio_op(clone) == REQ_OP_ZONE_APPEND) { + sector_t mask = (sector_t)blk_queue_zone_sectors(q) - 1; + + orig_bio->bi_iter.bi_sector += + clone->bi_iter.bi_sector & mask; + } + + return; + } + + /* + * For targets that do emulate zone append, if the clone BIO does not + * own the target zone write lock, we have nothing to do. + */ + if (!bio_flagged(clone, BIO_ZONE_WRITE_LOCKED)) + return; + + zno = bio_zone_no(orig_bio); + + if (clone->bi_status != BLK_STS_OK) { + /* + * BIOs that modify a zone write pointer may leave the zone + * in an unknown state in case of failure (e.g. the write + * pointer was only partially advanced). In this case, set + * the target zone write pointer as invalid unless it is + * already being updated. + */ + WRITE_ONCE(md->zwp_offset[zno], DM_ZONE_INVALID_WP_OFST); + } else if (bio_op(orig_bio) == REQ_OP_ZONE_APPEND) { + /* + * Get the written sector for zone append operation that were + * emulated using regular write operations. + */ + zwp_offset = READ_ONCE(md->zwp_offset[zno]); + if (WARN_ON_ONCE(zwp_offset < bio_sectors(orig_bio))) + WRITE_ONCE(md->zwp_offset[zno], + DM_ZONE_INVALID_WP_OFST); + else + orig_bio->bi_iter.bi_sector += + zwp_offset - bio_sectors(orig_bio); + } + + dm_zone_unlock(q, zno, clone); +} diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 039d17b28938..ee4626d08557 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -1390,6 +1390,13 @@ static int dmz_init_zone(struct blk_zone *blkz, unsigned int num, void *data) return -ENXIO; } + /* + * Devices that have zones with a capacity smaller than the zone size + * (e.g. NVMe zoned namespaces) are not supported. + */ + if (blkz->capacity != blkz->len) + return -ENXIO; + switch (blkz->type) { case BLK_ZONE_TYPE_CONVENTIONAL: set_bit(DMZ_RND, &zone->flags); diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index 9c0ecc9568a4..d58db9a27e6c 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -134,7 +134,7 @@ static int dmz_reclaim_copy(struct dmz_reclaim *zrc, dst_zone_block = dmz_start_block(zmd, dst_zone); if (dmz_is_seq(dst_zone)) - set_bit(DM_KCOPYD_WRITE_SEQ, &flags); + flags |= BIT(DM_KCOPYD_WRITE_SEQ); while (block < end_block) { if (src_zone->dev->flags & DMZ_BDEV_DYING) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index a57aba553ebb..2c5f9e585211 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -74,38 +74,6 @@ struct clone_info { unsigned sector_count; }; -/* - * One of these is allocated per clone bio. - */ -#define DM_TIO_MAGIC 7282014 -struct dm_target_io { - unsigned magic; - struct dm_io *io; - struct dm_target *ti; - unsigned target_bio_nr; - unsigned *len_ptr; - bool inside_dm_io; - struct bio clone; -}; - -/* - * One of these is allocated per original bio. - * It contains the first clone used for that original. - */ -#define DM_IO_MAGIC 5191977 -struct dm_io { - unsigned magic; - struct mapped_device *md; - blk_status_t status; - atomic_t io_count; - struct bio *orig_bio; - unsigned long start_time; - spinlock_t endio_lock; - struct dm_stats_aux stats_aux; - /* last member of dm_target_io is 'struct bio' */ - struct dm_target_io tio; -}; - #define DM_TARGET_IO_BIO_OFFSET (offsetof(struct dm_target_io, clone)) #define DM_IO_BIO_OFFSET \ (offsetof(struct dm_target_io, clone) + offsetof(struct dm_io, tio)) @@ -137,19 +105,6 @@ EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr); #define MINOR_ALLOCED ((void *)-1) -/* - * Bits for the md->flags field. - */ -#define DMF_BLOCK_IO_FOR_SUSPEND 0 -#define DMF_SUSPENDED 1 -#define DMF_FROZEN 2 -#define DMF_FREEING 3 -#define DMF_DELETING 4 -#define DMF_NOFLUSH_SUSPENDING 5 -#define DMF_DEFERRED_REMOVE 6 -#define DMF_SUSPENDED_INTERNALLY 7 -#define DMF_POST_SUSPENDING 8 - #define DM_NUMA_NODE NUMA_NO_NODE static int dm_numa_node = DM_NUMA_NODE; @@ -444,84 +399,6 @@ static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) return dm_get_geometry(md, geo); } -#ifdef CONFIG_BLK_DEV_ZONED -int dm_report_zones_cb(struct blk_zone *zone, unsigned int idx, void *data) -{ - struct dm_report_zones_args *args = data; - sector_t sector_diff = args->tgt->begin - args->start; - - /* - * Ignore zones beyond the target range. - */ - if (zone->start >= args->start + args->tgt->len) - return 0; - - /* - * Remap the start sector and write pointer position of the zone - * to match its position in the target range. - */ - zone->start += sector_diff; - if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) { - if (zone->cond == BLK_ZONE_COND_FULL) - zone->wp = zone->start + zone->len; - else if (zone->cond == BLK_ZONE_COND_EMPTY) - zone->wp = zone->start; - else - zone->wp += sector_diff; - } - - args->next_sector = zone->start + zone->len; - return args->orig_cb(zone, args->zone_idx++, args->orig_data); -} -EXPORT_SYMBOL_GPL(dm_report_zones_cb); - -static int dm_blk_report_zones(struct gendisk *disk, sector_t sector, - unsigned int nr_zones, report_zones_cb cb, void *data) -{ - struct mapped_device *md = disk->private_data; - struct dm_table *map; - int srcu_idx, ret; - struct dm_report_zones_args args = { - .next_sector = sector, - .orig_data = data, - .orig_cb = cb, - }; - - if (dm_suspended_md(md)) - return -EAGAIN; - - map = dm_get_live_table(md, &srcu_idx); - if (!map) { - ret = -EIO; - goto out; - } - - do { - struct dm_target *tgt; - - tgt = dm_table_find_target(map, args.next_sector); - if (WARN_ON_ONCE(!tgt->type->report_zones)) { - ret = -EIO; - goto out; - } - - args.tgt = tgt; - ret = tgt->type->report_zones(tgt, &args, - nr_zones - args.zone_idx); - if (ret < 0) - goto out; - } while (args.zone_idx < nr_zones && - args.next_sector < get_capacity(disk)); - - ret = args.zone_idx; -out: - dm_put_live_table(md, srcu_idx); - return ret; -} -#else -#define dm_blk_report_zones NULL -#endif /* CONFIG_BLK_DEV_ZONED */ - static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx, struct block_device **bdev) { @@ -903,7 +780,7 @@ static int __noflush_suspending(struct mapped_device *md) * Decrements the number of outstanding ios that a bio has been * cloned into, completing the original io if necc. */ -static void dec_pending(struct dm_io *io, blk_status_t error) +void dm_io_dec_pending(struct dm_io *io, blk_status_t error) { unsigned long flags; blk_status_t io_error; @@ -919,22 +796,27 @@ static void dec_pending(struct dm_io *io, blk_status_t error) } if (atomic_dec_and_test(&io->io_count)) { + bio = io->orig_bio; if (io->status == BLK_STS_DM_REQUEUE) { /* * Target requested pushing back the I/O. */ spin_lock_irqsave(&md->deferred_lock, flags); - if (__noflush_suspending(md)) + if (__noflush_suspending(md) && + !WARN_ON_ONCE(dm_is_zone_write(md, bio))) { /* NOTE early return due to BLK_STS_DM_REQUEUE below */ - bio_list_add_head(&md->deferred, io->orig_bio); - else - /* noflush suspend was interrupted. */ + bio_list_add_head(&md->deferred, bio); + } else { + /* + * noflush suspend was interrupted or this is + * a write to a zoned target. + */ io->status = BLK_STS_IOERR; + } spin_unlock_irqrestore(&md->deferred_lock, flags); } io_error = io->status; - bio = io->orig_bio; end_io_acct(io); free_io(md, io); @@ -994,7 +876,6 @@ static void clone_endio(struct bio *bio) struct dm_io *io = tio->io; struct mapped_device *md = tio->io->md; dm_endio_fn endio = tio->ti->type->end_io; - struct bio *orig_bio = io->orig_bio; struct request_queue *q = bio->bi_bdev->bd_disk->queue; if (unlikely(error == BLK_STS_TARGET)) { @@ -1009,23 +890,22 @@ static void clone_endio(struct bio *bio) disable_write_zeroes(md); } - /* - * For zone-append bios get offset in zone of the written - * sector and add that to the original bio sector pos. - */ - if (bio_op(orig_bio) == REQ_OP_ZONE_APPEND) { - sector_t written_sector = bio->bi_iter.bi_sector; - struct request_queue *q = orig_bio->bi_bdev->bd_disk->queue; - u64 mask = (u64)blk_queue_zone_sectors(q) - 1; - - orig_bio->bi_iter.bi_sector += written_sector & mask; - } + if (blk_queue_is_zoned(q)) + dm_zone_endio(io, bio); if (endio) { int r = endio(tio->ti, bio, &error); switch (r) { case DM_ENDIO_REQUEUE: - error = BLK_STS_DM_REQUEUE; + /* + * Requeuing writes to a sequential zone of a zoned + * target will break the sequential write pattern: + * fail such IO. + */ + if (WARN_ON_ONCE(dm_is_zone_write(md, bio))) + error = BLK_STS_IOERR; + else + error = BLK_STS_DM_REQUEUE; fallthrough; case DM_ENDIO_DONE: break; @@ -1044,7 +924,7 @@ static void clone_endio(struct bio *bio) } free_tio(tio); - dec_pending(io, error); + dm_io_dec_pending(io, error); } /* @@ -1237,8 +1117,8 @@ static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, /* * A target may call dm_accept_partial_bio only from the map routine. It is - * allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_RESET, - * REQ_OP_ZONE_OPEN, REQ_OP_ZONE_CLOSE and REQ_OP_ZONE_FINISH. + * allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_* zone management + * operations and REQ_OP_ZONE_APPEND (zone append writes). * * dm_accept_partial_bio informs the dm that the target only wants to process * additional n_sectors sectors of the bio and the rest of the data should be @@ -1268,9 +1148,13 @@ void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors) { struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT; + BUG_ON(bio->bi_opf & REQ_PREFLUSH); + BUG_ON(op_is_zone_mgmt(bio_op(bio))); + BUG_ON(bio_op(bio) == REQ_OP_ZONE_APPEND); BUG_ON(bi_size > *tio->len_ptr); BUG_ON(n_sectors > bi_size); + *tio->len_ptr -= bi_size - n_sectors; bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT; } @@ -1308,7 +1192,7 @@ static blk_qc_t __map_bio(struct dm_target_io *tio) * anything, the target has assumed ownership of * this io. */ - atomic_inc(&io->io_count); + dm_io_inc_pending(io); sector = clone->bi_iter.bi_sector; if (unlikely(swap_bios_limit(ti, clone))) { @@ -1319,7 +1203,16 @@ static blk_qc_t __map_bio(struct dm_target_io *tio) down(&md->swap_bios_semaphore); } - r = ti->type->map(ti, clone); + /* + * Check if the IO needs a special mapping due to zone append emulation + * on zoned target. In this case, dm_zone_map_bio() calls the target + * map operation. + */ + if (dm_emulate_zone_append(io->md)) + r = dm_zone_map_bio(tio); + else + r = ti->type->map(ti, clone); + switch (r) { case DM_MAPIO_SUBMITTED: break; @@ -1334,7 +1227,7 @@ static blk_qc_t __map_bio(struct dm_target_io *tio) up(&md->swap_bios_semaphore); } free_tio(tio); - dec_pending(io, BLK_STS_IOERR); + dm_io_dec_pending(io, BLK_STS_IOERR); break; case DM_MAPIO_REQUEUE: if (unlikely(swap_bios_limit(ti, clone))) { @@ -1342,7 +1235,7 @@ static blk_qc_t __map_bio(struct dm_target_io *tio) up(&md->swap_bios_semaphore); } free_tio(tio); - dec_pending(io, BLK_STS_DM_REQUEUE); + dm_io_dec_pending(io, BLK_STS_DM_REQUEUE); break; default: DMWARN("unimplemented target map return value: %d", r); @@ -1631,7 +1524,7 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md, if (bio->bi_opf & REQ_PREFLUSH) { error = __send_empty_flush(&ci); - /* dec_pending submits any data associated with flush */ + /* dm_io_dec_pending submits any data associated with flush */ } else if (op_is_zone_mgmt(bio_op(bio))) { ci.bio = bio; ci.sector_count = 0; @@ -1672,7 +1565,7 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md, } /* drop the extra reference count */ - dec_pending(ci.io, errno_to_blk_status(error)); + dm_io_dec_pending(ci.io, errno_to_blk_status(error)); return ret; } @@ -1817,6 +1710,7 @@ static void cleanup_mapped_device(struct mapped_device *md) mutex_destroy(&md->swap_bios_lock); dm_mq_cleanup_mapped_device(md); + dm_cleanup_zoned_dev(md); } /* @@ -2060,11 +1954,16 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, goto out; } + ret = dm_table_set_restrictions(t, q, limits); + if (ret) { + old_map = ERR_PTR(ret); + goto out; + } + old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); rcu_assign_pointer(md->map, (void *)t); md->immutable_target_type = dm_table_get_immutable_target_type(t); - dm_table_set_restrictions(t, q, limits); if (old_map) dm_sync_table(md); @@ -2183,7 +2082,10 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) DMERR("Cannot calculate initial queue limits"); return r; } - dm_table_set_restrictions(t, md->queue, &limits); + r = dm_table_set_restrictions(t, md->queue, &limits); + if (r) + return r; + blk_register_queue(md->disk); return 0; diff --git a/drivers/md/dm.h b/drivers/md/dm.h index b441ad772c18..742d9c80efe1 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -45,6 +45,8 @@ struct dm_dev_internal { struct dm_table; struct dm_md_mempools; +struct dm_target_io; +struct dm_io; /*----------------------------------------------------------------- * Internal table functions. @@ -56,8 +58,8 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector); bool dm_table_has_no_data_devices(struct dm_table *table); int dm_calculate_queue_limits(struct dm_table *table, struct queue_limits *limits); -void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, - struct queue_limits *limits); +int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, + struct queue_limits *limits); struct list_head *dm_table_get_devices(struct dm_table *t); void dm_table_presuspend_targets(struct dm_table *t); void dm_table_presuspend_undo_targets(struct dm_table *t); @@ -100,6 +102,30 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t); */ #define dm_target_hybrid(t) (dm_target_bio_based(t) && dm_target_request_based(t)) +/* + * Zoned targets related functions. + */ +int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q); +void dm_zone_endio(struct dm_io *io, struct bio *clone); +#ifdef CONFIG_BLK_DEV_ZONED +void dm_cleanup_zoned_dev(struct mapped_device *md); +int dm_blk_report_zones(struct gendisk *disk, sector_t sector, + unsigned int nr_zones, report_zones_cb cb, void *data); +bool dm_is_zone_write(struct mapped_device *md, struct bio *bio); +int dm_zone_map_bio(struct dm_target_io *io); +#else +static inline void dm_cleanup_zoned_dev(struct mapped_device *md) {} +#define dm_blk_report_zones NULL +static inline bool dm_is_zone_write(struct mapped_device *md, struct bio *bio) +{ + return false; +} +static inline int dm_zone_map_bio(struct dm_target_io *tio) +{ + return DM_MAPIO_KILL; +} +#endif + /*----------------------------------------------------------------- * A registry of target types. *---------------------------------------------------------------*/ diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c index 185dc60360b5..3a963d783a86 100644 --- a/drivers/md/persistent-data/dm-array.c +++ b/drivers/md/persistent-data/dm-array.c @@ -108,12 +108,10 @@ static void *element_at(struct dm_array_info *info, struct array_block *ab, * in an array block. */ static void on_entries(struct dm_array_info *info, struct array_block *ab, - void (*fn)(void *, const void *)) + void (*fn)(void *, const void *, unsigned)) { - unsigned i, nr_entries = le32_to_cpu(ab->nr_entries); - - for (i = 0; i < nr_entries; i++) - fn(info->value_type.context, element_at(info, ab, i)); + unsigned nr_entries = le32_to_cpu(ab->nr_entries); + fn(info->value_type.context, element_at(info, ab, 0), nr_entries); } /* @@ -175,19 +173,18 @@ static int alloc_ablock(struct dm_array_info *info, size_t size_of_block, static void fill_ablock(struct dm_array_info *info, struct array_block *ab, const void *value, unsigned new_nr) { - unsigned i; - uint32_t nr_entries; + uint32_t nr_entries, delta, i; struct dm_btree_value_type *vt = &info->value_type; BUG_ON(new_nr > le32_to_cpu(ab->max_entries)); BUG_ON(new_nr < le32_to_cpu(ab->nr_entries)); nr_entries = le32_to_cpu(ab->nr_entries); - for (i = nr_entries; i < new_nr; i++) { - if (vt->inc) - vt->inc(vt->context, value); + delta = new_nr - nr_entries; + if (vt->inc) + vt->inc(vt->context, value, delta); + for (i = nr_entries; i < new_nr; i++) memcpy(element_at(info, ab, i), value, vt->size); - } ab->nr_entries = cpu_to_le32(new_nr); } @@ -199,17 +196,16 @@ static void fill_ablock(struct dm_array_info *info, struct array_block *ab, static void trim_ablock(struct dm_array_info *info, struct array_block *ab, unsigned new_nr) { - unsigned i; - uint32_t nr_entries; + uint32_t nr_entries, delta; struct dm_btree_value_type *vt = &info->value_type; BUG_ON(new_nr > le32_to_cpu(ab->max_entries)); BUG_ON(new_nr > le32_to_cpu(ab->nr_entries)); nr_entries = le32_to_cpu(ab->nr_entries); - for (i = nr_entries; i > new_nr; i--) - if (vt->dec) - vt->dec(vt->context, element_at(info, ab, i - 1)); + delta = nr_entries - new_nr; + if (vt->dec) + vt->dec(vt->context, element_at(info, ab, new_nr - 1), delta); ab->nr_entries = cpu_to_le32(new_nr); } @@ -573,16 +569,17 @@ static int grow(struct resize *resize) * These are the value_type functions for the btree elements, which point * to array blocks. */ -static void block_inc(void *context, const void *value) +static void block_inc(void *context, const void *value, unsigned count) { - __le64 block_le; + const __le64 *block_le = value; struct dm_array_info *info = context; + unsigned i; - memcpy(&block_le, value, sizeof(block_le)); - dm_tm_inc(info->btree_info.tm, le64_to_cpu(block_le)); + for (i = 0; i < count; i++, block_le++) + dm_tm_inc(info->btree_info.tm, le64_to_cpu(*block_le)); } -static void block_dec(void *context, const void *value) +static void __block_dec(void *context, const void *value) { int r; uint64_t b; @@ -621,6 +618,13 @@ static void block_dec(void *context, const void *value) dm_tm_dec(info->btree_info.tm, b); } +static void block_dec(void *context, const void *value, unsigned count) +{ + unsigned i; + for (i = 0; i < count; i++, value += sizeof(__le64)) + __block_dec(context, value); +} + static int block_equal(void *context, const void *value1, const void *value2) { return !memcmp(value1, value2, sizeof(__le64)); @@ -711,7 +715,7 @@ static int populate_ablock_with_values(struct dm_array_info *info, struct array_ return r; if (vt->inc) - vt->inc(vt->context, element_at(info, ab, i)); + vt->inc(vt->context, element_at(info, ab, i), 1); } ab->nr_entries = cpu_to_le32(new_nr); @@ -822,9 +826,9 @@ static int array_set_value(struct dm_array_info *info, dm_block_t root, old_value = element_at(info, ab, entry); if (vt->dec && (!vt->equal || !vt->equal(vt->context, old_value, value))) { - vt->dec(vt->context, old_value); + vt->dec(vt->context, old_value, 1); if (vt->inc) - vt->inc(vt->context, value); + vt->inc(vt->context, value, 1); } memcpy(old_value, value, info->value_type.size); diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h index b1788853a355..893edb426dba 100644 --- a/drivers/md/persistent-data/dm-btree-internal.h +++ b/drivers/md/persistent-data/dm-btree-internal.h @@ -144,4 +144,17 @@ extern struct dm_block_validator btree_node_validator; extern void init_le64_type(struct dm_transaction_manager *tm, struct dm_btree_value_type *vt); +/* + * This returns a shadowed btree leaf that you may modify. In practise + * this means overwrites only, since an insert could cause a node to + * be split. Useful if you need access to the old value to calculate the + * new one. + * + * This only works with single level btrees. The given key must be present in + * the tree, otherwise -EINVAL will be returned. + */ +int btree_get_overwrite_leaf(struct dm_btree_info *info, dm_block_t root, + uint64_t key, int *index, + dm_block_t *new_root, struct dm_block **leaf); + #endif /* DM_BTREE_INTERNAL_H */ diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c index eff04fa23dfa..70532335c7c7 100644 --- a/drivers/md/persistent-data/dm-btree-remove.c +++ b/drivers/md/persistent-data/dm-btree-remove.c @@ -544,12 +544,13 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root, if (info->value_type.dec) info->value_type.dec(info->value_type.context, - value_ptr(n, index)); + value_ptr(n, index), 1); delete_at(n, index); } - *new_root = shadow_root(&spine); + if (!r) + *new_root = shadow_root(&spine); exit_shadow_spine(&spine); return r; @@ -653,7 +654,7 @@ static int remove_one(struct dm_btree_info *info, dm_block_t root, if (k >= keys[last_level] && k < end_key) { if (info->value_type.dec) info->value_type.dec(info->value_type.context, - value_ptr(n, index)); + value_ptr(n, index), 1); delete_at(n, index); keys[last_level] = k + 1ull; diff --git a/drivers/md/persistent-data/dm-btree-spine.c b/drivers/md/persistent-data/dm-btree-spine.c index 2061ab865567..f5bd76ed8fe6 100644 --- a/drivers/md/persistent-data/dm-btree-spine.c +++ b/drivers/md/persistent-data/dm-btree-spine.c @@ -236,22 +236,14 @@ dm_block_t shadow_root(struct shadow_spine *s) return s->root; } -static void le64_inc(void *context, const void *value_le) +static void le64_inc(void *context, const void *value_le, unsigned count) { - struct dm_transaction_manager *tm = context; - __le64 v_le; - - memcpy(&v_le, value_le, sizeof(v_le)); - dm_tm_inc(tm, le64_to_cpu(v_le)); + dm_tm_with_runs(context, value_le, count, dm_tm_inc_range); } -static void le64_dec(void *context, const void *value_le) +static void le64_dec(void *context, const void *value_le, unsigned count) { - struct dm_transaction_manager *tm = context; - __le64 v_le; - - memcpy(&v_le, value_le, sizeof(v_le)); - dm_tm_dec(tm, le64_to_cpu(v_le)); + dm_tm_with_runs(context, value_le, count, dm_tm_dec_range); } static int le64_equal(void *context, const void *value1_le, const void *value2_le) diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c index ef6e78d45d5b..0703ca7a7d9a 100644 --- a/drivers/md/persistent-data/dm-btree.c +++ b/drivers/md/persistent-data/dm-btree.c @@ -71,15 +71,13 @@ static int upper_bound(struct btree_node *n, uint64_t key) void inc_children(struct dm_transaction_manager *tm, struct btree_node *n, struct dm_btree_value_type *vt) { - unsigned i; uint32_t nr_entries = le32_to_cpu(n->header.nr_entries); if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) - for (i = 0; i < nr_entries; i++) - dm_tm_inc(tm, value64(n, i)); + dm_tm_with_runs(tm, value_ptr(n, 0), nr_entries, dm_tm_inc_range); + else if (vt->inc) - for (i = 0; i < nr_entries; i++) - vt->inc(vt->context, value_ptr(n, i)); + vt->inc(vt->context, value_ptr(n, 0), nr_entries); } static int insert_at(size_t value_size, struct btree_node *node, unsigned index, @@ -318,13 +316,9 @@ int dm_btree_del(struct dm_btree_info *info, dm_block_t root) goto out; } else { - if (info->value_type.dec) { - unsigned i; - - for (i = 0; i < f->nr_children; i++) - info->value_type.dec(info->value_type.context, - value_ptr(f->n, i)); - } + if (info->value_type.dec) + info->value_type.dec(info->value_type.context, + value_ptr(f->n, 0), f->nr_children); pop_frame(s); } } @@ -500,6 +494,122 @@ out: EXPORT_SYMBOL_GPL(dm_btree_lookup_next); +/*----------------------------------------------------------------*/ + +/* + * Copies entries from one region of a btree node to another. The regions + * must not overlap. + */ +static void copy_entries(struct btree_node *dest, unsigned dest_offset, + struct btree_node *src, unsigned src_offset, + unsigned count) +{ + size_t value_size = le32_to_cpu(dest->header.value_size); + memcpy(dest->keys + dest_offset, src->keys + src_offset, count * sizeof(uint64_t)); + memcpy(value_ptr(dest, dest_offset), value_ptr(src, src_offset), count * value_size); +} + +/* + * Moves entries from one region fo a btree node to another. The regions + * may overlap. + */ +static void move_entries(struct btree_node *dest, unsigned dest_offset, + struct btree_node *src, unsigned src_offset, + unsigned count) +{ + size_t value_size = le32_to_cpu(dest->header.value_size); + memmove(dest->keys + dest_offset, src->keys + src_offset, count * sizeof(uint64_t)); + memmove(value_ptr(dest, dest_offset), value_ptr(src, src_offset), count * value_size); +} + +/* + * Erases the first 'count' entries of a btree node, shifting following + * entries down into their place. + */ +static void shift_down(struct btree_node *n, unsigned count) +{ + move_entries(n, 0, n, count, le32_to_cpu(n->header.nr_entries) - count); +} + +/* + * Moves entries in a btree node up 'count' places, making space for + * new entries at the start of the node. + */ +static void shift_up(struct btree_node *n, unsigned count) +{ + move_entries(n, count, n, 0, le32_to_cpu(n->header.nr_entries)); +} + +/* + * Redistributes entries between two btree nodes to make them + * have similar numbers of entries. + */ +static void redistribute2(struct btree_node *left, struct btree_node *right) +{ + unsigned nr_left = le32_to_cpu(left->header.nr_entries); + unsigned nr_right = le32_to_cpu(right->header.nr_entries); + unsigned total = nr_left + nr_right; + unsigned target_left = total / 2; + unsigned target_right = total - target_left; + + if (nr_left < target_left) { + unsigned delta = target_left - nr_left; + copy_entries(left, nr_left, right, 0, delta); + shift_down(right, delta); + } else if (nr_left > target_left) { + unsigned delta = nr_left - target_left; + if (nr_right) + shift_up(right, delta); + copy_entries(right, 0, left, target_left, delta); + } + + left->header.nr_entries = cpu_to_le32(target_left); + right->header.nr_entries = cpu_to_le32(target_right); +} + +/* + * Redistribute entries between three nodes. Assumes the central + * node is empty. + */ +static void redistribute3(struct btree_node *left, struct btree_node *center, + struct btree_node *right) +{ + unsigned nr_left = le32_to_cpu(left->header.nr_entries); + unsigned nr_center = le32_to_cpu(center->header.nr_entries); + unsigned nr_right = le32_to_cpu(right->header.nr_entries); + unsigned total, target_left, target_center, target_right; + + BUG_ON(nr_center); + + total = nr_left + nr_right; + target_left = total / 3; + target_center = (total - target_left) / 2; + target_right = (total - target_left - target_center); + + if (nr_left < target_left) { + unsigned left_short = target_left - nr_left; + copy_entries(left, nr_left, right, 0, left_short); + copy_entries(center, 0, right, left_short, target_center); + shift_down(right, nr_right - target_right); + + } else if (nr_left < (target_left + target_center)) { + unsigned left_to_center = nr_left - target_left; + copy_entries(center, 0, left, target_left, left_to_center); + copy_entries(center, left_to_center, right, 0, target_center - left_to_center); + shift_down(right, nr_right - target_right); + + } else { + unsigned right_short = target_right - nr_right; + shift_up(right, right_short); + copy_entries(right, 0, left, nr_left - right_short, right_short); + copy_entries(center, 0, left, target_left, nr_left - target_left); + } + + left->header.nr_entries = cpu_to_le32(target_left); + center->header.nr_entries = cpu_to_le32(target_center); + right->header.nr_entries = cpu_to_le32(target_right); +} + /* * Splits a node by creating a sibling node and shifting half the nodes * contents across. Assumes there is a parent node, and it has room for @@ -530,12 +640,10 @@ EXPORT_SYMBOL_GPL(dm_btree_lookup_next); * * Where A* is a shadow of A. */ -static int btree_split_sibling(struct shadow_spine *s, unsigned parent_index, - uint64_t key) +static int split_one_into_two(struct shadow_spine *s, unsigned parent_index, + struct dm_btree_value_type *vt, uint64_t key) { int r; - size_t size; - unsigned nr_left, nr_right; struct dm_block *left, *right, *parent; struct btree_node *ln, *rn, *pn; __le64 location; @@ -549,36 +657,18 @@ static int btree_split_sibling(struct shadow_spine *s, unsigned parent_index, ln = dm_block_data(left); rn = dm_block_data(right); - nr_left = le32_to_cpu(ln->header.nr_entries) / 2; - nr_right = le32_to_cpu(ln->header.nr_entries) - nr_left; - - ln->header.nr_entries = cpu_to_le32(nr_left); - rn->header.flags = ln->header.flags; - rn->header.nr_entries = cpu_to_le32(nr_right); + rn->header.nr_entries = cpu_to_le32(0); rn->header.max_entries = ln->header.max_entries; rn->header.value_size = ln->header.value_size; - memcpy(rn->keys, ln->keys + nr_left, nr_right * sizeof(rn->keys[0])); - - size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ? - sizeof(uint64_t) : s->info->value_type.size; - memcpy(value_ptr(rn, 0), value_ptr(ln, nr_left), - size * nr_right); + redistribute2(ln, rn); - /* - * Patch up the parent - */ + /* patch up the parent */ parent = shadow_parent(s); - pn = dm_block_data(parent); - location = cpu_to_le64(dm_block_location(left)); - __dm_bless_for_disk(&location); - memcpy_disk(value_ptr(pn, parent_index), - &location, sizeof(__le64)); location = cpu_to_le64(dm_block_location(right)); __dm_bless_for_disk(&location); - r = insert_at(sizeof(__le64), pn, parent_index + 1, le64_to_cpu(rn->keys[0]), &location); if (r) { @@ -586,6 +676,7 @@ static int btree_split_sibling(struct shadow_spine *s, unsigned parent_index, return r; } + /* patch up the spine */ if (key < le64_to_cpu(rn->keys[0])) { unlock_block(s->info, right); s->nodes[1] = left; @@ -598,6 +689,121 @@ static int btree_split_sibling(struct shadow_spine *s, unsigned parent_index, } /* + * We often need to modify a sibling node. This function shadows a particular + * child of the given parent node. Making sure to update the parent to point + * to the new shadow. + */ +static int shadow_child(struct dm_btree_info *info, struct dm_btree_value_type *vt, + struct btree_node *parent, unsigned index, + struct dm_block **result) +{ + int r, inc; + dm_block_t root; + struct btree_node *node; + + root = value64(parent, index); + + r = dm_tm_shadow_block(info->tm, root, &btree_node_validator, + result, &inc); + if (r) + return r; + + node = dm_block_data(*result); + + if (inc) + inc_children(info->tm, node, vt); + + *((__le64 *) value_ptr(parent, index)) = + cpu_to_le64(dm_block_location(*result)); + + return 0; +} + +/* + * Splits two nodes into three. This is more work, but results in fuller + * nodes, so saves metadata space. + */ +static int split_two_into_three(struct shadow_spine *s, unsigned parent_index, + struct dm_btree_value_type *vt, uint64_t key) +{ + int r; + unsigned middle_index; + struct dm_block *left, *middle, *right, *parent; + struct btree_node *ln, *rn, *mn, *pn; + __le64 location; + + parent = shadow_parent(s); + pn = dm_block_data(parent); + + if (parent_index == 0) { + middle_index = 1; + left = shadow_current(s); + r = shadow_child(s->info, vt, pn, parent_index + 1, &right); + if (r) + return r; + } else { + middle_index = parent_index; + right = shadow_current(s); + r = shadow_child(s->info, vt, pn, parent_index - 1, &left); + if (r) + return r; + } + + r = new_block(s->info, &middle); + if (r < 0) + return r; + + ln = dm_block_data(left); + mn = dm_block_data(middle); + rn = dm_block_data(right); + + mn->header.nr_entries = cpu_to_le32(0); + mn->header.flags = ln->header.flags; + mn->header.max_entries = ln->header.max_entries; + mn->header.value_size = ln->header.value_size; + + redistribute3(ln, mn, rn); + + /* patch up the parent */ + pn->keys[middle_index] = rn->keys[0]; + location = cpu_to_le64(dm_block_location(middle)); + __dm_bless_for_disk(&location); + r = insert_at(sizeof(__le64), pn, middle_index, + le64_to_cpu(mn->keys[0]), &location); + if (r) { + if (shadow_current(s) != left) + unlock_block(s->info, left); + + unlock_block(s->info, middle); + + if (shadow_current(s) != right) + unlock_block(s->info, right); + + return r; + } + + + /* patch up the spine */ + if (key < le64_to_cpu(mn->keys[0])) { + unlock_block(s->info, middle); + unlock_block(s->info, right); + s->nodes[1] = left; + } else if (key < le64_to_cpu(rn->keys[0])) { + unlock_block(s->info, left); + unlock_block(s->info, right); + s->nodes[1] = middle; + } else { + unlock_block(s->info, left); + unlock_block(s->info, middle); + s->nodes[1] = right; + } + + return 0; +} + +/*----------------------------------------------------------------*/ + +/* * Splits a node by creating two new children beneath the given node. * * Before: @@ -690,6 +896,186 @@ static int btree_split_beneath(struct shadow_spine *s, uint64_t key) return 0; } +/*----------------------------------------------------------------*/ + +/* + * Redistributes a node's entries with its left sibling. + */ +static int rebalance_left(struct shadow_spine *s, struct dm_btree_value_type *vt, + unsigned parent_index, uint64_t key) +{ + int r; + struct dm_block *sib; + struct btree_node *left, *right, *parent = dm_block_data(shadow_parent(s)); + + r = shadow_child(s->info, vt, parent, parent_index - 1, &sib); + if (r) + return r; + + left = dm_block_data(sib); + right = dm_block_data(shadow_current(s)); + redistribute2(left, right); + *key_ptr(parent, parent_index) = right->keys[0]; + + if (key < le64_to_cpu(right->keys[0])) { + unlock_block(s->info, s->nodes[1]); + s->nodes[1] = sib; + } else { + unlock_block(s->info, sib); + } + + return 0; +} + +/* + * Redistributes a nodes entries with its right sibling. + */ +static int rebalance_right(struct shadow_spine *s, struct dm_btree_value_type *vt, + unsigned parent_index, uint64_t key) +{ + int r; + struct dm_block *sib; + struct btree_node *left, *right, *parent = dm_block_data(shadow_parent(s)); + + r = shadow_child(s->info, vt, parent, parent_index + 1, &sib); + if (r) + return r; + + left = dm_block_data(shadow_current(s)); + right = dm_block_data(sib); + redistribute2(left, right); + *key_ptr(parent, parent_index + 1) = right->keys[0]; + + if (key < le64_to_cpu(right->keys[0])) { + unlock_block(s->info, sib); + } else { + unlock_block(s->info, s->nodes[1]); + s->nodes[1] = sib; + } + + return 0; +} + +/* + * Returns the number of spare entries in a node. + */ +static int get_node_free_space(struct dm_btree_info *info, dm_block_t b, unsigned *space) +{ + int r; + unsigned nr_entries; + struct dm_block *block; + struct btree_node *node; + + r = bn_read_lock(info, b, &block); + if (r) + return r; + + node = dm_block_data(block); + nr_entries = le32_to_cpu(node->header.nr_entries); + *space = le32_to_cpu(node->header.max_entries) - nr_entries; + + unlock_block(info, block); + return 0; +} + +/* + * Make space in a node, either by moving some entries to a sibling, + * or creating a new sibling node. SPACE_THRESHOLD defines the minimum + * number of free entries that must be in the sibling to make the move + * worth while. If the siblings are shared (eg, part of a snapshot), + * then they are not touched, since this break sharing and so consume + * more space than we save. + */ +#define SPACE_THRESHOLD 8 +static int rebalance_or_split(struct shadow_spine *s, struct dm_btree_value_type *vt, + unsigned parent_index, uint64_t key) +{ + int r; + struct btree_node *parent = dm_block_data(shadow_parent(s)); + unsigned nr_parent = le32_to_cpu(parent->header.nr_entries); + unsigned free_space; + int left_shared = 0, right_shared = 0; + + /* Should we move entries to the left sibling? */ + if (parent_index > 0) { + dm_block_t left_b = value64(parent, parent_index - 1); + r = dm_tm_block_is_shared(s->info->tm, left_b, &left_shared); + if (r) + return r; + + if (!left_shared) { + r = get_node_free_space(s->info, left_b, &free_space); + if (r) + return r; + + if (free_space >= SPACE_THRESHOLD) + return rebalance_left(s, vt, parent_index, key); + } + } + + /* Should we move entries to the right sibling? */ + if (parent_index < (nr_parent - 1)) { + dm_block_t right_b = value64(parent, parent_index + 1); + r = dm_tm_block_is_shared(s->info->tm, right_b, &right_shared); + if (r) + return r; + + if (!right_shared) { + r = get_node_free_space(s->info, right_b, &free_space); + if (r) + return r; + + if (free_space >= SPACE_THRESHOLD) + return rebalance_right(s, vt, parent_index, key); + } + } + + /* + * We need to split the node, normally we split two nodes + * into three. But when inserting a sequence that is either + * monotonically increasing or decreasing it's better to split + * a single node into two. + */ + if (left_shared || right_shared || (nr_parent <= 2) || + (parent_index == 0) || (parent_index + 1 == nr_parent)) { + return split_one_into_two(s, parent_index, vt, key); + } else { + return split_two_into_three(s, parent_index, vt, key); + } +} + +/* + * Does the node contain a particular key? + */ +static bool contains_key(struct btree_node *node, uint64_t key) +{ + int i = lower_bound(node, key); + + if (i >= 0 && le64_to_cpu(node->keys[i]) == key) + return true; + + return false; +} + +/* + * In general we preemptively make sure there's a free entry in every + * node on the spine when doing an insert. But we can avoid that with + * leaf nodes if we know it's an overwrite. + */ +static bool has_space_for_insert(struct btree_node *node, uint64_t key) +{ + if (node->header.nr_entries == node->header.max_entries) { + if (le32_to_cpu(node->header.flags) & LEAF_NODE) { + /* we don't need space if it's an overwrite */ + return contains_key(node, key); + } + + return false; + } + + return true; +} + static int btree_insert_raw(struct shadow_spine *s, dm_block_t root, struct dm_btree_value_type *vt, uint64_t key, unsigned *index) @@ -719,17 +1105,18 @@ static int btree_insert_raw(struct shadow_spine *s, dm_block_t root, node = dm_block_data(shadow_current(s)); - if (node->header.nr_entries == node->header.max_entries) { + if (!has_space_for_insert(node, key)) { if (top) r = btree_split_beneath(s, key); else - r = btree_split_sibling(s, i, key); + r = rebalance_or_split(s, vt, i, key); if (r < 0) return r; - } - node = dm_block_data(shadow_current(s)); + /* making space can cause the current node to change */ + node = dm_block_data(shadow_current(s)); + } i = lower_bound(node, key); @@ -753,6 +1140,77 @@ static int btree_insert_raw(struct shadow_spine *s, dm_block_t root, return 0; } +static int __btree_get_overwrite_leaf(struct shadow_spine *s, dm_block_t root, + uint64_t key, int *index) +{ + int r, i = -1; + struct btree_node *node; + + *index = 0; + for (;;) { + r = shadow_step(s, root, &s->info->value_type); + if (r < 0) + return r; + + node = dm_block_data(shadow_current(s)); + + /* + * We have to patch up the parent node, ugly, but I don't + * see a way to do this automatically as part of the spine + * op. + */ + if (shadow_has_parent(s) && i >= 0) { + __le64 location = cpu_to_le64(dm_block_location(shadow_current(s))); + + __dm_bless_for_disk(&location); + memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i), + &location, sizeof(__le64)); + } + + node = dm_block_data(shadow_current(s)); + i = lower_bound(node, key); + + BUG_ON(i < 0); + BUG_ON(i >= le32_to_cpu(node->header.nr_entries)); + + if (le32_to_cpu(node->header.flags) & LEAF_NODE) { + if (key != le64_to_cpu(node->keys[i])) + return -EINVAL; + break; + } + + root = value64(node, i); + } + + *index = i; + return 0; +} + +int btree_get_overwrite_leaf(struct dm_btree_info *info, dm_block_t root, + uint64_t key, int *index, + dm_block_t *new_root, struct dm_block **leaf) +{ + int r; + struct shadow_spine spine; + + BUG_ON(info->levels > 1); + init_shadow_spine(&spine, info); + r = __btree_get_overwrite_leaf(&spine, root, key, index); + if (!r) { + *new_root = shadow_root(&spine); + *leaf = shadow_current(&spine); + + /* + * Decrement the count so exit_shadow_spine() doesn't + * unlock the leaf. + */ + spine.count--; + } + exit_shadow_spine(&spine); + + return r; +} + static bool need_insert(struct btree_node *node, uint64_t *keys, unsigned level, unsigned index) { @@ -829,7 +1287,7 @@ static int insert(struct dm_btree_info *info, dm_block_t root, value_ptr(n, index), value))) { info->value_type.dec(info->value_type.context, - value_ptr(n, index)); + value_ptr(n, index), 1); } memcpy_disk(value_ptr(n, index), value, info->value_type.size); diff --git a/drivers/md/persistent-data/dm-btree.h b/drivers/md/persistent-data/dm-btree.h index 3dc5bb1a4748..d2ae5aa4d00b 100644 --- a/drivers/md/persistent-data/dm-btree.h +++ b/drivers/md/persistent-data/dm-btree.h @@ -51,21 +51,21 @@ struct dm_btree_value_type { */ /* - * The btree is making a duplicate of the value, for instance + * The btree is making a duplicate of a run of values, for instance * because previously-shared btree nodes have now diverged. * @value argument is the new copy that the copy function may modify. * (Probably it just wants to increment a reference count * somewhere.) This method is _not_ called for insertion of a new * value: It is assumed the ref count is already 1. */ - void (*inc)(void *context, const void *value); + void (*inc)(void *context, const void *value, unsigned count); /* - * This value is being deleted. The btree takes care of freeing + * These values are being deleted. The btree takes care of freeing * the memory pointed to by @value. Often the del function just - * needs to decrement a reference count somewhere. + * needs to decrement a reference counts somewhere. */ - void (*dec)(void *context, const void *value); + void (*dec)(void *context, const void *value, unsigned count); /* * A test for equality between two values. When a value is diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c index a213bf11738f..4a6a2a9b4eb4 100644 --- a/drivers/md/persistent-data/dm-space-map-common.c +++ b/drivers/md/persistent-data/dm-space-map-common.c @@ -6,6 +6,8 @@ #include "dm-space-map-common.h" #include "dm-transaction-manager.h" +#include "dm-btree-internal.h" +#include "dm-persistent-data-internal.h" #include <linux/bitops.h> #include <linux/device-mapper.h> @@ -409,12 +411,13 @@ int sm_ll_find_common_free_block(struct ll_disk *old_ll, struct ll_disk *new_ll, return r; } -static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b, - int (*mutator)(void *context, uint32_t old, uint32_t *new), - void *context, enum allocation_event *ev) +/*----------------------------------------------------------------*/ + +int sm_ll_insert(struct ll_disk *ll, dm_block_t b, + uint32_t ref_count, int32_t *nr_allocations) { int r; - uint32_t bit, old, ref_count; + uint32_t bit, old; struct dm_block *nb; dm_block_t index = b; struct disk_index_entry ie_disk; @@ -433,10 +436,9 @@ static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b, return r; } ie_disk.blocknr = cpu_to_le64(dm_block_location(nb)); - bm_le = dm_bitmap_data(nb); - old = sm_lookup_bitmap(bm_le, bit); + old = sm_lookup_bitmap(bm_le, bit); if (old > 2) { r = sm_ll_lookup_big_ref_count(ll, b, &old); if (r < 0) { @@ -445,7 +447,6 @@ static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b, } } - r = mutator(context, old, &ref_count); if (r) { dm_tm_unlock(ll->tm, nb); return r; @@ -453,7 +454,6 @@ static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b, if (ref_count <= 2) { sm_set_bitmap(bm_le, bit, ref_count); - dm_tm_unlock(ll->tm, nb); if (old > 2) { @@ -480,62 +480,459 @@ static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b, } if (ref_count && !old) { - *ev = SM_ALLOC; + *nr_allocations = 1; ll->nr_allocated++; le32_add_cpu(&ie_disk.nr_free, -1); if (le32_to_cpu(ie_disk.none_free_before) == bit) ie_disk.none_free_before = cpu_to_le32(bit + 1); } else if (old && !ref_count) { - *ev = SM_FREE; + *nr_allocations = -1; ll->nr_allocated--; le32_add_cpu(&ie_disk.nr_free, 1); ie_disk.none_free_before = cpu_to_le32(min(le32_to_cpu(ie_disk.none_free_before), bit)); } else - *ev = SM_NONE; + *nr_allocations = 0; return ll->save_ie(ll, index, &ie_disk); } -static int set_ref_count(void *context, uint32_t old, uint32_t *new) +/*----------------------------------------------------------------*/ + +/* + * Holds useful intermediate results for the range based inc and dec + * operations. + */ +struct inc_context { + struct disk_index_entry ie_disk; + struct dm_block *bitmap_block; + void *bitmap; + + struct dm_block *overflow_leaf; +}; + +static inline void init_inc_context(struct inc_context *ic) +{ + ic->bitmap_block = NULL; + ic->bitmap = NULL; + ic->overflow_leaf = NULL; +} + +static inline void exit_inc_context(struct ll_disk *ll, struct inc_context *ic) +{ + if (ic->bitmap_block) + dm_tm_unlock(ll->tm, ic->bitmap_block); + if (ic->overflow_leaf) + dm_tm_unlock(ll->tm, ic->overflow_leaf); +} + +static inline void reset_inc_context(struct ll_disk *ll, struct inc_context *ic) +{ + exit_inc_context(ll, ic); + init_inc_context(ic); +} + +/* + * Confirms a btree node contains a particular key at an index. + */ +static bool contains_key(struct btree_node *n, uint64_t key, int index) +{ + return index >= 0 && + index < le32_to_cpu(n->header.nr_entries) && + le64_to_cpu(n->keys[index]) == key; +} + +static int __sm_ll_inc_overflow(struct ll_disk *ll, dm_block_t b, struct inc_context *ic) { - *new = *((uint32_t *) context); + int r; + int index; + struct btree_node *n; + __le32 *v_ptr; + uint32_t rc; + + /* + * bitmap_block needs to be unlocked because getting the + * overflow_leaf may need to allocate, and thus use the space map. + */ + reset_inc_context(ll, ic); + + r = btree_get_overwrite_leaf(&ll->ref_count_info, ll->ref_count_root, + b, &index, &ll->ref_count_root, &ic->overflow_leaf); + if (r < 0) + return r; + + n = dm_block_data(ic->overflow_leaf); + + if (!contains_key(n, b, index)) { + DMERR("overflow btree is missing an entry"); + return -EINVAL; + } + + v_ptr = value_ptr(n, index); + rc = le32_to_cpu(*v_ptr) + 1; + *v_ptr = cpu_to_le32(rc); + return 0; } -int sm_ll_insert(struct ll_disk *ll, dm_block_t b, - uint32_t ref_count, enum allocation_event *ev) +static int sm_ll_inc_overflow(struct ll_disk *ll, dm_block_t b, struct inc_context *ic) +{ + int index; + struct btree_node *n; + __le32 *v_ptr; + uint32_t rc; + + /* + * Do we already have the correct overflow leaf? + */ + if (ic->overflow_leaf) { + n = dm_block_data(ic->overflow_leaf); + index = lower_bound(n, b); + if (contains_key(n, b, index)) { + v_ptr = value_ptr(n, index); + rc = le32_to_cpu(*v_ptr) + 1; + *v_ptr = cpu_to_le32(rc); + + return 0; + } + } + + return __sm_ll_inc_overflow(ll, b, ic); +} + +static inline int shadow_bitmap(struct ll_disk *ll, struct inc_context *ic) +{ + int r, inc; + r = dm_tm_shadow_block(ll->tm, le64_to_cpu(ic->ie_disk.blocknr), + &dm_sm_bitmap_validator, &ic->bitmap_block, &inc); + if (r < 0) { + DMERR("dm_tm_shadow_block() failed"); + return r; + } + ic->ie_disk.blocknr = cpu_to_le64(dm_block_location(ic->bitmap_block)); + ic->bitmap = dm_bitmap_data(ic->bitmap_block); + return 0; +} + +/* + * Once shadow_bitmap has been called, which always happens at the start of inc/dec, + * we can reopen the bitmap with a simple write lock, rather than re calling + * dm_tm_shadow_block(). + */ +static inline int ensure_bitmap(struct ll_disk *ll, struct inc_context *ic) +{ + if (!ic->bitmap_block) { + int r = dm_bm_write_lock(dm_tm_get_bm(ll->tm), le64_to_cpu(ic->ie_disk.blocknr), + &dm_sm_bitmap_validator, &ic->bitmap_block); + if (r) { + DMERR("unable to re-get write lock for bitmap"); + return r; + } + ic->bitmap = dm_bitmap_data(ic->bitmap_block); + } + + return 0; +} + +/* + * Loops round incrementing entries in a single bitmap. + */ +static inline int sm_ll_inc_bitmap(struct ll_disk *ll, dm_block_t b, + uint32_t bit, uint32_t bit_end, + int32_t *nr_allocations, dm_block_t *new_b, + struct inc_context *ic) +{ + int r; + __le32 le_rc; + uint32_t old; + + for (; bit != bit_end; bit++, b++) { + /* + * We only need to drop the bitmap if we need to find a new btree + * leaf for the overflow. So if it was dropped last iteration, + * we now re-get it. + */ + r = ensure_bitmap(ll, ic); + if (r) + return r; + + old = sm_lookup_bitmap(ic->bitmap, bit); + switch (old) { + case 0: + /* inc bitmap, adjust nr_allocated */ + sm_set_bitmap(ic->bitmap, bit, 1); + (*nr_allocations)++; + ll->nr_allocated++; + le32_add_cpu(&ic->ie_disk.nr_free, -1); + if (le32_to_cpu(ic->ie_disk.none_free_before) == bit) + ic->ie_disk.none_free_before = cpu_to_le32(bit + 1); + break; + + case 1: + /* inc bitmap */ + sm_set_bitmap(ic->bitmap, bit, 2); + break; + + case 2: + /* inc bitmap and insert into overflow */ + sm_set_bitmap(ic->bitmap, bit, 3); + reset_inc_context(ll, ic); + + le_rc = cpu_to_le32(3); + __dm_bless_for_disk(&le_rc); + r = dm_btree_insert(&ll->ref_count_info, ll->ref_count_root, + &b, &le_rc, &ll->ref_count_root); + if (r < 0) { + DMERR("ref count insert failed"); + return r; + } + break; + + default: + /* + * inc within the overflow tree only. + */ + r = sm_ll_inc_overflow(ll, b, ic); + if (r < 0) + return r; + } + } + + *new_b = b; + return 0; +} + +/* + * Finds a bitmap that contains entries in the block range, and increments + * them. + */ +static int __sm_ll_inc(struct ll_disk *ll, dm_block_t b, dm_block_t e, + int32_t *nr_allocations, dm_block_t *new_b) { - return sm_ll_mutate(ll, b, set_ref_count, &ref_count, ev); + int r; + struct inc_context ic; + uint32_t bit, bit_end; + dm_block_t index = b; + + init_inc_context(&ic); + + bit = do_div(index, ll->entries_per_block); + r = ll->load_ie(ll, index, &ic.ie_disk); + if (r < 0) + return r; + + r = shadow_bitmap(ll, &ic); + if (r) + return r; + + bit_end = min(bit + (e - b), (dm_block_t) ll->entries_per_block); + r = sm_ll_inc_bitmap(ll, b, bit, bit_end, nr_allocations, new_b, &ic); + + exit_inc_context(ll, &ic); + + if (r) + return r; + + return ll->save_ie(ll, index, &ic.ie_disk); } -static int inc_ref_count(void *context, uint32_t old, uint32_t *new) +int sm_ll_inc(struct ll_disk *ll, dm_block_t b, dm_block_t e, + int32_t *nr_allocations) { - *new = old + 1; + *nr_allocations = 0; + while (b != e) { + int r = __sm_ll_inc(ll, b, e, nr_allocations, &b); + if (r) + return r; + } + return 0; } -int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev) +/*----------------------------------------------------------------*/ + +static int __sm_ll_del_overflow(struct ll_disk *ll, dm_block_t b, + struct inc_context *ic) { - return sm_ll_mutate(ll, b, inc_ref_count, NULL, ev); + reset_inc_context(ll, ic); + return dm_btree_remove(&ll->ref_count_info, ll->ref_count_root, + &b, &ll->ref_count_root); } -static int dec_ref_count(void *context, uint32_t old, uint32_t *new) +static int __sm_ll_dec_overflow(struct ll_disk *ll, dm_block_t b, + struct inc_context *ic, uint32_t *old_rc) { - if (!old) { - DMERR_LIMIT("unable to decrement a reference count below 0"); + int r; + int index = -1; + struct btree_node *n; + __le32 *v_ptr; + uint32_t rc; + + reset_inc_context(ll, ic); + r = btree_get_overwrite_leaf(&ll->ref_count_info, ll->ref_count_root, + b, &index, &ll->ref_count_root, &ic->overflow_leaf); + if (r < 0) + return r; + + n = dm_block_data(ic->overflow_leaf); + + if (!contains_key(n, b, index)) { + DMERR("overflow btree is missing an entry"); return -EINVAL; } - *new = old - 1; + v_ptr = value_ptr(n, index); + rc = le32_to_cpu(*v_ptr); + *old_rc = rc; + + if (rc == 3) { + return __sm_ll_del_overflow(ll, b, ic); + } else { + rc--; + *v_ptr = cpu_to_le32(rc); + return 0; + } +} + +static int sm_ll_dec_overflow(struct ll_disk *ll, dm_block_t b, + struct inc_context *ic, uint32_t *old_rc) +{ + /* + * Do we already have the correct overflow leaf? + */ + if (ic->overflow_leaf) { + int index; + struct btree_node *n; + __le32 *v_ptr; + uint32_t rc; + + n = dm_block_data(ic->overflow_leaf); + index = lower_bound(n, b); + if (contains_key(n, b, index)) { + v_ptr = value_ptr(n, index); + rc = le32_to_cpu(*v_ptr); + *old_rc = rc; + + if (rc > 3) { + rc--; + *v_ptr = cpu_to_le32(rc); + return 0; + } else { + return __sm_ll_del_overflow(ll, b, ic); + } + + } + } + + return __sm_ll_dec_overflow(ll, b, ic, old_rc); +} + +/* + * Loops round incrementing entries in a single bitmap. + */ +static inline int sm_ll_dec_bitmap(struct ll_disk *ll, dm_block_t b, + uint32_t bit, uint32_t bit_end, + struct inc_context *ic, + int32_t *nr_allocations, dm_block_t *new_b) +{ + int r; + uint32_t old; + + for (; bit != bit_end; bit++, b++) { + /* + * We only need to drop the bitmap if we need to find a new btree + * leaf for the overflow. So if it was dropped last iteration, + * we now re-get it. + */ + r = ensure_bitmap(ll, ic); + if (r) + return r; + + old = sm_lookup_bitmap(ic->bitmap, bit); + switch (old) { + case 0: + DMERR("unable to decrement block"); + return -EINVAL; + + case 1: + /* dec bitmap */ + sm_set_bitmap(ic->bitmap, bit, 0); + (*nr_allocations)--; + ll->nr_allocated--; + le32_add_cpu(&ic->ie_disk.nr_free, 1); + ic->ie_disk.none_free_before = + cpu_to_le32(min(le32_to_cpu(ic->ie_disk.none_free_before), bit)); + break; + + case 2: + /* dec bitmap and insert into overflow */ + sm_set_bitmap(ic->bitmap, bit, 1); + break; + + case 3: + r = sm_ll_dec_overflow(ll, b, ic, &old); + if (r < 0) + return r; + + if (old == 3) { + r = ensure_bitmap(ll, ic); + if (r) + return r; + + sm_set_bitmap(ic->bitmap, bit, 2); + } + break; + } + } + + *new_b = b; return 0; } -int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev) +static int __sm_ll_dec(struct ll_disk *ll, dm_block_t b, dm_block_t e, + int32_t *nr_allocations, dm_block_t *new_b) +{ + int r; + uint32_t bit, bit_end; + struct inc_context ic; + dm_block_t index = b; + + init_inc_context(&ic); + + bit = do_div(index, ll->entries_per_block); + r = ll->load_ie(ll, index, &ic.ie_disk); + if (r < 0) + return r; + + r = shadow_bitmap(ll, &ic); + if (r) + return r; + + bit_end = min(bit + (e - b), (dm_block_t) ll->entries_per_block); + r = sm_ll_dec_bitmap(ll, b, bit, bit_end, &ic, nr_allocations, new_b); + exit_inc_context(ll, &ic); + + if (r) + return r; + + return ll->save_ie(ll, index, &ic.ie_disk); +} + +int sm_ll_dec(struct ll_disk *ll, dm_block_t b, dm_block_t e, + int32_t *nr_allocations) { - return sm_ll_mutate(ll, b, dec_ref_count, NULL, ev); + *nr_allocations = 0; + while (b != e) { + int r = __sm_ll_dec(ll, b, e, nr_allocations, &b); + if (r) + return r; + } + + return 0; } +/*----------------------------------------------------------------*/ + int sm_ll_commit(struct ll_disk *ll) { int r = 0; @@ -687,28 +1084,92 @@ int sm_ll_open_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm, /*----------------------------------------------------------------*/ +static inline int ie_cache_writeback(struct ll_disk *ll, struct ie_cache *iec) +{ + iec->dirty = false; + __dm_bless_for_disk(iec->ie); + return dm_btree_insert(&ll->bitmap_info, ll->bitmap_root, + &iec->index, &iec->ie, &ll->bitmap_root); +} + +static inline unsigned hash_index(dm_block_t index) +{ + return dm_hash_block(index, IE_CACHE_MASK); +} + static int disk_ll_load_ie(struct ll_disk *ll, dm_block_t index, struct disk_index_entry *ie) { - return dm_btree_lookup(&ll->bitmap_info, ll->bitmap_root, &index, ie); + int r; + unsigned h = hash_index(index); + struct ie_cache *iec = ll->ie_cache + h; + + if (iec->valid) { + if (iec->index == index) { + memcpy(ie, &iec->ie, sizeof(*ie)); + return 0; + } + + if (iec->dirty) { + r = ie_cache_writeback(ll, iec); + if (r) + return r; + } + } + + r = dm_btree_lookup(&ll->bitmap_info, ll->bitmap_root, &index, ie); + if (!r) { + iec->valid = true; + iec->dirty = false; + iec->index = index; + memcpy(&iec->ie, ie, sizeof(*ie)); + } + + return r; } static int disk_ll_save_ie(struct ll_disk *ll, dm_block_t index, struct disk_index_entry *ie) { - __dm_bless_for_disk(ie); - return dm_btree_insert(&ll->bitmap_info, ll->bitmap_root, - &index, ie, &ll->bitmap_root); + int r; + unsigned h = hash_index(index); + struct ie_cache *iec = ll->ie_cache + h; + + ll->bitmap_index_changed = true; + if (iec->valid) { + if (iec->index == index) { + memcpy(&iec->ie, ie, sizeof(*ie)); + iec->dirty = true; + return 0; + } + + if (iec->dirty) { + r = ie_cache_writeback(ll, iec); + if (r) + return r; + } + } + + iec->valid = true; + iec->dirty = true; + iec->index = index; + memcpy(&iec->ie, ie, sizeof(*ie)); + return 0; } static int disk_ll_init_index(struct ll_disk *ll) { + unsigned i; + for (i = 0; i < IE_CACHE_SIZE; i++) { + struct ie_cache *iec = ll->ie_cache + i; + iec->valid = false; + iec->dirty = false; + } return dm_btree_empty(&ll->bitmap_info, &ll->bitmap_root); } static int disk_ll_open(struct ll_disk *ll) { - /* nothing to do */ return 0; } @@ -719,7 +1180,16 @@ static dm_block_t disk_ll_max_entries(struct ll_disk *ll) static int disk_ll_commit(struct ll_disk *ll) { - return 0; + int r = 0; + unsigned i; + + for (i = 0; i < IE_CACHE_SIZE; i++) { + struct ie_cache *iec = ll->ie_cache + i; + if (iec->valid && iec->dirty) + r = ie_cache_writeback(ll, iec); + } + + return r; } int sm_ll_new_disk(struct ll_disk *ll, struct dm_transaction_manager *tm) diff --git a/drivers/md/persistent-data/dm-space-map-common.h b/drivers/md/persistent-data/dm-space-map-common.h index 87e17909ef52..706ceb85d680 100644 --- a/drivers/md/persistent-data/dm-space-map-common.h +++ b/drivers/md/persistent-data/dm-space-map-common.h @@ -54,6 +54,20 @@ typedef int (*open_index_fn)(struct ll_disk *ll); typedef dm_block_t (*max_index_entries_fn)(struct ll_disk *ll); typedef int (*commit_fn)(struct ll_disk *ll); +/* + * A lot of time can be wasted reading and writing the same + * index entry. So we cache a few entries. + */ +#define IE_CACHE_SIZE 64 +#define IE_CACHE_MASK (IE_CACHE_SIZE - 1) + +struct ie_cache { + bool valid; + bool dirty; + dm_block_t index; + struct disk_index_entry ie; +}; + struct ll_disk { struct dm_transaction_manager *tm; struct dm_btree_info bitmap_info; @@ -79,6 +93,8 @@ struct ll_disk { max_index_entries_fn max_entries; commit_fn commit; bool bitmap_index_changed:1; + + struct ie_cache ie_cache[IE_CACHE_SIZE]; }; struct disk_sm_root { @@ -96,12 +112,6 @@ struct disk_bitmap_header { __le64 blocknr; } __attribute__ ((packed, aligned(8))); -enum allocation_event { - SM_NONE, - SM_ALLOC, - SM_FREE, -}; - /*----------------------------------------------------------------*/ int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks); @@ -111,9 +121,15 @@ int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin, dm_block_t end, dm_block_t *result); int sm_ll_find_common_free_block(struct ll_disk *old_ll, struct ll_disk *new_ll, dm_block_t begin, dm_block_t end, dm_block_t *result); -int sm_ll_insert(struct ll_disk *ll, dm_block_t b, uint32_t ref_count, enum allocation_event *ev); -int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev); -int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev); + +/* + * The next three functions return (via nr_allocations) the net number of + * allocations that were made. This number may be negative if there were + * more frees than allocs. + */ +int sm_ll_insert(struct ll_disk *ll, dm_block_t b, uint32_t ref_count, int32_t *nr_allocations); +int sm_ll_inc(struct ll_disk *ll, dm_block_t b, dm_block_t e, int32_t *nr_allocations); +int sm_ll_dec(struct ll_disk *ll, dm_block_t b, dm_block_t e, int32_t *nr_allocations); int sm_ll_commit(struct ll_disk *ll); int sm_ll_new_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm); diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c index 61f56909e00b..d0a8d5e73c28 100644 --- a/drivers/md/persistent-data/dm-space-map-disk.c +++ b/drivers/md/persistent-data/dm-space-map-disk.c @@ -87,76 +87,39 @@ static int sm_disk_set_count(struct dm_space_map *sm, dm_block_t b, uint32_t count) { int r; - uint32_t old_count; - enum allocation_event ev; + int32_t nr_allocations; struct sm_disk *smd = container_of(sm, struct sm_disk, sm); - r = sm_ll_insert(&smd->ll, b, count, &ev); + r = sm_ll_insert(&smd->ll, b, count, &nr_allocations); if (!r) { - switch (ev) { - case SM_NONE: - break; - - case SM_ALLOC: - /* - * This _must_ be free in the prior transaction - * otherwise we've lost atomicity. - */ - smd->nr_allocated_this_transaction++; - break; - - case SM_FREE: - /* - * It's only free if it's also free in the last - * transaction. - */ - r = sm_ll_lookup(&smd->old_ll, b, &old_count); - if (r) - return r; - - if (!old_count) - smd->nr_allocated_this_transaction--; - break; - } + smd->nr_allocated_this_transaction += nr_allocations; } return r; } -static int sm_disk_inc_block(struct dm_space_map *sm, dm_block_t b) +static int sm_disk_inc_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e) { int r; - enum allocation_event ev; + int32_t nr_allocations; struct sm_disk *smd = container_of(sm, struct sm_disk, sm); - r = sm_ll_inc(&smd->ll, b, &ev); - if (!r && (ev == SM_ALLOC)) - /* - * This _must_ be free in the prior transaction - * otherwise we've lost atomicity. - */ - smd->nr_allocated_this_transaction++; + r = sm_ll_inc(&smd->ll, b, e, &nr_allocations); + if (!r) + smd->nr_allocated_this_transaction += nr_allocations; return r; } -static int sm_disk_dec_block(struct dm_space_map *sm, dm_block_t b) +static int sm_disk_dec_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e) { int r; - uint32_t old_count; - enum allocation_event ev; + int32_t nr_allocations; struct sm_disk *smd = container_of(sm, struct sm_disk, sm); - r = sm_ll_dec(&smd->ll, b, &ev); - if (!r && (ev == SM_FREE)) { - /* - * It's only free if it's also free in the last - * transaction. - */ - r = sm_ll_lookup(&smd->old_ll, b, &old_count); - if (!r && !old_count) - smd->nr_allocated_this_transaction--; - } + r = sm_ll_dec(&smd->ll, b, e, &nr_allocations); + if (!r) + smd->nr_allocated_this_transaction += nr_allocations; return r; } @@ -164,21 +127,28 @@ static int sm_disk_dec_block(struct dm_space_map *sm, dm_block_t b) static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b) { int r; - enum allocation_event ev; + int32_t nr_allocations; struct sm_disk *smd = container_of(sm, struct sm_disk, sm); /* * Any block we allocate has to be free in both the old and current ll. */ r = sm_ll_find_common_free_block(&smd->old_ll, &smd->ll, smd->begin, smd->ll.nr_blocks, b); + if (r == -ENOSPC) { + /* + * There's no free block between smd->begin and the end of the metadata device. + * We search before smd->begin in case something has been freed. + */ + r = sm_ll_find_common_free_block(&smd->old_ll, &smd->ll, 0, smd->begin, b); + } + if (r) return r; smd->begin = *b + 1; - r = sm_ll_inc(&smd->ll, *b, &ev); + r = sm_ll_inc(&smd->ll, *b, *b + 1, &nr_allocations); if (!r) { - BUG_ON(ev != SM_ALLOC); - smd->nr_allocated_this_transaction++; + smd->nr_allocated_this_transaction += nr_allocations; } return r; @@ -194,7 +164,6 @@ static int sm_disk_commit(struct dm_space_map *sm) return r; memcpy(&smd->old_ll, &smd->ll, sizeof(smd->old_ll)); - smd->begin = 0; smd->nr_allocated_this_transaction = 0; return 0; @@ -235,8 +204,8 @@ static struct dm_space_map ops = { .get_count = sm_disk_get_count, .count_is_more_than_one = sm_disk_count_is_more_than_one, .set_count = sm_disk_set_count, - .inc_block = sm_disk_inc_block, - .dec_block = sm_disk_dec_block, + .inc_blocks = sm_disk_inc_blocks, + .dec_blocks = sm_disk_dec_blocks, .new_block = sm_disk_new_block, .commit = sm_disk_commit, .root_size = sm_disk_root_size, diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c index 9e3c64ec2026..392ae26134a4 100644 --- a/drivers/md/persistent-data/dm-space-map-metadata.c +++ b/drivers/md/persistent-data/dm-space-map-metadata.c @@ -89,7 +89,8 @@ enum block_op_type { struct block_op { enum block_op_type type; - dm_block_t block; + dm_block_t b; + dm_block_t e; }; struct bop_ring_buffer { @@ -116,7 +117,7 @@ static unsigned brb_next(struct bop_ring_buffer *brb, unsigned old) } static int brb_push(struct bop_ring_buffer *brb, - enum block_op_type type, dm_block_t b) + enum block_op_type type, dm_block_t b, dm_block_t e) { struct block_op *bop; unsigned next = brb_next(brb, brb->end); @@ -130,7 +131,8 @@ static int brb_push(struct bop_ring_buffer *brb, bop = brb->bops + brb->end; bop->type = type; - bop->block = b; + bop->b = b; + bop->e = e; brb->end = next; @@ -145,9 +147,7 @@ static int brb_peek(struct bop_ring_buffer *brb, struct block_op *result) return -ENODATA; bop = brb->bops + brb->begin; - result->type = bop->type; - result->block = bop->block; - + memcpy(result, bop, sizeof(*result)); return 0; } @@ -178,10 +178,9 @@ struct sm_metadata { struct threshold threshold; }; -static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b) +static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b, dm_block_t e) { - int r = brb_push(&smm->uncommitted, type, b); - + int r = brb_push(&smm->uncommitted, type, b, e); if (r) { DMERR("too many recursive allocations"); return -ENOMEM; @@ -193,15 +192,15 @@ static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t static int commit_bop(struct sm_metadata *smm, struct block_op *op) { int r = 0; - enum allocation_event ev; + int32_t nr_allocations; switch (op->type) { case BOP_INC: - r = sm_ll_inc(&smm->ll, op->block, &ev); + r = sm_ll_inc(&smm->ll, op->b, op->e, &nr_allocations); break; case BOP_DEC: - r = sm_ll_dec(&smm->ll, op->block, &ev); + r = sm_ll_dec(&smm->ll, op->b, op->e, &nr_allocations); break; } @@ -314,7 +313,7 @@ static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b, i = brb_next(&smm->uncommitted, i)) { struct block_op *op = smm->uncommitted.bops + i; - if (op->block != b) + if (b < op->b || b >= op->e) continue; switch (op->type) { @@ -355,7 +354,7 @@ static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm, struct block_op *op = smm->uncommitted.bops + i; - if (op->block != b) + if (b < op->b || b >= op->e) continue; switch (op->type) { @@ -393,7 +392,7 @@ static int sm_metadata_set_count(struct dm_space_map *sm, dm_block_t b, uint32_t count) { int r, r2; - enum allocation_event ev; + int32_t nr_allocations; struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); if (smm->recursion_count) { @@ -402,40 +401,42 @@ static int sm_metadata_set_count(struct dm_space_map *sm, dm_block_t b, } in(smm); - r = sm_ll_insert(&smm->ll, b, count, &ev); + r = sm_ll_insert(&smm->ll, b, count, &nr_allocations); r2 = out(smm); return combine_errors(r, r2); } -static int sm_metadata_inc_block(struct dm_space_map *sm, dm_block_t b) +static int sm_metadata_inc_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e) { int r, r2 = 0; - enum allocation_event ev; + int32_t nr_allocations; struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); - if (recursing(smm)) - r = add_bop(smm, BOP_INC, b); - else { + if (recursing(smm)) { + r = add_bop(smm, BOP_INC, b, e); + if (r) + return r; + } else { in(smm); - r = sm_ll_inc(&smm->ll, b, &ev); + r = sm_ll_inc(&smm->ll, b, e, &nr_allocations); r2 = out(smm); } return combine_errors(r, r2); } -static int sm_metadata_dec_block(struct dm_space_map *sm, dm_block_t b) +static int sm_metadata_dec_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e) { int r, r2 = 0; - enum allocation_event ev; + int32_t nr_allocations; struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); if (recursing(smm)) - r = add_bop(smm, BOP_DEC, b); + r = add_bop(smm, BOP_DEC, b, e); else { in(smm); - r = sm_ll_dec(&smm->ll, b, &ev); + r = sm_ll_dec(&smm->ll, b, e, &nr_allocations); r2 = out(smm); } @@ -445,23 +446,31 @@ static int sm_metadata_dec_block(struct dm_space_map *sm, dm_block_t b) static int sm_metadata_new_block_(struct dm_space_map *sm, dm_block_t *b) { int r, r2 = 0; - enum allocation_event ev; + int32_t nr_allocations; struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); /* * Any block we allocate has to be free in both the old and current ll. */ r = sm_ll_find_common_free_block(&smm->old_ll, &smm->ll, smm->begin, smm->ll.nr_blocks, b); + if (r == -ENOSPC) { + /* + * There's no free block between smm->begin and the end of the metadata device. + * We search before smm->begin in case something has been freed. + */ + r = sm_ll_find_common_free_block(&smm->old_ll, &smm->ll, 0, smm->begin, b); + } + if (r) return r; smm->begin = *b + 1; if (recursing(smm)) - r = add_bop(smm, BOP_INC, *b); + r = add_bop(smm, BOP_INC, *b, *b + 1); else { in(smm); - r = sm_ll_inc(&smm->ll, *b, &ev); + r = sm_ll_inc(&smm->ll, *b, *b + 1, &nr_allocations); r2 = out(smm); } @@ -503,7 +512,6 @@ static int sm_metadata_commit(struct dm_space_map *sm) return r; memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll)); - smm->begin = 0; smm->allocated_this_transaction = 0; return 0; @@ -556,8 +564,8 @@ static const struct dm_space_map ops = { .get_count = sm_metadata_get_count, .count_is_more_than_one = sm_metadata_count_is_more_than_one, .set_count = sm_metadata_set_count, - .inc_block = sm_metadata_inc_block, - .dec_block = sm_metadata_dec_block, + .inc_blocks = sm_metadata_inc_blocks, + .dec_blocks = sm_metadata_dec_blocks, .new_block = sm_metadata_new_block, .commit = sm_metadata_commit, .root_size = sm_metadata_root_size, @@ -641,18 +649,28 @@ static int sm_bootstrap_new_block(struct dm_space_map *sm, dm_block_t *b) return 0; } -static int sm_bootstrap_inc_block(struct dm_space_map *sm, dm_block_t b) +static int sm_bootstrap_inc_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e) { + int r; struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); - return add_bop(smm, BOP_INC, b); + r = add_bop(smm, BOP_INC, b, e); + if (r) + return r; + + return 0; } -static int sm_bootstrap_dec_block(struct dm_space_map *sm, dm_block_t b) +static int sm_bootstrap_dec_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e) { + int r; struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); - return add_bop(smm, BOP_DEC, b); + r = add_bop(smm, BOP_DEC, b, e); + if (r) + return r; + + return 0; } static int sm_bootstrap_commit(struct dm_space_map *sm) @@ -683,8 +701,8 @@ static const struct dm_space_map bootstrap_ops = { .get_count = sm_bootstrap_get_count, .count_is_more_than_one = sm_bootstrap_count_is_more_than_one, .set_count = sm_bootstrap_set_count, - .inc_block = sm_bootstrap_inc_block, - .dec_block = sm_bootstrap_dec_block, + .inc_blocks = sm_bootstrap_inc_blocks, + .dec_blocks = sm_bootstrap_dec_blocks, .new_block = sm_bootstrap_new_block, .commit = sm_bootstrap_commit, .root_size = sm_bootstrap_root_size, @@ -696,7 +714,7 @@ static const struct dm_space_map bootstrap_ops = { static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks) { - int r, i; + int r; struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); dm_block_t old_len = smm->ll.nr_blocks; @@ -718,9 +736,7 @@ static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks) * allocate any new blocks. */ do { - for (i = old_len; !r && i < smm->begin; i++) - r = add_bop(smm, BOP_INC, i); - + r = add_bop(smm, BOP_INC, old_len, smm->begin); if (r) goto out; @@ -767,7 +783,6 @@ int dm_sm_metadata_create(struct dm_space_map *sm, dm_block_t superblock) { int r; - dm_block_t i; struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); smm->begin = superblock + 1; @@ -792,9 +807,7 @@ int dm_sm_metadata_create(struct dm_space_map *sm, * Now we need to update the newly created data structures with the * allocated blocks that they were built from. */ - for (i = superblock; !r && i < smm->begin; i++) - r = add_bop(smm, BOP_INC, i); - + r = add_bop(smm, BOP_INC, superblock, smm->begin); if (r) return r; diff --git a/drivers/md/persistent-data/dm-space-map.h b/drivers/md/persistent-data/dm-space-map.h index 3e6d1153b7c4..a015cd11f6e9 100644 --- a/drivers/md/persistent-data/dm-space-map.h +++ b/drivers/md/persistent-data/dm-space-map.h @@ -46,8 +46,8 @@ struct dm_space_map { int (*commit)(struct dm_space_map *sm); - int (*inc_block)(struct dm_space_map *sm, dm_block_t b); - int (*dec_block)(struct dm_space_map *sm, dm_block_t b); + int (*inc_blocks)(struct dm_space_map *sm, dm_block_t b, dm_block_t e); + int (*dec_blocks)(struct dm_space_map *sm, dm_block_t b, dm_block_t e); /* * new_block will increment the returned block. @@ -117,14 +117,24 @@ static inline int dm_sm_commit(struct dm_space_map *sm) return sm->commit(sm); } +static inline int dm_sm_inc_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e) +{ + return sm->inc_blocks(sm, b, e); +} + static inline int dm_sm_inc_block(struct dm_space_map *sm, dm_block_t b) { - return sm->inc_block(sm, b); + return dm_sm_inc_blocks(sm, b, b + 1); +} + +static inline int dm_sm_dec_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e) +{ + return sm->dec_blocks(sm, b, e); } static inline int dm_sm_dec_block(struct dm_space_map *sm, dm_block_t b) { - return sm->dec_block(sm, b); + return dm_sm_dec_blocks(sm, b, b + 1); } static inline int dm_sm_new_block(struct dm_space_map *sm, dm_block_t *b) diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c index abe2c5dd0993..16643fc974e8 100644 --- a/drivers/md/persistent-data/dm-transaction-manager.c +++ b/drivers/md/persistent-data/dm-transaction-manager.c @@ -359,6 +359,17 @@ void dm_tm_inc(struct dm_transaction_manager *tm, dm_block_t b) } EXPORT_SYMBOL_GPL(dm_tm_inc); +void dm_tm_inc_range(struct dm_transaction_manager *tm, dm_block_t b, dm_block_t e) +{ + /* + * The non-blocking clone doesn't support this. + */ + BUG_ON(tm->is_clone); + + dm_sm_inc_blocks(tm->sm, b, e); +} +EXPORT_SYMBOL_GPL(dm_tm_inc_range); + void dm_tm_dec(struct dm_transaction_manager *tm, dm_block_t b) { /* @@ -370,6 +381,47 @@ void dm_tm_dec(struct dm_transaction_manager *tm, dm_block_t b) } EXPORT_SYMBOL_GPL(dm_tm_dec); +void dm_tm_dec_range(struct dm_transaction_manager *tm, dm_block_t b, dm_block_t e) +{ + /* + * The non-blocking clone doesn't support this. + */ + BUG_ON(tm->is_clone); + + dm_sm_dec_blocks(tm->sm, b, e); +} +EXPORT_SYMBOL_GPL(dm_tm_dec_range); + +void dm_tm_with_runs(struct dm_transaction_manager *tm, + const __le64 *value_le, unsigned count, dm_tm_run_fn fn) +{ + uint64_t b, begin, end; + bool in_run = false; + unsigned i; + + for (i = 0; i < count; i++, value_le++) { + b = le64_to_cpu(*value_le); + + if (in_run) { + if (b == end) + end++; + else { + fn(tm, begin, end); + begin = b; + end = b + 1; + } + } else { + in_run = true; + begin = b; + end = b + 1; + } + } + + if (in_run) + fn(tm, begin, end); +} +EXPORT_SYMBOL_GPL(dm_tm_with_runs); + int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b, uint32_t *result) { @@ -379,6 +431,15 @@ int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b, return dm_sm_get_count(tm->sm, b, result); } +int dm_tm_block_is_shared(struct dm_transaction_manager *tm, dm_block_t b, + int *result) +{ + if (tm->is_clone) + return -EWOULDBLOCK; + + return dm_sm_count_is_more_than_one(tm->sm, b, result); +} + struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm) { return tm->bm; diff --git a/drivers/md/persistent-data/dm-transaction-manager.h b/drivers/md/persistent-data/dm-transaction-manager.h index f3a18be68f30..906c02ed0365 100644 --- a/drivers/md/persistent-data/dm-transaction-manager.h +++ b/drivers/md/persistent-data/dm-transaction-manager.h @@ -100,11 +100,27 @@ void dm_tm_unlock(struct dm_transaction_manager *tm, struct dm_block *b); * Functions for altering the reference count of a block directly. */ void dm_tm_inc(struct dm_transaction_manager *tm, dm_block_t b); - +void dm_tm_inc_range(struct dm_transaction_manager *tm, dm_block_t b, dm_block_t e); void dm_tm_dec(struct dm_transaction_manager *tm, dm_block_t b); +void dm_tm_dec_range(struct dm_transaction_manager *tm, dm_block_t b, dm_block_t e); + +/* + * Builds up runs of adjacent blocks, and then calls the given fn + * (typically dm_tm_inc/dec). Very useful when you have to perform + * the same tm operation on all values in a btree leaf. + */ +typedef void (*dm_tm_run_fn)(struct dm_transaction_manager *, dm_block_t, dm_block_t); +void dm_tm_with_runs(struct dm_transaction_manager *tm, + const __le64 *value_le, unsigned count, dm_tm_run_fn fn); -int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b, - uint32_t *result); +int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b, uint32_t *result); + +/* + * Finds out if a given block is shared (ie. has a reference count higher + * than one). + */ +int dm_tm_block_is_shared(struct dm_transaction_manager *tm, dm_block_t b, + int *result); struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index fd3860d18d7e..b05f2fd495fc 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -300,6 +300,7 @@ enum { BIO_CGROUP_ACCT, /* has been accounted to a cgroup */ BIO_TRACKED, /* set if bio goes through the rq_qos path */ BIO_REMAPPED, + BIO_ZONE_WRITE_LOCKED, /* Owns a zoned device zone write lock */ BIO_FLAG_LAST }; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d66d0da72529..103acc5228e7 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1012,6 +1012,18 @@ static inline unsigned int blk_rq_stats_sectors(const struct request *rq) /* Helper to convert BLK_ZONE_ZONE_XXX to its string format XXX */ const char *blk_zone_cond_str(enum blk_zone_cond zone_cond); +static inline unsigned int bio_zone_no(struct bio *bio) +{ + return blk_queue_zone_no(bdev_get_queue(bio->bi_bdev), + bio->bi_iter.bi_sector); +} + +static inline unsigned int bio_zone_is_seq(struct bio *bio) +{ + return blk_queue_zone_is_seq(bdev_get_queue(bio->bi_bdev), + bio->bi_iter.bi_sector); +} + static inline unsigned int blk_rq_zone_no(struct request *rq) { return blk_queue_zone_no(rq->q, blk_rq_pos(rq)); diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index ff700fb6ce1d..7457d49acf9a 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -361,6 +361,12 @@ struct dm_target { * Set if we need to limit the number of in-flight bios when swapping. */ bool limit_swap_bios:1; + + /* + * Set if this target implements a a zoned device and needs emulation of + * zone append operations using regular writes. + */ + bool emulate_zone_append:1; }; void *dm_per_bio_data(struct bio *bio, size_t data_size); @@ -478,7 +484,8 @@ struct dm_report_zones_args { /* must be filled by ->report_zones before calling dm_report_zones_cb */ sector_t start; }; -int dm_report_zones_cb(struct blk_zone *zone, unsigned int idx, void *data); +int dm_report_zones(struct block_device *bdev, sector_t start, sector_t sector, + struct dm_report_zones_args *args, unsigned int nr_zones); #endif /* CONFIG_BLK_DEV_ZONED */ /* diff --git a/include/linux/dm-kcopyd.h b/include/linux/dm-kcopyd.h index e42de7750c88..c1707ee5b540 100644 --- a/include/linux/dm-kcopyd.h +++ b/include/linux/dm-kcopyd.h @@ -51,6 +51,7 @@ MODULE_PARM_DESC(name, description) struct dm_kcopyd_client; struct dm_kcopyd_client *dm_kcopyd_client_create(struct dm_kcopyd_throttle *throttle); void dm_kcopyd_client_destroy(struct dm_kcopyd_client *kc); +void dm_kcopyd_client_flush(struct dm_kcopyd_client *kc); /* * Submit a copy job to kcopyd. This is built on top of the |