diff options
Diffstat (limited to 'drivers/md')
36 files changed, 505 insertions, 255 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 0b1870a09e1f..ddb37f6670de 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -139,7 +139,7 @@ config MD_RAID456 tristate "RAID-4/RAID-5/RAID-6 mode" depends on BLK_DEV_MD select RAID6_PQ - select LIBCRC32C + select CRC32 select ASYNC_MEMCPY select ASYNC_XOR select ASYNC_PQ @@ -267,6 +267,7 @@ config DM_CRYPT depends on BLK_DEV_DM depends on (ENCRYPTED_KEYS || ENCRYPTED_KEYS=n) depends on (TRUSTED_KEYS || TRUSTED_KEYS=n) + select CRC32 select CRYPTO select CRYPTO_CBC select CRYPTO_ESSIV diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c index 68b02216033d..d39dec34b7a3 100644 --- a/drivers/md/bcache/stats.c +++ b/drivers/md/bcache/stats.c @@ -123,7 +123,7 @@ void bch_cache_accounting_destroy(struct cache_accounting *acc) kobject_put(&acc->day.kobj); atomic_set(&acc->closing, 1); - if (del_timer_sync(&acc->timer)) + if (timer_delete_sync(&acc->timer)) closure_return(&acc->cl); } diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index aab8240429b0..9c8ed65cd87e 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -2234,7 +2234,7 @@ int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t c } EXPORT_SYMBOL_GPL(dm_bufio_issue_discard); -static bool forget_buffer(struct dm_bufio_client *c, sector_t block) +static void forget_buffer(struct dm_bufio_client *c, sector_t block) { struct dm_buffer *b; @@ -2249,8 +2249,6 @@ static bool forget_buffer(struct dm_bufio_client *c, sector_t block) cache_put_and_wake(c, b); } } - - return b ? true : false; } /* diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 9cb797a561d6..a10d75a562db 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -406,6 +406,12 @@ struct cache { mempool_t migration_pool; struct bio_set bs; + + /* + * Cache_size entries. Set bits indicate blocks mapped beyond the + * target length, which are marked for invalidation. + */ + unsigned long *invalid_bitset; }; struct per_bio_data { @@ -1922,6 +1928,9 @@ static void __destroy(struct cache *cache) if (cache->discard_bitset) free_bitset(cache->discard_bitset); + if (cache->invalid_bitset) + free_bitset(cache->invalid_bitset); + if (cache->copier) dm_kcopyd_client_destroy(cache->copier); @@ -2510,6 +2519,13 @@ static int cache_create(struct cache_args *ca, struct cache **result) } clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); + cache->invalid_bitset = alloc_bitset(from_cblock(cache->cache_size)); + if (!cache->invalid_bitset) { + *error = "could not allocate bitset for invalid blocks"; + goto bad; + } + clear_bitset(cache->invalid_bitset, from_cblock(cache->cache_size)); + cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle); if (IS_ERR(cache->copier)) { *error = "could not create kcopyd client"; @@ -2808,6 +2824,24 @@ static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, return policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid); } +static int load_filtered_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, + bool dirty, uint32_t hint, bool hint_valid) +{ + struct cache *cache = context; + + if (from_oblock(oblock) >= from_oblock(cache->origin_blocks)) { + if (dirty) { + DMERR("%s: unable to shrink origin; cache block %u is dirty", + cache_device_name(cache), from_cblock(cblock)); + return -EFBIG; + } + set_bit(from_cblock(cblock), cache->invalid_bitset); + return 0; + } + + return load_mapping(context, oblock, cblock, dirty, hint, hint_valid); +} + /* * The discard block size in the on disk metadata is not * necessarily the same as we're currently using. So we have to @@ -2899,6 +2933,27 @@ static dm_cblock_t get_cache_dev_size(struct cache *cache) return to_cblock(size); } +static bool can_resume(struct cache *cache) +{ + /* + * Disallow retrying the resume operation for devices that failed the + * first resume attempt, as the failure leaves the policy object partially + * initialized. Retrying could trigger BUG_ON when loading cache mappings + * into the incomplete policy object. + */ + if (cache->sized && !cache->loaded_mappings) { + if (get_cache_mode(cache) != CM_WRITE) + DMERR("%s: unable to resume a failed-loaded cache, please check metadata.", + cache_device_name(cache)); + else + DMERR("%s: unable to resume cache due to missing proper cache table reload", + cache_device_name(cache)); + return false; + } + + return true; +} + static bool can_resize(struct cache *cache, dm_cblock_t new_size) { if (from_cblock(new_size) > from_cblock(cache->cache_size)) { @@ -2941,12 +2996,33 @@ static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size) return 0; } +static int truncate_oblocks(struct cache *cache) +{ + uint32_t nr_blocks = from_cblock(cache->cache_size); + uint32_t i; + int r; + + for_each_set_bit(i, cache->invalid_bitset, nr_blocks) { + r = dm_cache_remove_mapping(cache->cmd, to_cblock(i)); + if (r) { + DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata", + cache_device_name(cache)); + return r; + } + } + + return 0; +} + static int cache_preresume(struct dm_target *ti) { int r = 0; struct cache *cache = ti->private; dm_cblock_t csize = get_cache_dev_size(cache); + if (!can_resume(cache)) + return -EINVAL; + /* * Check to see if the cache has resized. */ @@ -2962,11 +3038,25 @@ static int cache_preresume(struct dm_target *ti) } if (!cache->loaded_mappings) { + /* + * The fast device could have been resized since the last + * failed preresume attempt. To be safe we start by a blank + * bitset for cache blocks. + */ + clear_bitset(cache->invalid_bitset, from_cblock(cache->cache_size)); + r = dm_cache_load_mappings(cache->cmd, cache->policy, - load_mapping, cache); + load_filtered_mapping, cache); if (r) { DMERR("%s: could not load cache mappings", cache_device_name(cache)); - metadata_operation_failed(cache, "dm_cache_load_mappings", r); + if (r != -EFBIG) + metadata_operation_failed(cache, "dm_cache_load_mappings", r); + return r; + } + + r = truncate_oblocks(cache); + if (r) { + metadata_operation_failed(cache, "dm_cache_remove_mapping", r); return r; } @@ -3426,7 +3516,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type cache_target = { .name = "cache", - .version = {2, 2, 0}, + .version = {2, 3, 0}, .module = THIS_MODULE, .ctr = cache_ctr, .dtr = cache_dtr, diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 02a2919f4e5a..9dfdb63220d7 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -17,6 +17,7 @@ #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/blk-integrity.h> +#include <linux/crc32.h> #include <linux/mempool.h> #include <linux/slab.h> #include <linux/crypto.h> @@ -125,7 +126,6 @@ struct iv_lmk_private { #define TCW_WHITENING_SIZE 16 struct iv_tcw_private { - struct crypto_shash *crc32_tfm; u8 *iv_seed; u8 *whitening; }; @@ -607,10 +607,6 @@ static void crypt_iv_tcw_dtr(struct crypt_config *cc) tcw->iv_seed = NULL; kfree_sensitive(tcw->whitening); tcw->whitening = NULL; - - if (tcw->crc32_tfm && !IS_ERR(tcw->crc32_tfm)) - crypto_free_shash(tcw->crc32_tfm); - tcw->crc32_tfm = NULL; } static int crypt_iv_tcw_ctr(struct crypt_config *cc, struct dm_target *ti, @@ -628,13 +624,6 @@ static int crypt_iv_tcw_ctr(struct crypt_config *cc, struct dm_target *ti, return -EINVAL; } - tcw->crc32_tfm = crypto_alloc_shash("crc32", 0, - CRYPTO_ALG_ALLOCATES_MEMORY); - if (IS_ERR(tcw->crc32_tfm)) { - ti->error = "Error initializing CRC32 in TCW"; - return PTR_ERR(tcw->crc32_tfm); - } - tcw->iv_seed = kzalloc(cc->iv_size, GFP_KERNEL); tcw->whitening = kzalloc(TCW_WHITENING_SIZE, GFP_KERNEL); if (!tcw->iv_seed || !tcw->whitening) { @@ -668,36 +657,28 @@ static int crypt_iv_tcw_wipe(struct crypt_config *cc) return 0; } -static int crypt_iv_tcw_whitening(struct crypt_config *cc, - struct dm_crypt_request *dmreq, - u8 *data) +static void crypt_iv_tcw_whitening(struct crypt_config *cc, + struct dm_crypt_request *dmreq, u8 *data) { struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; __le64 sector = cpu_to_le64(dmreq->iv_sector); u8 buf[TCW_WHITENING_SIZE]; - SHASH_DESC_ON_STACK(desc, tcw->crc32_tfm); - int i, r; + int i; /* xor whitening with sector number */ crypto_xor_cpy(buf, tcw->whitening, (u8 *)§or, 8); crypto_xor_cpy(&buf[8], tcw->whitening + 8, (u8 *)§or, 8); /* calculate crc32 for every 32bit part and xor it */ - desc->tfm = tcw->crc32_tfm; - for (i = 0; i < 4; i++) { - r = crypto_shash_digest(desc, &buf[i * 4], 4, &buf[i * 4]); - if (r) - goto out; - } + for (i = 0; i < 4; i++) + put_unaligned_le32(crc32(0, &buf[i * 4], 4), &buf[i * 4]); crypto_xor(&buf[0], &buf[12], 4); crypto_xor(&buf[4], &buf[8], 4); /* apply whitening (8 bytes) to whole sector */ for (i = 0; i < ((1 << SECTOR_SHIFT) / 8); i++) crypto_xor(data + i * 8, buf, 8); -out: memzero_explicit(buf, sizeof(buf)); - return r; } static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv, @@ -707,13 +688,12 @@ static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv, struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; __le64 sector = cpu_to_le64(dmreq->iv_sector); u8 *src; - int r = 0; /* Remove whitening from ciphertext */ if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) { sg = crypt_get_sg_data(cc, dmreq->sg_in); src = kmap_local_page(sg_page(sg)); - r = crypt_iv_tcw_whitening(cc, dmreq, src + sg->offset); + crypt_iv_tcw_whitening(cc, dmreq, src + sg->offset); kunmap_local(src); } @@ -723,7 +703,7 @@ static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv, crypto_xor_cpy(&iv[8], tcw->iv_seed + 8, (u8 *)§or, cc->iv_size - 8); - return r; + return 0; } static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv, @@ -731,7 +711,6 @@ static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv, { struct scatterlist *sg; u8 *dst; - int r; if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) return 0; @@ -739,10 +718,10 @@ static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv, /* Apply whitening on ciphertext */ sg = crypt_get_sg_data(cc, dmreq->sg_out); dst = kmap_local_page(sg_page(sg)); - r = crypt_iv_tcw_whitening(cc, dmreq, dst + sg->offset); + crypt_iv_tcw_whitening(cc, dmreq, dst + sg->offset); kunmap_local(dst); - return r; + return 0; } static int crypt_iv_random_gen(struct crypt_config *cc, u8 *iv, diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 08f6387620c1..d4cf0ac2a7aa 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -369,6 +369,21 @@ static int delay_map(struct dm_target *ti, struct bio *bio) return delay_bio(dc, c, bio); } +#ifdef CONFIG_BLK_DEV_ZONED +static int delay_report_zones(struct dm_target *ti, + struct dm_report_zones_args *args, unsigned int nr_zones) +{ + struct delay_c *dc = ti->private; + struct delay_class *c = &dc->read; + + return dm_report_zones(c->dev->bdev, c->start, + c->start + dm_target_offset(ti, args->next_sector), + args, nr_zones); +} +#else +#define delay_report_zones NULL +#endif + #define DMEMIT_DELAY_CLASS(c) \ DMEMIT("%s %llu %u", (c)->dev->name, (unsigned long long)(c)->start, (c)->delay) @@ -424,11 +439,12 @@ out: static struct target_type delay_target = { .name = "delay", .version = {1, 4, 0}, - .features = DM_TARGET_PASSES_INTEGRITY, + .features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_ZONED_HM, .module = THIS_MODULE, .ctr = delay_ctr, .dtr = delay_dtr, .map = delay_map, + .report_zones = delay_report_zones, .presuspend = delay_presuspend, .resume = delay_resume, .status = delay_status, diff --git a/drivers/md/dm-ebs-target.c b/drivers/md/dm-ebs-target.c index 18ae45dcbfb2..b19b0142a690 100644 --- a/drivers/md/dm-ebs-target.c +++ b/drivers/md/dm-ebs-target.c @@ -390,6 +390,12 @@ static int ebs_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_REMAPPED; } +static void ebs_postsuspend(struct dm_target *ti) +{ + struct ebs_c *ec = ti->private; + dm_bufio_client_reset(ec->bufio); +} + static void ebs_status(struct dm_target *ti, status_type_t type, unsigned int status_flags, char *result, unsigned int maxlen) { @@ -447,6 +453,7 @@ static struct target_type ebs_target = { .ctr = ebs_ctr, .dtr = ebs_dtr, .map = ebs_map, + .postsuspend = ebs_postsuspend, .status = ebs_status, .io_hints = ebs_io_hints, .prepare_ioctl = ebs_prepare_ioctl, diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index c8c1a00e7d80..2a283feb3319 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -21,6 +21,7 @@ #include <linux/reboot.h> #include <crypto/hash.h> #include <crypto/skcipher.h> +#include <crypto/utils.h> #include <linux/async_tx.h> #include <linux/dm-bufio.h> @@ -516,7 +517,7 @@ static int sb_mac(struct dm_integrity_c *ic, bool wr) dm_integrity_io_error(ic, "crypto_shash_digest", r); return r; } - if (memcmp(mac, actual_mac, mac_size)) { + if (crypto_memneq(mac, actual_mac, mac_size)) { dm_integrity_io_error(ic, "superblock mac", -EILSEQ); dm_audit_log_target(DM_MSG_PREFIX, "mac-superblock", ic->ti, 0); return -EILSEQ; @@ -859,7 +860,7 @@ static void rw_section_mac(struct dm_integrity_c *ic, unsigned int section, bool if (likely(wr)) memcpy(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR); else { - if (memcmp(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR)) { + if (crypto_memneq(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR)) { dm_integrity_io_error(ic, "journal mac", -EILSEQ); dm_audit_log_target(DM_MSG_PREFIX, "mac-journal", ic->ti, 0); } @@ -1401,10 +1402,9 @@ static bool find_newer_committed_node(struct dm_integrity_c *ic, struct journal_ static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, sector_t *metadata_block, unsigned int *metadata_offset, unsigned int total_size, int op) { -#define MAY_BE_FILLER 1 -#define MAY_BE_HASH 2 unsigned int hash_offset = 0; - unsigned int may_be = MAY_BE_HASH | (ic->discard ? MAY_BE_FILLER : 0); + unsigned char mismatch_hash = 0; + unsigned char mismatch_filler = !ic->discard; do { unsigned char *data, *dp; @@ -1425,7 +1425,7 @@ static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, se if (op == TAG_READ) { memcpy(tag, dp, to_copy); } else if (op == TAG_WRITE) { - if (memcmp(dp, tag, to_copy)) { + if (crypto_memneq(dp, tag, to_copy)) { memcpy(dp, tag, to_copy); dm_bufio_mark_partial_buffer_dirty(b, *metadata_offset, *metadata_offset + to_copy); } @@ -1433,29 +1433,30 @@ static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, se /* e.g.: op == TAG_CMP */ if (likely(is_power_of_2(ic->tag_size))) { - if (unlikely(memcmp(dp, tag, to_copy))) - if (unlikely(!ic->discard) || - unlikely(memchr_inv(dp, DISCARD_FILLER, to_copy) != NULL)) { - goto thorough_test; - } + if (unlikely(crypto_memneq(dp, tag, to_copy))) + goto thorough_test; } else { unsigned int i, ts; thorough_test: ts = total_size; for (i = 0; i < to_copy; i++, ts--) { - if (unlikely(dp[i] != tag[i])) - may_be &= ~MAY_BE_HASH; - if (likely(dp[i] != DISCARD_FILLER)) - may_be &= ~MAY_BE_FILLER; + /* + * Warning: the control flow must not be + * dependent on match/mismatch of + * individual bytes. + */ + mismatch_hash |= dp[i] ^ tag[i]; + mismatch_filler |= dp[i] ^ DISCARD_FILLER; hash_offset++; if (unlikely(hash_offset == ic->tag_size)) { - if (unlikely(!may_be)) { + if (unlikely(mismatch_hash) && unlikely(mismatch_filler)) { dm_bufio_release(b); return ts; } hash_offset = 0; - may_be = MAY_BE_HASH | (ic->discard ? MAY_BE_FILLER : 0); + mismatch_hash = 0; + mismatch_filler = !ic->discard; } } } @@ -1476,8 +1477,6 @@ thorough_test: } while (unlikely(total_size)); return 0; -#undef MAY_BE_FILLER -#undef MAY_BE_HASH } struct flush_request { @@ -2076,7 +2075,7 @@ retry_kmap: char checksums_onstack[MAX_T(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; integrity_sector_checksum(ic, logical_sector, mem + bv.bv_offset, checksums_onstack); - if (unlikely(memcmp(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) { + if (unlikely(crypto_memneq(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) { DMERR_LIMIT("Checksum failed when reading from journal, at sector 0x%llx", logical_sector); dm_audit_log_bio(DM_MSG_PREFIX, "journal-checksum", @@ -2595,7 +2594,7 @@ static void dm_integrity_inline_recheck(struct work_struct *w) bio_put(outgoing_bio); integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, outgoing_data, digest); - if (unlikely(memcmp(digest, dio->integrity_payload, min(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)))) { + if (unlikely(crypto_memneq(digest, dio->integrity_payload, min(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)))) { DMERR_LIMIT("%pg: Checksum failed at sector 0x%llx", ic->dev->bdev, dio->bio_details.bi_iter.bi_sector); atomic64_inc(&ic->number_of_mismatches); @@ -2634,7 +2633,7 @@ static int dm_integrity_end_io(struct dm_target *ti, struct bio *bio, blk_status char *mem = bvec_kmap_local(&bv); //memset(mem, 0xff, ic->sectors_per_block << SECTOR_SHIFT); integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, mem, digest); - if (unlikely(memcmp(digest, dio->integrity_payload + pos, + if (unlikely(crypto_memneq(digest, dio->integrity_payload + pos, min(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)))) { kunmap_local(mem); dm_integrity_free_payload(dio); @@ -2708,7 +2707,7 @@ static void integrity_commit(struct work_struct *w) unsigned int i, j, n; struct bio *flushes; - del_timer(&ic->autocommit_timer); + timer_delete(&ic->autocommit_timer); if (ic->mode == 'I') return; @@ -2911,7 +2910,7 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned int write_start integrity_sector_checksum(ic, sec + ((l - j) << ic->sb->log2_sectors_per_block), (char *)access_journal_data(ic, i, l), test_tag); - if (unlikely(memcmp(test_tag, journal_entry_tag(ic, je2), ic->tag_size))) { + if (unlikely(crypto_memneq(test_tag, journal_entry_tag(ic, je2), ic->tag_size))) { dm_integrity_io_error(ic, "tag mismatch when replaying journal", -EILSEQ); dm_audit_log_target(DM_MSG_PREFIX, "integrity-replay-journal", ic->ti, 0); } @@ -3607,7 +3606,7 @@ static void dm_integrity_postsuspend(struct dm_target *ti) WARN_ON(unregister_reboot_notifier(&ic->reboot_notifier)); - del_timer_sync(&ic->autocommit_timer); + timer_delete_sync(&ic->autocommit_timer); if (ic->recalc_wq) drain_workqueue(ic->recalc_wq); @@ -5072,16 +5071,19 @@ try_smaller_buffer: ic->recalc_bitmap = dm_integrity_alloc_page_list(n_bitmap_pages); if (!ic->recalc_bitmap) { + ti->error = "Could not allocate memory for bitmap"; r = -ENOMEM; goto bad; } ic->may_write_bitmap = dm_integrity_alloc_page_list(n_bitmap_pages); if (!ic->may_write_bitmap) { + ti->error = "Could not allocate memory for bitmap"; r = -ENOMEM; goto bad; } ic->bbs = kvmalloc_array(ic->n_bitmap_blocks, sizeof(struct bitmap_block_status), GFP_KERNEL); if (!ic->bbs) { + ti->error = "Could not allocate memory for bitmap"; r = -ENOMEM; goto bad; } diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 637977acc3dc..6c98f4ae5ea9 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -815,7 +815,7 @@ static void enable_nopath_timeout(struct multipath *m) static void disable_nopath_timeout(struct multipath *m) { - del_timer_sync(&m->nopath_timer); + timer_delete_sync(&m->nopath_timer); } /* diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 8c6f1f7e6456..9e615b4f1f5e 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -1182,7 +1182,7 @@ static void mirror_dtr(struct dm_target *ti) { struct mirror_set *ms = ti->private; - del_timer_sync(&ms->timer); + timer_delete_sync(&ms->timer); flush_workqueue(ms->kmirrord_wq); flush_work(&ms->trigger_event); dm_kcopyd_client_destroy(ms->kcopyd_client); diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 3786ac67cefe..a1b7535c508a 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -467,7 +467,7 @@ static struct target_type stripe_target = { .name = "striped", .version = {1, 7, 0}, .features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_NOWAIT | - DM_TARGET_ATOMIC_WRITES, + DM_TARGET_ATOMIC_WRITES | DM_TARGET_PASSES_CRYPTO, .module = THIS_MODULE, .ctr = stripe_ctr, .dtr = stripe_dtr, diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 453803f1edf5..35100a435c88 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -697,6 +697,10 @@ int dm_table_add_target(struct dm_table *t, const char *type, DMERR("%s: zero-length target", dm_device_name(t->md)); return -EINVAL; } + if (start + len < start || start + len > LLONG_MAX >> SECTOR_SHIFT) { + DMERR("%s: too large device", dm_device_name(t->md)); + return -EINVAL; + } ti->type = dm_get_target_type(type); if (!ti->type) { diff --git a/drivers/md/dm-vdo/block-map.c b/drivers/md/dm-vdo/block-map.c index 89cb7942ec5c..baf683cabb1b 100644 --- a/drivers/md/dm-vdo/block-map.c +++ b/drivers/md/dm-vdo/block-map.c @@ -451,7 +451,7 @@ static struct page_info * __must_check find_page(struct vdo_page_cache *cache, * select_lru_page() - Determine which page is least recently used. * * Picks the least recently used from among the non-busy entries at the front of each of the lru - * ring. Since whenever we mark a page busy we also put it to the end of the ring it is unlikely + * list. Since whenever we mark a page busy we also put it to the end of the list it is unlikely * that the entries at the front are busy unless the queue is very short, but not impossible. * * Return: A pointer to the info structure for a relevant page, or NULL if no such page can be @@ -1544,7 +1544,7 @@ static void write_page_if_not_dirtied(struct vdo_waiter *waiter, void *context) static void return_to_pool(struct block_map_zone *zone, struct pooled_vio *vio) { - return_vio_to_pool(zone->vio_pool, vio); + return_vio_to_pool(vio); check_for_drain_complete(zone); } @@ -1837,7 +1837,7 @@ static void finish_block_map_page_load(struct vdo_completion *completion) if (!vdo_copy_valid_page(vio->data, nonce, pbn, page)) vdo_format_block_map_page(page, nonce, pbn, false); - return_vio_to_pool(zone->vio_pool, pooled); + return_vio_to_pool(pooled); /* Release our claim to the load and wake any waiters */ release_page_lock(data_vio, "load"); @@ -1851,10 +1851,9 @@ static void handle_io_error(struct vdo_completion *completion) struct vio *vio = as_vio(completion); struct pooled_vio *pooled = container_of(vio, struct pooled_vio, vio); struct data_vio *data_vio = completion->parent; - struct block_map_zone *zone = pooled->context; vio_record_metadata_io_error(vio); - return_vio_to_pool(zone->vio_pool, pooled); + return_vio_to_pool(pooled); abort_load(data_vio, result); } @@ -2499,7 +2498,7 @@ static void finish_cursor(struct cursor *cursor) struct cursors *cursors = cursor->parent; struct vdo_completion *completion = cursors->completion; - return_vio_to_pool(cursors->pool, vdo_forget(cursor->vio)); + return_vio_to_pool(vdo_forget(cursor->vio)); if (--cursors->active_roots > 0) return; @@ -2746,7 +2745,7 @@ static int __must_check initialize_block_map_zone(struct block_map *map, if (result != VDO_SUCCESS) return result; - result = make_vio_pool(vdo, BLOCK_MAP_VIO_POOL_SIZE, + result = make_vio_pool(vdo, BLOCK_MAP_VIO_POOL_SIZE, 1, zone->thread_id, VIO_TYPE_BLOCK_MAP_INTERIOR, VIO_PRIORITY_METADATA, zone, &zone->vio_pool); if (result != VDO_SUCCESS) diff --git a/drivers/md/dm-vdo/constants.h b/drivers/md/dm-vdo/constants.h index a8c4d6e24b38..2a8b03779f87 100644 --- a/drivers/md/dm-vdo/constants.h +++ b/drivers/md/dm-vdo/constants.h @@ -44,9 +44,6 @@ enum { /* The default size of each slab journal, in blocks */ DEFAULT_VDO_SLAB_JOURNAL_SIZE = 224, - /* Unit test minimum */ - MINIMUM_VDO_SLAB_JOURNAL_BLOCKS = 2, - /* * The initial size of lbn_operations and pbn_operations, which is based upon the expected * maximum number of outstanding VIOs. This value was chosen to make it highly unlikely diff --git a/drivers/md/dm-vdo/dedupe.c b/drivers/md/dm-vdo/dedupe.c index 3f3d29af1be4..3c58b941e067 100644 --- a/drivers/md/dm-vdo/dedupe.c +++ b/drivers/md/dm-vdo/dedupe.c @@ -226,7 +226,7 @@ struct hash_lock { * A list containing the data VIOs sharing this lock, all having the same record name and * data block contents, linked by their hash_lock_node fields. */ - struct list_head duplicate_ring; + struct list_head duplicate_vios; /* The number of data_vios sharing this lock instance */ data_vio_count_t reference_count; @@ -343,7 +343,7 @@ static void return_hash_lock_to_pool(struct hash_zone *zone, struct hash_lock *l { memset(lock, 0, sizeof(*lock)); INIT_LIST_HEAD(&lock->pool_node); - INIT_LIST_HEAD(&lock->duplicate_ring); + INIT_LIST_HEAD(&lock->duplicate_vios); vdo_waitq_init(&lock->waiters); list_add_tail(&lock->pool_node, &zone->lock_pool); } @@ -441,7 +441,7 @@ static void set_hash_lock(struct data_vio *data_vio, struct hash_lock *new_lock) VDO_ASSERT_LOG_ONLY(data_vio->hash_zone != NULL, "must have a hash zone when holding a hash lock"); VDO_ASSERT_LOG_ONLY(!list_empty(&data_vio->hash_lock_entry), - "must be on a hash lock ring when holding a hash lock"); + "must be on a hash lock list when holding a hash lock"); VDO_ASSERT_LOG_ONLY(old_lock->reference_count > 0, "hash lock reference must be counted"); @@ -464,10 +464,10 @@ static void set_hash_lock(struct data_vio *data_vio, struct hash_lock *new_lock) if (new_lock != NULL) { /* - * Keep all data_vios sharing the lock on a ring since they can complete in any + * Keep all data_vios sharing the lock on a list since they can complete in any * order and we'll always need a pointer to one to compare data. */ - list_move_tail(&data_vio->hash_lock_entry, &new_lock->duplicate_ring); + list_move_tail(&data_vio->hash_lock_entry, &new_lock->duplicate_vios); new_lock->reference_count += 1; if (new_lock->max_references < new_lock->reference_count) new_lock->max_references = new_lock->reference_count; @@ -1789,10 +1789,10 @@ static bool is_hash_collision(struct hash_lock *lock, struct data_vio *candidate struct hash_zone *zone; bool collides; - if (list_empty(&lock->duplicate_ring)) + if (list_empty(&lock->duplicate_vios)) return false; - lock_holder = list_first_entry(&lock->duplicate_ring, struct data_vio, + lock_holder = list_first_entry(&lock->duplicate_vios, struct data_vio, hash_lock_entry); zone = candidate->hash_zone; collides = !blocks_equal(lock_holder->vio.data, candidate->vio.data); @@ -1815,7 +1815,7 @@ static inline int assert_hash_lock_preconditions(const struct data_vio *data_vio return result; result = VDO_ASSERT(list_empty(&data_vio->hash_lock_entry), - "must not already be a member of a hash lock ring"); + "must not already be a member of a hash lock list"); if (result != VDO_SUCCESS) return result; @@ -1942,8 +1942,8 @@ void vdo_release_hash_lock(struct data_vio *data_vio) "returned hash lock must not be in use with state %s", get_hash_lock_state_name(lock->state)); VDO_ASSERT_LOG_ONLY(list_empty(&lock->pool_node), - "hash lock returned to zone must not be in a pool ring"); - VDO_ASSERT_LOG_ONLY(list_empty(&lock->duplicate_ring), + "hash lock returned to zone must not be in a pool list"); + VDO_ASSERT_LOG_ONLY(list_empty(&lock->duplicate_vios), "hash lock returned to zone must not reference DataVIOs"); return_hash_lock_to_pool(zone, lock); @@ -2261,7 +2261,7 @@ static void check_for_drain_complete(struct hash_zone *zone) if ((atomic_read(&zone->timer_state) == DEDUPE_QUERY_TIMER_IDLE) || change_timer_state(zone, DEDUPE_QUERY_TIMER_RUNNING, DEDUPE_QUERY_TIMER_IDLE)) { - del_timer_sync(&zone->timer); + timer_delete_sync(&zone->timer); } else { /* * There is an in flight time-out, which must get processed before we can continue. diff --git a/drivers/md/dm-vdo/encodings.c b/drivers/md/dm-vdo/encodings.c index 100e92f8f866..b7cc0f41caca 100644 --- a/drivers/md/dm-vdo/encodings.c +++ b/drivers/md/dm-vdo/encodings.c @@ -711,24 +711,11 @@ int vdo_configure_slab(block_count_t slab_size, block_count_t slab_journal_block ref_blocks = vdo_get_saved_reference_count_size(slab_size - slab_journal_blocks); meta_blocks = (ref_blocks + slab_journal_blocks); - /* Make sure test code hasn't configured slabs to be too small. */ + /* Make sure configured slabs are not too small. */ if (meta_blocks >= slab_size) return VDO_BAD_CONFIGURATION; - /* - * If the slab size is very small, assume this must be a unit test and override the number - * of data blocks to be a power of two (wasting blocks in the slab). Many tests need their - * data_blocks fields to be the exact capacity of the configured volume, and that used to - * fall out since they use a power of two for the number of data blocks, the slab size was - * a power of two, and every block in a slab was a data block. - * - * TODO: Try to figure out some way of structuring testParameters and unit tests so this - * hack isn't needed without having to edit several unit tests every time the metadata size - * changes by one block. - */ data_blocks = slab_size - meta_blocks; - if ((slab_size < 1024) && !is_power_of_2(data_blocks)) - data_blocks = ((block_count_t) 1 << ilog2(data_blocks)); /* * Configure the slab journal thresholds. The flush threshold is 168 of 224 blocks in @@ -1221,11 +1208,6 @@ int vdo_validate_config(const struct vdo_config *config, if (result != VDO_SUCCESS) return result; - result = VDO_ASSERT(config->slab_journal_blocks >= MINIMUM_VDO_SLAB_JOURNAL_BLOCKS, - "slab journal size meets minimum size"); - if (result != VDO_SUCCESS) - return result; - result = VDO_ASSERT(config->slab_journal_blocks <= config->slab_size, "slab journal size is within expected bound"); if (result != VDO_SUCCESS) diff --git a/drivers/md/dm-vdo/indexer/index-layout.c b/drivers/md/dm-vdo/indexer/index-layout.c index af8fab83b0f3..61edf2b72427 100644 --- a/drivers/md/dm-vdo/indexer/index-layout.c +++ b/drivers/md/dm-vdo/indexer/index-layout.c @@ -54,7 +54,6 @@ * Each save also has a unique nonce. */ -#define MAGIC_SIZE 32 #define NONCE_INFO_SIZE 32 #define MAX_SAVES 2 @@ -98,9 +97,11 @@ enum region_type { #define SUPER_VERSION_CURRENT 3 #define SUPER_VERSION_MAXIMUM 7 -static const u8 LAYOUT_MAGIC[MAGIC_SIZE] = "*ALBIREO*SINGLE*FILE*LAYOUT*001*"; +static const u8 LAYOUT_MAGIC[] = "*ALBIREO*SINGLE*FILE*LAYOUT*001*"; static const u64 REGION_MAGIC = 0x416c6252676e3031; /* 'AlbRgn01' */ +#define MAGIC_SIZE (sizeof(LAYOUT_MAGIC) - 1) + struct region_header { u64 magic; u64 region_blocks; diff --git a/drivers/md/dm-vdo/indexer/index-session.c b/drivers/md/dm-vdo/indexer/index-session.c index aee0914d604a..aa575a24e0b2 100644 --- a/drivers/md/dm-vdo/indexer/index-session.c +++ b/drivers/md/dm-vdo/indexer/index-session.c @@ -100,7 +100,6 @@ static int get_index_session(struct uds_index_session *index_session) int uds_launch_request(struct uds_request *request) { - size_t internal_size; int result; if (request->callback == NULL) { @@ -121,10 +120,7 @@ int uds_launch_request(struct uds_request *request) } /* Reset all internal fields before processing. */ - internal_size = - sizeof(struct uds_request) - offsetof(struct uds_request, zone_number); - // FIXME should be using struct_group for this instead - memset((char *) request + sizeof(*request) - internal_size, 0, internal_size); + memset(&request->internal, 0, sizeof(request->internal)); result = get_index_session(request->session); if (result != UDS_SUCCESS) diff --git a/drivers/md/dm-vdo/indexer/indexer.h b/drivers/md/dm-vdo/indexer/indexer.h index 183a94eb7e92..7c1fc4577f5b 100644 --- a/drivers/md/dm-vdo/indexer/indexer.h +++ b/drivers/md/dm-vdo/indexer/indexer.h @@ -8,6 +8,7 @@ #include <linux/mutex.h> #include <linux/sched.h> +#include <linux/stddef.h> #include <linux/types.h> #include <linux/wait.h> @@ -73,7 +74,7 @@ enum uds_request_type { /* Remove any mapping for a name. */ UDS_DELETE, -}; +} __packed; enum uds_open_index_type { /* Create a new index. */ @@ -226,7 +227,7 @@ struct uds_zone_message { enum uds_zone_message_type type; /* The virtual chapter number to which the message applies */ u64 virtual_chapter; -}; +} __packed; struct uds_index_session; struct uds_index; @@ -253,34 +254,32 @@ struct uds_request { /* The existing data associated with the request name, if any */ struct uds_record_data old_metadata; - /* Either UDS_SUCCESS or an error code for the request */ - int status; /* True if the record name had an existing entry in the index */ bool found; + /* Either UDS_SUCCESS or an error code for the request */ + int status; - /* - * The remaining fields are used internally and should not be altered by clients. The index - * relies on zone_number being the first field in this section. - */ - - /* The number of the zone which will process this request*/ - unsigned int zone_number; - /* A link for adding a request to a lock-free queue */ - struct funnel_queue_entry queue_link; - /* A link for adding a request to a standard linked list */ - struct uds_request *next_request; - /* A pointer to the index processing this request */ - struct uds_index *index; - /* Control message for coordinating between zones */ - struct uds_zone_message zone_message; - /* If true, process request immediately by waking the worker thread */ - bool unbatched; - /* If true, continue this request before processing newer requests */ - bool requeued; - /* The virtual chapter containing the record name, if known */ - u64 virtual_chapter; - /* The region of the index containing the record name */ - enum uds_index_region location; + /* The remaining fields are used internally and should not be altered by clients. */ + struct_group(internal, + /* The virtual chapter containing the record name, if known */ + u64 virtual_chapter; + /* The region of the index containing the record name */ + enum uds_index_region location; + /* If true, process request immediately by waking the worker thread */ + bool unbatched; + /* If true, continue this request before processing newer requests */ + bool requeued; + /* Control message for coordinating between zones */ + struct uds_zone_message zone_message; + /* The number of the zone which will process this request*/ + unsigned int zone_number; + /* A link for adding a request to a lock-free queue */ + struct funnel_queue_entry queue_link; + /* A link for adding a request to a standard linked list */ + struct uds_request *next_request; + /* A pointer to the index processing this request */ + struct uds_index *index; + ); }; /* A session is required for most index operations. */ diff --git a/drivers/md/dm-vdo/io-submitter.c b/drivers/md/dm-vdo/io-submitter.c index 421e5436c32c..11d47770b54d 100644 --- a/drivers/md/dm-vdo/io-submitter.c +++ b/drivers/md/dm-vdo/io-submitter.c @@ -327,6 +327,7 @@ void vdo_submit_data_vio(struct data_vio *data_vio) * @error_handler: the handler for submission or I/O errors (may be NULL) * @operation: the type of I/O to perform * @data: the buffer to read or write (may be NULL) + * @size: the I/O amount in bytes * * The vio is enqueued on a vdo bio queue so that bio submission (which may block) does not block * other vdo threads. @@ -338,7 +339,7 @@ void vdo_submit_data_vio(struct data_vio *data_vio) */ void __submit_metadata_vio(struct vio *vio, physical_block_number_t physical, bio_end_io_t callback, vdo_action_fn error_handler, - blk_opf_t operation, char *data) + blk_opf_t operation, char *data, int size) { int result; struct vdo_completion *completion = &vio->completion; @@ -349,7 +350,8 @@ void __submit_metadata_vio(struct vio *vio, physical_block_number_t physical, vdo_reset_completion(completion); completion->error_handler = error_handler; - result = vio_reset_bio(vio, data, callback, operation | REQ_META, physical); + result = vio_reset_bio_with_size(vio, data, size, callback, operation | REQ_META, + physical); if (result != VDO_SUCCESS) { continue_vio(vio, result); return; diff --git a/drivers/md/dm-vdo/io-submitter.h b/drivers/md/dm-vdo/io-submitter.h index 80748699496f..3088f11055fd 100644 --- a/drivers/md/dm-vdo/io-submitter.h +++ b/drivers/md/dm-vdo/io-submitter.h @@ -8,6 +8,7 @@ #include <linux/bio.h> +#include "constants.h" #include "types.h" struct io_submitter; @@ -26,14 +27,25 @@ void vdo_submit_data_vio(struct data_vio *data_vio); void __submit_metadata_vio(struct vio *vio, physical_block_number_t physical, bio_end_io_t callback, vdo_action_fn error_handler, - blk_opf_t operation, char *data); + blk_opf_t operation, char *data, int size); static inline void vdo_submit_metadata_vio(struct vio *vio, physical_block_number_t physical, bio_end_io_t callback, vdo_action_fn error_handler, blk_opf_t operation) { __submit_metadata_vio(vio, physical, callback, error_handler, - operation, vio->data); + operation, vio->data, vio->block_count * VDO_BLOCK_SIZE); +} + +static inline void vdo_submit_metadata_vio_with_size(struct vio *vio, + physical_block_number_t physical, + bio_end_io_t callback, + vdo_action_fn error_handler, + blk_opf_t operation, + int size) +{ + __submit_metadata_vio(vio, physical, callback, error_handler, + operation, vio->data, size); } static inline void vdo_submit_flush_vio(struct vio *vio, bio_end_io_t callback, @@ -41,7 +53,7 @@ static inline void vdo_submit_flush_vio(struct vio *vio, bio_end_io_t callback, { /* FIXME: Can we just use REQ_OP_FLUSH? */ __submit_metadata_vio(vio, 0, callback, error_handler, - REQ_OP_WRITE | REQ_PREFLUSH, NULL); + REQ_OP_WRITE | REQ_PREFLUSH, NULL, 0); } #endif /* VDO_IO_SUBMITTER_H */ diff --git a/drivers/md/dm-vdo/packer.h b/drivers/md/dm-vdo/packer.h index 0f3be44710b5..8c8d6892582d 100644 --- a/drivers/md/dm-vdo/packer.h +++ b/drivers/md/dm-vdo/packer.h @@ -46,7 +46,7 @@ struct compressed_block { /* * Each packer_bin holds an incomplete batch of data_vios that only partially fill a compressed - * block. The bins are kept in a ring sorted by the amount of unused space so the first bin with + * block. The bins are kept in a list sorted by the amount of unused space so the first bin with * enough space to hold a newly-compressed data_vio can easily be found. When the bin fills up or * is flushed, the first uncanceled data_vio in the bin is selected to be the agent for that bin. * Upon entering the packer, each data_vio already has its compressed data in the first slot of the diff --git a/drivers/md/dm-vdo/priority-table.c b/drivers/md/dm-vdo/priority-table.c index 42d3d8d0e4b5..9bae8256ba4e 100644 --- a/drivers/md/dm-vdo/priority-table.c +++ b/drivers/md/dm-vdo/priority-table.c @@ -199,7 +199,7 @@ void vdo_priority_table_remove(struct priority_table *table, struct list_head *e /* * Remove the entry from the bucket list, remembering a pointer to another entry in the - * ring. + * list. */ next_entry = entry->next; list_del_init(entry); diff --git a/drivers/md/dm-vdo/recovery-journal.h b/drivers/md/dm-vdo/recovery-journal.h index 899071173015..25e7ec6d19f6 100644 --- a/drivers/md/dm-vdo/recovery-journal.h +++ b/drivers/md/dm-vdo/recovery-journal.h @@ -43,9 +43,9 @@ * has a vio which is used to commit that block to disk. The vio's data is the on-disk * representation of the journal block. In addition each in-memory block has a buffer which is used * to accumulate entries while a partial commit of the block is in progress. In-memory blocks are - * kept on two rings. Free blocks live on the 'free_tail_blocks' ring. When a block becomes active - * (see below) it is moved to the 'active_tail_blocks' ring. When a block is fully committed, it is - * moved back to the 'free_tail_blocks' ring. + * kept on two lists. Free blocks live on the 'free_tail_blocks' list. When a block becomes active + * (see below) it is moved to the 'active_tail_blocks' list. When a block is fully committed, it is + * moved back to the 'free_tail_blocks' list. * * When entries are added to the journal, they are added to the active in-memory block, as * indicated by the 'active_block' field. If the caller wishes to wait for the entry to be diff --git a/drivers/md/dm-vdo/slab-depot.c b/drivers/md/dm-vdo/slab-depot.c index 8f0a35c63af6..f3d80ff7bef5 100644 --- a/drivers/md/dm-vdo/slab-depot.c +++ b/drivers/md/dm-vdo/slab-depot.c @@ -139,7 +139,7 @@ static bool is_slab_journal_blank(const struct vdo_slab *slab) } /** - * mark_slab_journal_dirty() - Put a slab journal on the dirty ring of its allocator in the correct + * mark_slab_journal_dirty() - Put a slab journal on the dirty list of its allocator in the correct * order. * @journal: The journal to be marked dirty. * @lock: The recovery journal lock held by the slab journal. @@ -414,8 +414,7 @@ static void complete_reaping(struct vdo_completion *completion) { struct slab_journal *journal = completion->parent; - return_vio_to_pool(journal->slab->allocator->vio_pool, - vio_as_pooled_vio(as_vio(vdo_forget(completion)))); + return_vio_to_pool(vio_as_pooled_vio(as_vio(completion))); finish_reaping(journal); reap_slab_journal(journal); } @@ -698,7 +697,7 @@ static void complete_write(struct vdo_completion *completion) sequence_number_t committed = get_committing_sequence_number(pooled); list_del_init(&pooled->list_entry); - return_vio_to_pool(journal->slab->allocator->vio_pool, vdo_forget(pooled)); + return_vio_to_pool(pooled); if (result != VDO_SUCCESS) { vio_record_metadata_io_error(as_vio(completion)); @@ -822,7 +821,7 @@ static void commit_tail(struct slab_journal *journal) /* * Since we are about to commit the tail block, this journal no longer needs to be on the - * ring of journals which the recovery journal might ask to commit. + * list of journals which the recovery journal might ask to commit. */ mark_slab_journal_clean(journal); @@ -1076,7 +1075,7 @@ static void finish_reference_block_write(struct vdo_completion *completion) /* Release the slab journal lock. */ adjust_slab_journal_block_reference(&slab->journal, block->slab_journal_lock_to_release, -1); - return_vio_to_pool(slab->allocator->vio_pool, pooled); + return_vio_to_pool(pooled); /* * We can't clear the is_writing flag earlier as releasing the slab journal lock may cause @@ -1170,8 +1169,8 @@ static void handle_io_error(struct vdo_completion *completion) struct vdo_slab *slab = ((struct reference_block *) completion->parent)->slab; vio_record_metadata_io_error(vio); - return_vio_to_pool(slab->allocator->vio_pool, vio_as_pooled_vio(vio)); - slab->active_count--; + return_vio_to_pool(vio_as_pooled_vio(vio)); + slab->active_count -= vio->io_size / VDO_BLOCK_SIZE; vdo_enter_read_only_mode(slab->allocator->depot->vdo, result); check_if_slab_drained(slab); } @@ -1372,7 +1371,7 @@ static unsigned int calculate_slab_priority(struct vdo_slab *slab) static void prioritize_slab(struct vdo_slab *slab) { VDO_ASSERT_LOG_ONLY(list_empty(&slab->allocq_entry), - "a slab must not already be on a ring when prioritizing"); + "a slab must not already be on a list when prioritizing"); slab->priority = calculate_slab_priority(slab); vdo_priority_table_enqueue(slab->allocator->prioritized_slabs, slab->priority, &slab->allocq_entry); @@ -2165,28 +2164,95 @@ static void dirty_all_reference_blocks(struct vdo_slab *slab) dirty_block(&slab->reference_blocks[i]); } +static inline bool journal_points_equal(struct journal_point first, + struct journal_point second) +{ + return ((first.sequence_number == second.sequence_number) && + (first.entry_count == second.entry_count)); +} + /** - * clear_provisional_references() - Clear the provisional reference counts from a reference block. - * @block: The block to clear. + * match_bytes() - Check an 8-byte word for bytes matching the value specified + * @input: A word to examine the bytes of + * @match: The byte value sought + * + * Return: 1 in each byte when the corresponding input byte matched, 0 otherwise */ -static void clear_provisional_references(struct reference_block *block) +static inline u64 match_bytes(u64 input, u8 match) { - vdo_refcount_t *counters = get_reference_counters_for_block(block); - block_count_t j; + u64 temp = input ^ (match * 0x0101010101010101ULL); + /* top bit of each byte is set iff top bit of temp byte is clear; rest are 0 */ + u64 test_top_bits = ~temp & 0x8080808080808080ULL; + /* top bit of each byte is set iff low 7 bits of temp byte are clear; rest are useless */ + u64 test_low_bits = 0x8080808080808080ULL - (temp & 0x7f7f7f7f7f7f7f7fULL); + /* return 1 when both tests indicate temp byte is 0 */ + return (test_top_bits & test_low_bits) >> 7; +} + +/** + * count_valid_references() - Process a newly loaded refcount array + * @counters: the array of counters from a metadata block + * + * Scan a 8-byte-aligned array of counters, fixing up any "provisional" values that weren't + * cleaned up at shutdown, changing them internally to "empty". + * + * Return: the number of blocks that are referenced (counters not "empty") + */ +static unsigned int count_valid_references(vdo_refcount_t *counters) +{ + u64 *words = (u64 *)counters; + /* It's easier to count occurrences of a specific byte than its absences. */ + unsigned int empty_count = 0; + /* For speed, we process 8 bytes at once. */ + unsigned int words_left = COUNTS_PER_BLOCK / sizeof(u64); + + /* + * Sanity check assumptions used for optimizing this code: Counters are bytes. The counter + * array is a multiple of the word size. + */ + BUILD_BUG_ON(sizeof(vdo_refcount_t) != 1); + BUILD_BUG_ON((COUNTS_PER_BLOCK % sizeof(u64)) != 0); + + while (words_left > 0) { + /* + * This is used effectively as 8 byte-size counters. Byte 0 counts how many words + * had the target value found in byte 0, etc. We just have to avoid overflow. + */ + u64 split_count = 0; + /* + * The counter "% 255" trick used below to fold split_count into empty_count + * imposes a limit of 254 bytes examined each iteration of the outer loop. We + * process a word at a time, so that limit gets rounded down to 31 u64 words. + */ + const unsigned int max_words_per_iteration = 254 / sizeof(u64); + unsigned int iter_words_left = min_t(unsigned int, words_left, + max_words_per_iteration); + + words_left -= iter_words_left; + + while (iter_words_left--) { + u64 word = *words; + u64 temp; + + /* First, if we have any provisional refcount values, clear them. */ + temp = match_bytes(word, PROVISIONAL_REFERENCE_COUNT); + if (temp) { + /* + * 'temp' has 0x01 bytes where 'word' has PROVISIONAL; this xor + * will alter just those bytes, changing PROVISIONAL to EMPTY. + */ + word ^= temp * (PROVISIONAL_REFERENCE_COUNT ^ EMPTY_REFERENCE_COUNT); + *words = word; + } - for (j = 0; j < COUNTS_PER_BLOCK; j++) { - if (counters[j] == PROVISIONAL_REFERENCE_COUNT) { - counters[j] = EMPTY_REFERENCE_COUNT; - block->allocated_count--; + /* Now count the EMPTY_REFERENCE_COUNT bytes, updating the 8 counters. */ + split_count += match_bytes(word, EMPTY_REFERENCE_COUNT); + words++; } + empty_count += split_count % 255; } -} -static inline bool journal_points_equal(struct journal_point first, - struct journal_point second) -{ - return ((first.sequence_number == second.sequence_number) && - (first.entry_count == second.entry_count)); + return COUNTS_PER_BLOCK - empty_count; } /** @@ -2197,7 +2263,6 @@ static inline bool journal_points_equal(struct journal_point first, static void unpack_reference_block(struct packed_reference_block *packed, struct reference_block *block) { - block_count_t index; sector_count_t i; struct vdo_slab *slab = block->slab; vdo_refcount_t *counters = get_reference_counters_for_block(block); @@ -2223,11 +2288,7 @@ static void unpack_reference_block(struct packed_reference_block *packed, } } - block->allocated_count = 0; - for (index = 0; index < COUNTS_PER_BLOCK; index++) { - if (counters[index] != EMPTY_REFERENCE_COUNT) - block->allocated_count++; - } + block->allocated_count = count_valid_references(counters); } /** @@ -2240,13 +2301,19 @@ static void finish_reference_block_load(struct vdo_completion *completion) struct pooled_vio *pooled = vio_as_pooled_vio(vio); struct reference_block *block = completion->parent; struct vdo_slab *slab = block->slab; + unsigned int block_count = vio->io_size / VDO_BLOCK_SIZE; + unsigned int i; + char *data = vio->data; - unpack_reference_block((struct packed_reference_block *) vio->data, block); - return_vio_to_pool(slab->allocator->vio_pool, pooled); - slab->active_count--; - clear_provisional_references(block); + for (i = 0; i < block_count; i++, block++, data += VDO_BLOCK_SIZE) { + struct packed_reference_block *packed = (struct packed_reference_block *) data; + + unpack_reference_block(packed, block); + slab->free_blocks -= block->allocated_count; + } + return_vio_to_pool(pooled); + slab->active_count -= block_count; - slab->free_blocks -= block->allocated_count; check_if_slab_drained(slab); } @@ -2260,23 +2327,25 @@ static void load_reference_block_endio(struct bio *bio) } /** - * load_reference_block() - After a block waiter has gotten a VIO from the VIO pool, load the - * block. - * @waiter: The waiter of the block to load. + * load_reference_block_group() - After a block waiter has gotten a VIO from the VIO pool, load + * a set of blocks. + * @waiter: The waiter of the first block to load. * @context: The VIO returned by the pool. */ -static void load_reference_block(struct vdo_waiter *waiter, void *context) +static void load_reference_block_group(struct vdo_waiter *waiter, void *context) { struct pooled_vio *pooled = context; struct vio *vio = &pooled->vio; struct reference_block *block = container_of(waiter, struct reference_block, waiter); - size_t block_offset = (block - block->slab->reference_blocks); + u32 block_offset = block - block->slab->reference_blocks; + u32 max_block_count = block->slab->reference_block_count - block_offset; + u32 block_count = min_t(int, vio->block_count, max_block_count); vio->completion.parent = block; - vdo_submit_metadata_vio(vio, block->slab->ref_counts_origin + block_offset, - load_reference_block_endio, handle_io_error, - REQ_OP_READ); + vdo_submit_metadata_vio_with_size(vio, block->slab->ref_counts_origin + block_offset, + load_reference_block_endio, handle_io_error, + REQ_OP_READ, block_count * VDO_BLOCK_SIZE); } /** @@ -2286,14 +2355,21 @@ static void load_reference_block(struct vdo_waiter *waiter, void *context) static void load_reference_blocks(struct vdo_slab *slab) { block_count_t i; + u64 blocks_per_vio = slab->allocator->refcount_blocks_per_big_vio; + struct vio_pool *pool = slab->allocator->refcount_big_vio_pool; + + if (!pool) { + pool = slab->allocator->vio_pool; + blocks_per_vio = 1; + } slab->free_blocks = slab->block_count; slab->active_count = slab->reference_block_count; - for (i = 0; i < slab->reference_block_count; i++) { + for (i = 0; i < slab->reference_block_count; i += blocks_per_vio) { struct vdo_waiter *waiter = &slab->reference_blocks[i].waiter; - waiter->callback = load_reference_block; - acquire_vio_from_pool(slab->allocator->vio_pool, waiter); + waiter->callback = load_reference_block_group; + acquire_vio_from_pool(pool, waiter); } } @@ -2429,7 +2505,7 @@ static void finish_loading_journal(struct vdo_completion *completion) initialize_journal_state(journal); } - return_vio_to_pool(slab->allocator->vio_pool, vio_as_pooled_vio(vio)); + return_vio_to_pool(vio_as_pooled_vio(vio)); vdo_finish_loading_with_result(&slab->state, allocate_counters_if_clean(slab)); } @@ -2449,7 +2525,7 @@ static void handle_load_error(struct vdo_completion *completion) struct vio *vio = as_vio(completion); vio_record_metadata_io_error(vio); - return_vio_to_pool(journal->slab->allocator->vio_pool, vio_as_pooled_vio(vio)); + return_vio_to_pool(vio_as_pooled_vio(vio)); vdo_finish_loading_with_result(&journal->slab->state, result); } @@ -2547,7 +2623,7 @@ static void queue_slab(struct vdo_slab *slab) int result; VDO_ASSERT_LOG_ONLY(list_empty(&slab->allocq_entry), - "a requeued slab must not already be on a ring"); + "a requeued slab must not already be on a list"); if (vdo_is_read_only(allocator->depot->vdo)) return; @@ -2700,6 +2776,7 @@ static void finish_scrubbing(struct slab_scrubber *scrubber, int result) vdo_log_info("VDO commencing normal operation"); else if (prior_state == VDO_RECOVERING) vdo_log_info("Exiting recovery mode"); + free_vio_pool(vdo_forget(allocator->refcount_big_vio_pool)); } /* @@ -3281,7 +3358,7 @@ int vdo_release_block_reference(struct block_allocator *allocator, * This is a min_heap callback function orders slab_status structures using the 'is_clean' field as * the primary key and the 'emptiness' field as the secondary key. * - * Slabs need to be pushed onto the rings in the same order they are to be popped off. Popping + * Slabs need to be pushed onto the lists in the same order they are to be popped off. Popping * should always get the most empty first, so pushing should be from most empty to least empty. * Thus, the ordering is reversed from the usual sense since min_heap returns smaller elements * before larger ones. @@ -3983,6 +4060,7 @@ static int __must_check initialize_block_allocator(struct slab_depot *depot, struct vdo *vdo = depot->vdo; block_count_t max_free_blocks = depot->slab_config.data_blocks; unsigned int max_priority = (2 + ilog2(max_free_blocks)); + u32 reference_block_count, refcount_reads_needed, refcount_blocks_per_vio; *allocator = (struct block_allocator) { .depot = depot, @@ -4000,12 +4078,24 @@ static int __must_check initialize_block_allocator(struct slab_depot *depot, return result; vdo_initialize_completion(&allocator->completion, vdo, VDO_BLOCK_ALLOCATOR_COMPLETION); - result = make_vio_pool(vdo, BLOCK_ALLOCATOR_VIO_POOL_SIZE, allocator->thread_id, + result = make_vio_pool(vdo, BLOCK_ALLOCATOR_VIO_POOL_SIZE, 1, allocator->thread_id, VIO_TYPE_SLAB_JOURNAL, VIO_PRIORITY_METADATA, allocator, &allocator->vio_pool); if (result != VDO_SUCCESS) return result; + /* Initialize the refcount-reading vio pool. */ + reference_block_count = vdo_get_saved_reference_count_size(depot->slab_config.slab_blocks); + refcount_reads_needed = DIV_ROUND_UP(reference_block_count, MAX_BLOCKS_PER_VIO); + refcount_blocks_per_vio = DIV_ROUND_UP(reference_block_count, refcount_reads_needed); + allocator->refcount_blocks_per_big_vio = refcount_blocks_per_vio; + result = make_vio_pool(vdo, BLOCK_ALLOCATOR_REFCOUNT_VIO_POOL_SIZE, + allocator->refcount_blocks_per_big_vio, allocator->thread_id, + VIO_TYPE_SLAB_JOURNAL, VIO_PRIORITY_METADATA, + NULL, &allocator->refcount_big_vio_pool); + if (result != VDO_SUCCESS) + return result; + result = initialize_slab_scrubber(allocator); if (result != VDO_SUCCESS) return result; @@ -4223,6 +4313,7 @@ void vdo_free_slab_depot(struct slab_depot *depot) uninitialize_allocator_summary(allocator); uninitialize_scrubber_vio(&allocator->scrubber); free_vio_pool(vdo_forget(allocator->vio_pool)); + free_vio_pool(vdo_forget(allocator->refcount_big_vio_pool)); vdo_free_priority_table(vdo_forget(allocator->prioritized_slabs)); } diff --git a/drivers/md/dm-vdo/slab-depot.h b/drivers/md/dm-vdo/slab-depot.h index f234853501ca..fadc0c9d4dc4 100644 --- a/drivers/md/dm-vdo/slab-depot.h +++ b/drivers/md/dm-vdo/slab-depot.h @@ -45,6 +45,13 @@ enum { /* The number of vios in the vio pool is proportional to the throughput of the VDO. */ BLOCK_ALLOCATOR_VIO_POOL_SIZE = 128, + + /* + * The number of vios in the vio pool used for loading reference count data. A slab's + * refcounts is capped at ~8MB, and we process one at a time in a zone, so 9 should be + * plenty. + */ + BLOCK_ALLOCATOR_REFCOUNT_VIO_POOL_SIZE = 9, }; /* @@ -248,7 +255,7 @@ struct vdo_slab { /* A list of the dirty blocks waiting to be written out */ struct vdo_wait_queue dirty_blocks; - /* The number of blocks which are currently writing */ + /* The number of blocks which are currently reading or writing */ size_t active_count; /* A waiter object for updating the slab summary */ @@ -425,6 +432,10 @@ struct block_allocator { /* The vio pool for reading and writing block allocator metadata */ struct vio_pool *vio_pool; + /* The vio pool for large initial reads of ref count areas */ + struct vio_pool *refcount_big_vio_pool; + /* How many ref count blocks are read per vio at initial load */ + u32 refcount_blocks_per_big_vio; /* The dm_kcopyd client for erasing slab journals */ struct dm_kcopyd_client *eraser; /* Iterator over the slabs to be erased */ diff --git a/drivers/md/dm-vdo/types.h b/drivers/md/dm-vdo/types.h index dbe892b10f26..cdf36e7d7702 100644 --- a/drivers/md/dm-vdo/types.h +++ b/drivers/md/dm-vdo/types.h @@ -376,6 +376,9 @@ struct vio { /* The size of this vio in blocks */ unsigned int block_count; + /* The amount of data to be read or written, in bytes */ + unsigned int io_size; + /* The data being read or written. */ char *data; diff --git a/drivers/md/dm-vdo/vdo.c b/drivers/md/dm-vdo/vdo.c index a7e32baab4af..80b608674022 100644 --- a/drivers/md/dm-vdo/vdo.c +++ b/drivers/md/dm-vdo/vdo.c @@ -31,9 +31,7 @@ #include <linux/completion.h> #include <linux/device-mapper.h> -#include <linux/kernel.h> #include <linux/lz4.h> -#include <linux/module.h> #include <linux/mutex.h> #include <linux/spinlock.h> #include <linux/types.h> @@ -142,12 +140,6 @@ static void finish_vdo_request_queue(void *ptr) vdo_unregister_allocating_thread(); } -#ifdef MODULE -#define MODULE_NAME THIS_MODULE->name -#else -#define MODULE_NAME "dm-vdo" -#endif /* MODULE */ - static const struct vdo_work_queue_type default_queue_type = { .start = start_vdo_request_queue, .finish = finish_vdo_request_queue, @@ -559,8 +551,7 @@ int vdo_make(unsigned int instance, struct device_config *config, char **reason, *vdo_ptr = vdo; snprintf(vdo->thread_name_prefix, sizeof(vdo->thread_name_prefix), - "%s%u", MODULE_NAME, instance); - BUG_ON(vdo->thread_name_prefix[0] == '\0'); + "vdo%u", instance); result = vdo_allocate(vdo->thread_config.thread_count, struct vdo_thread, __func__, &vdo->threads); if (result != VDO_SUCCESS) { diff --git a/drivers/md/dm-vdo/vio.c b/drivers/md/dm-vdo/vio.c index e710f3c5a972..e7f4153e55e3 100644 --- a/drivers/md/dm-vdo/vio.c +++ b/drivers/md/dm-vdo/vio.c @@ -188,14 +188,23 @@ void vdo_set_bio_properties(struct bio *bio, struct vio *vio, bio_end_io_t callb /* * Prepares the bio to perform IO with the specified buffer. May only be used on a VDO-allocated - * bio, as it assumes the bio wraps a 4k buffer that is 4k aligned, but there does not have to be a - * vio associated with the bio. + * bio, as it assumes the bio wraps a 4k-multiple buffer that is 4k aligned, but there does not + * have to be a vio associated with the bio. */ int vio_reset_bio(struct vio *vio, char *data, bio_end_io_t callback, blk_opf_t bi_opf, physical_block_number_t pbn) { - int bvec_count, offset, len, i; + return vio_reset_bio_with_size(vio, data, vio->block_count * VDO_BLOCK_SIZE, + callback, bi_opf, pbn); +} + +int vio_reset_bio_with_size(struct vio *vio, char *data, int size, bio_end_io_t callback, + blk_opf_t bi_opf, physical_block_number_t pbn) +{ + int bvec_count, offset, i; struct bio *bio = vio->bio; + int vio_size = vio->block_count * VDO_BLOCK_SIZE; + int remaining; bio_reset(bio, bio->bi_bdev, bi_opf); vdo_set_bio_properties(bio, vio, callback, bi_opf, pbn); @@ -205,22 +214,21 @@ int vio_reset_bio(struct vio *vio, char *data, bio_end_io_t callback, bio->bi_ioprio = 0; bio->bi_io_vec = bio->bi_inline_vecs; bio->bi_max_vecs = vio->block_count + 1; - len = VDO_BLOCK_SIZE * vio->block_count; + if (VDO_ASSERT(size <= vio_size, "specified size %d is not greater than allocated %d", + size, vio_size) != VDO_SUCCESS) + size = vio_size; + vio->io_size = size; offset = offset_in_page(data); - bvec_count = DIV_ROUND_UP(offset + len, PAGE_SIZE); + bvec_count = DIV_ROUND_UP(offset + size, PAGE_SIZE); + remaining = size; - /* - * If we knew that data was always on one page, or contiguous pages, we wouldn't need the - * loop. But if we're using vmalloc, it's not impossible that the data is in different - * pages that can't be merged in bio_add_page... - */ - for (i = 0; (i < bvec_count) && (len > 0); i++) { + for (i = 0; (i < bvec_count) && (remaining > 0); i++) { struct page *page; int bytes_added; int bytes = PAGE_SIZE - offset; - if (bytes > len) - bytes = len; + if (bytes > remaining) + bytes = remaining; page = is_vmalloc_addr(data) ? vmalloc_to_page(data) : virt_to_page(data); bytes_added = bio_add_page(bio, page, bytes, offset); @@ -232,7 +240,7 @@ int vio_reset_bio(struct vio *vio, char *data, bio_end_io_t callback, } data += bytes; - len -= bytes; + remaining -= bytes; offset = 0; } @@ -301,6 +309,7 @@ void vio_record_metadata_io_error(struct vio *vio) * make_vio_pool() - Create a new vio pool. * @vdo: The vdo. * @pool_size: The number of vios in the pool. + * @block_count: The number of 4k blocks per vio. * @thread_id: The ID of the thread using this pool. * @vio_type: The type of vios in the pool. * @priority: The priority with which vios from the pool should be enqueued. @@ -309,13 +318,14 @@ void vio_record_metadata_io_error(struct vio *vio) * * Return: A success or error code. */ -int make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id, +int make_vio_pool(struct vdo *vdo, size_t pool_size, size_t block_count, thread_id_t thread_id, enum vio_type vio_type, enum vio_priority priority, void *context, struct vio_pool **pool_ptr) { struct vio_pool *pool; char *ptr; int result; + size_t per_vio_size = VDO_BLOCK_SIZE * block_count; result = vdo_allocate_extended(struct vio_pool, pool_size, struct pooled_vio, __func__, &pool); @@ -326,7 +336,7 @@ int make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id, INIT_LIST_HEAD(&pool->available); INIT_LIST_HEAD(&pool->busy); - result = vdo_allocate(pool_size * VDO_BLOCK_SIZE, char, + result = vdo_allocate(pool_size * per_vio_size, char, "VIO pool buffer", &pool->buffer); if (result != VDO_SUCCESS) { free_vio_pool(pool); @@ -334,10 +344,10 @@ int make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id, } ptr = pool->buffer; - for (pool->size = 0; pool->size < pool_size; pool->size++, ptr += VDO_BLOCK_SIZE) { + for (pool->size = 0; pool->size < pool_size; pool->size++, ptr += per_vio_size) { struct pooled_vio *pooled = &pool->vios[pool->size]; - result = allocate_vio_components(vdo, vio_type, priority, NULL, 1, ptr, + result = allocate_vio_components(vdo, vio_type, priority, NULL, block_count, ptr, &pooled->vio); if (result != VDO_SUCCESS) { free_vio_pool(pool); @@ -345,6 +355,7 @@ int make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id, } pooled->context = context; + pooled->pool = pool; list_add_tail(&pooled->pool_entry, &pool->available); } @@ -419,12 +430,13 @@ void acquire_vio_from_pool(struct vio_pool *pool, struct vdo_waiter *waiter) } /** - * return_vio_to_pool() - Return a vio to the pool - * @pool: The vio pool. + * return_vio_to_pool() - Return a vio to its pool * @vio: The pooled vio to return. */ -void return_vio_to_pool(struct vio_pool *pool, struct pooled_vio *vio) +void return_vio_to_pool(struct pooled_vio *vio) { + struct vio_pool *pool = vio->pool; + VDO_ASSERT_LOG_ONLY((pool->thread_id == vdo_get_callback_thread_id()), "vio pool entry returned on same thread as it was acquired"); diff --git a/drivers/md/dm-vdo/vio.h b/drivers/md/dm-vdo/vio.h index 3490e9f59b04..4bfcb21901f1 100644 --- a/drivers/md/dm-vdo/vio.h +++ b/drivers/md/dm-vdo/vio.h @@ -30,6 +30,8 @@ struct pooled_vio { void *context; /* The list entry used by the pool */ struct list_head pool_entry; + /* The pool this vio is allocated from */ + struct vio_pool *pool; }; /** @@ -123,6 +125,8 @@ void vdo_set_bio_properties(struct bio *bio, struct vio *vio, bio_end_io_t callb int vio_reset_bio(struct vio *vio, char *data, bio_end_io_t callback, blk_opf_t bi_opf, physical_block_number_t pbn); +int vio_reset_bio_with_size(struct vio *vio, char *data, int size, bio_end_io_t callback, + blk_opf_t bi_opf, physical_block_number_t pbn); void update_vio_error_stats(struct vio *vio, const char *format, ...) __printf(2, 3); @@ -188,12 +192,13 @@ static inline struct pooled_vio *vio_as_pooled_vio(struct vio *vio) struct vio_pool; -int __must_check make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id, - enum vio_type vio_type, enum vio_priority priority, - void *context, struct vio_pool **pool_ptr); +int __must_check make_vio_pool(struct vdo *vdo, size_t pool_size, size_t block_count, + thread_id_t thread_id, enum vio_type vio_type, + enum vio_priority priority, void *context, + struct vio_pool **pool_ptr); void free_vio_pool(struct vio_pool *pool); bool __must_check is_vio_pool_busy(struct vio_pool *pool); void acquire_vio_from_pool(struct vio_pool *pool, struct vdo_waiter *waiter); -void return_vio_to_pool(struct vio_pool *pool, struct pooled_vio *vio); +void return_vio_to_pool(struct pooled_vio *vio); #endif /* VIO_H */ diff --git a/drivers/md/dm-vdo/wait-queue.c b/drivers/md/dm-vdo/wait-queue.c index 6e1e739277ef..f81ed0cee2bf 100644 --- a/drivers/md/dm-vdo/wait-queue.c +++ b/drivers/md/dm-vdo/wait-queue.c @@ -34,7 +34,7 @@ void vdo_waitq_enqueue_waiter(struct vdo_wait_queue *waitq, struct vdo_waiter *w waitq->last_waiter->next_waiter = waiter; } - /* In both cases, the waiter we added to the ring becomes the last waiter. */ + /* In both cases, the waiter we added to the list becomes the last waiter. */ waitq->last_waiter = waiter; waitq->length += 1; } diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index e86c1431b108..3c427f18a04b 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -30,6 +30,7 @@ #define DM_VERITY_ENV_VAR_NAME "DM_VERITY_ERR_BLOCK_NR" #define DM_VERITY_DEFAULT_PREFETCH_SIZE 262144 +#define DM_VERITY_USE_BH_DEFAULT_BYTES 8192 #define DM_VERITY_MAX_CORRUPTED_ERRS 100 @@ -49,6 +50,15 @@ static unsigned int dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, 0644); +static unsigned int dm_verity_use_bh_bytes[4] = { + DM_VERITY_USE_BH_DEFAULT_BYTES, // IOPRIO_CLASS_NONE + DM_VERITY_USE_BH_DEFAULT_BYTES, // IOPRIO_CLASS_RT + DM_VERITY_USE_BH_DEFAULT_BYTES, // IOPRIO_CLASS_BE + 0 // IOPRIO_CLASS_IDLE +}; + +module_param_array_named(use_bh_bytes, dm_verity_use_bh_bytes, uint, NULL, 0644); + static DEFINE_STATIC_KEY_FALSE(use_bh_wq_enabled); /* Is at least one dm-verity instance using ahash_tfm instead of shash_tfm? */ @@ -311,7 +321,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, if (static_branch_unlikely(&use_bh_wq_enabled) && io->in_bh) { data = dm_bufio_get(v->bufio, hash_block, &buf); - if (data == NULL) { + if (IS_ERR_OR_NULL(data)) { /* * In tasklet and the hash was not in the bufio cache. * Return early and resume execution from a work-queue @@ -324,8 +334,24 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, &buf, bio->bi_ioprio); } - if (IS_ERR(data)) - return PTR_ERR(data); + if (IS_ERR(data)) { + if (skip_unverified) + return 1; + r = PTR_ERR(data); + data = dm_bufio_new(v->bufio, hash_block, &buf); + if (IS_ERR(data)) + return r; + if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_METADATA, + hash_block, data) == 0) { + aux = dm_bufio_get_aux_data(buf); + aux->hash_verified = 1; + goto release_ok; + } else { + dm_bufio_release(buf); + dm_bufio_forget(v->bufio, hash_block); + return r; + } + } aux = dm_bufio_get_aux_data(buf); @@ -366,6 +392,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, } } +release_ok: data += offset; memcpy(want_digest, data, v->digest_size); r = 0; @@ -652,9 +679,17 @@ static void verity_bh_work(struct work_struct *w) verity_finish_io(io, errno_to_blk_status(err)); } +static inline bool verity_use_bh(unsigned int bytes, unsigned short ioprio) +{ + return ioprio <= IOPRIO_CLASS_IDLE && + bytes <= READ_ONCE(dm_verity_use_bh_bytes[ioprio]); +} + static void verity_end_io(struct bio *bio) { struct dm_verity_io *io = bio->bi_private; + unsigned short ioprio = IOPRIO_PRIO_CLASS(bio->bi_ioprio); + unsigned int bytes = io->n_blocks << io->v->data_dev_block_bits; if (bio->bi_status && (!verity_fec_is_enabled(io->v) || @@ -664,9 +699,14 @@ static void verity_end_io(struct bio *bio) return; } - if (static_branch_unlikely(&use_bh_wq_enabled) && io->v->use_bh_wq) { - INIT_WORK(&io->bh_work, verity_bh_work); - queue_work(system_bh_wq, &io->bh_work); + if (static_branch_unlikely(&use_bh_wq_enabled) && io->v->use_bh_wq && + verity_use_bh(bytes, ioprio)) { + if (in_hardirq() || irqs_disabled()) { + INIT_WORK(&io->bh_work, verity_bh_work); + queue_work(system_bh_wq, &io->bh_work); + } else { + verity_bh_work(&io->bh_work); + } } else { INIT_WORK(&io->work, verity_work); queue_work(io->v->verify_wq, &io->work); @@ -796,6 +836,13 @@ static int verity_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_SUBMITTED; } +static void verity_postsuspend(struct dm_target *ti) +{ + struct dm_verity *v = ti->private; + flush_workqueue(v->verify_wq); + dm_bufio_client_reset(v->bufio); +} + /* * Status: V (valid) or C (corruption found) */ @@ -1761,11 +1808,12 @@ static struct target_type verity_target = { .name = "verity", /* Note: the LSMs depend on the singleton and immutable features */ .features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE, - .version = {1, 10, 0}, + .version = {1, 11, 0}, .module = THIS_MODULE, .ctr = verity_ctr, .dtr = verity_dtr, .map = verity_map, + .postsuspend = verity_postsuspend, .status = verity_status, .prepare_ioctl = verity_prepare_ioctl, .iterate_devices = verity_iterate_devices, diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index 7ce8847b3404..d6a04a57472d 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -797,7 +797,7 @@ static void writecache_flush(struct dm_writecache *wc) bool need_flush_after_free; wc->uncommitted_blocks = 0; - del_timer(&wc->autocommit_timer); + timer_delete(&wc->autocommit_timer); if (list_empty(&wc->lru)) return; @@ -927,8 +927,8 @@ static void writecache_suspend(struct dm_target *ti) struct dm_writecache *wc = ti->private; bool flush_on_suspend; - del_timer_sync(&wc->autocommit_timer); - del_timer_sync(&wc->max_age_timer); + timer_delete_sync(&wc->autocommit_timer); + timer_delete_sync(&wc->max_age_timer); wc_lock(wc); writecache_flush(wc); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 4d1e42891d24..5ab7574c0c76 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1540,14 +1540,18 @@ static void __send_empty_flush(struct clone_info *ci) { struct dm_table *t = ci->map; struct bio flush_bio; + blk_opf_t opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC; + + if ((ci->io->orig_bio->bi_opf & (REQ_IDLE | REQ_SYNC)) == + (REQ_IDLE | REQ_SYNC)) + opf |= REQ_IDLE; /* * Use an on-stack bio for this, it's safe since we don't * need to reference it after submit. It's just used as * the basis for the clone(s). */ - bio_init(&flush_bio, ci->io->md->disk->part0, NULL, 0, - REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC); + bio_init(&flush_bio, ci->io->md->disk->part0, NULL, 0, opf); ci->bio = &flush_bio; ci->sector_count = 0; diff --git a/drivers/md/md.c b/drivers/md/md.c index 438e71e45c16..9daa78c5fe33 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4064,7 +4064,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) * it must always be in_sync */ mddev->in_sync = 1; - del_timer_sync(&mddev->safemode_timer); + timer_delete_sync(&mddev->safemode_timer); } pers->run(mddev); set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); @@ -6405,7 +6405,7 @@ static void md_clean(struct mddev *mddev) static void __md_stop_writes(struct mddev *mddev) { - del_timer_sync(&mddev->safemode_timer); + timer_delete_sync(&mddev->safemode_timer); if (mddev->pers && mddev->pers->quiesce) { mddev->pers->quiesce(mddev, 1); diff --git a/drivers/md/persistent-data/Kconfig b/drivers/md/persistent-data/Kconfig index f4f948b0e173..dbb97a7233ab 100644 --- a/drivers/md/persistent-data/Kconfig +++ b/drivers/md/persistent-data/Kconfig @@ -2,7 +2,7 @@ config DM_PERSISTENT_DATA tristate depends on BLK_DEV_DM - select LIBCRC32C + select CRC32 select DM_BUFIO help Library providing immutable on-disk data structure support for |