summary | refs | log | tree | commit | diff
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r-- drivers/md/Kconfig | 1
-rw-r--r-- drivers/md/bcache/stats.c | 2
-rw-r--r-- drivers/md/dm-bufio.c | 4
-rw-r--r-- drivers/md/dm-cache-target.c | 96
-rw-r--r-- drivers/md/dm-crypt.c | 41
-rw-r--r-- drivers/md/dm-delay.c | 18
-rw-r--r-- drivers/md/dm-ebs-target.c | 7
-rw-r--r-- drivers/md/dm-flakey.c | 2
-rw-r--r-- drivers/md/dm-integrity.c | 89
-rw-r--r-- drivers/md/dm-mpath.c | 2
-rw-r--r-- drivers/md/dm-raid1.c | 2
-rw-r--r-- drivers/md/dm-stripe.c | 2
-rw-r--r-- drivers/md/dm-table.c | 11
-rw-r--r-- drivers/md/dm-vdo/block-map.c | 13
-rw-r--r-- drivers/md/dm-vdo/constants.h | 3
-rw-r--r-- drivers/md/dm-vdo/dedupe.c | 23
-rw-r--r-- drivers/md/dm-vdo/encodings.c | 20
-rw-r--r-- drivers/md/dm-vdo/indexer/index-layout.c | 5
-rw-r--r-- drivers/md/dm-vdo/indexer/index-session.c | 6
-rw-r--r-- drivers/md/dm-vdo/indexer/indexer.h | 53
-rw-r--r-- drivers/md/dm-vdo/io-submitter.c | 6
-rw-r--r-- drivers/md/dm-vdo/io-submitter.h | 18
-rw-r--r-- drivers/md/dm-vdo/packer.h | 2
-rw-r--r-- drivers/md/dm-vdo/priority-table.c | 2
-rw-r--r-- drivers/md/dm-vdo/recovery-journal.h | 6
-rw-r--r-- drivers/md/dm-vdo/slab-depot.c | 193
-rw-r--r-- drivers/md/dm-vdo/slab-depot.h | 13
-rw-r--r-- drivers/md/dm-vdo/types.h | 3
-rw-r--r-- drivers/md/dm-vdo/vdo.c | 11
-rw-r--r-- drivers/md/dm-vdo/vio.c | 54
-rw-r--r-- drivers/md/dm-vdo/vio.h | 13
-rw-r--r-- drivers/md/dm-vdo/wait-queue.c | 2
-rw-r--r-- drivers/md/dm-verity-target.c | 62
-rw-r--r-- drivers/md/dm-writecache.c | 6
-rw-r--r-- drivers/md/dm.c | 8
-rw-r--r-- drivers/md/md-bitmap.c | 14
-rw-r--r-- drivers/md/md-cluster.c | 18
-rw-r--r-- drivers/md/md-cluster.h | 6
-rw-r--r-- drivers/md/md-linear.c | 19
-rw-r--r-- drivers/md/md.c | 360
-rw-r--r-- drivers/md/md.h | 62
-rw-r--r-- drivers/md/raid0.c | 22
-rw-r--r-- drivers/md/raid1-10.c | 6
-rw-r--r-- drivers/md/raid1.c | 60
-rw-r--r-- drivers/md/raid10.c | 70
-rw-r--r-- drivers/md/raid5-cache.c | 31
-rw-r--r-- drivers/md/raid5-ppl.c | 16
-rw-r--r-- drivers/md/raid5.c | 91
48 files changed, 929 insertions, 645 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 0b1870a09e1f..06f809e70f15 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -267,6 +267,7 @@ config DM_CRYPT
depends on BLK_DEV_DM
depends on (ENCRYPTED_KEYS || ENCRYPTED_KEYS=n)
depends on (TRUSTED_KEYS || TRUSTED_KEYS=n)
+ select CRC32
select CRYPTO
select CRYPTO_CBC
select CRYPTO_ESSIV
diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c
index 68b02216033d..d39dec34b7a3 100644
--- a/drivers/md/bcache/stats.c
+++ b/drivers/md/bcache/stats.c
@@ -123,7 +123,7 @@ void bch_cache_accounting_destroy(struct cache_accounting *acc)
kobject_put(&acc->day.kobj);
atomic_set(&acc->closing, 1);
- if (del_timer_sync(&acc->timer))
+ if (timer_delete_sync(&acc->timer))
closure_return(&acc->cl);
}
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index aab8240429b0..9c8ed65cd87e 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -2234,7 +2234,7 @@ int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t c
}
EXPORT_SYMBOL_GPL(dm_bufio_issue_discard);
-static bool forget_buffer(struct dm_bufio_client *c, sector_t block)
+static void forget_buffer(struct dm_bufio_client *c, sector_t block)
{
struct dm_buffer *b;
@@ -2249,8 +2249,6 @@ static bool forget_buffer(struct dm_bufio_client *c, sector_t block)
cache_put_and_wake(c, b);
}
}
-
- return b ? true : false;
}
/*
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 9cb797a561d6..a10d75a562db 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -406,6 +406,12 @@ struct cache {
mempool_t migration_pool;
struct bio_set bs;
+
+ /*
+ * Cache_size entries. Set bits indicate blocks mapped beyond the
+ * target length, which are marked for invalidation.
+ */
+ unsigned long *invalid_bitset;
};
struct per_bio_data {
@@ -1922,6 +1928,9 @@ static void __destroy(struct cache *cache)
if (cache->discard_bitset)
free_bitset(cache->discard_bitset);
+ if (cache->invalid_bitset)
+ free_bitset(cache->invalid_bitset);
+
if (cache->copier)
dm_kcopyd_client_destroy(cache->copier);
@@ -2510,6 +2519,13 @@ static int cache_create(struct cache_args *ca, struct cache **result)
}
clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
+ cache->invalid_bitset = alloc_bitset(from_cblock(cache->cache_size));
+ if (!cache->invalid_bitset) {
+ *error = "could not allocate bitset for invalid blocks";
+ goto bad;
+ }
+ clear_bitset(cache->invalid_bitset, from_cblock(cache->cache_size));
+
cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
if (IS_ERR(cache->copier)) {
*error = "could not create kcopyd client";
@@ -2808,6 +2824,24 @@ static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
return policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid);
}
+static int load_filtered_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
+ bool dirty, uint32_t hint, bool hint_valid)
+{
+ struct cache *cache = context;
+
+ if (from_oblock(oblock) >= from_oblock(cache->origin_blocks)) {
+ if (dirty) {
+ DMERR("%s: unable to shrink origin; cache block %u is dirty",
+ cache_device_name(cache), from_cblock(cblock));
+ return -EFBIG;
+ }
+ set_bit(from_cblock(cblock), cache->invalid_bitset);
+ return 0;
+ }
+
+ return load_mapping(context, oblock, cblock, dirty, hint, hint_valid);
+}
+
/*
* The discard block size in the on disk metadata is not
* necessarily the same as we're currently using. So we have to
@@ -2899,6 +2933,27 @@ static dm_cblock_t get_cache_dev_size(struct cache *cache)
return to_cblock(size);
}
+static bool can_resume(struct cache *cache)
+{
+ /*
+ * Disallow retrying the resume operation for devices that failed the
+ * first resume attempt, as the failure leaves the policy object partially
+ * initialized. Retrying could trigger BUG_ON when loading cache mappings
+ * into the incomplete policy object.
+ */
+ if (cache->sized && !cache->loaded_mappings) {
+ if (get_cache_mode(cache) != CM_WRITE)
+ DMERR("%s: unable to resume a failed-loaded cache, please check metadata.",
+ cache_device_name(cache));
+ else
+ DMERR("%s: unable to resume cache due to missing proper cache table reload",
+ cache_device_name(cache));
+ return false;
+ }
+
+ return true;
+}
+
static bool can_resize(struct cache *cache, dm_cblock_t new_size)
{
if (from_cblock(new_size) > from_cblock(cache->cache_size)) {
@@ -2941,12 +2996,33 @@ static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
return 0;
}
+static int truncate_oblocks(struct cache *cache)
+{
+ uint32_t nr_blocks = from_cblock(cache->cache_size);
+ uint32_t i;
+ int r;
+
+ for_each_set_bit(i, cache->invalid_bitset, nr_blocks) {
+ r = dm_cache_remove_mapping(cache->cmd, to_cblock(i));
+ if (r) {
+ DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata",
+ cache_device_name(cache));
+ return r;
+ }
+ }
+
+ return 0;
+}
+
static int cache_preresume(struct dm_target *ti)
{
int r = 0;
struct cache *cache = ti->private;
dm_cblock_t csize = get_cache_dev_size(cache);
+ if (!can_resume(cache))
+ return -EINVAL;
+
/*
* Check to see if the cache has resized.
*/
@@ -2962,11 +3038,25 @@ static int cache_preresume(struct dm_target *ti)
}
if (!cache->loaded_mappings) {
+ /*
+ * The fast device could have been resized since the last
+ * failed preresume attempt. To be safe we start by a blank
+ * bitset for cache blocks.
+ */
+ clear_bitset(cache->invalid_bitset, from_cblock(cache->cache_size));
+
r = dm_cache_load_mappings(cache->cmd, cache->policy,
- load_mapping, cache);
+ load_filtered_mapping, cache);
if (r) {
DMERR("%s: could not load cache mappings", cache_device_name(cache));
- metadata_operation_failed(cache, "dm_cache_load_mappings", r);
+ if (r != -EFBIG)
+ metadata_operation_failed(cache, "dm_cache_load_mappings", r);
+ return r;
+ }
+
+ r = truncate_oblocks(cache);
+ if (r) {
+ metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
return r;
}
@@ -3426,7 +3516,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
static struct target_type cache_target = {
.name = "cache",
- .version = {2, 2, 0},
+ .version = {2, 3, 0},
.module = THIS_MODULE,
.ctr = cache_ctr,
.dtr = cache_dtr,
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 02a2919f4e5a..9dfdb63220d7 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -17,6 +17,7 @@
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
+#include <linux/crc32.h>
#include <linux/mempool.h>
#include <linux/slab.h>
#include <linux/crypto.h>
@@ -125,7 +126,6 @@ struct iv_lmk_private {
#define TCW_WHITENING_SIZE 16
struct iv_tcw_private {
- struct crypto_shash *crc32_tfm;
u8 *iv_seed;
u8 *whitening;
};
@@ -607,10 +607,6 @@ static void crypt_iv_tcw_dtr(struct crypt_config *cc)
tcw->iv_seed = NULL;
kfree_sensitive(tcw->whitening);
tcw->whitening = NULL;
-
- if (tcw->crc32_tfm && !IS_ERR(tcw->crc32_tfm))
- crypto_free_shash(tcw->crc32_tfm);
- tcw->crc32_tfm = NULL;
}
static int crypt_iv_tcw_ctr(struct crypt_config *cc, struct dm_target *ti,
@@ -628,13 +624,6 @@ static int crypt_iv_tcw_ctr(struct crypt_config *cc, struct dm_target *ti,
return -EINVAL;
}
- tcw->crc32_tfm = crypto_alloc_shash("crc32", 0,
- CRYPTO_ALG_ALLOCATES_MEMORY);
- if (IS_ERR(tcw->crc32_tfm)) {
- ti->error = "Error initializing CRC32 in TCW";
- return PTR_ERR(tcw->crc32_tfm);
- }
-
tcw->iv_seed = kzalloc(cc->iv_size, GFP_KERNEL);
tcw->whitening = kzalloc(TCW_WHITENING_SIZE, GFP_KERNEL);
if (!tcw->iv_seed || !tcw->whitening) {
@@ -668,36 +657,28 @@ static int crypt_iv_tcw_wipe(struct crypt_config *cc)
return 0;
}
-static int crypt_iv_tcw_whitening(struct crypt_config *cc,
- struct dm_crypt_request *dmreq,
- u8 *data)
+static void crypt_iv_tcw_whitening(struct crypt_config *cc,
+ struct dm_crypt_request *dmreq, u8 *data)
{
struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
__le64 sector = cpu_to_le64(dmreq->iv_sector);
u8 buf[TCW_WHITENING_SIZE];
- SHASH_DESC_ON_STACK(desc, tcw->crc32_tfm);
- int i, r;
+ int i;
/* xor whitening with sector number */
crypto_xor_cpy(buf, tcw->whitening, (u8 *)&sector, 8);
crypto_xor_cpy(&buf[8], tcw->whitening + 8, (u8 *)&sector, 8);
/* calculate crc32 for every 32bit part and xor it */
- desc->tfm = tcw->crc32_tfm;
- for (i = 0; i < 4; i++) {
- r = crypto_shash_digest(desc, &buf[i * 4], 4, &buf[i * 4]);
- if (r)
- goto out;
- }
+ for (i = 0; i < 4; i++)
+ put_unaligned_le32(crc32(0, &buf[i * 4], 4), &buf[i * 4]);
crypto_xor(&buf[0], &buf[12], 4);
crypto_xor(&buf[4], &buf[8], 4);
/* apply whitening (8 bytes) to whole sector */
for (i = 0; i < ((1 << SECTOR_SHIFT) / 8); i++)
crypto_xor(data + i * 8, buf, 8);
-out:
memzero_explicit(buf, sizeof(buf));
- return r;
}
static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv,
@@ -707,13 +688,12 @@ static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv,
struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
__le64 sector = cpu_to_le64(dmreq->iv_sector);
u8 *src;
- int r = 0;
/* Remove whitening from ciphertext */
if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) {
sg = crypt_get_sg_data(cc, dmreq->sg_in);
src = kmap_local_page(sg_page(sg));
- r = crypt_iv_tcw_whitening(cc, dmreq, src + sg->offset);
+ crypt_iv_tcw_whitening(cc, dmreq, src + sg->offset);
kunmap_local(src);
}
@@ -723,7 +703,7 @@ static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv,
crypto_xor_cpy(&iv[8], tcw->iv_seed + 8, (u8 *)&sector,
cc->iv_size - 8);
- return r;
+ return 0;
}
static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv,
@@ -731,7 +711,6 @@ static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv,
{
struct scatterlist *sg;
u8 *dst;
- int r;
if (bio_data_dir(dmreq->ctx->bio_in) != WRITE)
return 0;
@@ -739,10 +718,10 @@ static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv,
/* Apply whitening on ciphertext */
sg = crypt_get_sg_data(cc, dmreq->sg_out);
dst = kmap_local_page(sg_page(sg));
- r = crypt_iv_tcw_whitening(cc, dmreq, dst + sg->offset);
+ crypt_iv_tcw_whitening(cc, dmreq, dst + sg->offset);
kunmap_local(dst);
- return r;
+ return 0;
}
static int crypt_iv_random_gen(struct crypt_config *cc, u8 *iv,
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index 08f6387620c1..d4cf0ac2a7aa 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -369,6 +369,21 @@ static int delay_map(struct dm_target *ti, struct bio *bio)
return delay_bio(dc, c, bio);
}
+#ifdef CONFIG_BLK_DEV_ZONED
+static int delay_report_zones(struct dm_target *ti,
+ struct dm_report_zones_args *args, unsigned int nr_zones)
+{
+ struct delay_c *dc = ti->private;
+ struct delay_class *c = &dc->read;
+
+ return dm_report_zones(c->dev->bdev, c->start,
+ c->start + dm_target_offset(ti, args->next_sector),
+ args, nr_zones);
+}
+#else
+#define delay_report_zones NULL
+#endif
+
#define DMEMIT_DELAY_CLASS(c) \
DMEMIT("%s %llu %u", (c)->dev->name, (unsigned long long)(c)->start, (c)->delay)
@@ -424,11 +439,12 @@ out:
static struct target_type delay_target = {
.name = "delay",
.version = {1, 4, 0},
- .features = DM_TARGET_PASSES_INTEGRITY,
+ .features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_ZONED_HM,
.module = THIS_MODULE,
.ctr = delay_ctr,
.dtr = delay_dtr,
.map = delay_map,
+ .report_zones = delay_report_zones,
.presuspend = delay_presuspend,
.resume = delay_resume,
.status = delay_status,
diff --git a/drivers/md/dm-ebs-target.c b/drivers/md/dm-ebs-target.c
index 18ae45dcbfb2..b19b0142a690 100644
--- a/drivers/md/dm-ebs-target.c
+++ b/drivers/md/dm-ebs-target.c
@@ -390,6 +390,12 @@ static int ebs_map(struct dm_target *ti, struct bio *bio)
return DM_MAPIO_REMAPPED;
}
+static void ebs_postsuspend(struct dm_target *ti)
+{
+ struct ebs_c *ec = ti->private;
+ dm_bufio_client_reset(ec->bufio);
+}
+
static void ebs_status(struct dm_target *ti, status_type_t type,
unsigned int status_flags, char *result, unsigned int maxlen)
{
@@ -447,6 +453,7 @@ static struct target_type ebs_target = {
.ctr = ebs_ctr,
.dtr = ebs_dtr,
.map = ebs_map,
+ .postsuspend = ebs_postsuspend,
.status = ebs_status,
.io_hints = ebs_io_hints,
.prepare_ioctl = ebs_prepare_ioctl,
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index 731467d4ed10..b690905ab89f 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -426,7 +426,7 @@ static struct bio *clone_bio(struct dm_target *ti, struct flakey_c *fc, struct b
if (!clone)
return NULL;
- bio_init(clone, fc->dev->bdev, bio->bi_inline_vecs, nr_iovecs, bio->bi_opf);
+ bio_init(clone, fc->dev->bdev, clone->bi_inline_vecs, nr_iovecs, bio->bi_opf);
clone->bi_iter.bi_sector = flakey_map_sector(ti, bio->bi_iter.bi_sector);
clone->bi_private = bio;
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index ee9f7cecd78e..2a283feb3319 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -21,6 +21,7 @@
#include <linux/reboot.h>
#include <crypto/hash.h>
#include <crypto/skcipher.h>
+#include <crypto/utils.h>
#include <linux/async_tx.h>
#include <linux/dm-bufio.h>
@@ -516,7 +517,7 @@ static int sb_mac(struct dm_integrity_c *ic, bool wr)
dm_integrity_io_error(ic, "crypto_shash_digest", r);
return r;
}
- if (memcmp(mac, actual_mac, mac_size)) {
+ if (crypto_memneq(mac, actual_mac, mac_size)) {
dm_integrity_io_error(ic, "superblock mac", -EILSEQ);
dm_audit_log_target(DM_MSG_PREFIX, "mac-superblock", ic->ti, 0);
return -EILSEQ;
@@ -859,7 +860,7 @@ static void rw_section_mac(struct dm_integrity_c *ic, unsigned int section, bool
if (likely(wr))
memcpy(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR);
else {
- if (memcmp(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR)) {
+ if (crypto_memneq(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR)) {
dm_integrity_io_error(ic, "journal mac", -EILSEQ);
dm_audit_log_target(DM_MSG_PREFIX, "mac-journal", ic->ti, 0);
}
@@ -1401,10 +1402,9 @@ static bool find_newer_committed_node(struct dm_integrity_c *ic, struct journal_
static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, sector_t *metadata_block,
unsigned int *metadata_offset, unsigned int total_size, int op)
{
-#define MAY_BE_FILLER 1
-#define MAY_BE_HASH 2
unsigned int hash_offset = 0;
- unsigned int may_be = MAY_BE_HASH | (ic->discard ? MAY_BE_FILLER : 0);
+ unsigned char mismatch_hash = 0;
+ unsigned char mismatch_filler = !ic->discard;
do {
unsigned char *data, *dp;
@@ -1425,7 +1425,7 @@ static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, se
if (op == TAG_READ) {
memcpy(tag, dp, to_copy);
} else if (op == TAG_WRITE) {
- if (memcmp(dp, tag, to_copy)) {
+ if (crypto_memneq(dp, tag, to_copy)) {
memcpy(dp, tag, to_copy);
dm_bufio_mark_partial_buffer_dirty(b, *metadata_offset, *metadata_offset + to_copy);
}
@@ -1433,29 +1433,30 @@ static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, se
/* e.g.: op == TAG_CMP */
if (likely(is_power_of_2(ic->tag_size))) {
- if (unlikely(memcmp(dp, tag, to_copy)))
- if (unlikely(!ic->discard) ||
- unlikely(memchr_inv(dp, DISCARD_FILLER, to_copy) != NULL)) {
- goto thorough_test;
- }
+ if (unlikely(crypto_memneq(dp, tag, to_copy)))
+ goto thorough_test;
} else {
unsigned int i, ts;
thorough_test:
ts = total_size;
for (i = 0; i < to_copy; i++, ts--) {
- if (unlikely(dp[i] != tag[i]))
- may_be &= ~MAY_BE_HASH;
- if (likely(dp[i] != DISCARD_FILLER))
- may_be &= ~MAY_BE_FILLER;
+ /*
+ * Warning: the control flow must not be
+ * dependent on match/mismatch of
+ * individual bytes.
+ */
+ mismatch_hash |= dp[i] ^ tag[i];
+ mismatch_filler |= dp[i] ^ DISCARD_FILLER;
hash_offset++;
if (unlikely(hash_offset == ic->tag_size)) {
- if (unlikely(!may_be)) {
+ if (unlikely(mismatch_hash) && unlikely(mismatch_filler)) {
dm_bufio_release(b);
return ts;
}
hash_offset = 0;
- may_be = MAY_BE_HASH | (ic->discard ? MAY_BE_FILLER : 0);
+ mismatch_hash = 0;
+ mismatch_filler = !ic->discard;
}
}
}
@@ -1476,8 +1477,6 @@ thorough_test:
} while (unlikely(total_size));
return 0;
-#undef MAY_BE_FILLER
-#undef MAY_BE_HASH
}
struct flush_request {
@@ -2076,7 +2075,7 @@ retry_kmap:
char checksums_onstack[MAX_T(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)];
integrity_sector_checksum(ic, logical_sector, mem + bv.bv_offset, checksums_onstack);
- if (unlikely(memcmp(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) {
+ if (unlikely(crypto_memneq(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) {
DMERR_LIMIT("Checksum failed when reading from journal, at sector 0x%llx",
logical_sector);
dm_audit_log_bio(DM_MSG_PREFIX, "journal-checksum",
@@ -2595,7 +2594,7 @@ static void dm_integrity_inline_recheck(struct work_struct *w)
bio_put(outgoing_bio);
integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, outgoing_data, digest);
- if (unlikely(memcmp(digest, dio->integrity_payload, min(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)))) {
+ if (unlikely(crypto_memneq(digest, dio->integrity_payload, min(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)))) {
DMERR_LIMIT("%pg: Checksum failed at sector 0x%llx",
ic->dev->bdev, dio->bio_details.bi_iter.bi_sector);
atomic64_inc(&ic->number_of_mismatches);
@@ -2634,7 +2633,7 @@ static int dm_integrity_end_io(struct dm_target *ti, struct bio *bio, blk_status
char *mem = bvec_kmap_local(&bv);
//memset(mem, 0xff, ic->sectors_per_block << SECTOR_SHIFT);
integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, mem, digest);
- if (unlikely(memcmp(digest, dio->integrity_payload + pos,
+ if (unlikely(crypto_memneq(digest, dio->integrity_payload + pos,
min(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)))) {
kunmap_local(mem);
dm_integrity_free_payload(dio);
@@ -2708,7 +2707,7 @@ static void integrity_commit(struct work_struct *w)
unsigned int i, j, n;
struct bio *flushes;
- del_timer(&ic->autocommit_timer);
+ timer_delete(&ic->autocommit_timer);
if (ic->mode == 'I')
return;
@@ -2911,7 +2910,7 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned int write_start
integrity_sector_checksum(ic, sec + ((l - j) << ic->sb->log2_sectors_per_block),
(char *)access_journal_data(ic, i, l), test_tag);
- if (unlikely(memcmp(test_tag, journal_entry_tag(ic, je2), ic->tag_size))) {
+ if (unlikely(crypto_memneq(test_tag, journal_entry_tag(ic, je2), ic->tag_size))) {
dm_integrity_io_error(ic, "tag mismatch when replaying journal", -EILSEQ);
dm_audit_log_target(DM_MSG_PREFIX, "integrity-replay-journal", ic->ti, 0);
}
@@ -3607,7 +3606,7 @@ static void dm_integrity_postsuspend(struct dm_target *ti)
WARN_ON(unregister_reboot_notifier(&ic->reboot_notifier));
- del_timer_sync(&ic->autocommit_timer);
+ timer_delete_sync(&ic->autocommit_timer);
if (ic->recalc_wq)
drain_workqueue(ic->recalc_wq);
@@ -3790,20 +3789,18 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type,
break;
case STATUSTYPE_TABLE: {
- __u64 watermark_percentage = (__u64)(ic->journal_entries - ic->free_sectors_threshold) * 100;
-
- watermark_percentage += ic->journal_entries / 2;
- do_div(watermark_percentage, ic->journal_entries);
- arg_count = 3;
+ arg_count = 1; /* buffer_sectors */
arg_count += !!ic->meta_dev;
arg_count += ic->sectors_per_block != 1;
arg_count += !!(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING));
arg_count += ic->reset_recalculate_flag;
arg_count += ic->discard;
- arg_count += ic->mode == 'J';
- arg_count += ic->mode == 'J';
- arg_count += ic->mode == 'B';
- arg_count += ic->mode == 'B';
+ arg_count += ic->mode != 'I'; /* interleave_sectors */
+ arg_count += ic->mode == 'J'; /* journal_sectors */
+ arg_count += ic->mode == 'J'; /* journal_watermark */
+ arg_count += ic->mode == 'J'; /* commit_time */
+ arg_count += ic->mode == 'B'; /* sectors_per_bit */
+ arg_count += ic->mode == 'B'; /* bitmap_flush_interval */
arg_count += !!ic->internal_hash_alg.alg_string;
arg_count += !!ic->journal_crypt_alg.alg_string;
arg_count += !!ic->journal_mac_alg.alg_string;
@@ -3822,10 +3819,15 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type,
DMEMIT(" reset_recalculate");
if (ic->discard)
DMEMIT(" allow_discards");
- DMEMIT(" journal_sectors:%u", ic->initial_sectors - SB_SECTORS);
- DMEMIT(" interleave_sectors:%u", 1U << ic->sb->log2_interleave_sectors);
+ if (ic->mode != 'I')
+ DMEMIT(" interleave_sectors:%u", 1U << ic->sb->log2_interleave_sectors);
DMEMIT(" buffer_sectors:%u", 1U << ic->log2_buffer_sectors);
if (ic->mode == 'J') {
+ __u64 watermark_percentage = (__u64)(ic->journal_entries - ic->free_sectors_threshold) * 100;
+
+ watermark_percentage += ic->journal_entries / 2;
+ do_div(watermark_percentage, ic->journal_entries);
+ DMEMIT(" journal_sectors:%u", ic->initial_sectors - SB_SECTORS);
DMEMIT(" journal_watermark:%u", (unsigned int)watermark_percentage);
DMEMIT(" commit_time:%u", ic->autocommit_msec);
}
@@ -4808,23 +4810,11 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv
ti->error = "Cannot allocate bio set";
goto bad;
}
- r = bioset_integrity_create(&ic->recheck_bios, RECHECK_POOL_SIZE);
- if (r) {
- ti->error = "Cannot allocate bio integrity set";
- r = -ENOMEM;
- goto bad;
- }
r = bioset_init(&ic->recalc_bios, 1, 0, BIOSET_NEED_BVECS);
if (r) {
ti->error = "Cannot allocate bio set";
goto bad;
}
- r = bioset_integrity_create(&ic->recalc_bios, 1);
- if (r) {
- ti->error = "Cannot allocate bio integrity set";
- r = -ENOMEM;
- goto bad;
- }
}
ic->metadata_wq = alloc_workqueue("dm-integrity-metadata",
@@ -5081,16 +5071,19 @@ try_smaller_buffer:
ic->recalc_bitmap = dm_integrity_alloc_page_list(n_bitmap_pages);
if (!ic->recalc_bitmap) {
+ ti->error = "Could not allocate memory for bitmap";
r = -ENOMEM;
goto bad;
}
ic->may_write_bitmap = dm_integrity_alloc_page_list(n_bitmap_pages);
if (!ic->may_write_bitmap) {
+ ti->error = "Could not allocate memory for bitmap";
r = -ENOMEM;
goto bad;
}
ic->bbs = kvmalloc_array(ic->n_bitmap_blocks, sizeof(struct bitmap_block_status), GFP_KERNEL);
if (!ic->bbs) {
+ ti->error = "Could not allocate memory for bitmap";
r = -ENOMEM;
goto bad;
}
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 637977acc3dc..6c98f4ae5ea9 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -815,7 +815,7 @@ static void enable_nopath_timeout(struct multipath *m)
static void disable_nopath_timeout(struct multipath *m)
{
- del_timer_sync(&m->nopath_timer);
+ timer_delete_sync(&m->nopath_timer);
}
/*
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 8c6f1f7e6456..9e615b4f1f5e 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -1182,7 +1182,7 @@ static void mirror_dtr(struct dm_target *ti)
{
struct mirror_set *ms = ti->private;
- del_timer_sync(&ms->timer);
+ timer_delete_sync(&ms->timer);
flush_workqueue(ms->kmirrord_wq);
flush_work(&ms->trigger_event);
dm_kcopyd_client_destroy(ms->kcopyd_client);
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 3786ac67cefe..a1b7535c508a 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -467,7 +467,7 @@ static struct target_type stripe_target = {
.name = "striped",
.version = {1, 7, 0},
.features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_NOWAIT |
- DM_TARGET_ATOMIC_WRITES,
+ DM_TARGET_ATOMIC_WRITES | DM_TARGET_PASSES_CRYPTO,
.module = THIS_MODULE,
.ctr = stripe_ctr,
.dtr = stripe_dtr,
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 0ef5203387b2..35100a435c88 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -697,6 +697,10 @@ int dm_table_add_target(struct dm_table *t, const char *type,
DMERR("%s: zero-length target", dm_device_name(t->md));
return -EINVAL;
}
+ if (start + len < start || start + len > LLONG_MAX >> SECTOR_SHIFT) {
+ DMERR("%s: too large device", dm_device_name(t->md));
+ return -EINVAL;
+ }
ti->type = dm_get_target_type(type);
if (!ti->type) {
@@ -1081,15 +1085,9 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *
__alignof__(struct dm_io)) + DM_IO_BIO_OFFSET;
if (bioset_init(&pools->io_bs, pool_size, io_front_pad, bioset_flags))
goto out_free_pools;
- if (mempool_needs_integrity &&
- bioset_integrity_create(&pools->io_bs, pool_size))
- goto out_free_pools;
init_bs:
if (bioset_init(&pools->bs, pool_size, front_pad, 0))
goto out_free_pools;
- if (mempool_needs_integrity &&
- bioset_integrity_create(&pools->bs, pool_size))
- goto out_free_pools;
t->mempools = pools;
return 0;
@@ -1250,6 +1248,7 @@ static int dm_table_construct_crypto_profile(struct dm_table *t)
profile->max_dun_bytes_supported = UINT_MAX;
memset(profile->modes_supported, 0xFF,
sizeof(profile->modes_supported));
+ profile->key_types_supported = ~0;
for (i = 0; i < t->num_targets; i++) {
struct dm_target *ti = dm_table_get_target(t, i);
diff --git a/drivers/md/dm-vdo/block-map.c b/drivers/md/dm-vdo/block-map.c
index 89cb7942ec5c..baf683cabb1b 100644
--- a/drivers/md/dm-vdo/block-map.c
+++ b/drivers/md/dm-vdo/block-map.c
@@ -451,7 +451,7 @@ static struct page_info * __must_check find_page(struct vdo_page_cache *cache,
* select_lru_page() - Determine which page is least recently used.
*
* Picks the least recently used from among the non-busy entries at the front of each of the lru
- * ring. Since whenever we mark a page busy we also put it to the end of the ring it is unlikely
+ * list. Since whenever we mark a page busy we also put it to the end of the list it is unlikely
* that the entries at the front are busy unless the queue is very short, but not impossible.
*
* Return: A pointer to the info structure for a relevant page, or NULL if no such page can be
@@ -1544,7 +1544,7 @@ static void write_page_if_not_dirtied(struct vdo_waiter *waiter, void *context)
static void return_to_pool(struct block_map_zone *zone, struct pooled_vio *vio)
{
- return_vio_to_pool(zone->vio_pool, vio);
+ return_vio_to_pool(vio);
check_for_drain_complete(zone);
}
@@ -1837,7 +1837,7 @@ static void finish_block_map_page_load(struct vdo_completion *completion)
if (!vdo_copy_valid_page(vio->data, nonce, pbn, page))
vdo_format_block_map_page(page, nonce, pbn, false);
- return_vio_to_pool(zone->vio_pool, pooled);
+ return_vio_to_pool(pooled);
/* Release our claim to the load and wake any waiters */
release_page_lock(data_vio, "load");
@@ -1851,10 +1851,9 @@ static void handle_io_error(struct vdo_completion *completion)
struct vio *vio = as_vio(completion);
struct pooled_vio *pooled = container_of(vio, struct pooled_vio, vio);
struct data_vio *data_vio = completion->parent;
- struct block_map_zone *zone = pooled->context;
vio_record_metadata_io_error(vio);
- return_vio_to_pool(zone->vio_pool, pooled);
+ return_vio_to_pool(pooled);
abort_load(data_vio, result);
}
@@ -2499,7 +2498,7 @@ static void finish_cursor(struct cursor *cursor)
struct cursors *cursors = cursor->parent;
struct vdo_completion *completion = cursors->completion;
- return_vio_to_pool(cursors->pool, vdo_forget(cursor->vio));
+ return_vio_to_pool(vdo_forget(cursor->vio));
if (--cursors->active_roots > 0)
return;
@@ -2746,7 +2745,7 @@ static int __must_check initialize_block_map_zone(struct block_map *map,
if (result != VDO_SUCCESS)
return result;
- result = make_vio_pool(vdo, BLOCK_MAP_VIO_POOL_SIZE,
+ result = make_vio_pool(vdo, BLOCK_MAP_VIO_POOL_SIZE, 1,
zone->thread_id, VIO_TYPE_BLOCK_MAP_INTERIOR,
VIO_PRIORITY_METADATA, zone, &zone->vio_pool);
if (result != VDO_SUCCESS)
diff --git a/drivers/md/dm-vdo/constants.h b/drivers/md/dm-vdo/constants.h
index a8c4d6e24b38..2a8b03779f87 100644
--- a/drivers/md/dm-vdo/constants.h
+++ b/drivers/md/dm-vdo/constants.h
@@ -44,9 +44,6 @@ enum {
/* The default size of each slab journal, in blocks */
DEFAULT_VDO_SLAB_JOURNAL_SIZE = 224,
- /* Unit test minimum */
- MINIMUM_VDO_SLAB_JOURNAL_BLOCKS = 2,
-
/*
* The initial size of lbn_operations and pbn_operations, which is based upon the expected
* maximum number of outstanding VIOs. This value was chosen to make it highly unlikely
diff --git a/drivers/md/dm-vdo/dedupe.c b/drivers/md/dm-vdo/dedupe.c
index b6f8e2dc7729..3c58b941e067 100644
--- a/drivers/md/dm-vdo/dedupe.c
+++ b/drivers/md/dm-vdo/dedupe.c
@@ -226,7 +226,7 @@ struct hash_lock {
* A list containing the data VIOs sharing this lock, all having the same record name and
* data block contents, linked by their hash_lock_node fields.
*/
- struct list_head duplicate_ring;
+ struct list_head duplicate_vios;
/* The number of data_vios sharing this lock instance */
data_vio_count_t reference_count;
@@ -343,7 +343,7 @@ static void return_hash_lock_to_pool(struct hash_zone *zone, struct hash_lock *l
{
memset(lock, 0, sizeof(*lock));
INIT_LIST_HEAD(&lock->pool_node);
- INIT_LIST_HEAD(&lock->duplicate_ring);
+ INIT_LIST_HEAD(&lock->duplicate_vios);
vdo_waitq_init(&lock->waiters);
list_add_tail(&lock->pool_node, &zone->lock_pool);
}
@@ -441,7 +441,7 @@ static void set_hash_lock(struct data_vio *data_vio, struct hash_lock *new_lock)
VDO_ASSERT_LOG_ONLY(data_vio->hash_zone != NULL,
"must have a hash zone when holding a hash lock");
VDO_ASSERT_LOG_ONLY(!list_empty(&data_vio->hash_lock_entry),
- "must be on a hash lock ring when holding a hash lock");
+ "must be on a hash lock list when holding a hash lock");
VDO_ASSERT_LOG_ONLY(old_lock->reference_count > 0,
"hash lock reference must be counted");
@@ -464,10 +464,10 @@ static void set_hash_lock(struct data_vio *data_vio, struct hash_lock *new_lock)
if (new_lock != NULL) {
/*
- * Keep all data_vios sharing the lock on a ring since they can complete in any
+ * Keep all data_vios sharing the lock on a list since they can complete in any
* order and we'll always need a pointer to one to compare data.
*/
- list_move_tail(&data_vio->hash_lock_entry, &new_lock->duplicate_ring);
+ list_move_tail(&data_vio->hash_lock_entry, &new_lock->duplicate_vios);
new_lock->reference_count += 1;
if (new_lock->max_references < new_lock->reference_count)
new_lock->max_references = new_lock->reference_count;
@@ -1789,10 +1789,10 @@ static bool is_hash_collision(struct hash_lock *lock, struct data_vio *candidate
struct hash_zone *zone;
bool collides;
- if (list_empty(&lock->duplicate_ring))
+ if (list_empty(&lock->duplicate_vios))
return false;
- lock_holder = list_first_entry(&lock->duplicate_ring, struct data_vio,
+ lock_holder = list_first_entry(&lock->duplicate_vios, struct data_vio,
hash_lock_entry);
zone = candidate->hash_zone;
collides = !blocks_equal(lock_holder->vio.data, candidate->vio.data);
@@ -1815,7 +1815,7 @@ static inline int assert_hash_lock_preconditions(const struct data_vio *data_vio
return result;
result = VDO_ASSERT(list_empty(&data_vio->hash_lock_entry),
- "must not already be a member of a hash lock ring");
+ "must not already be a member of a hash lock list");
if (result != VDO_SUCCESS)
return result;
@@ -1942,8 +1942,8 @@ void vdo_release_hash_lock(struct data_vio *data_vio)
"returned hash lock must not be in use with state %s",
get_hash_lock_state_name(lock->state));
VDO_ASSERT_LOG_ONLY(list_empty(&lock->pool_node),
- "hash lock returned to zone must not be in a pool ring");
- VDO_ASSERT_LOG_ONLY(list_empty(&lock->duplicate_ring),
+ "hash lock returned to zone must not be in a pool list");
+ VDO_ASSERT_LOG_ONLY(list_empty(&lock->duplicate_vios),
"hash lock returned to zone must not reference DataVIOs");
return_hash_lock_to_pool(zone, lock);
@@ -2178,6 +2178,7 @@ static int initialize_index(struct vdo *vdo, struct hash_zones *zones)
vdo_set_dedupe_index_timeout_interval(vdo_dedupe_index_timeout_interval);
vdo_set_dedupe_index_min_timer_interval(vdo_dedupe_index_min_timer_interval);
+ spin_lock_init(&zones->lock);
/*
* Since we will save up the timeouts that would have been reported but were ratelimited,
@@ -2260,7 +2261,7 @@ static void check_for_drain_complete(struct hash_zone *zone)
if ((atomic_read(&zone->timer_state) == DEDUPE_QUERY_TIMER_IDLE) ||
change_timer_state(zone, DEDUPE_QUERY_TIMER_RUNNING,
DEDUPE_QUERY_TIMER_IDLE)) {
- del_timer_sync(&zone->timer);
+ timer_delete_sync(&zone->timer);
} else {
/*
* There is an in flight time-out, which must get processed before we can continue.
diff --git a/drivers/md/dm-vdo/encodings.c b/drivers/md/dm-vdo/encodings.c
index 100e92f8f866..b7cc0f41caca 100644
--- a/drivers/md/dm-vdo/encodings.c
+++ b/drivers/md/dm-vdo/encodings.c
@@ -711,24 +711,11 @@ int vdo_configure_slab(block_count_t slab_size, block_count_t slab_journal_block
ref_blocks = vdo_get_saved_reference_count_size(slab_size - slab_journal_blocks);
meta_blocks = (ref_blocks + slab_journal_blocks);
- /* Make sure test code hasn't configured slabs to be too small. */
+ /* Make sure configured slabs are not too small. */
if (meta_blocks >= slab_size)
return VDO_BAD_CONFIGURATION;
- /*
- * If the slab size is very small, assume this must be a unit test and override the number
- * of data blocks to be a power of two (wasting blocks in the slab). Many tests need their
- * data_blocks fields to be the exact capacity of the configured volume, and that used to
- * fall out since they use a power of two for the number of data blocks, the slab size was
- * a power of two, and every block in a slab was a data block.
- *
- * TODO: Try to figure out some way of structuring testParameters and unit tests so this
- * hack isn't needed without having to edit several unit tests every time the metadata size
- * changes by one block.
- */
data_blocks = slab_size - meta_blocks;
- if ((slab_size < 1024) && !is_power_of_2(data_blocks))
- data_blocks = ((block_count_t) 1 << ilog2(data_blocks));
/*
* Configure the slab journal thresholds. The flush threshold is 168 of 224 blocks in
@@ -1221,11 +1208,6 @@ int vdo_validate_config(const struct vdo_config *config,
if (result != VDO_SUCCESS)
return result;
- result = VDO_ASSERT(config->slab_journal_blocks >= MINIMUM_VDO_SLAB_JOURNAL_BLOCKS,
- "slab journal size meets minimum size");
- if (result != VDO_SUCCESS)
- return result;
-
result = VDO_ASSERT(config->slab_journal_blocks <= config->slab_size,
"slab journal size is within expected bound");
if (result != VDO_SUCCESS)
diff --git a/drivers/md/dm-vdo/indexer/index-layout.c b/drivers/md/dm-vdo/indexer/index-layout.c
index af8fab83b0f3..61edf2b72427 100644
--- a/drivers/md/dm-vdo/indexer/index-layout.c
+++ b/drivers/md/dm-vdo/indexer/index-layout.c
@@ -54,7 +54,6 @@
* Each save also has a unique nonce.
*/
-#define MAGIC_SIZE 32
#define NONCE_INFO_SIZE 32
#define MAX_SAVES 2
@@ -98,9 +97,11 @@ enum region_type {
#define SUPER_VERSION_CURRENT 3
#define SUPER_VERSION_MAXIMUM 7
-static const u8 LAYOUT_MAGIC[MAGIC_SIZE] = "*ALBIREO*SINGLE*FILE*LAYOUT*001*";
+static const u8 LAYOUT_MAGIC[] = "*ALBIREO*SINGLE*FILE*LAYOUT*001*";
static const u64 REGION_MAGIC = 0x416c6252676e3031; /* 'AlbRgn01' */
+#define MAGIC_SIZE (sizeof(LAYOUT_MAGIC) - 1)
+
struct region_header {
u64 magic;
u64 region_blocks;
diff --git a/drivers/md/dm-vdo/indexer/index-session.c b/drivers/md/dm-vdo/indexer/index-session.c
index aee0914d604a..aa575a24e0b2 100644
--- a/drivers/md/dm-vdo/indexer/index-session.c
+++ b/drivers/md/dm-vdo/indexer/index-session.c
@@ -100,7 +100,6 @@ static int get_index_session(struct uds_index_session *index_session)
int uds_launch_request(struct uds_request *request)
{
- size_t internal_size;
int result;
if (request->callback == NULL) {
@@ -121,10 +120,7 @@ int uds_launch_request(struct uds_request *request)
}
/* Reset all internal fields before processing. */
- internal_size =
- sizeof(struct uds_request) - offsetof(struct uds_request, zone_number);
- // FIXME should be using struct_group for this instead
- memset((char *) request + sizeof(*request) - internal_size, 0, internal_size);
+ memset(&request->internal, 0, sizeof(request->internal));
result = get_index_session(request->session);
if (result != UDS_SUCCESS)
diff --git a/drivers/md/dm-vdo/indexer/indexer.h b/drivers/md/dm-vdo/indexer/indexer.h
index 183a94eb7e92..7c1fc4577f5b 100644
--- a/drivers/md/dm-vdo/indexer/indexer.h
+++ b/drivers/md/dm-vdo/indexer/indexer.h
@@ -8,6 +8,7 @@
#include <linux/mutex.h>
#include <linux/sched.h>
+#include <linux/stddef.h>
#include <linux/types.h>
#include <linux/wait.h>
@@ -73,7 +74,7 @@ enum uds_request_type {
/* Remove any mapping for a name. */
UDS_DELETE,
-};
+} __packed;
enum uds_open_index_type {
/* Create a new index. */
@@ -226,7 +227,7 @@ struct uds_zone_message {
enum uds_zone_message_type type;
/* The virtual chapter number to which the message applies */
u64 virtual_chapter;
-};
+} __packed;
struct uds_index_session;
struct uds_index;
@@ -253,34 +254,32 @@ struct uds_request {
/* The existing data associated with the request name, if any */
struct uds_record_data old_metadata;
- /* Either UDS_SUCCESS or an error code for the request */
- int status;
/* True if the record name had an existing entry in the index */
bool found;
+ /* Either UDS_SUCCESS or an error code for the request */
+ int status;
- /*
- * The remaining fields are used internally and should not be altered by clients. The index
- * relies on zone_number being the first field in this section.
- */
-
- /* The number of the zone which will process this request*/
- unsigned int zone_number;
- /* A link for adding a request to a lock-free queue */
- struct funnel_queue_entry queue_link;
- /* A link for adding a request to a standard linked list */
- struct uds_request *next_request;
- /* A pointer to the index processing this request */
- struct uds_index *index;
- /* Control message for coordinating between zones */
- struct uds_zone_message zone_message;
- /* If true, process request immediately by waking the worker thread */
- bool unbatched;
- /* If true, continue this request before processing newer requests */
- bool requeued;
- /* The virtual chapter containing the record name, if known */
- u64 virtual_chapter;
- /* The region of the index containing the record name */
- enum uds_index_region location;
+ /* The remaining fields are used internally and should not be altered by clients. */
+ struct_group(internal,
+ /* The virtual chapter containing the record name, if known */
+ u64 virtual_chapter;
+ /* The region of the index containing the record name */
+ enum uds_index_region location;
+ /* If true, process request immediately by waking the worker thread */
+ bool unbatched;
+ /* If true, continue this request before processing newer requests */
+ bool requeued;
+ /* Control message for coordinating between zones */
+ struct uds_zone_message zone_message;
+ /* The number of the zone which will process this request */
+ unsigned int zone_number;
+ /* A link for adding a request to a lock-free queue */
+ struct funnel_queue_entry queue_link;
+ /* A link for adding a request to a standard linked list */
+ struct uds_request *next_request;
+ /* A pointer to the index processing this request */
+ struct uds_index *index;
+ );
};
/* A session is required for most index operations. */
diff --git a/drivers/md/dm-vdo/io-submitter.c b/drivers/md/dm-vdo/io-submitter.c
index 421e5436c32c..11d47770b54d 100644
--- a/drivers/md/dm-vdo/io-submitter.c
+++ b/drivers/md/dm-vdo/io-submitter.c
@@ -327,6 +327,7 @@ void vdo_submit_data_vio(struct data_vio *data_vio)
* @error_handler: the handler for submission or I/O errors (may be NULL)
* @operation: the type of I/O to perform
* @data: the buffer to read or write (may be NULL)
+ * @size: the I/O amount in bytes
*
* The vio is enqueued on a vdo bio queue so that bio submission (which may block) does not block
* other vdo threads.
@@ -338,7 +339,7 @@ void vdo_submit_data_vio(struct data_vio *data_vio)
*/
void __submit_metadata_vio(struct vio *vio, physical_block_number_t physical,
bio_end_io_t callback, vdo_action_fn error_handler,
- blk_opf_t operation, char *data)
+ blk_opf_t operation, char *data, int size)
{
int result;
struct vdo_completion *completion = &vio->completion;
@@ -349,7 +350,8 @@ void __submit_metadata_vio(struct vio *vio, physical_block_number_t physical,
vdo_reset_completion(completion);
completion->error_handler = error_handler;
- result = vio_reset_bio(vio, data, callback, operation | REQ_META, physical);
+ result = vio_reset_bio_with_size(vio, data, size, callback, operation | REQ_META,
+ physical);
if (result != VDO_SUCCESS) {
continue_vio(vio, result);
return;
diff --git a/drivers/md/dm-vdo/io-submitter.h b/drivers/md/dm-vdo/io-submitter.h
index 80748699496f..3088f11055fd 100644
--- a/drivers/md/dm-vdo/io-submitter.h
+++ b/drivers/md/dm-vdo/io-submitter.h
@@ -8,6 +8,7 @@
#include <linux/bio.h>
+#include "constants.h"
#include "types.h"
struct io_submitter;
@@ -26,14 +27,25 @@ void vdo_submit_data_vio(struct data_vio *data_vio);
void __submit_metadata_vio(struct vio *vio, physical_block_number_t physical,
bio_end_io_t callback, vdo_action_fn error_handler,
- blk_opf_t operation, char *data);
+ blk_opf_t operation, char *data, int size);
static inline void vdo_submit_metadata_vio(struct vio *vio, physical_block_number_t physical,
bio_end_io_t callback, vdo_action_fn error_handler,
blk_opf_t operation)
{
__submit_metadata_vio(vio, physical, callback, error_handler,
- operation, vio->data);
+ operation, vio->data, vio->block_count * VDO_BLOCK_SIZE);
+}
+
+static inline void vdo_submit_metadata_vio_with_size(struct vio *vio,
+ physical_block_number_t physical,
+ bio_end_io_t callback,
+ vdo_action_fn error_handler,
+ blk_opf_t operation,
+ int size)
+{
+ __submit_metadata_vio(vio, physical, callback, error_handler,
+ operation, vio->data, size);
}
static inline void vdo_submit_flush_vio(struct vio *vio, bio_end_io_t callback,
@@ -41,7 +53,7 @@ static inline void vdo_submit_flush_vio(struct vio *vio, bio_end_io_t callback,
{
/* FIXME: Can we just use REQ_OP_FLUSH? */
__submit_metadata_vio(vio, 0, callback, error_handler,
- REQ_OP_WRITE | REQ_PREFLUSH, NULL);
+ REQ_OP_WRITE | REQ_PREFLUSH, NULL, 0);
}
#endif /* VDO_IO_SUBMITTER_H */
diff --git a/drivers/md/dm-vdo/packer.h b/drivers/md/dm-vdo/packer.h
index 0f3be44710b5..8c8d6892582d 100644
--- a/drivers/md/dm-vdo/packer.h
+++ b/drivers/md/dm-vdo/packer.h
@@ -46,7 +46,7 @@ struct compressed_block {
/*
* Each packer_bin holds an incomplete batch of data_vios that only partially fill a compressed
- * block. The bins are kept in a ring sorted by the amount of unused space so the first bin with
+ * block. The bins are kept in a list sorted by the amount of unused space so the first bin with
* enough space to hold a newly-compressed data_vio can easily be found. When the bin fills up or
* is flushed, the first uncanceled data_vio in the bin is selected to be the agent for that bin.
* Upon entering the packer, each data_vio already has its compressed data in the first slot of the
diff --git a/drivers/md/dm-vdo/priority-table.c b/drivers/md/dm-vdo/priority-table.c
index 42d3d8d0e4b5..9bae8256ba4e 100644
--- a/drivers/md/dm-vdo/priority-table.c
+++ b/drivers/md/dm-vdo/priority-table.c
@@ -199,7 +199,7 @@ void vdo_priority_table_remove(struct priority_table *table, struct list_head *e
/*
* Remove the entry from the bucket list, remembering a pointer to another entry in the
- * ring.
+ * list.
*/
next_entry = entry->next;
list_del_init(entry);
diff --git a/drivers/md/dm-vdo/recovery-journal.h b/drivers/md/dm-vdo/recovery-journal.h
index 899071173015..25e7ec6d19f6 100644
--- a/drivers/md/dm-vdo/recovery-journal.h
+++ b/drivers/md/dm-vdo/recovery-journal.h
@@ -43,9 +43,9 @@
* has a vio which is used to commit that block to disk. The vio's data is the on-disk
* representation of the journal block. In addition each in-memory block has a buffer which is used
* to accumulate entries while a partial commit of the block is in progress. In-memory blocks are
- * kept on two rings. Free blocks live on the 'free_tail_blocks' ring. When a block becomes active
- * (see below) it is moved to the 'active_tail_blocks' ring. When a block is fully committed, it is
- * moved back to the 'free_tail_blocks' ring.
+ * kept on two lists. Free blocks live on the 'free_tail_blocks' list. When a block becomes active
+ * (see below) it is moved to the 'active_tail_blocks' list. When a block is fully committed, it is
+ * moved back to the 'free_tail_blocks' list.
*
* When entries are added to the journal, they are added to the active in-memory block, as
* indicated by the 'active_block' field. If the caller wishes to wait for the entry to be
diff --git a/drivers/md/dm-vdo/slab-depot.c b/drivers/md/dm-vdo/slab-depot.c
index 8f0a35c63af6..f3d80ff7bef5 100644
--- a/drivers/md/dm-vdo/slab-depot.c
+++ b/drivers/md/dm-vdo/slab-depot.c
@@ -139,7 +139,7 @@ static bool is_slab_journal_blank(const struct vdo_slab *slab)
}
/**
- * mark_slab_journal_dirty() - Put a slab journal on the dirty ring of its allocator in the correct
+ * mark_slab_journal_dirty() - Put a slab journal on the dirty list of its allocator in the correct
* order.
* @journal: The journal to be marked dirty.
* @lock: The recovery journal lock held by the slab journal.
@@ -414,8 +414,7 @@ static void complete_reaping(struct vdo_completion *completion)
{
struct slab_journal *journal = completion->parent;
- return_vio_to_pool(journal->slab->allocator->vio_pool,
- vio_as_pooled_vio(as_vio(vdo_forget(completion))));
+ return_vio_to_pool(vio_as_pooled_vio(as_vio(completion)));
finish_reaping(journal);
reap_slab_journal(journal);
}
@@ -698,7 +697,7 @@ static void complete_write(struct vdo_completion *completion)
sequence_number_t committed = get_committing_sequence_number(pooled);
list_del_init(&pooled->list_entry);
- return_vio_to_pool(journal->slab->allocator->vio_pool, vdo_forget(pooled));
+ return_vio_to_pool(pooled);
if (result != VDO_SUCCESS) {
vio_record_metadata_io_error(as_vio(completion));
@@ -822,7 +821,7 @@ static void commit_tail(struct slab_journal *journal)
/*
* Since we are about to commit the tail block, this journal no longer needs to be on the
- * ring of journals which the recovery journal might ask to commit.
+ * list of journals which the recovery journal might ask to commit.
*/
mark_slab_journal_clean(journal);
@@ -1076,7 +1075,7 @@ static void finish_reference_block_write(struct vdo_completion *completion)
/* Release the slab journal lock. */
adjust_slab_journal_block_reference(&slab->journal,
block->slab_journal_lock_to_release, -1);
- return_vio_to_pool(slab->allocator->vio_pool, pooled);
+ return_vio_to_pool(pooled);
/*
* We can't clear the is_writing flag earlier as releasing the slab journal lock may cause
@@ -1170,8 +1169,8 @@ static void handle_io_error(struct vdo_completion *completion)
struct vdo_slab *slab = ((struct reference_block *) completion->parent)->slab;
vio_record_metadata_io_error(vio);
- return_vio_to_pool(slab->allocator->vio_pool, vio_as_pooled_vio(vio));
- slab->active_count--;
+ return_vio_to_pool(vio_as_pooled_vio(vio));
+ slab->active_count -= vio->io_size / VDO_BLOCK_SIZE;
vdo_enter_read_only_mode(slab->allocator->depot->vdo, result);
check_if_slab_drained(slab);
}
@@ -1372,7 +1371,7 @@ static unsigned int calculate_slab_priority(struct vdo_slab *slab)
static void prioritize_slab(struct vdo_slab *slab)
{
VDO_ASSERT_LOG_ONLY(list_empty(&slab->allocq_entry),
- "a slab must not already be on a ring when prioritizing");
+ "a slab must not already be on a list when prioritizing");
slab->priority = calculate_slab_priority(slab);
vdo_priority_table_enqueue(slab->allocator->prioritized_slabs,
slab->priority, &slab->allocq_entry);
@@ -2165,28 +2164,95 @@ static void dirty_all_reference_blocks(struct vdo_slab *slab)
dirty_block(&slab->reference_blocks[i]);
}
+static inline bool journal_points_equal(struct journal_point first,
+ struct journal_point second)
+{
+ return ((first.sequence_number == second.sequence_number) &&
+ (first.entry_count == second.entry_count));
+}
+
/**
- * clear_provisional_references() - Clear the provisional reference counts from a reference block.
- * @block: The block to clear.
+ * match_bytes() - Check an 8-byte word for bytes matching the value specified
+ * @input: A word to examine the bytes of
+ * @match: The byte value sought
+ *
+ * Return: 1 in each byte when the corresponding input byte matched, 0 otherwise
*/
-static void clear_provisional_references(struct reference_block *block)
+static inline u64 match_bytes(u64 input, u8 match)
{
- vdo_refcount_t *counters = get_reference_counters_for_block(block);
- block_count_t j;
+ u64 temp = input ^ (match * 0x0101010101010101ULL);
+ /* top bit of each byte is set iff top bit of temp byte is clear; rest are 0 */
+ u64 test_top_bits = ~temp & 0x8080808080808080ULL;
+ /* top bit of each byte is set iff low 7 bits of temp byte are clear; rest are useless */
+ u64 test_low_bits = 0x8080808080808080ULL - (temp & 0x7f7f7f7f7f7f7f7fULL);
+ /* return 1 when both tests indicate temp byte is 0 */
+ return (test_top_bits & test_low_bits) >> 7;
+}
+
+/**
+ * count_valid_references() - Process a newly loaded refcount array
+ * @counters: the array of counters from a metadata block
+ *
+ * Scan an 8-byte-aligned array of counters, fixing up any "provisional" values that weren't
+ * cleaned up at shutdown, changing them internally to "empty".
+ *
+ * Return: the number of blocks that are referenced (counters not "empty")
+ */
+static unsigned int count_valid_references(vdo_refcount_t *counters)
+{
+ u64 *words = (u64 *)counters;
+ /* It's easier to count occurrences of a specific byte than its absences. */
+ unsigned int empty_count = 0;
+ /* For speed, we process 8 bytes at once. */
+ unsigned int words_left = COUNTS_PER_BLOCK / sizeof(u64);
+
+ /*
+ * Sanity check assumptions used for optimizing this code: Counters are bytes. The counter
+ * array is a multiple of the word size.
+ */
+ BUILD_BUG_ON(sizeof(vdo_refcount_t) != 1);
+ BUILD_BUG_ON((COUNTS_PER_BLOCK % sizeof(u64)) != 0);
+
+ while (words_left > 0) {
+ /*
+ * This is used effectively as 8 byte-size counters. Byte 0 counts how many words
+ * had the target value found in byte 0, etc. We just have to avoid overflow.
+ */
+ u64 split_count = 0;
+ /*
+ * The counter "% 255" trick used below to fold split_count into empty_count
+ * imposes a limit of 254 bytes examined each iteration of the outer loop. We
+ * process a word at a time, so that limit gets rounded down to 31 u64 words.
+ */
+ const unsigned int max_words_per_iteration = 254 / sizeof(u64);
+ unsigned int iter_words_left = min_t(unsigned int, words_left,
+ max_words_per_iteration);
+
+ words_left -= iter_words_left;
+
+ while (iter_words_left--) {
+ u64 word = *words;
+ u64 temp;
+
+ /* First, if we have any provisional refcount values, clear them. */
+ temp = match_bytes(word, PROVISIONAL_REFERENCE_COUNT);
+ if (temp) {
+ /*
+ * 'temp' has 0x01 bytes where 'word' has PROVISIONAL; this xor
+ * will alter just those bytes, changing PROVISIONAL to EMPTY.
+ */
+ word ^= temp * (PROVISIONAL_REFERENCE_COUNT ^ EMPTY_REFERENCE_COUNT);
+ *words = word;
+ }
- for (j = 0; j < COUNTS_PER_BLOCK; j++) {
- if (counters[j] == PROVISIONAL_REFERENCE_COUNT) {
- counters[j] = EMPTY_REFERENCE_COUNT;
- block->allocated_count--;
+ /* Now count the EMPTY_REFERENCE_COUNT bytes, updating the 8 counters. */
+ split_count += match_bytes(word, EMPTY_REFERENCE_COUNT);
+ words++;
}
+ empty_count += split_count % 255;
}
-}
-static inline bool journal_points_equal(struct journal_point first,
- struct journal_point second)
-{
- return ((first.sequence_number == second.sequence_number) &&
- (first.entry_count == second.entry_count));
+ return COUNTS_PER_BLOCK - empty_count;
}
/**
@@ -2197,7 +2263,6 @@ static inline bool journal_points_equal(struct journal_point first,
static void unpack_reference_block(struct packed_reference_block *packed,
struct reference_block *block)
{
- block_count_t index;
sector_count_t i;
struct vdo_slab *slab = block->slab;
vdo_refcount_t *counters = get_reference_counters_for_block(block);
@@ -2223,11 +2288,7 @@ static void unpack_reference_block(struct packed_reference_block *packed,
}
}
- block->allocated_count = 0;
- for (index = 0; index < COUNTS_PER_BLOCK; index++) {
- if (counters[index] != EMPTY_REFERENCE_COUNT)
- block->allocated_count++;
- }
+ block->allocated_count = count_valid_references(counters);
}
/**
@@ -2240,13 +2301,19 @@ static void finish_reference_block_load(struct vdo_completion *completion)
struct pooled_vio *pooled = vio_as_pooled_vio(vio);
struct reference_block *block = completion->parent;
struct vdo_slab *slab = block->slab;
+ unsigned int block_count = vio->io_size / VDO_BLOCK_SIZE;
+ unsigned int i;
+ char *data = vio->data;
- unpack_reference_block((struct packed_reference_block *) vio->data, block);
- return_vio_to_pool(slab->allocator->vio_pool, pooled);
- slab->active_count--;
- clear_provisional_references(block);
+ for (i = 0; i < block_count; i++, block++, data += VDO_BLOCK_SIZE) {
+ struct packed_reference_block *packed = (struct packed_reference_block *) data;
+
+ unpack_reference_block(packed, block);
+ slab->free_blocks -= block->allocated_count;
+ }
+ return_vio_to_pool(pooled);
+ slab->active_count -= block_count;
- slab->free_blocks -= block->allocated_count;
check_if_slab_drained(slab);
}
@@ -2260,23 +2327,25 @@ static void load_reference_block_endio(struct bio *bio)
}
/**
- * load_reference_block() - After a block waiter has gotten a VIO from the VIO pool, load the
- * block.
- * @waiter: The waiter of the block to load.
+ * load_reference_block_group() - After a block waiter has gotten a VIO from the VIO pool, load
+ * a set of blocks.
+ * @waiter: The waiter of the first block to load.
* @context: The VIO returned by the pool.
*/
-static void load_reference_block(struct vdo_waiter *waiter, void *context)
+static void load_reference_block_group(struct vdo_waiter *waiter, void *context)
{
struct pooled_vio *pooled = context;
struct vio *vio = &pooled->vio;
struct reference_block *block =
container_of(waiter, struct reference_block, waiter);
- size_t block_offset = (block - block->slab->reference_blocks);
+ u32 block_offset = block - block->slab->reference_blocks;
+ u32 max_block_count = block->slab->reference_block_count - block_offset;
+ u32 block_count = min_t(int, vio->block_count, max_block_count);
vio->completion.parent = block;
- vdo_submit_metadata_vio(vio, block->slab->ref_counts_origin + block_offset,
- load_reference_block_endio, handle_io_error,
- REQ_OP_READ);
+ vdo_submit_metadata_vio_with_size(vio, block->slab->ref_counts_origin + block_offset,
+ load_reference_block_endio, handle_io_error,
+ REQ_OP_READ, block_count * VDO_BLOCK_SIZE);
}
/**
@@ -2286,14 +2355,21 @@ static void load_reference_block(struct vdo_waiter *waiter, void *context)
static void load_reference_blocks(struct vdo_slab *slab)
{
block_count_t i;
+ u64 blocks_per_vio = slab->allocator->refcount_blocks_per_big_vio;
+ struct vio_pool *pool = slab->allocator->refcount_big_vio_pool;
+
+ if (!pool) {
+ pool = slab->allocator->vio_pool;
+ blocks_per_vio = 1;
+ }
slab->free_blocks = slab->block_count;
slab->active_count = slab->reference_block_count;
- for (i = 0; i < slab->reference_block_count; i++) {
+ for (i = 0; i < slab->reference_block_count; i += blocks_per_vio) {
struct vdo_waiter *waiter = &slab->reference_blocks[i].waiter;
- waiter->callback = load_reference_block;
- acquire_vio_from_pool(slab->allocator->vio_pool, waiter);
+ waiter->callback = load_reference_block_group;
+ acquire_vio_from_pool(pool, waiter);
}
}
@@ -2429,7 +2505,7 @@ static void finish_loading_journal(struct vdo_completion *completion)
initialize_journal_state(journal);
}
- return_vio_to_pool(slab->allocator->vio_pool, vio_as_pooled_vio(vio));
+ return_vio_to_pool(vio_as_pooled_vio(vio));
vdo_finish_loading_with_result(&slab->state, allocate_counters_if_clean(slab));
}
@@ -2449,7 +2525,7 @@ static void handle_load_error(struct vdo_completion *completion)
struct vio *vio = as_vio(completion);
vio_record_metadata_io_error(vio);
- return_vio_to_pool(journal->slab->allocator->vio_pool, vio_as_pooled_vio(vio));
+ return_vio_to_pool(vio_as_pooled_vio(vio));
vdo_finish_loading_with_result(&journal->slab->state, result);
}
@@ -2547,7 +2623,7 @@ static void queue_slab(struct vdo_slab *slab)
int result;
VDO_ASSERT_LOG_ONLY(list_empty(&slab->allocq_entry),
- "a requeued slab must not already be on a ring");
+ "a requeued slab must not already be on a list");
if (vdo_is_read_only(allocator->depot->vdo))
return;
@@ -2700,6 +2776,7 @@ static void finish_scrubbing(struct slab_scrubber *scrubber, int result)
vdo_log_info("VDO commencing normal operation");
else if (prior_state == VDO_RECOVERING)
vdo_log_info("Exiting recovery mode");
+ free_vio_pool(vdo_forget(allocator->refcount_big_vio_pool));
}
/*
@@ -3281,7 +3358,7 @@ int vdo_release_block_reference(struct block_allocator *allocator,
* This is a min_heap callback function orders slab_status structures using the 'is_clean' field as
* the primary key and the 'emptiness' field as the secondary key.
*
- * Slabs need to be pushed onto the rings in the same order they are to be popped off. Popping
+ * Slabs need to be pushed onto the lists in the same order they are to be popped off. Popping
* should always get the most empty first, so pushing should be from most empty to least empty.
* Thus, the ordering is reversed from the usual sense since min_heap returns smaller elements
* before larger ones.
@@ -3983,6 +4060,7 @@ static int __must_check initialize_block_allocator(struct slab_depot *depot,
struct vdo *vdo = depot->vdo;
block_count_t max_free_blocks = depot->slab_config.data_blocks;
unsigned int max_priority = (2 + ilog2(max_free_blocks));
+ u32 reference_block_count, refcount_reads_needed, refcount_blocks_per_vio;
*allocator = (struct block_allocator) {
.depot = depot,
@@ -4000,12 +4078,24 @@ static int __must_check initialize_block_allocator(struct slab_depot *depot,
return result;
vdo_initialize_completion(&allocator->completion, vdo, VDO_BLOCK_ALLOCATOR_COMPLETION);
- result = make_vio_pool(vdo, BLOCK_ALLOCATOR_VIO_POOL_SIZE, allocator->thread_id,
+ result = make_vio_pool(vdo, BLOCK_ALLOCATOR_VIO_POOL_SIZE, 1, allocator->thread_id,
VIO_TYPE_SLAB_JOURNAL, VIO_PRIORITY_METADATA,
allocator, &allocator->vio_pool);
if (result != VDO_SUCCESS)
return result;
+ /* Initialize the refcount-reading vio pool. */
+ reference_block_count = vdo_get_saved_reference_count_size(depot->slab_config.slab_blocks);
+ refcount_reads_needed = DIV_ROUND_UP(reference_block_count, MAX_BLOCKS_PER_VIO);
+ refcount_blocks_per_vio = DIV_ROUND_UP(reference_block_count, refcount_reads_needed);
+ allocator->refcount_blocks_per_big_vio = refcount_blocks_per_vio;
+ result = make_vio_pool(vdo, BLOCK_ALLOCATOR_REFCOUNT_VIO_POOL_SIZE,
+ allocator->refcount_blocks_per_big_vio, allocator->thread_id,
+ VIO_TYPE_SLAB_JOURNAL, VIO_PRIORITY_METADATA,
+ NULL, &allocator->refcount_big_vio_pool);
+ if (result != VDO_SUCCESS)
+ return result;
+
result = initialize_slab_scrubber(allocator);
if (result != VDO_SUCCESS)
return result;
@@ -4223,6 +4313,7 @@ void vdo_free_slab_depot(struct slab_depot *depot)
uninitialize_allocator_summary(allocator);
uninitialize_scrubber_vio(&allocator->scrubber);
free_vio_pool(vdo_forget(allocator->vio_pool));
+ free_vio_pool(vdo_forget(allocator->refcount_big_vio_pool));
vdo_free_priority_table(vdo_forget(allocator->prioritized_slabs));
}
diff --git a/drivers/md/dm-vdo/slab-depot.h b/drivers/md/dm-vdo/slab-depot.h
index f234853501ca..fadc0c9d4dc4 100644
--- a/drivers/md/dm-vdo/slab-depot.h
+++ b/drivers/md/dm-vdo/slab-depot.h
@@ -45,6 +45,13 @@
enum {
/* The number of vios in the vio pool is proportional to the throughput of the VDO. */
BLOCK_ALLOCATOR_VIO_POOL_SIZE = 128,
+
+ /*
+ * The number of vios in the vio pool used for loading reference count data. A slab's
+ * refcounts are capped at ~8MB, and we process one at a time in a zone, so 9 should be
+ * plenty.
+ */
+ BLOCK_ALLOCATOR_REFCOUNT_VIO_POOL_SIZE = 9,
};
/*
@@ -248,7 +255,7 @@ struct vdo_slab {
/* A list of the dirty blocks waiting to be written out */
struct vdo_wait_queue dirty_blocks;
- /* The number of blocks which are currently writing */
+ /* The number of blocks which are currently reading or writing */
size_t active_count;
/* A waiter object for updating the slab summary */
@@ -425,6 +432,10 @@ struct block_allocator {
/* The vio pool for reading and writing block allocator metadata */
struct vio_pool *vio_pool;
+ /* The vio pool for large initial reads of ref count areas */
+ struct vio_pool *refcount_big_vio_pool;
+ /* How many ref count blocks are read per vio at initial load */
+ u32 refcount_blocks_per_big_vio;
/* The dm_kcopyd client for erasing slab journals */
struct dm_kcopyd_client *eraser;
/* Iterator over the slabs to be erased */
diff --git a/drivers/md/dm-vdo/types.h b/drivers/md/dm-vdo/types.h
index dbe892b10f26..cdf36e7d7702 100644
--- a/drivers/md/dm-vdo/types.h
+++ b/drivers/md/dm-vdo/types.h
@@ -376,6 +376,9 @@ struct vio {
/* The size of this vio in blocks */
unsigned int block_count;
+ /* The amount of data to be read or written, in bytes */
+ unsigned int io_size;
+
/* The data being read or written. */
char *data;
diff --git a/drivers/md/dm-vdo/vdo.c b/drivers/md/dm-vdo/vdo.c
index a7e32baab4af..80b608674022 100644
--- a/drivers/md/dm-vdo/vdo.c
+++ b/drivers/md/dm-vdo/vdo.c
@@ -31,9 +31,7 @@
#include <linux/completion.h>
#include <linux/device-mapper.h>
-#include <linux/kernel.h>
#include <linux/lz4.h>
-#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/types.h>
@@ -142,12 +140,6 @@ static void finish_vdo_request_queue(void *ptr)
vdo_unregister_allocating_thread();
}
-#ifdef MODULE
-#define MODULE_NAME THIS_MODULE->name
-#else
-#define MODULE_NAME "dm-vdo"
-#endif /* MODULE */
-
static const struct vdo_work_queue_type default_queue_type = {
.start = start_vdo_request_queue,
.finish = finish_vdo_request_queue,
@@ -559,8 +551,7 @@ int vdo_make(unsigned int instance, struct device_config *config, char **reason,
*vdo_ptr = vdo;
snprintf(vdo->thread_name_prefix, sizeof(vdo->thread_name_prefix),
- "%s%u", MODULE_NAME, instance);
- BUG_ON(vdo->thread_name_prefix[0] == '\0');
+ "vdo%u", instance);
result = vdo_allocate(vdo->thread_config.thread_count,
struct vdo_thread, __func__, &vdo->threads);
if (result != VDO_SUCCESS) {
diff --git a/drivers/md/dm-vdo/vio.c b/drivers/md/dm-vdo/vio.c
index e710f3c5a972..e7f4153e55e3 100644
--- a/drivers/md/dm-vdo/vio.c
+++ b/drivers/md/dm-vdo/vio.c
@@ -188,14 +188,23 @@ void vdo_set_bio_properties(struct bio *bio, struct vio *vio, bio_end_io_t callb
/*
* Prepares the bio to perform IO with the specified buffer. May only be used on a VDO-allocated
- * bio, as it assumes the bio wraps a 4k buffer that is 4k aligned, but there does not have to be a
- * vio associated with the bio.
+ * bio, as it assumes the bio wraps a 4k-multiple buffer that is 4k aligned, but there does not
+ * have to be a vio associated with the bio.
*/
int vio_reset_bio(struct vio *vio, char *data, bio_end_io_t callback,
blk_opf_t bi_opf, physical_block_number_t pbn)
{
- int bvec_count, offset, len, i;
+ return vio_reset_bio_with_size(vio, data, vio->block_count * VDO_BLOCK_SIZE,
+ callback, bi_opf, pbn);
+}
+
+int vio_reset_bio_with_size(struct vio *vio, char *data, int size, bio_end_io_t callback,
+ blk_opf_t bi_opf, physical_block_number_t pbn)
+{
+ int bvec_count, offset, i;
struct bio *bio = vio->bio;
+ int vio_size = vio->block_count * VDO_BLOCK_SIZE;
+ int remaining;
bio_reset(bio, bio->bi_bdev, bi_opf);
vdo_set_bio_properties(bio, vio, callback, bi_opf, pbn);
@@ -205,22 +214,21 @@ int vio_reset_bio(struct vio *vio, char *data, bio_end_io_t callback,
bio->bi_ioprio = 0;
bio->bi_io_vec = bio->bi_inline_vecs;
bio->bi_max_vecs = vio->block_count + 1;
- len = VDO_BLOCK_SIZE * vio->block_count;
+ if (VDO_ASSERT(size <= vio_size, "specified size %d is not greater than allocated %d",
+ size, vio_size) != VDO_SUCCESS)
+ size = vio_size;
+ vio->io_size = size;
offset = offset_in_page(data);
- bvec_count = DIV_ROUND_UP(offset + len, PAGE_SIZE);
+ bvec_count = DIV_ROUND_UP(offset + size, PAGE_SIZE);
+ remaining = size;
- /*
- * If we knew that data was always on one page, or contiguous pages, we wouldn't need the
- * loop. But if we're using vmalloc, it's not impossible that the data is in different
- * pages that can't be merged in bio_add_page...
- */
- for (i = 0; (i < bvec_count) && (len > 0); i++) {
+ for (i = 0; (i < bvec_count) && (remaining > 0); i++) {
struct page *page;
int bytes_added;
int bytes = PAGE_SIZE - offset;
- if (bytes > len)
- bytes = len;
+ if (bytes > remaining)
+ bytes = remaining;
page = is_vmalloc_addr(data) ? vmalloc_to_page(data) : virt_to_page(data);
bytes_added = bio_add_page(bio, page, bytes, offset);
@@ -232,7 +240,7 @@ int vio_reset_bio(struct vio *vio, char *data, bio_end_io_t callback,
}
data += bytes;
- len -= bytes;
+ remaining -= bytes;
offset = 0;
}
@@ -301,6 +309,7 @@ void vio_record_metadata_io_error(struct vio *vio)
* make_vio_pool() - Create a new vio pool.
* @vdo: The vdo.
* @pool_size: The number of vios in the pool.
+ * @block_count: The number of 4k blocks per vio.
* @thread_id: The ID of the thread using this pool.
* @vio_type: The type of vios in the pool.
* @priority: The priority with which vios from the pool should be enqueued.
@@ -309,13 +318,14 @@ void vio_record_metadata_io_error(struct vio *vio)
*
* Return: A success or error code.
*/
-int make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id,
+int make_vio_pool(struct vdo *vdo, size_t pool_size, size_t block_count, thread_id_t thread_id,
enum vio_type vio_type, enum vio_priority priority, void *context,
struct vio_pool **pool_ptr)
{
struct vio_pool *pool;
char *ptr;
int result;
+ size_t per_vio_size = VDO_BLOCK_SIZE * block_count;
result = vdo_allocate_extended(struct vio_pool, pool_size, struct pooled_vio,
__func__, &pool);
@@ -326,7 +336,7 @@ int make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id,
INIT_LIST_HEAD(&pool->available);
INIT_LIST_HEAD(&pool->busy);
- result = vdo_allocate(pool_size * VDO_BLOCK_SIZE, char,
+ result = vdo_allocate(pool_size * per_vio_size, char,
"VIO pool buffer", &pool->buffer);
if (result != VDO_SUCCESS) {
free_vio_pool(pool);
@@ -334,10 +344,10 @@ int make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id,
}
ptr = pool->buffer;
- for (pool->size = 0; pool->size < pool_size; pool->size++, ptr += VDO_BLOCK_SIZE) {
+ for (pool->size = 0; pool->size < pool_size; pool->size++, ptr += per_vio_size) {
struct pooled_vio *pooled = &pool->vios[pool->size];
- result = allocate_vio_components(vdo, vio_type, priority, NULL, 1, ptr,
+ result = allocate_vio_components(vdo, vio_type, priority, NULL, block_count, ptr,
&pooled->vio);
if (result != VDO_SUCCESS) {
free_vio_pool(pool);
@@ -345,6 +355,7 @@ int make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id,
}
pooled->context = context;
+ pooled->pool = pool;
list_add_tail(&pooled->pool_entry, &pool->available);
}
@@ -419,12 +430,13 @@ void acquire_vio_from_pool(struct vio_pool *pool, struct vdo_waiter *waiter)
}
/**
- * return_vio_to_pool() - Return a vio to the pool
- * @pool: The vio pool.
+ * return_vio_to_pool() - Return a vio to its pool
* @vio: The pooled vio to return.
*/
-void return_vio_to_pool(struct vio_pool *pool, struct pooled_vio *vio)
+void return_vio_to_pool(struct pooled_vio *vio)
{
+ struct vio_pool *pool = vio->pool;
+
VDO_ASSERT_LOG_ONLY((pool->thread_id == vdo_get_callback_thread_id()),
"vio pool entry returned on same thread as it was acquired");
diff --git a/drivers/md/dm-vdo/vio.h b/drivers/md/dm-vdo/vio.h
index 3490e9f59b04..4bfcb21901f1 100644
--- a/drivers/md/dm-vdo/vio.h
+++ b/drivers/md/dm-vdo/vio.h
@@ -30,6 +30,8 @@ struct pooled_vio {
void *context;
/* The list entry used by the pool */
struct list_head pool_entry;
+ /* The pool this vio is allocated from */
+ struct vio_pool *pool;
};
/**
@@ -123,6 +125,8 @@ void vdo_set_bio_properties(struct bio *bio, struct vio *vio, bio_end_io_t callb
int vio_reset_bio(struct vio *vio, char *data, bio_end_io_t callback,
blk_opf_t bi_opf, physical_block_number_t pbn);
+int vio_reset_bio_with_size(struct vio *vio, char *data, int size, bio_end_io_t callback,
+ blk_opf_t bi_opf, physical_block_number_t pbn);
void update_vio_error_stats(struct vio *vio, const char *format, ...)
__printf(2, 3);
@@ -188,12 +192,13 @@ static inline struct pooled_vio *vio_as_pooled_vio(struct vio *vio)
struct vio_pool;
-int __must_check make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id,
- enum vio_type vio_type, enum vio_priority priority,
- void *context, struct vio_pool **pool_ptr);
+int __must_check make_vio_pool(struct vdo *vdo, size_t pool_size, size_t block_count,
+ thread_id_t thread_id, enum vio_type vio_type,
+ enum vio_priority priority, void *context,
+ struct vio_pool **pool_ptr);
void free_vio_pool(struct vio_pool *pool);
bool __must_check is_vio_pool_busy(struct vio_pool *pool);
void acquire_vio_from_pool(struct vio_pool *pool, struct vdo_waiter *waiter);
-void return_vio_to_pool(struct vio_pool *pool, struct pooled_vio *vio);
+void return_vio_to_pool(struct pooled_vio *vio);
#endif /* VIO_H */
diff --git a/drivers/md/dm-vdo/wait-queue.c b/drivers/md/dm-vdo/wait-queue.c
index 6e1e739277ef..f81ed0cee2bf 100644
--- a/drivers/md/dm-vdo/wait-queue.c
+++ b/drivers/md/dm-vdo/wait-queue.c
@@ -34,7 +34,7 @@ void vdo_waitq_enqueue_waiter(struct vdo_wait_queue *waitq, struct vdo_waiter *w
waitq->last_waiter->next_waiter = waiter;
}
- /* In both cases, the waiter we added to the ring becomes the last waiter. */
+ /* In both cases, the waiter we added to the list becomes the last waiter. */
waitq->last_waiter = waiter;
waitq->length += 1;
}
diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c
index e86c1431b108..3c427f18a04b 100644
--- a/drivers/md/dm-verity-target.c
+++ b/drivers/md/dm-verity-target.c
@@ -30,6 +30,7 @@
#define DM_VERITY_ENV_VAR_NAME "DM_VERITY_ERR_BLOCK_NR"
#define DM_VERITY_DEFAULT_PREFETCH_SIZE 262144
+#define DM_VERITY_USE_BH_DEFAULT_BYTES 8192
#define DM_VERITY_MAX_CORRUPTED_ERRS 100
@@ -49,6 +50,15 @@ static unsigned int dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE
module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, 0644);
+static unsigned int dm_verity_use_bh_bytes[4] = {
+ DM_VERITY_USE_BH_DEFAULT_BYTES, // IOPRIO_CLASS_NONE
+ DM_VERITY_USE_BH_DEFAULT_BYTES, // IOPRIO_CLASS_RT
+ DM_VERITY_USE_BH_DEFAULT_BYTES, // IOPRIO_CLASS_BE
+ 0 // IOPRIO_CLASS_IDLE
+};
+
+module_param_array_named(use_bh_bytes, dm_verity_use_bh_bytes, uint, NULL, 0644);
+
static DEFINE_STATIC_KEY_FALSE(use_bh_wq_enabled);
/* Is at least one dm-verity instance using ahash_tfm instead of shash_tfm? */
@@ -311,7 +321,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io,
if (static_branch_unlikely(&use_bh_wq_enabled) && io->in_bh) {
data = dm_bufio_get(v->bufio, hash_block, &buf);
- if (data == NULL) {
+ if (IS_ERR_OR_NULL(data)) {
/*
* In tasklet and the hash was not in the bufio cache.
* Return early and resume execution from a work-queue
@@ -324,8 +334,24 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io,
&buf, bio->bi_ioprio);
}
- if (IS_ERR(data))
- return PTR_ERR(data);
+ if (IS_ERR(data)) {
+ if (skip_unverified)
+ return 1;
+ r = PTR_ERR(data);
+ data = dm_bufio_new(v->bufio, hash_block, &buf);
+ if (IS_ERR(data))
+ return r;
+ if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_METADATA,
+ hash_block, data) == 0) {
+ aux = dm_bufio_get_aux_data(buf);
+ aux->hash_verified = 1;
+ goto release_ok;
+ } else {
+ dm_bufio_release(buf);
+ dm_bufio_forget(v->bufio, hash_block);
+ return r;
+ }
+ }
aux = dm_bufio_get_aux_data(buf);
@@ -366,6 +392,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io,
}
}
+release_ok:
data += offset;
memcpy(want_digest, data, v->digest_size);
r = 0;
@@ -652,9 +679,17 @@ static void verity_bh_work(struct work_struct *w)
verity_finish_io(io, errno_to_blk_status(err));
}
+static inline bool verity_use_bh(unsigned int bytes, unsigned short ioprio)
+{
+ return ioprio <= IOPRIO_CLASS_IDLE &&
+ bytes <= READ_ONCE(dm_verity_use_bh_bytes[ioprio]);
+}
+
static void verity_end_io(struct bio *bio)
{
struct dm_verity_io *io = bio->bi_private;
+ unsigned short ioprio = IOPRIO_PRIO_CLASS(bio->bi_ioprio);
+ unsigned int bytes = io->n_blocks << io->v->data_dev_block_bits;
if (bio->bi_status &&
(!verity_fec_is_enabled(io->v) ||
@@ -664,9 +699,14 @@ static void verity_end_io(struct bio *bio)
return;
}
- if (static_branch_unlikely(&use_bh_wq_enabled) && io->v->use_bh_wq) {
- INIT_WORK(&io->bh_work, verity_bh_work);
- queue_work(system_bh_wq, &io->bh_work);
+ if (static_branch_unlikely(&use_bh_wq_enabled) && io->v->use_bh_wq &&
+ verity_use_bh(bytes, ioprio)) {
+ if (in_hardirq() || irqs_disabled()) {
+ INIT_WORK(&io->bh_work, verity_bh_work);
+ queue_work(system_bh_wq, &io->bh_work);
+ } else {
+ verity_bh_work(&io->bh_work);
+ }
} else {
INIT_WORK(&io->work, verity_work);
queue_work(io->v->verify_wq, &io->work);
@@ -796,6 +836,13 @@ static int verity_map(struct dm_target *ti, struct bio *bio)
return DM_MAPIO_SUBMITTED;
}
+static void verity_postsuspend(struct dm_target *ti)
+{
+ struct dm_verity *v = ti->private;
+ flush_workqueue(v->verify_wq);
+ dm_bufio_client_reset(v->bufio);
+}
+
/*
* Status: V (valid) or C (corruption found)
*/
@@ -1761,11 +1808,12 @@ static struct target_type verity_target = {
.name = "verity",
/* Note: the LSMs depend on the singleton and immutable features */
.features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE,
- .version = {1, 10, 0},
+ .version = {1, 11, 0},
.module = THIS_MODULE,
.ctr = verity_ctr,
.dtr = verity_dtr,
.map = verity_map,
+ .postsuspend = verity_postsuspend,
.status = verity_status,
.prepare_ioctl = verity_prepare_ioctl,
.iterate_devices = verity_iterate_devices,
diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c
index 7ce8847b3404..d6a04a57472d 100644
--- a/drivers/md/dm-writecache.c
+++ b/drivers/md/dm-writecache.c
@@ -797,7 +797,7 @@ static void writecache_flush(struct dm_writecache *wc)
bool need_flush_after_free;
wc->uncommitted_blocks = 0;
- del_timer(&wc->autocommit_timer);
+ timer_delete(&wc->autocommit_timer);
if (list_empty(&wc->lru))
return;
@@ -927,8 +927,8 @@ static void writecache_suspend(struct dm_target *ti)
struct dm_writecache *wc = ti->private;
bool flush_on_suspend;
- del_timer_sync(&wc->autocommit_timer);
- del_timer_sync(&wc->max_age_timer);
+ timer_delete_sync(&wc->autocommit_timer);
+ timer_delete_sync(&wc->max_age_timer);
wc_lock(wc);
writecache_flush(wc);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 4d1e42891d24..5ab7574c0c76 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1540,14 +1540,18 @@ static void __send_empty_flush(struct clone_info *ci)
{
struct dm_table *t = ci->map;
struct bio flush_bio;
+ blk_opf_t opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
+
+ if ((ci->io->orig_bio->bi_opf & (REQ_IDLE | REQ_SYNC)) ==
+ (REQ_IDLE | REQ_SYNC))
+ opf |= REQ_IDLE;
/*
* Use an on-stack bio for this, it's safe since we don't
* need to reference it after submit. It's just used as
* the basis for the clone(s).
*/
- bio_init(&flush_bio, ci->io->md->disk->part0, NULL, 0,
- REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC);
+ bio_init(&flush_bio, ci->io->md->disk->part0, NULL, 0, opf);
ci->bio = &flush_bio;
ci->sector_count = 0;
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index 23c09d22fcdb..44ec9b17cfd3 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -29,8 +29,10 @@
#include <linux/buffer_head.h>
#include <linux/seq_file.h>
#include <trace/events/block.h>
+
#include "md.h"
#include "md-bitmap.h"
+#include "md-cluster.h"
#define BITMAP_MAJOR_LO 3
/* version 4 insists the bitmap is in little-endian order
@@ -426,8 +428,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
struct block_device *bdev;
struct mddev *mddev = bitmap->mddev;
struct bitmap_storage *store = &bitmap->storage;
- unsigned int bitmap_limit = (bitmap->storage.file_pages - pg_index) <<
- PAGE_SHIFT;
+ unsigned long num_pages = bitmap->storage.file_pages;
+ unsigned int bitmap_limit = (num_pages - pg_index % num_pages) << PAGE_SHIFT;
loff_t sboff, offset = mddev->bitmap_info.offset;
sector_t ps = pg_index * PAGE_SIZE / SECTOR_SIZE;
unsigned int size = PAGE_SIZE;
@@ -436,7 +438,7 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;
/* we compare length (page numbers), not page offset. */
- if ((pg_index - store->sb_index) == store->file_pages - 1) {
+ if ((pg_index - store->sb_index) == num_pages - 1) {
unsigned int last_page_size = store->bytes & (PAGE_SIZE - 1);
if (last_page_size == 0)
@@ -942,7 +944,7 @@ out:
bmname(bitmap), err);
goto out_no_sb;
}
- bitmap->cluster_slot = md_cluster_ops->slot_number(bitmap->mddev);
+ bitmap->cluster_slot = bitmap->mddev->cluster_ops->slot_number(bitmap->mddev);
goto re_read;
}
@@ -2021,7 +2023,7 @@ static void md_bitmap_free(void *data)
sysfs_put(bitmap->sysfs_can_clear);
if (mddev_is_clustered(bitmap->mddev) && bitmap->mddev->cluster_info &&
- bitmap->cluster_slot == md_cluster_ops->slot_number(bitmap->mddev))
+ bitmap->cluster_slot == bitmap->mddev->cluster_ops->slot_number(bitmap->mddev))
md_cluster_stop(bitmap->mddev);
/* Shouldn't be needed - but just in case.... */
@@ -2229,7 +2231,7 @@ static int bitmap_load(struct mddev *mddev)
mddev_create_serial_pool(mddev, rdev);
if (mddev_is_clustered(mddev))
- md_cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes);
+ mddev->cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes);
/* Clear out old bitmap info first: Either there is none, or we
* are resuming after someone else has possibly changed things,
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 6595f89becdb..94221d964d4f 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -1166,7 +1166,7 @@ static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsiz
struct dlm_lock_resource *bm_lockres;
char str[64];
- if (i == md_cluster_ops->slot_number(mddev))
+ if (i == slot_number(mddev))
continue;
bitmap = mddev->bitmap_ops->get_from_slot(mddev, i);
@@ -1216,7 +1216,7 @@ out:
*/
static int cluster_check_sync_size(struct mddev *mddev)
{
- int current_slot = md_cluster_ops->slot_number(mddev);
+ int current_slot = slot_number(mddev);
int node_num = mddev->bitmap_info.nodes;
struct dlm_lock_resource *bm_lockres;
struct md_bitmap_stats stats;
@@ -1612,7 +1612,14 @@ out:
return err;
}
-static const struct md_cluster_operations cluster_ops = {
+static struct md_cluster_operations cluster_ops = {
+ .head = {
+ .type = MD_CLUSTER,
+ .id = ID_CLUSTER,
+ .name = "cluster",
+ .owner = THIS_MODULE,
+ },
+
.join = join,
.leave = leave,
.slot_number = slot_number,
@@ -1642,13 +1649,12 @@ static int __init cluster_init(void)
{
pr_warn("md-cluster: support raid1 and raid10 (limited support)\n");
pr_info("Registering Cluster MD functions\n");
- register_md_cluster_operations(&cluster_ops, THIS_MODULE);
- return 0;
+ return register_md_submodule(&cluster_ops.head);
}
static void cluster_exit(void)
{
- unregister_md_cluster_operations();
+ unregister_md_submodule(&cluster_ops.head);
}
module_init(cluster_init);
diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h
index 470bf18ffde5..8fb06d853173 100644
--- a/drivers/md/md-cluster.h
+++ b/drivers/md/md-cluster.h
@@ -10,6 +10,8 @@ struct mddev;
struct md_rdev;
struct md_cluster_operations {
+ struct md_submodule_head head;
+
int (*join)(struct mddev *mddev, int nodes);
int (*leave)(struct mddev *mddev);
int (*slot_number)(struct mddev *mddev);
@@ -35,4 +37,8 @@ struct md_cluster_operations {
void (*update_size)(struct mddev *mddev, sector_t old_dev_sectors);
};
+extern int md_setup_cluster(struct mddev *mddev, int nodes);
+extern void md_cluster_stop(struct mddev *mddev);
+extern void md_reload_sb(struct mddev *mddev, int raid_disk);
+
#endif /* _MD_CLUSTER_H */
diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c
index a382929ce7ba..5d9b08115375 100644
--- a/drivers/md/md-linear.c
+++ b/drivers/md/md-linear.c
@@ -5,7 +5,6 @@
*/
#include <linux/blkdev.h>
-#include <linux/raid/md_u.h>
#include <linux/seq_file.h>
#include <linux/module.h>
#include <linux/slab.h>
@@ -76,10 +75,8 @@ static int linear_set_limits(struct mddev *mddev)
lim.max_write_zeroes_sectors = mddev->chunk_sectors;
lim.io_min = mddev->chunk_sectors << 9;
err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY);
- if (err) {
- queue_limits_cancel_update(mddev->gendisk->queue);
+ if (err)
return err;
- }
return queue_limits_set(mddev->gendisk->queue, &lim);
}
@@ -322,9 +319,13 @@ static void linear_quiesce(struct mddev *mddev, int state)
}
static struct md_personality linear_personality = {
- .name = "linear",
- .level = LEVEL_LINEAR,
- .owner = THIS_MODULE,
+ .head = {
+ .type = MD_PERSONALITY,
+ .id = ID_LINEAR,
+ .name = "linear",
+ .owner = THIS_MODULE,
+ },
+
.make_request = linear_make_request,
.run = linear_run,
.free = linear_free,
@@ -337,12 +338,12 @@ static struct md_personality linear_personality = {
static int __init linear_init(void)
{
- return register_md_personality(&linear_personality);
+ return register_md_submodule(&linear_personality.head);
}
static void linear_exit(void)
{
- unregister_md_personality(&linear_personality);
+ unregister_md_submodule(&linear_personality.head);
}
module_init(linear_init);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 30b3dbbce2d2..9daa78c5fe33 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -79,16 +79,10 @@ static const char *action_name[NR_SYNC_ACTIONS] = {
[ACTION_IDLE] = "idle",
};
-/* pers_list is a list of registered personalities protected by pers_lock. */
-static LIST_HEAD(pers_list);
-static DEFINE_SPINLOCK(pers_lock);
+static DEFINE_XARRAY(md_submodule);
static const struct kobj_type md_ktype;
-const struct md_cluster_operations *md_cluster_ops;
-EXPORT_SYMBOL(md_cluster_ops);
-static struct module *md_cluster_mod;
-
static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
static struct workqueue_struct *md_wq;
@@ -629,6 +623,12 @@ static void __mddev_put(struct mddev *mddev)
queue_work(md_misc_wq, &mddev->del_work);
}
+static void mddev_put_locked(struct mddev *mddev)
+{
+ if (atomic_dec_and_test(&mddev->active))
+ __mddev_put(mddev);
+}
+
void mddev_put(struct mddev *mddev)
{
if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
@@ -888,16 +888,40 @@ struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev)
}
EXPORT_SYMBOL_GPL(md_find_rdev_rcu);
-static struct md_personality *find_pers(int level, char *clevel)
+static struct md_personality *get_pers(int level, char *clevel)
{
- struct md_personality *pers;
- list_for_each_entry(pers, &pers_list, list) {
- if (level != LEVEL_NONE && pers->level == level)
- return pers;
- if (strcmp(pers->name, clevel)==0)
- return pers;
+ struct md_personality *ret = NULL;
+ struct md_submodule_head *head;
+ unsigned long i;
+
+ xa_lock(&md_submodule);
+ xa_for_each(&md_submodule, i, head) {
+ if (head->type != MD_PERSONALITY)
+ continue;
+ if ((level != LEVEL_NONE && head->id == level) ||
+ !strcmp(head->name, clevel)) {
+ if (try_module_get(head->owner))
+ ret = (void *)head;
+ break;
+ }
}
- return NULL;
+ xa_unlock(&md_submodule);
+
+ if (!ret) {
+ if (level != LEVEL_NONE)
+ pr_warn("md: personality for level %d is not loaded!\n",
+ level);
+ else
+ pr_warn("md: personality for level %s is not loaded!\n",
+ clevel);
+ }
+
+ return ret;
+}
+
+static void put_pers(struct md_personality *pers)
+{
+ module_put(pers->head.owner);
}
/* return the offset of the super block in 512byte sectors */
@@ -1180,7 +1204,7 @@ int md_check_no_bitmap(struct mddev *mddev)
if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
return 0;
pr_warn("%s: bitmaps are not supported for %s\n",
- mdname(mddev), mddev->pers->name);
+ mdname(mddev), mddev->pers->head.name);
return 1;
}
EXPORT_SYMBOL(md_check_no_bitmap);
@@ -1748,7 +1772,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
count <<= sb->bblog_shift;
if (bb + 1 == 0)
break;
- if (badblocks_set(&rdev->badblocks, sector, count, 1))
+ if (!badblocks_set(&rdev->badblocks, sector, count, 1))
return -EINVAL;
}
} else if (sb->bblog_offset != 0)
@@ -2359,19 +2383,6 @@ int md_integrity_register(struct mddev *mddev)
return 0; /* shouldn't register */
pr_debug("md: data integrity enabled on %s\n", mdname(mddev));
- if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) ||
- (mddev->level != 1 && mddev->level != 10 &&
- bioset_integrity_create(&mddev->io_clone_set, BIO_POOL_SIZE))) {
- /*
- * No need to handle the failure of bioset_integrity_create,
- * because the function is called by md_run() -> pers->run(),
- * md_run calls bioset_exit -> bioset_integrity_free in case
- * of failure case.
- */
- pr_err("md: failed to create integrity pool for %s\n",
- mdname(mddev));
- return -EINVAL;
- }
return 0;
}
EXPORT_SYMBOL(md_integrity_register);
@@ -2639,11 +2650,11 @@ repeat:
force_change = 1;
if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
nospares = 1;
- ret = md_cluster_ops->metadata_update_start(mddev);
+ ret = mddev->cluster_ops->metadata_update_start(mddev);
/* Has someone else has updated the sb */
if (!does_sb_need_changing(mddev)) {
if (ret == 0)
- md_cluster_ops->metadata_update_cancel(mddev);
+ mddev->cluster_ops->metadata_update_cancel(mddev);
bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
BIT(MD_SB_CHANGE_DEVS) |
BIT(MD_SB_CHANGE_CLEAN));
@@ -2783,7 +2794,7 @@ rewrite:
/* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */
if (mddev_is_clustered(mddev) && ret == 0)
- md_cluster_ops->metadata_update_finish(mddev);
+ mddev->cluster_ops->metadata_update_finish(mddev);
if (mddev->in_sync != sync_req ||
!bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
@@ -2942,7 +2953,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
else {
err = 0;
if (mddev_is_clustered(mddev))
- err = md_cluster_ops->remove_disk(mddev, rdev);
+ err = mddev->cluster_ops->remove_disk(mddev, rdev);
if (err == 0) {
md_kick_rdev_from_array(rdev);
@@ -3052,7 +3063,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
* by this node eventually
*/
if (!mddev_is_clustered(rdev->mddev) ||
- (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) {
+ (err = mddev->cluster_ops->gather_bitmaps(rdev)) == 0) {
clear_bit(Faulty, &rdev->flags);
err = add_bound_rdev(rdev);
}
@@ -3860,7 +3871,7 @@ level_show(struct mddev *mddev, char *page)
spin_lock(&mddev->lock);
p = mddev->pers;
if (p)
- ret = sprintf(page, "%s\n", p->name);
+ ret = sprintf(page, "%s\n", p->head.name);
else if (mddev->clevel[0])
ret = sprintf(page, "%s\n", mddev->clevel);
else if (mddev->level != LEVEL_NONE)
@@ -3917,7 +3928,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
rv = -EINVAL;
if (!mddev->pers->quiesce) {
pr_warn("md: %s: %s does not support online personality change\n",
- mdname(mddev), mddev->pers->name);
+ mdname(mddev), mddev->pers->head.name);
goto out_unlock;
}
@@ -3931,24 +3942,20 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
if (request_module("md-%s", clevel) != 0)
request_module("md-level-%s", clevel);
- spin_lock(&pers_lock);
- pers = find_pers(level, clevel);
- if (!pers || !try_module_get(pers->owner)) {
- spin_unlock(&pers_lock);
- pr_warn("md: personality %s not loaded\n", clevel);
+ pers = get_pers(level, clevel);
+ if (!pers) {
rv = -EINVAL;
goto out_unlock;
}
- spin_unlock(&pers_lock);
if (pers == mddev->pers) {
/* Nothing to do! */
- module_put(pers->owner);
+ put_pers(pers);
rv = len;
goto out_unlock;
}
if (!pers->takeover) {
- module_put(pers->owner);
+ put_pers(pers);
pr_warn("md: %s: %s does not support personality takeover\n",
mdname(mddev), clevel);
rv = -EINVAL;
@@ -3969,7 +3976,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
mddev->raid_disks -= mddev->delta_disks;
mddev->delta_disks = 0;
mddev->reshape_backwards = 0;
- module_put(pers->owner);
+ put_pers(pers);
pr_warn("md: %s: %s would not accept array\n",
mdname(mddev), clevel);
rv = PTR_ERR(priv);
@@ -3984,7 +3991,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
oldpriv = mddev->private;
mddev->pers = pers;
mddev->private = priv;
- strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
+ strscpy(mddev->clevel, pers->head.name, sizeof(mddev->clevel));
mddev->level = mddev->new_level;
mddev->layout = mddev->new_layout;
mddev->chunk_sectors = mddev->new_chunk_sectors;
@@ -4026,7 +4033,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
mddev->to_remove = &md_redundancy_group;
}
- module_put(oldpers->owner);
+ put_pers(oldpers);
rdev_for_each(rdev, mddev) {
if (rdev->raid_disk < 0)
@@ -4057,7 +4064,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
* it must always be in_sync
*/
mddev->in_sync = 1;
- del_timer_sync(&mddev->safemode_timer);
+ timer_delete_sync(&mddev->safemode_timer);
}
pers->run(mddev);
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
@@ -5584,7 +5591,7 @@ __ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show,
static ssize_t serialize_policy_show(struct mddev *mddev, char *page)
{
- if (mddev->pers == NULL || (mddev->pers->level != 1))
+ if (mddev->pers == NULL || (mddev->pers->head.id != ID_RAID1))
return sprintf(page, "n/a\n");
else
return sprintf(page, "%d\n", mddev->serialize_policy);
@@ -5610,7 +5617,7 @@ serialize_policy_store(struct mddev *mddev, const char *buf, size_t len)
err = mddev_suspend_and_lock(mddev);
if (err)
return err;
- if (mddev->pers == NULL || (mddev->pers->level != 1)) {
+ if (mddev->pers == NULL || (mddev->pers->head.id != ID_RAID1)) {
pr_err("md: serialize_policy is only effective for raid1\n");
err = -EINVAL;
goto unlock;
@@ -6096,30 +6103,21 @@ int md_run(struct mddev *mddev)
goto exit_sync_set;
}
- spin_lock(&pers_lock);
- pers = find_pers(mddev->level, mddev->clevel);
- if (!pers || !try_module_get(pers->owner)) {
- spin_unlock(&pers_lock);
- if (mddev->level != LEVEL_NONE)
- pr_warn("md: personality for level %d is not loaded!\n",
- mddev->level);
- else
- pr_warn("md: personality for level %s is not loaded!\n",
- mddev->clevel);
+ pers = get_pers(mddev->level, mddev->clevel);
+ if (!pers) {
err = -EINVAL;
goto abort;
}
- spin_unlock(&pers_lock);
- if (mddev->level != pers->level) {
- mddev->level = pers->level;
- mddev->new_level = pers->level;
+ if (mddev->level != pers->head.id) {
+ mddev->level = pers->head.id;
+ mddev->new_level = pers->head.id;
}
- strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
+ strscpy(mddev->clevel, pers->head.name, sizeof(mddev->clevel));
if (mddev->reshape_position != MaxSector &&
pers->start_reshape == NULL) {
/* This personality cannot handle reshaping... */
- module_put(pers->owner);
+ put_pers(pers);
err = -EINVAL;
goto abort;
}
@@ -6246,7 +6244,7 @@ bitmap_abort:
if (mddev->private)
pers->free(mddev, mddev->private);
mddev->private = NULL;
- module_put(pers->owner);
+ put_pers(pers);
mddev->bitmap_ops->destroy(mddev);
abort:
bioset_exit(&mddev->io_clone_set);
@@ -6407,7 +6405,7 @@ static void md_clean(struct mddev *mddev)
static void __md_stop_writes(struct mddev *mddev)
{
- del_timer_sync(&mddev->safemode_timer);
+ timer_delete_sync(&mddev->safemode_timer);
if (mddev->pers && mddev->pers->quiesce) {
mddev->pers->quiesce(mddev, 1);
@@ -6467,7 +6465,7 @@ static void __md_stop(struct mddev *mddev)
mddev->private = NULL;
if (pers->sync_request && mddev->to_remove == NULL)
mddev->to_remove = &md_redundancy_group;
- module_put(pers->owner);
+ put_pers(pers);
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
bioset_exit(&mddev->bio_set);
@@ -6983,7 +6981,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info)
set_bit(Candidate, &rdev->flags);
else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) {
/* --add initiated by this node */
- err = md_cluster_ops->add_new_disk(mddev, rdev);
+ err = mddev->cluster_ops->add_new_disk(mddev, rdev);
if (err) {
export_rdev(rdev, mddev);
return err;
@@ -7000,14 +6998,14 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info)
if (mddev_is_clustered(mddev)) {
if (info->state & (1 << MD_DISK_CANDIDATE)) {
if (!err) {
- err = md_cluster_ops->new_disk_ack(mddev,
- err == 0);
+ err = mddev->cluster_ops->new_disk_ack(
+ mddev, err == 0);
if (err)
md_kick_rdev_from_array(rdev);
}
} else {
if (err)
- md_cluster_ops->add_new_disk_cancel(mddev);
+ mddev->cluster_ops->add_new_disk_cancel(mddev);
else
err = add_bound_rdev(rdev);
}
@@ -7087,10 +7085,9 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev)
goto busy;
kick_rdev:
- if (mddev_is_clustered(mddev)) {
- if (md_cluster_ops->remove_disk(mddev, rdev))
- goto busy;
- }
+ if (mddev_is_clustered(mddev) &&
+ mddev->cluster_ops->remove_disk(mddev, rdev))
+ goto busy;
md_kick_rdev_from_array(rdev);
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
@@ -7393,7 +7390,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
rv = mddev->pers->resize(mddev, num_sectors);
if (!rv) {
if (mddev_is_clustered(mddev))
- md_cluster_ops->update_size(mddev, old_dev_sectors);
+ mddev->cluster_ops->update_size(mddev, old_dev_sectors);
else if (!mddev_is_dm(mddev))
set_capacity_and_notify(mddev->gendisk,
mddev->array_sectors);
@@ -7441,6 +7438,28 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks)
return rv;
}
+static int get_cluster_ops(struct mddev *mddev)
+{
+ xa_lock(&md_submodule);
+ mddev->cluster_ops = xa_load(&md_submodule, ID_CLUSTER);
+ if (mddev->cluster_ops &&
+ !try_module_get(mddev->cluster_ops->head.owner))
+ mddev->cluster_ops = NULL;
+ xa_unlock(&md_submodule);
+
+ return mddev->cluster_ops == NULL ? -ENOENT : 0;
+}
+
+static void put_cluster_ops(struct mddev *mddev)
+{
+ if (!mddev->cluster_ops)
+ return;
+
+ mddev->cluster_ops->leave(mddev);
+ module_put(mddev->cluster_ops->head.owner);
+ mddev->cluster_ops = NULL;
+}
+
/*
* update_array_info is used to change the configuration of an
* on-line array.
@@ -7549,16 +7568,15 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
if (mddev->bitmap_info.nodes) {
/* hold PW on all the bitmap lock */
- if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) {
+ if (mddev->cluster_ops->lock_all_bitmaps(mddev) <= 0) {
pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n");
rv = -EPERM;
- md_cluster_ops->unlock_all_bitmaps(mddev);
+ mddev->cluster_ops->unlock_all_bitmaps(mddev);
goto err;
}
mddev->bitmap_info.nodes = 0;
- md_cluster_ops->leave(mddev);
- module_put(md_cluster_mod);
+ put_cluster_ops(mddev);
mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY;
}
mddev->bitmap_ops->destroy(mddev);
@@ -7842,7 +7860,7 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode,
case CLUSTERED_DISK_NACK:
if (mddev_is_clustered(mddev))
- md_cluster_ops->new_disk_ack(mddev, false);
+ mddev->cluster_ops->new_disk_ack(mddev, false);
else
err = -EINVAL;
goto unlock;
@@ -8124,7 +8142,8 @@ void md_error(struct mddev *mddev, struct md_rdev *rdev)
return;
mddev->pers->error_handler(mddev, rdev);
- if (mddev->pers->level == 0 || mddev->pers->level == LEVEL_LINEAR)
+ if (mddev->pers->head.id == ID_RAID0 ||
+ mddev->pers->head.id == ID_LINEAR)
return;
if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags))
@@ -8162,14 +8181,17 @@ static void status_unused(struct seq_file *seq)
static void status_personalities(struct seq_file *seq)
{
- struct md_personality *pers;
+ struct md_submodule_head *head;
+ unsigned long i;
seq_puts(seq, "Personalities : ");
- spin_lock(&pers_lock);
- list_for_each_entry(pers, &pers_list, list)
- seq_printf(seq, "[%s] ", pers->name);
- spin_unlock(&pers_lock);
+ xa_lock(&md_submodule);
+ xa_for_each(&md_submodule, i, head)
+ if (head->type == MD_PERSONALITY)
+ seq_printf(seq, "[%s] ", head->name);
+ xa_unlock(&md_submodule);
+
seq_puts(seq, "\n");
}
@@ -8392,7 +8414,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, " (read-only)");
if (mddev->ro == MD_AUTO_READ)
seq_printf(seq, " (auto-read-only)");
- seq_printf(seq, " %s", mddev->pers->name);
+ seq_printf(seq, " %s", mddev->pers->head.name);
} else {
seq_printf(seq, "inactive");
}
@@ -8461,9 +8483,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
if (mddev == list_last_entry(&all_mddevs, struct mddev, all_mddevs))
status_unused(seq);
- if (atomic_dec_and_test(&mddev->active))
- __mddev_put(mddev);
-
+ mddev_put_locked(mddev);
return 0;
}
@@ -8514,67 +8534,34 @@ static const struct proc_ops mdstat_proc_ops = {
.proc_poll = mdstat_poll,
};
-int register_md_personality(struct md_personality *p)
-{
- pr_debug("md: %s personality registered for level %d\n",
- p->name, p->level);
- spin_lock(&pers_lock);
- list_add_tail(&p->list, &pers_list);
- spin_unlock(&pers_lock);
- return 0;
-}
-EXPORT_SYMBOL(register_md_personality);
-
-int unregister_md_personality(struct md_personality *p)
+int register_md_submodule(struct md_submodule_head *msh)
{
- pr_debug("md: %s personality unregistered\n", p->name);
- spin_lock(&pers_lock);
- list_del_init(&p->list);
- spin_unlock(&pers_lock);
- return 0;
+ return xa_insert(&md_submodule, msh->id, msh, GFP_KERNEL);
}
-EXPORT_SYMBOL(unregister_md_personality);
+EXPORT_SYMBOL_GPL(register_md_submodule);
-int register_md_cluster_operations(const struct md_cluster_operations *ops,
- struct module *module)
+void unregister_md_submodule(struct md_submodule_head *msh)
{
- int ret = 0;
- spin_lock(&pers_lock);
- if (md_cluster_ops != NULL)
- ret = -EALREADY;
- else {
- md_cluster_ops = ops;
- md_cluster_mod = module;
- }
- spin_unlock(&pers_lock);
- return ret;
+ xa_erase(&md_submodule, msh->id);
}
-EXPORT_SYMBOL(register_md_cluster_operations);
-
-int unregister_md_cluster_operations(void)
-{
- spin_lock(&pers_lock);
- md_cluster_ops = NULL;
- spin_unlock(&pers_lock);
- return 0;
-}
-EXPORT_SYMBOL(unregister_md_cluster_operations);
+EXPORT_SYMBOL_GPL(unregister_md_submodule);
int md_setup_cluster(struct mddev *mddev, int nodes)
{
- int ret;
- if (!md_cluster_ops)
+ int ret = get_cluster_ops(mddev);
+
+ if (ret) {
request_module("md-cluster");
- spin_lock(&pers_lock);
+ ret = get_cluster_ops(mddev);
+ }
+
/* ensure module won't be unloaded */
- if (!md_cluster_ops || !try_module_get(md_cluster_mod)) {
+ if (ret) {
pr_warn("can't find md-cluster module or get its reference.\n");
- spin_unlock(&pers_lock);
- return -ENOENT;
+ return ret;
}
- spin_unlock(&pers_lock);
- ret = md_cluster_ops->join(mddev, nodes);
+ ret = mddev->cluster_ops->join(mddev, nodes);
if (!ret)
mddev->safemode_delay = 0;
return ret;
@@ -8582,10 +8569,7 @@ int md_setup_cluster(struct mddev *mddev, int nodes)
void md_cluster_stop(struct mddev *mddev)
{
- if (!md_cluster_ops)
- return;
- md_cluster_ops->leave(mddev);
- module_put(md_cluster_mod);
+ put_cluster_ops(mddev);
}
static int is_mddev_idle(struct mddev *mddev, int init)
@@ -8978,7 +8962,7 @@ void md_do_sync(struct md_thread *thread)
}
if (mddev_is_clustered(mddev)) {
- ret = md_cluster_ops->resync_start(mddev);
+ ret = mddev->cluster_ops->resync_start(mddev);
if (ret)
goto skip;
@@ -9005,7 +8989,7 @@ void md_do_sync(struct md_thread *thread)
*
*/
if (mddev_is_clustered(mddev))
- md_cluster_ops->resync_start_notify(mddev);
+ mddev->cluster_ops->resync_start_notify(mddev);
do {
int mddev2_minor = -1;
mddev->curr_resync = MD_RESYNC_DELAYED;
@@ -9460,6 +9444,13 @@ static bool md_choose_sync_action(struct mddev *mddev, int *spares)
return true;
}
+ /* Check if resync is in progress. */
+ if (mddev->recovery_cp < MaxSector) {
+ set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
+ return true;
+ }
+
/*
* Remove any failed drives, then add spares if possible. Spares are
* also removed and re-added, to allow the personality to fail the
@@ -9476,13 +9467,6 @@ static bool md_choose_sync_action(struct mddev *mddev, int *spares)
return true;
}
- /* Check if recovery is in progress. */
- if (mddev->recovery_cp < MaxSector) {
- set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
- clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
- return true;
- }
-
/* Delay to choose resync/check/repair in md_do_sync(). */
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
return true;
@@ -9789,7 +9773,7 @@ void md_reap_sync_thread(struct mddev *mddev)
* call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by
* clustered raid */
if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags))
- md_cluster_ops->resync_finish(mddev);
+ mddev->cluster_ops->resync_finish(mddev);
clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
@@ -9797,13 +9781,13 @@ void md_reap_sync_thread(struct mddev *mddev)
clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
/*
- * We call md_cluster_ops->update_size here because sync_size could
+ * We call mddev->cluster_ops->update_size here because sync_size could
* be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared,
* so it is time to update size across cluster.
*/
if (mddev_is_clustered(mddev) && is_reshaped
&& !test_bit(MD_CLOSING, &mddev->flags))
- md_cluster_ops->update_size(mddev, old_dev_sectors);
+ mddev->cluster_ops->update_size(mddev, old_dev_sectors);
/* flag recovery needed just to double check */
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
sysfs_notify_dirent_safe(mddev->sysfs_completed);
@@ -9841,12 +9825,11 @@ EXPORT_SYMBOL(md_finish_reshape);
/* Bad block management */
-/* Returns 1 on success, 0 on failure */
-int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
- int is_new)
+/* Returns true on success, false on failure */
+bool rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
+ int is_new)
{
struct mddev *mddev = rdev->mddev;
- int rv;
/*
* Recording new badblocks for faulty rdev will force unnecessary
@@ -9856,50 +9839,51 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
* avoid it.
*/
if (test_bit(Faulty, &rdev->flags))
- return 1;
+ return true;
if (is_new)
s += rdev->new_data_offset;
else
s += rdev->data_offset;
- rv = badblocks_set(&rdev->badblocks, s, sectors, 0);
- if (rv == 0) {
- /* Make sure they get written out promptly */
- if (test_bit(ExternalBbl, &rdev->flags))
- sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks);
- sysfs_notify_dirent_safe(rdev->sysfs_state);
- set_mask_bits(&mddev->sb_flags, 0,
- BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING));
- md_wakeup_thread(rdev->mddev->thread);
- return 1;
- } else
- return 0;
+
+ if (!badblocks_set(&rdev->badblocks, s, sectors, 0))
+ return false;
+
+ /* Make sure they get written out promptly */
+ if (test_bit(ExternalBbl, &rdev->flags))
+ sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks);
+ sysfs_notify_dirent_safe(rdev->sysfs_state);
+ set_mask_bits(&mddev->sb_flags, 0,
+ BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING));
+ md_wakeup_thread(rdev->mddev->thread);
+ return true;
}
EXPORT_SYMBOL_GPL(rdev_set_badblocks);
-int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
- int is_new)
+void rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
+ int is_new)
{
- int rv;
if (is_new)
s += rdev->new_data_offset;
else
s += rdev->data_offset;
- rv = badblocks_clear(&rdev->badblocks, s, sectors);
- if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags))
+
+ if (!badblocks_clear(&rdev->badblocks, s, sectors))
+ return;
+
+ if (test_bit(ExternalBbl, &rdev->flags))
sysfs_notify_dirent_safe(rdev->sysfs_badblocks);
- return rv;
}
EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
static int md_notify_reboot(struct notifier_block *this,
unsigned long code, void *x)
{
- struct mddev *mddev, *n;
+ struct mddev *mddev;
int need_delay = 0;
spin_lock(&all_mddevs_lock);
- list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) {
+ list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
if (!mddev_get(mddev))
continue;
spin_unlock(&all_mddevs_lock);
@@ -9911,8 +9895,8 @@ static int md_notify_reboot(struct notifier_block *this,
mddev_unlock(mddev);
}
need_delay = 1;
- mddev_put(mddev);
spin_lock(&all_mddevs_lock);
+ mddev_put_locked(mddev);
}
spin_unlock(&all_mddevs_lock);
@@ -10029,7 +10013,7 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE &&
!(le32_to_cpu(sb->feature_map) &
MD_FEATURE_RESHAPE_ACTIVE) &&
- !md_cluster_ops->resync_status_get(mddev)) {
+ !mddev->cluster_ops->resync_status_get(mddev)) {
/*
* -1 to make raid1_add_disk() set conf->fullsync
* to 1. This could avoid skipping sync when the
@@ -10245,7 +10229,7 @@ void md_autostart_arrays(int part)
static __exit void md_exit(void)
{
- struct mddev *mddev, *n;
+ struct mddev *mddev;
int delay = 1;
unregister_blkdev(MD_MAJOR,"md");
@@ -10266,7 +10250,7 @@ static __exit void md_exit(void)
remove_proc_entry("mdstat", NULL);
spin_lock(&all_mddevs_lock);
- list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) {
+ list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
if (!mddev_get(mddev))
continue;
spin_unlock(&all_mddevs_lock);
@@ -10278,8 +10262,8 @@ static __exit void md_exit(void)
* the mddev for destruction by a workqueue, and the
* destroy_workqueue() below will wait for that to complete.
*/
- mddev_put(mddev);
spin_lock(&all_mddevs_lock);
+ mddev_put_locked(mddev);
}
spin_unlock(&all_mddevs_lock);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index def808064ad8..1cf00a04bcdd 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -18,11 +18,37 @@
#include <linux/timer.h>
#include <linux/wait.h>
#include <linux/workqueue.h>
+#include <linux/raid/md_u.h>
#include <trace/events/block.h>
-#include "md-cluster.h"
#define MaxSector (~(sector_t)0)
+enum md_submodule_type {
+ MD_PERSONALITY = 0,
+ MD_CLUSTER,
+ MD_BITMAP, /* TODO */
+};
+
+enum md_submodule_id {
+ ID_LINEAR = LEVEL_LINEAR,
+ ID_RAID0 = 0,
+ ID_RAID1 = 1,
+ ID_RAID4 = 4,
+ ID_RAID5 = 5,
+ ID_RAID6 = 6,
+ ID_RAID10 = 10,
+ ID_CLUSTER,
+ ID_BITMAP, /* TODO */
+ ID_LLBITMAP, /* TODO */
+};
+
+struct md_submodule_head {
+ enum md_submodule_type type;
+ enum md_submodule_id id;
+ const char *name;
+ struct module *owner;
+};
+
/*
* These flags should really be called "NO_RETRY" rather than
* "FAILFAST" because they don't make any promise about time lapse,
@@ -266,8 +292,8 @@ enum flag_bits {
Nonrot, /* non-rotational device (SSD) */
};
-static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
- sector_t *first_bad, int *bad_sectors)
+static inline int is_badblock(struct md_rdev *rdev, sector_t s, sector_t sectors,
+ sector_t *first_bad, sector_t *bad_sectors)
{
if (unlikely(rdev->badblocks.count)) {
int rv = badblocks_check(&rdev->badblocks, rdev->data_offset + s,
@@ -284,16 +310,17 @@ static inline int rdev_has_badblock(struct md_rdev *rdev, sector_t s,
int sectors)
{
sector_t first_bad;
- int bad_sectors;
+ sector_t bad_sectors;
return is_badblock(rdev, s, sectors, &first_bad, &bad_sectors);
}
-extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
- int is_new);
-extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
- int is_new);
+extern bool rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
+ int is_new);
+extern void rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
+ int is_new);
struct md_cluster_info;
+struct md_cluster_operations;
/**
* enum mddev_flags - md device flags.
@@ -576,6 +603,7 @@ struct mddev {
mempool_t *serial_info_pool;
void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
struct md_cluster_info *cluster_info;
+ struct md_cluster_operations *cluster_ops;
unsigned int good_device_nr; /* good device num within cluster raid */
unsigned int noio_flag; /* for memalloc scope API */
@@ -699,10 +727,8 @@ static inline void md_sync_acct_bio(struct bio *bio, unsigned long nr_sectors)
struct md_personality
{
- char *name;
- int level;
- struct list_head list;
- struct module *owner;
+ struct md_submodule_head head;
+
bool __must_check (*make_request)(struct mddev *mddev, struct bio *bio);
/*
* start up works that do NOT require md_thread. tasks that
@@ -843,13 +869,9 @@ static inline void safe_put_page(struct page *p)
if (p) put_page(p);
}
-extern int register_md_personality(struct md_personality *p);
-extern int unregister_md_personality(struct md_personality *p);
-extern int register_md_cluster_operations(const struct md_cluster_operations *ops,
- struct module *module);
-extern int unregister_md_cluster_operations(void);
-extern int md_setup_cluster(struct mddev *mddev, int nodes);
-extern void md_cluster_stop(struct mddev *mddev);
+int register_md_submodule(struct md_submodule_head *msh);
+void unregister_md_submodule(struct md_submodule_head *msh);
+
extern struct md_thread *md_register_thread(
void (*run)(struct md_thread *thread),
struct mddev *mddev,
@@ -906,7 +928,6 @@ extern void md_idle_sync_thread(struct mddev *mddev);
extern void md_frozen_sync_thread(struct mddev *mddev);
extern void md_unfrozen_sync_thread(struct mddev *mddev);
-extern void md_reload_sb(struct mddev *mddev, int raid_disk);
extern void md_update_sb(struct mddev *mddev, int force);
extern void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev);
extern void mddev_destroy_serial_pool(struct mddev *mddev,
@@ -928,7 +949,6 @@ static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev)
}
}
-extern const struct md_cluster_operations *md_cluster_ops;
static inline int mddev_is_clustered(struct mddev *mddev)
{
return mddev->cluster_info && mddev->bitmap_info.nodes > 1;
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 8fc9339b00c7..d8f639f4ae12 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -386,10 +386,8 @@ static int raid0_set_limits(struct mddev *mddev)
lim.io_opt = lim.io_min * mddev->raid_disks;
lim.features |= BLK_FEAT_ATOMIC_WRITES;
err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY);
- if (err) {
- queue_limits_cancel_update(mddev->gendisk->queue);
+ if (err)
return err;
- }
return queue_limits_set(mddev->gendisk->queue, &lim);
}
@@ -811,9 +809,13 @@ static void raid0_quiesce(struct mddev *mddev, int quiesce)
static struct md_personality raid0_personality=
{
- .name = "raid0",
- .level = 0,
- .owner = THIS_MODULE,
+ .head = {
+ .type = MD_PERSONALITY,
+ .id = ID_RAID0,
+ .name = "raid0",
+ .owner = THIS_MODULE,
+ },
+
.make_request = raid0_make_request,
.run = raid0_run,
.free = raid0_free,
@@ -824,14 +826,14 @@ static struct md_personality raid0_personality=
.error_handler = raid0_error,
};
-static int __init raid0_init (void)
+static int __init raid0_init(void)
{
- return register_md_personality (&raid0_personality);
+ return register_md_submodule(&raid0_personality.head);
}
-static void raid0_exit (void)
+static void __exit raid0_exit(void)
{
- unregister_md_personality (&raid0_personality);
+ unregister_md_submodule(&raid0_personality.head);
}
module_init(raid0_init);
diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c
index 4378d3250bd7..c7efd8aab675 100644
--- a/drivers/md/raid1-10.c
+++ b/drivers/md/raid1-10.c
@@ -247,7 +247,7 @@ static inline int raid1_check_read_range(struct md_rdev *rdev,
sector_t this_sector, int *len)
{
sector_t first_bad;
- int bad_sectors;
+ sector_t bad_sectors;
/* no bad block overlap */
if (!is_badblock(rdev, this_sector, *len, &first_bad, &bad_sectors))
@@ -287,8 +287,8 @@ static inline bool raid1_should_read_first(struct mddev *mddev,
return true;
if (mddev_is_clustered(mddev) &&
- md_cluster_ops->area_resyncing(mddev, READ, this_sector,
- this_sector + len))
+ mddev->cluster_ops->area_resyncing(mddev, READ, this_sector,
+ this_sector + len))
return true;
return false;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 9d57a88dbd26..0efc03cea24e 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -36,6 +36,7 @@
#include "md.h"
#include "raid1.h"
#include "md-bitmap.h"
+#include "md-cluster.h"
#define UNSUPPORTED_MDDEV_FLAGS \
((1L << MD_HAS_JOURNAL) | \
@@ -45,6 +46,7 @@
static void allow_barrier(struct r1conf *conf, sector_t sector_nr);
static void lower_barrier(struct r1conf *conf, sector_t sector_nr);
+static void raid1_free(struct mddev *mddev, void *priv);
#define RAID_1_10_NAME "raid1"
#include "raid1-10.c"
@@ -1315,8 +1317,6 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
struct r1conf *conf = mddev->private;
struct raid1_info *mirror;
struct bio *read_bio;
- const enum req_op op = bio_op(bio);
- const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC;
int max_sectors;
int rdisk, error;
bool r1bio_existed = !!r1_bio;
@@ -1404,7 +1404,6 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
read_bio->bi_iter.bi_sector = r1_bio->sector +
mirror->rdev->data_offset;
read_bio->bi_end_io = raid1_end_read_request;
- read_bio->bi_opf = op | do_sync;
if (test_bit(FailFast, &mirror->rdev->flags) &&
test_bit(R1BIO_FailFast, &r1_bio->state))
read_bio->bi_opf |= MD_FAILFAST;
@@ -1467,7 +1466,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
bool is_discard = (bio_op(bio) == REQ_OP_DISCARD);
if (mddev_is_clustered(mddev) &&
- md_cluster_ops->area_resyncing(mddev, WRITE,
+ mddev->cluster_ops->area_resyncing(mddev, WRITE,
bio->bi_iter.bi_sector, bio_end_sector(bio))) {
DEFINE_WAIT(w);
@@ -1478,7 +1477,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
for (;;) {
prepare_to_wait(&conf->wait_barrier,
&w, TASK_IDLE);
- if (!md_cluster_ops->area_resyncing(mddev, WRITE,
+ if (!mddev->cluster_ops->area_resyncing(mddev, WRITE,
bio->bi_iter.bi_sector,
bio_end_sector(bio)))
break;
@@ -1537,7 +1536,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
atomic_inc(&rdev->nr_pending);
if (test_bit(WriteErrorSeen, &rdev->flags)) {
sector_t first_bad;
- int bad_sectors;
+ sector_t bad_sectors;
int is_bad;
is_bad = is_badblock(rdev, r1_bio->sector, max_sectors,
@@ -1653,8 +1652,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
mbio->bi_iter.bi_sector = (r1_bio->sector + rdev->data_offset);
mbio->bi_end_io = raid1_end_write_request;
- mbio->bi_opf = bio_op(bio) |
- (bio->bi_opf & (REQ_SYNC | REQ_FUA | REQ_ATOMIC));
if (test_bit(FailFast, &rdev->flags) &&
!test_bit(WriteMostly, &rdev->flags) &&
conf->raid_disks - mddev->degraded > 1)
@@ -2486,7 +2483,7 @@ static void fix_read_error(struct r1conf *conf, struct r1bio *r1_bio)
}
}
-static int narrow_write_error(struct r1bio *r1_bio, int i)
+static bool narrow_write_error(struct r1bio *r1_bio, int i)
{
struct mddev *mddev = r1_bio->mddev;
struct r1conf *conf = mddev->private;
@@ -2507,10 +2504,10 @@ static int narrow_write_error(struct r1bio *r1_bio, int i)
sector_t sector;
int sectors;
int sect_to_write = r1_bio->sectors;
- int ok = 1;
+ bool ok = true;
if (rdev->badblocks.shift < 0)
- return 0;
+ return false;
block_sectors = roundup(1 << rdev->badblocks.shift,
bdev_logical_block_size(rdev->bdev) >> 9);
@@ -2886,7 +2883,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
} else {
/* may need to read from here */
sector_t first_bad = MaxSector;
- int bad_sectors;
+ sector_t bad_sectors;
if (is_badblock(rdev, sector_nr, good_sectors,
&first_bad, &bad_sectors)) {
@@ -3038,9 +3035,9 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
conf->cluster_sync_low = mddev->curr_resync_completed;
conf->cluster_sync_high = conf->cluster_sync_low + CLUSTER_RESYNC_WINDOW_SECTORS;
/* Send resync message */
- md_cluster_ops->resync_info_update(mddev,
- conf->cluster_sync_low,
- conf->cluster_sync_high);
+ mddev->cluster_ops->resync_info_update(mddev,
+ conf->cluster_sync_low,
+ conf->cluster_sync_high);
}
/* For a user-requested sync, we read all readable devices and do a
@@ -3219,10 +3216,8 @@ static int raid1_set_limits(struct mddev *mddev)
lim.max_write_zeroes_sectors = 0;
lim.features |= BLK_FEAT_ATOMIC_WRITES;
err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY);
- if (err) {
- queue_limits_cancel_update(mddev->gendisk->queue);
+ if (err)
return err;
- }
return queue_limits_set(mddev->gendisk->queue, &lim);
}
@@ -3258,8 +3253,11 @@ static int raid1_run(struct mddev *mddev)
if (!mddev_is_dm(mddev)) {
ret = raid1_set_limits(mddev);
- if (ret)
+ if (ret) {
+ if (!mddev->private)
+ raid1_free(mddev, conf);
return ret;
+ }
}
mddev->degraded = 0;
@@ -3273,6 +3271,8 @@ static int raid1_run(struct mddev *mddev)
*/
if (conf->raid_disks - mddev->degraded < 1) {
md_unregister_thread(mddev, &conf->thread);
+ if (!mddev->private)
+ raid1_free(mddev, conf);
return -EINVAL;
}
@@ -3493,9 +3493,13 @@ static void *raid1_takeover(struct mddev *mddev)
static struct md_personality raid1_personality =
{
- .name = "raid1",
- .level = 1,
- .owner = THIS_MODULE,
+ .head = {
+ .type = MD_PERSONALITY,
+ .id = ID_RAID1,
+ .name = "raid1",
+ .owner = THIS_MODULE,
+ },
+
.make_request = raid1_make_request,
.run = raid1_run,
.free = raid1_free,
@@ -3512,18 +3516,18 @@ static struct md_personality raid1_personality =
.takeover = raid1_takeover,
};
-static int __init raid_init(void)
+static int __init raid1_init(void)
{
- return register_md_personality(&raid1_personality);
+ return register_md_submodule(&raid1_personality.head);
}
-static void raid_exit(void)
+static void __exit raid1_exit(void)
{
- unregister_md_personality(&raid1_personality);
+ unregister_md_submodule(&raid1_personality.head);
}
-module_init(raid_init);
-module_exit(raid_exit);
+module_init(raid1_init);
+module_exit(raid1_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("RAID1 (mirroring) personality for MD");
MODULE_ALIAS("md-personality-3"); /* RAID1 */
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index efe93b979167..846c5f29486e 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -24,6 +24,7 @@
#include "raid10.h"
#include "raid0.h"
#include "md-bitmap.h"
+#include "md-cluster.h"
/*
* RAID10 provides a combination of RAID0 and RAID1 functionality.
@@ -747,7 +748,7 @@ static struct md_rdev *read_balance(struct r10conf *conf,
for (slot = 0; slot < conf->copies ; slot++) {
sector_t first_bad;
- int bad_sectors;
+ sector_t bad_sectors;
sector_t dev_sector;
unsigned int pending;
bool nonrot;
@@ -1146,8 +1147,6 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
{
struct r10conf *conf = mddev->private;
struct bio *read_bio;
- const enum req_op op = bio_op(bio);
- const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC;
int max_sectors;
struct md_rdev *rdev;
char b[BDEVNAME_SIZE];
@@ -1228,7 +1227,6 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr +
choose_data_offset(r10_bio, rdev);
read_bio->bi_end_io = raid10_end_read_request;
- read_bio->bi_opf = op | do_sync;
if (test_bit(FailFast, &rdev->flags) &&
test_bit(R10BIO_FailFast, &r10_bio->state))
read_bio->bi_opf |= MD_FAILFAST;
@@ -1247,10 +1245,6 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
struct bio *bio, bool replacement,
int n_copy)
{
- const enum req_op op = bio_op(bio);
- const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC;
- const blk_opf_t do_fua = bio->bi_opf & REQ_FUA;
- const blk_opf_t do_atomic = bio->bi_opf & REQ_ATOMIC;
unsigned long flags;
struct r10conf *conf = mddev->private;
struct md_rdev *rdev;
@@ -1269,7 +1263,6 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr +
choose_data_offset(r10_bio, rdev));
mbio->bi_end_io = raid10_end_write_request;
- mbio->bi_opf = op | do_sync | do_fua | do_atomic;
if (!replacement && test_bit(FailFast,
&conf->mirrors[devnum].rdev->flags)
&& enough(conf, devnum))
@@ -1355,9 +1348,9 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
int error;
if ((mddev_is_clustered(mddev) &&
- md_cluster_ops->area_resyncing(mddev, WRITE,
- bio->bi_iter.bi_sector,
- bio_end_sector(bio)))) {
+ mddev->cluster_ops->area_resyncing(mddev, WRITE,
+ bio->bi_iter.bi_sector,
+ bio_end_sector(bio)))) {
DEFINE_WAIT(w);
/* Bail out if REQ_NOWAIT is set for the bio */
if (bio->bi_opf & REQ_NOWAIT) {
@@ -1367,7 +1360,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
for (;;) {
prepare_to_wait(&conf->wait_barrier,
&w, TASK_IDLE);
- if (!md_cluster_ops->area_resyncing(mddev, WRITE,
+ if (!mddev->cluster_ops->area_resyncing(mddev, WRITE,
bio->bi_iter.bi_sector, bio_end_sector(bio)))
break;
schedule();
@@ -1438,7 +1431,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
sector_t first_bad;
sector_t dev_sector = r10_bio->devs[i].addr;
- int bad_sectors;
+ sector_t bad_sectors;
int is_bad;
is_bad = is_badblock(rdev, dev_sector, max_sectors,
@@ -1631,11 +1624,10 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
return -EAGAIN;
- if (WARN_ON_ONCE(bio->bi_opf & REQ_NOWAIT)) {
+ if (!wait_barrier(conf, bio->bi_opf & REQ_NOWAIT)) {
bio_wouldblock_error(bio);
return 0;
}
- wait_barrier(conf, false);
/*
* Check reshape again to avoid reshape happens after checking
@@ -2786,7 +2778,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
}
}
-static int narrow_write_error(struct r10bio *r10_bio, int i)
+static bool narrow_write_error(struct r10bio *r10_bio, int i)
{
struct bio *bio = r10_bio->master_bio;
struct mddev *mddev = r10_bio->mddev;
@@ -2807,10 +2799,10 @@ static int narrow_write_error(struct r10bio *r10_bio, int i)
sector_t sector;
int sectors;
int sect_to_write = r10_bio->sectors;
- int ok = 1;
+ bool ok = true;
if (rdev->badblocks.shift < 0)
- return 0;
+ return false;
block_sectors = roundup(1 << rdev->badblocks.shift,
bdev_logical_block_size(rdev->bdev) >> 9);
@@ -3413,7 +3405,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
sector_t from_addr, to_addr;
struct md_rdev *rdev = conf->mirrors[d].rdev;
sector_t sector, first_bad;
- int bad_sectors;
+ sector_t bad_sectors;
if (!rdev ||
!test_bit(In_sync, &rdev->flags))
continue;
@@ -3609,7 +3601,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
for (i = 0; i < conf->copies; i++) {
int d = r10_bio->devs[i].devnum;
sector_t first_bad, sector;
- int bad_sectors;
+ sector_t bad_sectors;
struct md_rdev *rdev;
if (r10_bio->devs[i].repl_bio)
@@ -3716,7 +3708,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
conf->cluster_sync_low = mddev->curr_resync_completed;
raid10_set_cluster_sync_high(conf);
/* Send resync message */
- md_cluster_ops->resync_info_update(mddev,
+ mddev->cluster_ops->resync_info_update(mddev,
conf->cluster_sync_low,
conf->cluster_sync_high);
}
@@ -3749,7 +3741,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
}
if (broadcast_msg) {
raid10_set_cluster_sync_high(conf);
- md_cluster_ops->resync_info_update(mddev,
+ mddev->cluster_ops->resync_info_update(mddev,
conf->cluster_sync_low,
conf->cluster_sync_high);
}
@@ -4020,10 +4012,8 @@ static int raid10_set_queue_limits(struct mddev *mddev)
lim.io_opt = lim.io_min * raid10_nr_stripes(conf);
lim.features |= BLK_FEAT_ATOMIC_WRITES;
err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY);
- if (err) {
- queue_limits_cancel_update(mddev->gendisk->queue);
+ if (err)
return err;
- }
return queue_limits_set(mddev->gendisk->queue, &lim);
}
@@ -4543,7 +4533,7 @@ static int raid10_start_reshape(struct mddev *mddev)
if (ret)
goto abort;
- ret = md_cluster_ops->resize_bitmaps(mddev, newsize, oldsize);
+ ret = mddev->cluster_ops->resize_bitmaps(mddev, newsize, oldsize);
if (ret) {
mddev->bitmap_ops->resize(mddev, oldsize, 0, false);
goto abort;
@@ -4834,7 +4824,7 @@ read_more:
conf->cluster_sync_low = sb_reshape_pos;
}
- md_cluster_ops->resync_info_update(mddev, conf->cluster_sync_low,
+ mddev->cluster_ops->resync_info_update(mddev, conf->cluster_sync_low,
conf->cluster_sync_high);
}
@@ -4979,7 +4969,7 @@ static void raid10_update_reshape_pos(struct mddev *mddev)
struct r10conf *conf = mddev->private;
sector_t lo, hi;
- md_cluster_ops->resync_info_get(mddev, &lo, &hi);
+ mddev->cluster_ops->resync_info_get(mddev, &lo, &hi);
if (((mddev->reshape_position <= hi) && (mddev->reshape_position >= lo))
|| mddev->reshape_position == MaxSector)
conf->reshape_progress = mddev->reshape_position;
@@ -5125,9 +5115,13 @@ static void raid10_finish_reshape(struct mddev *mddev)
static struct md_personality raid10_personality =
{
- .name = "raid10",
- .level = 10,
- .owner = THIS_MODULE,
+ .head = {
+ .type = MD_PERSONALITY,
+ .id = ID_RAID10,
+ .name = "raid10",
+ .owner = THIS_MODULE,
+ },
+
.make_request = raid10_make_request,
.run = raid10_run,
.free = raid10_free,
@@ -5147,18 +5141,18 @@ static struct md_personality raid10_personality =
.update_reshape_pos = raid10_update_reshape_pos,
};
-static int __init raid_init(void)
+static int __init raid10_init(void)
{
- return register_md_personality(&raid10_personality);
+ return register_md_submodule(&raid10_personality.head);
}
-static void raid_exit(void)
+static void __exit raid10_exit(void)
{
- unregister_md_personality(&raid10_personality);
+ unregister_md_submodule(&raid10_personality.head);
}
-module_init(raid_init);
-module_exit(raid_exit);
+module_init(raid10_init);
+module_exit(raid10_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD");
MODULE_ALIAS("md-personality-9"); /* RAID10 */
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index e530271cb86b..ba768ca7f422 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -714,7 +714,7 @@ static void r5l_submit_current_io(struct r5l_log *log)
block = page_address(io->meta_page);
block->meta_size = cpu_to_le32(io->meta_offset);
- crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
+ crc = crc32c(log->uuid_checksum, block, PAGE_SIZE);
block->checksum = cpu_to_le32(crc);
log->current_io = NULL;
@@ -1020,8 +1020,8 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
continue;
addr = kmap_local_page(sh->dev[i].page);
- sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
- addr, PAGE_SIZE);
+ sh->dev[i].log_checksum = crc32c(log->uuid_checksum,
+ addr, PAGE_SIZE);
kunmap_local(addr);
}
parity_pages = 1 + !!(sh->qd_idx >= 0);
@@ -1741,7 +1741,7 @@ static int r5l_recovery_read_meta_block(struct r5l_log *log,
le64_to_cpu(mb->position) != ctx->pos)
return -EINVAL;
- crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
+ crc = crc32c(log->uuid_checksum, mb, PAGE_SIZE);
if (stored_crc != crc)
return -EINVAL;
@@ -1780,8 +1780,7 @@ static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
return -ENOMEM;
r5l_recovery_create_empty_meta_block(log, page, pos, seq);
mb = page_address(page);
- mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum,
- mb, PAGE_SIZE));
+ mb->checksum = cpu_to_le32(crc32c(log->uuid_checksum, mb, PAGE_SIZE));
if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE |
REQ_SYNC | REQ_FUA, false)) {
__free_page(page);
@@ -1976,7 +1975,7 @@ r5l_recovery_verify_data_checksum(struct r5l_log *log,
r5l_recovery_read_page(log, ctx, page, log_offset);
addr = kmap_local_page(page);
- checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
+ checksum = crc32c(log->uuid_checksum, addr, PAGE_SIZE);
kunmap_local(addr);
return (le32_to_cpu(log_checksum) == checksum) ? 0 : -EINVAL;
}
@@ -2379,8 +2378,8 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
raid5_compute_blocknr(sh, i, 0));
addr = kmap_local_page(dev->page);
payload->checksum[0] = cpu_to_le32(
- crc32c_le(log->uuid_checksum, addr,
- PAGE_SIZE));
+ crc32c(log->uuid_checksum, addr,
+ PAGE_SIZE));
kunmap_local(addr);
sync_page_io(log->rdev, write_pos, PAGE_SIZE,
dev->page, REQ_OP_WRITE, false);
@@ -2392,8 +2391,8 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
}
}
mb->meta_size = cpu_to_le32(offset);
- mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum,
- mb, PAGE_SIZE));
+ mb->checksum = cpu_to_le32(crc32c(log->uuid_checksum,
+ mb, PAGE_SIZE));
sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page,
REQ_OP_WRITE | REQ_SYNC | REQ_FUA, false);
sh->log_start = ctx->pos;
@@ -2885,8 +2884,8 @@ int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh)
if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
continue;
addr = kmap_local_page(sh->dev[i].page);
- sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
- addr, PAGE_SIZE);
+ sh->dev[i].log_checksum = crc32c(log->uuid_checksum,
+ addr, PAGE_SIZE);
kunmap_local(addr);
pages++;
}
@@ -2969,7 +2968,7 @@ static int r5l_load_log(struct r5l_log *log)
}
stored_crc = le32_to_cpu(mb->checksum);
mb->checksum = 0;
- expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
+ expected_crc = crc32c(log->uuid_checksum, mb, PAGE_SIZE);
if (stored_crc != expected_crc) {
create_super = true;
goto create;
@@ -3077,8 +3076,8 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
return -ENOMEM;
log->rdev = rdev;
log->need_cache_flush = bdev_write_cache(rdev->bdev);
- log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
- sizeof(rdev->mddev->uuid));
+ log->uuid_checksum = crc32c(~0, rdev->mddev->uuid,
+ sizeof(rdev->mddev->uuid));
mutex_init(&log->io_mutex);
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index 37c4da5311ca..c0fb335311aa 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -346,9 +346,9 @@ static int ppl_log_stripe(struct ppl_log *log, struct stripe_head *sh)
if (!test_bit(STRIPE_FULL_WRITE, &sh->state)) {
le32_add_cpu(&e->pp_size, PAGE_SIZE);
io->pp_size += PAGE_SIZE;
- e->checksum = cpu_to_le32(crc32c_le(le32_to_cpu(e->checksum),
- page_address(sh->ppl_page),
- PAGE_SIZE));
+ e->checksum = cpu_to_le32(crc32c(le32_to_cpu(e->checksum),
+ page_address(sh->ppl_page),
+ PAGE_SIZE));
}
list_add_tail(&sh->log_list, &io->stripe_list);
@@ -454,7 +454,7 @@ static void ppl_submit_iounit(struct ppl_io_unit *io)
}
pplhdr->entries_count = cpu_to_le32(io->entries_count);
- pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PPL_HEADER_SIZE));
+ pplhdr->checksum = cpu_to_le32(~crc32c(~0, pplhdr, PPL_HEADER_SIZE));
/* Rewind the buffer if current PPL is larger then remaining space */
if (log->use_multippl &&
@@ -998,7 +998,7 @@ static int ppl_recover(struct ppl_log *log, struct ppl_header *pplhdr,
goto out;
}
- crc = crc32c_le(crc, page_address(page), s);
+ crc = crc32c(crc, page_address(page), s);
pp_size -= s;
sector += s >> 9;
@@ -1052,7 +1052,7 @@ static int ppl_write_empty_header(struct ppl_log *log)
log->rdev->ppl.size, GFP_NOIO, 0);
memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED);
pplhdr->signature = cpu_to_le32(log->ppl_conf->signature);
- pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PAGE_SIZE));
+ pplhdr->checksum = cpu_to_le32(~crc32c(~0, pplhdr, PAGE_SIZE));
if (!sync_page_io(rdev, rdev->ppl.sector - rdev->data_offset,
PPL_HEADER_SIZE, page, REQ_OP_WRITE | REQ_SYNC |
@@ -1106,7 +1106,7 @@ static int ppl_load_distributed(struct ppl_log *log)
/* check header validity */
crc_stored = le32_to_cpu(pplhdr->checksum);
pplhdr->checksum = 0;
- crc = ~crc32c_le(~0, pplhdr, PAGE_SIZE);
+ crc = ~crc32c(~0, pplhdr, PAGE_SIZE);
if (crc_stored != crc) {
pr_debug("%s: ppl header crc does not match: stored: 0x%x calculated: 0x%x (offset: %llu)\n",
@@ -1390,7 +1390,7 @@ int ppl_init_log(struct r5conf *conf)
spin_lock_init(&ppl_conf->no_mem_stripes_lock);
if (!mddev->external) {
- ppl_conf->signature = ~crc32c_le(~0, mddev->uuid, sizeof(mddev->uuid));
+ ppl_conf->signature = ~crc32c(~0, mddev->uuid, sizeof(mddev->uuid));
ppl_conf->block_size = 512;
} else {
ppl_conf->block_size =
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 5c79429acc64..6389383166c0 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5858,6 +5858,9 @@ static enum reshape_loc get_reshape_loc(struct mddev *mddev,
struct r5conf *conf, sector_t logical_sector)
{
sector_t reshape_progress, reshape_safe;
+
+ if (likely(conf->reshape_progress == MaxSector))
+ return LOC_NO_RESHAPE;
/*
* Spinlock is needed as reshape_progress may be
* 64bit on a 32bit platform, and so it might be
@@ -5935,22 +5938,19 @@ static enum stripe_result make_stripe_request(struct mddev *mddev,
const int rw = bio_data_dir(bi);
enum stripe_result ret;
struct stripe_head *sh;
+ enum reshape_loc loc;
sector_t new_sector;
int previous = 0, flags = 0;
int seq, dd_idx;
seq = read_seqcount_begin(&conf->gen_lock);
-
- if (unlikely(conf->reshape_progress != MaxSector)) {
- enum reshape_loc loc = get_reshape_loc(mddev, conf,
- logical_sector);
- if (loc == LOC_INSIDE_RESHAPE) {
- ret = STRIPE_SCHEDULE_AND_RETRY;
- goto out;
- }
- if (loc == LOC_AHEAD_OF_RESHAPE)
- previous = 1;
+ loc = get_reshape_loc(mddev, conf, logical_sector);
+ if (loc == LOC_INSIDE_RESHAPE) {
+ ret = STRIPE_SCHEDULE_AND_RETRY;
+ goto out;
}
+ if (loc == LOC_AHEAD_OF_RESHAPE)
+ previous = 1;
new_sector = raid5_compute_sector(conf, logical_sector, previous,
&dd_idx, NULL);
@@ -6127,7 +6127,6 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
/* Bail out if conflicts with reshape and REQ_NOWAIT is set */
if ((bi->bi_opf & REQ_NOWAIT) &&
- (conf->reshape_progress != MaxSector) &&
get_reshape_loc(mddev, conf, logical_sector) == LOC_INSIDE_RESHAPE) {
bio_wouldblock_error(bi);
if (rw == WRITE)
@@ -8954,9 +8953,13 @@ static void raid5_prepare_suspend(struct mddev *mddev)
static struct md_personality raid6_personality =
{
- .name = "raid6",
- .level = 6,
- .owner = THIS_MODULE,
+ .head = {
+ .type = MD_PERSONALITY,
+ .id = ID_RAID6,
+ .name = "raid6",
+ .owner = THIS_MODULE,
+ },
+
.make_request = raid5_make_request,
.run = raid5_run,
.start = raid5_start,
@@ -8980,9 +8983,13 @@ static struct md_personality raid6_personality =
};
static struct md_personality raid5_personality =
{
- .name = "raid5",
- .level = 5,
- .owner = THIS_MODULE,
+ .head = {
+ .type = MD_PERSONALITY,
+ .id = ID_RAID5,
+ .name = "raid5",
+ .owner = THIS_MODULE,
+ },
+
.make_request = raid5_make_request,
.run = raid5_run,
.start = raid5_start,
@@ -9007,9 +9014,13 @@ static struct md_personality raid5_personality =
static struct md_personality raid4_personality =
{
- .name = "raid4",
- .level = 4,
- .owner = THIS_MODULE,
+ .head = {
+ .type = MD_PERSONALITY,
+ .id = ID_RAID4,
+ .name = "raid4",
+ .owner = THIS_MODULE,
+ },
+
.make_request = raid5_make_request,
.run = raid5_run,
.start = raid5_start,
@@ -9045,21 +9056,39 @@ static int __init raid5_init(void)
"md/raid5:prepare",
raid456_cpu_up_prepare,
raid456_cpu_dead);
- if (ret) {
- destroy_workqueue(raid5_wq);
- return ret;
- }
- register_md_personality(&raid6_personality);
- register_md_personality(&raid5_personality);
- register_md_personality(&raid4_personality);
+ if (ret)
+ goto err_destroy_wq;
+
+ ret = register_md_submodule(&raid6_personality.head);
+ if (ret)
+ goto err_cpuhp_remove;
+
+ ret = register_md_submodule(&raid5_personality.head);
+ if (ret)
+ goto err_unregister_raid6;
+
+ ret = register_md_submodule(&raid4_personality.head);
+ if (ret)
+ goto err_unregister_raid5;
+
return 0;
+
+err_unregister_raid5:
+ unregister_md_submodule(&raid5_personality.head);
+err_unregister_raid6:
+ unregister_md_submodule(&raid6_personality.head);
+err_cpuhp_remove:
+ cpuhp_remove_multi_state(CPUHP_MD_RAID5_PREPARE);
+err_destroy_wq:
+ destroy_workqueue(raid5_wq);
+ return ret;
}
-static void raid5_exit(void)
+static void __exit raid5_exit(void)
{
- unregister_md_personality(&raid6_personality);
- unregister_md_personality(&raid5_personality);
- unregister_md_personality(&raid4_personality);
+ unregister_md_submodule(&raid6_personality.head);
+ unregister_md_submodule(&raid5_personality.head);
+ unregister_md_submodule(&raid4_personality.head);
cpuhp_remove_multi_state(CPUHP_MD_RAID5_PREPARE);
destroy_workqueue(raid5_wq);
}