summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Jens Axboe <axboe@kernel.dk> 2023-04-14 06:32:44 -0600
committer: Jens Axboe <axboe@kernel.dk> 2023-04-14 06:32:44 -0600
commit: 310e9c85c61a4393e772c9286947e259bb8c32a7 (patch)
tree: e1fc81693accd3daaa9deb8670320ca8ec9cde96
parent: d2a1d45ced846da6e71d777d9f230e47c5d694d9 (diff)
parent: 7bc436121e557b1f5bebf5ad67e7ed3614d6df92 (diff)
download: lwn-310e9c85c61a4393e772c9286947e259bb8c32a7.tar.gz
download: lwn-310e9c85c61a4393e772c9286947e259bb8c32a7.zip
Merge branch 'md-next' of https://git.kernel.org/pub/scm/linux/kernel/git/song/md into for-6.4/block
Pull MD updates from Song:

"- md/bitmap: Optimal last page size, by Jon Derrick
 - Various raid10 fixes, by Yu Kuai and Li Nan
 - md: add error_handlers for raid0 and linear, by Mariusz Tkaczyk"

* 'md-next' of https://git.kernel.org/pub/scm/linux/kernel/git/song/md:
  md/raid5: remove unused working_disks variable
  md/raid10: don't call bio_start_io_acct twice for bio which experienced read error
  md/raid10: fix memleak of md thread
  md/raid10: fix memleak for 'conf->bio_split'
  md/raid10: fix leak of 'r10bio->remaining' for recovery
  md/raid10: don't BUG_ON() in raise_barrier()
  md: fix soft lockup in status_resync
  md: add error_handlers for raid0 and linear
  md: Use optimal I/O size for last bitmap page
  md: Fix types in sb writer
  md: Move sb writer loop to its own function
  md/raid10: Fix typo in comment (replacment -> replacement)
  md: make kobj_type structures constant
  md/raid10: fix null-ptr-deref in raid10_sync_request
  md/raid10: fix task hung in raid10d
-rw-r--r-- drivers/md/md-bitmap.c | 143
-rw-r--r-- drivers/md/md-linear.c | 14
-rw-r--r-- drivers/md/md.c | 27
-rw-r--r-- drivers/md/md.h | 10
-rw-r--r-- drivers/md/raid0.c | 14
-rw-r--r-- drivers/md/raid10.c | 102
-rw-r--r-- drivers/md/raid5.c | 5
7 files changed, 184 insertions, 131 deletions
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index e7cc6ba1b657..920bb68156d2 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -209,76 +209,99 @@ static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mdde
return NULL;
}
-static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
+static unsigned int optimal_io_size(struct block_device *bdev,
+ unsigned int last_page_size,
+ unsigned int io_size)
+{
+ if (bdev_io_opt(bdev) > bdev_logical_block_size(bdev))
+ return roundup(last_page_size, bdev_io_opt(bdev));
+ return io_size;
+}
+
+static unsigned int bitmap_io_size(unsigned int io_size, unsigned int opt_size,
+ sector_t start, sector_t boundary)
+{
+ if (io_size != opt_size &&
+ start + opt_size / SECTOR_SIZE <= boundary)
+ return opt_size;
+ if (start + io_size / SECTOR_SIZE <= boundary)
+ return io_size;
+
+ /* Overflows boundary */
+ return 0;
+}
+
+static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
+ struct page *page)
{
- struct md_rdev *rdev;
struct block_device *bdev;
struct mddev *mddev = bitmap->mddev;
struct bitmap_storage *store = &bitmap->storage;
+ sector_t offset = mddev->bitmap_info.offset;
+ sector_t ps, sboff, doff;
+ unsigned int size = PAGE_SIZE;
+ unsigned int opt_size = PAGE_SIZE;
+
+ bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;
+ if (page->index == store->file_pages - 1) {
+ unsigned int last_page_size = store->bytes & (PAGE_SIZE - 1);
+
+ if (last_page_size == 0)
+ last_page_size = PAGE_SIZE;
+ size = roundup(last_page_size, bdev_logical_block_size(bdev));
+ opt_size = optimal_io_size(bdev, last_page_size, size);
+ }
+
+ ps = page->index * PAGE_SIZE / SECTOR_SIZE;
+ sboff = rdev->sb_start + offset;
+ doff = rdev->data_offset;
+
+ /* Just make sure we aren't corrupting data or metadata */
+ if (mddev->external) {
+ /* Bitmap could be anywhere. */
+ if (sboff + ps > doff &&
+ sboff < (doff + mddev->dev_sectors + PAGE_SIZE / SECTOR_SIZE))
+ return -EINVAL;
+ } else if (offset < 0) {
+ /* DATA BITMAP METADATA */
+ size = bitmap_io_size(size, opt_size, offset + ps, 0);
+ if (size == 0)
+ /* bitmap runs in to metadata */
+ return -EINVAL;
+
+ if (doff + mddev->dev_sectors > sboff)
+ /* data runs in to bitmap */
+ return -EINVAL;
+ } else if (rdev->sb_start < rdev->data_offset) {
+ /* METADATA BITMAP DATA */
+ size = bitmap_io_size(size, opt_size, sboff + ps, doff);
+ if (size == 0)
+ /* bitmap runs in to data */
+ return -EINVAL;
+ } else {
+ /* DATA METADATA BITMAP - no problems */
+ }
-restart:
- rdev = NULL;
- while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
- int size = PAGE_SIZE;
- loff_t offset = mddev->bitmap_info.offset;
+ md_super_write(mddev, rdev, sboff + ps, (int) size, page);
+ return 0;
+}
- bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;
+static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
+{
+ struct md_rdev *rdev;
+ struct mddev *mddev = bitmap->mddev;
+ int ret;
- if (page->index == store->file_pages-1) {
- int last_page_size = store->bytes & (PAGE_SIZE-1);
- if (last_page_size == 0)
- last_page_size = PAGE_SIZE;
- size = roundup(last_page_size,
- bdev_logical_block_size(bdev));
- }
- /* Just make sure we aren't corrupting data or
- * metadata
- */
- if (mddev->external) {
- /* Bitmap could be anywhere. */
- if (rdev->sb_start + offset + (page->index
- * (PAGE_SIZE/512))
- > rdev->data_offset
- &&
- rdev->sb_start + offset
- < (rdev->data_offset + mddev->dev_sectors
- + (PAGE_SIZE/512)))
- goto bad_alignment;
- } else if (offset < 0) {
- /* DATA BITMAP METADATA */
- if (offset
- + (long)(page->index * (PAGE_SIZE/512))
- + size/512 > 0)
- /* bitmap runs in to metadata */
- goto bad_alignment;
- if (rdev->data_offset + mddev->dev_sectors
- > rdev->sb_start + offset)
- /* data runs in to bitmap */
- goto bad_alignment;
- } else if (rdev->sb_start < rdev->data_offset) {
- /* METADATA BITMAP DATA */
- if (rdev->sb_start
- + offset
- + page->index*(PAGE_SIZE/512) + size/512
- > rdev->data_offset)
- /* bitmap runs in to data */
- goto bad_alignment;
- } else {
- /* DATA METADATA BITMAP - no problems */
+ do {
+ rdev = NULL;
+ while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
+ ret = __write_sb_page(rdev, bitmap, page);
+ if (ret)
+ return ret;
}
- md_super_write(mddev, rdev,
- rdev->sb_start + offset
- + page->index * (PAGE_SIZE/512),
- size,
- page);
- }
+ } while (wait && md_super_wait(mddev) < 0);
- if (wait && md_super_wait(mddev) < 0)
- goto restart;
return 0;
-
- bad_alignment:
- return -EINVAL;
}
static void md_bitmap_file_kick(struct bitmap *bitmap);
diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c
index 6e7797b4e738..4eb72b9dd933 100644
--- a/drivers/md/md-linear.c
+++ b/drivers/md/md-linear.c
@@ -223,7 +223,8 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio)
bio_sector < start_sector))
goto out_of_bounds;
- if (unlikely(is_mddev_broken(tmp_dev->rdev, "linear"))) {
+ if (unlikely(is_rdev_broken(tmp_dev->rdev))) {
+ md_error(mddev, tmp_dev->rdev);
bio_io_error(bio);
return true;
}
@@ -270,6 +271,16 @@ static void linear_status (struct seq_file *seq, struct mddev *mddev)
seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2);
}
+static void linear_error(struct mddev *mddev, struct md_rdev *rdev)
+{
+ if (!test_and_set_bit(MD_BROKEN, &mddev->flags)) {
+ char *md_name = mdname(mddev);
+
+ pr_crit("md/linear%s: Disk failure on %pg detected, failing array.\n",
+ md_name, rdev->bdev);
+ }
+}
+
static void linear_quiesce(struct mddev *mddev, int state)
{
}
@@ -286,6 +297,7 @@ static struct md_personality linear_personality =
.hot_add_disk = linear_add,
.size = linear_size,
.quiesce = linear_quiesce,
+ .error_handler = linear_error,
};
static int __init linear_init (void)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 927a43db5dfb..122ae28e785c 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -78,7 +78,7 @@
static LIST_HEAD(pers_list);
static DEFINE_SPINLOCK(pers_lock);
-static struct kobj_type md_ktype;
+static const struct kobj_type md_ktype;
struct md_cluster_operations *md_cluster_ops;
EXPORT_SYMBOL(md_cluster_ops);
@@ -3597,7 +3597,7 @@ static const struct sysfs_ops rdev_sysfs_ops = {
.show = rdev_attr_show,
.store = rdev_attr_store,
};
-static struct kobj_type rdev_ktype = {
+static const struct kobj_type rdev_ktype = {
.release = rdev_free,
.sysfs_ops = &rdev_sysfs_ops,
.default_groups = rdev_default_groups,
@@ -5555,7 +5555,7 @@ static const struct sysfs_ops md_sysfs_ops = {
.show = md_attr_show,
.store = md_attr_store,
};
-static struct kobj_type md_ktype = {
+static const struct kobj_type md_ktype = {
.release = md_kobj_release,
.sysfs_ops = &md_sysfs_ops,
.default_groups = md_attr_groups,
@@ -7974,6 +7974,9 @@ void md_error(struct mddev *mddev, struct md_rdev *rdev)
return;
mddev->pers->error_handler(mddev, rdev);
+ if (mddev->pers->level == 0 || mddev->pers->level == LEVEL_LINEAR)
+ return;
+
if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags))
set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
sysfs_notify_dirent_safe(rdev->sysfs_state);
@@ -8029,16 +8032,16 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev)
} else if (resync > max_sectors) {
resync = max_sectors;
} else {
- resync -= atomic_read(&mddev->recovery_active);
- if (resync < MD_RESYNC_ACTIVE) {
- /*
- * Resync has started, but the subtraction has
- * yielded one of the special values. Force it
- * to active to ensure the status reports an
- * active resync.
- */
+ res = atomic_read(&mddev->recovery_active);
+ /*
+ * Resync has started, but the subtraction has overflowed or
+ * yielded one of the special values. Force it to active to
+ * ensure the status reports an active resync.
+ */
+ if (resync < res || resync - res < MD_RESYNC_ACTIVE)
resync = MD_RESYNC_ACTIVE;
- }
+ else
+ resync -= res;
}
if (resync == MD_RESYNC_NONE) {
diff --git a/drivers/md/md.h b/drivers/md/md.h
index e148e3c83b0d..fd8f260ed5f8 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -790,15 +790,9 @@ extern void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr);
struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev);
-static inline bool is_mddev_broken(struct md_rdev *rdev, const char *md_type)
+static inline bool is_rdev_broken(struct md_rdev *rdev)
{
- if (!disk_live(rdev->bdev->bd_disk)) {
- if (!test_and_set_bit(MD_BROKEN, &rdev->mddev->flags))
- pr_warn("md: %s: %s array has a missing/failed member\n",
- mdname(rdev->mddev), md_type);
- return true;
- }
- return false;
+ return !disk_live(rdev->bdev->bd_disk);
}
static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev)
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index b536befd8898..f8ee9a95e25d 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -569,8 +569,9 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
return true;
}
- if (unlikely(is_mddev_broken(tmp_dev, "raid0"))) {
+ if (unlikely(is_rdev_broken(tmp_dev))) {
bio_io_error(bio);
+ md_error(mddev, tmp_dev);
return true;
}
@@ -592,6 +593,16 @@ static void raid0_status(struct seq_file *seq, struct mddev *mddev)
return;
}
+static void raid0_error(struct mddev *mddev, struct md_rdev *rdev)
+{
+ if (!test_and_set_bit(MD_BROKEN, &mddev->flags)) {
+ char *md_name = mdname(mddev);
+
+ pr_crit("md/raid0%s: Disk failure on %pg detected, failing array.\n",
+ md_name, rdev->bdev);
+ }
+}
+
static void *raid0_takeover_raid45(struct mddev *mddev)
{
struct md_rdev *rdev;
@@ -767,6 +778,7 @@ static struct md_personality raid0_personality=
.size = raid0_size,
.takeover = raid0_takeover,
.quiesce = raid0_quiesce,
+ .error_handler = raid0_error,
};
static int __init raid0_init (void)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 6c66357f92f5..4fcfcb350d2b 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -952,7 +952,9 @@ static void flush_pending_writes(struct r10conf *conf)
static void raise_barrier(struct r10conf *conf, int force)
{
write_seqlock_irq(&conf->resync_lock);
- BUG_ON(force && !conf->barrier);
+
+ if (WARN_ON_ONCE(force && !conf->barrier))
+ force = false;
/* Wait until no block IO is waiting (unless 'force') */
wait_event_barrier(conf, force || !conf->nr_waiting);
@@ -995,11 +997,15 @@ static bool stop_waiting_barrier(struct r10conf *conf)
(!bio_list_empty(&bio_list[0]) || !bio_list_empty(&bio_list[1])))
return true;
- /* move on if recovery thread is blocked by us */
- if (conf->mddev->thread->tsk == current &&
- test_bit(MD_RECOVERY_RUNNING, &conf->mddev->recovery) &&
- conf->nr_queued > 0)
+ /*
+ * move on if io is issued from raid10d(), nr_pending is not released
+ * from original io(see handle_read_error()). All raise barrier is
+ * blocked until this io is done.
+ */
+ if (conf->mddev->thread->tsk == current) {
+ WARN_ON_ONCE(atomic_read(&conf->nr_pending) == 0);
return true;
+ }
return false;
}
@@ -1244,7 +1250,8 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
}
slot = r10_bio->read_slot;
- if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
+ if (!r10_bio->start_time &&
+ blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
r10_bio->start_time = bio_start_io_acct(bio);
read_bio = bio_alloc_clone(rdev->bdev, bio, gfp, &mddev->bio_set);
@@ -1574,6 +1581,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
r10_bio->sector = bio->bi_iter.bi_sector;
r10_bio->state = 0;
r10_bio->read_slot = -1;
+ r10_bio->start_time = 0;
memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) *
conf->geo.raid_disks);
@@ -1626,7 +1634,7 @@ static void raid10_end_discard_request(struct bio *bio)
/*
* raid10_remove_disk uses smp_mb to make sure rdev is set to
* replacement before setting replacement to NULL. It can read
- * rdev first without barrier protect even replacment is NULL
+ * rdev first without barrier protect even replacement is NULL
*/
smp_rmb();
rdev = conf->mirrors[dev].rdev;
@@ -2609,11 +2617,22 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
{
struct r10conf *conf = mddev->private;
int d;
- struct bio *wbio, *wbio2;
+ struct bio *wbio = r10_bio->devs[1].bio;
+ struct bio *wbio2 = r10_bio->devs[1].repl_bio;
+
+ /* Need to test wbio2->bi_end_io before we call
+ * submit_bio_noacct as if the former is NULL,
+ * the latter is free to free wbio2.
+ */
+ if (wbio2 && !wbio2->bi_end_io)
+ wbio2 = NULL;
if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
fix_recovery_read_error(r10_bio);
- end_sync_request(r10_bio);
+ if (wbio->bi_end_io)
+ end_sync_request(r10_bio);
+ if (wbio2)
+ end_sync_request(r10_bio);
return;
}
@@ -2622,14 +2641,6 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
* and submit the write request
*/
d = r10_bio->devs[1].devnum;
- wbio = r10_bio->devs[1].bio;
- wbio2 = r10_bio->devs[1].repl_bio;
- /* Need to test wbio2->bi_end_io before we call
- * submit_bio_noacct as if the former is NULL,
- * the latter is free to free wbio2.
- */
- if (wbio2 && !wbio2->bi_end_io)
- wbio2 = NULL;
if (wbio->bi_end_io) {
atomic_inc(&conf->mirrors[d].rdev->nr_pending);
md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
@@ -2978,9 +2989,13 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
md_error(mddev, rdev);
rdev_dec_pending(rdev, mddev);
- allow_barrier(conf);
r10_bio->state = 0;
raid10_read_request(mddev, r10_bio->master_bio, r10_bio);
+ /*
+ * allow_barrier after re-submit to ensure no sync io
+ * can be issued while regular io pending.
+ */
+ allow_barrier(conf);
}
static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
@@ -3289,10 +3304,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
sector_t chunk_mask = conf->geo.chunk_mask;
int page_idx = 0;
- if (!mempool_initialized(&conf->r10buf_pool))
- if (init_resync(conf))
- return 0;
-
/*
* Allow skipping a full rebuild for incremental assembly
* of a clean array, like RAID1 does.
@@ -3308,6 +3319,10 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
return mddev->dev_sectors - sector_nr;
}
+ if (!mempool_initialized(&conf->r10buf_pool))
+ if (init_resync(conf))
+ return 0;
+
skipped:
max_sector = mddev->dev_sectors;
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
@@ -4004,6 +4019,20 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
return nc*fc;
}
+static void raid10_free_conf(struct r10conf *conf)
+{
+ if (!conf)
+ return;
+
+ mempool_exit(&conf->r10bio_pool);
+ kfree(conf->mirrors);
+ kfree(conf->mirrors_old);
+ kfree(conf->mirrors_new);
+ safe_put_page(conf->tmppage);
+ bioset_exit(&conf->bio_split);
+ kfree(conf);
+}
+
static struct r10conf *setup_conf(struct mddev *mddev)
{
struct r10conf *conf = NULL;
@@ -4086,13 +4115,7 @@ static struct r10conf *setup_conf(struct mddev *mddev)
return conf;
out:
- if (conf) {
- mempool_exit(&conf->r10bio_pool);
- kfree(conf->mirrors);
- safe_put_page(conf->tmppage);
- bioset_exit(&conf->bio_split);
- kfree(conf);
- }
+ raid10_free_conf(conf);
return ERR_PTR(err);
}
@@ -4129,6 +4152,9 @@ static int raid10_run(struct mddev *mddev)
if (!conf)
goto out;
+ mddev->thread = conf->thread;
+ conf->thread = NULL;
+
if (mddev_is_clustered(conf->mddev)) {
int fc, fo;
@@ -4141,9 +4167,6 @@ static int raid10_run(struct mddev *mddev)
}
}
- mddev->thread = conf->thread;
- conf->thread = NULL;
-
if (mddev->queue) {
blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
@@ -4283,10 +4306,7 @@ static int raid10_run(struct mddev *mddev)
out_free_conf:
md_unregister_thread(&mddev->thread);
- mempool_exit(&conf->r10bio_pool);
- safe_put_page(conf->tmppage);
- kfree(conf->mirrors);
- kfree(conf);
+ raid10_free_conf(conf);
mddev->private = NULL;
out:
return -EIO;
@@ -4294,15 +4314,7 @@ out:
static void raid10_free(struct mddev *mddev, void *priv)
{
- struct r10conf *conf = priv;
-
- mempool_exit(&conf->r10bio_pool);
- safe_put_page(conf->tmppage);
- kfree(conf->mirrors);
- kfree(conf->mirrors_old);
- kfree(conf->mirrors_new);
- bioset_exit(&conf->bio_split);
- kfree(conf);
+ raid10_free_conf(priv);
}
static void raid10_quiesce(struct mddev *mddev, int quiesce)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7b820b81d8c2..812a12e3e41a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7716,7 +7716,6 @@ static void raid5_set_io_opt(struct r5conf *conf)
static int raid5_run(struct mddev *mddev)
{
struct r5conf *conf;
- int working_disks = 0;
int dirty_parity_disks = 0;
struct md_rdev *rdev;
struct md_rdev *journal_dev = NULL;
@@ -7912,10 +7911,8 @@ static int raid5_run(struct mddev *mddev)
pr_warn("md: cannot handle concurrent replacement and reshape.\n");
goto abort;
}
- if (test_bit(In_sync, &rdev->flags)) {
- working_disks++;
+ if (test_bit(In_sync, &rdev->flags))
continue;
- }
/* This disc is not fully in-sync. However if it
* just stored parity (beyond the recovery_offset),
* when we don't need to be concerned about the